kv-tube/app/services/transcript_service.py
KV-Tube Deployer f429116ed0
Some checks failed
Docker Build & Push / build (push) Has been cancelled
v3.1: WebLLM summarization, improved translations, copy button, removed mini player
- Added WebLLM service for client-side AI summarization and translation
- Improved summary quality (5 sentences, 600 char limit)
- Added Vietnamese character detection for proper language labels
- Added Copy button for summary content
- Key Points now extract conceptual ideas, not transcript excerpts
- Removed mini player (scroll-to-minimize) feature
- Fixed main.js null container error
- Silent WebLLM loading (no overlay/toasts)
- Added transcript service with yt-dlp
2026-01-19 19:03:09 +07:00

211 lines
7.3 KiB
Python
Executable file

"""
Transcript Service Module
Fetches video transcripts with fallback strategy: yt-dlp -> ytfetcher
"""
import glob
import html
import json
import logging
import os
import random
import re
import tempfile
from typing import Optional
logger = logging.getLogger(__name__)
class TranscriptService:
    """Fetch YouTube video transcripts, trying yt-dlp first and ytfetcher second.

    Every method is a classmethod or staticmethod; the class keeps no
    instance state and is used purely as a namespace.
    """

    # Subtitle languages requested from yt-dlp, most preferred first.
    _SUBTITLE_LANGS = ('en', 'vi', 'en-US')
    # Subtitle file extensions, most preferred first.
    _SUBTITLE_EXTS = ('.json3', '.vtt', '.ttml', '.srv3')
    # WebVTT header lines that never carry caption text.
    _VTT_HEADER_PREFIXES = ('WEBVTT', 'Kind:', 'Language:')
    # Inline markup such as <c>, </c> or <00:00:01.000>.
    _TAG_RE = re.compile(r'<[^>]+>')

    @classmethod
    def get_transcript(cls, video_id: str) -> Optional[str]:
        """Return the transcript text for a video, or None if unavailable.

        Strategy:
            1. Try yt-dlp (handles auto-generated captions).
            2. Fall back to the ytfetcher library if yt-dlp yields nothing.

        Args:
            video_id: YouTube video ID. Surrounding whitespace is ignored.

        Returns:
            Transcript text, or None when the ID is empty or every method
            failed.
        """
        video_id = (video_id or "").strip()
        if not video_id:
            # Nothing to look up; avoid two pointless network round trips.
            logger.warning("No video ID supplied; cannot fetch transcript")
            return None

        # Try yt-dlp first (primary method)
        text = cls._fetch_with_ytdlp(video_id)
        if text:
            logger.info("Transcript fetched via yt-dlp for %s", video_id)
            return text

        # Fallback to ytfetcher
        logger.info("yt-dlp failed, trying ytfetcher for %s", video_id)
        text = cls._fetch_with_ytfetcher(video_id)
        if text:
            logger.info("Transcript fetched via ytfetcher for %s", video_id)
            return text

        logger.warning("All transcript methods failed for %s", video_id)
        return None

    @classmethod
    def _fetch_with_ytdlp(cls, video_id: str) -> Optional[str]:
        """Fetch a transcript by letting yt-dlp download subtitle files.

        Subtitles are written into a private temporary directory that is
        removed on every exit path, so nothing is left behind when the
        download or the read fails.

        Args:
            video_id: YouTube video ID (already stripped by the caller).

        Returns:
            Parsed transcript text, or None when yt-dlp is not installed,
            the download fails, or no subtitle file was produced.
        """
        try:
            # Imported lazily so a missing optional dependency degrades to
            # the ytfetcher fallback instead of raising out of get_transcript.
            import yt_dlp
        except ImportError:
            logger.warning("yt-dlp not installed. Run: pip install yt-dlp")
            return None

        cookie_path = os.environ.get('COOKIES_FILE', 'cookies.txt')

        try:
            logger.info("Fetching transcript for %s using yt-dlp", video_id)

            with tempfile.TemporaryDirectory(prefix="transcript_") as work_dir:
                # '%' is the output-template escape character in yt-dlp, so
                # double any that happen to appear in the temp path.
                out_template = os.path.join(work_dir, 'subs').replace('%', '%%')
                ydl_opts = {
                    'skip_download': True,
                    'quiet': True,
                    'no_warnings': True,
                    'cookiefile': cookie_path if os.path.exists(cookie_path) else None,
                    'writesubtitles': True,
                    'writeautomaticsub': True,
                    'subtitleslangs': list(cls._SUBTITLE_LANGS),
                    'outtmpl': out_template,
                    'subtitlesformat': 'json3/vtt/best',
                }

                # NOTE(review): video_id is interpolated into the URL
                # unvalidated; confirm callers only pass real video IDs.
                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    ydl.download([f"https://www.youtube.com/watch?v={video_id}"])

                downloaded_files = glob.glob(
                    os.path.join(glob.escape(work_dir), '*')
                )
                selected_file = cls._select_subtitle_file(downloaded_files)
                if selected_file is None:
                    logger.warning("yt-dlp finished but no subtitle file found.")
                    return None

                with open(selected_file, 'r', encoding='utf-8') as f:
                    content = f.read()
        except Exception as e:
            # Boundary handler: yt-dlp raises many error types, and any
            # failure here should simply trigger the fallback.
            logger.error("yt-dlp transcript fetch failed: %s", e)
            return None

        # Parse based on format (the content sniff covers mislabelled files).
        if selected_file.endswith('.json3') or content.strip().startswith('{'):
            return cls._parse_json3(content)
        return cls._parse_vtt(content)

    @classmethod
    def _select_subtitle_file(cls, paths: list) -> Optional[str]:
        """Pick the best subtitle file from the paths yt-dlp produced.

        Files are ranked by extension first (json3, vtt, ttml, srv3, then
        anything else) and by language second (en, vi, en-US, then anything
        else). The file name breaks remaining ties, so the choice does not
        depend on directory listing order.

        Args:
            paths: Candidate file paths, expected to be named
                ``<stem>.<lang>.<ext>``.

        Returns:
            The preferred path, or None when ``paths`` is empty.
        """
        if not paths:
            return None

        def rank(path):
            name = os.path.basename(path)
            ext_rank = next(
                (i for i, ext in enumerate(cls._SUBTITLE_EXTS) if name.endswith(ext)),
                len(cls._SUBTITLE_EXTS),
            )
            pieces = name.split('.')
            lang = pieces[-2] if len(pieces) >= 3 else ''
            if lang in cls._SUBTITLE_LANGS:
                lang_rank = cls._SUBTITLE_LANGS.index(lang)
            else:
                lang_rank = len(cls._SUBTITLE_LANGS)
            return (ext_rank, lang_rank, name)

        return min(paths, key=rank)

    @classmethod
    def _fetch_with_ytfetcher(cls, video_id: str) -> Optional[str]:
        """Fetch a transcript with the ytfetcher library (fallback path).

        Args:
            video_id: YouTube video ID.

        Returns:
            Transcript snippets joined with single spaces, or None when
            ytfetcher is not installed, fails, or returns no text.
        """
        try:
            from ytfetcher import YTFetcher

            logger.info("Using ytfetcher for %s", video_id)

            # Create fetcher for single video
            fetcher = YTFetcher.from_video_ids(video_ids=[video_id])

            # Fetch transcripts
            data = fetcher.fetch_transcripts()
            if not data:
                logger.warning("ytfetcher returned no data for %s", video_id)
                return None

            # Extract text from transcript objects; getattr keeps this
            # tolerant of items that lack the expected attributes.
            text_parts = []
            for item in data:
                for snippet in getattr(item, 'transcripts', []) or []:
                    txt = (getattr(snippet, 'text', '') or '').strip()
                    if txt:
                        text_parts.append(txt)

            if not text_parts:
                logger.warning("ytfetcher returned empty transcripts for %s", video_id)
                return None

            return " ".join(text_parts)
        except ImportError:
            logger.warning("ytfetcher not installed. Run: pip install ytfetcher")
            return None
        except Exception as e:
            logger.error("ytfetcher transcript fetch failed: %s", e)
            return None

    @staticmethod
    def _parse_json3(content: str) -> Optional[str]:
        """Extract caption text from JSON3 subtitle content.

        Events without ``segs`` (or with a null ``segs``/``utf8``) are
        skipped rather than aborting the whole parse.

        Args:
            content: Raw JSON3 document.

        Returns:
            Non-empty segment texts joined with single spaces (possibly an
            empty string), or None when the content is not a JSON object of
            the expected shape.
        """
        try:
            json_data = json.loads(content)
            text_parts = []
            for event in json_data.get('events') or []:
                for seg in event.get('segs') or []:
                    txt = (seg.get('utf8') or '').strip()
                    if txt:
                        text_parts.append(txt)
            return " ".join(text_parts)
        except (ValueError, AttributeError, TypeError) as e:
            logger.warning("JSON3 parse failed: %s", e)
            return None

    @staticmethod
    def _parse_vtt(content: str) -> Optional[str]:
        """Extract caption text from VTT (or line-oriented XML) subtitles.

        Header lines, cue timings and numeric cue identifiers are dropped,
        inline tags are removed, and character references such as ``&amp;``
        are decoded.

        Args:
            content: Raw subtitle file content.

        Returns:
            Unique caption lines, in first-seen order, joined with single
            spaces (possibly an empty string), or None when ``content`` is
            not a string.
        """
        try:
            text_lines = []
            # NOTE: de-duplication is global, which removes the rolling
            # repeats of auto-generated captions but also drops a line that
            # is genuinely spoken again later.
            seen = set()
            for raw_line in content.splitlines():
                line = raw_line.strip()
                if not line or "-->" in line or line.isdigit():
                    continue
                if line.startswith(TranscriptService._VTT_HEADER_PREFIXES):
                    continue

                # Strip tags before decoding entities so an escaped "&lt;b&gt;"
                # stays as literal text; strip again because removing tags
                # can leave a whitespace-only line.
                without_tags = TranscriptService._TAG_RE.sub('', line)
                clean = html.unescape(without_tags).strip()
                if clean and clean not in seen:
                    seen.add(clean)
                    text_lines.append(clean)

            return " ".join(text_lines)
        except (AttributeError, TypeError) as e:
            logger.error("VTT transcript parse error: %s", e)
            return None