""" KV-Tube API Blueprint All JSON API endpoints for the frontend """ from flask import Blueprint, request, jsonify, Response import os import sys import subprocess import json import sqlite3 import re import heapq import logging import time import random import concurrent.futures import yt_dlp # from ytfetcher import YTFetcher from app.services.settings import SettingsService from app.services.summarizer import TextRankSummarizer from app.services.gemini_summarizer import summarize_with_gemini, extract_key_points_with_gemini from app.services.youtube import YouTubeService logger = logging.getLogger(__name__) api_bp = Blueprint('api', __name__, url_prefix='/api') # Database path DATA_DIR = os.environ.get("KVTUBE_DATA_DIR", "data") DB_NAME = os.path.join(DATA_DIR, "kvtube.db") # Caching API_CACHE = {} CACHE_TIMEOUT = 60 # 1 minute for fresher content def get_db_connection(): """Get database connection with row factory.""" conn = sqlite3.connect(DB_NAME) conn.row_factory = sqlite3.Row return conn # --- Helper Functions --- def extractive_summary(text, num_sentences=5): """Extract key sentences from text using word frequency.""" # Clean text clean_text = re.sub(r"\[.*?\]", "", text) clean_text = clean_text.replace("\n", " ") # Split into sentences sentences = re.split(r"(? 1024**3: size_str = f"{f_filesize / 1024**3:.1f} GB" elif f_filesize > 1024**2: size_str = f"{f_filesize / 1024**2:.1f} MB" elif f_filesize > 1024: size_str = f"{f_filesize / 1024:.1f} KB" if f_ext in ["mp4", "webm"]: vcodec = f.get("vcodec", "none") acodec = f.get("acodec", "none") if vcodec != "none" and acodec != "none": video_formats.append({ "quality": f"{quality} (with audio)", "ext": f_ext, "size": size_str, "url": f_url, "type": "combined", "has_audio": True, }) elif vcodec != "none": video_formats.append({ "quality": quality, "ext": f_ext, "size": size_str, "url": f_url, "type": "video", "has_audio": False, }) elif acodec != "none": audio_formats.append({ "quality": quality, "ext": f_ext, "size": size_str, "url": f_url, "type": "audio", }) def parse_quality(f): q = f["quality"].lower() for i, res in enumerate(["4k", "2160", "1080", "720", "480", "360", "240", "144"]): if res in q: return i return 99 video_formats.sort(key=parse_quality) audio_formats.sort(key=parse_quality) return jsonify({ "success": True, "video_id": video_id, "title": title, "duration": duration, "thumbnail": thumbnail, "formats": {"video": video_formats[:10], "audio": audio_formats[:5]}, }) except Exception as e: logger.error(f"Download formats error: {e}") return jsonify({"success": False, "error": str(e)}), 500 @api_bp.route("/get_stream_info") def get_stream_info(): """Get video stream info with caching.""" video_id = request.args.get("v") if not video_id: return jsonify({"error": "No video ID"}), 400 try: conn = get_db_connection() cached = conn.execute( "SELECT data, expires_at FROM video_cache WHERE video_id = ?", (video_id,) ).fetchone() current_time = time.time() if cached: try: expires_at = float(cached["expires_at"]) if current_time < expires_at: data = json.loads(cached["data"]) conn.close() from urllib.parse import quote proxied_url = f"/video_proxy?url={quote(data['original_url'], safe='')}" data["stream_url"] = proxied_url response = jsonify(data) response.headers["X-Cache"] = "HIT" return response except (ValueError, KeyError): pass # Use YouTubeService which handles failover (Local -> Remote) info = YouTubeService.get_video_info(video_id) if not info: return jsonify({"error": "Failed to fetch video info from all engines"}), 500 stream_url = info.get("stream_url") if not stream_url: return jsonify({"error": "No stream URL found"}), 500 response_data = { "original_url": stream_url, "title": info.get("title", "Unknown"), "description": info.get("description", ""), "uploader": info.get("uploader", ""), "uploader_id": info.get("uploader_id", ""), "channel_id": info.get("channel_id", ""), "upload_date": info.get("upload_date", ""), "view_count": info.get("view_count", 0), "subtitle_url": info.get("subtitle_url"), "related": [], } from urllib.parse import quote # Encode headers into the proxy URL http_headers = info.get("http_headers", {}) header_params = "" for k, v in http_headers.items(): # Only pass critical headers that might affect access if k.lower() in ['user-agent', 'cookie', 'referer', 'origin']: header_params += f"&h_{quote(k)}={quote(v)}" proxied_url = f"/video_proxy?url={quote(stream_url, safe='')}{header_params}" response_data["stream_url"] = proxied_url # Cache it expiry = current_time + 3600 conn.execute( "INSERT OR REPLACE INTO video_cache (video_id, data, expires_at) VALUES (?, ?, ?)", (video_id, json.dumps(response_data), expiry), ) conn.commit() conn.close() response = jsonify(response_data) response.headers["X-Cache"] = "MISS" return response except Exception as e: return jsonify({"error": str(e)}), 500 @api_bp.route("/stream/qualities") def get_stream_qualities(): """Get available stream qualities for a video with proxied URLs.""" video_id = request.args.get("v") if not video_id: return jsonify({"success": False, "error": "No video ID"}), 400 try: url = f"https://www.youtube.com/watch?v={video_id}" ydl_opts = { "format": "best", "noplaylist": True, "quiet": True, "no_warnings": True, "skip_download": True, "youtube_include_dash_manifest": False, "youtube_include_hls_manifest": False, } with yt_dlp.YoutubeDL(ydl_opts) as ydl: info = ydl.extract_info(url, download=False) qualities = [] seen_resolutions = set() # Sort formats by quality (highest first) formats = info.get("formats", []) for f in formats: f_url = f.get("url", "") if not f_url or "m3u8" in f_url: continue # Only include formats with both video and audio (progressive) vcodec = f.get("vcodec", "none") acodec = f.get("acodec", "none") if vcodec == "none" or acodec == "none": continue f_ext = f.get("ext", "") if f_ext not in ["mp4", "webm"]: continue # Get resolution label height = f.get("height", 0) format_note = f.get("format_note", "") if height: label = f"{height}p" elif format_note: label = format_note else: continue # Skip duplicates if label in seen_resolutions: continue seen_resolutions.add(label) # Create proxied URL from urllib.parse import quote proxied_url = f"/video_proxy?url={quote(f_url, safe='')}" qualities.append({ "label": label, "height": height, "url": proxied_url, "ext": f_ext, }) # Sort by height descending (best first) qualities.sort(key=lambda x: x.get("height", 0), reverse=True) # Add "Auto" option at the beginning (uses best available) if qualities: auto_quality = { "label": "Auto", "height": 9999, # Highest priority "url": qualities[0]["url"], # Use best quality "ext": qualities[0]["ext"], "default": True, } qualities.insert(0, auto_quality) return jsonify({ "success": True, "video_id": video_id, "qualities": qualities[:8], # Limit to 8 options }) except Exception as e: logger.error(f"Stream qualities error: {e}") return jsonify({"success": False, "error": str(e)}), 500 @api_bp.route("/search") def search(): """Search for videos.""" query = request.args.get("q") if not query: return jsonify({"error": "No query provided"}), 400 try: # Check if URL url_match = re.match(r"(?:https?://)?(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/)([a-zA-Z0-9_-]{11})", query) if url_match: video_id = url_match.group(1) # Fetch single video info ydl_opts = { "quiet": True, "no_warnings": True, "noplaylist": True, "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36", } with yt_dlp.YoutubeDL(ydl_opts) as ydl: info = ydl.extract_info(f"https://www.youtube.com/watch?v={video_id}", download=False) return jsonify([{ "id": video_id, "title": info.get("title", "Unknown"), "uploader": info.get("uploader", "Unknown"), "thumbnail": f"https://i.ytimg.com/vi/{video_id}/hqdefault.jpg", "view_count": info.get("view_count", 0), "upload_date": info.get("upload_date", ""), "duration": None, }]) # Standard search results = fetch_videos(query, limit=20, filter_type="video") return jsonify(results) except Exception as e: logger.error(f"Search Error: {e}") return jsonify({"error": str(e)}), 500 @api_bp.route("/channel") def get_channel_videos_simple(): """Get videos from a channel.""" channel_id = request.args.get("id") filter_type = request.args.get("filter_type", "video") if not channel_id: return jsonify({"error": "No channel ID provided"}), 400 try: # Construct URL suffix = "shorts" if filter_type == "shorts" else "videos" if channel_id.startswith("UC"): url = f"https://www.youtube.com/channel/{channel_id}/{suffix}" elif channel_id.startswith("@"): url = f"https://www.youtube.com/{channel_id}/{suffix}" else: url = f"https://www.youtube.com/channel/{channel_id}/{suffix}" cmd = [ sys.executable, "-m", "yt_dlp", url, "--dump-json", "--flat-playlist", "--playlist-end", "20", "--no-warnings", ] proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) stdout, stderr = proc.communicate() videos = [] for line in stdout.splitlines(): try: v = json.loads(line) dur_str = None if v.get("duration"): m, s = divmod(int(v["duration"]), 60) h, m = divmod(m, 60) dur_str = f"{h}:{m:02d}:{s:02d}" if h else f"{m}:{s:02d}" videos.append({ "id": v.get("id"), "title": v.get("title"), "thumbnail": f"https://i.ytimg.com/vi/{v.get('id')}/mqdefault.jpg", "view_count": v.get("view_count") or 0, "duration": dur_str, "upload_date": v.get("upload_date"), "uploader": v.get("uploader") or v.get("channel") or "", }) except json.JSONDecodeError: continue return jsonify(videos) except Exception as e: logger.error(f"Channel Fetch Error: {e}") return jsonify({"error": str(e)}), 500 @api_bp.route("/trending") def trending(): """Get trending videos.""" from flask import current_app category = request.args.get("category", "all") page = int(request.args.get("page", 1)) sort = request.args.get("sort", "newest") region = request.args.get("region", "vietnam") cache_key = f"trending_{category}_{page}_{sort}_{region}" # Check cache if cache_key in API_CACHE: cached_time, cached_data = API_CACHE[cache_key] if time.time() - cached_time < CACHE_TIMEOUT: return jsonify(cached_data) try: # Category search queries queries = { "all": "trending videos 2024", "music": "music trending", "gaming": "gaming trending", "news": "news today", "tech": "technology reviews 2024", "movies": "movie trailers 2024", "sports": "sports highlights", } # For 'all' category, always fetch from multiple categories for diverse content if category == "all": region_suffix = " vietnam" if region == "vietnam" else "" # Rotate through different queries based on page for variety query_sets = [ [f"trending videos 2024{region_suffix}", f"music trending{region_suffix}", f"tech reviews 2024{region_suffix}"], [f"movie trailers 2024{region_suffix}", f"gaming trending{region_suffix}", f"sports highlights{region_suffix}"], [f"trending music 2024{region_suffix}", f"viral videos{region_suffix}", f"entertainment news{region_suffix}"], [f"tech gadgets{region_suffix}", f"comedy videos{region_suffix}", f"documentary{region_suffix}"], ] # Use different query set based on page to get variety query_index = (page - 1) % len(query_sets) current_queries = query_sets[query_index] # Calculate offset within query set start_offset = ((page - 1) // len(query_sets)) * 7 + 1 # Fetch from multiple categories in parallel with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: futures = [ executor.submit(fetch_videos, q, limit=7, filter_type="video", playlist_start=start_offset) for q in current_queries ] results = [f.result() for f in futures] # Combine all videos and deduplicate all_videos = [] seen_ids = set() for video_list in results: for vid in video_list: if vid['id'] not in seen_ids: seen_ids.add(vid['id']) all_videos.append(vid) # Shuffle for variety random.shuffle(all_videos) # Cache result API_CACHE[cache_key] = (time.time(), all_videos) return jsonify(all_videos) # Single category - support proper pagination query = queries.get(category, queries["all"]) if region == "vietnam": query += " vietnam" videos = fetch_videos(query, limit=20, filter_type="video", playlist_start=(page-1)*20+1) # Cache result API_CACHE[cache_key] = (time.time(), videos) return jsonify(videos) except Exception as e: return jsonify({"error": str(e)}), 500 @api_bp.route("/transcript") def get_transcript(): """Get video transcript (VTT).""" video_id = request.args.get("v") if not video_id: return "No video ID", 400 try: url = f"https://www.youtube.com/watch?v={video_id}" # Use yt-dlp to get subtitles cmd = [ sys.executable, "-m", "yt_dlp", url, "--write-auto-sub", "--sub-lang", "en,vi", "--skip-download", "--no-warnings", "--quiet", "--sub-format", "vtt", "--output", "CAPTIONS_%(id)s" ] # We need to run this in a temp dir or handle output names # Simplified: fetch info and get subtitle URL # Better approach: Get subtitle URL from extract_info with yt_dlp.YoutubeDL({'quiet': True, 'skip_download': True}) as ydl: info = ydl.extract_info(url, download=False) subtitles = info.get('subtitles') or info.get('automatic_captions') or {} # Prefer English, then Vietnamese, then any lang = 'en' if 'en' not in subtitles and 'vi' in subtitles: lang = 'vi' elif 'en' not in subtitles: # Pick first available langs = list(subtitles.keys()) if langs: lang = langs[0] if lang and lang in subtitles: subs_list = subtitles[lang] # Find vtt vtt_url = next((s['url'] for s in subs_list if s.get('ext') == 'vtt'), None) if not vtt_url: vtt_url = subs_list[0]['url'] # Fallback # Fetch the VTT content import requests res = requests.get(vtt_url) return Response(res.content, mimetype="text/vtt") return "No transcript available", 404 except Exception as e: logger.error(f"Transcript error: {e}") return str(e), 500 @api_bp.route("/summarize") def summarize_video(): """Get video summary from transcript using AI (Gemini) or TextRank fallback.""" video_id = request.args.get("v") video_title = request.args.get("title", "") translate_to = request.args.get("lang") # Optional: 'vi' for Vietnamese if not video_id: return jsonify({"error": "No video ID"}), 400 try: # 1. Get Transcript Text text = get_transcript_text(video_id) if not text: return jsonify({ "success": False, "error": "No transcript available to summarize." }) # 2. Use TextRank Summarizer (Gemini removed per user request) summarizer = TextRankSummarizer() summary_text = summarizer.summarize(text, num_sentences=3) # Limit to 300 characters for concise display if len(summary_text) > 300: summary_text = summary_text[:297] + "..." # Extract key points from summary (heuristic) sentences = [s.strip() for s in summary_text.split('.') if len(s.strip()) > 15] key_points = sentences[:3] # Store original versions original_summary = summary_text original_key_points = key_points.copy() if key_points else [] # 3. Translate if requested translated_summary = None translated_key_points = None if translate_to == 'vi': try: translated_summary = translate_text(summary_text, 'vi') translated_key_points = [translate_text(p, 'vi') for p in key_points] if key_points else [] except Exception as te: logger.warning(f"Translation failed: {te}") # 4. Return structured data return jsonify({ "success": True, "summary": original_summary, "key_points": original_key_points, "translated_summary": translated_summary, "translated_key_points": translated_key_points, "lang": translate_to or "en", "video_id": video_id, "ai_powered": False }) except Exception as e: logger.error(f"Summarization error: {e}") return jsonify({"success": False, "error": str(e)}) def translate_text(text, target_lang='vi'): """Translate text to target language using Google Translate.""" try: from googletrans import Translator translator = Translator() result = translator.translate(text, dest=target_lang) return result.text except Exception as e: logger.error(f"Translation error: {e}") return text # Return original text if translation fails def get_transcript_text(video_id): """ Fetch transcript using strictly YTFetcher as requested. Ensure 'ytfetcher' is up to date before usage. """ from ytfetcher import YTFetcher from ytfetcher.config import HTTPConfig import random import os import http.cookiejar try: # 1. Prepare Cookies if available # This was key to the previous success! cookie_header = "" cookies_path = os.environ.get('COOKIES_FILE', 'cookies.txt') if os.path.exists(cookies_path): try: cj = http.cookiejar.MozillaCookieJar(cookies_path) cj.load() cookies_list = [] for cookie in cj: cookies_list.append(f"{cookie.name}={cookie.value}") cookie_header = "; ".join(cookies_list) logger.info(f"Loaded {len(cookies_list)} cookies for YTFetcher") except Exception as e: logger.warning(f"Failed to process cookies: {e}") # 2. Configuration to look like a real browser user_agents = [ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0" ] headers = { "User-Agent": random.choice(user_agents), "Accept-Language": "en-US,en;q=0.9", } # Inject cookie header if we have it if cookie_header: headers["Cookie"] = cookie_header config = HTTPConfig(headers=headers) # Initialize Fetcher fetcher = YTFetcher.from_video_ids( video_ids=[video_id], http_config=config, languages=['en', 'en-US', 'vi'] ) # Fetch logger.info(f"Fetching transcript with YTFetcher for {video_id}") results = fetcher.fetch_transcripts() if results: data = results[0] # Check for transcript data if data.transcripts: logger.info("YTFetcher: Transcript found.") text_lines = [t.text.strip() for t in data.transcripts if t.text.strip()] return " ".join(text_lines) else: logger.warning("YTFetcher: No transcript in result.") except Exception as e: import traceback tb = traceback.format_exc() logger.error(f"YTFetcher Execution Failed: {e}\n{tb}") return None return None def parse_transcript_content(content): """Helper to parse VTT/XML content.""" try: # Simple VTT cleaner lines = content.splitlines() text_lines = [] seen = set() for line in lines: line = line.strip() if not line: continue if "-->" in line: continue if line.isdigit(): continue if line.startswith("WEBVTT"): continue if line.startswith("Kind:"): continue if line.startswith("Language:"): continue # Remove tags like or <00:00:00> clean = re.sub(r'<[^>]+>', '', line) if clean and clean not in seen: seen.add(clean) text_lines.append(clean) return " ".join(text_lines) except Exception as e: logger.error(f"Transcript parse error: {e}") return None @api_bp.route("/update_ytdlp", methods=["POST"]) def update_ytdlp(): """Update yt-dlp to latest version.""" try: cmd = [sys.executable, "-m", "pip", "install", "-U", "yt-dlp"] result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode == 0: ver_cmd = [sys.executable, "-m", "yt_dlp", "--version"] ver_result = subprocess.run(ver_cmd, capture_output=True, text=True) version = ver_result.stdout.strip() return jsonify({"success": True, "message": f"Updated to {version}"}) else: return jsonify({"success": False, "message": f"Update failed: {result.stderr}"}), 500 except Exception as e: return jsonify({"success": False, "message": str(e)}), 500 @api_bp.route("/update_package", methods=["POST"]) def update_package(): """Update a Python package (yt-dlp stable/nightly, ytfetcher).""" try: data = request.json or {} pkg = data.get("package", "ytdlp") version = data.get("version", "stable") if pkg == "ytdlp": if version == "nightly": # Install nightly/master from GitHub # Force reinstall and NO CACHE to ensure we get the latest commit cmd = [sys.executable, "-m", "pip", "install", "--no-cache-dir", "--force-reinstall", "-U", "https://github.com/yt-dlp/yt-dlp/archive/master.tar.gz"] else: # Install stable from PyPI cmd = [sys.executable, "-m", "pip", "install", "-U", "yt-dlp"] result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) if result.returncode == 0: ver_cmd = [sys.executable, "-m", "yt_dlp", "--version"] ver_result = subprocess.run(ver_cmd, capture_output=True, text=True) ver_str = ver_result.stdout.strip() suffix = " (nightly)" if version == "nightly" else "" return jsonify({"success": True, "message": f"yt-dlp updated to {ver_str}{suffix}"}) else: return jsonify({"success": False, "message": f"Update failed: {result.stderr[:200]}"}), 500 elif pkg == "ytfetcher": # Install/update ytfetcher from GitHub cmd = [sys.executable, "-m", "pip", "install", "-U", "git+https://github.com/kaya70875/ytfetcher.git"] result = subprocess.run(cmd, capture_output=True, text=True, timeout=120) if result.returncode == 0: return jsonify({"success": True, "message": "ytfetcher updated successfully"}) else: return jsonify({"success": False, "message": f"Update failed: {result.stderr[:200]}"}), 500 else: return jsonify({"success": False, "message": f"Unknown package: {pkg}"}), 400 except subprocess.TimeoutExpired: return jsonify({"success": False, "message": "Update timed out"}), 500 except Exception as e: return jsonify({"success": False, "message": str(e)}), 500 @api_bp.route("/comments") def get_comments(): """Get comments for a video.""" video_id = request.args.get("v") if not video_id: return jsonify({"error": "No video ID"}), 400 try: url = f"https://www.youtube.com/watch?v={video_id}" cmd = [ sys.executable, "-m", "yt_dlp", url, "--write-comments", "--skip-download", "--dump-json", ] result = subprocess.run(cmd, capture_output=True, text=True, timeout=30) if result.returncode == 0: data = json.loads(result.stdout) comments_data = data.get("comments", []) comments = [] for c in comments_data[:50]: comments.append({ "author": c.get("author", "Unknown"), "author_thumbnail": c.get("author_thumbnail", ""), "text": c.get("text", ""), "likes": c.get("like_count", 0), "time": c.get("time_text", ""), "is_pinned": c.get("is_pinned", False), }) return jsonify({"comments": comments, "count": data.get("comment_count", len(comments))}) else: return jsonify({"comments": [], "count": 0, "error": "Could not load comments"}) except subprocess.TimeoutExpired: return jsonify({"comments": [], "count": 0, "error": "Comments loading timed out"}) except Exception as e: return jsonify({"comments": [], "count": 0, "error": str(e)}) @api_bp.route("/settings", methods=["GET"]) def get_settings(): """Get all settings.""" return jsonify(SettingsService.get_all()) @api_bp.route("/package/version") def get_package_version(): """Get version of a package.""" pkg = request.args.get("package", "yt_dlp") try: if pkg == "yt_dlp" or pkg == "ytdlp": import yt_dlp version = yt_dlp.version.__version__ # Check if it looks like nightly (contains dev or current date) return jsonify({"success": True, "package": "yt-dlp", "version": version}) elif pkg == "ytfetcher": try: import ytfetcher # ytfetcher might not have __version__ exposed easily, but let's try version = getattr(ytfetcher, "__version__", "installed") return jsonify({"success": True, "package": "ytfetcher", "version": version}) except ImportError: return jsonify({"success": False, "package": "ytfetcher", "version": "not installed"}) else: return jsonify({"error": "Unknown package"}), 400 except Exception as e: return jsonify({"success": False, "error": str(e)}), 500 @api_bp.route("/settings", methods=["POST"]) def update_settings(): """Update a setting.""" data = request.json if not data or 'key' not in data or 'value' not in data: return jsonify({"error": "Invalid request"}), 400 try: SettingsService.set(data['key'], data['value']) return jsonify({"success": True}) except Exception as e: return jsonify({"error": str(e)}), 500 @api_bp.route("/settings/test", methods=["POST"]) def test_engine(): """Test the current engine configuration.""" from app.services.youtube import YouTubeService # Use a known safe video (Me at the zoo) TEST_VID = "jNQXAC9IVRw" try: # Force a fresh fetch ignoring cache logic if possible # We just call get_video_info which uses the current SettingsService engine info = YouTubeService.get_video_info(TEST_VID) if info and info.get('stream_url'): return jsonify({ "success": True, "message": f"Successfully fetched via {SettingsService.get('youtube_engine', 'auto')}", "details": { "title": info.get('title'), "engine": SettingsService.get('youtube_engine', 'auto') } }) else: return jsonify({ "success": False, "message": "Fetch returned no data" }) except Exception as e: return jsonify({"success": False, "message": str(e)})