from flask import (Flask, render_template, request, redirect, url_for, jsonify,
                   send_file, send_from_directory, Response, stream_with_context,
                   session, flash)
import os
import sys
import subprocess
import json
import re
import time
import heapq
import concurrent.futures
import requests
import sqlite3
from datetime import datetime, timedelta
from functools import wraps
from urllib.parse import quote
from werkzeug.security import generate_password_hash, check_password_hash
import yt_dlp
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
# nltk removed to avoid SSL/download issues. Using regex instead.

app = Flask(__name__)
# Required for sessions; override the default via env var in production (env var name is this app's convention)
app.secret_key = os.environ.get('KVTUBE_SECRET_KEY', 'super_secret_key_change_this')

# Ensure data directory exists for persistence
DATA_DIR = "data"
os.makedirs(DATA_DIR, exist_ok=True)

DB_NAME = os.path.join(DATA_DIR, "kvtube.db")

# --- Database Setup ---
def init_db():
    conn = sqlite3.connect(DB_NAME)
    c = conn.cursor()
    # Users table
    c.execute('''CREATE TABLE IF NOT EXISTS users (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    username TEXT UNIQUE NOT NULL,
                    password TEXT NOT NULL
                )''')
    # Saved/history table; type is 'history' or 'saved'
    c.execute('''CREATE TABLE IF NOT EXISTS user_videos (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    user_id INTEGER,
                    video_id TEXT,
                    title TEXT,
                    thumbnail TEXT,
                    type TEXT,
                    timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
                    FOREIGN KEY(user_id) REFERENCES users(id)
                )''')
    # Cache table for video metadata/streams
    c.execute('''CREATE TABLE IF NOT EXISTS video_cache (
                    video_id TEXT PRIMARY KEY,
                    data TEXT,
                    expires_at DATETIME
                )''')
    conn.commit()
    conn.close()

# Run init at import time
init_db()

# --- Auth Helpers ---
def login_required(f):
    """Redirect to /login when no user is in the session."""
    @wraps(f)
    def decorated_function(*args, **kwargs):
        if 'user_id' not in session:
            return redirect(url_for('login'))
        return f(*args, **kwargs)
    return decorated_function

def get_db_connection():
    conn = sqlite3.connect(DB_NAME)
    conn.row_factory = sqlite3.Row
    return conn

# --- Auth Routes ---
@app.route('/login', methods=['GET', 'POST'])
def login():
    if request.method == 'POST':
        username = request.form['username']
        password = request.form['password']

        conn = get_db_connection()
        user = conn.execute('SELECT * FROM users WHERE username = ?', (username,)).fetchone()
        conn.close()

        if user and check_password_hash(user['password'], password):
            session['user_id'] = user['id']
            session['username'] = user['username']
            return redirect(url_for('index'))
        else:
            flash('Invalid username or password')

    return render_template('login.html')

@app.route('/register', methods=['GET', 'POST'])
def register():
    if request.method == 'POST':
        username = request.form['username']
        password = request.form['password']
        hashed_pw = generate_password_hash(password)

        try:
            conn = get_db_connection()
            conn.execute('INSERT INTO users (username, password) VALUES (?, ?)', (username, hashed_pw))
            conn.commit()
            conn.close()
            flash('Registration successful! Please login.')
            return redirect(url_for('login'))
        except sqlite3.IntegrityError:
            flash('Username already exists')

    return render_template('register.html')

@app.route('/api/update_profile', methods=['POST'])
@login_required
def update_profile():
    data = request.json
    new_username = data.get('username')

    if not new_username:
        return jsonify({'success': False, 'message': 'Username is required'}), 400

    try:
        conn = get_db_connection()
        conn.execute('UPDATE users SET username = ? WHERE id = ?',
                     (new_username, session['user_id']))
        conn.commit()
        conn.close()

        session['username'] = new_username
        return jsonify({'success': True, 'message': 'Profile updated'})
    except Exception as e:
        return jsonify({'success': False, 'message': str(e)}), 500

@app.route('/logout')
def logout():
    session.clear()
    return redirect(url_for('index'))
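
# Illustrative request for /api/update_profile (values are made up):
#   POST /api/update_profile with body {"username": "new_name"}
#   -> {"success": true, "message": "Profile updated"}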

@app.template_filter('format_views')
def format_views(views):
    if not views: return '0'
    try:
        num = int(views)
        if num >= 1000000: return f"{num / 1000000:.1f}M"
        if num >= 1000: return f"{num / 1000:.0f}K"
        return f"{num:,}"
    except (ValueError, TypeError):
        return str(views)

@app.template_filter('format_date')
def format_date(value):
    if not value: return 'Recently'
    try:
        # Handle YYYYMMDD
        if len(str(value)) == 8 and str(value).isdigit():
            dt = datetime.strptime(str(value), '%Y%m%d')
        # Handle unix timestamp
        elif isinstance(value, (int, float)):
            dt = datetime.fromtimestamp(value)
        # Handle already formatted (YYYY-MM-DD)
        else:
            try:
                dt = datetime.strptime(str(value), '%Y-%m-%d')
            except ValueError:
                return str(value)

        now = datetime.now()
        diff = now - dt

        if diff.days > 365:
            return f"{diff.days // 365} years ago"
        if diff.days > 30:
            return f"{diff.days // 30} months ago"
        if diff.days > 0:
            return f"{diff.days} days ago"
        if diff.seconds > 3600:
            return f"{diff.seconds // 3600} hours ago"
        return "Just now"
    except Exception:
        return str(value)
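
# Illustrative template usage for the two Jinja filters above:
#   {{ video.view_count | format_views }}   e.g. 1234567 -> "1.2M"
#   {{ video.upload_date | format_date }}   e.g. "20240101" -> a relative time like "3 days ago"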

# Configuration for local video path - configurable via env var
VIDEO_DIR = os.environ.get('KVTUBE_VIDEO_DIR', './videos')

@app.route('/')
def index():
    return render_template('index.html', page='home')

@app.route('/my-videos')
def my_videos():
    filter_type = request.args.get('type', 'history')  # 'saved' or 'history'

    videos = []
    logged_in = 'user_id' in session

    if logged_in:
        conn = get_db_connection()
        videos = conn.execute('''
            SELECT * FROM user_videos
            WHERE user_id = ? AND type = ?
            ORDER BY timestamp DESC
        ''', (session['user_id'], filter_type)).fetchall()
        conn.close()

    return render_template('my_videos.html', videos=videos, filter_type=filter_type, logged_in=logged_in)

@app.route('/api/save_video', methods=['POST'])
@login_required
def save_video():
    data = request.json
    video_id = data.get('id')
    title = data.get('title')
    thumbnail = data.get('thumbnail')
    action_type = data.get('type', 'history')  # 'history' or 'saved'

    conn = get_db_connection()

    # Deduplicate 'saved' entries; history entries are allowed to repeat
    if action_type == 'saved':
        exists = conn.execute('SELECT id FROM user_videos WHERE user_id = ? AND video_id = ? AND type = ?',
                              (session['user_id'], video_id, 'saved')).fetchone()
        if exists:
            conn.close()
            return jsonify({'status': 'already_saved'})

    conn.execute('INSERT INTO user_videos (user_id, video_id, title, thumbnail, type) VALUES (?, ?, ?, ?, ?)',
                 (session['user_id'], video_id, title, thumbnail, action_type))
    conn.commit()
    conn.close()
    return jsonify({'status': 'success'})
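
# Illustrative payload for /api/save_video (values are made up):
#   POST /api/save_video
#   {"id": "dQw4w9WgXcQ", "title": "Some video", "thumbnail": "https://...", "type": "saved"}
#   -> {"status": "success"} or {"status": "already_saved"}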

@app.route('/stream/<path:filename>')
def stream_local(filename):
    return send_from_directory(VIDEO_DIR, filename)

@app.route('/settings')
def settings():
    return render_template('settings.html', page='settings')

@app.route('/video_proxy')
def video_proxy():
    url = request.args.get('url')
    if not url:
        return "No URL provided", 400

    # Forward headers to mimic a browser and support seeking
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    }

    # Forward Range requests so scrubbing works
    range_header = request.headers.get('Range')
    if range_header:
        headers['Range'] = range_header

    try:
        req = requests.get(url, headers=headers, stream=True, timeout=30)

        # HLS (M3U8) manifest rewriting - needed for 1080p+ and proper A/V sync.
        # Every segment/sub-playlist URL in the manifest is rewritten to go back
        # through this proxy.
        content_type = req.headers.get('content-type', '').lower()
        # Strip query params before checking the extension
        url_path = url.split('?')[0]
        is_manifest = (url_path.endswith('.m3u8') or
                       'application/x-mpegurl' in content_type or
                       'application/vnd.apple.mpegurl' in content_type)

        if is_manifest:
            content = req.text
            base_url = url.rsplit('/', 1)[0]
            new_lines = []

            for line in content.splitlines():
                if line.strip() and not line.startswith('#'):
                    # A segment or sub-playlist; make relative URLs absolute
                    if not line.startswith('http'):
                        full_url = f"{base_url}/{line}"
                    else:
                        full_url = line

                    # Route it back through this proxy (quote with safe='' so the
                    # nested URL survives as a single query parameter)
                    new_lines.append(f"/video_proxy?url={quote(full_url, safe='')}")
                else:
                    new_lines.append(line)

            return Response('\n'.join(new_lines), content_type='application/vnd.apple.mpegurl')

        # Standard binary stream proxy.
        # Exclude hop-by-hop headers that would confuse the browser/Flask.
        excluded_headers = ['content-encoding', 'content-length', 'transfer-encoding', 'connection']
        response_headers = [(name, value) for (name, value) in req.headers.items()
                            if name.lower() not in excluded_headers]

        return Response(stream_with_context(req.iter_content(chunk_size=8192)),
                        status=req.status_code,
                        headers=response_headers,
                        content_type=req.headers.get('content-type'))
    except Exception as e:
        print(f"Proxy Error: {e}")
        return str(e), 500
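
# Illustrative manifest rewrite performed above (URLs are made up):
#   input line:  segment_001.ts
#   output line: /video_proxy?url=https%3A%2F%2Fexample.com%2Fhls%2Fsegment_001.ts
# Comment/tag lines beginning with '#' pass through untouched.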

@app.route('/watch')
def watch():
    video_id = request.args.get('v')
    local_file = request.args.get('local')

    if local_file:
        return render_template('watch.html', video_type='local', src=url_for('stream_local', filename=local_file), title=local_file)

    if not video_id:
        return "No video ID provided", 400
    return render_template('watch.html', video_type='youtube', video_id=video_id)

@app.route('/channel/<channel_id>')
def channel(channel_id):
    if not channel_id:
        return redirect(url_for('index'))

    try:
        # Robustness: resolve a plain name to a channel ID if needed (metadata-only fetch)
        real_id_or_url = channel_id
        is_search_fallback = False

        if not channel_id.startswith('UC') and not channel_id.startswith('@'):
            # Resolve via a one-result search
            search_cmd = [
                sys.executable, '-m', 'yt_dlp',
                f'ytsearch1:{channel_id}',
                '--dump-json',
                '--default-search', 'ytsearch',
                '--no-playlist'
            ]
            try:
                proc_search = subprocess.run(search_cmd, capture_output=True, text=True)
                if proc_search.returncode == 0:
                    first_result = json.loads(proc_search.stdout.splitlines()[0])
                    if first_result.get('channel_id'):
                        real_id_or_url = first_result.get('channel_id')
                        is_search_fallback = True
            except Exception:
                pass

        # Basic channel info (avatar/banner); a 1-item flat-playlist fetch is
        # enough to get the channel dict
        channel_info = {
            'id': real_id_or_url,  # Resolved ID, used for API calls
            'title': channel_id if not is_search_fallback else 'Loading...',
            'avatar': None,
            'banner': None,
            'subscribers': None
        }

        # Build the target URL for the metadata fetch
        target_url = real_id_or_url
        if target_url.startswith('UC'):
            target_url = f'https://www.youtube.com/channel/{target_url}'
        elif target_url.startswith('@'):
            target_url = f'https://www.youtube.com/{target_url}'

        cmd = [
            sys.executable, '-m', 'yt_dlp',
            target_url,
            '--dump-json',
            '--flat-playlist',
            '--playlist-end', '1',  # One entry is enough for metadata
            '--no-warnings'
        ]

        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        stdout, stderr = proc.communicate()

        if stdout:
            try:
                first = json.loads(stdout.splitlines()[0])
                channel_info['title'] = first.get('channel') or first.get('uploader') or channel_info['title']
                channel_info['id'] = first.get('channel_id') or channel_info['id']
                # Avatar/banner are often absent from flat dumps; title/id are what matter
            except Exception:
                pass

        # Render the shell; videos are fetched client-side via JS
        return render_template('channel.html', channel=channel_info)

    except Exception as e:
        return f"Error loading channel: {str(e)}", 500

@app.route('/api/related')
def get_related_videos():
    video_id = request.args.get('v')
    title = request.args.get('title')
    page = int(request.args.get('page', 1))
    limit = int(request.args.get('limit', 10))

    if not title and not video_id:
        return jsonify({'error': 'Video ID or Title required'}), 400

    try:
        query = f"{title} related" if title else f"{video_id} related"

        # Pagination: page 1 covers results 1-10 (usually already fetched by
        # get_stream_info), page 2 covers 11-20, and so on.
        start = (page - 1) * limit
        end = start + limit

        videos = fetch_videos(query, limit=limit, playlist_start=start + 1, playlist_end=end)
        return jsonify(videos)
    except Exception as e:
        print(f"Error fetching related: {e}")
        return jsonify({'error': str(e)}), 500
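
# Illustrative request for /api/related (values are made up):
#   GET /api/related?title=lofi+beats&page=2&limit=10
#   -> JSON list for search results 11-20 of "lofi beats related"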

@app.route('/api/download')
def get_download_url():
    """Get a direct MP4 download URL for a video."""
    video_id = request.args.get('v')
    if not video_id:
        return jsonify({'error': 'No video ID'}), 400

    try:
        url = f"https://www.youtube.com/watch?v={video_id}"

        # Prefer progressive-download formats and avoid HLS/DASH manifests (m3u8)
        ydl_opts = {
            'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best[protocol!*=m3u8]/best',
            'noplaylist': True,
            'quiet': True,
            'no_warnings': True,
            'skip_download': True,
            'youtube_include_dash_manifest': False,  # Avoid DASH
            'youtube_include_hls_manifest': False,   # Avoid HLS
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)

        # Try to get a URL that is NOT an m3u8 manifest
        download_url = info.get('url', '')

        # If it is still m3u8 (or missing), scan the formats list directly
        if '.m3u8' in download_url or not download_url:
            formats = info.get('formats', [])
            # Find the best non-HLS MP4 format (formats are ordered worst-to-best)
            for f in reversed(formats):
                f_url = f.get('url', '')
                f_ext = f.get('ext', '')
                f_protocol = f.get('protocol', '')
                if f_url and 'm3u8' not in f_url and 'm3u8' not in f_protocol and f_ext == 'mp4':
                    download_url = f_url
                    break

        title = info.get('title', 'video')

        if download_url and '.m3u8' not in download_url:
            return jsonify({
                'url': download_url,
                'title': title,
                'ext': 'mp4'
            })
        else:
            # Fallback: return the YouTube link for manual download
            return jsonify({
                'error': 'Direct download not available. Try a video downloader site.',
                'fallback_url': url
            }), 200

    except Exception as e:
        print(f"Download URL error: {e}")
        return jsonify({'error': str(e)}), 500
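
# Illustrative responses for /api/download (values are made up):
#   success:  {"url": "https://<direct-stream-host>/...", "title": "Some video", "ext": "mp4"}
#   fallback: {"error": "Direct download not available. ...", "fallback_url": "https://www.youtube.com/watch?v=..."}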

@app.route('/api/channel/videos')
def get_channel_videos():
    channel_id = request.args.get('id')
    page = int(request.args.get('page', 1))
    limit = int(request.args.get('limit', 20))
    sort_mode = request.args.get('sort', 'latest')
    filter_type = request.args.get('filter_type', 'video')  # 'video' or 'shorts'

    if not channel_id:
        return jsonify([])

    try:
        # Playlist range for this page
        start = (page - 1) * limit + 1
        end = start + limit - 1

        # Build the channel URL from the ID type and tab filter
        if channel_id.startswith('UC'):
            base_url = f'https://www.youtube.com/channel/{channel_id}'
        elif channel_id.startswith('@'):
            base_url = f'https://www.youtube.com/{channel_id}'
        else:
            base_url = f'https://www.youtube.com/channel/{channel_id}'  # Fallback

        target_url = base_url
        if filter_type == 'shorts':
            target_url += '/shorts'
        elif filter_type == 'video':
            target_url += '/videos'

        playlist_args = ['--playlist-start', str(start), '--playlist-end', str(end)]

        if sort_mode == 'oldest':
            playlist_args = ['--playlist-reverse', '--playlist-start', str(start), '--playlist-end', str(end)]
        elif sort_mode == 'popular':
            # YouTube's "popular" ordering isn't exposed through yt-dlp's channel
            # extraction (it would require scraping /videos?sort=p), and a
            # Python-side sort can't work across strict pagination ranges.
            # So 'popular' currently falls back to the default "latest" order.
            pass

        cmd = [
            sys.executable, '-m', 'yt_dlp',
            target_url,
            '--dump-json',
            '--flat-playlist',
            '--no-warnings'
        ] + playlist_args

        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        stdout, stderr = proc.communicate()

        videos = []
        for line in stdout.splitlines():
            try:
                v = json.loads(line)
                dur_str = None
                if v.get('duration'):
                    m, s = divmod(int(v['duration']), 60)
                    h, m = divmod(m, 60)
                    dur_str = f"{h}:{m:02d}:{s:02d}" if h else f"{m}:{s:02d}"

                videos.append({
                    'id': v.get('id'),
                    'title': v.get('title'),
                    'thumbnail': f"https://i.ytimg.com/vi/{v.get('id')}/mqdefault.jpg",
                    'view_count': v.get('view_count') or 0,
                    'duration': dur_str,
                    'upload_date': v.get('upload_date'),
                    'uploader': v.get('uploader'),
                    'channel_id': v.get('channel_id') or channel_id
                })
            except Exception:
                continue

        return jsonify(videos)
    except Exception as e:
        print(f"API Error: {e}")
        return jsonify([])
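
# Illustrative request for /api/channel/videos (ID is made up):
#   GET /api/channel/videos?id=UCxxxxxxxx&page=2&limit=20&filter_type=video
#   -> JSON list of entries 21-40 from the channel's /videos tab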

@app.route('/api/get_stream_info')
def get_stream_info():
    video_id = request.args.get('v')
    if not video_id:
        return jsonify({'error': 'No video ID'}), 400

    try:
        # 1. Check cache
        conn = get_db_connection()
        cached = conn.execute('SELECT data, expires_at FROM video_cache WHERE video_id = ?', (video_id,)).fetchone()

        current_time = time.time()
        if cached:
            # expires_at is stored as a unix timestamp for simplicity
            try:
                expires_at = float(cached['expires_at'])
                if current_time < expires_at:
                    data = json.loads(cached['data'])
                    conn.close()
                    # Reconstruct the proxied URL from the cached original to be safe
                    data['stream_url'] = f"/video_proxy?url={quote(data['original_url'], safe='')}"

                    # Cache-hit header for debugging
                    response = jsonify(data)
                    response.headers['X-Cache'] = 'HIT'
                    return response
            except (ValueError, TypeError, KeyError, json.JSONDecodeError):
                pass  # Invalid cache entry; fall through to a fresh fetch

        # 2. Fetch from YouTube via the yt_dlp library
        url = f"https://www.youtube.com/watch?v={video_id}"

        ydl_opts = {
            'format': 'best[ext=mp4]/best',
            'noplaylist': True,
            'quiet': True,
            'no_warnings': True,
            'skip_download': True,
            'force_ipv4': True,
            'socket_timeout': 10,
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            try:
                info = ydl.extract_info(url, download=False)
            except Exception as e:
                print(f"❌ yt-dlp error for {video_id}: {str(e)}")
                return jsonify({'error': 'Stream extraction failed'}), 500

        stream_url = info.get('url')
        if not stream_url:
            return jsonify({'error': 'No stream URL found in metadata'}), 500

        # Related videos: fall back to a search on "<title> related"
        related_videos = []
        try:
            search_query = f"{info.get('title', '')} related"
            related_videos = fetch_videos(search_query, limit=20)
        except Exception:
            pass

        # Subtitles: prefer manual English, then Vietnamese, then auto captions
        subtitle_url = None

        subs = info.get('subtitles') or {}
        auto_subs = info.get('automatic_captions') or {}

        # DEBUG: print subtitle availability
        print(f"Checking subtitles for {video_id}")
        print(f"Manual subs keys: {list(subs.keys())}")
        print(f"Auto subs keys: {list(auto_subs.keys())}")

        if 'en' in subs:
            subtitle_url = subs['en'][0]['url']
        elif 'vi' in subs:  # Vietnamese fallback
            subtitle_url = subs['vi'][0]['url']
        elif 'en' in auto_subs:  # Auto captions are usually available
            subtitle_url = auto_subs['en'][0]['url']
        elif 'vi' in auto_subs:
            subtitle_url = auto_subs['vi'][0]['url']

        # Still nothing: take the first available manual track, then auto
        if not subtitle_url:
            if subs:
                first_key = list(subs.keys())[0]
                subtitle_url = subs[first_key][0]['url']
            elif auto_subs:
                first_key = list(auto_subs.keys())[0]
                subtitle_url = auto_subs[first_key][0]['url']

        print(f"Selected subtitle URL: {subtitle_url}")

        # 3. Construct the response data
        response_data = {
            'original_url': stream_url,
            'title': info.get('title', 'Unknown Title'),
            'description': info.get('description', ''),
            'uploader': info.get('uploader', ''),
            'upload_date': info.get('upload_date', ''),
            'view_count': info.get('view_count', 0),
            'related': related_videos,
            'subtitle_url': subtitle_url
        }

        # 4. Cache it for 1 hour (YouTube stream URLs usually expire in ~6 hours)
        expiry = current_time + 3600
        conn.execute('INSERT OR REPLACE INTO video_cache (video_id, data, expires_at) VALUES (?, ?, ?)',
                     (video_id, json.dumps(response_data), expiry))
        conn.commit()
        conn.close()

        # 5. Return the response with a proxied stream URL
        response_data['stream_url'] = f"/video_proxy?url={quote(stream_url, safe='')}"

        response = jsonify(response_data)
        response.headers['X-Cache'] = 'MISS'
        return response

    except Exception as e:
        return jsonify({'error': str(e)}), 500
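
# Illustrative cache behavior for /api/get_stream_info:
#   first call:     X-Cache: MISS (extracts via yt-dlp, caches for 3600 s)
#   within 1 hour:  X-Cache: HIT  (served from the video_cache table)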

@app.route('/api/search')
def search():
    query = request.args.get('q')
    if not query:
        return jsonify({'error': 'No query provided'}), 400

    try:
        # Check whether the query is a YouTube URL
        # (catches youtube.com/watch?v=, youtu.be/, shorts/, etc.)
        youtube_regex = r'(https?://)?(www\.)?(youtube\.com/watch\?v=|youtu\.be/|youtube\.com/shorts/)([\w-]+)'
        match = re.search(youtube_regex, query)

        if match:
            video_id = match.group(4)
            # Fetch metadata for the exact video
            meta_cmd = [sys.executable, '-m', 'yt_dlp', '--dump-json', '--no-playlist', f'https://www.youtube.com/watch?v={video_id}']
            meta_proc = subprocess.run(meta_cmd, capture_output=True, text=True)

            results = []
            search_title = ""

            if meta_proc.returncode == 0:
                data = json.loads(meta_proc.stdout)
                search_title = data.get('title', '')

                # Format duration
                duration_secs = data.get('duration')
                if duration_secs:
                    mins, secs = divmod(int(duration_secs), 60)
                    hours, mins = divmod(mins, 60)
                    duration = f"{hours}:{mins:02d}:{secs:02d}" if hours else f"{mins}:{secs:02d}"
                else:
                    duration = None

                # The exact match goes first
                results.append({
                    'id': data.get('id'),
                    'title': data.get('title', 'Unknown'),
                    'uploader': data.get('uploader') or data.get('channel') or 'Unknown',
                    'thumbnail': f"https://i.ytimg.com/vi/{data.get('id')}/hqdefault.jpg",
                    'view_count': data.get('view_count', 0),
                    'upload_date': data.get('upload_date', ''),
                    'duration': duration,
                    'is_exact_match': True  # Flag for frontend highlighting
                })

            # Then fetch related/similar videos using the title
            if search_title:
                rel_cmd = [
                    sys.executable, '-m', 'yt_dlp',
                    f'ytsearch19:{search_title}',  # 19 more for ~20 total
                    '--dump-json',
                    '--default-search', 'ytsearch',
                    '--no-playlist',
                    '--flat-playlist'
                ]
                rel_proc = subprocess.Popen(rel_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
                stdout, _ = rel_proc.communicate()

                for line in stdout.splitlines():
                    try:
                        r_data = json.loads(line)
                        r_id = r_data.get('id')
                        # Don't duplicate the exact match
                        if r_id != video_id:
                            # Format duration (duplicated logic; could be a shared helper)
                            r_dur = r_data.get('duration')
                            if r_dur:
                                m, s = divmod(int(r_dur), 60)
                                h, m = divmod(m, 60)
                                dur_str = f"{h}:{m:02d}:{s:02d}" if h else f"{m}:{s:02d}"
                            else:
                                dur_str = None

                            results.append({
                                'id': r_id,
                                'title': r_data.get('title', 'Unknown'),
                                'uploader': r_data.get('uploader') or r_data.get('channel') or 'Unknown',
                                'thumbnail': f"https://i.ytimg.com/vi/{r_id}/hqdefault.jpg",
                                'view_count': r_data.get('view_count', 0),
                                'upload_date': r_data.get('upload_date', ''),
                                'duration': dur_str
                            })
                    except Exception:
                        continue

            return jsonify(results)

        else:
            # Standard text search
            cmd = [
                sys.executable, '-m', 'yt_dlp',
                f'ytsearch20:{query}',
                '--dump-json',
                '--default-search', 'ytsearch',
                '--no-playlist',
                '--flat-playlist'
            ]

            process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            stdout, stderr = process.communicate()

            results = []
            for line in stdout.splitlines():
                try:
                    data = json.loads(line)
                    video_id = data.get('id')
                    if video_id:
                        # Format duration
                        duration_secs = data.get('duration')
                        if duration_secs:
                            mins, secs = divmod(int(duration_secs), 60)
                            hours, mins = divmod(mins, 60)
                            duration = f"{hours}:{mins:02d}:{secs:02d}" if hours else f"{mins}:{secs:02d}"
                        else:
                            duration = None

                        results.append({
                            'id': video_id,
                            'title': data.get('title', 'Unknown'),
                            'uploader': data.get('uploader') or data.get('channel') or 'Unknown',
                            'thumbnail': f"https://i.ytimg.com/vi/{video_id}/hqdefault.jpg",
                            'view_count': data.get('view_count', 0),
                            'upload_date': data.get('upload_date', ''),
                            'duration': duration
                        })
                except Exception:
                    continue

            return jsonify(results)
    except Exception as e:
        return jsonify({'error': str(e)}), 500

# --- Helper: Extractive Summarization ---
def extractive_summary(text, num_sentences=5):
    # 1. Clean the text: drop markers like [Music] (common in auto captions)
    clean_text = re.sub(r'\[.*?\]', '', text)
    clean_text = clean_text.replace('\n', ' ')

    # 2. Split into sentences (simple punctuation-based split)
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', clean_text)

    # 3. Tokenize and count word frequencies, skipping stop words
    word_frequencies = {}
    stop_words = set(['the', 'a', 'an', 'and', 'or', 'but', 'is', 'are', 'was', 'were', 'to', 'of', 'in', 'on', 'at', 'for', 'with', 'that', 'this', 'it', 'you', 'i', 'we', 'they', 'he', 'she'])

    for word in re.findall(r'\w+', clean_text.lower()):
        if word not in stop_words:
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

    if not word_frequencies:
        return "Not enough content to summarize."

    # Normalize frequencies to [0, 1]
    max_freq = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / max_freq

    # 4. Score each sentence as the sum of its word frequencies
    sentence_scores = {}
    for sent in sentences:
        for word in re.findall(r'\w+', sent.lower()):
            if word in word_frequencies:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[word]

    # 5. Extract the top N sentences
    summary_sentences = heapq.nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    return ' '.join(summary_sentences)
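
# Illustrative use of the scorer above (input text is made up):
#   extractive_summary("Cats purr. Cats sleep. Cats play. Dogs bark.", 1)
# would pick one of the "Cats ..." sentences, since "cats" has the highest
# normalized frequency and sentence scores are sums of word frequencies.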

@app.route('/api/summarize')
def summarize_video():
    video_id = request.args.get('v')
    if not video_id:
        return jsonify({'error': 'No video ID'}), 400

    try:
        # Fetch the available transcripts
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

        # Prefer a manually created English/Vietnamese transcript, then auto-generated
        try:
            transcript = transcript_list.find_transcript(['en', 'vi'])
        except Exception:
            transcript = transcript_list.find_generated_transcript(['en', 'vi'])

        transcript_data = transcript.fetch()

        # Combine the entries into one text blob
        full_text = " ".join([entry['text'] for entry in transcript_data])

        # Summarize
        summary = extractive_summary(full_text, num_sentences=7)

        return jsonify({'success': True, 'summary': summary})

    except TranscriptsDisabled:
        return jsonify({'success': False, 'message': 'Subtitles are disabled for this video.'})
    except Exception as e:
        return jsonify({'success': False, 'message': f'Could not summarize: {str(e)}'})

# Helper to fetch videos via a yt-dlp search (not a route)
def fetch_videos(query, limit=20, filter_type=None, playlist_start=1, playlist_end=None):
    try:
        # Default the end of the window to start + limit - 1
        if not playlist_end:
            playlist_end = playlist_start + limit - 1

        cmd = [
            sys.executable, '-m', 'yt_dlp',
            f'ytsearch{playlist_end}:{query}',  # Request enough items to cover the window up to 'end'
            '--dump-json',
            '--default-search', 'ytsearch',
            '--no-playlist',
            '--flat-playlist',
            '--playlist-start', str(playlist_start),
            '--playlist-end', str(playlist_end)
        ]

        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        stdout, stderr = process.communicate()

        results = []
        for line in stdout.splitlines():
            try:
                data = json.loads(line)
                video_id = data.get('id')
                if video_id:
                    duration_secs = data.get('duration')

                    # Filter: 'video' drops clips <= 60 s, 'short' drops clips > 60 s
                    if filter_type == 'video' and duration_secs and int(duration_secs) <= 60:
                        continue
                    if filter_type == 'short' and duration_secs and int(duration_secs) > 60:
                        continue

                    if duration_secs:
                        mins, secs = divmod(int(duration_secs), 60)
                        hours, mins = divmod(mins, 60)
                        duration = f"{hours}:{mins:02d}:{secs:02d}" if hours else f"{mins}:{secs:02d}"
                    else:
                        duration = None

                    results.append({
                        'id': video_id,
                        'title': data.get('title', 'Unknown'),
                        'uploader': data.get('uploader') or data.get('channel') or 'Unknown',
                        'channel_id': data.get('channel_id'),
                        'uploader_id': data.get('uploader_id'),
                        'thumbnail': f"https://i.ytimg.com/vi/{video_id}/hqdefault.jpg",
                        'view_count': data.get('view_count', 0),
                        'upload_date': data.get('upload_date', ''),
                        'duration': duration
                    })
            except Exception:
                continue
        return results
    except Exception as e:
        print(f"Error fetching videos: {e}")
        return []
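
# Illustrative call of the helper above:
#   fetch_videos("lofi beats", limit=10, filter_type='video', playlist_start=11)
# runs "ytsearch20:lofi beats" and returns dicts for results 11-20,
# skipping anything 60 seconds or shorter.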

@app.route('/api/trending')
def trending():
    try:
        category = request.args.get('category', 'all')  # 'all' drives the home feed
        page = int(request.args.get('page', 1))
        sort = request.args.get('sort', 'month')
        region = request.args.get('region', 'vietnam')
        limit = 120 if category != 'all' else 20  # 120 for the grid view, 20 per home section

        # Helper to build the search query
        def get_query(cat, reg, s_sort):
            if reg == 'vietnam':
                queries = {
                    'general': 'trending vietnam',
                    'tech': 'AI tools software tech review IT việt nam',
                    'all': 'trending vietnam',
                    'music': 'nhạc việt trending',
                    'gaming': 'gaming việt nam',
                    'movies': 'phim việt nam',
                    'news': 'tin tức việt nam hôm nay',
                    'sports': 'thể thao việt nam',
                    'shorts': 'trending việt nam',
                    'trending': 'trending việt nam',
                    'podcasts': 'podcast việt nam',
                    'live': 'live stream việt nam'
                }
            else:
                queries = {
                    'general': 'trending',
                    'tech': 'AI tools software tech review IT',
                    'all': 'trending',
                    'music': 'music trending',
                    'gaming': 'gaming trending',
                    'movies': 'movies trending',
                    'news': 'news today',
                    'sports': 'sports highlights',
                    'shorts': 'trending',
                    'trending': 'trending now',
                    'podcasts': 'podcast trending',
                    'live': 'live stream'
                }

            base = queries.get(cat, 'trending')

            three_months_ago = (datetime.now() - timedelta(days=90)).strftime('%Y-%m-%d')

            sort_filters = {
                'day': ', today',
                'week': ', this week',
                'month': ', this month',
                '3months': f" after:{three_months_ago}",
                'year': ', this year'
            }
            return base + sort_filters.get(s_sort, f" after:{three_months_ago}")

        # === Parallel fetching for the home feed ===
        if category == 'all':
            sections_to_fetch = [
                {'id': 'trending', 'title': 'Trending Now', 'icon': 'fire'},
                {'id': 'tech', 'title': 'AI & Tech', 'icon': 'microchip'},
                {'id': 'music', 'title': 'Music', 'icon': 'music'},
                {'id': 'gaming', 'title': 'Gaming', 'icon': 'gamepad'},
                {'id': 'movies', 'title': 'Movies', 'icon': 'film'},
                {'id': 'sports', 'title': 'Sports', 'icon': 'football-ball'},
                {'id': 'news', 'title': 'News', 'icon': 'newspaper'}
            ]

            def fetch_section(section):
                q = get_query(section['id'], region, sort)
                # Fetch 25 per section, keep 20 (page 1 implied, start=1)
                vids = fetch_videos(q, limit=25, filter_type='video', playlist_start=1)
                return {
                    'id': section['id'],
                    'title': section['title'],
                    'icon': section['icon'],
                    'videos': vids[:20]
                }

            with concurrent.futures.ThreadPoolExecutor(max_workers=7) as executor:
                results = list(executor.map(fetch_section, sections_to_fetch))

            return jsonify({'mode': 'sections', 'data': results})

        # === Standard single-category fetch ===
        query = get_query(category, region, sort)

        # Pagination offset
        start = (page - 1) * limit + 1

        # Shorts vs regular videos
        is_shorts_req = request.args.get('shorts')
        if is_shorts_req:
            filter_mode = 'short'
        else:
            filter_mode = 'short' if category == 'shorts' else 'video'

        results = fetch_videos(query, limit=limit, filter_type=filter_mode, playlist_start=start)
        return jsonify(results)

    except Exception as e:
        return jsonify({'error': str(e)}), 500
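
# Illustrative query construction by get_query above ('global' stands in for
# any non-'vietnam' region value):
#   get_query('music', 'vietnam', 'week')  -> "nhạc việt trending, this week"
#   get_query('tech', 'global', '3months') -> "AI tools software tech review IT after:YYYY-MM-DD"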

@app.route('/api/update_ytdlp', methods=['POST'])
def update_ytdlp():
    try:
        # Run: pip install -U yt-dlp
        cmd = [sys.executable, '-m', 'pip', 'install', '-U', 'yt-dlp']
        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode == 0:
            # Report the new version
            ver_cmd = [sys.executable, '-m', 'yt_dlp', '--version']
            ver_result = subprocess.run(ver_cmd, capture_output=True, text=True)
            version = ver_result.stdout.strip()
            return jsonify({'success': True, 'message': f'Updated successfully to {version}'})
        else:
            return jsonify({'success': False, 'message': f'Update failed: {result.stderr}'}), 500
    except Exception as e:
        return jsonify({'success': False, 'message': str(e)}), 500

@app.route('/api/comments')
def get_comments():
    """Get comments for a YouTube video."""
    video_id = request.args.get('v')
    if not video_id:
        return jsonify({'error': 'No video ID'}), 400

    try:
        url = f"https://www.youtube.com/watch?v={video_id}"
        cmd = [
            sys.executable, '-m', 'yt_dlp',
            url,
            '--write-comments',
            '--skip-download',
            '--dump-json'
        ]

        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)

        if result.returncode == 0:
            data = json.loads(result.stdout)
            comments_data = data.get('comments', [])

            # Format comments for the frontend, capped at 50
            comments = []
            for c in comments_data[:50]:
                comments.append({
                    'author': c.get('author', 'Unknown'),
                    'author_thumbnail': c.get('author_thumbnail', ''),
                    'text': c.get('text', ''),
                    'likes': c.get('like_count', 0),
                    'time': c.get('time_text', ''),
                    'is_pinned': c.get('is_pinned', False)
                })

            return jsonify({
                'comments': comments,
                'count': data.get('comment_count', len(comments))
            })
        else:
            return jsonify({'comments': [], 'count': 0, 'error': 'Could not load comments'})

    except subprocess.TimeoutExpired:
        return jsonify({'comments': [], 'count': 0, 'error': 'Comments loading timed out'})
    except Exception as e:
        return jsonify({'comments': [], 'count': 0, 'error': str(e)})
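
# Illustrative response shape for /api/comments (values are made up):
#   {"comments": [{"author": "someone", "text": "nice video", "likes": 3, ...}],
#    "count": 128}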

if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=5001)