# kv-tube/debug_transcript.py

import yt_dlp
import requests
import json
import traceback

def _parse_json3_subtitles(data):
    """Parse YouTube json3 subtitle format into simplified format"""
    transcript = []
    events = data.get('events', [])

    for event in events:
        # Skip non-text events
        if 'segs' not in event:
            continue

        start_ms = event.get('tStartMs', 0)
        duration_ms = event.get('dDurationMs', 0)

        # Combine all segments in this event
        text_parts = []
        for seg in event.get('segs', []):
            text = seg.get('utf8', '')
            if text and text.strip():
                text_parts.append(text)

        combined_text = ''.join(text_parts).strip()
        if combined_text:
            transcript.append({
                'text': combined_text,
                'start': start_ms / 1000.0,  # Convert to seconds
                'duration': duration_ms / 1000.0 if duration_ms else 2.0  # Default 2s
            })

    return transcript

def _remove_temp_subtitle_files(prefix):
    """Delete every entry in the current directory whose name starts with *prefix*."""
    import os

    for name in os.listdir('.'):
        if name.startswith(prefix):
            os.remove(name)

def debug(video_id):
    """Download subtitles for *video_id* with yt-dlp and report how many cues parse.

    Subtitles (manual and automatic, languages ``en`` and ``vi``) are requested
    in json3 format and written to the current directory using the output
    template ``temp_subs_<video_id>``. The first matching ``.json3`` file (in
    sorted name order) is loaded and fed to ``_parse_json3_subtitles``; the
    number of parsed items is printed.

    Any exception raised along the way is printed with its traceback rather
    than propagated. Every temp file with the template prefix is removed on
    exit, whether or not the run succeeded.

    Args:
        video_id: YouTube video identifier used to build the watch URL and the
            temp file prefix.

    Returns:
        None. Results are reported on stdout/stderr only.
    """
    # Local import kept from the original: the module header does not import os.
    import os

    print(f"DEBUGGING VIDEO: {video_id}")
    url = f"https://www.youtube.com/watch?v={video_id}"
    languages = ['en', 'vi']

    # Prefix shared by every file yt-dlp writes for this run.
    temp_template = f"temp_subs_{video_id}"

    ydl_opts = {
        'quiet': True,
        'no_warnings': True,
        'skip_download': True,
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitleslangs': languages,
        'subtitlesformat': 'json3',
        'outtmpl': temp_template,
    }

    try:
        # Remove leftovers from an earlier run so they are not mistaken for
        # the result of this one.
        _remove_temp_subtitle_files(temp_template)

        print("Downloading subtitles via yt-dlp...")
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # download() is what writes the subtitle files; skip_download in
            # the options keeps the video itself from being fetched.
            ydl.download([url])

        # Several languages may have been written; sort the listing so the
        # file that gets parsed does not depend on directory order.
        downloaded_file = next(
            (
                name for name in sorted(os.listdir('.'))
                if name.startswith(temp_template) and name.endswith('.json3')
            ),
            None,
        )

        if downloaded_file is None:
            print("No subtitle file found after download attempt.")
            return

        print(f"Downloaded file: {downloaded_file}")
        with open(downloaded_file, 'r', encoding='utf-8') as f:
            sub_data = json.load(f)

        transcript_data = _parse_json3_subtitles(sub_data)
        print(f"Parsed {len(transcript_data)} items")

    except Exception:
        # Debug entry point: report the failure and carry on to cleanup.
        traceback.print_exc()

    finally:
        # Always clean up, including files for languages that were downloaded
        # but not parsed, and files left behind by a failed parse.
        try:
            _remove_temp_subtitle_files(temp_template)
        except OSError:
            traceback.print_exc()

if __name__ == '__main__':
    # Script entry point: run the debug routine against a hard-coded sample video ID.
    sample_video_id = 'dQw4w9WgXcQ'
    debug(sample_video_id)