94 lines
2.8 KiB
Python
94 lines
2.8 KiB
Python
import yt_dlp
|
|
import requests
|
|
import json
|
|
import traceback
|
|
|
|
def _parse_json3_subtitles(data):
    """Parse a YouTube json3 subtitle document into a simplified transcript.

    Args:
        data: Decoded json3 document (a dict). Only its ``events`` list is
            read. Each event may carry ``tStartMs``, ``dDurationMs`` and a
            ``segs`` list whose items hold their text under ``utf8``.

    Returns:
        A list with one dict per event that has visible text, with keys
        ``text`` (stripped string), ``start`` (seconds) and ``duration``
        (seconds; 2.0 when the event carries no duration).
    """
    transcript = []

    # `or []` also covers an explicit null "events" value.
    for event in data.get('events') or []:
        segs = event.get('segs')
        if not segs:
            # Skip non-text events (missing, null or empty segment list).
            continue

        # Join every segment verbatim, whitespace-only ones included: they are
        # the separators between neighbouring segments, and dropping them
        # glues words together ("hello", " ", "world" -> "helloworld").
        combined_text = ''.join(seg.get('utf8') or '' for seg in segs).strip()
        if not combined_text:
            continue

        start_ms = event.get('tStartMs') or 0
        duration_ms = event.get('dDurationMs') or 0

        transcript.append({
            'text': combined_text,
            'start': start_ms / 1000.0,  # Convert to seconds
            'duration': duration_ms / 1000.0 if duration_ms else 2.0,  # Default 2s
        })

    return transcript
|
|
|
|
def debug(video_id):
    """Download a video's subtitles with yt-dlp and report how many entries parse.

    Subtitles are requested in json3 format for English and Vietnamese,
    written to temporary files in the current working directory, parsed with
    ``_parse_json3_subtitles`` and then removed again. Progress and errors are
    printed; nothing is returned and no exception propagates to the caller.

    Args:
        video_id: YouTube video ID (the ``v=`` value of a watch URL).
    """
    import os  # local import: `os` is not imported at the top of this file

    print(f"DEBUGGING VIDEO: {video_id}")
    url = f"https://www.youtube.com/watch?v={video_id}"
    languages = ['en', 'vi']

    # Filename prefix shared by every file written for this video.
    temp_template = f"temp_subs_{video_id}"

    ydl_opts = {
        'quiet': True,
        'no_warnings': True,
        'skip_download': True,
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitleslangs': languages,
        'subtitlesformat': 'json3',
        'outtmpl': temp_template,
    }

    def _remove_temp_files():
        """Best-effort removal of every temp file belonging to this video."""
        for name in os.listdir('.'):
            if name.startswith(temp_template):
                try:
                    os.remove(name)
                except OSError:
                    traceback.print_exc()

    try:
        # Cleanup old files left behind by a previous run.
        _remove_temp_files()

        print("Downloading subtitles via yt-dlp...")
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # download() is what makes yt-dlp write the subtitle files;
            # 'skip_download' in the options prevents the video download.
            ydl.download([url])

        # Several languages may have been written; sort so the file that gets
        # parsed does not depend on directory listing order.
        candidates = sorted(
            name for name in os.listdir('.')
            if name.startswith(temp_template) and name.endswith('.json3')
        )

        if candidates:
            downloaded_file = candidates[0]
            print(f"Downloaded file: {downloaded_file}")
            with open(downloaded_file, 'r', encoding='utf-8') as f:
                sub_data = json.load(f)

            transcript_data = _parse_json3_subtitles(sub_data)
            print(f"Parsed {len(transcript_data)} items")
        else:
            print("No subtitle file found after download attempt.")

    except Exception:
        # Debug entry point: report the failure instead of propagating it.
        traceback.print_exc()
    finally:
        # Remove every temp file (all languages), even when parsing failed.
        _remove_temp_files()
|
|
|
|
if __name__ == '__main__':
    import sys

    # An optional first command-line argument selects the video to debug;
    # without it the previous hard-coded sample video is used.
    debug(sys.argv[1] if len(sys.argv) > 1 else 'dQw4w9WgXcQ')
|