"""Time-synced lyrics lookup with a layered fallback strategy.

Sources are tried in order: YouTube captions (via yt-dlp), the LRCLIB
search API, and finally the ``syncedlyrics`` package.
"""

import glob
import json
import logging
import re
from typing import Any, Dict, List, Optional

import requests
import syncedlyrics
import yt_dlp
from starlette.concurrency import run_in_threadpool

from backend.core.cache import CacheManager
from backend.core.config import settings

logger = logging.getLogger(__name__)

# A run of one or more "[mm:ss]" / "[mm:ss.xx]" tags followed by the lyric
# text. Several tags in a row mean the same text is sung at each of them.
_LRC_LINE_RE = re.compile(r'((?:\[\d+:\d+(?:\.\d+)?\]\s*)+)(.*)')
_LRC_STAMP_RE = re.compile(r'\[(\d+):(\d+(?:\.\d+)?)\]')
# Parenthesised fragments are removed from the title before querying LRCLIB.
_PARENTHESIZED_RE = re.compile(r'\(.*?\)')

_LRCLIB_SEARCH_URL = "https://lrclib.net/api/search"
_LRCLIB_TIMEOUT_SECONDS = 5
_LYRICS_CACHE_TTL_SECONDS = 86400  # 24 hours

# One synced line: {"time": <seconds from start, float>, "text": <str>}.
LyricLine = Dict[str, Any]


class LyricsService:
    """Fetch and cache time-synced lyrics for a YouTube video id."""

    def __init__(self):
        """Set up the result cache and make sure the subtitle directory exists."""
        self.cache = CacheManager(str(settings.CACHE_DIR))
        self.lyrics_cache_dir = settings.CACHE_DIR / "lyrics"
        self.lyrics_cache_dir.mkdir(parents=True, exist_ok=True)

    def _parse_lrc_string(self, lrc_string: str) -> List[LyricLine]:
        """Convert LRC text into a list of ``{"time", "text"}`` entries.

        Lines without a ``[mm:ss]`` / ``[mm:ss.xx]`` tag (metadata such as
        ``[ar:...]``, blank lines, plain text) are skipped. A line carrying
        several consecutive tags produces one entry per tag.

        Args:
            lrc_string: Raw LRC document.

        Returns:
            Entries with ``time`` in seconds and ``text`` stripped of
            surrounding whitespace. Order follows the input unless a line
            with repeated tags was seen, in which case the result is sorted
            by time.
        """
        parsed: List[LyricLine] = []
        has_repeated_stamps = False

        for line in lrc_string.split('\n'):
            match = _LRC_LINE_RE.search(line)
            if not match:
                continue

            text = match.group(2).strip()
            stamps = _LRC_STAMP_RE.findall(match.group(1))
            if len(stamps) > 1:
                has_repeated_stamps = True

            for minutes, seconds in stamps:
                parsed.append({
                    "time": int(minutes) * 60 + float(seconds),
                    "text": text,
                })

        # Expanding repeated tags emits entries out of chronological order;
        # input without them keeps its original ordering untouched.
        if has_repeated_stamps:
            parsed.sort(key=lambda entry: entry["time"])
        return parsed

    def _fetch_ytdlp_captions(self, video_id: str) -> List[LyricLine]:
        """Strategy 1: download YouTube subtitles with yt-dlp and parse them.

        Blocking (network and disk); intended to run in a worker thread.
        Requests manual and automatic subtitles in ``en``/``vi`` as json3,
        written next to ``<lyrics_cache_dir>/<video_id>``. The English track
        is preferred when present.

        Args:
            video_id: YouTube video id.

        Returns:
            Parsed caption lines; empty (or partial) if anything fails.
        """
        parsed: List[LyricLine] = []
        try:
            # NOTE(review): video_id goes into a filesystem path and a
            # yt-dlp output template as-is — confirm callers validate it.
            output_base = self.lyrics_cache_dir / f"{video_id}"
            ydl_opts = {
                'skip_download': True,
                'writesubtitles': True,
                'writeautomaticsub': True,
                'subtitleslangs': ['en', 'vi'],
                'subtitlesformat': 'json3',
                'outtmpl': str(output_base),
                'quiet': True,
            }
            url = f"https://www.youtube.com/watch?v={video_id}"

            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])

            # Escape the literal part so glob metacharacters in the cache
            # path or the id cannot match unrelated files.
            pattern = glob.escape(str(output_base)) + ".*.json3"
            found_files = glob.glob(pattern)
            if not found_files:
                return parsed

            english_suffix = f"{video_id}.en.json3"
            best_file = next(
                (path for path in found_files if path.endswith(english_suffix)),
                found_files[0],
            )

            with open(best_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            for event in data.get('events', []):
                if 'segs' not in event or 'tStartMs' not in event:
                    continue
                text = "".join(seg.get('utf8', '') for seg in event['segs']).strip()
                # Skip empty cues and bracketed cues such as "[Music]".
                if text and not text.startswith('['):
                    parsed.append({
                        "time": float(event['tStartMs']) / 1000.0,
                        "text": text,
                    })
        except Exception as e:
            # Best-effort source: report and let the caller fall back.
            logger.warning("yt-dlp sub error: %s", e)
        return parsed

    def _fetch_lrclib(self, title: str, artist: str) -> List[LyricLine]:
        """Strategy 2: query the LRCLIB search API for synced lyrics.

        Blocking HTTP call; intended to run in a worker thread.

        Args:
            title: Track title; parenthesised parts are removed for the query.
            artist: Artist name.

        Returns:
            Lines from the first search hit whose ``syncedLyrics`` parses to
            at least one entry, or an empty list.
        """
        try:
            cleaned_title = _PARENTHESIZED_RE.sub('', title)
            clean_query = f"{artist} {cleaned_title}".strip()
            resp = requests.get(
                _LRCLIB_SEARCH_URL,
                params={"q": clean_query},
                timeout=_LRCLIB_TIMEOUT_SECONDS,
            )
            if resp.status_code == 200:
                for item in resp.json():
                    synced = item.get("syncedLyrics")
                    if not synced:
                        continue
                    lines = self._parse_lrc_string(synced)
                    # A hit that parses to nothing is no better than no hit;
                    # keep looking instead of giving up on this source.
                    if lines:
                        return lines
        except Exception as e:
            logger.warning("LRCLIB lookup failed: %s", e)
        return []

    def _fetch_syncedlyrics(self, title: str, artist: str) -> List[LyricLine]:
        """Strategy 3: search via the ``syncedlyrics`` package.

        Blocking; intended to run in a worker thread.

        Args:
            title: Track title.
            artist: Artist name.

        Returns:
            Parsed lines, or an empty list when nothing is found or the
            lookup raises.
        """
        try:
            clean_query = f"{title} {artist}".strip()
            lrc_str = syncedlyrics.search(clean_query)
            if lrc_str:
                return self._parse_lrc_string(lrc_str)
        except Exception as e:
            logger.warning("syncedlyrics lookup failed: %s", e)
        return []

    async def get_lyrics(
        self,
        id: str,
        title: Optional[str] = None,
        artist: Optional[str] = None,
    ) -> List[LyricLine]:
        """Return time-synced lyrics for a video, consulting the cache first.

        Sources are tried in order until one yields lines: YouTube captions
        via yt-dlp, then LRCLIB, then ``syncedlyrics``. The last two need
        both ``title`` and ``artist``. Each blocking lookup runs in the
        thread pool so the event loop is not stalled.

        Args:
            id: YouTube video id (name kept for caller compatibility even
                though it shadows the builtin).
            title: Track title, used by the fallback sources.
            artist: Artist name, used by the fallback sources.

        Returns:
            A list of ``{"time": seconds, "text": str}`` entries; empty when
            ``id`` is falsy or no source produced lyrics. Only non-empty
            results are stored in the cache (for 24 hours).
        """
        if not id:
            return []

        cache_key = f"lyrics:{id}"
        cached = self.cache.get(cache_key)
        if cached:
            return cached

        # Strategy 1: yt-dlp
        parsed_lines = await run_in_threadpool(self._fetch_ytdlp_captions, id)

        # Strategy 2: LRCLIB
        if not parsed_lines and title and artist:
            parsed_lines = await run_in_threadpool(self._fetch_lrclib, title, artist)

        # Strategy 3: syncedlyrics
        if not parsed_lines and title and artist:
            parsed_lines = await run_in_threadpool(
                self._fetch_syncedlyrics, title, artist
            )

        if parsed_lines:
            self.cache.set(
                cache_key, parsed_lines, ttl_seconds=_LYRICS_CACHE_TTL_SECONDS
            )
        return parsed_lines