# spotify-clone/backend/services/lyrics.py

import json
import logging
import re

import requests
import syncedlyrics
import yt_dlp
from starlette.concurrency import run_in_threadpool

from backend.core.cache import CacheManager
from backend.core.config import settings

class LyricsService:
    """Look up time-synced lyrics for a track and cache the parsed result.

    Three sources are tried in order, stopping at the first that yields any
    lines: YouTube subtitles via yt-dlp, the LRCLIB search API, and the
    ``syncedlyrics`` package. Every source produces the same shape: a list of
    ``{"time": <seconds as float>, "text": <str>}`` dictionaries.
    """

    # One LRC time tag: [mm:ss] or [mm:ss.xx]. Metadata tags such as
    # [ar:Artist] do not match because both fields must be numeric.
    _LRC_TIMESTAMP = re.compile(r'\[(\d+):(\d+(?:\.\d+)?)\]')

    # Characters allowed in an id before it is used as a file name, a yt-dlp
    # output template and a glob pattern (no separators, '%', or wildcards).
    _SAFE_ID = re.compile(r'[A-Za-z0-9_-]+')

    _logger = logging.getLogger(__name__)

    def __init__(self):
        self.cache = CacheManager(str(settings.CACHE_DIR))
        self.lyrics_cache_dir = settings.CACHE_DIR / "lyrics"
        self.lyrics_cache_dir.mkdir(parents=True, exist_ok=True)

    def _parse_lrc_string(self, lrc_string: str) -> list:
        """Parse LRC-formatted text into a time-ordered list of lyric lines.

        Args:
            lrc_string: LRC text, one lyric line per text line. ``None`` or an
                empty string yields an empty list.

        Returns:
            A list of ``{"time": seconds, "text": text}`` dictionaries sorted
            by ascending time. A line carrying several consecutive time tags
            (a repeated lyric) produces one entry per tag. Lines without a
            time tag are skipped.
        """
        if not lrc_string:
            return []

        parsed = []
        for line in lrc_string.split('\n'):
            tag = self._LRC_TIMESTAMP.search(line)
            if tag is None:
                continue

            # Collect the first tag and any tags that follow it immediately;
            # whatever remains after the last one is the lyric text.
            times = []
            text_start = tag.end()
            while tag is not None:
                times.append(int(tag.group(1)) * 60 + float(tag.group(2)))
                text_start = tag.end()
                tag = self._LRC_TIMESTAMP.match(line, text_start)

            text = line[text_start:].strip()
            parsed.extend({"time": t, "text": text} for t in times)

        # Repeated-lyric lines are expanded out of order, so restore the
        # timeline. The sort is stable: already ordered input is unchanged.
        parsed.sort(key=lambda entry: entry["time"])
        return parsed

    def _fetch_ytdlp(self, id: str) -> list:
        """Strategy 1: download YouTube subtitles with yt-dlp and parse them.

        Blocking; intended to run in a worker thread. Errors are logged and
        whatever was parsed before the failure is returned.
        """
        # The id becomes part of a file path, so refuse anything that could
        # escape the lyrics directory or be read as a template/glob token.
        if not self._SAFE_ID.fullmatch(id):
            self._logger.warning("Skipping yt-dlp subtitles for unsafe id %r", id)
            return []

        parsed = []
        try:
            ydl_opts = {
                'skip_download': True, 'writesubtitles': True, 'writeautomaticsub': True,
                'subtitleslangs': ['en', 'vi'], 'subtitlesformat': 'json3',
                'outtmpl': str(self.lyrics_cache_dir / id), 'quiet': True
            }
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([f"https://www.youtube.com/watch?v={id}"])

            # Sorted so the fallback choice does not depend on directory order.
            found_files = sorted(self.lyrics_cache_dir.glob(f"{id}.*.json3"))
            if not found_files:
                return parsed

            # Prefer the English track; otherwise take the first one found.
            english_name = f"{id}.en.json3"
            best_file = next(
                (path for path in found_files if path.name == english_name),
                found_files[0],
            )
            with open(best_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            for event in data.get('events', []):
                if 'segs' not in event or 'tStartMs' not in event:
                    continue
                text = "".join(seg.get('utf8', '') for seg in event['segs']).strip()
                # Skip empty cues and bracketed cues (text starting with '[').
                if text and not text.startswith('['):
                    parsed.append({"time": float(event['tStartMs']) / 1000.0, "text": text})
        except Exception as e:
            # Best-effort source: report the failure and let the caller fall
            # through to the next strategy.
            self._logger.warning("yt-dlp sub error: %s", e)
        return parsed

    def _fetch_lrclib(self, title: str, artist: str) -> list:
        """Strategy 2: search LRCLIB and parse the first usable synced lyrics.

        Blocking; intended to run in a worker thread. Returns an empty list
        when the request fails or no result parses to any lines.
        """
        try:
            # Drop parenthesised parts of the title before searching.
            cleaned_title = re.sub(r'\(.*?\)', '', title)
            clean_query = f"{artist} {cleaned_title}".strip()
            resp = requests.get("https://lrclib.net/api/search", params={"q": clean_query}, timeout=5)
            if resp.status_code == 200:
                for item in resp.json():
                    synced = item.get("syncedLyrics")
                    if not synced:
                        continue
                    lines = self._parse_lrc_string(synced)
                    # Keep looking if this result held no parseable lines.
                    if lines:
                        return lines
        except Exception as e:
            self._logger.warning("LRCLIB lookup error: %s", e)
        return []

    def _fetch_syncedlyrics(self, title: str, artist: str) -> list:
        """Strategy 3: query the ``syncedlyrics`` package and parse its LRC.

        Blocking; intended to run in a worker thread. Returns an empty list
        when the search fails or finds nothing.
        """
        try:
            clean_query = f"{title} {artist}".strip()
            lrc_str = syncedlyrics.search(clean_query)
            if lrc_str:
                return self._parse_lrc_string(lrc_str)
        except Exception as e:
            self._logger.warning("syncedlyrics lookup error: %s", e)
        return []

    async def get_lyrics(self, id: str, title: str = None, artist: str = None):
        """Return synced lyrics for a track, using the cache when possible.

        Args:
            id: Track identifier; used as the cache key and as the YouTube
                video id for the subtitle lookup. A falsy id returns ``[]``.
            title: Track title. The LRCLIB and syncedlyrics fallbacks run
                only when both ``title`` and ``artist`` are given.
            artist: Track artist.

        Returns:
            A list of ``{"time": seconds, "text": text}`` dictionaries, or an
            empty list when no source produced lyrics. Non-empty results are
            cached for 86400 seconds; empty results are not cached.
        """
        if not id:
            return []

        cache_key = f"lyrics:{id}"
        cached = self.cache.get(cache_key)
        if cached:
            return cached

        # Each source does blocking I/O, so it runs in the thread pool.
        parsed_lines = await run_in_threadpool(self._fetch_ytdlp, id)

        if not parsed_lines and title and artist:
            parsed_lines = await run_in_threadpool(self._fetch_lrclib, title, artist)

        if not parsed_lines and title and artist:
            parsed_lines = await run_in_threadpool(self._fetch_syncedlyrics, title, artist)

        if parsed_lines:
            self.cache.set(cache_key, parsed_lines, ttl_seconds=86400)

        return parsed_lines