# spotify-clone/backend/services/youtube.py

import re
import json
import requests
import yt_dlp
from ytmusicapi import YTMusic
from backend.core.cache import CacheManager
from backend.core.config import settings
from backend.core.exceptions import ResourceNotFound, ExternalAPIError

class YouTubeService:
    def __init__(self):
        """Set up the two collaborators every method relies on.

        ``self.yt`` is a ``YTMusic`` client constructed with no arguments and
        ``self.cache`` is a ``CacheManager`` rooted at ``settings.CACHE_DIR``
        (passed as a string).
        """
        self.yt = YTMusic()
        cache_root = str(settings.CACHE_DIR)
        self.cache = CacheManager(cache_root)

    def _get_high_res_thumbnail(self, thumbnails: list) -> str:
        """Return the best available thumbnail URL, upscaled where possible.

        The last entry carrying a non-empty string ``'url'`` is used (callers
        pass thumbnail lists whose last entry was treated as the best one by
        the original code). For ``googleusercontent.com`` / ``ggpht.com`` URLs
        a ``=w<N>-h<N>`` size suffix is rewritten to ``=w544-h544``.

        Args:
            thumbnails: List of thumbnail dicts, each expected to hold a
                ``'url'`` key. ``None``, empty or malformed input is tolerated.

        Returns:
            The chosen URL, or a placeholder image URL when no usable
            thumbnail is present.
        """
        placeholder = "https://placehold.co/300x300"
        if not thumbnails or not isinstance(thumbnails, (list, tuple)):
            return placeholder

        # Walk backwards so a malformed trailing entry (no 'url', not a dict)
        # falls back to the previous one instead of raising KeyError/TypeError.
        best_url = None
        for thumb in reversed(thumbnails):
            candidate = thumb.get('url') if isinstance(thumb, dict) else None
            if isinstance(candidate, str) and candidate:
                best_url = candidate
                break
        if best_url is None:
            return placeholder

        if "googleusercontent.com" in best_url or "ggpht.com" in best_url:
            # The substitution is a no-op when the size suffix is absent, so
            # no separate pre-check is needed.
            best_url = re.sub(r'=w\d+-h\d+', '=w544-h544', best_url)
        return best_url

    def _extract_artist_names(self, track: dict) -> str:
        """Join the artist names found under ``track['artists']``.

        Dict entries contribute their ``'name'`` (``'Unknown'`` when the key
        is missing), string entries are used verbatim, anything else is
        skipped. Returns ``"Unknown Artist"`` when ``'artists'`` is missing,
        falsy, not a list, or yields no names.
        """
        fallback = "Unknown Artist"
        artists = track.get('artists') or []
        if not isinstance(artists, list):
            return fallback

        names = [
            entry.get('name', 'Unknown') if isinstance(entry, dict) else entry
            for entry in artists
            if isinstance(entry, (dict, str))
        ]
        if not names:
            return fallback
        return ", ".join(names)

    def _extract_album_name(self, track: dict, default="Single") -> str:
        """Return the album name for ``track``.

        ``track['album']`` may be a dict (its ``'name'`` is used, falling back
        to ``default`` when the key is missing) or a plain string (returned
        as-is). Any other value, including a missing key, yields ``default``.
        """
        album = track.get('album')
        if isinstance(album, dict):
            return album.get('name', default)
        return album if isinstance(album, str) else default

    def _clean_title(self, title: str) -> str:
        """Strip decorative noise from a playlist/album title.

        Steps, in order:
          1. Drop non-ASCII symbols (emoji and similar) while keeping letters,
             marks, digits, punctuation and spaces of every script, so titles
             in languages such as Vietnamese stay readable.
          2. Remove a fixed list of spam phrases, case-insensitively and only
             when they appear as whole words ("Unofficial" is left alone).
          3. Collapse whitespace and trim leading/trailing ``*``, ``-`` and
             spaces.

        Args:
            title: Raw title; may be empty or ``None``.

        Returns:
            The cleaned title, or ``"Playlist"`` when the input is empty or
            nothing is left after cleaning.
        """
        import unicodedata  # local import: only this helper needs it

        if not title:
            return "Playlist"

        def keep(ch: str) -> bool:
            # ASCII is always kept, matching the original behaviour.
            if ord(ch) < 128:
                return True
            # Variation selectors only exist to style a preceding emoji.
            if '\ufe00' <= ch <= '\ufe0f':
                return False
            # Keep Letters, Marks, Numbers, Punctuation and separators (Z);
            # drop Symbols (emoji, modifiers) and Control/format characters.
            return unicodedata.category(ch)[0] in "LMNPZ"

        title = "".join(ch for ch in unicodedata.normalize('NFC', title) if keep(ch))

        spam_words = ["Playlist", "Music Chart", "Full SPOTIFY Video", "Updated Weekly", "Official", "Video"]
        # Single alternation, listed in the original order so the longer
        # "Full SPOTIFY Video" wins over plain "Video" at the same position.
        spam_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in spam_words) + r')\b'
        title = re.sub(spam_pattern, "", title, flags=re.IGNORECASE)

        title = re.sub(r'\s+', ' ', title).strip()
        title = title.strip('*- ')
        return title or "Playlist"

    def _clean_description(self, desc: str) -> str:
        """Tidy a free-text description for display.

        Removes anything starting with ``http`` up to the next whitespace,
        removes runs of three or more ``*``, ``_`` or ``=`` characters, cuts
        the text to 300 characters followed by ``...`` when it is longer, and
        finally strips surrounding whitespace. Empty/``None`` input yields
        ``""``.
        """
        if not desc:
            return ""

        without_links = re.sub(r'http\S+', '', desc)
        cleaned = re.sub(r'[*_=]{3,}', '', without_links)
        # Truncation happens before the final strip, exactly as before.
        limited = (cleaned[:300] + "...") if len(cleaned) > 300 else cleaned
        return limited.strip()

    def get_playlist(self, id: str):
        """Fetch a playlist or album and normalise it to the app's shape.

        The cache is consulted first under ``playlist:<id>``. IDs starting
        with ``MPREb`` are tried as albums first; everything else is tried as
        a playlist (up to 100 tracks) and, if that call raises, as an album.
        A successful result is cached for one hour.

        Args:
            id: Playlist or album browse ID. (The name shadows the builtin but
                is kept because callers may pass it by keyword.)

        Returns:
            dict with keys ``id``, ``title``, ``description``, ``author``,
            ``cover_url`` and ``tracks``; each track has ``title``,
            ``artist``, ``album``, ``duration``, ``cover_url``, ``id`` and
            ``url``.

        Raises:
            ResourceNotFound: if the lookup or the formatting fails for any
                reason; the underlying error is attached as ``__cause__``.
        """
        cache_key = f"playlist:{id}"
        cached_playlist = self.cache.get(cache_key)
        if cached_playlist:
            return cached_playlist

        try:
            playlist_data = None
            is_album = False
            album_tried = False

            # Try as Album first if MPREb ID
            if id.startswith("MPREb"):
                album_tried = True
                try:
                    playlist_data = self.yt.get_album(id)
                except Exception as album_error:
                    # Best effort: fall through to the playlist lookup below.
                    print(f"Album lookup failed for {id}, trying playlist: {album_error}")
                # Only treat the payload as an album if we actually got one.
                is_album = bool(playlist_data)

            if not playlist_data:
                try:
                    playlist_data = self.yt.get_playlist(id, limit=100)
                except Exception:
                    if album_tried:
                        # The album lookup already ran for this ID; repeating
                        # it would just fail (or come back empty) again.
                        raise
                    playlist_data = self.yt.get_album(id)
                    is_album = True

            if not playlist_data:
                raise ValueError("empty response from YouTube Music")

            # Albums carry artwork on the album, not on each track.
            fallback_thumbnails = (playlist_data.get('thumbnails') or []) if is_album else []
            default_album = playlist_data.get('title') or 'Single'

            formatted_tracks = []
            for track in playlist_data.get('tracks') or []:
                video_id = track.get('videoId')
                formatted_tracks.append({
                    "title": track.get('title', 'Unknown Title'),
                    "artist": self._extract_artist_names(track),
                    "album": self._extract_album_name(track, default_album),
                    "duration": track.get('duration_seconds', track.get('length_seconds', 0)),
                    "cover_url": self._get_high_res_thumbnail(track.get('thumbnails') or fallback_thumbnails),
                    "id": video_id,
                    "url": f"https://music.youtube.com/watch?v={video_id}"
                })

            p_cover = self._get_high_res_thumbnail(playlist_data.get('thumbnails', []))

            if is_album:
                # Shared helper tolerates a missing/None artist list and
                # entries without a usable name.
                author = self._extract_artist_names(playlist_data)
            else:
                author_data = playlist_data.get('author')
                if isinstance(author_data, dict):
                    author = author_data.get('name') or "YouTube Music"
                elif author_data:
                    author = str(author_data)
                else:
                    # Missing or None author must not become the text "None".
                    author = "YouTube Music"

            formatted_playlist = {
                "id": playlist_data.get('browseId', playlist_data.get('id')),
                "title": self._clean_title(playlist_data.get('title', 'Unknown')),
                "description": self._clean_description(playlist_data.get('description', '')),
                "author": author,
                "cover_url": p_cover,
                "tracks": formatted_tracks
            }

            self.cache.set(cache_key, formatted_playlist, ttl_seconds=3600)
            return formatted_playlist

        except Exception as e:
            print(f"Playlist Fetch Error: {e}")
            raise ResourceNotFound(f"Playlist {id} not found") from e

    def search(self, query: str):
        """Search YouTube Music for songs matching ``query``.

        Returns ``[]`` for an empty query. Otherwise returns
        ``{"tracks": [...]}``, served from the cache (key
        ``search:<lower-cased, stripped query>``) when present, else built
        from a ``songs``-filtered search limited to 20 results and cached for
        86400 seconds.

        Raises:
            ExternalAPIError: if the search call or the formatting fails.
        """
        if not query:
            return []

        cache_key = f"search:{query.lower().strip()}"
        hit = self.cache.get(cache_key)
        if hit:
            return hit

        def to_track(item):
            # Normalise one raw search result into the app's track shape.
            return {
                "title": item.get('title', 'Unknown Title'),
                "artist": self._extract_artist_names(item),
                "album": self._extract_album_name(item, "Single"),
                "duration": item.get('duration_seconds', 0),
                "cover_url": self._get_high_res_thumbnail(item.get('thumbnails', [])),
                "id": item.get('videoId'),
                "url": f"https://music.youtube.com/watch?v={item.get('videoId')}"
            }

        try:
            found = self.yt.search(query, filter="songs", limit=20)
            response = {"tracks": [to_track(item) for item in found]}
            self.cache.set(cache_key, response, ttl_seconds=86400)
            return response
        except Exception as e:
            print(f"Search Error: {e}")
            raise ExternalAPIError(str(e))

    def get_stream_url(self, id: str):
        """Resolve a direct audio stream URL for a video.

        The cache is checked first under ``stream:<id>``. Otherwise yt-dlp is
        run (metadata only, no download) once per player client, in order,
        until one yields a URL. The first hit is cached for one hour.

        Args:
            id: YouTube video ID.

        Returns:
            dict with ``url`` (the stream URL) and ``headers`` (the
            ``http_headers`` reported by yt-dlp, ``{}`` if absent).

        Raises:
            ExternalAPIError: if no client produced a stream URL. The message
                is the last extraction error, or a descriptive text when every
                attempt finished without an error but also without a URL.
        """
        cache_key = f"stream:{id}"
        cached = self.cache.get(cache_key)
        if cached:
            return cached

        # Strategy: Try versatile clients in order
        clients_to_try = [
            # 1. iOS (often best for audio)
            {'extractor_args': {'youtube': {'player_client': ['ios']}}},
            # 2. Android (robust)
            {'extractor_args': {'youtube': {'player_client': ['android']}}},
            # 3. Web (standard, prone to 403)
            {'extractor_args': {'youtube': {'player_client': ['web']}}},
            # 4. TV (sometimes works for age-gated)
            {'extractor_args': {'youtube': {'player_client': ['tv']}}},
        ]

        # Loop-invariant pieces are built once.
        url = f"https://www.youtube.com/watch?v={id}"
        base_opts = {
            'format': 'bestaudio[ext=m4a]/best[ext=mp4]/best',
            'quiet': True,
            'noplaylist': True,
            'force_ipv4': True,
        }

        last_error = None

        for client_config in clients_to_try:
            try:
                ydl_opts = dict(base_opts)
                ydl_opts.update(client_config)

                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    info = ydl.extract_info(url, download=False)
                    stream_url = info.get('url')

                if stream_url:
                    result = {
                        "url": stream_url,
                        "headers": info.get('http_headers', {})
                    }
                    self.cache.set(cache_key, result, ttl_seconds=3600)
                    return result
            except Exception as e:
                last_error = e
                print(f"Fetch failed with client {client_config}: {e}")
                continue

        # If all fail
        print(f"All clients failed for {id}. Last error: {last_error}")
        if last_error is None:
            # Every client returned metadata without a playable URL; avoid
            # surfacing the meaningless message "None".
            raise ExternalAPIError(f"No playable stream URL found for {id}")
        raise ExternalAPIError(str(last_error))

    def invalidate_stream_cache(self, id: str):
        """Delete the cached stream entry for ``id``, if one exists.

        Best-effort: a missing file is ignored and any other filesystem error
        is reported but not raised, so callers are never interrupted by cache
        cleanup.

        Args:
            id: YouTube video ID whose ``stream:<id>`` entry should be dropped.
        """
        cache_key = f"stream:{id}"
        # NOTE(review): relies on CacheManager's private _get_path helper;
        # presumably it returns a pathlib.Path - confirm against CacheManager.
        path = self.cache._get_path(cache_key)
        if path.exists():
            try:
                path.unlink()
            except FileNotFoundError:
                # Removed by someone else between the check and the unlink.
                pass
            except OSError as e:
                print(f"Failed to invalidate stream cache for {id}: {e}")

    def get_recommendations(self, seed_id: str):
        """Return tracks related to ``seed_id`` from its watch playlist.

        Returns ``[]`` for an empty seed. Otherwise returns
        ``{"tracks": [...]}``: served from the cache (key ``rec:<seed_id>``)
        when present, else built from a watch-playlist lookup limited to 20
        and cached for 3600 seconds. The seed itself, tracks without a
        ``videoId`` and repeated IDs are left out. Any failure is logged and
        answered with ``{"tracks": []}`` (not cached).
        """
        if not seed_id:
            return []

        cache_key = f"rec:{seed_id}"
        hit = self.cache.get(cache_key)
        if hit:
            return hit

        try:
            watch_playlist = self.yt.get_watch_playlist(videoId=seed_id, limit=20)
            candidates = watch_playlist['tracks'] if 'tracks' in watch_playlist else []

            tracks = []
            seen_ids = {seed_id}
            for candidate in candidates:
                t_id = candidate.get('videoId')
                if t_id and t_id not in seen_ids:
                    seen_ids.add(t_id)
                    # Watch-playlist entries may expose artwork under either key.
                    artwork = candidate.get('thumbnails') or candidate.get('thumbnail') or []
                    tracks.append({
                        "title": candidate.get('title', 'Unknown Title'),
                        "artist": self._extract_artist_names(candidate),
                        "album": self._extract_album_name(candidate, "Single"),
                        "duration": candidate.get('length_seconds', candidate.get('duration_seconds', 0)),
                        "cover_url": self._get_high_res_thumbnail(artwork),
                        "id": t_id,
                        "url": f"https://music.youtube.com/watch?v={t_id}"
                    })

            response = {"tracks": tracks}
            self.cache.set(cache_key, response, ttl_seconds=3600)
            return response
        except Exception as e:
            print(f"Rec Error: {e}")
            return {"tracks": []}

    def get_home(self):
        """Build the home/browse payload.

        The result maps a section title to a list of playlist-like "shelf"
        dicts (``id``, ``title``, ``description``, ``cover_url``, ``tracks``,
        ``type``, ``creator``):

          * ``Trending``      - chart videos for country ``VN``;
          * ``Top Hits``      - the same tracks, shuffled when there are more
                                than five;
          * ``New Releases``  - the same tracks rotated by two;
          * ``Focus & Chill`` - two static shelves with empty track lists.

        If the chart lookup fails or yields nothing, a small hardcoded track
        list is used instead. The payload is cached under ``home:browse`` for
        one hour, or for five minutes when the hardcoded fallback was used so
        that a transient failure does not pin placeholder content.

        Returns:
            The section dict described above, or ``{}`` if building it fails.
        """
        cache_key = "home:browse"
        cached = self.cache.get(cache_key)
        if cached:
            return cached

        try:
            import random

            trending_songs = []
            try:
                trending = self.yt.get_charts(country='VN')
                videos = trending.get('videos')
                if videos:
                    for item in videos['items']:
                        thumbnails = item.get('thumbnails') or []
                        trending_songs.append({
                            "id": item.get('videoId'),
                            "title": item.get('title'),
                            # Shared helper: an empty or missing artist list
                            # must not abort the rest of the chart.
                            "artist": self._extract_artist_names(item),
                            "album": "Trending",  # Charts don't usually have album info, stick to generic
                            "cover_url": self._get_high_res_thumbnail(thumbnails) if thumbnails else "",
                            "duration": 0  # Charts might not have duration
                        })
            except Exception as e:
                print(f"Error fetching trending: {e}")

            # --- FALLBACK IF API FAILS OR RETURNS EMPTY ---
            used_fallback = not trending_songs
            if used_fallback:
                print("Using HARDCODED fallback for trending songs.")
                # "duration" is included so fallback tracks have the same keys
                # as tracks built from the charts.
                trending_songs = [
                    {
                        "id": "Da4P2uT4ikU", "title": "Angel Baby", "artist": "Troye Sivan", "album": "Angel Baby",
                        "cover_url": "https://lh3.googleusercontent.com/Fj_JpwC1QGEFkH3y973Xv7w7tqVw5C_V-1o7g1gX_c4X_1o7g1gX_c4X_1o7g1=w544-h544-l90-rj",
                        "duration": 0
                    },
                    {
                        "id": "fJ9rUzIMcZQ", "title": "Bohemian Rhapsody", "artist": "Queen", "album": "A Night at the Opera",
                        "cover_url": "https://lh3.googleusercontent.com/yFj_JpwC1QGEFkH3y973Xv7w7tqVw5C_V-1o7g1gX_c4X_1o7g1gX_c4X_1o7g1=w544-h544-l90-rj",
                        "duration": 0
                    },
                    {
                        "id": "4NRXx6U8ABQ", "title": "Blinding Lights", "artist": "The Weeknd", "album": "After Hours",
                        "cover_url": "https://lh3.googleusercontent.com/Fj_JpwC1QGEFkH3y973Xv7w7tqVw5C_V-1o7g1gX_c4X_1o7g1gX_c4X_1o7g1=w544-h544-l90-rj",
                        "duration": 0
                    },
                    {
                        "id": "OPf0YbXqDm0", "title": "Uptown Funk", "artist": "Mark Ronson", "album": "Uptown Special",
                        "cover_url": "https://lh3.googleusercontent.com/Fj_JpwC1QGEFkH3y973Xv7w7tqVw5C_V-1o7g1gX_c4X_1o7g1gX_c4X_1o7g1=w544-h544-l90-rj",
                        "duration": 0
                    }
                ]

            def make_shelf(shelf_id, title, description, tracks, creator):
                # One playlist-like shelf; the cover is the first track's cover.
                return {
                    "id": shelf_id,
                    "title": title,
                    "description": description,
                    "cover_url": tracks[0]['cover_url'] if tracks else "",
                    "tracks": tracks,
                    "type": "Playlist",
                    "creator": creator
                }

            # 1. Trending (from Charts)
            trending_playlist = make_shelf(
                "trending", "Trending Now", "Top music videos right now",
                trending_songs, "YouTube Charts")

            # 2. Top Hits: reuse the trending tracks, shuffled to look different.
            top_hits_tracks = list(trending_songs)
            if len(top_hits_tracks) > 5:
                random.shuffle(top_hits_tracks)
            top_hits_playlist = make_shelf(
                "top_hits", "Top Hits Today", "The hottest tracks right now.",
                top_hits_tracks, "Editors")

            # 3. New Releases (simulated): the same tracks rotated by two.
            new_releases_tracks = list(trending_songs)
            if len(new_releases_tracks) > 2:
                new_releases_tracks = new_releases_tracks[2:] + new_releases_tracks[:2]
            new_releases_playlist = make_shelf(
                "new_releases", "New Releases", "Brand new music found for you.",
                new_releases_tracks, "Spotify Clone")

            response = {
                "Trending": [trending_playlist],
                "Top Hits": [top_hits_playlist],
                "New Releases": [new_releases_playlist],
                "Focus & Chill": [
                    {
                        "id": "lofi_beats",
                        "title": "Lofi Beats",
                        "description": "Chill beats to study/relax to",
                        "cover_url": "https://i.ytimg.com/vi/jfKfPfyJRdk/hqdefault.jpg",
                        "tracks": [],  # Empty tracks will force a fetch when clicked if handled
                        "type": "Playlist",
                        "creator": "Lofi Girl"
                    },
                    {
                        "id": "jazz_vibes",
                        "title": "Jazz Vibes",
                        "description": "Relaxing Jazz instrumental",
                        "cover_url": "https://i.ytimg.com/vi/DX7W7WUI6w8/hqdefault.jpg",
                        "tracks": [],
                        "type": "Playlist",
                        "creator": "Jazz Cafe"
                    }
                ]
            }

            # Placeholder content is cached only briefly so real charts show
            # up soon after the upstream call recovers.
            ttl_seconds = 300 if used_fallback else 3600
            self.cache.set(cache_key, response, ttl_seconds=ttl_seconds)
            return response
        except Exception as e:
            print(f"Home Error: {e}")
            return {}

    def get_trending(self):
        """Return ``{"tracks": [...]}`` with the tracks of the first
        ``Trending`` shelf from ``get_home()``, or an empty track list when
        that section is missing or empty.
        """
        home = self.get_home()
        has_trending = "Trending" in home and home["Trending"]
        if not has_trending:
            return {"tracks": []}
        first_shelf = home["Trending"][0]
        return {"tracks": first_shelf["tracks"]}