# kv-tiktok/backend/core/feed_service.py
import asyncio
import json
import os
import re
import time
import urllib.parse
from typing import List, Optional

import yt_dlp
from cachetools import TTLCache
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field
class VideoSchema(BaseModel):
    """Shape of a single extracted feed item (used for LLM extraction)."""

    # Direct link to the playable video content.
    url: str = Field(..., description="The URL to the video content")
    # Caption text attached to the video.
    description: str = Field(..., description="The video caption/description")
    # Creator's TikTok handle.
    author: str = Field(..., description="The username of the creator")
class FeedService:
    """Fetch TikTok feed/search pages via crawl4ai, extract video links from the
    HTML, and resolve them to direct media URLs with yt-dlp.

    Results are memoized in a short-lived class-level TTL cache so rapid
    repeated requests (e.g. client retries) do not trigger a fresh crawl.
    """

    # Class-level TTL cache for feed results (60 second expiry, max 10 entries).
    _feed_cache: TTLCache = TTLCache(maxsize=10, ttl=60)
    # Flipped to True after the first successful warmup crawl.
    _browser_warmed_up: bool = False
    # Shared crawl4ai session id so repeated crawls reuse one browser context.
    _persistent_session_id: str = "tiktok_feed_session"
    # Fallback desktop UA, used when session_metadata.json is missing/unreadable.
    _DEFAULT_USER_AGENT: str = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36"
    )
    # Maximum unique videos collected per crawl batch.
    _MAX_VIDEOS_PER_BATCH: int = 30

    def __init__(self):
        # NOTE(review): api_key is loaded but never used in this file —
        # presumably consumed by an LLM extraction path elsewhere; confirm
        # before removing.
        self.api_key = os.getenv("OPENAI_API_KEY")

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _read_json(path: str) -> Optional[dict]:
        """Best-effort read of a JSON file.

        Returns None (never raises) if the file is missing, unreadable, or
        not valid JSON — callers treat absence and failure identically.
        """
        if not os.path.exists(path):
            return None
        try:
            with open(path, "r", encoding="utf-8") as f:
                return json.load(f)
        except (OSError, ValueError) as e:
            print(f"DEBUG: Could not read {path}: {e}")
            return None

    def _load_user_agent(self) -> str:
        """Return the UA recorded at login (session_metadata.json), else the default."""
        meta = self._read_json("session_metadata.json")
        if meta:
            return meta.get("user_agent", self._DEFAULT_USER_AGENT)
        return self._DEFAULT_USER_AGENT

    @staticmethod
    def _build_crawl_cookies(cookie_dict: dict) -> List[dict]:
        """Convert a flat name->value cookie dict into crawl4ai cookie records."""
        return [
            {"name": k, "value": v, "domain": ".tiktok.com", "path": "/"}
            for k, v in cookie_dict.items()
        ]

    def _extract_video_links(self, html: str, own_user_id: Optional[str]) -> List[dict]:
        """Scan raw HTML for /@author/video/<id> links.

        Dedupes by video id, skips the logged-in user's own uploads, and caps
        the batch at _MAX_VIDEOS_PER_BATCH items.
        """
        # TikTok uses relative URLs like /@username/video/1234567890
        matches = re.findall(r'/@([a-zA-Z0-9_.]+)/video/(\d+)', html)
        seen_ids: set = set()
        unique: List[dict] = []
        skipped_own = 0
        for author, video_id in matches:
            # NOTE(review): this compares the author *username* against the
            # living_user_id cookie (a numeric id) — it likely never matches;
            # confirm which identifier TikTok embeds in these links.
            if own_user_id and author == own_user_id:
                skipped_own += 1
                continue
            if video_id not in seen_ids:
                seen_ids.add(video_id)
                unique.append({
                    "url": f"https://www.tiktok.com/@{author}/video/{video_id}",
                    "author": author,
                    "description": f"Video by @{author}",
                })
                if len(unique) >= self._MAX_VIDEOS_PER_BATCH:
                    break
        if skipped_own > 0:
            print(f"DEBUG: Skipped {skipped_own} videos from own account")
        print(f"DEBUG: Found {len(unique)} unique videos in HTML")
        return unique

    @staticmethod
    def _extract_sigi_videos(html: str) -> List[dict]:
        """Fallback extraction: parse the SIGI_STATE JSON blob TikTok embeds."""
        match = re.search(
            r'<script id="SIGI_STATE" type="application/json">(.+?)</script>',
            html,
            re.DOTALL,
        )
        if not match:
            return []
        videos: List[dict] = []
        try:
            sigi_data = json.loads(match.group(1))
            items = sigi_data.get("ItemModule", {})
            for item_id, item_data in list(items.items())[:10]:
                author = item_data.get("author", "unknown")
                desc = item_data.get("desc", "")
                videos.append({
                    "url": f"https://www.tiktok.com/@{author}/video/{item_id}",
                    "author": author,
                    "description": desc or f"Video by @{author}",
                })
        except (ValueError, AttributeError, TypeError) as e:
            print(f"DEBUG: Failed to parse SIGI_STATE: {e}")
        return videos

    @staticmethod
    def _dump_debug_html(html: str) -> None:
        """Persist the crawled HTML to disk for offline inspection (best effort)."""
        try:
            with open("debug_tiktok.html", "w", encoding="utf-8") as f:
                f.write(html)
            print("DEBUG: Saved HTML to debug_tiktok.html")
        except OSError:
            # Debug artifact only — never let it break the request.
            pass

    async def _resolve_many(self, videos: List[dict]) -> List[dict]:
        """Resolve each item's page URL to a direct media URL in parallel,
        dropping items whose resolution fails."""
        async def resolve_item(item: dict) -> Optional[dict]:
            direct_url = await self._resolve_video_url(item["url"])
            if direct_url:
                item["url"] = direct_url
                return item
            return None

        resolved = await asyncio.gather(*(resolve_item(i) for i in videos))
        return [item for item in resolved if item]

    # --------------------------------------------------------------- public API

    async def warmup(self):
        """Pre-warm the browser session on startup for faster first request.

        Failures are logged and swallowed: warmup is an optimization, not a
        prerequisite for serving requests.
        """
        if FeedService._browser_warmed_up:
            return
        print("DEBUG: Warming up browser session...")
        try:
            browser_config = BrowserConfig(headless=True, java_script_enabled=True)
            run_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                session_id=FeedService._persistent_session_id,
                wait_until="domcontentloaded",
            )
            async with AsyncWebCrawler(config=browser_config) as crawler:
                await crawler.arun(url="https://www.tiktok.com", config=run_config)
            FeedService._browser_warmed_up = True
            print("DEBUG: Browser session warmed up successfully!")
        except Exception as e:
            print(f"DEBUG: Warmup failed (non-critical): {e}")

    async def _resolve_video_url(self, url: str) -> Optional[str]:
        """Resolve a TikTok page URL to a direct media URL using yt-dlp.

        Returns None on any failure so callers can simply drop the item.
        """
        cookies = self._read_json("cookies.json") or {}
        cookie_header = "; ".join(f"{k}={v}" for k, v in cookies.items())

        ydl_opts = {
            'quiet': True,
            'no_warnings': True,
            'format': 'best',
            'socket_timeout': 10,
        }
        # Only attach custom headers when we actually have cookies; the
        # original passed http_headers=None in that case, which yt-dlp may
        # not accept — omitting the key is the safe equivalent.
        if cookie_header:
            ydl_opts['http_headers'] = {
                'Cookie': cookie_header,
                'User-Agent': self._load_user_agent(),
            }
        try:
            # get_running_loop() (not the deprecated get_event_loop()) — we
            # are always inside a coroutine here.
            loop = asyncio.get_running_loop()
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                # extract_info is blocking; push it onto the default executor.
                info = await loop.run_in_executor(
                    None, lambda: ydl.extract_info(url, download=False)
                )
            return info.get('url') if info else None
        except Exception as e:
            print(f"Failed to resolve URL {url}: {e}")
            return None

    async def get_feed(self, source_url: str = "https://www.tiktok.com/foryou",
                       skip_cache: bool = False) -> List[dict]:
        """Crawl *source_url*, extract video items, and resolve direct media URLs.

        Args:
            source_url: Feed or search page to crawl.
            skip_cache: Bypass the TTL cache (used for infinite scroll).

        Returns:
            List of dicts with keys ``url`` (direct media URL), ``author``,
            and ``description``; empty list on any failure.
        """
        # Check cache first (unless skip_cache is True for infinite scroll).
        cache_key = source_url
        if not skip_cache and cache_key in FeedService._feed_cache:
            print(f"DEBUG: Returning cached results for {source_url}")
            return FeedService._feed_cache[cache_key]

        # 1. Load cookies and the logged-in user's id (to filter own videos).
        cookie_dict = self._read_json("cookies.json") or {}
        own_user_id = cookie_dict.get("living_user_id")
        crawl_cookies = self._build_crawl_cookies(cookie_dict)
        if crawl_cookies:
            print(f"DEBUG: Loaded {len(crawl_cookies)} cookies. User ID: {own_user_id}")

        # 2. Configure the crawler with the session UA so the browser
        #    fingerprint matches the one recorded at login.
        browser_config = BrowserConfig(
            headless=True,
            java_script_enabled=True,
            cookies=crawl_cookies if crawl_cookies else None,
            headers={"User-Agent": self._load_user_agent()},
        )
        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            session_id=FeedService._persistent_session_id,
            # Aggressive scrolling to load many videos (12 scrolls = ~30+ videos).
            js_code="""
            // Scroll aggressively to load ~30 videos
            for (let i = 0; i < 12; i++) {
                window.scrollBy(0, 1500);
                await new Promise(r => setTimeout(r, 800));
            }
            """,
            wait_for="body",
            wait_until="domcontentloaded",
            delay_before_return_html=10.0,
            page_timeout=60000,
            magic=True,
        )

        try:
            print(f"DEBUG: Starting crawl for: {source_url}")
            async with AsyncWebCrawler(config=browser_config) as crawler:
                # Outer wait_for guards against the crawler hanging past its
                # own page_timeout.
                result = await asyncio.wait_for(
                    crawler.arun(url=source_url, config=run_config),
                    timeout=90.0,
                )
            print(f"DEBUG: Crawl Success: {result.success}")
            if not result.success:
                print(f"DEBUG: Crawl Error: {result.error_message}")
                return []

            html = result.html if result.html else ""
            print(f"DEBUG: HTML length: {len(html)} characters")
            self._dump_debug_html(html)

            # Primary path: direct link scan; fallback: embedded SIGI_STATE JSON.
            videos = self._extract_video_links(html, own_user_id)
            if not videos:
                print("DEBUG: No video IDs found in HTML, trying SIGI_STATE...")
                videos = self._extract_sigi_videos(html)
            if not videos:
                return []

            print(f"DEBUG: Resolving direct URLs for {len(videos)} videos...")
            final_results = await self._resolve_many(videos)
            if final_results:
                FeedService._feed_cache[cache_key] = final_results
                print(f"DEBUG: Cached {len(final_results)} videos")
            return final_results
        except asyncio.TimeoutError:
            print("DEBUG: Crawl timed out after 90s")
            return []
        except Exception as e:
            print(f"DEBUG: Crawl process failed: {e}")
            return []

    async def search_videos(self, query: str) -> List[dict]:
        """Search TikTok for *query* and return resolved video items.

        The query is URL-encoded (the original interpolated it raw, so spaces
        and special characters produced a malformed search URL).
        """
        search_url = f"https://www.tiktok.com/search?q={urllib.parse.quote_plus(query)}"
        return await self.get_feed(source_url=search_url)

    async def check_cookie_health(self) -> bool:
        """Cheap cookie-validity heuristic: cookies.json exists and is non-empty.

        TODO(review): hit an authenticated endpoint (e.g.
        https://www.tiktok.com/api/user/detail/) for a real validity check.
        """
        if not os.path.exists("cookies.json"):
            return False
        return os.path.getsize("cookies.json") > 0
feed_service = FeedService()