kv-netflix/backend/category_discovery.py

328 lines
11 KiB
Python

"""
Category Discovery Module for PhimMoiChill
Automatically discovers and maps all available categories
"""
import asyncio
import aiohttp
import ssl
from bs4 import BeautifulSoup
from dataclasses import dataclass, asdict
from typing import List, Dict, Optional
from urllib.parse import urljoin
BASE_URL = "https://phimmoichill.network"
@dataclass
class Category:
"""Category metadata"""
id: str
name: str
slug: str
type: str # 'type', 'genre', 'country', 'year'
url: str
parent: Optional[str] = None
movie_count: int = 0
def to_dict(self):
return asdict(self)
class CategoryDiscovery:
"""Discovers categories from PhimMoiChill navigation"""
def __init__(self):
self.session: Optional[aiohttp.ClientSession] = None
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'vi-VN,vi;q=0.9,en-US;q=0.8,en;q=0.7',
}
async def _get_session(self) -> aiohttp.ClientSession:
if not self.session:
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE
connector = aiohttp.TCPConnector(ssl=ssl_context)
self.session = aiohttp.ClientSession(headers=self.headers, connector=connector)
return self.session
async def close(self):
if self.session:
await self.session.close()
self.session = None
async def _fetch_html(self, url: str) -> str:
"""Fetch HTML content"""
session = await self._get_session()
async with session.get(url) as response:
if response.status == 200:
return await response.text()
raise Exception(f"Failed to fetch {url}: {response.status}")
async def discover_all_categories(self) -> Dict[str, List[Category]]:
"""
Discover all categories from PhimMoiChill
Returns organized structure of categories
"""
try:
html = await self._fetch_html(BASE_URL)
soup = BeautifulSoup(html, 'lxml')
categories = {
'types': [],
'genres': [],
'countries': [],
'years': []
}
# Discover main types (phim-le, phim-bo, etc.)
categories['types'] = await self._discover_main_types(soup)
# Discover genres (the-loai/*)
categories['genres'] = await self._discover_genres(soup)
# Discover countries (quoc-gia/*)
categories['countries'] = await self._discover_countries(soup)
# Generate year categories
categories['years'] = self._generate_year_categories()
return categories
except Exception as e:
print(f"Error discovering categories: {e}")
return self._get_fallback_categories()
async def _discover_main_types(self, soup: BeautifulSoup) -> List[Category]:
"""Discover main content types"""
types = []
# Look for navigation menu with main types
nav_links = soup.select('nav a, .menu a, .navigation a')
# Known type patterns
type_patterns = {
'phim-le': 'Movies',
'phim-bo': 'TV Series',
'tv-shows': 'TV Shows',
'hoat-hinh': 'Animation'
}
for link in nav_links:
href = link.get('href', '')
text = link.get_text(strip=True)
for slug, name in type_patterns.items():
if slug in href:
types.append(Category(
id=slug,
name=text or name,
slug=f'danh-sach/{slug}',
type='type',
url=urljoin(BASE_URL, f'/danh-sach/{slug}')
))
break
# Ensure we have at least the basic types
if not types:
for slug, name in type_patterns.items():
types.append(Category(
id=slug,
name=name,
slug=f'danh-sach/{slug}',
type='type',
url=urljoin(BASE_URL, f'/danh-sach/{slug}')
))
return types
async def _discover_genres(self, soup: BeautifulSoup) -> List[Category]:
"""Discover genre categories"""
genres = []
# Look for genre menu/dropdown
genre_links = soup.select('a[href*="the-loai/"]')
seen_genres = set()
for link in genre_links:
href = link.get('href', '')
text = link.get_text(strip=True)
# Extract genre slug from URL
if '/the-loai/' in href:
slug = href.split('/the-loai/')[-1].split('/')[0].split('?')[0]
if slug and slug not in seen_genres:
seen_genres.add(slug)
genres.append(Category(
id=slug,
name=text or slug.replace('-', ' ').title(),
slug=f'the-loai/{slug}',
type='genre',
url=urljoin(BASE_URL, f'/the-loai/{slug}')
))
# Fallback: common genres
if not genres:
genres = self._get_fallback_genres()
return genres
async def _discover_countries(self, soup: BeautifulSoup) -> List[Category]:
"""Discover country categories"""
countries = []
# Look for country menu/dropdown
country_links = soup.select('a[href*="quoc-gia/"]')
seen_countries = set()
for link in country_links:
href = link.get('href', '')
text = link.get_text(strip=True)
# Extract country slug from URL
if '/quoc-gia/' in href:
slug = href.split('/quoc-gia/')[-1].split('/')[0].split('?')[0]
if slug and slug not in seen_countries:
seen_countries.add(slug)
countries.append(Category(
id=slug,
name=text or slug.replace('-', ' ').title(),
slug=f'quoc-gia/{slug}',
type='country',
url=urljoin(BASE_URL, f'/quoc-gia/{slug}')
))
# Fallback: common countries
if not countries:
countries = self._get_fallback_countries()
return countries
def _generate_year_categories(self) -> List[Category]:
"""Generate year-based categories"""
from datetime import datetime
current_year = datetime.now().year
years = []
for year in range(current_year, current_year - 10, -1):
years.append(Category(
id=str(year),
name=str(year),
slug=f'nam/{year}',
type='year',
url=urljoin(BASE_URL, f'/nam/{year}')
))
return years
def _get_fallback_genres(self) -> List[Category]:
"""Fallback genres if discovery fails"""
genres_map = {
'hanh-dong': 'Action',
'kinh-di': 'Horror',
'tinh-cam': 'Romance',
'hai-huoc': 'Comedy',
'vien-tuong': 'Sci-Fi',
'phieu-luu': 'Adventure',
'bi-an': 'Mystery',
'chien-tranh': 'War',
'tam-ly': 'Psychological',
'gia-dinh': 'Family'
}
return [
Category(
id=slug,
name=name,
slug=f'the-loai/{slug}',
type='genre',
url=urljoin(BASE_URL, f'/the-loai/{slug}')
)
for slug, name in genres_map.items()
]
def _get_fallback_countries(self) -> List[Category]:
"""Fallback countries if discovery fails"""
countries_map = {
'my': 'United States',
'han-quoc': 'South Korea',
'nhat-ban': 'Japan',
'trung-quoc': 'China',
'thai-lan': 'Thailand',
'au-my': 'Europe & Americas',
'viet-nam': 'Vietnam'
}
return [
Category(
id=slug,
name=name,
slug=f'quoc-gia/{slug}',
type='country',
url=urljoin(BASE_URL, f'/quoc-gia/{slug}')
)
for slug, name in countries_map.items()
]
def _get_fallback_categories(self) -> Dict[str, List[Category]]:
"""Complete fallback if discovery fails"""
return {
'types': [
Category('phim-le', 'Movies', 'danh-sach/phim-le', 'type', f'{BASE_URL}/danh-sach/phim-le'),
Category('phim-bo', 'TV Series', 'danh-sach/phim-bo', 'type', f'{BASE_URL}/danh-sach/phim-bo'),
Category('hoat-hinh', 'Animation', 'danh-sach/hoat-hinh', 'type', f'{BASE_URL}/danh-sach/hoat-hinh'),
],
'genres': self._get_fallback_genres(),
'countries': self._get_fallback_countries(),
'years': self._generate_year_categories()
}
# Singleton instance
_discovery_instance = None
async def get_categories() -> Dict[str, List[Dict]]:
"""Get all categories (cached)"""
global _discovery_instance
discovery = CategoryDiscovery()
try:
categories = await discovery.discover_all_categories()
# Convert to dict format
return {
key: [cat.to_dict() for cat in cat_list]
for key, cat_list in categories.items()
}
finally:
await discovery.close()
def get_categories_sync() -> Dict[str, List[Dict]]:
"""Synchronous wrapper for getting categories"""
return asyncio.run(get_categories())
# CLI testing
if __name__ == "__main__":
import json
print("Discovering categories from PhimMoiChill...")
categories = get_categories_sync()
print("\n" + "="*50)
print("DISCOVERED CATEGORIES")
print("="*50)
for cat_type, cat_list in categories.items():
print(f"\n{cat_type.upper()}: {len(cat_list)} categories")
for cat in cat_list[:5]: # Show first 5
print(f" - {cat['name']} ({cat['slug']})")
if len(cat_list) > 5:
print(f" ... and {len(cat_list) - 5} more")
print("\n" + "="*50)
print(f"Total categories: {sum(len(cats) for cats in categories.values())}")
print("="*50)