328 lines
11 KiB
Python
328 lines
11 KiB
Python
"""
|
|
Category Discovery Module for PhimMoiChill
|
|
Automatically discovers and maps all available categories
|
|
"""
|
|
import asyncio
|
|
import aiohttp
|
|
import ssl
|
|
from bs4 import BeautifulSoup
|
|
from dataclasses import dataclass, asdict
|
|
from typing import List, Dict, Optional
|
|
from urllib.parse import urljoin
|
|
|
|
# Root URL of the PhimMoiChill site: fetched as the page to scan for
# category links, and used as the base onto which category paths are joined.
BASE_URL = "https://phimmoichill.network"
|
|
|
|
@dataclass
class Category:
    """A single browsable category of the site.

    Fields are declared in the order positional construction relies on:
    ``id``, ``name``, ``slug``, ``type``, ``url``; ``parent`` and
    ``movie_count`` are optional and default to ``None`` and ``0``.
    """

    # Identifier of the category (the discovery code stores the bare slug here).
    id: str
    # Human-readable label.
    name: str
    # Site-relative path of the category, without a leading slash.
    slug: str
    # Kind of category.
    type: str  # 'type', 'genre', 'country', 'year'
    # Absolute URL of the category page.
    url: str
    # Optional identifier of a parent category.
    parent: Optional[str] = None
    # Number of movies in the category; 0 unless a caller sets it.
    movie_count: int = 0

    def to_dict(self) -> dict:
        """Return every field of this category as a plain ``dict``."""
        as_mapping = asdict(self)
        return as_mapping
|
|
|
|
|
|
class CategoryDiscovery:
    """Discovers categories from PhimMoiChill navigation.

    The home page is fetched once and its anchor tags are scanned for
    content-type, genre and country links; year categories are generated
    locally.  Hard-coded fallback lists are used whenever a scan finds
    nothing or the discovery as a whole fails.
    """

    def __init__(self):
        # Created lazily by _get_session() and released by close().
        self.session: Optional[aiohttp.ClientSession] = None
        # Browser-like request headers sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'vi-VN,vi;q=0.9,en-US;q=0.8,en;q=0.7',
        }

    async def _get_session(self) -> aiohttp.ClientSession:
        """Return the shared HTTP session, creating it on first use."""
        if self.session is None:
            # NOTE(security): certificate and hostname verification are
            # switched off for every request made through this session, so
            # the fetched HTML is not authenticated.  Kept as-is because it
            # looks deliberate -- confirm it is still required.
            ssl_context = ssl.create_default_context()
            ssl_context.check_hostname = False
            ssl_context.verify_mode = ssl.CERT_NONE
            connector = aiohttp.TCPConnector(ssl=ssl_context)
            self.session = aiohttp.ClientSession(headers=self.headers, connector=connector)
        return self.session

    async def close(self):
        """Close the HTTP session, if one was opened, and forget it."""
        if self.session:
            await self.session.close()
            self.session = None

    async def _fetch_html(self, url: str) -> str:
        """Fetch ``url`` and return the response body as text.

        Raises:
            Exception: if the response status is anything other than 200.
        """
        session = await self._get_session()
        async with session.get(url) as response:
            if response.status != 200:
                raise Exception(f"Failed to fetch {url}: {response.status}")
            return await response.text()

    async def discover_all_categories(self) -> Dict[str, List[Category]]:
        """Discover all categories from PhimMoiChill.

        Returns:
            A dict with the keys ``'types'``, ``'genres'``, ``'countries'``
            and ``'years'``, each mapping to a list of Category objects.
            If anything raises during fetching or parsing, the error is
            printed and the complete hard-coded fallback is returned.
        """
        try:
            html = await self._fetch_html(BASE_URL)
            soup = BeautifulSoup(html, 'lxml')
            return {
                # Main types (phim-le, phim-bo, etc.)
                'types': await self._discover_main_types(soup),
                # Genres (the-loai/*)
                'genres': await self._discover_genres(soup),
                # Countries (quoc-gia/*)
                'countries': await self._discover_countries(soup),
                # Years are generated, not scraped.
                'years': self._generate_year_categories(),
            }
        except Exception as e:
            # Best-effort boundary: any failure degrades to the fallback data.
            print(f"Error discovering categories: {e}")
            return self._get_fallback_categories()

    @staticmethod
    def _build_category(prefix: str, slug: str, name: str, category_type: str) -> Category:
        """Build a Category whose path is ``<prefix>/<slug>`` under BASE_URL."""
        return Category(
            id=slug,
            name=name,
            slug=f'{prefix}/{slug}',
            type=category_type,
            url=urljoin(BASE_URL, f'/{prefix}/{slug}'),
        )

    def _collect_linked_categories(self, soup: BeautifulSoup, prefix: str,
                                   category_type: str) -> List[Category]:
        """Collect one Category per distinct ``<prefix>/<slug>`` link in ``soup``.

        Both hrefs containing ``/<prefix>/`` and relative hrefs starting
        with ``<prefix>/`` are accepted.  The slug is the path segment right
        after the prefix, with any query string or fragment removed.  The
        first link seen for a slug provides its display name; when the link
        has no text, a title-cased version of the slug is used instead.
        """
        marker = f'/{prefix}/'
        relative_start = f'{prefix}/'
        found = []
        seen = set()

        for link in soup.select(f'a[href*="{prefix}/"]'):
            href = link.get('href', '')
            if marker in href:
                tail = href.split(marker)[-1]
            elif href.startswith(relative_start):
                tail = href[len(relative_start):]
            else:
                # The selector is a substring match, so e.g. "xyz<prefix>/a"
                # can get here; it is not a category link.
                continue

            slug = tail.split('/')[0].split('?')[0].split('#')[0]
            if not slug or slug in seen:
                continue
            seen.add(slug)

            text = link.get_text(strip=True)
            label = text or slug.replace('-', ' ').title()
            found.append(self._build_category(prefix, slug, label, category_type))

        return found

    async def _discover_main_types(self, soup: BeautifulSoup) -> List[Category]:
        """Discover main content types from the navigation links.

        A link counts as a type link when its href contains one of the known
        type slugs.  Each type is reported once even if several navigation
        links point at it; the first such link supplies the display name.
        If no link matches, all known types are returned with their default
        names.
        """
        # Known type patterns
        type_patterns = {
            'phim-le': 'Movies',
            'phim-bo': 'TV Series',
            'tv-shows': 'TV Shows',
            'hoat-hinh': 'Animation',
        }

        types = []
        seen_types = set()

        # Look for navigation menu with main types
        for link in soup.select('nav a, .menu a, .navigation a'):
            href = link.get('href', '')
            text = link.get_text(strip=True)

            for slug, name in type_patterns.items():
                if slug in href:
                    # Header and footer menus commonly repeat the same link;
                    # keep only the first occurrence of each type.
                    if slug not in seen_types:
                        seen_types.add(slug)
                        types.append(self._build_category('danh-sach', slug, text or name, 'type'))
                    # Only the first matching pattern is considered per link.
                    break

        # Ensure we have at least the basic types
        if not types:
            types = [
                self._build_category('danh-sach', slug, name, 'type')
                for slug, name in type_patterns.items()
            ]

        return types

    async def _discover_genres(self, soup: BeautifulSoup) -> List[Category]:
        """Discover genre categories (``the-loai/*`` links).

        Falls back to the hard-coded genre list when no link is found.
        """
        genres = self._collect_linked_categories(soup, 'the-loai', 'genre')
        return genres or self._get_fallback_genres()

    async def _discover_countries(self, soup: BeautifulSoup) -> List[Category]:
        """Discover country categories (``quoc-gia/*`` links).

        Falls back to the hard-coded country list when no link is found.
        """
        countries = self._collect_linked_categories(soup, 'quoc-gia', 'country')
        return countries or self._get_fallback_countries()

    def _generate_year_categories(self) -> List[Category]:
        """Generate year categories for the current year and the nine before it.

        The list is ordered newest first; the current year is taken from the
        local system clock.
        """
        from datetime import datetime

        current_year = datetime.now().year
        return [
            self._build_category('nam', str(year), str(year), 'year')
            for year in range(current_year, current_year - 10, -1)
        ]

    def _get_fallback_genres(self) -> List[Category]:
        """Fallback genres if discovery fails."""
        genres_map = {
            'hanh-dong': 'Action',
            'kinh-di': 'Horror',
            'tinh-cam': 'Romance',
            'hai-huoc': 'Comedy',
            'vien-tuong': 'Sci-Fi',
            'phieu-luu': 'Adventure',
            'bi-an': 'Mystery',
            'chien-tranh': 'War',
            'tam-ly': 'Psychological',
            'gia-dinh': 'Family',
        }
        return [
            self._build_category('the-loai', slug, name, 'genre')
            for slug, name in genres_map.items()
        ]

    def _get_fallback_countries(self) -> List[Category]:
        """Fallback countries if discovery fails."""
        countries_map = {
            'my': 'United States',
            'han-quoc': 'South Korea',
            'nhat-ban': 'Japan',
            'trung-quoc': 'China',
            'thai-lan': 'Thailand',
            'au-my': 'Europe & Americas',
            'viet-nam': 'Vietnam',
        }
        return [
            self._build_category('quoc-gia', slug, name, 'country')
            for slug, name in countries_map.items()
        ]

    def _get_fallback_categories(self) -> Dict[str, List[Category]]:
        """Complete fallback if discovery fails."""
        # NOTE(review): 'tv-shows' is a known type in _discover_main_types but
        # is not listed here -- confirm whether that omission is intentional.
        fallback_types = (
            ('phim-le', 'Movies'),
            ('phim-bo', 'TV Series'),
            ('hoat-hinh', 'Animation'),
        )
        return {
            'types': [
                self._build_category('danh-sach', slug, name, 'type')
                for slug, name in fallback_types
            ],
            'genres': self._get_fallback_genres(),
            'countries': self._get_fallback_countries(),
            'years': self._generate_year_categories(),
        }
|
|
|
|
|
|
# Singleton instance
|
|
# Starts out as None; no code visible in this module ever assigns it, so it
# currently acts only as a placeholder.
_discovery_instance = None
|
|
|
|
async def get_categories() -> Dict[str, List[Dict]]:
    """Discover all categories and return them as plain dictionaries.

    A new CategoryDiscovery is created for each call and is always closed
    before returning, even when discovery raises.  Nothing is cached between
    calls: every call performs a fresh discovery.

    Returns:
        A dict keyed by category group, each value being a list of
        per-category dicts produced by ``Category.to_dict()``.
    """
    discovery = CategoryDiscovery()
    try:
        categories = await discovery.discover_all_categories()
        # Convert to dict format
        return {
            key: [cat.to_dict() for cat in cat_list]
            for key, cat_list in categories.items()
        }
    finally:
        # Release the HTTP session regardless of the outcome.
        await discovery.close()
|
|
|
|
|
|
def get_categories_sync() -> Dict[str, List[Dict]]:
    """Blocking entry point: run ``get_categories()`` via ``asyncio.run``.

    Returns whatever ``get_categories()`` returns.
    """
    pending = get_categories()
    return asyncio.run(pending)
|
|
|
|
|
|
# Manual smoke test: discover the categories and print a short summary.
if __name__ == "__main__":
    import json

    # Horizontal rule reused around the report header and footer.
    banner = "="*50

    print("Discovering categories from PhimMoiChill...")
    categories = get_categories_sync()

    print("\n" + banner)
    print("DISCOVERED CATEGORIES")
    print(banner)

    for cat_type, cat_list in categories.items():
        print(f"\n{cat_type.upper()}: {len(cat_list)} categories")
        # Preview only the first five entries of each group.
        preview = cat_list[:5]
        for cat in preview:
            print(f" - {cat['name']} ({cat['slug']})")
        if len(cat_list) > 5:
            print(f" ... and {len(cat_list) - 5} more")

    print("\n" + banner)
    print(f"Total categories: {sum(len(cats) for cats in categories.values())}")
    print(banner)
|