# Source: kv-netflix/backend/category_scraper.py (234 lines, 9.1 KiB, Python)

"""
Category Scraper for PhimMoiChill
Orchestrates category-based crawling to build themed sections
"""
import asyncio
from typing import Dict, List, Any
from rophim_scraper import RophimScraper
from category_discovery import get_categories
class PhimMoiChillCategoryScraper:
    """
    Category-driven crawler that assembles themed movie sections.

    All fetching is delegated to a ``RophimScraper`` instance (``self.scraper``)
    through ``get_category(slug, page, limit)``; the movie objects it returns
    are converted into plain dicts before being handed to callers.

    Every dict returned by this class is a *copy* of the movie's attribute
    dict, so badges added here (or edits made by callers) never leak back
    into the scraper's own objects.
    """

    # Homepage layout as (section_key, category_slug) pairs, in output order.
    _HOME_SECTIONS = (
        # Hot -> Phim Le, page 1
        ('hot', 'danh-sach/phim-le'),
        # New Releases -> Phim Le, page 2 (for variety versus Hot)
        ('new_releases', 'danh-sach/phim-le'),
        # Series -> Phim Bo
        ('series', 'danh-sach/phim-bo'),
        # Animation -> Hoat Hinh
        ('animated', 'danh-sach/hoat-hinh'),
        # Cinema -> Phim Chieu Rap
        ('cinema', 'the-loai/phim-chieu-rap'),
        # Top 10 -> Phim Le, page 1
        ('top10', 'danh-sach/phim-le'),
        # Vietnamese
        ('vietnamese', 'quoc-gia/viet-nam'),
    )

    # Fixed badge text per homepage section. 'top10' is handled separately
    # because its badge embeds the rank.
    _SECTION_BADGES = {
        'hot': 'HOT',
        'new_releases': 'NEW',
        'cinema': 'CINEMA',
    }

    # Categories repeated on every page of the infinite-scroll feed.
    _MAIN_CATEGORIES = (
        {'title': 'Phim Hot (Movies)', 'slug': 'danh-sach/phim-le'},
        {'title': 'Phim Bộ Mới (Series)', 'slug': 'danh-sach/phim-bo'},
        {'title': 'Hoạt Hình & Anime', 'slug': 'danh-sach/hoat-hinh'},
        {'title': 'Phim Chiếu Rạp', 'slug': 'the-loai/phim-chieu-rap'},
        {'title': 'Phim Việt Nam', 'slug': 'quoc-gia/viet-nam'},
    )

    # Sub-sections shown for each dedicated view.
    _VIEW_SECTIONS = {
        'movies': (
            {'title': 'Phim Lẻ Mới', 'slug': 'danh-sach/phim-le'},
            {'title': 'Hành Động', 'slug': 'the-loai/hanh-dong'},
            {'title': 'Tình Cảm', 'slug': 'the-loai/tinh-cam'},
            {'title': 'Kinh Dị', 'slug': 'the-loai/kinh-di'},
            {'title': 'Viễn Tưởng', 'slug': 'the-loai/vien-tuong'},
            {'title': 'Hài Hước', 'slug': 'the-loai/hai-huoc'},
        ),
        'series': (
            {'title': 'Phim Bộ Mới', 'slug': 'danh-sach/phim-bo'},
            {'title': 'Hàn Quốc', 'slug': 'quoc-gia/han-quoc'},
            {'title': 'Trung Quốc', 'slug': 'quoc-gia/trung-quoc'},
            {'title': 'Âu Mỹ', 'slug': 'quoc-gia/au-my'},
            {'title': 'Thái Lan', 'slug': 'quoc-gia/thai-lan'},
        ),
        'animation': (
            {'title': 'Anime Mới', 'slug': 'danh-sach/hoat-hinh'},
            {'title': 'Học Đường', 'slug': 'the-loai/hoc-duong'},
            {'title': 'Nhật Bản', 'slug': 'quoc-gia/nhat-ban'},
        ),
        'cinema': (
            {'title': 'Phim Chiếu Rạp Hot', 'slug': 'the-loai/phim-chieu-rap'},
            {'title': 'Hành Động', 'slug': 'the-loai/hanh-dong'},
            {'title': 'Hài Hước', 'slug': 'the-loai/hai-huoc'},
        ),
    }

    _TOP10_LIMIT = 10
    _HOME_LIMIT = 42       # enough for 2-3 rows per homepage section
    _MULTI_ROW_LIMIT = 84  # larger batch for multi-row display

    def __init__(self):
        self.scraper = RophimScraper()

    async def close(self):
        """Close the underlying scraper."""
        await self.scraper.close()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _movie_to_dict(movie) -> Dict[str, Any]:
        """Return a shallow copy of ``movie``'s attribute dict."""
        # Copy on purpose: handing out the live ``__dict__`` would let any
        # later mutation (e.g. adding a badge) alter the movie object itself.
        return dict(vars(movie))

    @classmethod
    def _badge_movies(cls, key, movies) -> List[Dict[str, Any]]:
        """
        Convert ``movies`` to dicts and attach the metadata badge for ``key``.

        'top10' entries get a 1-based ``ranking`` and a ``'TOP n'`` badge;
        'hot', 'new_releases' and 'cinema' get a fixed badge; every other
        section is returned without a badge.
        """
        fixed_badge = cls._SECTION_BADGES.get(key)
        enriched = []
        for rank, movie in enumerate(movies, 1):
            entry = cls._movie_to_dict(movie)
            if key == 'top10':
                entry['ranking'] = rank
                entry['badge'] = f'TOP {rank}'
            elif fixed_badge is not None:
                entry['badge'] = fixed_badge
            enriched.append(entry)
        return enriched

    async def _fetch_titled_sections(self, categories, page: int) -> List[Dict[str, Any]]:
        """
        Fetch ``page`` of every category concurrently.

        ``categories`` is an iterable of ``{'title': ..., 'slug': ...}`` dicts.
        Returns one ``{'title', 'key', 'movies'}`` dict per category, in input
        order; categories that fail or come back empty are left out.
        """
        async def fetch_one(cat):
            try:
                movies = await self.scraper.get_category(
                    cat['slug'], page, self._MULTI_ROW_LIMIT
                )
                if not movies:
                    return None
                return {
                    'title': cat['title'],
                    'key': cat['slug'],
                    'movies': [self._movie_to_dict(m) for m in movies],
                }
            except Exception as e:
                # Best effort: one failing category must not sink the rest.
                # ``Exception`` (not a bare except) so task cancellation and
                # KeyboardInterrupt still propagate.
                print(f"Error fetching section {cat['slug']} (page {page}): {e}")
                return None

        fetched = await asyncio.gather(*(fetch_one(cat) for cat in categories))
        return [section for section in fetched if section is not None]

    # ------------------------------------------------------------------
    # Homepage
    # ------------------------------------------------------------------
    async def get_all_sections(self) -> Dict[str, List[Dict]]:
        """
        Build the complete homepage structure by crawling key categories.

        Returns a dict mapping each section key ('hot', 'new_releases',
        'series', 'animated', 'cinema', 'top10', 'vietnamese') to a list of
        movie dicts. A section whose fetch fails maps to an empty list.
        """
        # 1. Discover categories. The result is not consulted (the slugs
        #    below are fixed); the call is kept because the original comment
        #    marks it as cached -- presumably a cache warm-up, confirm before
        #    removing.
        await get_categories()

        # 2. Fetch every homepage section in parallel.
        async def fetch_section(key, slug):
            try:
                limit = self._TOP10_LIMIT if key == 'top10' else self._HOME_LIMIT
                # New Releases reads page 2 so it differs from Hot (page 1).
                page = 2 if key == 'new_releases' else 1
                movies = await self.scraper.get_category(slug, page=page, limit=limit)
                # Fallback for cinema if empty - try the action genre.
                if key == 'cinema' and not movies:
                    movies = await self.scraper.get_category(
                        'the-loai/hanh-dong', page=1, limit=limit
                    )
                return key, self._badge_movies(key, movies)
            except Exception as e:
                print(f"Error fetching section {key} ({slug}): {e}")
                return key, []

        fetched = await asyncio.gather(
            *(fetch_section(key, slug) for key, slug in self._HOME_SECTIONS)
        )
        return dict(fetched)

    # ------------------------------------------------------------------
    # Individual fetchers for specific endpoints
    # ------------------------------------------------------------------
    async def get_hot_movies(self, limit=24):
        """Return up to ``limit`` movie dicts from page 1 of 'danh-sach/phim-le'."""
        movies = await self.scraper.get_category('danh-sach/phim-le', 1, limit)
        return [self._movie_to_dict(m) for m in movies]

    async def get_new_releases(self, limit=24):
        """Return up to ``limit`` movie dicts from page 1 of 'danh-sach/phim-le'."""
        # NOTE(review): get_all_sections() reads page 2 for its new-releases
        # section; this endpoint reads page 1. Confirm which one is intended.
        movies = await self.scraper.get_category('danh-sach/phim-le', 1, limit)
        return [self._movie_to_dict(m) for m in movies]

    async def get_cinema_releases(self, limit=24):
        """
        Return up to ``limit`` movie dicts from 'the-loai/phim-chieu-rap',
        falling back to 'danh-sach/phim-le' when that category is empty.
        """
        movies = await self.scraper.get_category('the-loai/phim-chieu-rap', 1, limit)
        if not movies:
            # Fallback: Phim Le
            movies = await self.scraper.get_category('danh-sach/phim-le', 1, limit)
        return [self._movie_to_dict(m) for m in movies]

    async def get_top_10(self):
        """Return the first 10 movie dicts from page 1 of 'danh-sach/phim-le'."""
        movies = await self.scraper.get_category(
            'danh-sach/phim-le', 1, self._TOP10_LIMIT
        )
        return [self._movie_to_dict(m) for m in movies]

    # ------------------------------------------------------------------
    # Paged / per-view sections
    # ------------------------------------------------------------------
    async def get_mixed_sections(self, page: int) -> List[Dict[str, Any]]:
        """
        Fetch subsequent pages of the main categories for infinite scroll.

        Strategy: keep the same structure (Hot, Series, etc.) and load page
        ``page`` of each. Titles are deliberately not suffixed with the page
        number. Empty or failed categories are omitted from the result.
        """
        return await self._fetch_titled_sections(self._MAIN_CATEGORIES, page)

    async def get_view_sections(self, view: str, page: int) -> List[Dict[str, Any]]:
        """
        Fetch structured sections for a specific view, mimicking the main
        page design with sliders.

        ``view`` is one of 'movies', 'series', 'animation' or 'cinema'; any
        other value yields an empty list. Empty or failed sub-sections are
        omitted from the result.
        """
        sub_sections = self._VIEW_SECTIONS.get(view)
        if not sub_sections:
            return []
        return await self._fetch_titled_sections(sub_sections, page)
# Wrapper function for main.py (Sync compatibility)
def get_categories_sync() -> Dict[str, List[Dict]]:
    """
    Blocking entry point that returns all homepage category sections.

    Creates a fresh ``PhimMoiChillCategoryScraper``, drives its
    ``get_all_sections()`` coroutine to completion with ``asyncio.run`` and
    always closes the scraper afterwards. If anything raises, the error is
    printed and a dict with every section key mapped to an empty list is
    returned instead.
    """
    fallback_keys = (
        'hot', 'new_releases', 'top10',
        'cinema', 'vietnamese', 'animated', 'series',
    )

    async def _crawl_once():
        crawler = PhimMoiChillCategoryScraper()
        try:
            sections = await crawler.get_all_sections()
        finally:
            # Runs on success and on failure alike.
            await crawler.close()
        return sections

    try:
        outcome = asyncio.run(_crawl_once())
    except Exception as e:
        print(f"Sync Category Crawl Error: {e}")
        outcome = {section_key: [] for section_key in fallback_keys}
    return outcome