246 lines
5.8 KiB
Go
246 lines
5.8 KiB
Go
package scraper
|
|
|
|
import (
|
|
"crypto/tls"
|
|
"fmt"
|
|
"net/http"
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"streamflow-backend/internal/models"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
)
|
|
|
|
const BaseURL = "https://phimmoichill.network"
|
|
|
|
type RophimScraper struct {
|
|
client *http.Client
|
|
}
|
|
|
|
func NewRophimScraper() *RophimScraper {
|
|
// Create custom client to handle SSL constraints if needed, similar to Python's ssl_context
|
|
tr := &http.Transport{
|
|
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
|
|
}
|
|
client := &http.Client{
|
|
Transport: tr,
|
|
Timeout: 30 * time.Second,
|
|
}
|
|
return &RophimScraper{client: client}
|
|
}
|
|
|
|
func (s *RophimScraper) fetchDocument(url string) (*goquery.Document, error) {
|
|
req, err := http.NewRequest("GET", url, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
|
|
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
|
|
req.Header.Set("Referer", BaseURL)
|
|
|
|
resp, err := s.client.Do(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != 200 {
|
|
return nil, fmt.Errorf("status code error: %d %s", resp.StatusCode, resp.Status)
|
|
}
|
|
|
|
return goquery.NewDocumentFromReader(resp.Body)
|
|
}
|
|
|
|
func (s *RophimScraper) GetHomepageMovies(page int, limit int) ([]models.RophimMovie, error) {
|
|
url := fmt.Sprintf("%s/danh-sach/phim-le", BaseURL)
|
|
if page > 1 {
|
|
url = fmt.Sprintf("%s/danh-sach/phim-le/page/%d", BaseURL, page)
|
|
}
|
|
|
|
doc, err := s.fetchDocument(url)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return s.parseMovieGrid(doc, limit), nil
|
|
}
|
|
|
|
func (s *RophimScraper) Search(query string, limit int) ([]models.RophimMovie, error) {
|
|
url := fmt.Sprintf("%s/tim-kiem?keyword=%s", BaseURL, query)
|
|
doc, err := s.fetchDocument(url)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return s.parseMovieGrid(doc, limit), nil
|
|
}
|
|
|
|
func (s *RophimScraper) parseMovieGrid(doc *goquery.Document, limit int) []models.RophimMovie {
|
|
var movies []models.RophimMovie
|
|
|
|
doc.Find(".myui-vodlist__box").EachWithBreak(func(i int, s *goquery.Selection) bool {
|
|
if i >= limit {
|
|
return false
|
|
}
|
|
|
|
link := s.Find("a.myui-vodlist__thumb")
|
|
if link.Length() == 0 {
|
|
link = s.Find("a[href*='/phim/']")
|
|
}
|
|
if link.Length() == 0 {
|
|
return true
|
|
}
|
|
|
|
href, _ := link.Attr("href")
|
|
slug := extractSlug(href)
|
|
if slug == "" {
|
|
return true
|
|
}
|
|
|
|
title, _ := link.Attr("title")
|
|
if title == "" {
|
|
title = s.Find("h4.title a").Text()
|
|
}
|
|
|
|
style, _ := link.Attr("style")
|
|
thumbnail := extractThumbnail(style)
|
|
if thumbnail == "" {
|
|
thumbnail, _ = s.Find("img").Attr("src")
|
|
}
|
|
|
|
quality := s.Find(".pic-tag").Text()
|
|
if quality == "" {
|
|
quality = "HD"
|
|
}
|
|
|
|
engTitle := s.Find(".text-muted").Text()
|
|
|
|
movie := models.RophimMovie{
|
|
ID: slug,
|
|
Title: strings.TrimSpace(title),
|
|
OriginalTitle: strings.TrimSpace(engTitle),
|
|
Slug: slug,
|
|
Thumbnail: normalizeURL(thumbnail),
|
|
Quality: strings.TrimSpace(quality),
|
|
Category: "movies", // Default
|
|
}
|
|
movies = append(movies, movie)
|
|
return true
|
|
})
|
|
|
|
return movies
|
|
}
|
|
|
|
func (s *RophimScraper) GetMovieDetail(slug string) (*models.RophimMovie, error) {
|
|
url := fmt.Sprintf("%s/phim/%s", BaseURL, slug)
|
|
doc, err := s.fetchDocument(url)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return s.parseMovieDetail(doc, slug), nil
|
|
}
|
|
|
|
func (s *RophimScraper) parseMovieDetail(doc *goquery.Document, slug string) *models.RophimMovie {
|
|
title := doc.Find("h1.movie-title").Text()
|
|
if title == "" {
|
|
title = doc.Find("h1").Text()
|
|
}
|
|
|
|
description := doc.Find("meta[name='description']").AttrOr("content", "")
|
|
if description == "" {
|
|
description = doc.Find(".description, .content, .film-description").Text()
|
|
}
|
|
|
|
poster := doc.Find("meta[property='og:image']").AttrOr("content", "")
|
|
|
|
// Parse Info (Year, Country, etc) - simplified for brevity
|
|
var year int
|
|
doc.Find(".movie-info li, .film-info li").Each(func(i int, s *goquery.Selection) {
|
|
text := s.Text()
|
|
if strings.Contains(text, "Năm") || strings.Contains(text, "Year") {
|
|
re := regexp.MustCompile(`\d{4}`)
|
|
if match := re.FindString(text); match != "" {
|
|
year, _ = strconv.Atoi(match)
|
|
}
|
|
}
|
|
})
|
|
|
|
// Parse Episodes
|
|
var episodes []models.Episode
|
|
doc.Find("a[href*='/tap-'], a[href*='episode'], .episode-list a").Each(func(i int, s *goquery.Selection) {
|
|
href, _ := s.Attr("href")
|
|
text := strings.TrimSpace(s.Text())
|
|
|
|
re := regexp.MustCompile(`tap-(\d+)`)
|
|
match := re.FindStringSubmatch(href)
|
|
if len(match) > 1 {
|
|
epNum, _ := strconv.Atoi(match[1])
|
|
episodes = append(episodes, models.Episode{
|
|
Number: epNum,
|
|
Title: text,
|
|
URL: normalizeURL(href),
|
|
})
|
|
}
|
|
})
|
|
|
|
// De-duplicate episodes
|
|
seen := make(map[int]bool)
|
|
var uniqueEpisodes []models.Episode
|
|
for _, ep := range episodes {
|
|
if !seen[ep.Number] {
|
|
seen[ep.Number] = true
|
|
uniqueEpisodes = append(uniqueEpisodes, ep)
|
|
}
|
|
}
|
|
|
|
return &models.RophimMovie{
|
|
ID: slug,
|
|
Title: strings.TrimSpace(title),
|
|
Slug: slug,
|
|
Thumbnail: normalizeURL(poster),
|
|
Description: strings.TrimSpace(description),
|
|
Year: year,
|
|
Episodes: uniqueEpisodes,
|
|
Category: "movies",
|
|
}
|
|
}
|
|
|
|
func extractSlug(url string) string {
|
|
re := regexp.MustCompile(`/phim/([^/?#]+)`)
|
|
matches := re.FindStringSubmatch(url)
|
|
if len(matches) > 1 {
|
|
return matches[1]
|
|
}
|
|
// Fallback
|
|
parts := strings.Split(url, "/")
|
|
if len(parts) > 0 {
|
|
return parts[len(parts)-1]
|
|
}
|
|
return ""
|
|
}
|
|
|
|
func extractThumbnail(style string) string {
|
|
re := regexp.MustCompile(`url\(([^)]+)\)`)
|
|
matches := re.FindStringSubmatch(style)
|
|
if len(matches) > 1 {
|
|
return strings.Trim(matches[1], "'\"")
|
|
}
|
|
return ""
|
|
}
|
|
|
|
func normalizeURL(url string) string {
|
|
if url == "" {
|
|
return ""
|
|
}
|
|
if strings.HasPrefix(url, "//") {
|
|
return "https:" + url
|
|
}
|
|
if strings.HasPrefix(url, "/") {
|
|
return BaseURL + url
|
|
}
|
|
return url
|
|
}
|