Some checks failed
StreamFlow CI/CD / Backend Tests (push) Has been cancelled
StreamFlow CI/CD / Backend Lint (push) Has been cancelled
StreamFlow CI/CD / Frontend Tests (push) Has been cancelled
StreamFlow CI/CD / Android TV Build (push) Has been cancelled
StreamFlow CI/CD / Docker Build (push) Has been cancelled
StreamFlow CI/CD / Docker Publish (push) Has been cancelled
237 lines
6 KiB
Go
237 lines
6 KiB
Go
package scraper
|
||
|
||
import (
|
||
"fmt"
|
||
"net/http"
|
||
"net/url"
|
||
"strconv"
|
||
"strings"
|
||
"time"
|
||
|
||
"streamflow-backend/internal/models"
|
||
|
||
"github.com/PuerkitoBio/goquery"
|
||
)
|
||
|
||
// parseEpisodeNumber extracts the episode number from an episode label such
// as "Tập 1", "Tập 01", "tap 12", "Tập12" or "Tập 5-6".
//
// The label is trimmed and lower-cased, the "tập"/"tap" marker is dropped
// whether or not a space follows it, and the leading run of ASCII digits of
// the first remaining word is parsed. The literal "full" and any label that
// does not start with a usable number yield 1.
func parseEpisodeNumber(title string) int {
	const fallback = 1

	label := strings.ToLower(strings.TrimSpace(title))
	if label == "full" {
		return fallback
	}

	// Swap the marker for a space rather than deleting "tập " so labels
	// written without a separator ("Tập12") are handled as well.
	label = strings.ReplaceAll(label, "tập", " ")
	label = strings.ReplaceAll(label, "tap", " ")

	// Fields also collapses runs of whitespace left between marker and number.
	words := strings.Fields(label)
	if len(words) == 0 {
		return fallback
	}

	// Keep only the leading digits so labels such as "5-6" or "10end"
	// resolve to their first episode number instead of failing to parse.
	first := words[0]
	end := 0
	for end < len(first) && first[end] >= '0' && first[end] <= '9' {
		end++
	}
	if end == 0 {
		return fallback
	}

	num, err := strconv.Atoi(first[:end])
	if err != nil {
		// Only digits reach Atoi, so this is an out-of-range number.
		return fallback
	}
	return num
}
|
||
|
||
// Phim30BaseURL is the scheme and host of the Phim30 site, without a trailing slash.
const Phim30BaseURL = "https://phim30.me"
|
||
|
||
// Phim30Scraper scrapes movie data from the Phim30 site using its own HTTP
// client.
type Phim30Scraper struct {
	// client is the HTTP client used to send the scraper's requests.
	client *http.Client
}
|
||
|
||
func NewPhim30Scraper() *Phim30Scraper {
|
||
return &Phim30Scraper{
|
||
client: &http.Client{
|
||
Timeout: 30 * time.Second,
|
||
},
|
||
}
|
||
}
|
||
|
||
func (p *Phim30Scraper) Search(query string, page int) ([]models.RophimMovie, error) {
|
||
searchURL := fmt.Sprintf("%s/tim-kiem?keyword=%s&page=%d", Phim30BaseURL, url.QueryEscape(query), page)
|
||
return p.scrapeMovieList(searchURL)
|
||
}
|
||
|
||
func (p *Phim30Scraper) GetMoviesByCategory(category string, page int) ([]models.RophimMovie, error) {
|
||
if category == "" || category == "home" {
|
||
homeURL := fmt.Sprintf("%s/?page=%d", Phim30BaseURL, page)
|
||
return p.scrapeMovieList(homeURL)
|
||
}
|
||
|
||
var path string
|
||
switch category {
|
||
case "phim-le", "phim-bo", "phim-sap-chieu":
|
||
path = fmt.Sprintf("danh-sach/%s", category)
|
||
default:
|
||
// Assume everything else is a Genre (e.g., hanh-dong, hoat-hinh, tv-shows)
|
||
path = fmt.Sprintf("the-loai/%s", category)
|
||
}
|
||
|
||
catURL := fmt.Sprintf("%s/%s?page=%d", Phim30BaseURL, path, page)
|
||
return p.scrapeMovieList(catURL)
|
||
}
|
||
|
||
// cleanImageUrl unwraps image URLs served through the Phim30 image proxy.
//
// Proxy URLs embed the percent-encoded original after the resize options,
// e.g. https://cdn-image-tf.phim30.me/unsafe/360x0/filters:quality(90)/https%3A%2F%2Fphimimg.com%2Fupload%2Fvod%2F...
// When rawURL is such a URL and the embedded part decodes cleanly, the decoded
// original is returned; in every other case rawURL is returned unchanged.
func cleanImageUrl(rawURL string) string {
	const (
		proxyHost = "cdn-image-tf.phim30.me"
		marker    = "/https"
	)

	if !strings.Contains(rawURL, proxyHost) {
		return rawURL
	}

	at := strings.Index(rawURL, marker)
	if at < 0 {
		return rawURL
	}

	// Skip only the slash so the embedded URL keeps its "https" scheme.
	original, err := url.QueryUnescape(rawURL[at+1:])
	if err != nil {
		return rawURL
	}
	return original
}
|
||
|
||
func (p *Phim30Scraper) scrapeMovieList(targetURL string) ([]models.RophimMovie, error) {
|
||
req, err := http.NewRequest("GET", targetURL, nil)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
|
||
|
||
resp, err := p.client.Do(req)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
defer resp.Body.Close()
|
||
|
||
if resp.StatusCode != http.StatusOK {
|
||
return nil, fmt.Errorf("phim30 returned status: %d", resp.StatusCode)
|
||
}
|
||
|
||
doc, err := goquery.NewDocumentFromReader(resp.Body)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
var movies []models.RophimMovie
|
||
|
||
doc.Find("a[href^='https://phim30.me/phim/']").Each(func(i int, s *goquery.Selection) {
|
||
href, _ := s.Attr("href")
|
||
title, _ := s.Attr("title")
|
||
|
||
if title == "" {
|
||
title = strings.TrimSpace(s.Text())
|
||
}
|
||
|
||
// Remove the base url to get the slug
|
||
slug := strings.TrimPrefix(href, "https://phim30.me/phim/")
|
||
|
||
// Try to find an image child (check data-src for lazy-loaded images)
|
||
thumb := ""
|
||
s.Find("img").Each(func(j int, img *goquery.Selection) {
|
||
src, _ := img.Attr("src")
|
||
dataSrc, _ := img.Attr("data-src")
|
||
lazySrc, _ := img.Attr("lazy-src")
|
||
if dataSrc != "" {
|
||
thumb = dataSrc
|
||
} else if lazySrc != "" {
|
||
thumb = lazySrc
|
||
} else if src != "" && !strings.Contains(src, "data:image") {
|
||
thumb = src
|
||
}
|
||
})
|
||
|
||
if title != "" && slug != "" && !strings.Contains(slug, "the-loai") && !strings.Contains(slug, "quoc-gia") && !strings.Contains(slug, "nam-phat-hanh") {
|
||
movies = append(movies, models.RophimMovie{
|
||
ID: slug,
|
||
Slug: slug,
|
||
Title: title,
|
||
OriginalTitle: title,
|
||
Thumbnail: cleanImageUrl(thumb),
|
||
Backdrop: cleanImageUrl(thumb),
|
||
Provider: "Phim30.me",
|
||
})
|
||
}
|
||
})
|
||
|
||
// Deduplicate movies because a search page might have multiple links to the same movie
|
||
var uniqueMovies []models.RophimMovie
|
||
seen := make(map[string]bool)
|
||
for _, m := range movies {
|
||
if !seen[m.Slug] {
|
||
seen[m.Slug] = true
|
||
uniqueMovies = append(uniqueMovies, m)
|
||
}
|
||
}
|
||
|
||
return uniqueMovies, nil
|
||
}
|
||
|
||
func (p *Phim30Scraper) GetMovieDetail(slug string) (*models.RophimMovie, error) {
|
||
targetURL := fmt.Sprintf("%s/phim/%s", Phim30BaseURL, slug)
|
||
req, err := http.NewRequest("GET", targetURL, nil)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
|
||
|
||
resp, err := p.client.Do(req)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
defer resp.Body.Close()
|
||
|
||
if resp.StatusCode != http.StatusOK {
|
||
return nil, fmt.Errorf("phim30 returned status: %d", resp.StatusCode)
|
||
}
|
||
|
||
doc, err := goquery.NewDocumentFromReader(resp.Body)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
movie := &models.RophimMovie{
|
||
ID: slug,
|
||
Slug: slug,
|
||
}
|
||
|
||
title := doc.Find("h1.movie-title").Text()
|
||
if title == "" {
|
||
title = doc.Find("title").Text()
|
||
title = strings.Split(title, "–")[0]
|
||
title = strings.TrimSpace(title)
|
||
}
|
||
movie.Title = title
|
||
movie.OriginalTitle = title
|
||
|
||
thumb := ""
|
||
doc.Find("div.movie-l-img img").Each(func(i int, img *goquery.Selection) {
|
||
if src, ok := img.Attr("src"); ok {
|
||
thumb = src
|
||
}
|
||
})
|
||
if thumb != "" {
|
||
movie.Thumbnail = cleanImageUrl(thumb)
|
||
movie.Backdrop = cleanImageUrl(thumb)
|
||
}
|
||
|
||
movie.Provider = "Phim30.me"
|
||
|
||
var eps []models.Episode
|
||
doc.Find("a[href*='/xem-phim/']").Each(func(i int, s *goquery.Selection) {
|
||
href, _ := s.Attr("href")
|
||
epName := strings.TrimSpace(s.Text())
|
||
|
||
if epName != "" && href != "" {
|
||
if !strings.HasPrefix(href, "http") {
|
||
href = Phim30BaseURL + href
|
||
}
|
||
eps = append(eps, models.Episode{
|
||
ServerName: "Phim30",
|
||
Title: epName,
|
||
Number: parseEpisodeNumber(epName),
|
||
URL: href,
|
||
})
|
||
}
|
||
})
|
||
|
||
if len(eps) > 0 {
|
||
movie.Episodes = eps
|
||
}
|
||
|
||
return movie, nil
|
||
}
|