kv-tiktok-download/crawlers/hybrid/hybrid_crawler.py

320 lines
14 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# ==============================================================================
# Copyright (C) 2021 Evil0ctal
#
# This file is part of the Douyin_TikTok_Download_API project.
#
# This project is licensed under the Apache License 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#         __
#        />  フ
#       |  _  _ l
#       ` ミ_x
#      /      | Feed me Stars ⭐
#     /  ヽ   ノ
#     │  | | |
#  / ̄|   | | |
#  | ( ̄ヽ__ヽ_)__)
#  \二つ
# ==============================================================================
#
# Contributor Link:
# - https://github.com/Evil0ctal
#
# ==============================================================================
import asyncio
import re
import httpx
from crawlers.douyin.web.web_crawler import DouyinWebCrawler # 导入抖音Web爬虫
from crawlers.tiktok.web.web_crawler import TikTokWebCrawler # 导入TikTok Web爬虫
from crawlers.tiktok.app.app_crawler import TikTokAPPCrawler # 导入TikTok App爬虫
from crawlers.bilibili.web.web_crawler import BilibiliWebCrawler # 导入Bilibili Web爬虫
class HybridCrawler:
    """Route a share URL to the Douyin, TikTok or Bilibili crawler.

    The platform is chosen by substring match on the URL. The result is either
    the raw platform payload or, when ``minimal=True``, a normalised dictionary
    with the same top-level keys for every platform.
    """

    # Pattern used to pull the BV id out of a (resolved) Bilibili URL.
    _BV_ID_PATTERN = re.compile(r'(?:video\/|\/)(BV[A-Za-z0-9]+)')

    # Maps the numeric ``aweme_type`` reported by a platform to the media kind
    # used in the minimal result. Unknown codes are treated as 'video'.
    _URL_TYPE_BY_AWEME_TYPE = {
        # common
        0: 'video',
        # Douyin
        2: 'image',
        4: 'video',
        68: 'image',
        # TikTok
        51: 'video',
        55: 'video',
        58: 'video',
        61: 'video',
        150: 'image',
    }

    def __init__(self):
        """Instantiate one crawler per supported platform."""
        self.DouyinWebCrawler = DouyinWebCrawler()
        self.TikTokWebCrawler = TikTokWebCrawler()
        self.TikTokAPPCrawler = TikTokAPPCrawler()
        self.BilibiliWebCrawler = BilibiliWebCrawler()

    async def get_bilibili_bv_id(self, url: str) -> str:
        """Extract the BV id from a Bilibili URL, resolving b23.tv short links.

        Args:
            url: A Bilibili video URL or a ``b23.tv`` short link.

        Returns:
            The BV id (for example ``"BV1xx411c7mD"``).

        Raises:
            ValueError: If no BV id can be found in the (resolved) URL.
        """
        # Short links carry no BV id themselves; follow the redirect chain
        # and inspect the final URL instead.
        if "b23.tv" in url:
            async with httpx.AsyncClient() as client:
                response = await client.head(url, follow_redirects=True)
                url = str(response.url)

        match = self._BV_ID_PATTERN.search(url)
        if match is None:
            raise ValueError(f"Cannot extract BV ID from URL: {url}")
        return match.group(1)

    async def hybrid_parsing_single_video(self, url: str, minimal: bool = False):
        """Fetch one post from Douyin, TikTok or Bilibili.

        Args:
            url: Share URL. The platform is detected by the substrings
                ``"douyin"``, ``"tiktok"``, ``"bilibili"`` / ``"b23.tv"``,
                checked in that order.
            minimal: When false (default) the platform payload is returned
                as-is. When true a normalised dictionary is returned with the
                keys ``type``, ``platform``, ``video_id``, ``desc``,
                ``create_time``, ``author``, ``music``, ``statistics``,
                ``cover_data``, ``hashtags`` plus ``video_data`` or
                ``image_data``.

        Returns:
            The raw payload or the normalised dictionary, see ``minimal``.

        Raises:
            ValueError: If the URL matches none of the supported platforms,
                or a Bilibili URL contains no BV id.
        """
        if "douyin" in url:
            platform = "douyin"
            aweme_id = await self.DouyinWebCrawler.get_aweme_id(url)
            data = await self.DouyinWebCrawler.fetch_one_video(aweme_id)
            data = data.get("aweme_detail")
            # $.aweme_detail.aweme_type
            aweme_type = data.get("aweme_type")
        elif "tiktok" in url:
            platform = "tiktok"
            aweme_id = await self.TikTokWebCrawler.get_aweme_id(url)
            # 2024-09-14: the post itself is fetched through TikTokAPPCrawler
            # instead of TikTokWebCrawler; only the id still comes from the
            # web crawler.
            data = await self.TikTokAPPCrawler.fetch_one_video(aweme_id)
            aweme_type = data.get("aweme_type")
        elif "bilibili" in url or "b23.tv" in url:
            platform = "bilibili"
            # The BV id plays the role of the unified video id.
            aweme_id = await self.get_bilibili_bv_id(url)
            response = await self.BilibiliWebCrawler.fetch_one_video(aweme_id)
            data = response.get('data', {})
            # Bilibili only serves videos, so the type code is fixed to 0.
            aweme_type = 0
        else:
            raise ValueError("hybrid_parsing_single_video: Cannot judge the video source from the URL.")

        if not minimal:
            return data

        url_type = self._URL_TYPE_BY_AWEME_TYPE.get(aweme_type, 'video')

        # Platform-independent part of the result. Bilibili uses different
        # field names and has no music / hashtag fields.
        if platform == 'bilibili':
            result_data = {
                'type': url_type,
                'platform': platform,
                'video_id': aweme_id,
                'desc': data.get("title"),
                'create_time': data.get("pubdate"),
                'author': data.get("owner"),
                'music': None,
                'statistics': data.get("stat"),
                'cover_data': {},  # filled in below
                'hashtags': None,
            }
        else:
            result_data = {
                'type': url_type,
                'platform': platform,
                'video_id': aweme_id,  # aweme_id or bv_id depending on platform
                'desc': data.get("desc"),
                'create_time': data.get("create_time"),
                'author': data.get("author"),
                'music': data.get("music"),
                'statistics': data.get("statistics"),
                'cover_data': {},  # filled in below
                'hashtags': data.get('text_extra'),
            }

        # Platform-specific part: cover images and the media URLs.
        # If you need custom data processing, modify the helpers used here.
        if platform == 'douyin':
            result_data['cover_data'] = self._video_covers(data)
            api_data = self._douyin_media(data, url_type)
        elif platform == 'tiktok':
            result_data['cover_data'] = self._video_covers(data)
            api_data = self._tiktok_media(data, url_type)
        else:
            # Bilibili exposes a single picture, reused for all three covers.
            pic = data.get("pic")
            result_data['cover_data'] = {
                'cover': pic,
                'origin_cover': pic,
                'dynamic_cover': pic,
            }
            api_data = await self._bilibili_media(data, aweme_id)

        result_data.update(api_data or {})
        return result_data

    @staticmethod
    def _first_url(addr):
        """Return the first entry of ``addr['url_list']`` or None if there is none."""
        url_list = (addr or {}).get('url_list') or [None]
        return url_list[0]

    @staticmethod
    def _video_covers(data):
        """Build the ``cover_data`` dict from the ``video`` entry of a Douyin/TikTok payload."""
        video = data.get("video", {})
        return {
            'cover': video.get("cover"),
            'origin_cover': video.get("origin_cover"),
            'dynamic_cover': video.get("dynamic_cover"),
        }

    @staticmethod
    def _douyin_media(data, url_type):
        """Build ``video_data`` / ``image_data`` for a Douyin post."""
        if url_type == 'video':
            play_addr = data['video']['play_addr']
            uri = play_addr['uri']
            wm_video_url_HQ = play_addr['url_list'][0]
            return {
                'video_data': {
                    # Both generated URLs use the ``ratio`` query key; the
                    # watermarked one previously carried the typo ``radio``.
                    'wm_video_url': f"https://aweme.snssdk.com/aweme/v1/playwm/?video_id={uri}&ratio=1080p&line=0",
                    'wm_video_url_HQ': wm_video_url_HQ,
                    'nwm_video_url': f"https://aweme.snssdk.com/aweme/v1/play/?video_id={uri}&ratio=1080p&line=0",
                    # Swapping 'playwm' for 'play' yields the no-watermark variant.
                    'nwm_video_url_HQ': wm_video_url_HQ.replace('playwm', 'play'),
                }
            }
        if url_type == 'image':
            images = data['images']
            return {
                'image_data': {
                    'no_watermark_image_list': [image['url_list'][0] for image in images],
                    'watermark_image_list': [image['download_url_list'][0] for image in images],
                }
            }
        return {}

    @classmethod
    def _tiktok_media(cls, data, url_type):
        """Build ``video_data`` / ``image_data`` for a TikTok post."""
        if url_type == 'video':
            # The watermarked download address is optional: report None when
            # it is missing, null or has an empty url_list.
            video = data.get('video') or {}
            wm_video = cls._first_url(video.get('download_addr'))
            return {
                'video_data': {
                    'wm_video_url': wm_video,
                    'wm_video_url_HQ': wm_video,
                    'nwm_video_url': data['video']['play_addr']['url_list'][0],
                    'nwm_video_url_HQ': data['video']['bit_rate'][0]['play_addr']['url_list'][0],
                }
            }
        if url_type == 'image':
            images = data['image_post_info']['images']
            return {
                'image_data': {
                    'no_watermark_image_list': [image['display_image']['url_list'][0] for image in images],
                    'watermark_image_list': [image['owner_watermark_image']['url_list'][0] for image in images],
                }
            }
        return {}

    async def _bilibili_media(self, data, bv_id):
        """Build ``video_data`` for a Bilibili video.

        The play URLs are not part of the video payload; they are fetched with
        a second request that needs the video's ``cid``.
        """
        cid = data.get('cid')
        if not cid:
            return {
                'video_data': {
                    'wm_video_url': None,
                    'wm_video_url_HQ': None,
                    'nwm_video_url': None,
                    'nwm_video_url_HQ': None,
                    'error': 'Failed to get cid for video playback',
                }
            }

        # fetch_video_playurl is called with the cid as a string.
        playurl_data = await self.BilibiliWebCrawler.fetch_video_playurl(bv_id, str(cid))
        dash = playurl_data.get('data', {}).get('dash', {})
        video_list = dash.get('video', [])
        audio_list = dash.get('audio', [])
        # The first stream of each list is used.
        # NOTE(review): presumably the API lists the best quality first - confirm.
        video_url = video_list[0].get('baseUrl') if video_list else None
        audio_url = audio_list[0].get('baseUrl') if audio_list else None
        return {
            'video_data': {
                # Bilibili has no watermark concept, so all four share one URL.
                'wm_video_url': video_url,
                'wm_video_url_HQ': video_url,
                'nwm_video_url': video_url,
                'nwm_video_url_HQ': video_url,
                'audio_url': audio_url,  # audio is delivered as a separate stream
                'cid': cid,  # kept for follow-up requests
            }
        }

    async def main(self):
        """Manual smoke test: parse one hard-coded URL and print the minimal result."""
        # Alternative sample URLs:
        # url = "https://v.douyin.com/L4FJNR3/"
        # url = "https://www.tiktok.com/@taylorswift/video/7359655005701311786"
        # url = "https://www.tiktok.com/@minecraft/photo/7369296852669205791"
        url = "https://www.tiktok.com/@flukegk83/video/7360734489271700753"
        minimal = True
        result = await self.hybrid_parsing_single_video(url, minimal=minimal)
        print(result)
if __name__ == '__main__':
    # Script entry point: build the hybrid crawler and drive its built-in
    # smoke test (HybridCrawler.main) to completion on a fresh event loop.
    asyncio.run(HybridCrawler().main())