# ==============================================================================
# Copyright (C) 2021 Evil0ctal
#
# This file is part of the Douyin_TikTok_Download_API project.
#
# This project is licensed under the Apache License 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#         __
#        />  フ
#       |  _  _ l
#       /` ミ_xノ
#      /      | Feed me Stars ⭐ ️
#     /  ヽ   ノ
#     │  | | |
#  / ̄|   | | |
#  | ( ̄ヽ__ヽ_)__)
#  \二つ
# ==============================================================================
#
# Contributor Link:
# - https://github.com/Evil0ctal
#
# ==============================================================================

import asyncio
import re

import httpx

from crawlers.douyin.web.web_crawler import DouyinWebCrawler  # Douyin web crawler
from crawlers.tiktok.web.web_crawler import TikTokWebCrawler  # TikTok web crawler
from crawlers.tiktok.app.app_crawler import TikTokAPPCrawler  # TikTok app crawler
from crawlers.bilibili.web.web_crawler import BilibiliWebCrawler  # Bilibili web crawler


class HybridCrawler:
    """Dispatch a share URL to the Douyin, TikTok or Bilibili crawler.

    The platform is chosen by substring match on the URL. The result is either
    the raw payload returned by the platform crawler or, when ``minimal`` is
    requested, a normalized dictionary with a common set of keys.
    """

    # aweme_type code -> media kind. Codes not listed here are treated as
    # 'video' by the lookup in hybrid_parsing_single_video.
    _URL_TYPE_BY_AWEME_TYPE = {
        # common
        0: 'video',
        # Douyin
        2: 'image',
        4: 'video',
        68: 'image',
        # TikTok
        51: 'video',
        55: 'video',
        58: 'video',
        61: 'video',
        150: 'image',
    }

    def __init__(self):
        self.DouyinWebCrawler = DouyinWebCrawler()
        self.TikTokWebCrawler = TikTokWebCrawler()
        self.TikTokAPPCrawler = TikTokAPPCrawler()
        self.BilibiliWebCrawler = BilibiliWebCrawler()

    @staticmethod
    def _first_or_none(items):
        """Return the first element of *items*, or None if it is empty/None."""
        return items[0] if items else None

    @staticmethod
    def _video_cover_data(data: dict) -> dict:
        """Build the cover dictionary from the ``video`` entry of *data*.

        Shared by the Douyin and TikTok branches, which read the same keys.
        """
        video = data.get("video", {})
        return {
            'cover': video.get("cover"),
            'origin_cover': video.get("origin_cover"),
            'dynamic_cover': video.get("dynamic_cover"),
        }

    async def get_bilibili_bv_id(self, url: str) -> str:
        """Extract the BV id from a Bilibili URL, resolving b23.tv short links.

        Args:
            url: A Bilibili video URL or a ``b23.tv`` short link.

        Returns:
            The BV id (e.g. ``BV...``) found in the (possibly redirected) URL.

        Raises:
            ValueError: If no BV id can be found in the URL.
        """
        # Short links carry no BV id; follow the redirect to get the real URL.
        if "b23.tv" in url:
            async with httpx.AsyncClient() as client:
                response = await client.head(url, follow_redirects=True)
                url = str(response.url)

        # Extract the BV id from the URL path.
        bv_pattern = r'(?:video\/|\/)(BV[A-Za-z0-9]+)'
        match = re.search(bv_pattern, url)
        if match:
            return match.group(1)
        raise ValueError(f"Cannot extract BV ID from URL: {url}")

    async def hybrid_parsing_single_video(self, url: str, minimal: bool = False):
        """Parse a single Douyin / TikTok / Bilibili post from its URL.

        Args:
            url: Share URL. The platform is detected by the substrings
                ``douyin``, ``tiktok``, ``bilibili`` / ``b23.tv`` (in that order).
            minimal: When False (default) the platform payload is returned
                as-is. When True a normalized dictionary is returned with the
                keys ``type``, ``platform``, ``video_id``, ``desc``,
                ``create_time``, ``author``, ``music``, ``statistics``,
                ``cover_data``, ``hashtags`` plus ``video_data`` or
                ``image_data``.

        Returns:
            The raw platform data, or the normalized dictionary described above.

        Raises:
            ValueError: If the platform cannot be determined from the URL, or
                (Bilibili) no BV id can be extracted.
        """
        # Parse Douyin video
        if "douyin" in url:
            platform = "douyin"
            aweme_id = await self.DouyinWebCrawler.get_aweme_id(url)
            data = await self.DouyinWebCrawler.fetch_one_video(aweme_id)
            data = data.get("aweme_detail")
            # $.aweme_detail.aweme_type
            aweme_type = data.get("aweme_type")
        # Parse TikTok video
        elif "tiktok" in url:
            platform = "tiktok"
            aweme_id = await self.TikTokWebCrawler.get_aweme_id(url)
            # 2024-09-14: the video payload is fetched with TikTokAPPCrawler
            # instead of TikTokWebCrawler (only the id lookup uses the web crawler).
            data = await self.TikTokAPPCrawler.fetch_one_video(aweme_id)
            aweme_type = data.get("aweme_type")
        # Parse Bilibili video
        elif "bilibili" in url or "b23.tv" in url:
            platform = "bilibili"
            # The BV id is used as the unified video_id.
            aweme_id = await self.get_bilibili_bv_id(url)
            response = await self.BilibiliWebCrawler.fetch_one_video(aweme_id)
            data = response.get('data', {})
            # Bilibili is handled as video only, so use the common video code 0.
            aweme_type = 0
        else:
            raise ValueError("hybrid_parsing_single_video: Cannot judge the video source from the URL.")

        # Full data requested: return the platform payload untouched.
        if not minimal:
            return data

        # Judge link type; unknown codes fall back to 'video'.
        url_type = self._URL_TYPE_BY_AWEME_TYPE.get(aweme_type, 'video')

        # The code below normalizes (video || image) data per platform. If you
        # need custom data processing, modify it here.
        #
        # First build the dictionary of fields common to every result; the
        # media-specific part is merged in afterwards with .update().
        if platform == 'bilibili':
            # Bilibili uses different field names than Douyin/TikTok.
            result_data = {
                'type': url_type,
                'platform': platform,
                'video_id': aweme_id,
                'desc': data.get("title"),  # Bilibili uses title
                'create_time': data.get("pubdate"),  # Bilibili uses pubdate
                'author': data.get("owner"),  # Bilibili uses owner
                'music': None,  # no music info is read for Bilibili
                'statistics': data.get("stat"),  # Bilibili uses stat
                'cover_data': {},  # filled in by the platform branch below
                'hashtags': None,  # no hashtags are read for Bilibili
            }
        else:
            result_data = {
                'type': url_type,
                'platform': platform,
                'video_id': aweme_id,  # unified field: aweme_id or bv_id
                'desc': data.get("desc"),
                'create_time': data.get("create_time"),
                'author': data.get("author"),
                'music': data.get("music"),
                'statistics': data.get("statistics"),
                'cover_data': {},  # filled in by the platform branch below
                'hashtags': data.get('text_extra'),
            }

        # Media-specific part; stays empty (a no-op for .update()) if no
        # branch below produces anything.
        api_data = {}

        # Douyin data processing
        if platform == 'douyin':
            result_data['cover_data'] = self._video_cover_data(data)
            # Douyin video data processing
            if url_type == 'video':
                play_addr = data['video']['play_addr']
                uri = play_addr['uri']
                wm_video_url_HQ = play_addr['url_list'][0]
                # Fixed: the query parameter is "ratio" (was misspelled "radio"),
                # matching the no-watermark URL below.
                wm_video_url = f"https://aweme.snssdk.com/aweme/v1/playwm/?video_id={uri}&ratio=1080p&line=0"
                nwm_video_url_HQ = wm_video_url_HQ.replace('playwm', 'play')
                nwm_video_url = f"https://aweme.snssdk.com/aweme/v1/play/?video_id={uri}&ratio=1080p&line=0"
                api_data = {
                    'video_data': {
                        'wm_video_url': wm_video_url,
                        'wm_video_url_HQ': wm_video_url_HQ,
                        'nwm_video_url': nwm_video_url,
                        'nwm_video_url_HQ': nwm_video_url_HQ,
                    }
                }
            # Douyin image data processing
            elif url_type == 'image':
                images = data['images']
                api_data = {
                    'image_data': {
                        'no_watermark_image_list': [img['url_list'][0] for img in images],
                        'watermark_image_list': [img['download_url_list'][0] for img in images],
                    }
                }
        # TikTok data processing
        elif platform == 'tiktok':
            result_data['cover_data'] = self._video_cover_data(data)
            # TikTok video data processing
            if url_type == 'video':
                # The watermarked download address is optional: tolerate a
                # missing/None entry or an empty url_list instead of raising.
                download_addr = (data.get('video') or {}).get('download_addr') or {}
                wm_video = self._first_or_none(download_addr.get('url_list'))
                api_data = {
                    'video_data': {
                        'wm_video_url': wm_video,
                        'wm_video_url_HQ': wm_video,
                        'nwm_video_url': data['video']['play_addr']['url_list'][0],
                        'nwm_video_url_HQ': data['video']['bit_rate'][0]['play_addr']['url_list'][0],
                    }
                }
            # TikTok image data processing
            elif url_type == 'image':
                images = data['image_post_info']['images']
                api_data = {
                    'image_data': {
                        'no_watermark_image_list': [
                            img['display_image']['url_list'][0] for img in images
                        ],
                        'watermark_image_list': [
                            img['owner_watermark_image']['url_list'][0] for img in images
                        ],
                    }
                }
        # Bilibili data processing
        elif platform == 'bilibili':
            # Bilibili exposes a single cover image under "pic".
            pic = data.get("pic")
            result_data['cover_data'] = {
                'cover': pic,
                'origin_cover': pic,
                'dynamic_cover': pic,
            }
            # Bilibili is video only.
            if url_type == 'video':
                # The play URL requires an extra API call keyed by cid.
                cid = data.get('cid')
                if cid:
                    # fetch_video_playurl is called with cid as a string.
                    playurl_data = await self.BilibiliWebCrawler.fetch_video_playurl(aweme_id, str(cid))
                    dash = playurl_data.get('data', {}).get('dash', {})
                    video_list = dash.get('video', [])
                    audio_list = dash.get('audio', [])
                    # Take the first stream of each list
                    # (presumably the highest quality — verify against the API).
                    video_url = video_list[0].get('baseUrl') if video_list else None
                    audio_url = audio_list[0].get('baseUrl') if audio_list else None
                    api_data = {
                        'video_data': {
                            'wm_video_url': video_url,
                            'wm_video_url_HQ': video_url,
                            'nwm_video_url': video_url,  # same URL: no watermark variant
                            'nwm_video_url_HQ': video_url,
                            'audio_url': audio_url,  # audio is a separate stream
                            'cid': cid,  # kept for later use
                        }
                    }
                else:
                    api_data = {
                        'video_data': {
                            'wm_video_url': None,
                            'wm_video_url_HQ': None,
                            'nwm_video_url': None,
                            'nwm_video_url_HQ': None,
                            'error': 'Failed to get cid for video playback',
                        }
                    }

        # Merge the media-specific part into the common fields.
        result_data.update(api_data)
        return result_data

    async def main(self):
        """Manual smoke test: parse one sample URL and print the result."""
        # Test hybrid parsing single video endpoint
        # url = "https://v.douyin.com/L4FJNR3/"
        # url = "https://www.tiktok.com/@taylorswift/video/7359655005701311786"
        url = "https://www.tiktok.com/@flukegk83/video/7360734489271700753"
        # url = "https://www.tiktok.com/@minecraft/photo/7369296852669205791"
        minimal = True
        result = await self.hybrid_parsing_single_video(url, minimal=minimal)
        print(result)


if __name__ == '__main__':
    # Instantiate hybrid crawler
    hybrid_crawler = HybridCrawler()
    # Run test code
    asyncio.run(hybrid_crawler.main())