Enhance download functionality to support Bilibili: add video and audio merging, update the API to handle Bilibili URLs, and improve error handling. Update the documentation to reflect the new features and parameters.

This commit is contained in:
Jagger.H 2025-09-02 00:03:14 +08:00
parent 2db97eaee8
commit 9f72c26ad6
3 changed files with 231 additions and 41 deletions

View file

@ -1,5 +1,7 @@
import os
import zipfile
import subprocess
import tempfile
import aiofiles
import httpx
@ -48,42 +50,102 @@ async def fetch_data_stream(url: str, request:Request , headers: dict = None, fi
await out_file.write(chunk)
return True
@router.get("/download", summary="在线下载抖音|TikTok视频/图片/Online download Douyin|TikTok video/image")
async def merge_bilibili_video_audio(video_url: str, audio_url: str, request: Request, output_path: str, headers: dict) -> bool:
    """
    Download Bilibili's separate video and audio streams and mux them into one MP4.

    Both streams are fetched into temporary files with ``fetch_data_stream`` and
    then combined by the ``ffmpeg`` executable using stream copy (no re-encoding).
    The temporary files are removed on every exit path.

    Args:
        video_url: URL of the video-only stream.
        audio_url: URL of the audio-only stream.
        request: Incoming request, forwarded to ``fetch_data_stream``.
        output_path: Path of the merged MP4; an existing file is overwritten (``-y``).
        headers: HTTP headers sent with both stream downloads.

    Returns:
        True when both downloads succeeded and FFmpeg exited with code 0;
        False on a download failure, an FFmpeg failure, or any unexpected exception.
    """
    import asyncio  # local import: only this helper needs it

    temp_paths = []
    try:
        # Reserve two temp file names; delete=False so FFmpeg can reopen them by path
        for suffix in ('.m4v', '.m4a'):
            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp_file:
                temp_paths.append(temp_file.name)
        video_temp_path, audio_temp_path = temp_paths

        # Download the video stream, then the audio stream
        video_success = await fetch_data_stream(video_url, request, headers=headers, file_path=video_temp_path)
        audio_success = await fetch_data_stream(audio_url, request, headers=headers, file_path=audio_temp_path)

        if not video_success or not audio_success:
            print("Failed to download video or audio stream")
            return False

        # Merge video and audio with FFmpeg
        ffmpeg_cmd = [
            'ffmpeg', '-y',            # -y: overwrite the output file
            '-i', video_temp_path,     # video input
            '-i', audio_temp_path,     # audio input
            '-c:v', 'copy',            # copy the video stream without re-encoding
            '-c:a', 'copy',            # copy the audio stream without re-encoding (keeps original quality)
            '-f', 'mp4',               # force MP4 as the output container
            output_path
        ]

        print(f"FFmpeg command: {' '.join(ffmpeg_cmd)}")
        # Run FFmpeg in a worker thread so the event loop keeps serving other requests.
        # errors='replace' keeps undecodable bytes in FFmpeg's output from raising and
        # turning a successful merge into a reported failure.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            None,
            lambda: subprocess.run(ffmpeg_cmd, capture_output=True, text=True, errors='replace'),
        )
        print(f"FFmpeg return code: {result.returncode}")
        if result.stderr:
            print(f"FFmpeg stderr: {result.stderr}")
        if result.stdout:
            print(f"FFmpeg stdout: {result.stdout}")

        if result.returncode != 0:
            # Drop a partial output so a later "file already exists" check
            # cannot mistake it for a finished download.
            try:
                os.unlink(output_path)
            except OSError:
                pass
            return False
        return True

    except Exception as e:
        print(f"Error merging video and audio: {e}")
        return False
    finally:
        # Best-effort cleanup: remove each temp file independently so one
        # failure does not leave the other behind.
        for temp_path in temp_paths:
            try:
                os.unlink(temp_path)
            except OSError:
                pass
@router.get("/download", summary="在线下载抖音|TikTok|Bilibili视频/图片/Online download Douyin|TikTok|Bilibili video/image")
async def download_file_hybrid(request: Request,
url: str = Query(
example="https://www.douyin.com/video/7372484719365098803",
description="视频或图片的URL地址也支持抖音|TikTok的分享链接例如https://v.douyin.com/e4J8Q7A/"),
description="视频或图片的URL地址支持抖音|TikTok|Bilibili的分享链接例如https://v.douyin.com/e4J8Q7A/ 或 https://www.bilibili.com/video/BV1xxxxxxxxx"),
prefix: bool = True,
with_watermark: bool = False):
"""
# [中文]
### 用途:
- 在线下载抖音|TikTok 无水印或有水印的视频/图片
- 在线下载抖音|TikTok|Bilibili 无水印或有水印的视频/图片
- 通过传入的视频URL参数获取对应的视频或图片数据然后下载到本地
- 如果你在尝试直接访问TikTok单一视频接口的JSON数据中的视频播放地址时遇到HTTP403错误那么你可以使用此接口来下载视频
- Bilibili视频会自动合并视频流和音频流确保下载的视频有声音
- 这个接口会占用一定的服务器资源所以在Demo站点是默认关闭的你可以在本地部署后调用此接口
### 参数:
- url: 视频或图片的URL地址也支持抖音|TikTok的分享链接例如https://v.douyin.com/e4J8Q7A/
- url: 视频或图片的URL地址支持抖音|TikTok|Bilibili的分享链接例如https://v.douyin.com/e4J8Q7A/ https://www.bilibili.com/video/BV1xxxxxxxxx
- prefix: 下载文件的前缀默认为True可以在配置文件中修改
- with_watermark: 是否下载带水印的视频或图片默认为False
- with_watermark: 是否下载带水印的视频或图片默认为False(注意Bilibili没有水印概念)
### 返回:
- 返回下载的视频或图片文件响应
# [English]
### Purpose:
- Download Douyin|TikTok video/image with or without watermark online.
- Download Douyin|TikTok|Bilibili video/image with or without watermark online.
- By passing the video URL parameter, get the corresponding video or image data, and then download it to the local.
- If you encounter an HTTP403 error when trying to access the video playback address in the JSON data of the TikTok single video interface directly, you can use this interface to download the video.
- Bilibili videos will automatically merge video and audio streams to ensure downloaded videos have sound.
- This interface will occupy a certain amount of server resources, so it is disabled by default on the Demo site, you can call this interface after deploying it locally.
### Parameters:
- url: The URL address of the video or image, also supports Douyin|TikTok sharing links, for example: https://v.douyin.com/e4J8Q7A/.
- url: The URL address of the video or image, supports Douyin|TikTok|Bilibili sharing links, for example: https://v.douyin.com/e4J8Q7A/ or https://www.bilibili.com/video/BV1xxxxxxxxx
- prefix: The prefix of the downloaded file, the default is True, and can be modified in the configuration file.
- with_watermark: Whether to download videos or images with watermarks, the default is False.
- with_watermark: Whether to download videos or images with watermarks, the default is False. (Note: Bilibili has no watermark concept)
### Returns:
- Return the response of the downloaded video or image file.
# [示例/Example]
url: https://www.douyin.com/video/7372484719365098803
url: https://www.bilibili.com/video/BV1U5efz2Egn
"""
# 是否开启此端点/Whether to enable this endpoint
if not config["API"]["Download_Switch"]:
@ -103,7 +165,7 @@ async def download_file_hybrid(request: Request,
try:
data_type = data.get('type')
platform = data.get('platform')
aweme_id = data.get('aweme_id')
video_id = data.get('video_id') # 改为使用video_id
file_prefix = config.get("API").get("Download_File_Prefix") if prefix else ''
download_path = os.path.join(config.get("API").get("Download_Path"), f"{platform}_{data_type}")
@ -112,25 +174,48 @@ async def download_file_hybrid(request: Request,
# 下载视频文件/Download video file
if data_type == 'video':
file_name = f"{file_prefix}{platform}_{aweme_id}.mp4" if not with_watermark else f"{file_prefix}{platform}_{aweme_id}_watermark.mp4"
url = data.get('video_data').get('nwm_video_url_HQ') if not with_watermark else data.get('video_data').get(
'wm_video_url_HQ')
file_name = f"{file_prefix}{platform}_{video_id}.mp4" if not with_watermark else f"{file_prefix}{platform}_{video_id}_watermark.mp4"
file_path = os.path.join(download_path, file_name)
# 判断文件是否存在,存在就直接返回
if os.path.exists(file_path):
return FileResponse(path=file_path, media_type='video/mp4', filename=file_name)
# 获取视频文件
__headers = await HybridCrawler.TikTokWebCrawler.get_tiktok_headers() if platform == 'tiktok' else await HybridCrawler.DouyinWebCrawler.get_douyin_headers()
# response = await fetch_data(url, headers=__headers)
# 获取对应平台的headers
if platform == 'tiktok':
__headers = await HybridCrawler.TikTokWebCrawler.get_tiktok_headers()
elif platform == 'bilibili':
__headers = await HybridCrawler.BilibiliWebCrawler.get_bilibili_headers()
else: # douyin
__headers = await HybridCrawler.DouyinWebCrawler.get_douyin_headers()
success = await fetch_data_stream(url, request, headers=__headers, file_path=file_path)
if not success:
raise HTTPException(
status_code=500,
detail="An error occurred while fetching data"
)
# Bilibili 特殊处理:音视频分离
if platform == 'bilibili':
video_data = data.get('video_data', {})
video_url = video_data.get('nwm_video_url_HQ') if not with_watermark else video_data.get('wm_video_url_HQ')
audio_url = video_data.get('audio_url')
if not video_url or not audio_url:
raise HTTPException(
status_code=500,
detail="Failed to get video or audio URL from Bilibili"
)
# 使用专门的函数合并音视频
success = await merge_bilibili_video_audio(video_url, audio_url, request, file_path, __headers.get('headers'))
if not success:
raise HTTPException(
status_code=500,
detail="Failed to merge Bilibili video and audio streams"
)
else:
# 其他平台的常规处理
url = data.get('video_data').get('nwm_video_url_HQ') if not with_watermark else data.get('video_data').get('wm_video_url_HQ')
success = await fetch_data_stream(url, request, headers=__headers, file_path=file_path)
if not success:
raise HTTPException(
status_code=500,
detail="An error occurred while fetching data"
)
# # 保存文件
# async with aiofiles.open(file_path, 'wb') as out_file:
@ -142,7 +227,7 @@ async def download_file_hybrid(request: Request,
# 下载图片文件/Download image file
elif data_type == 'image':
# 压缩文件属性/Compress file properties
zip_file_name = f"{file_prefix}{platform}_{aweme_id}_images.zip" if not with_watermark else f"{file_prefix}{platform}_{aweme_id}_images_watermark.zip"
zip_file_name = f"{file_prefix}{platform}_{video_id}_images.zip" if not with_watermark else f"{file_prefix}{platform}_{video_id}_images_watermark.zip"
zip_file_path = os.path.join(download_path, zip_file_name)
# 判断文件是否存在,存在就直接返回、
@ -159,7 +244,7 @@ async def download_file_hybrid(request: Request,
index = int(urls.index(url))
content_type = response.headers.get('content-type')
file_format = content_type.split('/')[1]
file_name = f"{file_prefix}{platform}_{aweme_id}_{index + 1}.{file_format}" if not with_watermark else f"{file_prefix}{platform}_{aweme_id}_{index + 1}_watermark.{file_format}"
file_name = f"{file_prefix}{platform}_{video_id}_{index + 1}.{file_format}" if not with_watermark else f"{file_prefix}{platform}_{video_id}_{index + 1}_watermark.{file_format}"
file_path = os.path.join(download_path, file_name)
image_file_list.append(file_path)

View file

@ -98,8 +98,16 @@ async def update_cookie_api(request: Request,
return ResponseModel(code=200,
router=request.url.path,
data={"message": f"Cookie for {service} will be updated (not implemented yet)"})
elif service == "bilibili":
# 这里可以添加Bilibili的cookie更新逻辑
# from crawlers.bilibili.web.web_crawler import BilibiliWebCrawler
# bilibili_crawler = BilibiliWebCrawler()
# await bilibili_crawler.update_cookie(cookie)
return ResponseModel(code=200,
router=request.url.path,
data={"message": f"Cookie for {service} will be updated (not implemented yet)"})
else:
raise ValueError(f"Service '{service}' is not supported. Supported services: douyin, tiktok")
raise ValueError(f"Service '{service}' is not supported. Supported services: douyin, tiktok, bilibili")
except Exception as e:
status_code = 400
detail = ErrorResponseModel(code=status_code,

View file

@ -32,10 +32,13 @@
# ==============================================================================
import asyncio
import re
import httpx
from crawlers.douyin.web.web_crawler import DouyinWebCrawler # 导入抖音Web爬虫
from crawlers.tiktok.web.web_crawler import TikTokWebCrawler # 导入TikTok Web爬虫
from crawlers.tiktok.app.app_crawler import TikTokAPPCrawler # 导入TikTok App爬虫
from crawlers.bilibili.web.web_crawler import BilibiliWebCrawler # 导入Bilibili Web爬虫
class HybridCrawler:
@ -43,6 +46,25 @@ class HybridCrawler:
self.DouyinWebCrawler = DouyinWebCrawler()
self.TikTokWebCrawler = TikTokWebCrawler()
self.TikTokAPPCrawler = TikTokAPPCrawler()
self.BilibiliWebCrawler = BilibiliWebCrawler()
async def get_bilibili_bv_id(self, url: str) -> str:
    """
    Extract the BV ID from a Bilibili URL, resolving b23.tv short links first.

    Raises ValueError when no BV ID can be found in the (resolved) URL.
    """
    # A b23.tv short link has to be followed through its redirects to get the real URL
    if "b23.tv" in url:
        async with httpx.AsyncClient() as http_client:
            head_response = await http_client.head(url, follow_redirects=True)
            url = str(head_response.url)

    # Pull the BV ID out of the URL path
    found = re.search(r'(?:video\/|\/)(BV[A-Za-z0-9]+)', url)
    if found is None:
        raise ValueError(f"Cannot extract BV ID from URL: {url}")
    return found.group(1)
async def hybrid_parsing_single_video(self, url: str, minimal: bool = False):
# 解析抖音视频/Parse Douyin video
@ -65,6 +87,14 @@ class HybridCrawler:
data = await self.TikTokAPPCrawler.fetch_one_video(aweme_id)
# $.imagePost exists if aweme_type is photo
aweme_type = data.get("aweme_type")
# 解析Bilibili视频/Parse Bilibili video
elif "bilibili" in url or "b23.tv" in url:
platform = "bilibili"
aweme_id = await self.get_bilibili_bv_id(url) # BV号作为统一的video_id
response = await self.BilibiliWebCrawler.fetch_one_video(aweme_id)
data = response.get('data', {}) # 提取data部分
# Bilibili只有视频类型aweme_type设为0(video)
aweme_type = 0
else:
raise ValueError("hybrid_parsing_single_video: Cannot judge the video source from the URL.")
@ -103,27 +133,44 @@ class HybridCrawler:
and then use the .update() method to update the data
"""
result_data = {
'type': url_type,
'platform': platform,
'aweme_id': aweme_id,
'desc': data.get("desc"),
'create_time': data.get("create_time"),
'author': data.get("author"),
'music': data.get("music"),
'statistics': data.get("statistics"),
'cover_data': {
'cover': data.get("video").get("cover"),
'origin_cover': data.get("video").get("origin_cover"),
'dynamic_cover': data.get("video").get("dynamic_cover")
},
'hashtags': data.get('text_extra'),
}
# 根据平台适配字段映射
if platform == 'bilibili':
result_data = {
'type': url_type,
'platform': platform,
'video_id': aweme_id,
'desc': data.get("title"), # Bilibili使用title
'create_time': data.get("pubdate"), # Bilibili使用pubdate
'author': data.get("owner"), # Bilibili使用owner
'music': None, # Bilibili没有音乐信息
'statistics': data.get("stat"), # Bilibili使用stat
'cover_data': {}, # 将在各平台处理中填充
'hashtags': None, # Bilibili没有hashtags概念
}
else:
result_data = {
'type': url_type,
'platform': platform,
'video_id': aweme_id, # 统一使用video_id字段内容可能是aweme_id或bv_id
'desc': data.get("desc"),
'create_time': data.get("create_time"),
'author': data.get("author"),
'music': data.get("music"),
'statistics': data.get("statistics"),
'cover_data': {}, # 将在各平台处理中填充
'hashtags': data.get('text_extra'),
}
# 创建一个空变量,稍后使用.update()方法更新数据/Create an empty variable and use the .update() method to update the data
api_data = None
# 判断链接类型并处理数据/Judge link type and process data
# 抖音数据处理/Douyin data processing
if platform == 'douyin':
# 填充封面数据
result_data['cover_data'] = {
'cover': data.get("video", {}).get("cover"),
'origin_cover': data.get("video", {}).get("origin_cover"),
'dynamic_cover': data.get("video", {}).get("dynamic_cover")
}
# 抖音视频数据处理/Douyin video data processing
if url_type == 'video':
# 将信息储存在字典中/Store information in a dictionary
@ -160,6 +207,12 @@ class HybridCrawler:
}
# TikTok数据处理/TikTok data processing
elif platform == 'tiktok':
# 填充封面数据
result_data['cover_data'] = {
'cover': data.get("video", {}).get("cover"),
'origin_cover': data.get("video", {}).get("origin_cover"),
'dynamic_cover': data.get("video", {}).get("dynamic_cover")
}
# TikTok视频数据处理/TikTok video data processing
if url_type == 'video':
# 将信息储存在字典中/Store information in a dictionary
@ -198,6 +251,50 @@ class HybridCrawler:
'watermark_image_list': watermark_image_list
}
}
# Bilibili数据处理/Bilibili data processing
elif platform == 'bilibili':
# 填充封面数据
result_data['cover_data'] = {
'cover': data.get("pic"), # Bilibili使用pic作为封面
'origin_cover': data.get("pic"),
'dynamic_cover': data.get("pic")
}
# Bilibili只有视频直接处理视频数据
if url_type == 'video':
# 获取视频播放地址需要额外调用API
cid = data.get('cid') # 获取cid
if cid:
# 获取播放链接cid需要转换为字符串
playurl_data = await self.BilibiliWebCrawler.fetch_video_playurl(aweme_id, str(cid))
# 从播放数据中提取URL
dash = playurl_data.get('data', {}).get('dash', {})
video_list = dash.get('video', [])
audio_list = dash.get('audio', [])
# 选择最高质量的视频流
video_url = video_list[0].get('baseUrl') if video_list else None
audio_url = audio_list[0].get('baseUrl') if audio_list else None
api_data = {
'video_data': {
'wm_video_url': video_url,
'wm_video_url_HQ': video_url,
'nwm_video_url': video_url, # Bilibili没有水印概念
'nwm_video_url_HQ': video_url,
'audio_url': audio_url, # Bilibili音视频分离
'cid': cid, # 保存cid供后续使用
}
}
else:
api_data = {
'video_data': {
'wm_video_url': None,
'wm_video_url_HQ': None,
'nwm_video_url': None,
'nwm_video_url_HQ': None,
'error': 'Failed to get cid for video playback'
}
}
# 更新数据/Update data
result_data.update(api_data)
return result_data