✨ Enhance download functionality to support Bilibili: add video and audio merging, update the API to handle Bilibili URLs, and improve error handling. Update documentation to reflect the new features and parameters.

2025-09-02 00:03:14 +08:00 · 2025-09-02 00:03:14 +08:00 · 9f72c26ad6
commit 9f72c26ad6
parent 2db97eaee8
3 changed files with 231 additions and 41 deletions
--- a/app/api/endpoints/download.py
+++ b/app/api/endpoints/download.py
@ -1,5 +1,7 @@
 import os
 import zipfile
+import subprocess
+import tempfile

 import aiofiles
 import httpx
@ -48,42 +50,102 @@ async def fetch_data_stream(url: str, request:Request , headers: dict = None, fi
                    await out_file.write(chunk)
            return True

-@router.get("/download", summary="在线下载抖音|TikTok视频/图片/Online download Douyin|TikTok video/image")
+async def merge_bilibili_video_audio(video_url: str, audio_url: str, request: Request, output_path: str, headers: dict) -> bool:
+    """
+    Download a Bilibili video stream and audio stream and mux them into one MP4.
+
+    Both streams are fetched into temporary files with ``fetch_data_stream`` and
+    then combined by FFmpeg using stream copy (no re-encoding). FFmpeg writes to
+    ``<output_path>.part``; that file is moved onto ``output_path`` only after
+    FFmpeg exits with code 0, so a failed merge never leaves a truncated file at
+    the final path.
+
+    Args:
+        video_url: URL of the video-only stream.
+        audio_url: URL of the audio-only stream.
+        request: Incoming request object, forwarded to ``fetch_data_stream``.
+        output_path: Destination path of the merged MP4 file.
+        headers: HTTP headers forwarded to ``fetch_data_stream`` for both downloads.
+
+    Returns:
+        True when both downloads succeeded and FFmpeg exited with code 0;
+        False otherwise. Errors are printed and reported as False, not raised.
+    """
+    # Local imports: only the top-of-file imports shown for this module are known
+    # to exist, and these two names are not among them.
+    import asyncio
+    import contextlib
+
+    video_temp_path = None
+    audio_temp_path = None
+    partial_path = f"{output_path}.part"
+    try:
+        # Reserve two temp file names; delete=False keeps them on disk after close.
+        with tempfile.NamedTemporaryFile(suffix='.m4v', delete=False) as video_temp:
+            video_temp_path = video_temp.name
+        with tempfile.NamedTemporaryFile(suffix='.m4a', delete=False) as audio_temp:
+            audio_temp_path = audio_temp.name
+
+        # Download the video stream first; skip the audio download if it failed.
+        video_success = await fetch_data_stream(video_url, request, headers=headers, file_path=video_temp_path)
+        audio_success = video_success and await fetch_data_stream(audio_url, request, headers=headers, file_path=audio_temp_path)
+
+        if not video_success or not audio_success:
+            print("Failed to download video or audio stream")
+            return False
+
+        # Mux with FFmpeg. '-f mp4' is required here because the '.part' suffix
+        # gives FFmpeg no file extension to infer the container format from.
+        ffmpeg_cmd = [
+            'ffmpeg', '-y',  # -y: overwrite the output file if it exists
+            '-i', video_temp_path,  # video input
+            '-i', audio_temp_path,  # audio input
+            '-c:v', 'copy',  # copy the video stream without re-encoding
+            '-c:a', 'copy',  # copy the audio stream without re-encoding
+            '-f', 'mp4',     # force MP4 container
+            partial_path
+        ]
+
+        print(f"FFmpeg command: {' '.join(ffmpeg_cmd)}")
+        # Run FFmpeg in a worker thread so the event loop is not blocked while it
+        # works. errors='replace' keeps undecodable output bytes from raising.
+        loop = asyncio.get_running_loop()
+        result = await loop.run_in_executor(
+            None,
+            lambda: subprocess.run(ffmpeg_cmd, capture_output=True, text=True, errors='replace'),
+        )
+        print(f"FFmpeg return code: {result.returncode}")
+        if result.stderr:
+            print(f"FFmpeg stderr: {result.stderr}")
+        if result.stdout:
+            print(f"FFmpeg stdout: {result.stdout}")
+
+        if result.returncode != 0:
+            return False
+
+        # Publish the finished file under its final name in one step.
+        os.replace(partial_path, output_path)
+        return True
+
+    except Exception as e:
+        print(f"Error merging video and audio: {e}")
+        return False
+
+    finally:
+        # Best-effort cleanup on every exit path. Each file is removed on its own
+        # so one failure cannot skip the others; after a successful os.replace the
+        # '.part' file no longer exists and the resulting error is suppressed.
+        for leftover in (video_temp_path, audio_temp_path, partial_path):
+            if leftover:
+                with contextlib.suppress(OSError):
+                    os.unlink(leftover)
+
+@router.get("/download", summary="在线下载抖音|TikTok|Bilibili视频/图片/Online download Douyin|TikTok|Bilibili video/image")
 async def download_file_hybrid(request: Request,
                               url: str = Query(
                                   example="https://www.douyin.com/video/7372484719365098803",
-                                   description="视频或图片的URL地址，也支持抖音|TikTok的分享链接，例如：https://v.douyin.com/e4J8Q7A/"),
+                                   description="视频或图片的URL地址，支持抖音|TikTok|Bilibili的分享链接，例如：https://v.douyin.com/e4J8Q7A/ 或 https://www.bilibili.com/video/BV1xxxxxxxxx"),
                               prefix: bool = True,
                               with_watermark: bool = False):
    """
    # [中文]
    ### 用途:
-    - 在线下载抖音|TikTok 无水印或有水印的视频/图片
+    - 在线下载抖音|TikTok|Bilibili 无水印或有水印的视频/图片
    - 通过传入的视频URL参数，获取对应的视频或图片数据，然后下载到本地。
    - 如果你在尝试直接访问TikTok单一视频接口的JSON数据中的视频播放地址时遇到HTTP403错误，那么你可以使用此接口来下载视频。
+    - Bilibili视频会自动合并视频流和音频流，确保下载的视频有声音。
    - 这个接口会占用一定的服务器资源，所以在Demo站点是默认关闭的，你可以在本地部署后调用此接口。
    ### 参数:
-    - url: 视频或图片的URL地址，也支持抖音|TikTok的分享链接，例如：https://v.douyin.com/e4J8Q7A/。
+    - url: 视频或图片的URL地址，支持抖音|TikTok|Bilibili的分享链接，例如：https://v.douyin.com/e4J8Q7A/ 或 https://www.bilibili.com/video/BV1xxxxxxxxx
    - prefix: 下载文件的前缀，默认为True，可以在配置文件中修改。
-    - with_watermark: 是否下载带水印的视频或图片，默认为False。
+    - with_watermark: 是否下载带水印的视频或图片，默认为False。(注意：Bilibili没有水印概念)
    ### 返回:
    - 返回下载的视频或图片文件响应。

    # [English]
    ### Purpose:
-    - Download Douyin|TikTok video/image with or without watermark online.
+    - Download Douyin|TikTok|Bilibili video/image with or without watermark online.
    - By passing the video URL parameter, get the corresponding video or image data, and then download it to the local.
    - If you encounter an HTTP403 error when trying to access the video playback address in the JSON data of the TikTok single video interface directly, you can use this interface to download the video.
+    - Bilibili videos will automatically merge video and audio streams to ensure downloaded videos have sound.
    - This interface will occupy a certain amount of server resources, so it is disabled by default on the Demo site, you can call this interface after deploying it locally.
    ### Parameters:
-    - url: The URL address of the video or image, also supports Douyin|TikTok sharing links, for example: https://v.douyin.com/e4J8Q7A/.
+    - url: The URL address of the video or image, supports Douyin|TikTok|Bilibili sharing links, for example: https://v.douyin.com/e4J8Q7A/ or https://www.bilibili.com/video/BV1xxxxxxxxx
    - prefix: The prefix of the downloaded file, the default is True, and can be modified in the configuration file.
-    - with_watermark: Whether to download videos or images with watermarks, the default is False.
+    - with_watermark: Whether to download videos or images with watermarks, the default is False. (Note: Bilibili has no watermark concept)
    ### Returns:
    - Return the response of the downloaded video or image file.

    # [示例/Example]
-    url: https://www.douyin.com/video/7372484719365098803
+    url: https://www.bilibili.com/video/BV1U5efz2Egn
    """
    # 是否开启此端点/Whether to enable this endpoint
    if not config["API"]["Download_Switch"]:
@ -103,7 +165,7 @@ async def download_file_hybrid(request: Request,
    try:
        data_type = data.get('type')
        platform = data.get('platform')
-        aweme_id = data.get('aweme_id')
+        video_id = data.get('video_id')  # 改为使用video_id
        file_prefix = config.get("API").get("Download_File_Prefix") if prefix else ''
        download_path = os.path.join(config.get("API").get("Download_Path"), f"{platform}_{data_type}")

@ -112,25 +174,48 @@ async def download_file_hybrid(request: Request,

        # 下载视频文件/Download video file
        if data_type == 'video':
-            file_name = f"{file_prefix}{platform}_{aweme_id}.mp4" if not with_watermark else f"{file_prefix}{platform}_{aweme_id}_watermark.mp4"
-            url = data.get('video_data').get('nwm_video_url_HQ') if not with_watermark else data.get('video_data').get(
-                'wm_video_url_HQ')
+            file_name = f"{file_prefix}{platform}_{video_id}.mp4" if not with_watermark else f"{file_prefix}{platform}_{video_id}_watermark.mp4"
            file_path = os.path.join(download_path, file_name)

            # 判断文件是否存在，存在就直接返回
            if os.path.exists(file_path):
                return FileResponse(path=file_path, media_type='video/mp4', filename=file_name)

-            # 获取视频文件
-            __headers = await HybridCrawler.TikTokWebCrawler.get_tiktok_headers() if platform == 'tiktok' else await HybridCrawler.DouyinWebCrawler.get_douyin_headers()
-            # response = await fetch_data(url, headers=__headers)
+            # 获取对应平台的headers
+            if platform == 'tiktok':
+                __headers = await HybridCrawler.TikTokWebCrawler.get_tiktok_headers()
+            elif platform == 'bilibili':
+                __headers = await HybridCrawler.BilibiliWebCrawler.get_bilibili_headers()
+            else:  # douyin
+                __headers = await HybridCrawler.DouyinWebCrawler.get_douyin_headers()

-            success = await fetch_data_stream(url, request, headers=__headers, file_path=file_path)
-            if not success:
-                raise HTTPException(
-                    status_code=500,
-                    detail="An error occurred while fetching data"
-                )
+            # Bilibili 特殊处理：音视频分离
+            if platform == 'bilibili':
+                video_data = data.get('video_data', {})
+                video_url = video_data.get('nwm_video_url_HQ') if not with_watermark else video_data.get('wm_video_url_HQ')
+                audio_url = video_data.get('audio_url')
+                if not video_url or not audio_url:
+                    raise HTTPException(
+                        status_code=500,
+                        detail="Failed to get video or audio URL from Bilibili"
+                    )
+                
+                # 使用专门的函数合并音视频
+                success = await merge_bilibili_video_audio(video_url, audio_url, request, file_path, __headers.get('headers'))
+                if not success:
+                    raise HTTPException(
+                        status_code=500,
+                        detail="Failed to merge Bilibili video and audio streams"
+                    )
+            else:
+                # 其他平台的常规处理
+                url = data.get('video_data').get('nwm_video_url_HQ') if not with_watermark else data.get('video_data').get('wm_video_url_HQ')
+                success = await fetch_data_stream(url, request, headers=__headers, file_path=file_path)
+                if not success:
+                    raise HTTPException(
+                        status_code=500,
+                        detail="An error occurred while fetching data"
+                    )

            # # 保存文件
            # async with aiofiles.open(file_path, 'wb') as out_file:
@ -142,7 +227,7 @@ async def download_file_hybrid(request: Request,
        # 下载图片文件/Download image file
        elif data_type == 'image':
            # 压缩文件属性/Compress file properties
-            zip_file_name = f"{file_prefix}{platform}_{aweme_id}_images.zip" if not with_watermark else f"{file_prefix}{platform}_{aweme_id}_images_watermark.zip"
+            zip_file_name = f"{file_prefix}{platform}_{video_id}_images.zip" if not with_watermark else f"{file_prefix}{platform}_{video_id}_images_watermark.zip"
            zip_file_path = os.path.join(download_path, zip_file_name)

            # 判断文件是否存在，存在就直接返回、
@ -159,7 +244,7 @@ async def download_file_hybrid(request: Request,
                index = int(urls.index(url))
                content_type = response.headers.get('content-type')
                file_format = content_type.split('/')[1]
-                file_name = f"{file_prefix}{platform}_{aweme_id}_{index + 1}.{file_format}" if not with_watermark else f"{file_prefix}{platform}_{aweme_id}_{index + 1}_watermark.{file_format}"
+                file_name = f"{file_prefix}{platform}_{video_id}_{index + 1}.{file_format}" if not with_watermark else f"{file_prefix}{platform}_{video_id}_{index + 1}_watermark.{file_format}"
                file_path = os.path.join(download_path, file_name)
                image_file_list.append(file_path)

--- a/app/api/endpoints/hybrid_parsing.py
+++ b/app/api/endpoints/hybrid_parsing.py
@ -98,8 +98,16 @@ async def update_cookie_api(request: Request,
            return ResponseModel(code=200,
                                router=request.url.path,
                                data={"message": f"Cookie for {service} will be updated (not implemented yet)"})
+        elif service == "bilibili":
+            # 这里可以添加Bilibili的cookie更新逻辑
+            # from crawlers.bilibili.web.web_crawler import BilibiliWebCrawler
+            # bilibili_crawler = BilibiliWebCrawler()
+            # await bilibili_crawler.update_cookie(cookie)
+            return ResponseModel(code=200,
+                                router=request.url.path,
+                                data={"message": f"Cookie for {service} will be updated (not implemented yet)"})
        else:
-            raise ValueError(f"Service '{service}' is not supported. Supported services: douyin, tiktok")
+            raise ValueError(f"Service '{service}' is not supported. Supported services: douyin, tiktok, bilibili")
    except Exception as e:
        status_code = 400
        detail = ErrorResponseModel(code=status_code,
--- a/crawlers/hybrid/hybrid_crawler.py
+++ b/crawlers/hybrid/hybrid_crawler.py
@ -32,10 +32,13 @@
 # ==============================================================================

 import asyncio
+import re
+import httpx

 from crawlers.douyin.web.web_crawler import DouyinWebCrawler  # 导入抖音Web爬虫
 from crawlers.tiktok.web.web_crawler import TikTokWebCrawler  # 导入TikTok Web爬虫
 from crawlers.tiktok.app.app_crawler import TikTokAPPCrawler  # 导入TikTok App爬虫
+from crawlers.bilibili.web.web_crawler import BilibiliWebCrawler  # 导入Bilibili Web爬虫


 class HybridCrawler:
@ -43,6 +46,25 @@ class HybridCrawler:
        self.DouyinWebCrawler = DouyinWebCrawler()
        self.TikTokWebCrawler = TikTokWebCrawler()
        self.TikTokAPPCrawler = TikTokAPPCrawler()
+        self.BilibiliWebCrawler = BilibiliWebCrawler()
+
+    async def get_bilibili_bv_id(self, url: str) -> str:
+        """
+        Extract the BV id from a Bilibili URL.
+
+        When the URL contains "b23.tv" it is treated as a short link: a HEAD
+        request is sent with redirects followed, and the final URL of that
+        response is searched instead of the one passed in.
+
+        Args:
+            url: A Bilibili video URL or a b23.tv short link.
+
+        Returns:
+            The BV id, i.e. "BV" followed by the run of letters and digits after it.
+
+        Raises:
+            ValueError: If no BV id can be found in the (resolved) URL.
+        """
+        resolved_url = url
+        if "b23.tv" in resolved_url:
+            # Short link: let the HTTP client follow the redirects and keep the final URL.
+            async with httpx.AsyncClient() as http_client:
+                head_response = await http_client.head(resolved_url, follow_redirects=True)
+            resolved_url = str(head_response.url)
+
+        # The BV id is captured from the first path segment that starts with "BV".
+        found = re.search(r'(?:video\/|\/)(BV[A-Za-z0-9]+)', resolved_url)
+        if found is None:
+            raise ValueError(f"Cannot extract BV ID from URL: {resolved_url}")
+        return found.group(1)

    async def hybrid_parsing_single_video(self, url: str, minimal: bool = False):
        # 解析抖音视频/Parse Douyin video
@ -65,6 +87,14 @@ class HybridCrawler:
            data = await self.TikTokAPPCrawler.fetch_one_video(aweme_id)
            # $.imagePost exists if aweme_type is photo
            aweme_type = data.get("aweme_type")
+        # 解析Bilibili视频/Parse Bilibili video
+        elif "bilibili" in url or "b23.tv" in url:
+            platform = "bilibili"
+            aweme_id = await self.get_bilibili_bv_id(url)  # BV号作为统一的video_id
+            response = await self.BilibiliWebCrawler.fetch_one_video(aweme_id)
+            data = response.get('data', {})  # 提取data部分
+            # Bilibili只有视频类型，aweme_type设为0(video)
+            aweme_type = 0
        else:
            raise ValueError("hybrid_parsing_single_video: Cannot judge the video source from the URL.")

@ -103,27 +133,44 @@ class HybridCrawler:
        and then use the .update() method to update the data
        """

-        result_data = {
-            'type': url_type,
-            'platform': platform,
-            'aweme_id': aweme_id,
-            'desc': data.get("desc"),
-            'create_time': data.get("create_time"),
-            'author': data.get("author"),
-            'music': data.get("music"),
-            'statistics': data.get("statistics"),
-            'cover_data': {
-                'cover': data.get("video").get("cover"),
-                'origin_cover': data.get("video").get("origin_cover"),
-                'dynamic_cover': data.get("video").get("dynamic_cover")
-            },
-            'hashtags': data.get('text_extra'),
-        }
+        # 根据平台适配字段映射
+        if platform == 'bilibili':
+            result_data = {
+                'type': url_type,
+                'platform': platform,
+                'video_id': aweme_id,
+                'desc': data.get("title"),  # Bilibili使用title
+                'create_time': data.get("pubdate"),  # Bilibili使用pubdate
+                'author': data.get("owner"),  # Bilibili使用owner
+                'music': None,  # Bilibili没有音乐信息
+                'statistics': data.get("stat"),  # Bilibili使用stat
+                'cover_data': {},  # 将在各平台处理中填充
+                'hashtags': None,  # Bilibili没有hashtags概念
+            }
+        else:
+            result_data = {
+                'type': url_type,
+                'platform': platform,
+                'video_id': aweme_id,  # 统一使用video_id字段，内容可能是aweme_id或bv_id
+                'desc': data.get("desc"),
+                'create_time': data.get("create_time"),
+                'author': data.get("author"),
+                'music': data.get("music"),
+                'statistics': data.get("statistics"),
+                'cover_data': {},  # 将在各平台处理中填充
+                'hashtags': data.get('text_extra'),
+            }
        # 创建一个空变量，稍后使用.update()方法更新数据/Create an empty variable and use the .update() method to update the data
        api_data = None
        # 判断链接类型并处理数据/Judge link type and process data
        # 抖音数据处理/Douyin data processing
        if platform == 'douyin':
+            # 填充封面数据
+            result_data['cover_data'] = {
+                'cover': data.get("video", {}).get("cover"),
+                'origin_cover': data.get("video", {}).get("origin_cover"),
+                'dynamic_cover': data.get("video", {}).get("dynamic_cover")
+            }
            # 抖音视频数据处理/Douyin video data processing
            if url_type == 'video':
                # 将信息储存在字典中/Store information in a dictionary
@ -160,6 +207,12 @@ class HybridCrawler:
                }
        # TikTok数据处理/TikTok data processing
        elif platform == 'tiktok':
+            # 填充封面数据
+            result_data['cover_data'] = {
+                'cover': data.get("video", {}).get("cover"),
+                'origin_cover': data.get("video", {}).get("origin_cover"),
+                'dynamic_cover': data.get("video", {}).get("dynamic_cover")
+            }
            # TikTok视频数据处理/TikTok video data processing
            if url_type == 'video':
                # 将信息储存在字典中/Store information in a dictionary
@ -198,6 +251,50 @@ class HybridCrawler:
                            'watermark_image_list': watermark_image_list
                        }
                }
+        # Bilibili数据处理/Bilibili data processing
+        elif platform == 'bilibili':
+            # 填充封面数据
+            result_data['cover_data'] = {
+                'cover': data.get("pic"),  # Bilibili使用pic作为封面
+                'origin_cover': data.get("pic"),
+                'dynamic_cover': data.get("pic")
+            }
+            # Bilibili只有视频，直接处理视频数据
+            if url_type == 'video':
+                # 获取视频播放地址需要额外调用API
+                cid = data.get('cid')  # 获取cid
+                if cid:
+                    # 获取播放链接，cid需要转换为字符串
+                    playurl_data = await self.BilibiliWebCrawler.fetch_video_playurl(aweme_id, str(cid))
+                    # 从播放数据中提取URL
+                    dash = playurl_data.get('data', {}).get('dash', {})
+                    video_list = dash.get('video', [])
+                    audio_list = dash.get('audio', [])
+                    
+                    # 选择最高质量的视频流
+                    video_url = video_list[0].get('baseUrl') if video_list else None
+                    audio_url = audio_list[0].get('baseUrl') if audio_list else None
+                    
+                    api_data = {
+                        'video_data': {
+                            'wm_video_url': video_url,
+                            'wm_video_url_HQ': video_url,
+                            'nwm_video_url': video_url,  # Bilibili没有水印概念
+                            'nwm_video_url_HQ': video_url,
+                            'audio_url': audio_url,  # Bilibili音视频分离
+                            'cid': cid,  # 保存cid供后续使用
+                        }
+                    }
+                else:
+                    api_data = {
+                        'video_data': {
+                            'wm_video_url': None,
+                            'wm_video_url_HQ': None,
+                            'nwm_video_url': None,
+                            'nwm_video_url_HQ': None,
+                            'error': 'Failed to get cid for video playback'
+                        }
+                    }
        # 更新数据/Update data
        result_data.update(api_data)
        return result_data