🎈支持新的抖音链接

2022-04-21 22:13:34 -07:00 · 2022-04-21 22:13:34 -07:00 · 67fdd504a4
commit 67fdd504a4
parent 2cac26f249
1 changed files with 221 additions and 203 deletions
--- a/scraper.py
+++ b/scraper.py
@@ -10,9 +10,8 @@

 import re
 import json
-import time
 import requests
-from retrying import retry
+from tenacity import *


 class Scraper:
@@ -34,7 +33,7 @@ class Scraper:
            "User-Agent": "Mozilla/5.0  (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) coc_coc_browser/86.0.170 Chrome/80.0.3987.170 Safari/537.36",
        }

-    @retry(stop_max_attempt_number=6)
+    @retry(stop=stop_after_attempt(3), wait=wait_random(min=0.2, max=2))
    def douyin(self, original_url):
        """
        利用官方接口解析抖音链接信息
@@ -45,16 +44,34 @@ class Scraper:
        try:
            # 开始时间
            start = time.time()
+            # 判断是否为个人主页链接
+            if 'user' in original_url:
+                return {'status': 'failed', 'reason': '暂不支持个人主页批量解析', 'function': 'Scraper.douyin()',
+                        'value': original_url}
+            else:
                # 原视频链接
                r = requests.get(url=original_url, headers=headers, allow_redirects=False)
                try:
                    # 2021/12/11 发现抖音做了限制，会自动重定向网址，但是可以从回执头中获取
                    long_url = r.headers['Location']
+                    # 判断是否为个人主页链接
+                    if 'user' in long_url:
+                        return {'status': 'failed', 'reason': '暂不支持个人主页批量解析', 'function': 'Scraper.douyin()',
+                                'value': original_url}
                except:
                    # 报错后判断为长链接，直接截取视频id
                    long_url = original_url
                # 正则匹配出视频ID
+                try:
+                    # 第一种链接类型
+                    # https://www.douyin.com/video/7086770907674348841
                    key = re.findall('video/(\d+)?', long_url)[0]
+                    print('视频ID为: {}'.format(key))
+                except Exception:
+                    # 第二种链接类型
+                    # https://www.douyin.com/discover?modal_id=7086770907674348841
+                    key = re.findall('modal_id=(\d+)', long_url)[0]
+                    print('视频ID为: {}'.format(key))
                # 构造抖音API链接
                api_url = f'https://www.iesdouyin.com/web/api/v2/aweme/iteminfo/?item_ids={key}'
                print("正在请求抖音API链接: " + '\n' + api_url)
@@ -168,7 +185,8 @@ class Scraper:
                    # 有水印视频链接
                    wm_video_url = str(js['item_list'][0]['video']['play_addr']['url_list'][0])
                    # 无水印视频链接 (在回执JSON中将关键字'playwm'替换为'play'即可获得无水印地址)
-                nwm_video_url = str(js['item_list'][0]['video']['play_addr']['url_list'][0]).replace('playwm', 'play')
+                    nwm_video_url = str(js['item_list'][0]['video']['play_addr']['url_list'][0]).replace('playwm',
+                                                                                                         'play')
                    # 去水印后视频链接(2022年1月1日抖音APi获取到的URL会进行跳转，需要在Location中获取直链)
                    r = requests.get(url=nwm_video_url, headers=headers, allow_redirects=False)
                    video_url = r.headers['Location']
@@ -251,7 +269,7 @@ class Scraper:
            # 返回异常
            return {'status': 'failed', 'reason': e, 'function': 'Scraper.douyin()', 'value': original_url}

-    @retry(stop_max_attempt_number=6)
+    @retry(stop=stop_after_attempt(3), wait=wait_random(min=0.2, max=2))
    def tiktok(self, original_url):
        """
        解析TikTok链接