From 0d6a3a0c627f634031f28367025b10c39085400e Mon Sep 17 00:00:00 2001
From: Evil0ctal <evil0ctal1985@gmail.com>
Date: Thu, 23 Jun 2022 15:51:23 -0700
Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=E4=BF=AE=E5=A4=8D=E4=BA=86=E8=A7=A3?=
 =?UTF-8?q?=E6=9E=90=E5=A4=B1=E8=B4=A5=E7=9A=84bug=E5=B9=B6=E5=A2=9E?=
 =?UTF-8?q?=E5=8A=A0=E4=BA=86=E4=BB=A3=E7=90=86=E9=80=89=E9=A1=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scraper.py | 103 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 54 insertions(+), 49 deletions(-)

diff --git a/scraper.py b/scraper.py
index 14635c9..176fb47 100644
--- a/scraper.py
+++ b/scraper.py
@@ -2,7 +2,7 @@
 # -*- encoding: utf-8 -*-
 # @Author: https://github.com/Evil0ctal/
 # @Time: 2021/11/06
-# @Update: 2022/06/05
+# @Update: 2022/06/23
 # @Function:
 # 核心代码，估值1块(๑•̀ㅂ•́)و✧
 # 用于爬取Douyin/TikTok数据并以字典形式返回。
@@ -11,6 +11,7 @@
 import re
 import json
 import requests
+import configparser
 from tenacity import *
 
 
@@ -32,6 +33,23 @@ class Scraper:
             "Host": "www.tiktok.com",
             "User-Agent": "Mozilla/5.0  (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) coc_coc_browser/86.0.170 Chrome/80.0.3987.170 Safari/537.36",
         }
+        self.app_config = configparser.ConfigParser()
+        self.app_config.read('config.ini', encoding='utf-8')
+        # read() silently skips a missing file, so make sure the section exists before indexing it
+        if not self.app_config.has_section('Scraper'):
+            self.app_config.add_section('Scraper')
+        self.api_config = self.app_config['Scraper']
+        # Requests go direct unless Proxy_switch is exactly 'True'
+        self.proxies = None
+        if self.api_config.get('Proxy_switch', fallback='False') == 'True':
+            # Either one proxy shared by every protocol, or one proxy per protocol
+            if self.api_config.get('Use_different_protocols', fallback='False') == 'False':
+                self.proxies = {'all': self.api_config['All']}
+            else:
+                self.proxies = {
+                    'http': self.api_config['Http_proxy'],
+                    'https': self.api_config['Https_proxy'],
+                }
 
     @retry(stop=stop_after_attempt(3), wait=wait_random(min=1, max=2))
     def douyin(self, original_url):
@@ -50,7 +68,7 @@ class Scraper:
                         'value': original_url}
             else:
                 # 原视频链接
-                r = requests.get(url=original_url, headers=headers, allow_redirects=False)
+                r = requests.get(url=original_url, headers=headers, allow_redirects=False, proxies=self.proxies)
                 try:
                     # 2021/12/11 发现抖音做了限制，会自动重定向网址，但是可以从回执头中获取
                     long_url = r.headers['Location']
@@ -76,7 +94,7 @@ class Scraper:
                 api_url = f'https://www.iesdouyin.com/web/api/v2/aweme/iteminfo/?item_ids={key}'
                 print("正在请求抖音API链接: " + '\n' + api_url)
                 # 将回执以JSON格式处理
-                js = json.loads(requests.get(url=api_url, headers=headers).text)
+                js = json.loads(requests.get(url=api_url, headers=headers, proxies=self.proxies).text)
                 # 判断是否为图集
                 if js['item_list'][0]['images'] is not None:
                     print("类型 = 图集")
@@ -96,33 +114,21 @@ class Scraper:
                         # 如果作者未修改过抖音号，应使用此值以避免无法获取其抖音ID
                         album_author_id = str(js['item_list'][0]['author']['short_id'])
                     # 尝试获取图集BGM信息
-                    if 'music' in js:
-                        try:
-                            # 图集BGM链接
-                            album_music = str(js['item_list'][0]['music']['play_url']['url_list'][0])
-                        except:
-                            # 报错后代表无背景音乐
-                            # 图集BGM链接
-                            album_music = 'No BGM found'
+                    # Look the 'music' entry up directly; a missing or empty entry means no BGM
+                    if js['item_list'][0].get('music'):
+                        # Album BGM URL
+                        album_music = str(js['item_list'][0]['music']['play_url']['url_list'][0])
                         # 图集BGM标题
                         album_music_title = str(js['item_list'][0]['music']['title'])
                         # 图集BGM作者
                         album_music_author = str(js['item_list'][0]['music']['author'])
                         # 图集BGM ID
                         album_music_id = str(js['item_list'][0]['music']['id'])
                         # 图集BGM MID
                         album_music_mid = str(js['item_list'][0]['music']['mid'])
                     else:
-                        # 图集BGM为空
-                        album_music = 'No BGM found'
-                        # 图集BGM标题
-                        album_music_title = 'No BGM found'
-                        # 图集BGM作者
-                        album_music_author = 'No BGM found'
-                        # 图集BGM ID
-                        album_music_id = 'No BGM found'
-                        # 图集BGM MID
-                        album_music_mid = 'No BGM found'
+                        # Every BGM field shares the same placeholder
+                        album_music = album_music_title = album_music_author = album_music_id = album_music_mid = 'No BGM found'
                     # 图集ID
                     album_aweme_id = str(js['item_list'][0]['statistics']['aweme_id'])
                     # 评论数量
@@ -192,7 +198,7 @@ class Scraper:
                     try:
                         r = requests.get(
                             "https://aweme.snssdk.com/aweme/v1/play/?video_id={}&radio=1080p&line=0".format(vid),
-                            headers=headers, allow_redirects=False)
+                            headers=headers, allow_redirects=False, proxies=self.proxies)
                         nwm_video_url_1080p = r.headers['Location']
                     except:
                         nwm_video_url_1080p = "None"
@@ -202,28 +208,27 @@ class Scraper:
                     nwm_video_url = str(js['item_list'][0]['video']['play_addr']['url_list'][0]).replace('playwm',
                                                                                                          'play')
                     # 去水印后视频链接(2022年1月1日抖音APi获取到的URL会进行跳转，需要在Location中获取直链)
-                    r = requests.get(url=nwm_video_url, headers=headers, allow_redirects=False)
+                    r = requests.get(url=nwm_video_url, headers=headers, allow_redirects=False, proxies=self.proxies)
                     video_url = r.headers['Location']
                     # 视频作者签名
                     video_author_signature = str(js['item_list'][0]['author']['signature'])
                     # 视频作者UID
                     video_author_uid = str(js['item_list'][0]['author']['uid'])
                     # 尝试获取视频背景音乐
-                    try:
-                        # 视频BGM链接
-                        video_music = str(js['item_list'][0]['music']['play_url']['url_list'][0])
-                    except:
-                        # 出错代表无背景音乐
-                        # 视频BGM链接
-                        video_music = 'No BGM found'
-                    # 视频BGM标题
-                    video_music_title = str(js['item_list'][0]['music']['title'])
-                    # 视频BGM作者
-                    video_music_author = str(js['item_list'][0]['music']['author'])
-                    # 视频BGM ID
-                    video_music_id = str(js['item_list'][0]['music']['id'])
-                    # 视频BGM MID
-                    video_music_mid = str(js['item_list'][0]['music']['mid'])
+                    # Look the 'music' entry up directly; a missing or empty entry means no BGM
+                    if js['item_list'][0].get('music'):
+                        # Video BGM URL
+                        video_music = str(js['item_list'][0]['music']['play_url']['url_list'][0])
+                        # Video BGM title
+                        video_music_title = str(js['item_list'][0]['music']['title'])
+                        # Video BGM author
+                        video_music_author = str(js['item_list'][0]['music']['author'])
+                        # Video BGM ID
+                        video_music_id = str(js['item_list'][0]['music']['id'])
+                        # Video BGM MID
+                        video_music_mid = str(js['item_list'][0]['music']['mid'])
+                    else:
+                        video_music = video_music_title = video_music_author = video_music_id = video_music_mid = 'No BGM found'
                     # 视频ID
                     video_aweme_id = str(js['item_list'][0]['statistics']['aweme_id'])
                     # 评论数量
@@ -292,12 +297,12 @@ class Scraper:
             print("目标链接: ", original_url)
         else:
             # 从请求头中获取原始链接
-            response = requests.get(url=original_url, headers=headers, allow_redirects=False)
+            response = requests.get(url=original_url, headers=headers, allow_redirects=False, proxies=self.proxies)
             true_link = response.headers['Location'].split("?")[0]
             original_url = true_link
             # TikTok请求头返回的第二种链接类型
             if '.html' in true_link:
-                response = requests.get(url=true_link, headers=headers, allow_redirects=False)
+                response = requests.get(url=true_link, headers=headers, allow_redirects=False, proxies=self.proxies)
                 original_url = response.headers['Location'].split("?")[0]
                 print("目标链接: ", original_url)
         try:
@@ -307,7 +312,7 @@ class Scraper:
             # 尝试从TikTok网页获取部分视频数据，失败后判断为图集
             try:
                 tiktok_headers = self.tiktok_headers
-                html = requests.get(url=original_url, headers=tiktok_headers)
+                html = requests.get(url=original_url, headers=tiktok_headers, proxies=self.proxies)
                 # 正则检索网页中存在的JSON信息
                 resp = re.search('"ItemModule":{(.*)},"UserModule":', html.text).group(1)
                 resp_info = ('{"ItemModule":{' + resp + '}}')
@@ -318,7 +323,7 @@ class Scraper:
                 tiktok_api_link = 'https://api.tiktokv.com/aweme/v1/multi/aweme/detail/?aweme_ids=%5B{}%5D'.format(
                     video_id)
                 print('正在请求API链接:{}'.format(tiktok_api_link))
-                response = requests.get(url=tiktok_api_link, headers=headers).text
+                response = requests.get(url=tiktok_api_link, headers=headers, proxies=self.proxies).text
                 # 将API获取到的内容格式化为JSON
                 result = json.loads(response)
                 # 类型为视频
@@ -421,7 +426,7 @@ class Scraper:
                 tiktok_api_link = 'https://api.tiktokv.com/aweme/v1/multi/aweme/detail/?aweme_ids=%5B{}%5D'.format(
                     video_id)
                 print('正在请求API链接:{}'.format(tiktok_api_link))
-                response = requests.get(url=tiktok_api_link, headers=headers).text
+                response = requests.get(url=tiktok_api_link, headers=headers, proxies=self.proxies).text
                 # 将API获取到的内容格式化为JSON
                 result = json.loads(response)
                 # 类型为视频