🐛Fix TikTok scraper #54 #57 #58

This commit is contained in:
Evil0ctal 2022-07-29 16:54:54 -07:00 committed by GitHub
parent b546d20332
commit 84b1036817
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@@ -2,7 +2,7 @@
# -*- encoding: utf-8 -*- # -*- encoding: utf-8 -*-
# @Author: https://github.com/Evil0ctal/ # @Author: https://github.com/Evil0ctal/
# @Time: 2021/11/06 # @Time: 2021/11/06
# @Update: 2022/07/03 # @Update: 2022/07/29
# @Function: # @Function:
# 核心代码估值1块(๑•̀ㅂ•́)و✧ # 核心代码估值1块(๑•̀ㅂ•́)و✧
# 用于爬取Douyin/TikTok数据并以字典形式返回。 # 用于爬取Douyin/TikTok数据并以字典形式返回。
@@ -334,65 +334,67 @@ class Scraper:
except: except:
video_info = None video_info = None
# 从TikTok官方API获取部分视频数据 # 从TikTok官方API获取部分视频数据
tiktok_api_link = 'https://api.tiktokv.com/aweme/v1/multi/aweme/detail/?aweme_ids=%5B{}%5D'.format( # 新API2022年7月29日 https://api.tiktokv.com/aweme/v1/aweme/detail/?aweme_id={}
# 旧API https://api.tiktokv.com/aweme/v1/multi/aweme/detail/?aweme_ids=%5B{}%5D
tiktok_api_link = 'https://api.tiktokv.com/aweme/v1/aweme/detail/?aweme_id={}'.format(
video_id) video_id)
print('正在请求API链接:{}'.format(tiktok_api_link)) print('正在请求API链接:{}'.format(tiktok_api_link))
response = requests.get(url=tiktok_api_link, headers=headers, proxies=self.proxies).text response = requests.get(url=tiktok_api_link, headers=headers, proxies=self.proxies).text
# 将API获取到的内容格式化为JSON # 将API获取到的内容格式化为JSON
result = json.loads(response) result = json.loads(response)
for i in result["aweme_details"][0]: for i in result["aweme_detail"]:
if i != 'image_post_info': if i != 'image_post_info':
# 类型为视频 # 类型为视频
url_type = 'video' url_type = 'video'
print('类型为视频') print('类型为视频')
# 无水印视频链接 # 无水印视频链接
nwm_video_url = result["aweme_details"][0]["video"]["play_addr"]["url_list"][0] nwm_video_url = result["aweme_detail"]["video"]["play_addr"]["url_list"][0]
try: try:
# 有水印视频链接 # 有水印视频链接
wm_video_url = result["aweme_details"][0]["video"]['download_addr']['url_list'][0] wm_video_url = result["aweme_detail"]["video"]['download_addr']['url_list'][0]
except Exception: except Exception:
# 有水印视频链接 # 有水印视频链接
wm_video_url = 'None' wm_video_url = 'None'
# 视频标题 # 视频标题
video_title = result["aweme_details"][0]["desc"] video_title = result["aweme_detail"]["desc"]
# 视频作者昵称 # 视频作者昵称
video_author_nickname = result["aweme_details"][0]['author']["nickname"] video_author_nickname = result["aweme_detail"]['author']["nickname"]
# 视频作者ID # 视频作者ID
video_author_id = result["aweme_details"][0]['author']["unique_id"] video_author_id = result["aweme_detail"]['author']["unique_id"]
# 上传时间戳 # 上传时间戳
video_create_time = result["aweme_details"][0]['create_time'] video_create_time = result["aweme_detail"]['create_time']
# 视频ID # 视频ID
video_aweme_id = result["aweme_details"][0]['statistics']['aweme_id'] video_aweme_id = result["aweme_detail"]['statistics']['aweme_id']
try: try:
# 视频BGM标题 # 视频BGM标题
video_music_title = result["aweme_details"][0]['music']['title'] video_music_title = result["aweme_detail"]['music']['title']
# 视频BGM作者 # 视频BGM作者
video_music_author = result["aweme_details"][0]['music']['author'] video_music_author = result["aweme_detail"]['music']['author']
# 视频BGM ID # 视频BGM ID
video_music_id = result["aweme_details"][0]['music']['id'] video_music_id = result["aweme_detail"]['music']['id']
# 视频BGM链接 # 视频BGM链接
video_music_url = result["aweme_details"][0]['music']['play_url']['url_list'][0] video_music_url = result["aweme_detail"]['music']['play_url']['url_list'][0]
except: except:
video_music_title, video_music_author, video_music_id, video_music_url = "None", "None", "None", "None" video_music_title, video_music_author, video_music_id, video_music_url = "None", "None", "None", "None"
# 评论数量 # 评论数量
video_comment_count = result["aweme_details"][0]['statistics']['comment_count'] video_comment_count = result["aweme_detail"]['statistics']['comment_count']
# 获赞数量 # 获赞数量
video_digg_count = result["aweme_details"][0]['statistics']['digg_count'] video_digg_count = result["aweme_detail"]['statistics']['digg_count']
# 播放次数 # 播放次数
video_play_count = result["aweme_details"][0]['statistics']['play_count'] video_play_count = result["aweme_detail"]['statistics']['play_count']
# 下载次数 # 下载次数
video_download_count = result["aweme_details"][0]['statistics']['download_count'] video_download_count = result["aweme_detail"]['statistics']['download_count']
# 分享次数 # 分享次数
video_share_count = result["aweme_details"][0]['statistics']['share_count'] video_share_count = result["aweme_detail"]['statistics']['share_count']
# 视频封面 # 视频封面
video_cover = result["aweme_details"][0]['video']['cover']['url_list'][0] video_cover = result["aweme_detail"]['video']['cover']['url_list'][0]
# 视频动态封面 # 视频动态封面
video_dynamic_cover = result["aweme_details"][0]['video']['dynamic_cover']['url_list'][0] video_dynamic_cover = result["aweme_detail"]['video']['dynamic_cover']['url_list'][0]
# 视频原始封面 # 视频原始封面
video_origin_cover = result["aweme_details"][0]['video']['origin_cover']['url_list'][0] video_origin_cover = result["aweme_detail"]['video']['origin_cover']['url_list'][0]
# 将话题保存在列表中 # 将话题保存在列表中
video_hashtags = [] video_hashtags = []
for tag in result["aweme_details"][0]['text_extra']: for tag in result["aweme_detail"]['text_extra']:
if 'hashtag_name' in tag: if 'hashtag_name' in tag:
video_hashtags.append(tag['hashtag_name']) video_hashtags.append(tag['hashtag_name'])
else: else:
@@ -474,39 +476,39 @@ class Scraper:
url_type = 'album' url_type = 'album'
print('类型为图集') print('类型为图集')
# 视频标题 # 视频标题
album_title = result["aweme_details"][0]["desc"] album_title = result["aweme_detail"]["desc"]
# 视频作者昵称 # 视频作者昵称
album_author_nickname = result["aweme_details"][0]['author']["nickname"] album_author_nickname = result["aweme_detail"]['author']["nickname"]
# 视频作者ID # 视频作者ID
album_author_id = result["aweme_details"][0]['author']["unique_id"] album_author_id = result["aweme_detail"]['author']["unique_id"]
# 上传时间戳 # 上传时间戳
album_create_time = result["aweme_details"][0]['create_time'] album_create_time = result["aweme_detail"]['create_time']
# 视频ID # 视频ID
album_aweme_id = result["aweme_details"][0]['statistics']['aweme_id'] album_aweme_id = result["aweme_detail"]['statistics']['aweme_id']
try: try:
# 视频BGM标题 # 视频BGM标题
album_music_title = result["aweme_details"][0]['music']['title'] album_music_title = result["aweme_detail"]['music']['title']
# 视频BGM作者 # 视频BGM作者
album_music_author = result["aweme_details"][0]['music']['author'] album_music_author = result["aweme_detail"]['music']['author']
# 视频BGM ID # 视频BGM ID
album_music_id = result["aweme_details"][0]['music']['id'] album_music_id = result["aweme_detail"]['music']['id']
# 视频BGM链接 # 视频BGM链接
album_music_url = result["aweme_details"][0]['music']['play_url']['url_list'][0] album_music_url = result["aweme_detail"]['music']['play_url']['url_list'][0]
except: except:
album_music_title, album_music_author, album_music_id, album_music_url = "None", "None", "None", "None" album_music_title, album_music_author, album_music_id, album_music_url = "None", "None", "None", "None"
# 评论数量 # 评论数量
album_comment_count = result["aweme_details"][0]['statistics']['comment_count'] album_comment_count = result["aweme_detail"]['statistics']['comment_count']
# 获赞数量 # 获赞数量
album_digg_count = result["aweme_details"][0]['statistics']['digg_count'] album_digg_count = result["aweme_detail"]['statistics']['digg_count']
# 播放次数 # 播放次数
album_play_count = result["aweme_details"][0]['statistics']['play_count'] album_play_count = result["aweme_detail"]['statistics']['play_count']
# 下载次数 # 下载次数
album_download_count = result["aweme_details"][0]['statistics']['download_count'] album_download_count = result["aweme_detail"]['statistics']['download_count']
# 分享次数 # 分享次数
album_share_count = result["aweme_details"][0]['statistics']['share_count'] album_share_count = result["aweme_detail"]['statistics']['share_count']
# 无水印图集 # 无水印图集
album_list = [] album_list = []
for i in result["aweme_details"][0]['image_post_info']['images']: for i in result["aweme_detail"]['image_post_info']['images']:
album_list.append(i['display_image']['url_list'][0]) album_list.append(i['display_image']['url_list'][0])
# 结束时间 # 结束时间
end = time.time() end = time.time()