diff --git a/app/api/endpoints/hybrid_parsing.py b/app/api/endpoints/hybrid_parsing.py index f45a8d8..a8b56dc 100644 --- a/app/api/endpoints/hybrid_parsing.py +++ b/app/api/endpoints/hybrid_parsing.py @@ -51,3 +51,59 @@ async def hybrid_parsing_single_video(request: Request, params=dict(request.query_params), ) raise HTTPException(status_code=status_code, detail=detail.dict()) + +# 更新Cookie +@router.post("/update_cookie", + response_model=ResponseModel, + summary="更新Cookie/Update Cookie") +async def update_cookie_api(request: Request, + service: str = Body(example="douyin", description="服务名称/Service name"), + cookie: str = Body(example="YOUR_NEW_COOKIE", description="新的Cookie值/New Cookie value")): + """ + # [中文] + ### 用途: + - 更新指定服务的Cookie + ### 参数: + - service: 服务名称 (如: douyin_web) + - cookie: 新的Cookie值 + ### 返回: + - 更新结果 + + # [English] + ### Purpose: + - Update Cookie for specified service + ### Parameters: + - service: Service name (e.g.: douyin_web) + - cookie: New Cookie value + ### Return: + - Update result + + # [示例/Example] + service = "douyin_web" + cookie = "YOUR_NEW_COOKIE" + """ + try: + if service == "douyin": + from crawlers.douyin.web.web_crawler import DouyinWebCrawler + douyin_crawler = DouyinWebCrawler() + await douyin_crawler.update_cookie(cookie) + return ResponseModel(code=200, + router=request.url.path, + data={"message": f"Cookie for {service} updated successfully"}) + elif service == "tiktok": + # 这里可以添加TikTok的cookie更新逻辑 + # from crawlers.tiktok.web.web_crawler import TikTokWebCrawler + # tiktok_crawler = TikTokWebCrawler() + # await tiktok_crawler.update_cookie(cookie) + return ResponseModel(code=200, + router=request.url.path, + data={"message": f"Cookie for {service} will be updated (not implemented yet)"}) + else: + raise ValueError(f"Service '{service}' is not supported. 
Supported services: douyin, tiktok") + except Exception as e: + status_code = 400 + detail = ErrorResponseModel(code=status_code, + router=request.url.path, + params=dict(request.query_params), + ) + raise HTTPException(status_code=status_code, detail=detail.dict()) \ No newline at end of file diff --git a/chrome-cookie-sniffer/README.md b/chrome-cookie-sniffer/README.md new file mode 100644 index 0000000..f749c40 --- /dev/null +++ b/chrome-cookie-sniffer/README.md @@ -0,0 +1,171 @@ +# Chrome Cookie Sniffer + +一个用于自动嗅探和提取网站Cookie的Chrome扩展程序。支持抖音等主流平台,具备智能去重、时间控制和Webhook回调等功能。 + +## 功能特性 + +- 🎯 **智能Cookie抓取** - 自动拦截POST/GET请求中的Cookie +- ⏱️ **防重复机制** - 5分钟内不重复抓取相同服务 +- 🔄 **内容去重** - 只有Cookie内容变化时才保存 +- 🎨 **现代化界面** - Card列表展示,状态一目了然 +- 🔗 **Webhook回调** - Cookie更新时自动推送到指定地址 +- 📋 **一键复制** - 快速复制Cookie到剪贴板 +- 🗂️ **数据管理** - 支持导出、清理和单独删除 +- 🔧 **调试友好** - 内置Webhook测试功能 + +## 支持的网站 + +- 🎵 **抖音** (douyin.com) +- 🚀 **扩展性** - 架构支持轻松添加更多平台 + +## 安装方法 + +### 1. 下载源码 + +```bash +git clone +# 或直接下载ZIP文件并解压 +``` + +### 2. 在Chrome中加载扩展 + +1. **打开Chrome扩展管理页面** + - 方法一:地址栏输入 `chrome://extensions/` + - 方法二:菜单 → 更多工具 → 扩展程序 + +2. **启用开发者模式** + - 在扩展管理页面右上角,开启"开发者模式"开关 + +3. **加载解压的扩展程序** + - 点击"加载已解压的扩展程序"按钮 + - 选择 `chrome-cookie-sniffer` 文件夹 + - 确认加载 + +4. **验证安装** + - 扩展列表中出现"Cookie Sniffer" + - 浏览器工具栏出现扩展图标 + - 状态显示为"已启用" + +### 3. 权限确认 + +安装时Chrome会请求以下权限: +- `webRequest` - 拦截网络请求 +- `storage` - 本地数据存储 +- `cookies` - 读取Cookie信息 +- `activeTab` - 当前标签页访问 +- `host_permissions` - 访问douyin.com域名 + +## 使用方法 + +### 基础使用 + +1. **访问目标网站** - 打开抖音等支持的网站 +2. **触发请求** - 正常浏览,触发POST/GET请求 +3. **查看结果** - 点击扩展图标查看抓取的Cookie + +### 配置Webhook + +1. **打开扩展弹窗** +2. **输入Webhook地址** - 在顶部输入框填入回调URL +3. **测试连接** - 点击"🔧 测试"按钮验证 +4. **自动回调** - Cookie更新时自动POST到指定地址 + +### Webhook数据格式 + +```json +{ + "service": "douyin", + "cookie": "具体的Cookie字符串", + "timestamp": "2025-08-29T12:34:56.789Z" +} +``` + +测试时会额外包含: +```json +{ + "test": true, + "message": "这是一个测试回调..." 
+} +``` + +### 数据管理 + +- **📋 复制Cookie** - 点击卡片中的复制按钮 +- **🗑️ 删除数据** - 删除单个服务的Cookie +- **🔄 刷新** - 手动刷新数据显示 +- **📤 导出** - 导出所有数据为JSON文件 +- **🧹 清空** - 清空所有Cookie数据 + +## 调试指南 + +### 查看日志 + +1. **打开扩展管理页面** (`chrome://extensions/`) +2. **找到Cookie Sniffer扩展** +3. **点击"服务工作进程"** - 查看蓝色链接 +4. **查看控制台输出** - 所有日志都在这里 + +### 常见问题 + +**Q: 扩展不工作?** +- 检查是否启用开发者模式 +- 确认权限已正确授予 +- 查看service worker是否正在运行 + +**Q: 没有抓取到Cookie?** +- 确认访问的是支持的网站 +- 检查是否触发了POST/GET请求 +- 查看service worker控制台日志 + +**Q: Webhook测试失败?** +- 检查URL格式是否正确 +- 确认服务器支持跨域请求 +- 验证服务器是否正常响应 + +### 开发者选项 + +修改 `background.js` 中的 `SERVICES` 配置来添加新网站(同时还需要在 `manifest.json` 的 `host_permissions`、`background.js` 中请求监听器的 `urls` 过滤条件以及 `popup.js` 的 `SERVICES` 中加入对应的域名/服务,否则新网站的请求不会被拦截,弹窗中也不会显示): + +```javascript +const SERVICES = { + douyin: { + name: 'douyin', + displayName: '抖音', + domains: ['douyin.com'], + cookieDomain: '.douyin.com' + }, + // 添加新服务 + bilibili: { + name: 'bilibili', + displayName: 'B站', + domains: ['bilibili.com'], + cookieDomain: '.bilibili.com' + } +}; +``` + +## 文件结构 + +``` +chrome-cookie-sniffer/ +├── manifest.json # 扩展配置文件 +├── background.js # 后台服务脚本 +├── popup.html # 弹窗界面 +├── popup.js # 弹窗逻辑 +└── README.md # 说明文档 +``` + +## 注意事项 + +- ⚠️ **仅用于合法用途** - 请遵守网站服务条款 +- 🔒 **数据安全** - Cookie数据默认仅存储在本地;一旦配置了Webhook地址,Cookie会在更新时被POST到该地址,请只填写可信的地址 +- 🔄 **定期更新** - 网站更新可能影响抓取效果 +- 📱 **Chrome限制** - 部分网站可能有反爬虫机制 + +## 开源协议 + +本项目遵循 MIT 开源协议。 + +## 贡献指南 + +欢迎提交Issue和Pull Request来改进这个项目! 
\ No newline at end of file diff --git a/chrome-cookie-sniffer/background.js b/chrome-cookie-sniffer/background.js new file mode 100644 index 0000000..17f861c --- /dev/null +++ b/chrome-cookie-sniffer/background.js @@ -0,0 +1,177 @@ +// 启动时记录 +console.log('Cookie Sniffer service worker 已启动'); + +// 服务配置 +const SERVICES = { + douyin: { + name: 'douyin', + displayName: '抖音', + domains: ['douyin.com'], + cookieDomain: '.douyin.com' + } +}; + +// 获取服务名称 +function getServiceFromUrl(url) { + for (const [key, service] of Object.entries(SERVICES)) { + if (service.domains.some(domain => url.includes(domain))) { + return service; + } + } + return null; +} + +// 检查是否在5分钟内抓取过 +async function shouldSkipCapture(serviceName) { + return new Promise((resolve) => { + chrome.storage.local.get([`lastCapture_${serviceName}`], function(result) { + const lastTime = result[`lastCapture_${serviceName}`]; + if (!lastTime) { + resolve(false); + return; + } + + const now = Date.now(); + const fiveMinutes = 5 * 60 * 1000; + const shouldSkip = (now - lastTime) < fiveMinutes; + + if (shouldSkip) { + console.log(`${serviceName}: 5分钟内已抓取过,跳过`); + } + resolve(shouldSkip); + }); + }); +} + +// 检查Cookie是否有变化 +async function isCookieChanged(serviceName, newCookie) { + return new Promise((resolve) => { + chrome.storage.local.get([`cookieData_${serviceName}`], function(result) { + const existingData = result[`cookieData_${serviceName}`]; + if (!existingData || existingData.cookie !== newCookie) { + resolve(true); + } else { + console.log(`${serviceName}: Cookie内容无变化,跳过`); + resolve(false); + } + }); + }); +} + +// 保存Cookie数据 +async function saveCookieData(serviceName, url, cookie, source = 'headers') { + const cookieData = { + service: serviceName, + url: url, + timestamp: Date.now(), + lastUpdate: new Date().toISOString(), + cookie: cookie, + source: source + }; + + // 保存服务数据 + chrome.storage.local.set({ + [`cookieData_${serviceName}`]: cookieData, + [`lastCapture_${serviceName}`]: Date.now() + }); + + 
// 触发Webhook回调 + await sendWebhook(serviceName, cookie); + + console.log(`${serviceName}: Cookie已保存`); +} + +// Webhook回调 +async function sendWebhook(serviceName, cookie) { + chrome.storage.local.get(['webhookUrl'], function(result) { + const webhookUrl = result.webhookUrl; + if (webhookUrl && webhookUrl.trim()) { + const payload = { + service: serviceName, + cookie: cookie, + timestamp: new Date().toISOString() + }; + + fetch(webhookUrl, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify(payload) + }).then(response => { + console.log(`Webhook回调成功: ${serviceName}`, response.status); + }).catch(error => { + console.error(`Webhook回调失败: ${serviceName}`, error); + }); + } + }); +} + +chrome.webRequest.onBeforeSendHeaders.addListener( + async function(details) { + const service = getServiceFromUrl(details.url); + if (!service) return; + + console.log(`请求拦截: ${service.displayName}`, details.url, details.method); + + if (details.method === "POST" || details.method === "GET") { + // 检查5分钟限制 + if (await shouldSkipCapture(service.name)) { + return; + } + + let cookieFound = false; + + // 尝试从请求头获取Cookie + if (details.requestHeaders) { + for (let header of details.requestHeaders) { + if (header.name.toLowerCase() === "cookie") { + console.log(`从请求头捕获到Cookie: ${service.displayName}`); + + // 检查Cookie是否有变化 + if (await isCookieChanged(service.name, header.value)) { + await saveCookieData(service.name, details.url, header.value, 'headers'); + } + + cookieFound = true; + break; + } + } + } + + // 如果请求头没有Cookie,使用cookies API备用方案 + if (!cookieFound) { + chrome.cookies.getAll({domain: service.cookieDomain}, async function(cookies) { + if (cookies && cookies.length > 0) { + console.log(`通过cookies API获取到: ${service.displayName}`, cookies.length, '个cookie'); + const cookieString = cookies.map(c => `${c.name}=${c.value}`).join('; '); + + // 检查Cookie是否有变化 + if (await isCookieChanged(service.name, cookieString)) { + await 
saveCookieData(service.name, details.url, cookieString, 'cookies_api'); + } + } + }); + } + } + }, + { urls: ["https://*.douyin.com/*", "https://douyin.com/*"] }, + ["requestHeaders", "extraHeaders"] + ); + +// 添加存储变化监听 +chrome.storage.onChanged.addListener((changes, areaName) => { + if (areaName === 'local') { + // 监听服务数据变化 + Object.keys(changes).forEach(key => { + if (key.startsWith('cookieData_')) { + const serviceName = key.replace('cookieData_', ''); + const serviceConfig = SERVICES[serviceName]; + if (serviceConfig && changes[key].newValue) { + console.log(`${serviceConfig.displayName} Cookie数据已更新`); + } + } + }); + } +}); + \ No newline at end of file diff --git a/chrome-cookie-sniffer/manifest.json b/chrome-cookie-sniffer/manifest.json new file mode 100644 index 0000000..d163517 --- /dev/null +++ b/chrome-cookie-sniffer/manifest.json @@ -0,0 +1,24 @@ +{ + "manifest_version": 3, + "name": "Cookie Sniffer", + "version": "1.0", + "description": "监听并获取指定网站的请求 Cookie", + "permissions": [ + "webRequest", + "storage", + "activeTab", + "cookies" + ], + "host_permissions": [ + "https://*.douyin.com/*", + "https://douyin.com/*" + ], + "background": { + "service_worker": "background.js" + }, + "action": { + "default_popup": "popup.html", + "default_title": "Cookie Sniffer" + } + } + \ No newline at end of file diff --git a/chrome-cookie-sniffer/popup.html b/chrome-cookie-sniffer/popup.html new file mode 100644 index 0000000..36538ed --- /dev/null +++ b/chrome-cookie-sniffer/popup.html @@ -0,0 +1,178 @@ + + + + + + + +
+

Cookie Sniffer

+ +
+ +
+ + +
+
+
+
+ +
+ + + +
+ + + +
+ + + + + + \ No newline at end of file diff --git a/chrome-cookie-sniffer/popup.js b/chrome-cookie-sniffer/popup.js new file mode 100644 index 0000000..92e962b --- /dev/null +++ b/chrome-cookie-sniffer/popup.js @@ -0,0 +1,292 @@ +document.addEventListener('DOMContentLoaded', function() { + const refreshBtn = document.getElementById('refresh'); + const clearBtn = document.getElementById('clear'); + const exportBtn = document.getElementById('export'); + const webhookInput = document.getElementById('webhookUrl'); + const testWebhookBtn = document.getElementById('testWebhook'); + const webhookStatus = document.getElementById('webhookStatus'); + const statusInfo = document.getElementById('statusInfo'); + const serviceCards = document.getElementById('serviceCards'); + const emptyState = document.getElementById('emptyState'); + + // 服务配置 + const SERVICES = { + douyin: { name: 'douyin', displayName: '抖音', icon: '🎵' } + }; + + // 加载Webhook配置 + function loadWebhookConfig() { + chrome.storage.local.get(['webhookUrl'], function(result) { + if (result.webhookUrl) { + webhookInput.value = result.webhookUrl; + } + updateTestButtonState(); + }); + } + + // 保存Webhook配置 + function saveWebhookConfig() { + const url = webhookInput.value.trim(); + chrome.storage.local.set({ webhookUrl: url }); + showStatusInfo('Webhook地址已保存'); + updateTestButtonState(); + } + + // 更新测试按钮状态 + function updateTestButtonState() { + const url = webhookInput.value.trim(); + testWebhookBtn.disabled = !url || !isValidUrl(url); + } + + // 验证URL格式 + function isValidUrl(string) { + try { + new URL(string); + return string.startsWith('http://') || string.startsWith('https://'); + } catch (_) { + return false; + } + } + + // 测试Webhook回调 + async function testWebhook() { + const url = webhookInput.value.trim(); + if (!url) { + webhookStatus.textContent = '请先输入Webhook地址'; + webhookStatus.style.color = '#dc3545'; + return; + } + + testWebhookBtn.disabled = true; + testWebhookBtn.textContent = '⏳ 测试中...'; + 
webhookStatus.textContent = '正在发送测试请求...'; + webhookStatus.style.color = '#17a2b8'; + + // 获取现有数据或创建测试数据 + chrome.storage.local.get(['cookieData_douyin'], async function(result) { + let testData; + + if (result.cookieData_douyin) { + // 使用现有数据 + testData = { + service: 'douyin', + cookie: result.cookieData_douyin.cookie, + timestamp: new Date().toISOString(), + test: true, + message: '这是一个测试回调,使用了真实的Cookie数据' + }; + } else { + // 使用模拟数据 + testData = { + service: 'douyin', + cookie: 'test_cookie=test_value; another_cookie=another_value', + timestamp: new Date().toISOString(), + test: true, + message: '这是一个测试回调,使用了模拟Cookie数据' + }; + } + + try { + const response = await fetch(url, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify(testData) + }); + + if (response.ok) { + webhookStatus.textContent = `✅ 测试成功 (${response.status})`; + webhookStatus.style.color = '#28a745'; + } else { + webhookStatus.textContent = `❌ 服务器错误 (${response.status})`; + webhookStatus.style.color = '#dc3545'; + } + } catch (error) { + console.error('Webhook测试失败:', error); + if (error.name === 'TypeError' && error.message.includes('fetch')) { + webhookStatus.textContent = '❌ 网络错误或跨域限制'; + } else { + webhookStatus.textContent = `❌ 请求失败: ${error.message}`; + } + webhookStatus.style.color = '#dc3545'; + } finally { + testWebhookBtn.disabled = false; + testWebhookBtn.textContent = '🔧 测试'; + updateTestButtonState(); + + // 5秒后清除状态信息 + setTimeout(() => { + webhookStatus.textContent = ''; + }, 5000); + } + }); + } + + // 显示状态信息 + function showStatusInfo(message) { + statusInfo.textContent = message; + statusInfo.style.display = 'block'; + setTimeout(() => { + statusInfo.style.display = 'none'; + }, 3000); + } + + // 加载服务数据 + function loadServiceData() { + const serviceKeys = Object.keys(SERVICES).map(service => `cookieData_${service}`); + chrome.storage.local.get(serviceKeys, function(result) { + const hasData = Object.keys(result).length > 0; + + if 
(!hasData) { + serviceCards.innerHTML = ''; + emptyState.style.display = 'block'; + return; + } + + emptyState.style.display = 'none'; + serviceCards.innerHTML = ''; + + Object.keys(SERVICES).forEach(serviceKey => { + const service = SERVICES[serviceKey]; + const data = result[`cookieData_${serviceKey}`]; + + if (data) { + createServiceCard(service, data); + } + }); + }); + } + + // 创建服务卡片 + function createServiceCard(service, data) { + const card = document.createElement('div'); + card.className = 'service-card'; + + const isRecent = Date.now() - data.timestamp < 5 * 60 * 1000; // 5分钟内 + const lastUpdate = new Date(data.lastUpdate).toLocaleString(); + + card.innerHTML = ` +
+
${service.icon} ${service.displayName}
+
+ ${isRecent ? '活跃' : '休眠'} +
+
+
+
上次更新: ${lastUpdate}
+
+ + +
+
+ `; + + serviceCards.appendChild(card); + } + + // 复制Cookie到剪贴板 + async function copyCookie(serviceName) { + chrome.storage.local.get([`cookieData_${serviceName}`], async function(result) { + const data = result[`cookieData_${serviceName}`]; + if (data && data.cookie) { + try { + await navigator.clipboard.writeText(data.cookie); + showStatusInfo(`${SERVICES[serviceName].displayName} Cookie已复制到剪贴板`); + } catch (err) { + // 备用方案 + const textarea = document.createElement('textarea'); + textarea.value = data.cookie; + document.body.appendChild(textarea); + textarea.select(); + document.execCommand('copy'); + document.body.removeChild(textarea); + showStatusInfo(`${SERVICES[serviceName].displayName} Cookie已复制到剪贴板`); + } + } + }); + } + + // 删除服务数据 + function deleteService(serviceName) { + if (confirm(`确定要删除 ${SERVICES[serviceName].displayName} 的Cookie数据吗?`)) { + chrome.storage.local.remove([ + `cookieData_${serviceName}`, + `lastCapture_${serviceName}` + ], function() { + loadServiceData(); + showStatusInfo(`${SERVICES[serviceName].displayName} 数据已删除`); + }); + } + } + + // 清空所有数据 + function clearAllData() { + if (confirm('确定要清空所有Cookie数据吗?')) { + const keysToRemove = []; + Object.keys(SERVICES).forEach(service => { + keysToRemove.push(`cookieData_${service}`); + keysToRemove.push(`lastCapture_${service}`); + }); + + chrome.storage.local.remove(keysToRemove, function() { + loadServiceData(); + showStatusInfo('所有数据已清空'); + }); + } + } + + // 导出数据 + function exportData() { + const serviceKeys = Object.keys(SERVICES).map(service => `cookieData_${service}`); + chrome.storage.local.get(serviceKeys, function(result) { + const exportData = {}; + + Object.keys(result).forEach(key => { + const serviceName = key.replace('cookieData_', ''); + exportData[serviceName] = result[key]; + }); + + const blob = new Blob([JSON.stringify(exportData, null, 2)], {type: 'application/json'}); + const url = URL.createObjectURL(blob); + + const a = document.createElement('a'); + a.href = url; + 
a.download = `cookie-sniffer-${new Date().toISOString().slice(0,10)}.json`; + a.click(); + + URL.revokeObjectURL(url); + showStatusInfo('数据已导出'); + }); + } + + // 事件绑定 + refreshBtn.addEventListener('click', loadServiceData); + clearBtn.addEventListener('click', clearAllData); + exportBtn.addEventListener('click', exportData); + webhookInput.addEventListener('blur', saveWebhookConfig); + webhookInput.addEventListener('input', updateTestButtonState); + testWebhookBtn.addEventListener('click', testWebhook); + + // 代理点击事件 + serviceCards.addEventListener('click', function(e) { + if (e.target.classList.contains('copy-btn')) { + const serviceName = e.target.getAttribute('data-service'); + copyCookie(serviceName); + } else if (e.target.classList.contains('delete-btn')) { + const serviceName = e.target.getAttribute('data-service'); + deleteService(serviceName); + } + }); + + // 初始化 + loadWebhookConfig(); + loadServiceData(); + + // 自动刷新(每30秒) + setInterval(loadServiceData, 30000); +}); \ No newline at end of file diff --git a/crawlers/douyin/web/web_crawler.py b/crawlers/douyin/web/web_crawler.py index e5c07d5..1e075eb 100644 --- a/crawlers/douyin/web/web_crawler.py +++ b/crawlers/douyin/web/web_crawler.py @@ -348,6 +348,26 @@ class DouyinWebCrawler: # 对于URL列表 return await WebCastIdFetcher.get_all_webcast_id(urls) + async def update_cookie(self, cookie: str): + """ + 更新指定服务的Cookie + + Args: + service: 服务名称 (如: douyin_web) + cookie: 新的Cookie值 + """ + global config + service = "douyin" + print('DouyinWebCrawler before update', config["TokenManager"][service]["headers"]["Cookie"]) + print('DouyinWebCrawler to update', cookie) + # 1. 更新内存中的配置(立即生效) + config["TokenManager"][service]["headers"]["Cookie"] = cookie + print('DouyinWebCrawler cookie updated', config["TokenManager"][service]["headers"]["Cookie"]) + # 2. 
写入配置文件(持久化) + config_path = f"{path}/config.yaml" + with open(config_path, 'w', encoding='utf-8') as file: + yaml.dump(config, file, default_flow_style=False, allow_unicode=True, indent=2) + async def main(self): """-------------------------------------------------------handler接口列表-------------------------------------------------------"""