"""Bilibili short-drama collector (public API version).

No login is required: the script queries Bilibili's public web search
endpoint for short-drama keywords, de-duplicates the hits, and writes them to
JSON files under ``DATA_DIR``.
"""
import json
import os
import re
import time
from datetime import datetime

import requests

DATA_DIR = '/opt/zhiqiu-tools/data/drama'
os.makedirs(DATA_DIR, exist_ok=True)

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    'Referer': 'https://www.bilibili.com/'
}

# Matches the <em ...> / </em> markup wrapped around matched keywords in titles.
_HIGHLIGHT_TAG = re.compile(r'</?em\b[^>]*>')


def search_bilibili(keyword, page=1):
    """Search Bilibili videos for ``keyword`` and return one page of raw hits.

    Args:
        keyword: Search phrase sent as the ``keyword`` query parameter.
        page: 1-based result page number.

    Returns:
        The list found at ``data.result`` in the JSON response, or ``[]`` when
        the response ``code`` is non-zero or no result list is present.

    Raises:
        requests.RequestException: On network failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    url = 'https://api.bilibili.com/x/web-interface/search/type'
    params = {'search_type': 'video', 'keyword': keyword, 'page': page}
    r = requests.get(url, params=params, headers=HEADERS, timeout=15)
    data = r.json()
    if data.get('code') != 0:
        return []
    # ``data`` / ``result`` may be present but null, so guard with ``or``.
    return (data.get('data') or {}).get('result') or []


def format_video(v):
    """Extract the key fields of one raw search hit into a flat dict.

    Args:
        v: A single result dict as returned by ``search_bilibili``.

    Returns:
        A dict with the title (highlight markup removed), play count, author,
        duration, ids, tag, description (truncated to 200 characters), cover
        URL, the constant source ``'bilibili'`` and the crawl timestamp.
    """
    # NOTE(review): the original called ``.replace('', '')`` twice, which is a
    # no-op; presumably the highlight tags were stripped from the literals.
    # Confirm against a live response.
    title = _HIGHLIGHT_TAG.sub('', v.get('title') or '')
    return {
        'title': title,
        'play': v.get('play', 0),
        'author': v.get('author', ''),
        'duration': v.get('duration', ''),
        'bvid': v.get('bvid', ''),
        'aid': v.get('aid', ''),
        'tag': v.get('tag', ''),
        # ``description`` can be null in the payload; slicing None would raise.
        'description': (v.get('description') or '')[:200],
        'pic': v.get('pic', ''),
        'source': 'bilibili',
        'crawl_time': datetime.now().isoformat()
    }


def _play_count(value):
    """Coerce a play-count field to ``int`` for sorting; non-numeric -> 0."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return 0


def _save_json(data, tag=None):
    """Write ``data`` as JSON to a timestamped file in ``DATA_DIR``.

    The file is named ``bili_drama_<tag>_<ts>.json`` when ``tag`` is given and
    ``bili_drama_<ts>.json`` otherwise. Returns the path written.
    """
    ts = datetime.now().strftime('%Y%m%d_%H%M%S')
    name = f'bili_drama_{tag}_{ts}.json' if tag else f'bili_drama_{ts}.json'
    path = os.path.join(DATA_DIR, name)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    return path


def collect(keywords, max_pages=3):
    """Collect search results for every keyword and persist them as JSON.

    For each keyword up to ``max_pages`` pages are fetched; paging for that
    keyword stops at the first failed or empty page. Hits are de-duplicated
    across all keywords by ``bvid`` (falling back to ``aid``). An interim
    snapshot is written after each keyword, and the final list, sorted by play
    count in descending order, is written once at the end.

    Args:
        keywords: Iterable of search phrases.
        max_pages: Maximum number of pages to request per keyword.

    Returns:
        The list of formatted video dicts, sorted by play count descending.
    """
    all_results = []
    seen = set()
    for kw in keywords:
        print(f'\n🔍 搜索: {kw}')
        for page in range(1, max_pages + 1):
            try:
                results = search_bilibili(kw, page)
            except (requests.RequestException, ValueError):
                # Network errors and malformed JSON only; Ctrl-C and real
                # bugs must not be swallowed here.
                print(f' 第{page}页请求失败,跳过')
                break
            if not results:
                print(f' 第{page}页无数据')
                break
            count = 0
            for v in results:
                vid = v.get('bvid') or v.get('aid')
                if vid and vid not in seen:
                    seen.add(vid)
                    all_results.append(format_video(v))
                    count += 1
            print(f' 第{page}页: +{count}条 (累计{len(all_results)})')
            time.sleep(0.5)  # pause between page requests
        # Save an interim snapshot after each keyword round.
        _save_json(all_results, 'interim')

    # Sort by play count, highest first, so the saved file and TOP5 agree.
    all_results.sort(key=lambda item: _play_count(item['play']), reverse=True)

    # Write the final result file.
    path = _save_json(all_results)
    print(f'\n✅ 采集完成: {len(all_results)}条')
    print(f'📁 已保存: {path}')

    # Print the top 5 by play count.
    print('\n🏆 播放量TOP5:')
    for i, item in enumerate(all_results[:5], 1):
        print(f' {i}. [{item["play"]}播放] {item["title"][:50]}')
        print(f' UP: {item["author"]} | {item["duration"]}')
    return all_results


if __name__ == '__main__':
    keywords = ['短剧', '短剧推荐', 'ai短剧', '真人短剧', '爆款短剧']
    collect(keywords, max_pages=2)