zhizhi/tools/bili_drama_collector.py

103 lines
3.3 KiB
Python
Raw Normal View History

"""
B站短剧采集器 · 公开API版
不需要登录直接搜短剧数据
"""
import requests, json, time, os
from datetime import datetime
# Directory that receives the collected JSON files. It is created at import
# time (including missing parents) so later writes do not fail on a missing path.
DATA_DIR = '/opt/zhiqiu-tools/data/drama'
os.makedirs(DATA_DIR, exist_ok=True)
# HTTP headers passed to requests.get() in search_bilibili: a desktop-browser
# User-Agent plus a bilibili.com Referer.
# NOTE(review): presumably required so the API does not reject the request as
# a bot — confirm against the API's current behaviour.
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
'Referer': 'https://www.bilibili.com/'
}
def search_bilibili(keyword, page=1):
    """Search Bilibili videos through the public web search endpoint.

    Args:
        keyword: Search term sent as the ``keyword`` query parameter.
        page: Result page number to request (defaults to 1).

    Returns:
        The list stored under ``data.result`` in the JSON response, or an
        empty list when the response ``code`` is not 0 or the payload holds
        no results.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        ValueError: If the response body is not valid JSON.
    """
    url = 'https://api.bilibili.com/x/web-interface/search/type'
    params = {'search_type': 'video', 'keyword': keyword, 'page': page}
    r = requests.get(url, params=params, headers=HEADERS, timeout=15)
    # Surface HTTP-level failures as exceptions so the caller can tell a
    # failed request apart from a page that simply has no results.
    r.raise_for_status()
    data = r.json()
    if not isinstance(data, dict) or data.get('code') != 0:
        return []
    # dict.get's default only applies to a *missing* key; "data" or "result"
    # may be present but null, so fall back explicitly.
    payload = data.get('data') or {}
    if not isinstance(payload, dict):
        return []
    return payload.get('result') or []
def format_video(v):
    """Extract the key fields of one search-result entry into a flat record.

    Args:
        v: Mapping describing a single video, as found in the search result
            list.

    Returns:
        A dict with ``title`` (keyword-highlight ``<em>`` markup removed),
        ``play``, ``author``, ``duration``, ``bvid``, ``aid``, ``tag``,
        ``description`` (truncated to 200 characters), ``pic``, a constant
        ``source`` of ``'bilibili'`` and ``crawl_time`` as the local time in
        ISO 8601 format.
    """
    # `or ''` also covers keys that exist with a null value, which would
    # otherwise break .replace() / slicing.
    raw_title = v.get('title') or ''
    title = raw_title.replace('<em class="keyword">', '').replace('</em>', '')
    description = v.get('description') or ''
    return {
        'title': title,
        'play': v.get('play', 0),
        'author': v.get('author', ''),
        'duration': v.get('duration', ''),
        'bvid': v.get('bvid', ''),
        'aid': v.get('aid', ''),
        'tag': v.get('tag', ''),
        'description': description[:200],
        'pic': v.get('pic', ''),
        'source': 'bilibili',
        'crawl_time': datetime.now().isoformat(),
    }
def _save_json(data, filename):
    """Write *data* as indented UTF-8 JSON to *filename* under DATA_DIR; return the path."""
    path = os.path.join(DATA_DIR, filename)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    return path


def _play_count(item):
    """Return the item's play count as an int, or 0 when it is missing or non-numeric."""
    try:
        return int(item.get('play') or 0)
    except (TypeError, ValueError):
        return 0


def collect(keywords, max_pages=3):
    """Collect search results for several keywords, de-duplicate and save them.

    For each keyword, pages 1..max_pages are fetched with search_bilibili.
    Paging for a keyword stops at the first failed request or empty page.
    Videos are de-duplicated across all keywords by ``bvid`` (falling back to
    ``aid``). After each keyword the accumulated results are written to an
    interim JSON file. At the end the results are sorted by play count
    (highest first), written to a final JSON file, and the top five are
    printed.

    Args:
        keywords: Iterable of search terms.
        max_pages: Maximum number of result pages to request per keyword.

    Returns:
        The list of formatted video records, sorted by play count descending.
    """
    all_results = []
    seen = set()
    for kw in keywords:
        print(f'\n🔍 搜索: {kw}')
        for page in range(1, max_pages + 1):
            try:
                results = search_bilibili(kw, page)
            except Exception as exc:
                # Best-effort crawl: give up on this keyword but keep going.
                # Catching Exception (not a bare except) lets Ctrl-C and
                # SystemExit still stop the run.
                print(f'{page}页请求失败,跳过: {exc}')
                break
            if not results:
                print(f'{page}页无数据')
                break
            count = 0
            for v in results:
                vid = v.get('bvid') or v.get('aid')
                if vid and vid not in seen:
                    seen.add(vid)
                    all_results.append(format_video(v))
                    count += 1
            print(f'{page}页: +{count}条 (累计{len(all_results)})')
            # Short pause between page requests.
            time.sleep(0.5)
        # Save an interim snapshot after every keyword so a later failure
        # does not lose what has been collected so far.
        interim_ts = datetime.now().strftime('%Y%m%d_%H%M%S')
        _save_json(all_results, f'bili_drama_interim_{interim_ts}.json')
    # Sort by play count, highest first, so the saved file and the TOP5
    # listing below really reflect the most-played videos.
    all_results.sort(key=_play_count, reverse=True)
    # Write the final result file.
    ts = datetime.now().strftime('%Y%m%d_%H%M%S')
    path = _save_json(all_results, f'bili_drama_{ts}.json')
    print(f'\n✅ 采集完成: {len(all_results)}')
    print(f'📁 已保存: {path}')
    # Print the five most-played entries.
    print('\n🏆 播放量TOP5:')
    for i, item in enumerate(all_results[:5], 1):
        print(f' {i}. [{item["play"]}播放] {item["title"][:50]}')
        print(f' UP: {item["author"]} | {item["duration"]}')
    return all_results
if __name__ == '__main__':
    # Script entry point: crawl two result pages for each default
    # short-drama search term.
    keywords = [
        '短剧',
        '短剧推荐',
        'ai短剧',
        '真人短剧',
        '爆款短剧',
    ]
    collect(keywords=keywords, max_pages=2)