103 lines
3.3 KiB
Python
103 lines
3.3 KiB
Python
"""
|
|
Bilibili short-drama collector (public API version).

Searches short-drama video data directly; no login is required.
|
|
"""
|
|
|
|
import requests, json, time, os
|
|
from datetime import datetime
|
|
|
|
# Directory that receives every JSON dump written by this script.
# It is created at import time so later writes never hit a missing folder.
DATA_DIR = '/opt/zhiqiu-tools/data/drama'
os.makedirs(DATA_DIR, exist_ok=True)

# HTTP headers attached to every search request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    'Referer': 'https://www.bilibili.com/',
}
|
|
|
|
def search_bilibili(keyword, page=1):
    """Search Bilibili videos through the web search endpoint.

    Args:
        keyword: Search term, sent as the ``keyword`` query parameter.
        page: Result page number, sent as the ``page`` query parameter
            (defaults to 1).

    Returns:
        The list found under ``data.result`` in the JSON response. An empty
        list is returned when the response ``code`` is not 0, or when
        ``data`` / ``result`` is missing or null.

    Raises:
        Any exception raised by ``requests.get`` (e.g. timeout after 15 s)
        or by ``Response.json`` is propagated to the caller.
    """
    url = 'https://api.bilibili.com/x/web-interface/search/type'
    params = {'search_type': 'video', 'keyword': keyword, 'page': page}
    r = requests.get(url, params=params, headers=HEADERS, timeout=15)
    data = r.json()
    if data.get('code') != 0:
        return []
    # dict.get defaults only apply to *missing* keys; a key that is present
    # with a JSON null would otherwise yield None and break the chained call.
    payload = data.get('data') or {}
    return payload.get('result') or []
|
|
|
|
def format_video(v):
    """Extract the key fields from one raw search-result entry.

    Args:
        v: Mapping for a single video as returned by the search call.

    Returns:
        A new dict with the keys ``title``, ``play``, ``author``,
        ``duration``, ``bvid``, ``aid``, ``tag``, ``description``, ``pic``,
        ``source`` and ``crawl_time``. The title has the
        ``<em class="keyword">`` / ``</em>`` highlight markup removed, the
        description is cut to at most 200 characters, ``source`` is always
        ``'bilibili'`` and ``crawl_time`` is the local time of the call in
        ISO 8601 format. Missing fields fall back to ``''`` (``0`` for
        ``play``).
    """
    # `or ''` also covers keys that are present with a null value, which a
    # plain .get(key, '') default would pass through as None and crash on.
    raw_title = v.get('title') or ''
    title = raw_title.replace('<em class="keyword">', '').replace('</em>', '')
    description = v.get('description') or ''
    return {
        'title': title,
        'play': v.get('play', 0),
        'author': v.get('author', ''),
        'duration': v.get('duration', ''),
        'bvid': v.get('bvid', ''),
        'aid': v.get('aid', ''),
        'tag': v.get('tag', ''),
        'description': description[:200],
        'pic': v.get('pic', ''),
        'source': 'bilibili',
        'crawl_time': datetime.now().isoformat(),
    }
|
|
|
|
def _play_count(item):
    """Return the item's play count as an int; missing or non-numeric values count as 0."""
    try:
        return int(item.get('play') or 0)
    except (TypeError, ValueError):
        return 0


def _save_json(data, filename):
    """Dump data as indented UTF-8 JSON to filename inside DATA_DIR and return the full path."""
    path = os.path.join(DATA_DIR, filename)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    return path


def collect(keywords, max_pages=3):
    """Collect search results for several keywords and save them as JSON.

    For every keyword, pages 1..max_pages are fetched with
    ``search_bilibili``. Paging for a keyword stops at the first page that
    fails or comes back empty. Entries are de-duplicated across all keywords
    by ``bvid`` (falling back to ``aid``) and normalised with
    ``format_video``. An interim snapshot is written after each keyword; the
    final list is sorted by play count (highest first), written to
    ``DATA_DIR`` and its top five entries are printed.

    Args:
        keywords: Iterable of search terms.
        max_pages: Maximum number of result pages to request per keyword.

    Returns:
        The list of formatted video dicts, sorted by play count descending.
    """
    all_results = []
    seen = set()

    for kw in keywords:
        print(f'\n🔍 搜索: {kw}')
        for page in range(1, max_pages + 1):
            try:
                results = search_bilibili(kw, page)
            except Exception:
                # Exception (not a bare except) so Ctrl-C / SystemExit still
                # abort the run instead of being reported as a failed page.
                print(f' 第{page}页请求失败,跳过')
                break
            if not results:
                print(f' 第{page}页无数据')
                break
            count = 0
            for v in results:
                vid = v.get('bvid') or v.get('aid')
                if vid and vid not in seen:
                    seen.add(vid)
                    all_results.append(format_video(v))
                    count += 1
            print(f' 第{page}页: +{count}条 (累计{len(all_results)})')
            # Short pause between pages to avoid hammering the API.
            time.sleep(0.5)

        # Save an interim snapshot after each keyword so a later crash does
        # not lose what has been collected so far.
        interim_ts = datetime.now().strftime('%Y%m%d_%H%M%S')
        _save_json(all_results, f'bili_drama_interim_{interim_ts}.json')

    # Sort by play count, highest first, so the saved file and the TOP5
    # printout below really reflect play counts rather than crawl order.
    all_results.sort(key=_play_count, reverse=True)

    # Write the final result file.
    ts = datetime.now().strftime('%Y%m%d_%H%M%S')
    path = _save_json(all_results, f'bili_drama_{ts}.json')

    print(f'\n✅ 采集完成: {len(all_results)}条')
    print(f'📁 已保存: {path}')

    # Print the five most-played entries.
    print('\n🏆 播放量TOP5:')
    for i, item in enumerate(all_results[:5], 1):
        print(f' {i}. [{item["play"]}播放] {item["title"][:50]}')
        print(f' UP: {item["author"]} | {item["duration"]}')

    return all_results
|
|
|
|
if __name__ == '__main__':
    # Default short-drama search terms; two result pages are fetched per term.
    keywords = [
        '短剧',
        '短剧推荐',
        'ai短剧',
        '真人短剧',
        '爆款短剧',
    ]
    collect(keywords, max_pages=2)
|