Files
BiliNote/backend/app/downloaders/bilibili_subtitle.py
huangjianwu 702b57c165 feat(bilibili): 优先走官方 player API 直拉字幕
之前 BilibiliDownloader.download_subtitles 走的是 yt-dlp 的 writesubtitles 路径,对 B 站签名/Cookie 的兼容性差,常常空手而归,落到音频下载 + Whisper 转写的慢路径。

新增 bilibili_subtitle.BilibiliSubtitleFetcher:
- /x/web-interface/view?bvid=... → 拿 cid
- /x/player/wbi/v2?bvid=...&cid=... → 拿 subtitle 列表(subtitle_url 已带 auth_key)
- 优先级:人工中文 > AI 中文 > 任意中文 > 任意非空
- fetch JSON body 解析为 TranscriptResult
- 通过 CookieConfigManager 自动注入 SESSDATA cookie(AI 字幕必需)

bilibili_downloader.download_subtitles 顺序改为:先试新 fetcher,失败再回退到原 yt-dlp 路径。NoteGenerator 的字幕优先逻辑无需改动——它本来就调 download_subtitles。

效果:
- B 站视频如果有字幕(人工或 AI),直接秒拿,跳过音频下载 + 转写
- 完全绕开 MLX Whisper 不可用 / 模型未下载 等转写器问题
- 拿不到字幕时仍可走原音频转写路径

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-07 11:55:50 +08:00

165 lines
5.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
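Fetch subtitles directly from Bilibili's player API, bypassing yt-dlp.

Flow:
1. Extract the BV id from the URL (using the existing utils.url_parser.extract_video_id).
2. GET /x/web-interface/view?bvid=BVxxx -> obtain the cid.
3. GET /x/player/wbi/v2?bvid=...&cid=... -> returns data.subtitle.subtitles[];
   each entry carries a subtitle_url (a complete address that Bilibili's backend
   has already signed with an auth_key).
4. Pick one track by priority (human zh-CN > AI zh-CN > any zh > any non-empty).
5. Fetch subtitle_url -> JSON {body:[{from,to,content,...}]}.
6. Parse the body into a TranscriptResult.

AI subtitles require a logged-in cookie (SESSDATA), which is injected via CookieConfigManager.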
"""
from typing import List, Optional
import requests
from app.models.transcriber_model import TranscriptResult, TranscriptSegment
from app.services.cookie_manager import CookieConfigManager
from app.utils.logger import get_logger
from app.utils.url_parser import extract_video_id
# Module-level logger, obtained from the project's get_logger helper using this module's name.
logger = get_logger(__name__)
# Browser-like User-Agent string; BilibiliSubtitleFetcher._headers sends it
# as the "User-Agent" header of every request it builds.
UA = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
)
class BilibiliSubtitleFetcher:
"""通过 B 站官方 API 直拉字幕。"""
def __init__(self):
self._cookie = CookieConfigManager().get("bilibili") or ""
def _headers(self) -> dict:
h = {
"User-Agent": UA,
"Referer": "https://www.bilibili.com",
}
if self._cookie:
h["Cookie"] = self._cookie
return h
def _get_cid(self, bvid: str) -> Optional[int]:
url = "https://api.bilibili.com/x/web-interface/view"
try:
resp = requests.get(url, params={"bvid": bvid}, headers=self._headers(), timeout=10)
data = resp.json()
except Exception as e:
logger.warning(f"获取 cid 失败: {e}")
return None
if data.get("code") != 0:
logger.warning(f"view API 返回错误: code={data.get('code')}, msg={data.get('message')}")
return None
cid = data.get("data", {}).get("cid")
return int(cid) if cid else None
def _list_subtitles(self, bvid: str, cid: int) -> List[dict]:
url = "https://api.bilibili.com/x/player/wbi/v2"
try:
resp = requests.get(url, params={"bvid": bvid, "cid": cid}, headers=self._headers(), timeout=10)
data = resp.json()
except Exception as e:
logger.warning(f"获取字幕列表失败: {e}")
return []
if data.get("code") != 0:
logger.warning(f"player API 返回错误: code={data.get('code')}, msg={data.get('message')}")
return []
subtitles = data.get("data", {}).get("subtitle", {}).get("subtitles", [])
return subtitles or []
def _pick(self, subtitles: List[dict]) -> Optional[dict]:
"""优先级:人工中文 > AI 中文 > 任意中文 > 任意非空。"""
if not subtitles:
return None
def is_zh(s: dict) -> bool:
lan = (s.get("lan") or "").lower()
return lan.startswith("zh") or lan == "ai-zh"
# 人工中文type 0=AI, 1=人工 ai_type=0 视为人工)
for s in subtitles:
if is_zh(s) and not s.get("ai_type"):
return s
# AI 中文
for s in subtitles:
if is_zh(s):
return s
# 任意非空
return subtitles[0]
@staticmethod
def _normalize_url(url: str) -> str:
if url.startswith("//"):
return "https:" + url
return url
def _fetch_body(self, subtitle_url: str) -> Optional[List[dict]]:
try:
resp = requests.get(self._normalize_url(subtitle_url), headers=self._headers(), timeout=15)
data = resp.json()
return data.get("body") or []
except Exception as e:
logger.warning(f"下载字幕 JSON 失败: {e}")
return None
def fetch_subtitles(self, video_url: str) -> Optional[TranscriptResult]:
bvid = extract_video_id(video_url, "bilibili")
if not bvid:
logger.info("无法从 URL 提取 BV id")
return None
cid = self._get_cid(bvid)
if not cid:
logger.info(f"{bvid} 没有取到 cid")
return None
subtitles = self._list_subtitles(bvid, cid)
if not subtitles:
logger.info(f"{bvid} (cid={cid}) 没有可用字幕轨")
return None
track = self._pick(subtitles)
if not track or not track.get("subtitle_url"):
logger.info(f"{bvid} 字幕轨存在但没有 subtitle_url可能未登录、需要 SESSDATA cookie")
return None
lan = track.get("lan") or "zh"
body = self._fetch_body(track["subtitle_url"])
if not body:
return None
segments: List[TranscriptSegment] = []
for item in body:
text = (item.get("content") or "").strip()
if not text:
continue
segments.append(TranscriptSegment(
start=float(item.get("from", 0)),
end=float(item.get("to", 0)),
text=text,
))
if not segments:
return None
full_text = " ".join(s.text for s in segments)
logger.info(f"B站直拉字幕成功: {bvid} lan={lan}{len(segments)}")
return TranscriptResult(
language=lan,
full_text=full_text,
segments=segments,
raw={
"source": "bilibili_player_api",
"bvid": bvid,
"cid": cid,
"lan": lan,
"ai_type": track.get("ai_type"),
},
)