mirror of
https://github.com/JefferyHcool/BiliNote.git
synced 2026-05-30 19:19:33 +08:00
feat(bilibili): 优先走官方 player API 直拉字幕
之前 BilibiliDownloader.download_subtitles 走的是 yt-dlp 的 writesubtitles 路径,对 B 站签名/Cookie 的兼容性差,常常空手而归,落到音频下载 + Whisper 转写的慢路径。 新增 bilibili_subtitle.BilibiliSubtitleFetcher: - /x/web-interface/view?bvid=... → 拿 cid - /x/player/wbi/v2?bvid=...&cid=... → 拿 subtitle 列表(subtitle_url 已带 auth_key) - 优先级:人工中文 > AI 中文 > 任意中文 > 任意非空 - fetch JSON body 解析为 TranscriptResult - 通过 CookieConfigManager 自动注入 SESSDATA cookie(AI 字幕必需) bilibili_downloader.download_subtitles 顺序改为:先试新 fetcher,失败再回退到原 yt-dlp 路径。NoteGenerator 的字幕优先逻辑无需改动——它本来就调 download_subtitles。 效果: - B 站视频如果有字幕(人工或 AI),直接秒拿,跳过音频下载 + 转写 - 完全绕开 MLX Whisper 不可用 / 模型未下载 等转写器问题 - 拿不到字幕时仍可走原音频转写路径 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -8,6 +8,7 @@ from typing import Union, Optional, List
|
||||
import yt_dlp
|
||||
|
||||
from app.downloaders.base import Downloader, DownloadQuality, QUALITY_MAP
|
||||
from app.downloaders.bilibili_subtitle import BilibiliSubtitleFetcher
|
||||
from app.models.notes_model import AudioDownloadResult
|
||||
from app.models.transcriber_model import TranscriptResult, TranscriptSegment
|
||||
from app.utils.path_helper import get_data_dir
|
||||
@@ -155,6 +156,15 @@ class BilibiliDownloader(Downloader, ABC):
|
||||
:param langs: 优先语言列表
|
||||
:return: TranscriptResult 或 None
|
||||
"""
|
||||
# 1) 优先走 B 站官方 player API(直拉,无需下视频;AI 字幕需 SESSDATA cookie)
|
||||
try:
|
||||
result = BilibiliSubtitleFetcher().fetch_subtitles(video_url)
|
||||
if result and result.segments:
|
||||
return result
|
||||
except Exception as e:
|
||||
logger.warning(f"player API 直拉字幕异常,回退到 yt-dlp: {e}")
|
||||
|
||||
# 2) Fallback:原 yt-dlp 路径(更脆弱,遇到签名/Cookie 问题失败概率较高)
|
||||
if output_dir is None:
|
||||
output_dir = get_data_dir()
|
||||
if not output_dir:
|
||||
|
||||
164
backend/app/downloaders/bilibili_subtitle.py
Normal file
164
backend/app/downloaders/bilibili_subtitle.py
Normal file
@@ -0,0 +1,164 @@
|
||||
"""
|
||||
直接调用 B 站 player API 拿字幕,绕过 yt-dlp。
|
||||
|
||||
流程:
|
||||
1. 从 URL 提取 BV id(复用已有的 app.utils.url_parser.extract_video_id)
|
||||
2. GET /x/web-interface/view?bvid=BVxxx → 拿 cid
|
||||
3. GET /x/player/wbi/v2?bvid=...&cid=... → 返回 data.subtitle.subtitles[]
|
||||
每条带 subtitle_url(B 站后端已经签好 auth_key 的完整地址)
|
||||
4. 按优先级(人工 zh-CN > AI zh-CN > 任意 zh > 任意非空)选一条
|
||||
5. fetch subtitle_url → JSON {body:[{from,to,content,...}]}
|
||||
6. 解析为 TranscriptResult
|
||||
|
||||
AI 字幕需要登录态 cookie(SESSDATA);通过 CookieConfigManager 注入。
|
||||
"""
|
||||
|
||||
from typing import List, Optional
|
||||
|
||||
import requests
|
||||
|
||||
from app.models.transcriber_model import TranscriptResult, TranscriptSegment
|
||||
from app.services.cookie_manager import CookieConfigManager
|
||||
from app.utils.logger import get_logger
|
||||
from app.utils.url_parser import extract_video_id
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
# Desktop Chrome User-Agent string; used as the "User-Agent" request header
# for every HTTP call made by this module.
UA = " ".join(
    [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    ]
)
|
||||
|
||||
|
||||
class BilibiliSubtitleFetcher:
    """Fetch subtitles directly from Bilibili's web APIs.

    Flow implemented by :meth:`fetch_subtitles`:

    1. Extract the BV id from the video URL.
    2. ``/x/web-interface/view`` -> ``cid``.
    3. ``/x/player/wbi/v2`` -> list of subtitle tracks.
    4. Pick one track (human-made Chinese > AI Chinese > first available).
    5. Download the track's JSON and convert its ``body`` cues into a
       ``TranscriptResult``.

    Every step is best-effort: failures are logged and reported as ``None``
    (or an empty list) so the caller can fall back to another strategy.
    """

    def __init__(self):
        # Raw Cookie header value configured for "bilibili"; an empty string
        # means requests are sent without a Cookie header.
        self._cookie = CookieConfigManager().get("bilibili") or ""

    def _headers(self) -> dict:
        """Return the HTTP headers used for every request of this fetcher."""
        h = {
            "User-Agent": UA,
            "Referer": "https://www.bilibili.com",
        }
        if self._cookie:
            h["Cookie"] = self._cookie
        return h

    @staticmethod
    def _as_dict(value) -> dict:
        """Return *value* if it is a dict, otherwise an empty dict.

        JSON ``null`` (or any unexpected type) would otherwise break chained
        ``.get()`` lookups with an ``AttributeError``.
        """
        return value if isinstance(value, dict) else {}

    @staticmethod
    def _to_seconds(value) -> float:
        """Convert a cue timestamp to ``float``; missing/invalid values become 0.0."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return 0.0

    def _get_cid(self, bvid: str) -> Optional[int]:
        """Look up the ``cid`` of *bvid* via the view API.

        :param bvid: BV id of the video.
        :return: the cid as ``int``, or ``None`` when the request fails, the
            API reports a non-zero ``code``, or no usable cid is present.
        """
        url = "https://api.bilibili.com/x/web-interface/view"
        try:
            resp = requests.get(url, params={"bvid": bvid}, headers=self._headers(), timeout=10)
            data = self._as_dict(resp.json())
        except Exception as e:
            logger.warning(f"获取 cid 失败: {e}")
            return None
        if data.get("code") != 0:
            logger.warning(f"view API 返回错误: code={data.get('code')}, msg={data.get('message')}")
            return None
        # "data" may be null/absent even when the key exists, hence _as_dict.
        cid = self._as_dict(data.get("data")).get("cid")
        try:
            return int(cid) if cid else None
        except (TypeError, ValueError):
            return None

    def _list_subtitles(self, bvid: str, cid: int) -> List[dict]:
        """Return the subtitle track descriptors for (*bvid*, *cid*).

        :return: list of track dicts taken from ``data.subtitle.subtitles``;
            empty on any request/API error or when the payload has an
            unexpected shape.
        """
        url = "https://api.bilibili.com/x/player/wbi/v2"
        try:
            resp = requests.get(url, params={"bvid": bvid, "cid": cid}, headers=self._headers(), timeout=10)
            data = self._as_dict(resp.json())
        except Exception as e:
            logger.warning(f"获取字幕列表失败: {e}")
            return []
        if data.get("code") != 0:
            logger.warning(f"player API 返回错误: code={data.get('code')}, msg={data.get('message')}")
            return []
        subtitle_info = self._as_dict(self._as_dict(data.get("data")).get("subtitle"))
        tracks = subtitle_info.get("subtitles")
        if not isinstance(tracks, list):
            return []
        # Drop malformed entries so later code can safely call .get() on each track.
        return [t for t in tracks if isinstance(t, dict)]

    def _pick(self, subtitles: List[dict]) -> Optional[dict]:
        """Choose the best track: human-made Chinese > AI Chinese > first available.

        Tracks that carry a ``subtitle_url`` are preferred, because a track
        without one cannot be downloaded. If no track has a URL, the choice is
        made among all tracks so the caller can still report the missing URL.

        :return: the selected track dict, or ``None`` for an empty list.
        """
        if not subtitles:
            return None

        downloadable = [s for s in subtitles if s.get("subtitle_url")]
        candidates = downloadable or subtitles

        def lang_of(s: dict) -> str:
            return (s.get("lan") or "").lower()

        def is_zh(s: dict) -> bool:
            return lang_of(s).startswith(("zh", "ai-zh"))

        def is_ai(s: dict) -> bool:
            # An "ai-" language prefix marks an auto-generated track by name.
            # A truthy ai_type is presumably also an AI marker — the exact
            # semantics of that field are not established here (TODO confirm).
            return lang_of(s).startswith("ai-") or bool(s.get("ai_type"))

        # 1) Human-made Chinese.
        for s in candidates:
            if is_zh(s) and not is_ai(s):
                return s
        # 2) Any Chinese (i.e. AI-generated at this point).
        for s in candidates:
            if is_zh(s):
                return s
        # 3) Whatever comes first.
        return candidates[0]

    @staticmethod
    def _normalize_url(url: str) -> str:
        """Turn a protocol-relative URL (``//host/path``) into an https URL."""
        if url.startswith("//"):
            return "https:" + url
        return url

    def _fetch_body(self, subtitle_url: str) -> Optional[List[dict]]:
        """Download the subtitle JSON and return its ``body`` cue list.

        :return: the cue list (possibly empty), or ``None`` when the download
            or JSON decoding fails.
        """
        # NOTE(review): _headers() includes the configured Cookie, so it is
        # also sent to whatever host subtitle_url points at — confirm that
        # this is intended.
        try:
            resp = requests.get(self._normalize_url(subtitle_url), headers=self._headers(), timeout=15)
            body = self._as_dict(resp.json()).get("body")
        except Exception as e:
            logger.warning(f"下载字幕 JSON 失败: {e}")
            return None
        return body if isinstance(body, list) else []

    def fetch_subtitles(self, video_url: str) -> "Optional[TranscriptResult]":
        """Fetch and parse the best available subtitle track for *video_url*.

        :param video_url: URL of a Bilibili video.
        :return: a ``TranscriptResult`` with at least one segment, or ``None``
            when any step fails or no non-empty cue is found.
        """
        bvid = extract_video_id(video_url, "bilibili")
        if not bvid:
            logger.info("无法从 URL 提取 BV id")
            return None

        cid = self._get_cid(bvid)
        if not cid:
            logger.info(f"{bvid} 没有取到 cid")
            return None

        subtitles = self._list_subtitles(bvid, cid)
        if not subtitles:
            logger.info(f"{bvid} (cid={cid}) 没有可用字幕轨")
            return None

        track = self._pick(subtitles)
        if not track or not track.get("subtitle_url"):
            logger.info(f"{bvid} 字幕轨存在但没有 subtitle_url(可能未登录、需要 SESSDATA cookie)")
            return None

        lan = track.get("lan") or "zh"
        body = self._fetch_body(track["subtitle_url"])
        if not body:
            return None

        segments: List[TranscriptSegment] = []
        for item in body:
            # Skip malformed cues instead of aborting the whole transcript.
            if not isinstance(item, dict):
                continue
            text = str(item.get("content") or "").strip()
            if not text:
                continue
            segments.append(TranscriptSegment(
                start=self._to_seconds(item.get("from")),
                end=self._to_seconds(item.get("to")),
                text=text,
            ))

        if not segments:
            return None

        full_text = " ".join(s.text for s in segments)
        logger.info(f"B站直拉字幕成功: {bvid} lan={lan} 共 {len(segments)} 段")
        return TranscriptResult(
            language=lan,
            full_text=full_text,
            segments=segments,
            raw={
                "source": "bilibili_player_api",
                "bvid": bvid,
                "cid": cid,
                "lan": lan,
                "ai_type": track.get("ai_type"),
            },
        )
|
||||
Reference in New Issue
Block a user