fix(bilibili): 修正分P视频字幕优先链路未传p参数导致取错集

问题：B站分P视频（如62集课程），提交 ?p=36 链接时，字幕优先链路通过 x/web-interface/view API 拿 cid 时未传 p 参数，默认取第 1 集的 cid，导致生成的是第 1 集的笔记。同时 yt-dlp 虽然正确下载了 p36 的音频，但该音频被跳过。

修复：
- url_parser 新增 extract_bilibili_p_number()，提取 URL 中的 p 参数
- bilibili_subtitle 的 _get_cid() 接收 p 参数，从 data.pages[p-1] 取对应分 P 的 cid
- fetch_subtitles() 调用 extract_bilibili_p_number() 透传 p
2026-06-17 13:41:42 +08:00 · 2026-06-11 23:00:12 +08:00
parent f5bfb43619
commit 2ba409880e
2 changed files with 64 additions and 12 deletions
--- a/backend/app/downloaders/bilibili_subtitle.py
+++ b/backend/app/downloaders/bilibili_subtitle.py
@@ -3,12 +3,13 @@

 流程：
 1. 从 URL 提 BV id（已有 utils.url_parser.extract_video_id）
-2. GET /x/web-interface/view?bvid=BVxxx → 拿 cid
-3. GET /x/player/wbi/v2?bvid=...&cid=... → 返回 data.subtitle.subtitles[]
+2. 从 URL 提 p 参数（分 P 序号，已有 utils.url_parser.extract_bilibili_p_number）
+3. GET /x/web-interface/view?bvid=BVxxx&p=N → 从 data.pages[N-1] 拿第 N 集的 cid
+4. GET /x/player/wbi/v2?bvid=...&cid=... → 返回 data.subtitle.subtitles[]
   每条带 subtitle_url（B 站后端已经签好 auth_key 的完整地址）
-4. 按优先级（人工 zh-CN > AI zh-CN > 任意 zh > 任意非空）选一条
-5. fetch subtitle_url → JSON {body:[{from,to,content,...}]}
-6. 解析为 TranscriptResult
+5. 按优先级（人工 zh-CN > AI zh-CN > 任意 zh > 任意非空）选一条
+6. fetch subtitle_url → JSON {body:[{from,to,content,...}]}
+7. 解析为 TranscriptResult

 AI 字幕需要登录态 cookie（SESSDATA）；通过 CookieConfigManager 注入。
 """
@@ -20,7 +21,7 @@ import requests
 from app.models.transcriber_model import TranscriptResult, TranscriptSegment
 from app.services.cookie_manager import CookieConfigManager
 from app.utils.logger import get_logger
-from app.utils.url_parser import extract_video_id
+from app.utils.url_parser import extract_video_id, extract_bilibili_p_number

 logger = get_logger(__name__)

@@ -45,10 +46,13 @@ class BilibiliSubtitleFetcher:
            h["Cookie"] = self._cookie
        return h

-    def _get_cid(self, bvid: str) -> Optional[int]:
+    def _get_cid(self, bvid: str, p: Optional[int] = None) -> Optional[int]:
        url = "https://api.bilibili.com/x/web-interface/view"
+        params = {"bvid": bvid}
+        if p is not None and p >= 1:
+            params["p"] = p
        try:
-            resp = requests.get(url, params={"bvid": bvid}, headers=self._headers(), timeout=10)
+            resp = requests.get(url, params=params, headers=self._headers(), timeout=10)
            data = resp.json()
        except Exception as e:
            logger.warning(f"获取 cid 失败: {e}")
@@ -56,6 +60,19 @@ class BilibiliSubtitleFetcher:
        if data.get("code") != 0:
            logger.warning(f"view API 返回错误: code={data.get('code')}, msg={data.get('message')}")
            return None
+        # 分 P 视频：data.pages[N-1] 对应第 N 集
+        pages = data.get("data", {}).get("pages", [])
+        if pages:
+            if p is not None and 1 <= p <= len(pages):
+                cid = pages[p - 1].get("cid")
+                logger.info(f"分 P 视频: bvid={bvid} p={p} 共 {len(pages)} 集, 取第 {p} 集 cid={cid}")
+                return int(cid) if cid else None
+            else:
+                # 没有 p 参数或 p 超出范围，取第 1 集
+                cid = pages[0].get("cid")
+                logger.info(f"非分 P 或 p 无效: bvid={bvid} 取第 1 集 cid={cid}")
+                return int(cid) if cid else None
+        # 单集视频
        cid = data.get("data", {}).get("cid")
        return int(cid) if cid else None

@@ -114,9 +131,12 @@ class BilibiliSubtitleFetcher:
            logger.info("无法从 URL 提取 BV id")
            return None

-        cid = self._get_cid(bvid)
+        # 提取分 P 序号
+        p = extract_bilibili_p_number(video_url)
+
+        cid = self._get_cid(bvid, p)
        if not cid:
-            logger.info(f"{bvid} 没有取到 cid")
+            logger.info(f"{bvid} (p={p}) 没有取到 cid")
            return None

        subtitles = self._list_subtitles(bvid, cid)
@@ -149,7 +169,7 @@ class BilibiliSubtitleFetcher:
            return None

        full_text = " ".join(s.text for s in segments)
-        logger.info(f"B站直拉字幕成功: {bvid} lan={lan} 共 {len(segments)} 段")
+        logger.info(f"B站直拉字幕成功: {bvid} p={p} lan={lan} 共 {len(segments)} 段")
        return TranscriptResult(
            language=lan,
            full_text=full_text,
@@ -158,6 +178,7 @@ class BilibiliSubtitleFetcher:
                "source": "bilibili_player_api",
                "bvid": bvid,
                "cid": cid,
+                "p": p,
                "lan": lan,
                "ai_type": track.get("ai_type"),
            },
--- a/backend/app/utils/url_parser.py
+++ b/backend/app/utils/url_parser.py
@@ -1,5 +1,5 @@
 import re
-from typing import Optional
+from typing import Optional, Tuple
 import requests


@@ -48,3 +48,34 @@ def resolve_bilibili_short_url(short_url: str) -> Optional[str]:
    except requests.RequestException as e:
        print(f"Error resolving short URL: {e}")
        return None
+
+
+def extract_bilibili_p_number(url: str) -> Optional[int]:
+    """
+    Extract the 1-based part ("分 P") index from a Bilibili video URL.
+
+    Recognised forms (the query form is tried before the path suffix):
+      - https://www.bilibili.com/video/BVxxx/?p=36
+      - https://www.bilibili.com/video/BVxxx?p=5
+      - https://b23.tv/xxxxx?p=10  (short link, resolved first)
+      - https://www.bilibili.com/video/BVxxx/pN  (path suffix, less common)
+
+    :param url: Bilibili video link
+    :return: part index (always >= 1), or None when the URL carries no
+             valid index; 0 is never returned
+    """
+    if "b23.tv" in url:
+        # Keep the original link when the short link cannot be resolved.
+        url = resolve_bilibili_short_url(url) or url
+
+    patterns = (
+        r'[?&]p=(\d+)',            # ?p=NNN or &p=NNN
+        r'/p(\d+)(?:/?$|\?|&)',    # trailing /pN
+    )
+    for pattern in patterns:
+        match = re.search(pattern, url)
+        # Apply the same >= 1 check to both forms so "/p0" is rejected too.
+        if match and int(match.group(1)) >= 1:
+            return int(match.group(1))
+
+    return None