Files
BiliNote/backend/app/downloaders/youtube_subtitle.py
huangjianwu 41f17592c2 fix(backend): 部署韧性——模型自愈/就绪门禁/全局代理/启动诊断
- whisper: model.bin 截断/损坏时删目录重下重试一次,修「Unable to
  open file model.bin」死循环;mlx 同样按 config.json 判完整性
- /generate_note 加就绪门禁:本地转写引擎模型没下好直接拦截,返回
  reason=transcriber_model_not_ready,不让任务静默卡在首次下载
- 全局代理:新增 ProxyConfigManager(JSON 配置 + HTTP_PROXY env 兜底)
  + build_openai_client,统一注入代理到 LLM/Groq 客户端;yt-dlp 与
  youtube-transcript-api 也走代理
- build_openai_client 校验 api_key 非空,空 key 给「xxx 的 API Key
  未配置」而不是天书般的 Illegal header value b'Bearer '
- universal_gpt: 模型拒绝自定义 temperature(o1/o3/gpt-5 系列)时
  就地去掉参数重试,不消耗重试预算
- connect_test 改用真实 chat completion 而非 /v1/models 探测
- main.py: lifespan 拆 [startup 1/5..5/5] 分段日志 + 异常清晰定位
- /sys_health 重构为结构化返回 {backend,ffmpeg,db,whisper_model}

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-14 19:01:14 +08:00

114 lines
4.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
通过 youtube-transcript-api 获取 YouTube 字幕。
优先人工字幕,其次自动生成字幕。不依赖 yt_dlp无需下载任何文件。
"""
from typing import Optional, List
from youtube_transcript_api import YouTubeTranscriptApi
from app.models.transcriber_model import TranscriptResult, TranscriptSegment
from app.services.proxy_config_manager import ProxyConfigManager
from app.utils.logger import get_logger
logger = get_logger(__name__)
class YouTubeSubtitleFetcher:
    """Fetch YouTube subtitles through youtube-transcript-api.

    Track selection prefers a manually created subtitle in one of the
    requested languages, then an auto-generated one, and finally falls back
    to the first track the video offers.
    """

    # Language codes tried, in priority order, when the caller passes none.
    _DEFAULT_LANGS = ("zh-Hans", "zh", "zh-CN", "zh-TW", "en", "en-US", "ja")

    def __init__(self):
        """Build the API client, routing it through the global proxy if set."""
        # When a global proxy is configured, give youtube-transcript-api a
        # requests.Session that carries it (the original note says subtitle
        # fetches otherwise time out from mainland China). Without a proxy,
        # or if wiring it up fails, use the default proxy-less client.
        proxy = ProxyConfigManager().get_proxy_url()
        if proxy:
            try:
                import requests

                session = requests.Session()
                session.proxies = {"http": proxy, "https": proxy}
                self._api = YouTubeTranscriptApi(http_client=session)
                logger.info(f"YouTube 字幕走代理: {proxy}")
            except Exception as e:
                logger.warning(f"为 youtube-transcript-api 注入代理失败,回退无代理: {e}")
                self._api = YouTubeTranscriptApi()
        else:
            self._api = YouTubeTranscriptApi()

    @staticmethod
    def _snippet_fields(snippet):
        """Return ``(text, start, duration)`` for one fetched subtitle snippet.

        Accepts both shapes a snippet can take: a plain dict with
        ``text``/``start``/``duration`` keys, or an object exposing those as
        attributes. Anything else is stringified and given zero timing,
        which is what the previous implementation did for every non-dict.
        """
        if isinstance(snippet, dict):
            text = snippet.get("text", "")
            start = snippet.get("start", 0)
            duration = snippet.get("duration", 0)
        else:
            text = getattr(snippet, "text", None)
            if text is None:
                # Unknown shape: keep the historical str() fallback.
                text = str(snippet)
            start = getattr(snippet, "start", 0)
            duration = getattr(snippet, "duration", 0)
        # `or 0` guards against explicit None values for the timings.
        return str(text).strip(), float(start or 0), float(duration or 0)

    @staticmethod
    def _select_transcript(transcript_list, langs):
        """Pick a track: manual in ``langs``, else generated, else the first.

        Returns ``None`` when ``transcript_list`` yields no tracks at all.
        """
        try:
            transcript = transcript_list.find_manually_created_transcript(langs)
            logger.info(f"选中人工字幕: {transcript.language_code} ({transcript.language})")
            return transcript
        except Exception:
            pass  # no manual track matched; try auto-generated next

        try:
            transcript = transcript_list.find_generated_transcript(langs)
            logger.info(f"选中自动字幕: {transcript.language_code} ({transcript.language})")
            return transcript
        except Exception:
            pass  # nothing matched the requested languages

        # No language match at all: settle for the first available track.
        for t in transcript_list:
            source = "auto" if t.is_generated else "manual"
            logger.info(f"使用首个可用字幕: {t.language_code} ({source})")
            return t
        return None

    def fetch_subtitles(
        self,
        video_id: str,
        langs: Optional[List[str]] = None,
    ) -> Optional[TranscriptResult]:
        """Fetch and normalise the subtitles of a YouTube video.

        Args:
            video_id: YouTube video id handed to ``YouTubeTranscriptApi.list``.
            langs: Language codes in priority order. ``None`` selects the
                built-in default list (Chinese variants, English, Japanese).

        Returns:
            A ``TranscriptResult`` holding the non-empty segments and their
            space-joined text, or ``None`` when the video has no usable
            subtitles or any step fails (the failure is logged, not raised).
        """
        if langs is None:
            langs = list(self._DEFAULT_LANGS)

        try:
            # 1. List every available track (logged for diagnostics).
            transcript_list = self._api.list(video_id)
            available = [
                f"{t.language_code}({'auto' if t.is_generated else 'manual'})"
                for t in transcript_list
            ]
            logger.info(f"可用字幕轨道: {', '.join(available)}")

            # 2. Choose by priority: manual first, then auto-generated.
            transcript = self._select_transcript(transcript_list, langs)
            if transcript is None:
                logger.info(f"YouTube 视频 {video_id} 没有任何可用字幕")
                return None

            # 3. Download the content and drop blank snippets.
            segments = []
            for snippet in transcript.fetch():
                text, start, duration = self._snippet_fields(snippet)
                if not text:
                    continue
                segments.append(TranscriptSegment(
                    start=start,
                    end=start + duration,
                    text=text,
                ))

            if not segments:
                logger.warning(f"YouTube 字幕内容为空: {video_id}")
                return None

            full_text = " ".join(seg.text for seg in segments)
            logger.info(f"成功获取 YouTube 字幕,共 {len(segments)}")
            return TranscriptResult(
                language=transcript.language_code,
                full_text=full_text,
                segments=segments,
                raw={
                    "source": "youtube_transcript_api",
                    "language": transcript.language,
                    "language_code": transcript.language_code,
                    "is_generated": transcript.is_generated,
                },
            )
        except Exception as e:
            # Best-effort: the caller gets None on any failure.
            logger.warning(f"YouTube 字幕获取失败: {e}")
            return None