Files
BiliNote/backend/app/downloaders/youtube_subtitle.py
huangjianwu f4801d5be7 feat(youtube): 使用 youtube-transcript-api 优先获取字幕,有字幕时跳过音频下载
- 新增 YouTubeSubtitleFetcher 模块,通过 youtube-transcript-api 获取字幕
- 重构笔记生成流程:缓存 → 平台字幕 → 按需下载 → 转写 fallback
- 有字幕时仅提取视频元信息,不下载音视频文件
- 添加 youtube-transcript-api 依赖

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-23 17:31:30 +08:00

99 lines
3.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
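Fetch YouTube subtitles through youtube-transcript-api.
Manually created subtitles are preferred, then auto-generated ones. Does not depend on yt_dlp, and no files need to be downloaded.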
"""
from typing import Optional, List
from youtube_transcript_api import YouTubeTranscriptApi
from app.models.transcriber_model import TranscriptResult, TranscriptSegment
from app.utils.logger import get_logger
# Module-level logger, obtained from the project's get_logger helper using this module's __name__.
logger = get_logger(__name__)
class YouTubeSubtitleFetcher:
    """Fetch YouTube subtitles through youtube-transcript-api.

    Track selection order: a manually created track in one of the requested
    languages, then an auto-generated track in one of those languages, then
    the first track the video offers at all.
    """

    # Language codes tried, in priority order, when the caller passes none.
    DEFAULT_LANGS = ("zh-Hans", "zh", "zh-CN", "zh-TW", "en", "en-US", "ja")

    def __init__(self):
        self._api = YouTubeTranscriptApi()

    @staticmethod
    def _parse_snippet(snippet):
        """Normalise one transcript snippet to ``(text, start, duration)``.

        Accepts both shapes a snippet can take: an object exposing ``text`` /
        ``start`` / ``duration`` attributes, or a dict with the same keys.
        Anything else is stringified and given zero timing. Missing or
        ``None`` fields become ``""`` / ``0.0``.

        Returns:
            A tuple ``(text, start, duration)`` where ``text`` is stripped of
            surrounding whitespace and the two numbers are floats (seconds).
        """
        if isinstance(snippet, dict):
            text = snippet.get("text")
            start = snippet.get("start")
            duration = snippet.get("duration")
        elif hasattr(snippet, "text"):
            # Attribute-style snippet objects must be read field by field;
            # stringifying them would yield their repr, not the caption text.
            text = snippet.text
            start = getattr(snippet, "start", None)
            duration = getattr(snippet, "duration", None)
        else:
            text, start, duration = str(snippet), None, None
        return str(text or "").strip(), float(start or 0), float(duration or 0)

    def fetch_subtitles(
        self,
        video_id: str,
        langs: Optional[List[str]] = None,
    ) -> "Optional[TranscriptResult]":
        """Fetch and normalise the best available subtitle track of a video.

        Args:
            video_id: The YouTube video id.
            langs: Preferred language codes in priority order. Defaults to
                ``DEFAULT_LANGS`` when ``None``.

        Returns:
            A ``TranscriptResult`` built from the selected track, or ``None``
            when the video has no track, the track has no non-empty text, or
            any error occurs while talking to the API (the error is logged
            as a warning, never raised).
        """
        if langs is None:
            langs = list(self.DEFAULT_LANGS)
        try:
            # 1. List every available subtitle track (materialised once so the
            #    fallback below does not need to iterate the API object again).
            transcript_list = self._api.list(video_id)
            tracks = list(transcript_list)
            available = [
                f"{t.language_code}({'auto' if t.is_generated else 'manual'})"
                for t in tracks
            ]
            logger.info(f"可用字幕轨道: {', '.join(available)}")

            # 2. Pick a track by priority: manual first, then auto-generated.
            #    The lookups raise when nothing matches; the broad catch is a
            #    deliberate best-effort so any lookup failure falls through to
            #    the next option.
            transcript = None
            try:
                transcript = transcript_list.find_manually_created_transcript(langs)
                logger.info(f"选中人工字幕: {transcript.language_code} ({transcript.language})")
            except Exception:
                try:
                    transcript = transcript_list.find_generated_transcript(langs)
                    logger.info(f"选中自动字幕: {transcript.language_code} ({transcript.language})")
                except Exception:
                    # No language matched: take the first available track.
                    if tracks:
                        transcript = tracks[0]
                        source = "auto" if transcript.is_generated else "manual"
                        logger.info(f"使用首个可用字幕: {transcript.language_code} ({source})")

            if transcript is None:
                logger.info(f"YouTube 视频 {video_id} 没有任何可用字幕")
                return None

            # 3. Download the track and convert it to segments, dropping
            #    snippets whose text is empty after stripping.
            fetched = transcript.fetch()
            segments = []
            for snippet in fetched:
                text, start, duration = self._parse_snippet(snippet)
                if not text:
                    continue
                segments.append(TranscriptSegment(
                    start=start,
                    end=start + duration,
                    text=text,
                ))

            if not segments:
                logger.warning(f"YouTube 字幕内容为空: {video_id}")
                return None

            full_text = " ".join(seg.text for seg in segments)
            logger.info(f"成功获取 YouTube 字幕,共 {len(segments)}")
            return TranscriptResult(
                language=transcript.language_code,
                full_text=full_text,
                segments=segments,
                raw={
                    "source": "youtube_transcript_api",
                    "language": transcript.language,
                    "language_code": transcript.language_code,
                    "is_generated": transcript.is_generated,
                },
            )
        except Exception as e:
            # Top-level boundary: subtitle fetching is optional, so callers
            # get None and can fall back to another transcription path.
            logger.warning(f"YouTube 字幕获取失败: {e}")
            return None