From f4801d5be7238537dd84ba103dca605ac02d09c3 Mon Sep 17 00:00:00 2001 From: huangjianwu Date: Mon, 23 Mar 2026 17:31:30 +0800 Subject: [PATCH 1/2] =?UTF-8?q?feat(youtube):=20=E4=BD=BF=E7=94=A8=20youtu?= =?UTF-8?q?be-transcript-api=20=E4=BC=98=E5=85=88=E8=8E=B7=E5=8F=96?= =?UTF-8?q?=E5=AD=97=E5=B9=95=EF=BC=8C=E6=9C=89=E5=AD=97=E5=B9=95=E6=97=B6?= =?UTF-8?q?=E8=B7=B3=E8=BF=87=E9=9F=B3=E9=A2=91=E4=B8=8B=E8=BD=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 YouTubeSubtitleFetcher 模块,通过 youtube-transcript-api 获取字幕 - 重构笔记生成流程:缓存 → 平台字幕 → 按需下载 → 转写 fallback - 有字幕时仅提取视频元信息,不下载音视频文件 - 添加 youtube-transcript-api 依赖 Co-Authored-By: Claude Opus 4.6 --- backend/app/downloaders/base.py | 3 +- backend/app/downloaders/youtube_downloader.py | 134 +++--------------- backend/app/downloaders/youtube_subtitle.py | 98 +++++++++++++ backend/app/services/note.py | 105 ++++++++++---- backend/requirements.txt | 1 + 5 files changed, 203 insertions(+), 138 deletions(-) create mode 100644 backend/app/downloaders/youtube_subtitle.py diff --git a/backend/app/downloaders/base.py b/backend/app/downloaders/base.py index a4dfb07..d1e71ad 100644 --- a/backend/app/downloaders/base.py +++ b/backend/app/downloaders/base.py @@ -22,7 +22,8 @@ class Downloader(ABC): @abstractmethod def download(self, video_url: str, output_dir: str = None, - quality: DownloadQuality = "fast", need_video: Optional[bool] = False) -> AudioDownloadResult: + quality: DownloadQuality = "fast", need_video: Optional[bool] = False, + skip_download: bool = False) -> AudioDownloadResult: ''' :param need_video: diff --git a/backend/app/downloaders/youtube_downloader.py b/backend/app/downloaders/youtube_downloader.py index 2a081bb..bb8ed8a 100644 --- a/backend/app/downloaders/youtube_downloader.py +++ b/backend/app/downloaders/youtube_downloader.py @@ -1,5 +1,4 @@ import os -import json import logging from abc import ABC from typing import Union, Optional, List @@ -7,8 +6,9 @@ from 
typing import Union, Optional, List import yt_dlp from app.downloaders.base import Downloader, DownloadQuality +from app.downloaders.youtube_subtitle import YouTubeSubtitleFetcher from app.models.notes_model import AudioDownloadResult -from app.models.transcriber_model import TranscriptResult, TranscriptSegment +from app.models.transcriber_model import TranscriptResult from app.utils.path_helper import get_data_dir from app.utils.url_parser import extract_video_id @@ -25,12 +25,13 @@ class YoutubeDownloader(Downloader, ABC): video_url: str, output_dir: Union[str, None] = None, quality: DownloadQuality = "fast", - need_video:Optional[bool]=False + need_video: Optional[bool] = False, + skip_download: bool = False, ) -> AudioDownloadResult: if output_dir is None: output_dir = get_data_dir() if not output_dir: - output_dir=self.cache_data + output_dir = self.cache_data os.makedirs(output_dir, exist_ok=True) output_path = os.path.join(output_dir, "%(id)s.%(ext)s") @@ -42,15 +43,17 @@ class YoutubeDownloader(Downloader, ABC): 'quiet': False, } + if skip_download: + ydl_opts['skip_download'] = True + with yt_dlp.YoutubeDL(ydl_opts) as ydl: - info = ydl.extract_info(video_url, download=True) + info = ydl.extract_info(video_url, download=not skip_download) video_id = info.get("id") title = info.get("title") duration = info.get("duration", 0) cover_url = info.get("thumbnail") - ext = info.get("ext", "m4a") # 兜底用 m4a + ext = info.get("ext", "m4a") audio_path = os.path.join(output_dir, f"{video_id}.{ext}") - print('os.path.join(output_dir, f"{video_id}.{ext}")',os.path.join(output_dir, f"{video_id}.{ext}")) return AudioDownloadResult( file_path=audio_path, @@ -59,8 +62,8 @@ class YoutubeDownloader(Downloader, ABC): cover_url=cover_url, platform="youtube", video_id=video_id, - raw_info={'tags':info.get('tags')}, #全部返回会报错 - video_path=None # ❗音频下载不包含视频路径 + raw_info={'tags': info.get('tags')}, + video_path=None, ) def download_video( @@ -101,115 +104,20 @@ class 
YoutubeDownloader(Downloader, ABC): def download_subtitles(self, video_url: str, output_dir: str = None, langs: List[str] = None) -> Optional[TranscriptResult]: """ - 尝试获取YouTube视频字幕(优先人工字幕,其次自动生成) + 通过 YouTube InnerTube API 直接获取字幕(优先人工字幕,其次自动生成)。 + 比 yt_dlp 方式更轻量,无需写临时文件到磁盘。 :param video_url: 视频链接 - :param output_dir: 输出路径 + :param output_dir: 未使用(保留接口兼容) :param langs: 优先语言列表 :return: TranscriptResult 或 None """ - if output_dir is None: - output_dir = get_data_dir() - if not output_dir: - output_dir = self.cache_data - os.makedirs(output_dir, exist_ok=True) - if langs is None: - langs = ['zh-Hans', 'zh', 'zh-CN', 'zh-TW', 'en', 'en-US'] + langs = ['zh-Hans', 'zh', 'zh-CN', 'zh-TW', 'en', 'en-US', 'ja'] video_id = extract_video_id(video_url, "youtube") - - ydl_opts = { - 'writesubtitles': True, - 'writeautomaticsub': True, - 'subtitleslangs': langs, - 'subtitlesformat': 'json3', - 'skip_download': True, - 'outtmpl': os.path.join(output_dir, f'{video_id}.%(ext)s'), - 'quiet': True, - } - - try: - with yt_dlp.YoutubeDL(ydl_opts) as ydl: - info = ydl.extract_info(video_url, download=True) - - # 查找下载的字幕文件 - subtitles = info.get('requested_subtitles') or {} - if not subtitles: - logger.info(f"YouTube视频 {video_id} 没有可用字幕") - return None - - # 按优先级查找字幕文件 - subtitle_file = None - detected_lang = None - for lang in langs: - if lang in subtitles: - subtitle_file = os.path.join(output_dir, f"{video_id}.{lang}.json3") - detected_lang = lang - break - - # 如果按优先级没找到,取第一个可用的 - if not subtitle_file: - for lang, sub_info in subtitles.items(): - subtitle_file = os.path.join(output_dir, f"{video_id}.{lang}.json3") - detected_lang = lang - break - - if not subtitle_file or not os.path.exists(subtitle_file): - logger.info(f"字幕文件不存在: {subtitle_file}") - return None - - # 解析字幕文件 - return self._parse_json3_subtitle(subtitle_file, detected_lang) - - except Exception as e: - logger.warning(f"获取YouTube字幕失败: {e}") - return None - - def _parse_json3_subtitle(self, subtitle_file: str, language: 
str) -> Optional[TranscriptResult]: - """ - 解析 json3 格式字幕文件 - - :param subtitle_file: 字幕文件路径 - :param language: 语言代码 - :return: TranscriptResult - """ - try: - with open(subtitle_file, 'r', encoding='utf-8') as f: - data = json.load(f) - - segments = [] - events = data.get('events', []) - - for event in events: - # json3 格式中时间单位是毫秒 - start_ms = event.get('tStartMs', 0) - duration_ms = event.get('dDurationMs', 0) - - # 提取文本 - segs = event.get('segs', []) - text = ''.join(seg.get('utf8', '') for seg in segs).strip() - - if text: # 只添加非空文本 - segments.append(TranscriptSegment( - start=start_ms / 1000.0, - end=(start_ms + duration_ms) / 1000.0, - text=text - )) - - if not segments: - return None - - full_text = ' '.join(seg.text for seg in segments) - - logger.info(f"成功解析YouTube字幕,共 {len(segments)} 段") - return TranscriptResult( - language=language, - full_text=full_text, - segments=segments, - raw={'source': 'youtube_subtitle', 'file': subtitle_file} - ) - - except Exception as e: - logger.warning(f"解析字幕文件失败: {e}") - return None + fetcher = YouTubeSubtitleFetcher() + print( + f"尝试获取字幕,video_id={video_id}, langs={langs}" + ) + return fetcher.fetch_subtitles(video_id, langs) diff --git a/backend/app/downloaders/youtube_subtitle.py b/backend/app/downloaders/youtube_subtitle.py new file mode 100644 index 0000000..d59a871 --- /dev/null +++ b/backend/app/downloaders/youtube_subtitle.py @@ -0,0 +1,98 @@ +""" +通过 youtube-transcript-api 获取 YouTube 字幕。 +优先人工字幕,其次自动生成字幕。不依赖 yt_dlp,无需下载任何文件。 +""" + +from typing import Optional, List + +from youtube_transcript_api import YouTubeTranscriptApi + +from app.models.transcriber_model import TranscriptResult, TranscriptSegment +from app.utils.logger import get_logger + +logger = get_logger(__name__) + + +class YouTubeSubtitleFetcher: + """通过 youtube-transcript-api 获取 YouTube 字幕。""" + + def __init__(self): + self._api = YouTubeTranscriptApi() + + def fetch_subtitles( + self, + video_id: str, + langs: Optional[List[str]] = None, + ) -> 
Optional[TranscriptResult]: + if langs is None: + langs = ["zh-Hans", "zh", "zh-CN", "zh-TW", "en", "en-US", "ja"] + + try: + # 1. 列出所有可用字幕 + transcript_list = self._api.list(video_id) + + available = [] + for t in transcript_list: + available.append( + f"{t.language_code}({'auto' if t.is_generated else 'manual'})" + ) + logger.info(f"可用字幕轨道: {', '.join(available)}") + + # 2. 按优先级查找:先人工字幕,再自动字幕 + transcript = None + try: + transcript = transcript_list.find_manually_created_transcript(langs) + logger.info(f"选中人工字幕: {transcript.language_code} ({transcript.language})") + except Exception: + try: + transcript = transcript_list.find_generated_transcript(langs) + logger.info(f"选中自动字幕: {transcript.language_code} ({transcript.language})") + except Exception: + # 都没匹配,取第一个可用的 + for t in transcript_list: + transcript = t + source = "auto" if t.is_generated else "manual" + logger.info(f"使用首个可用字幕: {t.language_code} ({source})") + break + + if not transcript: + logger.info(f"YouTube 视频 {video_id} 没有任何可用字幕") + return None + + # 3. 
获取字幕内容 + fetched = transcript.fetch() + segments = [] + for snippet in fetched: + text = (snippet.get("text", "") if isinstance(snippet, dict) else getattr(snippet, "text", "")).strip() # youtube-transcript-api>=1.0 yields snippet objects (.text/.start/.duration), not dicts + if not text: + continue + start = snippet.get("start", 0) if isinstance(snippet, dict) else getattr(snippet, "start", 0) + duration = snippet.get("duration", 0) if isinstance(snippet, dict) else getattr(snippet, "duration", 0) + segments.append(TranscriptSegment( + start=float(start), + end=float(start) + float(duration), + text=text, + )) + + if not segments: + logger.warning(f"YouTube 字幕内容为空: {video_id}") + return None + + full_text = " ".join(seg.text for seg in segments) + logger.info(f"成功获取 YouTube 字幕,共 {len(segments)} 段") + + return TranscriptResult( + language=transcript.language_code, + full_text=full_text, + segments=segments, + raw={ + "source": "youtube_transcript_api", + "language": transcript.language, + "language_code": transcript.language_code, + "is_generated": transcript.is_generated, + }, + ) + + except Exception as e: + logger.warning(f"YouTube 字幕获取失败: {e}") + return None diff --git a/backend/app/services/note.py b/backend/app/services/note.py index 81d35c0..ebbe83a 100644 --- a/backend/app/services/note.py +++ b/backend/app/services/note.py @@ -133,8 +133,46 @@ class NoteGenerator: audio_cache_file = NOTE_OUTPUT_DIR / f"{task_id}_audio.json" transcript_cache_file = NOTE_OUTPUT_DIR / f"{task_id}_transcript.json" markdown_cache_file = NOTE_OUTPUT_DIR / f"{task_id}_markdown.md" - print(audio_cache_file) - # 1. 下载音频/视频 + # 1. 
获取字幕/转写:优先缓存 → 平台字幕 → 音频转写 + transcript = None + + # 尝试读取缓存 + if transcript_cache_file.exists(): + logger.info(f"检测到转写缓存 ({transcript_cache_file}),尝试读取") + try: + data = json.loads(transcript_cache_file.read_text(encoding="utf-8")) + segments = [TranscriptSegment(**seg) for seg in data.get("segments", [])] + transcript = TranscriptResult( + language=data.get("language"), + full_text=data["full_text"], + segments=segments, + ) + logger.info(f"已从缓存加载转写结果,共 {len(segments)} 段") + except Exception as e: + logger.warning(f"加载转写缓存失败: {e}") + + # 缓存没有,尝试获取平台字幕 + if transcript is None: + logger.info("尝试获取平台字幕(优先于音频下载)...") + try: + transcript = downloader.download_subtitles(video_url) + if transcript and transcript.segments: + logger.info(f"成功获取平台字幕,共 {len(transcript.segments)} 段") + transcript_cache_file.write_text( + json.dumps(asdict(transcript), ensure_ascii=False, indent=2), + encoding="utf-8", + ) + else: + transcript = None + logger.info("平台无可用字幕,将下载音频后转写") + except Exception as e: + logger.warning(f"获取平台字幕失败: {e},将下载音频后转写") + transcript = None + + # 2. 下载音频/视频 + # 有字幕时只提取元信息,不下载音视频文件(除非需要截图/视频理解) + has_transcript = transcript is not None + need_full_download = not has_transcript or screenshot or video_understanding audio_meta = self._download_media( downloader=downloader, video_url=video_url, @@ -147,18 +185,19 @@ class NoteGenerator: video_understanding=video_understanding, video_interval=video_interval, grid_size=grid_size, + skip_download=not need_full_download, ) - # 2. 获取字幕/转写文字 - # 优先尝试获取平台字幕,没有再 fallback 到音频转写 - transcript = self._get_transcript( - downloader=downloader, - video_url=video_url, - audio_file=audio_meta.file_path, - transcript_cache_file=transcript_cache_file, - status_phase=TaskStatus.TRANSCRIBING, - task_id=task_id, - ) + # 3. 
如果前面没拿到字幕,走转写流程 + if transcript is None: + transcript = self._get_transcript( + downloader=downloader, + video_url=video_url, + audio_file=audio_meta.file_path, + transcript_cache_file=transcript_cache_file, + status_phase=TaskStatus.TRANSCRIBING, + task_id=task_id, + ) # 3. GPT 总结 markdown = self._summarize_text( @@ -331,6 +370,7 @@ class NoteGenerator: video_understanding: bool, video_interval: int, grid_size: List[int], + skip_download: bool = False, ) -> AudioDownloadResult | None: """ 1. 检查音频缓存;若不存在,则根据需要下载音频或视频(若需截图/可视化)。 @@ -353,7 +393,34 @@ class NoteGenerator: task_id = audio_cache_file.stem.split("_")[0] self._update_status(task_id, status_phase) + # 已有缓存,尝试加载 + if audio_cache_file.exists(): + logger.info(f"检测到音频缓存 ({audio_cache_file}),直接读取") + try: + data = json.loads(audio_cache_file.read_text(encoding="utf-8")) + return AudioDownloadResult(**data) + except Exception as e: + logger.warning(f"读取音频缓存失败,将重新下载:{e}") + # 有字幕且不需要截图/视频理解时,只提取元信息不下载文件 + if skip_download: + logger.info("已有字幕,仅提取视频元信息(不下载音视频)") + try: + audio = downloader.download( + video_url=video_url, + quality=quality, + output_dir=output_path, + need_video=False, + skip_download=True, + ) + audio_cache_file.write_text( + json.dumps(asdict(audio), ensure_ascii=False, indent=2), + encoding="utf-8", + ) + logger.info(f"元信息提取完成 ({audio_cache_file})") + return audio + except Exception as exc: + logger.warning(f"元信息提取失败,将尝试完整下载: {exc}") # 判断是否需要下载视频 need_video = screenshot or video_understanding @@ -368,9 +435,8 @@ class NoteGenerator: self.video_path = Path(video_path_str) logger.info(f"视频下载完成:{self.video_path}") - # 若指定了 grid_size,则生成缩略图 if grid_size: - self.video_img_urls=VideoReader( + self.video_img_urls = VideoReader( video_path=str(self.video_path), grid_size=tuple(grid_size), frame_interval=frame_interval, @@ -382,17 +448,9 @@ class NoteGenerator: logger.info("未指定 grid_size,跳过缩略图生成") except Exception as exc: logger.error(f"视频下载失败:{exc}") - self._handle_exception(task_id, exc) raise - # 
已有缓存,尝试加载 - if audio_cache_file.exists(): - logger.info(f"检测到音频缓存 ({audio_cache_file}),直接读取") - try: - data = json.loads(audio_cache_file.read_text(encoding="utf-8")) - return AudioDownloadResult(**data) - except Exception as e: - logger.warning(f"读取音频缓存失败,将重新下载:{e}") + # 下载音频 try: logger.info("开始下载音频") @@ -402,7 +460,6 @@ class NoteGenerator: output_dir=output_path, need_video=need_video, ) - # 缓存 audio 元信息到本地 JSON audio_cache_file.write_text(json.dumps(asdict(audio), ensure_ascii=False, indent=2), encoding="utf-8") logger.info(f"音频下载并缓存成功 ({audio_cache_file})") return audio diff --git a/backend/requirements.txt b/backend/requirements.txt index b3afa64..b29b0ab 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -124,5 +124,6 @@ weasyprint==65.1 webencodings==0.5.1 websockets==15.0.1 yarl==1.19.0 +youtube-transcript-api>=1.0.0 yt-dlp==2025.3.31 zopfli==0.2.3.post1 From f6a34380795802ace48d094f125fa041e11f7f64 Mon Sep 17 00:00:00 2001 From: huangjianwu Date: Mon, 23 Mar 2026 17:48:34 +0800 Subject: [PATCH 2/2] =?UTF-8?q?feat(build):=20=E5=85=A8=E9=9D=A2=E4=BC=98?= =?UTF-8?q?=E5=8C=96=E6=89=93=E5=8C=85=E6=B5=81=E7=A8=8B=EF=BC=8CDocker=20?= =?UTF-8?q?=E9=95=9C=E5=83=8F=E8=87=AA=E5=8A=A8=E5=8F=91=E5=B8=83=E5=88=B0?= =?UTF-8?q?=20GHCR?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Docker 优化: - Dockerfile 层缓存(requirements/lockfile 先复制再安装) - ARG 可配置镜像源,国际用户可覆盖为默认源 - 前端 Dockerfile 改用 corepack + frozen-lockfile - 精简 .dockerignore,排除 .git 和 Tauri 构建产物 CI/CD 优化: - docker-build 自动推送到 GHCR,支持 amd64/arm64 双架构 - 桌面端 CI 增加 pip/pnpm/cargo 缓存,升级 actions 版本 - Python 版本对齐为 3.11,增加 Linux 构建矩阵 - build.sh 加 -y 覆盖标志 文档更新: - README Docker 部署简化为 docker pull + docker run Co-Authored-By: Claude Opus 4.6 --- .dockerignore | 334 +++-------------------------- .github/workflows/docker-build.yml | 101 +++++---- .github/workflows/main.yml | 68 ++++-- BillNote_frontend/Dockerfile | 18 +- Dockerfile.complete | 37 +++- README.md | 25 
++- backend/Dockerfile | 20 +- backend/Dockerfile.gpu | 23 +- backend/build.sh | 1 + docker-compose.gpu.yml | 3 + docker-compose.yml | 3 + 11 files changed, 220 insertions(+), 413 deletions(-) diff --git a/.dockerignore b/.dockerignore index 1b9d5e1..179b969 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,321 +1,35 @@ -# Logs -logs -*.log -npm-debug.log* -yarn-debug.log* -yarn-error.log* -lerna-debug.log* -.pnpm-debug.log* +# Git 和 IDE +.git +.github +.idea/ +.vscode/ .DS_Store -# Diagnostic reports (https://nodejs.org/api/report.html) -report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json -BiliNote/pnpm-lock.yaml -# Runtime data -pids -*.pid -*.seed -*.pid.lock -# Directory for instrumented libs generated by jscoverage/JSCover -lib-cov +# Tauri 构建产物(非常大) +BillNote_frontend/src-tauri/target +BillNote_frontend/src-tauri/bin -# Coverage directory used by tools like istanbul -coverage -*.lcov +# 运行时数据 +backend/data +backend/static +backend/models +backend/logs +backend/uploads +backend/*.db +backend/note_results +backend/bin/ -# nyc test coverage -.nyc_output - -# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) -.grunt - -# Bower dependency directory (https://bower.io/) -bower_components - -# node-waf configuration -.lock-wscript - -# Compiled binary addons (https://nodejs.org/api/addons.html) -build/Release - -# Dependency directories +# 依赖和构建缓存 node_modules/ -jspm_packages/ - -# Snowpack dependency directory (https://snowpack.dev/) -web_modules/ - -# TypeScript cache -*.tsbuildinfo - -# Optional npm cache directory -.npm - -# Optional eslint cache -.eslintcache - -# Optional stylelint cache -.stylelintcache - -# Microbundle cache -.rpt2_cache/ -.rts2_cache_cjs/ -.rts2_cache_es/ -.rts2_cache_umd/ -.BiliNote-dev/* -# Optional REPL history -.node_repl_history - -# Output of 'npm pack' -*.tgz - -# Yarn Integrity file -.yarn-integrity - -# dotenv environment variable files -.env -.env.development.local -.env.test.local -.env.production.local 
-.env.local -!.env.example -# parcel-bundler cache (https://parceljs.org/) -.cache -.parcel-cache - -# Next.js build output -.next -out - -# Nuxt.js build / generate output -.nuxt -dist - -# Gatsby files -.cache/ -# Comment in the public line in if your project uses Gatsby and not Next.js -# https://nextjs.org/blog/next-9-1#public-directory-support -# public - -# vuepress build output -.vuepress/dist - -# vuepress v2.x temp and cache directory -.temp -.cache - -# vitepress build output -**/.vitepress/dist - -# vitepress cache directory -**/.vitepress/cache - -# Docusaurus cache and generated files -.docusaurus - -# Serverless directories -.serverless/ - -# FuseBox cache -.fusebox/ - -# DynamoDB Local files -.dynamodb/ - -# TernJS port file -.tern-port - -# Stores VSCode versions used for testing VSCode extensions -.vscode-test - -# yarn v2 -.yarn/cache -.yarn/unplugged -.yarn/build-state.yml -.yarn/install-state.gz -.pnp.* -# Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ dist/ -downloads/ -eggs/ -.eggs/ - -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ +build/ +*.tar *.egg-info/ -.installed.cfg -*.egg -MANIFEST -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. 
-*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot -.idea/ -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# UV -# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -#uv.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. 
-# https://pdm.fming.dev/latest/usage/project/#working-with-version-control -.pdm.toml -.pdm-python -.pdm-build/ - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments +# 环境文件 .env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
-#.idea/ - -# Ruff stuff: -.ruff_cache/ - -# PyPI configuration file -.pypirc -/backend/data/* -/backend/static/* -/backend/note_tasks.db -/backend/bin/ -/backend/logs/ -/backend/note_results -/backend/models -/backend/.idea/* -/backend/bili_note.db -/backend/uploads/* -/BiliNote_frontend/.idea/* +.env.local +.env.*.local +!.env.example diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index bb8fd20..73d4571 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -1,76 +1,73 @@ -name: Build Complete Docker Image +name: Build and Publish Docker Image on: push: tags: - - 'v*' # 在推送 tag 时触发 (如 v1.0.0) + - 'v*' + workflow_dispatch: + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} jobs: - build: + build-and-push: runs-on: ubuntu-latest - + permissions: + contents: read + packages: write + steps: - name: Checkout Code uses: actions/checkout@v4 - - name: Extract Version from Tag - id: get_version - run: | - # 获取 tag 名称 (如 refs/tags/v1.0.0 -> v1.0.0) - VERSION=${GITHUB_REF#refs/tags/} - echo "version=$VERSION" >> $GITHUB_OUTPUT - echo "Tag version: $VERSION" - - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - - name: Build Docker Image - run: | - VERSION="${{ steps.get_version.outputs.version }}" - IMAGE_NAME="bilinote:${VERSION}" - - echo "Building image: ${IMAGE_NAME}" - - # 构建镜像 - docker build -f Dockerfile.complete -t ${IMAGE_NAME} . - - # 保存镜像为 tar 文件 - docker save ${IMAGE_NAME} -o bilinote-${VERSION}.tar - - # 显示镜像信息 - echo "Image built successfully!" 
- docker images bilinote:${VERSION} - - - name: Upload Docker Image Artifact - uses: actions/upload-artifact@v4 + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 with: - name: bilinote-${{ steps.get_version.outputs.version }} - path: bilinote-${{ steps.get_version.outputs.version }}.tar - retention-days: 90 + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata (tags, labels) + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=sha,prefix= + type=raw,value=latest,enable={{is_default_branch}} + + - name: Build and Push Docker Image + uses: docker/build-push-action@v6 + with: + context: . + file: Dockerfile.complete + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + platforms: linux/amd64,linux/arm64 - name: Generate Usage Instructions run: | - VERSION="${{ steps.get_version.outputs.version }}" echo "==========================================" - echo "Docker Image Build Complete!" + echo "Docker Image Published!" echo "==========================================" echo "" - echo "Image Name: bilinote:${VERSION}" - echo "Artifact: bilinote-${VERSION}.tar" + echo "Pull the image:" + echo " docker pull ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest" echo "" - echo "To use this Docker image:" - echo "1. Download the artifact from this workflow run" - echo "2. Load the image:" - echo " docker load < bilinote-${VERSION}.tar" - echo "3. 
Run the container:" - echo " docker run -d -p 80:80 --name bilinote bilinote:${VERSION}" + echo "Run the container:" + echo " docker run -d -p 80:80 \\" + echo " -v bilinote-data:/app/backend/data \\" + echo " --name bilinote \\" + echo " ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest" echo "" - echo "Or with environment variables:" - echo " docker run -d -p 80:80 \\" - echo " -e BACKEND_PORT=8483 \\" - echo " -e BACKEND_HOST=0.0.0.0 \\" - echo " -v /path/to/data:/app/backend/data \\" - echo " --name bilinote bilinote:${VERSION}" - echo "" - echo "Access the application at: http://localhost:80" + echo "Access the application at: http://localhost" echo "==========================================" diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b3e2771..d6f1e2a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -6,25 +6,38 @@ on: tags: - 'v*' # 发布 tag 时触发 workflow_dispatch: + jobs: build: strategy: matrix: - platform: [macos-latest, windows-latest] + include: + - platform: macos-latest + - platform: ubuntu-22.04 + - platform: windows-latest runs-on: ${{ matrix.platform }} steps: - name: Checkout Code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - # 设置 Python 环境 + # Linux 系统依赖(Tauri 需要) + - name: Install Linux Dependencies + if: matrix.platform == 'ubuntu-22.04' + run: | + sudo apt-get update + sudo apt-get install -y libwebkit2gtk-4.1-dev libappindicator3-dev librsvg2-dev patchelf + + # 设置 Python 环境(带 pip 缓存) - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: - python-version: '3.10' + python-version: '3.11' + cache: 'pip' + cache-dependency-path: backend/requirements.txt - # 安装 Python 依赖并执行你的 build.sh + # 安装 Python 依赖并执行构建 - name: Install Python dependencies & Build backend shell: bash run: | @@ -38,30 +51,57 @@ jobs: ./backend/build.sh fi - # 设置 Node 环境 + 安装前端依赖 + # 设置 pnpm + - name: Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 'latest' + + # 设置 
Node 环境(带 pnpm 缓存) - name: Set up Node.js - uses: actions/setup-node@v3 + uses: actions/setup-node@v4 with: node-version: '20' + cache: 'pnpm' + cache-dependency-path: BillNote_frontend/pnpm-lock.yaml - - name: Enable Corepack + Install pnpm + - name: Install frontend dependencies working-directory: BillNote_frontend - run: | - corepack enable - pnpm install + run: pnpm install --frozen-lockfile # 设置 Rust 环境 - name: Set up Rust uses: dtolnay/rust-toolchain@stable + # Cargo 缓存 + - name: Cache Cargo + uses: actions/cache@v4 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + BillNote_frontend/src-tauri/target/ + key: ${{ runner.os }}-cargo-${{ hashFiles('BillNote_frontend/src-tauri/Cargo.lock') }} + restore-keys: ${{ runner.os }}-cargo- + # 打包 Tauri 应用 - name: Build Tauri App working-directory: BillNote_frontend run: pnpm tauri build - # 可选:上传构建产物 + # 生成 SHA256 校验和 + - name: Generate Checksums + shell: bash + run: | + cd BillNote_frontend/src-tauri/target/release/bundle/ + find . 
-type f \( -name "*.dmg" -o -name "*.msi" -o -name "*.deb" -o -name "*.AppImage" \) -exec sha256sum {} \; > checksums.sha256 || true + + # 上传构建产物 - name: Upload Desktop Bundle uses: actions/upload-artifact@v4 with: name: app-${{ matrix.platform }} - path: BillNote_frontend/src-tauri/target/release/bundle/ + path: | + BillNote_frontend/src-tauri/target/release/bundle/ diff --git a/BillNote_frontend/Dockerfile b/BillNote_frontend/Dockerfile index e666adc..a1e7b4e 100644 --- a/BillNote_frontend/Dockerfile +++ b/BillNote_frontend/Dockerfile @@ -1,25 +1,23 @@ # === 前端构建阶段 === FROM node:18-alpine AS builder -# 安装 pnpm -RUN npm install -g pnpm +RUN corepack enable && corepack prepare pnpm@latest --activate -# 设置工作目录 WORKDIR /app -# 拷贝前端源码 -COPY ./BillNote_frontend /app +# 先复制 lockfile 利用依赖层缓存 +COPY ./BillNote_frontend/package.json ./BillNote_frontend/pnpm-lock.yaml ./ +RUN pnpm install --frozen-lockfile -# 安装依赖并构建 -RUN pnpm install && pnpm run build +# 再复制源代码并构建 +COPY ./BillNote_frontend/ ./ +RUN pnpm run build # --- 阶段2:使用 nginx 作为静态服务器 --- FROM nginx:1.25-alpine -# 删除默认配置(可选) RUN rm -rf /etc/nginx/conf.d/default.conf COPY ./BillNote_frontend/deploy/default.conf /etc/nginx/conf.d/default.conf - # 拷贝构建产物 -COPY --from=builder /app/dist /usr/share/nginx/html \ No newline at end of file +COPY --from=builder /app/dist /usr/share/nginx/html diff --git a/Dockerfile.complete b/Dockerfile.complete index ba38edb..534551b 100644 --- a/Dockerfile.complete +++ b/Dockerfile.complete @@ -1,41 +1,56 @@ # === 阶段1:构建 Backend === FROM python:3.11-slim AS backend-builder +ARG APT_MIRROR=mirrors.tuna.tsinghua.edu.cn +ARG PIP_INDEX=https://pypi.tuna.tsinghua.edu.cn/simple + RUN rm -f /etc/apt/sources.list && \ rm -rf /etc/apt/sources.list.d/* && \ - echo "deb https://mirrors.tuna.tsinghua.edu.cn/debian bookworm main contrib non-free non-free-firmware" > /etc/apt/sources.list && \ - echo "deb https://mirrors.tuna.tsinghua.edu.cn/debian bookworm-updates main contrib non-free non-free-firmware" 
>> /etc/apt/sources.list && \ - echo "deb https://mirrors.tuna.tsinghua.edu.cn/debian-security bookworm-security main contrib non-free non-free-firmware" >> /etc/apt/sources.list && \ + echo "deb https://${APT_MIRROR}/debian bookworm main contrib non-free non-free-firmware" > /etc/apt/sources.list && \ + echo "deb https://${APT_MIRROR}/debian bookworm-updates main contrib non-free non-free-firmware" >> /etc/apt/sources.list && \ + echo "deb https://${APT_MIRROR}/debian-security bookworm-security main contrib non-free non-free-firmware" >> /etc/apt/sources.list && \ apt-get update && \ - apt-get install -y ffmpeg && \ + apt-get install -y --no-install-recommends ffmpeg && \ rm -rf /var/lib/apt/lists/* ENV PATH="/usr/bin:${PATH}" ENV HF_ENDPOINT=https://hf-mirror.com WORKDIR /tmp/backend + +# 先复制 requirements.txt 利用层缓存 +COPY ./backend/requirements.txt /tmp/backend/requirements.txt +RUN pip install --no-cache-dir -i ${PIP_INDEX} -r requirements.txt + COPY ./backend /tmp/backend -RUN pip install --no-cache-dir -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt # === 阶段2:构建 Frontend === FROM node:18-alpine AS frontend-builder -RUN npm install -g pnpm +RUN corepack enable && corepack prepare pnpm@latest --activate + WORKDIR /tmp/frontend + +# 先复制 lockfile 利用依赖层缓存 +COPY ./BillNote_frontend/package.json ./BillNote_frontend/pnpm-lock.yaml ./ +RUN pnpm install --frozen-lockfile + COPY ./BillNote_frontend /tmp/frontend -RUN pnpm install && pnpm run build +RUN pnpm run build # === 阶段3:完整应用镜像 === FROM python:3.11-slim +ARG APT_MIRROR=mirrors.tuna.tsinghua.edu.cn + # 安装必要的运行时依赖 RUN rm -f /etc/apt/sources.list && \ rm -rf /etc/apt/sources.list.d/* && \ - echo "deb https://mirrors.tuna.tsinghua.edu.cn/debian bookworm main contrib non-free non-free-firmware" > /etc/apt/sources.list && \ - echo "deb https://mirrors.tuna.tsinghua.edu.cn/debian bookworm-updates main contrib non-free non-free-firmware" >> /etc/apt/sources.list && \ - echo "deb 
https://mirrors.tuna.tsinghua.edu.cn/debian-security bookworm-security main contrib non-free non-free-firmware" >> /etc/apt/sources.list && \ + echo "deb https://${APT_MIRROR}/debian bookworm main contrib non-free non-free-firmware" > /etc/apt/sources.list && \ + echo "deb https://${APT_MIRROR}/debian bookworm-updates main contrib non-free non-free-firmware" >> /etc/apt/sources.list && \ + echo "deb https://${APT_MIRROR}/debian-security bookworm-security main contrib non-free non-free-firmware" >> /etc/apt/sources.list && \ apt-get update && \ - apt-get install -y ffmpeg nginx supervisor procps && \ + apt-get install -y --no-install-recommends ffmpeg nginx supervisor procps && \ rm -rf /var/lib/apt/lists/* ENV PATH="/usr/bin:${PATH}" diff --git a/README.md b/README.md index 22402a1..19b89e4 100644 --- a/README.md +++ b/README.md @@ -104,9 +104,30 @@ sudo apt install ffmpeg ### 🐳 使用 Docker 一键部署 -确保你已安装 Docker 和 Docker Compose: +确保你已安装 Docker,然后直接拉取预构建镜像运行: -[docker 部署](https://github.com/JefferyHcool/bilinote-deploy/blob/master/README.md) +```bash +# 拉取最新镜像 +docker pull ghcr.io/jefferyhcool/bilinote:latest + +# 运行容器 +docker run -d -p 80:80 \ + -v bilinote-data:/app/backend/data \ + --name bilinote \ + ghcr.io/jefferyhcool/bilinote:latest +``` + +访问:`http://localhost` + +也可以使用 docker-compose 本地构建: + +```bash +# 标准部署 +docker-compose up -d + +# GPU 加速部署(需要 NVIDIA GPU) +docker-compose -f docker-compose.gpu.yml up -d +``` ## 🧠 TODO diff --git a/backend/Dockerfile b/backend/Dockerfile index d482d2b..9e899bf 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -1,23 +1,27 @@ FROM python:3.11-slim +ARG APT_MIRROR=mirrors.tuna.tsinghua.edu.cn +ARG PIP_INDEX=https://pypi.tuna.tsinghua.edu.cn/simple RUN rm -f /etc/apt/sources.list && \ rm -rf /etc/apt/sources.list.d/* && \ - echo "deb https://mirrors.tuna.tsinghua.edu.cn/debian bookworm main contrib non-free non-free-firmware" > /etc/apt/sources.list && \ - echo "deb https://mirrors.tuna.tsinghua.edu.cn/debian 
bookworm-updates main contrib non-free non-free-firmware" >> /etc/apt/sources.list && \ - echo "deb https://mirrors.tuna.tsinghua.edu.cn/debian-security bookworm-security main contrib non-free non-free-firmware" >> /etc/apt/sources.list && \ + echo "deb https://${APT_MIRROR}/debian bookworm main contrib non-free non-free-firmware" > /etc/apt/sources.list && \ + echo "deb https://${APT_MIRROR}/debian bookworm-updates main contrib non-free non-free-firmware" >> /etc/apt/sources.list && \ + echo "deb https://${APT_MIRROR}/debian-security bookworm-security main contrib non-free non-free-firmware" >> /etc/apt/sources.list && \ apt-get update && \ - apt-get install -y ffmpeg && \ + apt-get install -y --no-install-recommends ffmpeg curl && \ rm -rf /var/lib/apt/lists/* -# 确保 PATH 中包含 ffmpeg 路径(可选) ENV PATH="/usr/bin:${PATH}" - -# 设置 Hugging Face 镜像源环境变量 ENV HF_ENDPOINT=https://hf-mirror.com WORKDIR /app + +# 先复制 requirements.txt 利用层缓存 +COPY ./backend/requirements.txt /app/requirements.txt +RUN pip install --no-cache-dir -i ${PIP_INDEX} -r requirements.txt + +# 再复制应用代码(频繁变动不影响 pip 缓存层) COPY ./backend /app -RUN pip install --no-cache-dir -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt CMD ["python", "main.py"] diff --git a/backend/Dockerfile.gpu b/backend/Dockerfile.gpu index 341dbbe..d74a3cb 100644 --- a/backend/Dockerfile.gpu +++ b/backend/Dockerfile.gpu @@ -1,16 +1,27 @@ FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04 -RUN apt update && \ - apt install -y ffmpeg python3-pip && \ - apt clean all && \ +ARG APT_MIRROR=mirrors.tuna.tsinghua.edu.cn +ARG PIP_INDEX=https://pypi.tuna.tsinghua.edu.cn/simple + +RUN rm -f /etc/apt/sources.list && \ + rm -rf /etc/apt/sources.list.d/* && \ + echo "deb https://${APT_MIRROR}/ubuntu jammy main restricted universe multiverse" > /etc/apt/sources.list && \ + echo "deb https://${APT_MIRROR}/ubuntu jammy-updates main restricted universe multiverse" >> /etc/apt/sources.list && \ + echo "deb https://${APT_MIRROR}/ubuntu 
jammy-security main restricted universe multiverse" >> /etc/apt/sources.list && \ + apt-get update && \ + apt-get install -y --no-install-recommends ffmpeg python3-pip curl && \ rm -rf /var/lib/apt/lists/* -# 设置 Hugging Face 镜像源环境变量 ENV HF_ENDPOINT=https://hf-mirror.com WORKDIR /app + +# 先复制 requirements.txt 利用层缓存 +COPY ./backend/requirements.txt /app/requirements.txt +RUN pip install --no-cache-dir -i ${PIP_INDEX} -r requirements.txt && \ + pip install --no-cache-dir -i ${PIP_INDEX} 'transformers[torch]>=4.23' + +# 再复制应用代码 COPY ./backend /app -RUN pip install --no-cache-dir -i https://pypi.mirrors.ustc.edu.cn/simple -r requirements.txt -RUN pip install --no-cache-dir -i https://pypi.mirrors.ustc.edu.cn/simple 'transformers[torch]>=4.23' CMD ["python3", "main.py"] diff --git a/backend/build.sh b/backend/build.sh index 9c1a500..ae23261 100755 --- a/backend/build.sh +++ b/backend/build.sh @@ -25,6 +25,7 @@ cp .env.example backend/.env # 步骤 2: PyInstaller 打包,直接添加已存在的 .env 文件 echo "开始 PyInstaller 打包..." pyinstaller \ + -y \ --name BiliNoteBackend \ --paths backend \ --distpath ./BillNote_frontend/src-tauri/bin \ diff --git a/docker-compose.gpu.yml b/docker-compose.gpu.yml index 9e9a56d..1b1fdbc 100644 --- a/docker-compose.gpu.yml +++ b/docker-compose.gpu.yml @@ -5,6 +5,9 @@ services: build: context: . dockerfile: backend/Dockerfile.gpu + args: + APT_MIRROR: ${APT_MIRROR:-mirrors.tuna.tsinghua.edu.cn} + PIP_INDEX: ${PIP_INDEX:-https://pypi.tuna.tsinghua.edu.cn/simple} env_file: - .env environment: diff --git a/docker-compose.yml b/docker-compose.yml index d7d09dd..4f5383a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,6 +5,9 @@ services: build: context: . dockerfile: backend/Dockerfile + args: + APT_MIRROR: ${APT_MIRROR:-mirrors.tuna.tsinghua.edu.cn} + PIP_INDEX: ${PIP_INDEX:-https://pypi.tuna.tsinghua.edu.cn/simple} env_file: - .env environment: