diff --git a/backend/app/routers/config.py b/backend/app/routers/config.py index 97687f6..c816fc5 100644 --- a/backend/app/routers/config.py +++ b/backend/app/routers/config.py @@ -119,12 +119,21 @@ _downloading: dict[str, str] = {} # model_size -> status ("downloading" | "done def _check_whisper_model_exists(model_size: str, subdir: str = "whisper") -> bool: """检查指定 whisper 模型是否已下载完整到本地。 - 必须 model.bin 落盘才算完成,仅有空目录或半成品不能算「已下载」—— - 否则监控页会显示绿勾但加载时报「Unable to open file 'model.bin'」。 + faster-whisper 把模型缓存在 HF cache 布局下: + /models--Systran--faster-whisper-{size}/snapshots//model.bin + 必须能在某个 snapshot 目录里找到 model.bin 才算完成。 + (历史 modelscope 布局 /whisper-{size}/model.bin 也兼容识别。) """ - model_dir = get_model_dir(subdir) - model_path = os.path.join(model_dir, f"whisper-{model_size}") - return (Path(model_path) / "model.bin").exists() + model_dir = Path(get_model_dir(subdir)) + # HF cache 布局 + hf_repo_dir = model_dir / f"models--Systran--faster-whisper-{model_size}" / "snapshots" + if hf_repo_dir.exists(): + for snapshot in hf_repo_dir.iterdir(): + if (snapshot / "model.bin").exists(): + return True + # 历史 modelscope 布局(向后兼容老用户) + legacy = model_dir / f"whisper-{model_size}" / "model.bin" + return legacy.exists() def _check_mlx_whisper_model_exists(model_size: str) -> bool: @@ -189,24 +198,37 @@ class ModelDownloadRequest(BaseModel): def _do_download_whisper(model_size: str): - """后台下载 faster-whisper 模型。""" - from app.transcriber.whisper import MODEL_MAP - from modelscope import snapshot_download + """后台下载 faster-whisper 模型。 + + 直接走 huggingface_hub.snapshot_download,把模型放到 HF cache 布局里—— + 这样 faster-whisper 加载时(WhisperModel(model_size_or_path=size_name, + download_root=model_dir))能直接命中缓存,跟加载路径完全对齐。 + """ + from huggingface_hub import snapshot_download try: _downloading[model_size] = "downloading" model_dir = get_model_dir("whisper") - model_path = os.path.join(model_dir, f"whisper-{model_size}") - # 用 model.bin 判定而非目录存在:半成品目录不能算「已下载」 - if (Path(model_path) / "model.bin").exists(): + + # 已经下好就不重复下 + if _check_whisper_model_exists(model_size, "whisper"): _downloading[model_size] = "done" return - repo_id = MODEL_MAP.get(model_size) - if not repo_id: - _downloading[model_size] = "failed" - return - logger.info(f"开始下载 whisper 模型: {model_size}") - snapshot_download(repo_id, local_dir=model_path) + repo_id = f"Systran/faster-whisper-{model_size}" + logger.info(f"开始下载 whisper 模型: {repo_id}") + # 跟 faster-whisper utils.py 用同样的 allow_patterns,避免多下无关文件; + # 不传 local_dir 让它走 HF 默认 cache 布局(与加载逻辑对齐) + snapshot_download( + repo_id, + cache_dir=model_dir, + allow_patterns=[ + "config.json", + "preprocessor_config.json", + "model.bin", + "tokenizer.json", + "vocabulary.*", + ], + ) logger.info(f"whisper 模型下载完成: {model_size}") _downloading[model_size] = "done" except Exception as e: diff --git a/backend/app/transcriber/whisper.py b/backend/app/transcriber/whisper.py index e212861..5308579 100644 --- a/backend/app/transcriber/whisper.py +++ b/backend/app/transcriber/whisper.py @@ -11,8 +11,6 @@ from events import transcription_finished from pathlib import Path import os import shutil -from tqdm import tqdm -from modelscope import snapshot_download ''' @@ -20,19 +18,16 @@ from modelscope import snapshot_download ''' logger=get_logger(__name__) -MODEL_MAP={ - "tiny": "pengzhendong/faster-whisper-tiny", - 'base':'pengzhendong/faster-whisper-base', - 'small':'pengzhendong/faster-whisper-small', - 'medium':'pengzhendong/faster-whisper-medium', - 'large-v1':'pengzhendong/faster-whisper-large-v1', - 'large-v2':'pengzhendong/faster-whisper-large-v2', - 'large-v3':'pengzhendong/faster-whisper-large-v3', - 'large-v3-turbo':'pengzhendong/faster-whisper-large-v3-turbo', -} - +# 历史遗留:之前用 modelscope 下载到自定义目录然后把路径传给 WhisperModel。 +# 但 faster-whisper 1.1.1 的 download_model(utils.py:76)逻辑是: +# 只要 size_or_id 里含 "/" 就当 HF repo_id 处理,没有「本地目录直接返回」分支。 +# 我们传 /app/models/whisper/whisper-tiny 进去 → 被当成不存在的 HF repo → +# 在线请求失败 → fallback local_files_only=True → HF cache 找不到(因为是 +# modelscope 目录布局不是 HF)→ LocalEntryNotFoundError,误导说"离线模式"。 +# 解法:彻底让 faster-whisper 自己处理下载——传 size name,配 download_root +# 作为 HF cache 根目录,HF_ENDPOINT 已经在 Dockerfile 里指到 hf-mirror.com, +# 国内能用。删掉 modelscope 那一套,避免布局不匹配。 class WhisperTranscriber(Transcriber): - # TODO:修改为可配置 def __init__( self, model_size: str = "base", @@ -48,44 +43,40 @@ class WhisperTranscriber(Transcriber): print('没有 cuda 使用 cpu进行计算') self.compute_type = compute_type or ("float16" if self.device == "cuda" else "int8") + self.model_size = model_size model_dir = get_model_dir("whisper") - model_path = os.path.join(model_dir, f"whisper-{model_size}") - repo_id = MODEL_MAP[model_size] - - # 第一步:目录 / model.bin 不在 → 下载。 - # 关键判据用 model.bin 而不是目录存在:首次下载若被打断(网络中断 / 磁盘满 / - # 容器被 kill)会留下半成品目录,只看目录存在会跳过下载。 - model_bin = Path(model_path) / "model.bin" - if not model_bin.exists(): - if Path(model_path).exists(): - logger.warning(f"模型目录 {model_path} 存在但 model.bin 缺失(上次下载未完成),重新下载") - else: - logger.info(f"模型 whisper-{model_size} 不存在,开始下载...") - model_path = snapshot_download(repo_id, local_dir=model_path) - logger.info("模型下载完成") - - # 第二步:加载。model.bin 可能存在但【内容截断】(下载到一半被 kill), - # 此时 WhisperModel() 会抛 "File model.bin is incomplete: failed to read a buffer..."。 - # 捕获后删掉损坏目录、重新下载、再试一次——自愈,避免 500 死循环。 try: - self.model = WhisperModel( - model_size_or_path=model_path, - device=self.device, - compute_type=self.compute_type, - download_root=model_dir, - ) + self.model = self._build_model(model_size, model_dir) except Exception as e: - logger.warning(f"加载 whisper-{model_size} 失败(疑似模型文件损坏 / 截断):{e};删除后重新下载") - shutil.rmtree(model_path, ignore_errors=True) - model_path = snapshot_download(repo_id, local_dir=model_path) - logger.info("模型重新下载完成,重试加载") - self.model = WhisperModel( - model_size_or_path=model_path, - device=self.device, - compute_type=self.compute_type, - download_root=model_dir, - ) + # 自愈:损坏 / 截断 / 半成品 cache → 删掉对应 HF cache 重下一次 + logger.warning(f"加载 whisper-{model_size} 失败:{e};清理 cache 后重新下载") + self._purge_cache(model_dir, model_size) + self.model = self._build_model(model_size, model_dir) + + def _build_model(self, model_size: str, model_dir: str) -> WhisperModel: + return WhisperModel( + model_size_or_path=model_size, # 传 size name,让 faster-whisper 自己映射到 Systran/faster-whisper-* + device=self.device, + compute_type=self.compute_type, + download_root=model_dir, + ) + + @staticmethod + def _purge_cache(model_dir: str, model_size: str) -> None: + """删掉 HF cache 里这个 size 对应的 snapshot 目录,强制下次重新下载。 + + HF cache 布局:/models--Systran--faster-whisper-{size}/ + 没找到也不报错——可能用户改了 endpoint 或者 cache 布局变了。 + """ + candidates = [ + Path(model_dir) / f"models--Systran--faster-whisper-{model_size}", + Path(model_dir) / f"whisper-{model_size}", # 历史 modelscope 目录,顺手清掉 + ] + for path in candidates: + if path.exists(): + logger.info(f"清理损坏 cache: {path}") + shutil.rmtree(path, ignore_errors=True) @staticmethod def is_torch_installed() -> bool: try: