mirror of
https://github.com/JefferyHcool/BiliNote.git
synced 2026-06-12 11:09:50 +08:00
fix(transcriber): whisper 模型下载/加载统一走 HF cache 布局
此前用 modelscope 下到自定义目录 whisper-{size}/ 再把该路径传给
WhisperModel。但 faster-whisper 1.1.1 只要 path 含 '/' 就当成 HF
repo_id 处理,没有「本地目录直接返回」分支 → 在线请求失败后 fallback
local_files_only,又因 modelscope 布局命不中 HF cache → LocalEntryNotFound,
误导用户以为是「离线模式」。
改为下载与加载路径对齐:
- 下载:huggingface_hub.snapshot_download(cache_dir=model_dir),落到 HF
cache 布局 models--Systran--faster-whisper-{size}/snapshots/<hash>/
- 加载:WhisperModel(model_size_or_path=size, download_root=model_dir),
让 faster-whisper 自己映射到 Systran/faster-whisper-* 并命中同一 cache
- 完整性检测 / 损坏自愈(_purge_cache) 同步按 HF cache 布局,并兼容老
modelscope 目录(向后兼容已下载的老用户)
HF_ENDPOINT 已在 Dockerfile 指向 hf-mirror.com,国内可用。
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -119,12 +119,21 @@ _downloading: dict[str, str] = {} # model_size -> status ("downloading" | "done
|
||||
def _check_whisper_model_exists(model_size: str, subdir: str = "whisper") -> bool:
    """Return True when the given whisper model is completely downloaded locally.

    faster-whisper caches models in the Hugging Face cache layout:

        <model_dir>/models--Systran--faster-whisper-{size}/snapshots/<hash>/model.bin

    The model only counts as downloaded when ``model.bin`` can be found inside
    some snapshot directory. An empty or half-written directory must not be
    reported as complete; otherwise the monitoring page shows a green check
    while loading later fails with "Unable to open file 'model.bin'".

    The legacy modelscope layout ``<model_dir>/whisper-{size}/model.bin`` is
    still recognised so that users who downloaded models earlier keep working.

    Args:
        model_size: Whisper size name used to build the cache directory names.
        subdir: Sub-directory name passed to ``get_model_dir``.

    Returns:
        True if ``model.bin`` exists in either layout, False otherwise.
    """
    model_dir = Path(get_model_dir(subdir))

    # HF cache layout: any snapshot that contains model.bin is good enough.
    # is_dir() (rather than exists()) keeps iterdir() from raising when
    # something that is not a directory sits at the "snapshots" path.
    snapshots_dir = model_dir / f"models--Systran--faster-whisper-{model_size}" / "snapshots"
    if snapshots_dir.is_dir() and any(
        (snapshot / "model.bin").exists() for snapshot in snapshots_dir.iterdir()
    ):
        return True

    # Legacy modelscope layout (backward compatibility for existing downloads).
    return (model_dir / f"whisper-{model_size}" / "model.bin").exists()
|
||||
|
||||
|
||||
def _check_mlx_whisper_model_exists(model_size: str) -> bool:
|
||||
@@ -189,24 +198,37 @@ class ModelDownloadRequest(BaseModel):
|
||||
|
||||
|
||||
def _do_download_whisper(model_size: str):
|
||||
"""后台下载 faster-whisper 模型。"""
|
||||
from app.transcriber.whisper import MODEL_MAP
|
||||
from modelscope import snapshot_download
|
||||
"""后台下载 faster-whisper 模型。
|
||||
|
||||
直接走 huggingface_hub.snapshot_download,把模型放到 HF cache 布局里——
|
||||
这样 faster-whisper 加载时(WhisperModel(model_size_or_path=size_name,
|
||||
download_root=model_dir))能直接命中缓存,跟加载路径完全对齐。
|
||||
"""
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
try:
|
||||
_downloading[model_size] = "downloading"
|
||||
model_dir = get_model_dir("whisper")
|
||||
model_path = os.path.join(model_dir, f"whisper-{model_size}")
|
||||
# 用 model.bin 判定而非目录存在:半成品目录不能算「已下载」
|
||||
if (Path(model_path) / "model.bin").exists():
|
||||
|
||||
# 已经下好就不重复下
|
||||
if _check_whisper_model_exists(model_size, "whisper"):
|
||||
_downloading[model_size] = "done"
|
||||
return
|
||||
repo_id = MODEL_MAP.get(model_size)
|
||||
if not repo_id:
|
||||
_downloading[model_size] = "failed"
|
||||
return
|
||||
logger.info(f"开始下载 whisper 模型: {model_size}")
|
||||
snapshot_download(repo_id, local_dir=model_path)
|
||||
repo_id = f"Systran/faster-whisper-{model_size}"
|
||||
logger.info(f"开始下载 whisper 模型: {repo_id}")
|
||||
# 跟 faster-whisper utils.py 用同样的 allow_patterns,避免多下无关文件;
|
||||
# 不传 local_dir 让它走 HF 默认 cache 布局(与加载逻辑对齐)
|
||||
snapshot_download(
|
||||
repo_id,
|
||||
cache_dir=model_dir,
|
||||
allow_patterns=[
|
||||
"config.json",
|
||||
"preprocessor_config.json",
|
||||
"model.bin",
|
||||
"tokenizer.json",
|
||||
"vocabulary.*",
|
||||
],
|
||||
)
|
||||
logger.info(f"whisper 模型下载完成: {model_size}")
|
||||
_downloading[model_size] = "done"
|
||||
except Exception as e:
|
||||
|
||||
@@ -11,8 +11,6 @@ from events import transcription_finished
|
||||
from pathlib import Path
|
||||
import os
|
||||
import shutil
|
||||
from tqdm import tqdm
|
||||
from modelscope import snapshot_download
|
||||
|
||||
|
||||
'''
|
||||
@@ -20,19 +18,16 @@ from modelscope import snapshot_download
|
||||
'''
|
||||
logger=get_logger(__name__)
|
||||
|
||||
MODEL_MAP={
|
||||
"tiny": "pengzhendong/faster-whisper-tiny",
|
||||
'base':'pengzhendong/faster-whisper-base',
|
||||
'small':'pengzhendong/faster-whisper-small',
|
||||
'medium':'pengzhendong/faster-whisper-medium',
|
||||
'large-v1':'pengzhendong/faster-whisper-large-v1',
|
||||
'large-v2':'pengzhendong/faster-whisper-large-v2',
|
||||
'large-v3':'pengzhendong/faster-whisper-large-v3',
|
||||
'large-v3-turbo':'pengzhendong/faster-whisper-large-v3-turbo',
|
||||
}
|
||||
|
||||
# 历史遗留:之前用 modelscope 下载到自定义目录然后把路径传给 WhisperModel。
|
||||
# 但 faster-whisper 1.1.1 的 download_model(utils.py:76)逻辑是:
|
||||
# 只要 size_or_id 里含 "/" 就当 HF repo_id 处理,没有「本地目录直接返回」分支。
|
||||
# 我们传 /app/models/whisper/whisper-tiny 进去 → 被当成不存在的 HF repo →
|
||||
# 在线请求失败 → fallback local_files_only=True → HF cache 找不到(因为是
|
||||
# modelscope 目录布局不是 HF)→ LocalEntryNotFoundError,误导说"离线模式"。
|
||||
# 解法:彻底让 faster-whisper 自己处理下载——传 size name,配 download_root
|
||||
# 作为 HF cache 根目录,HF_ENDPOINT 已经在 Dockerfile 里指到 hf-mirror.com,
|
||||
# 国内能用。删掉 modelscope 那一套,避免布局不匹配。
|
||||
class WhisperTranscriber(Transcriber):
|
||||
# TODO:修改为可配置
|
||||
def __init__(
|
||||
self,
|
||||
model_size: str = "base",
|
||||
@@ -48,44 +43,40 @@ class WhisperTranscriber(Transcriber):
|
||||
print('没有 cuda 使用 cpu进行计算')
|
||||
|
||||
self.compute_type = compute_type or ("float16" if self.device == "cuda" else "int8")
|
||||
self.model_size = model_size
|
||||
|
||||
model_dir = get_model_dir("whisper")
|
||||
model_path = os.path.join(model_dir, f"whisper-{model_size}")
|
||||
repo_id = MODEL_MAP[model_size]
|
||||
|
||||
# 第一步:目录 / model.bin 不在 → 下载。
|
||||
# 关键判据用 model.bin 而不是目录存在:首次下载若被打断(网络中断 / 磁盘满 /
|
||||
# 容器被 kill)会留下半成品目录,只看目录存在会跳过下载。
|
||||
model_bin = Path(model_path) / "model.bin"
|
||||
if not model_bin.exists():
|
||||
if Path(model_path).exists():
|
||||
logger.warning(f"模型目录 {model_path} 存在但 model.bin 缺失(上次下载未完成),重新下载")
|
||||
else:
|
||||
logger.info(f"模型 whisper-{model_size} 不存在,开始下载...")
|
||||
model_path = snapshot_download(repo_id, local_dir=model_path)
|
||||
logger.info("模型下载完成")
|
||||
|
||||
# 第二步:加载。model.bin 可能存在但【内容截断】(下载到一半被 kill),
|
||||
# 此时 WhisperModel() 会抛 "File model.bin is incomplete: failed to read a buffer..."。
|
||||
# 捕获后删掉损坏目录、重新下载、再试一次——自愈,避免 500 死循环。
|
||||
try:
|
||||
self.model = WhisperModel(
|
||||
model_size_or_path=model_path,
|
||||
device=self.device,
|
||||
compute_type=self.compute_type,
|
||||
download_root=model_dir,
|
||||
)
|
||||
self.model = self._build_model(model_size, model_dir)
|
||||
except Exception as e:
|
||||
logger.warning(f"加载 whisper-{model_size} 失败(疑似模型文件损坏 / 截断):{e};删除后重新下载")
|
||||
shutil.rmtree(model_path, ignore_errors=True)
|
||||
model_path = snapshot_download(repo_id, local_dir=model_path)
|
||||
logger.info("模型重新下载完成,重试加载")
|
||||
self.model = WhisperModel(
|
||||
model_size_or_path=model_path,
|
||||
device=self.device,
|
||||
compute_type=self.compute_type,
|
||||
download_root=model_dir,
|
||||
)
|
||||
# 自愈:损坏 / 截断 / 半成品 cache → 删掉对应 HF cache 重下一次
|
||||
logger.warning(f"加载 whisper-{model_size} 失败:{e};清理 cache 后重新下载")
|
||||
self._purge_cache(model_dir, model_size)
|
||||
self.model = self._build_model(model_size, model_dir)
|
||||
|
||||
def _build_model(self, model_size: str, model_dir: str) -> WhisperModel:
    """Create the WhisperModel for ``model_size`` with ``model_dir`` as download root.

    The bare size name (not a filesystem path) is handed to faster-whisper so
    that it maps the name to the Systran/faster-whisper-* repository itself.
    The device and compute type come from this instance.
    """
    load_options = {
        "model_size_or_path": model_size,
        "device": self.device,
        "compute_type": self.compute_type,
        "download_root": model_dir,
    }
    return WhisperModel(**load_options)
|
||||
|
||||
@staticmethod
def _purge_cache(model_dir: str, model_size: str) -> None:
    """Remove the cached directories for one model size so it gets re-downloaded.

    Two locations under ``model_dir`` are checked:

    * ``models--Systran--faster-whisper-{size}`` (HF cache layout)
    * ``whisper-{size}`` (legacy modelscope directory, cleaned up as well)

    A location that does not exist is skipped without error, and failures
    while deleting are ignored.
    """
    root = Path(model_dir)
    dir_names = (
        f"models--Systran--faster-whisper-{model_size}",
        f"whisper-{model_size}",
    )
    for dir_name in dir_names:
        path = root / dir_name
        if not path.exists():
            continue
        logger.info(f"清理损坏 cache: {path}")
        shutil.rmtree(path, ignore_errors=True)
|
||||
@staticmethod
|
||||
def is_torch_installed() -> bool:
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user