From 3ff70864915a4601581f093f8c736acd5e077878 Mon Sep 17 00:00:00 2001 From: voidborne-d Date: Thu, 7 May 2026 13:50:59 +0800 Subject: [PATCH] fix(backend): UniversalGPT.create_messages emit string content when no images MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DeepSeek deepseek-chat 等非多模态模型只接受 ``content`` 为字符串。旧实现在 没有 ``video_img_urls`` 输入时也把 ``content`` 拼成 ``[{"type":"text","text":...}]`` 多模态数组,导致 DeepSeek API 返回 ``Failed to deserialize the JSON body into the target type: messages[0]: unknown variant `image_url`, expected `text```,整个笔记生成流程随之崩溃。 修复方式:``create_messages`` 在没有截图时退回 string content;有截图时维持 原多模态数组形态,多模态模型功能不退化。同时把 ``_build_merge_messages`` 也 改为 string content —— 合并阶段从不带图片,旧的数组形态会让长视频 chunk 之后的合并阶段同样命中 DeepSeek 400。 新增 ``backend/tests/test_universal_gpt_content_format.py`` (6 cases): - 无图片 / 显式空 image 列表都走 string content - 有图片仍输出多模态数组(含 ``image_url`` + ``detail: auto``) - 纯文本响应里完全不含 ``image_url`` 字段 - ``_build_merge_messages`` 用 string content + 仍带入 partials 文本 红基线:在不打补丁的 ``universal_gpt.py`` 上跑这 6 个 case,3 个 string- content 断言会失败(命中 issue #282 的同一根因),打补丁后 6/6 通过。 Closes #282 --- backend/app/gpt/universal_gpt.py | 31 +-- .../test_universal_gpt_content_format.py | 189 ++++++++++++++++++ 2 files changed, 208 insertions(+), 12 deletions(-) create mode 100644 backend/tests/test_universal_gpt_content_format.py diff --git a/backend/app/gpt/universal_gpt.py b/backend/app/gpt/universal_gpt.py index 14ec72e..739225f 100644 --- a/backend/app/gpt/universal_gpt.py +++ b/backend/app/gpt/universal_gpt.py @@ -53,20 +53,26 @@ class UniversalGPT(GPT): extras=kwargs.get('extras'), ) - # ⛳ 组装 content 数组,支持 text + image_url 混合 - content: List[dict] = [{"type": "text", "text": content_text}] video_img_urls = kwargs.get('video_img_urls', []) - for url in video_img_urls: - content.append({ - "type": "image_url", - "image_url": { - "url": url, - "detail": "auto" - } - }) + content: list[dict] | str + if video_img_urls: + # 有截图时走 OpenAI 多模态 content 数组(text + image_url) + content = [{"type": "text", "text": content_text}] + for url in video_img_urls: + content.append({ + "type": "image_url", + "image_url": { + "url": url, + "detail": "auto" + } + }) + else: + # 纯文本场景退回 string content:DeepSeek deepseek-chat 等非多模态模型 + # 不识别 [{"type":"text",...}] 数组形态,会返回 invalid_request_error + # (issue #282)。OpenAI 规范本身也允许 content 为 string。 + content = content_text - # 正确格式:整体包在一个 message 里,role + content array messages = [{ "role": "user", "content": content @@ -83,9 +89,10 @@ class UniversalGPT(GPT): def _build_merge_messages(self, partials: list) -> list: merge_text = MERGE_PROMPT + "\n\n" + "\n\n---\n\n".join(partials) + # 合并阶段没有图片,直接用 string content 兼容非多模态模型(issue #282) return [{ "role": "user", - "content": [{"type": "text", "text": merge_text}] + "content": merge_text }] def _checkpoint_path(self, checkpoint_key: str) -> Path: diff --git a/backend/tests/test_universal_gpt_content_format.py b/backend/tests/test_universal_gpt_content_format.py new file mode 100644 index 0000000..0eb63bd --- /dev/null +++ b/backend/tests/test_universal_gpt_content_format.py @@ -0,0 +1,189 @@ +"""issue #282 回归测试:UniversalGPT 拼装 content 时按是否有图片切换 string / array 形态。 + +DeepSeek deepseek-chat 等非多模态模型只接受 ``content`` 为字符串,旧实现无条件 +emit ``[{"type":"text","text":...}]`` 导致 ``invalid_request_error``。 +""" +import importlib.util +import pathlib +import sys +import types +import unittest + + +def _install_stubs(): + app_mod = types.ModuleType("app") + gpt_pkg = types.ModuleType("app.gpt") + models_pkg = types.ModuleType("app.models") + + base_mod = types.ModuleType("app.gpt.base") + + class _GPT: + pass + + base_mod.GPT = _GPT + + prompt_builder_mod = types.ModuleType("app.gpt.prompt_builder") + + def _generate_base_prompt(**_kwargs): + return "PROMPT_BODY" + + prompt_builder_mod.generate_base_prompt = _generate_base_prompt + + prompt_mod = types.ModuleType("app.gpt.prompt") + prompt_mod.BASE_PROMPT = "" + prompt_mod.AI_SUM = "" + prompt_mod.SCREENSHOT = "" + prompt_mod.LINK = "" + prompt_mod.MERGE_PROMPT = "MERGE_HEAD" + + utils_mod = types.ModuleType("app.gpt.utils") + + def _fix_markdown(text): + return text + + utils_mod.fix_markdown = _fix_markdown + + request_chunker_mod = types.ModuleType("app.gpt.request_chunker") + + class _RequestChunker: + def __init__(self, *_args, **_kwargs): + pass + + def group_texts_by_budget(self, texts, _builder, **_kwargs): + return [texts] + + request_chunker_mod.RequestChunker = _RequestChunker + + gpt_model_mod = types.ModuleType("app.models.gpt_model") + + class _GPTSource: + pass + + gpt_model_mod.GPTSource = _GPTSource + + transcriber_model_mod = types.ModuleType("app.models.transcriber_model") + + class _TranscriptSegment: + def __init__(self, **kwargs): + self.start = kwargs.get("start", 0) + self.end = kwargs.get("end", 0) + self.text = kwargs.get("text", "") + + transcriber_model_mod.TranscriptSegment = _TranscriptSegment + + sys.modules.setdefault("app", app_mod) + sys.modules.setdefault("app.gpt", gpt_pkg) + sys.modules.setdefault("app.models", models_pkg) + sys.modules["app.gpt.base"] = base_mod + sys.modules["app.gpt.prompt_builder"] = prompt_builder_mod + sys.modules["app.gpt.prompt"] = prompt_mod + sys.modules["app.gpt.utils"] = utils_mod + sys.modules["app.gpt.request_chunker"] = request_chunker_mod + sys.modules["app.models.gpt_model"] = gpt_model_mod + sys.modules["app.models.transcriber_model"] = transcriber_model_mod + + +def _load_universal_gpt_class(): + _install_stubs() + root = pathlib.Path(__file__).resolve().parents[1] + module_path = root / "app" / "gpt" / "universal_gpt.py" + spec = importlib.util.spec_from_file_location( + "universal_gpt_content_format", module_path + ) + if spec is None or spec.loader is None: + raise ImportError("universal_gpt module spec not found") + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module.UniversalGPT + + +UniversalGPT = _load_universal_gpt_class() + + +class _DummyClient: + """create_messages 不会真的调用 client,给个空壳即可。""" + + +def _make_gpt(): + return UniversalGPT(_DummyClient(), model="deepseek-chat") + + +class TestCreateMessagesContentFormat(unittest.TestCase): + """覆盖 create_messages 在不同 video_img_urls 输入下的输出形态。""" + + def test_no_images_emits_string_content(self): + """无图片时 content 为 str(DeepSeek / 非多模态模型可解析)。""" + gpt = _make_gpt() + + messages = gpt.create_messages(segments=[]) + + self.assertEqual(len(messages), 1) + self.assertEqual(messages[0]["role"], "user") + self.assertIsInstance(messages[0]["content"], str) + self.assertEqual(messages[0]["content"], "PROMPT_BODY") + + def test_empty_image_list_emits_string_content(self): + """显式传入空列表也要走纯文本分支,避免图片字段误触发。""" + gpt = _make_gpt() + + messages = gpt.create_messages(segments=[], video_img_urls=[]) + + self.assertIsInstance(messages[0]["content"], str) + + def test_with_images_emits_multimodal_array(self): + """有图片时保留多模态 array 形态,确保多模态模型功能不退化。""" + gpt = _make_gpt() + + messages = gpt.create_messages( + segments=[], + video_img_urls=["https://example.com/a.jpg", "https://example.com/b.jpg"], + ) + + content = messages[0]["content"] + self.assertIsInstance(content, list) + self.assertEqual(len(content), 3) # 1 text + 2 images + self.assertEqual(content[0], {"type": "text", "text": "PROMPT_BODY"}) + self.assertEqual(content[1]["type"], "image_url") + self.assertEqual(content[1]["image_url"]["url"], "https://example.com/a.jpg") + self.assertEqual(content[1]["image_url"]["detail"], "auto") + self.assertEqual(content[2]["image_url"]["url"], "https://example.com/b.jpg") + + def test_no_image_url_field_when_no_images(self): + """纯文本响应里不应该出现 image_url 关键字 —— 这是触发 DeepSeek 400 的根因。""" + gpt = _make_gpt() + + messages = gpt.create_messages(segments=[]) + + import json + serialized = json.dumps(messages, ensure_ascii=False) + self.assertNotIn("image_url", serialized) + + +class TestBuildMergeMessagesContentFormat(unittest.TestCase): + """合并阶段从不带图片,应该统一走 string content 路径。""" + + def test_merge_messages_use_string_content(self): + """否则长视频 chunk 后的合并阶段还会复现 issue #282 错误。""" + gpt = _make_gpt() + + messages = gpt._build_merge_messages(["partial-A", "partial-B"]) + + self.assertEqual(len(messages), 1) + self.assertEqual(messages[0]["role"], "user") + self.assertIsInstance(messages[0]["content"], str) + self.assertIn("MERGE_HEAD", messages[0]["content"]) + self.assertIn("partial-A", messages[0]["content"]) + self.assertIn("partial-B", messages[0]["content"]) + + def test_merge_messages_no_image_url_field(self): + gpt = _make_gpt() + + messages = gpt._build_merge_messages(["x"]) + + import json + serialized = json.dumps(messages, ensure_ascii=False) + self.assertNotIn("image_url", serialized) + + +if __name__ == "__main__": + unittest.main()