🐞 fix: 增加出错之后对已解析段落的缓存功能,再次重试时不再从头开始

解析长视频时,当附件过大时不再在调用后才报错,而是将附件分批次发送

在每篇笔记开头默认增加来源地址链接,便于对模糊之处溯源
This commit is contained in:
CyanAutumn
2026-02-12 18:28:11 +08:00
parent 7b45db2f59
commit d9a7b89e7d
67 changed files with 279293 additions and 64 deletions

View File

View File

@@ -0,0 +1,35 @@
import importlib.util
import pathlib
import unittest
# Load app/utils/note_helper.py straight from disk so the tests do not
# depend on the "app" package being importable.
_repo_root = pathlib.Path(__file__).resolve().parents[1]
_module_file = _repo_root / "app" / "utils" / "note_helper.py"
_spec = importlib.util.spec_from_file_location("note_helper", _module_file)
if _spec is None or _spec.loader is None:
    raise ImportError("note_helper module spec not found")
note_helper = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(note_helper)
class TestNoteHelper(unittest.TestCase):
    """Tests for note_helper.prepend_source_link."""

    def test_prepend_source_link_adds_header_at_top(self):
        # A note without a source header gets one prepended; the body is kept.
        url = "https://www.bilibili.com/video/BV1xx411c7mD"
        note = "## 标题\n\n内容"
        updated = note_helper.prepend_source_link(note, url)
        expected_header = f"> 来源链接:{url}\n\n"
        self.assertTrue(updated.startswith(expected_header))
        self.assertIn("## 标题", updated)

    def test_prepend_source_link_does_not_duplicate_when_header_exists(self):
        # Running the helper on already-annotated markdown is a no-op.
        url = "https://www.youtube.com/watch?v=abc123"
        note = f"> 来源链接:{url}\n\n## 标题\n\n内容"
        self.assertEqual(note_helper.prepend_source_link(note, url), note)


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,97 @@
import importlib.util
import pathlib
import unittest
from dataclasses import dataclass
def _import_request_chunker():
    """Execute app/gpt/request_chunker.py from its source file and return it."""
    source = pathlib.Path(__file__).resolve().parents[1] / "app" / "gpt" / "request_chunker.py"
    spec = importlib.util.spec_from_file_location("request_chunker", source)
    if spec is None or spec.loader is None:
        raise ImportError("request_chunker module spec not found")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


request_chunker = _import_request_chunker()
RequestChunker = request_chunker.RequestChunker
@dataclass
class DummySeg:
    """Minimal transcript-segment stand-in used to drive the chunker.

    start/end are timestamps in seconds; text is the segment's transcript.
    """

    start: float
    end: float
    text: str
def build_messages(segments, image_urls, **_):
    """Assemble one user message: joined segment text plus one image part per URL.

    Extra keyword arguments are accepted and ignored so the chunker can pass
    whatever options it likes.
    """
    text_part = {"type": "text", "text": "".join(seg.text for seg in segments)}
    image_parts = [
        {"type": "image_url", "image_url": {"url": url, "detail": "auto"}}
        for url in image_urls
    ]
    return [{"role": "user", "content": [text_part, *image_parts]}]
def size_estimator(messages):
    """Rough request size: total characters of text and image URLs in the first message."""

    def _part_len(part):
        # Text parts count their text; every other part is an image part and
        # counts its URL length.
        if part["type"] == "text":
            return len(part["text"])
        return len(part["image_url"]["url"])

    return sum(_part_len(part) for part in messages[0]["content"])
class TestRequestChunker(unittest.TestCase):
    """Behavioural tests for RequestChunker's size-budgeted batching."""

    def test_chunk_segments_preserves_order_and_content(self):
        # A budget of 8 chars forces multiple chunks; joined back together they
        # must reproduce the full transcript in order, with no empty chunk.
        segments = [
            DummySeg(0, 1, "aaaa"),
            DummySeg(1, 2, "bbbb"),
            DummySeg(2, 3, "cccc"),
        ]
        chunker = RequestChunker(build_messages, max_bytes=8, size_estimator=size_estimator)
        chunks = chunker.chunk(segments, [])
        texts = ["".join(seg.text for seg in c.segments) for c in chunks]
        self.assertEqual("".join(texts), "aaaabbbbcccc")
        self.assertTrue(all(texts))

    def test_chunk_images_distributed_across_batches(self):
        # Three 6-char URLs cannot all fit one 10-byte batch with the text;
        # every URL must still come back out, in its original order.
        segments = [DummySeg(0, 1, "aa")]
        images = ["i" * 6, "j" * 6, "k" * 6]
        chunker = RequestChunker(build_messages, max_bytes=10, size_estimator=size_estimator)
        chunks = chunker.chunk(segments, images)
        all_images = [img for c in chunks for img in c.image_urls]
        self.assertEqual(all_images, images)

    def test_chunk_images_are_not_front_loaded_when_multiple_segment_chunks(self):
        # With at least three segment chunks, later chunks must each carry at
        # least one image rather than all images landing in the first chunk.
        segments = [
            DummySeg(0, 1, "aaaaaa"),
            DummySeg(1, 2, "bbbbbb"),
            DummySeg(2, 3, "cccccc"),
        ]
        images = ["11111", "22222", "33333"]
        chunker = RequestChunker(build_messages, max_bytes=12, size_estimator=size_estimator)
        chunks = chunker.chunk(segments, images)
        self.assertGreaterEqual(len(chunks), 3)
        image_counts = [len(c.image_urls) for c in chunks]
        self.assertGreater(image_counts[1], 0)
        self.assertGreater(image_counts[2], 0)
        all_images = [img for c in chunks for img in c.image_urls]
        self.assertEqual(all_images, images)

    def test_split_oversized_segment(self):
        # A single segment larger than the budget must be split internally, yet
        # its text must survive intact once the chunks are re-joined.
        segments = [DummySeg(0, 1, "x" * 25)]
        chunker = RequestChunker(build_messages, max_bytes=10, size_estimator=size_estimator)
        chunks = chunker.chunk(segments, [])
        combined = "".join(seg.text for c in chunks for seg in c.segments)
        self.assertEqual(combined, "x" * 25)

    def test_group_texts_by_budget(self):
        # Texts are grouped greedily while the built message stays within
        # max_bytes: "aaaaa"+"bbbbb" fills 10 bytes, "ccccc" starts a new group.
        chunker = RequestChunker(build_messages, max_bytes=10, size_estimator=size_estimator)

        def build_text_messages(texts, *_args, **_kwargs):
            content = [{"type": "text", "text": "".join(texts)}]
            return [{"role": "user", "content": content}]

        groups = chunker.group_texts_by_budget(["aaaaa", "bbbbb", "ccccc"], build_text_messages)
        self.assertEqual(groups, [["aaaaa", "bbbbb"], ["ccccc"]])


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,35 @@
import importlib.util
import pathlib
import unittest
# Resolve and execute app/utils/screenshot_marker.py directly from source,
# bypassing normal package import.
_ROOT = pathlib.Path(__file__).resolve().parents[1]
_SPEC = importlib.util.spec_from_file_location(
    "screenshot_marker", _ROOT / "app" / "utils" / "screenshot_marker.py"
)
if _SPEC is None or _SPEC.loader is None:
    raise ImportError("screenshot_marker module spec not found")
screenshot_marker = importlib.util.module_from_spec(_SPEC)
_SPEC.loader.exec_module(screenshot_marker)
extract_screenshot_timestamps = screenshot_marker.extract_screenshot_timestamps
class TestScreenshotMarker(unittest.TestCase):
    """Tests for screenshot-marker extraction from note markdown."""

    def test_extract_accepts_star_bracket_format(self):
        # "*Screenshot-[MM:SS]" resolves to seconds: 01:02 -> 62.
        matches = extract_screenshot_timestamps("A\n*Screenshot-[01:02]\nB")
        self.assertEqual(matches, [("*Screenshot-[01:02]", 62)])

    def test_extract_accepts_legacy_formats(self):
        # Bare "MM:SS" and bracketed forms are both recognised, in order of
        # appearance: 03:04 -> 184 s, 05:06 -> 306 s.
        matches = extract_screenshot_timestamps("*Screenshot-03:04 and Screenshot-[05:06]")
        expected = [
            ("*Screenshot-03:04", 184),
            ("Screenshot-[05:06]", 306),
        ]
        self.assertEqual(matches, expected)


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,42 @@
import importlib.util
import pathlib
import threading
import time
import unittest
def _load_serial_executor_module():
    """Load app/services/task_serial_executor.py without importing the app package."""
    path = pathlib.Path(__file__).resolve().parents[1] / "app" / "services" / "task_serial_executor.py"
    spec = importlib.util.spec_from_file_location("task_serial_executor", path)
    if spec is None or spec.loader is None:
        raise ImportError("task_serial_executor module spec not found")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


task_serial_executor = _load_serial_executor_module()
SerialTaskExecutor = task_serial_executor.SerialTaskExecutor
class TestTaskSerialExecutor(unittest.TestCase):
    """Tests for SerialTaskExecutor's one-task-at-a-time guarantee."""

    def test_executor_runs_tasks_one_by_one(self):
        executor = SerialTaskExecutor()
        lock = threading.Lock()
        counters = {"active": 0, "peak_active": 0}

        def critical_work():
            # Record how many workers are inside the critical section at once.
            with lock:
                counters["active"] += 1
                counters["peak_active"] = max(counters["peak_active"], counters["active"])
            time.sleep(0.05)  # hold the section open long enough to expose overlap
            with lock:
                counters["active"] -= 1

        workers = [
            threading.Thread(target=executor.run, args=(critical_work,))
            for _ in range(2)
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        # If the executor serialises tasks, the sections never overlapped.
        self.assertEqual(counters["peak_active"], 1)


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,147 @@
import importlib.util
import json
import os
import pathlib
import sys
import tempfile
import types
import unittest
from pathlib import Path
def _install_stubs():
    """Register lightweight stand-ins for the app.* modules universal_gpt imports.

    universal_gpt.py is executed straight from its source file (see
    _load_universal_gpt_class), so its app.gpt / app.models imports must
    already be satisfied in sys.modules. Each stub exposes only the names the
    module references.
    """
    app_mod = types.ModuleType("app")
    gpt_pkg = types.ModuleType("app.gpt")
    models_pkg = types.ModuleType("app.models")
    base_mod = types.ModuleType("app.gpt.base")

    class _GPT:
        # Bare base class; the tests never call GPT behaviour.
        pass

    base_mod.GPT = _GPT
    prompt_builder_mod = types.ModuleType("app.gpt.prompt_builder")

    def _generate_base_prompt(**_kwargs):
        # Fixed prompt text; prompt content is irrelevant to these tests.
        return "prompt"

    prompt_builder_mod.generate_base_prompt = _generate_base_prompt
    prompt_mod = types.ModuleType("app.gpt.prompt")
    prompt_mod.BASE_PROMPT = ""
    prompt_mod.AI_SUM = ""
    prompt_mod.SCREENSHOT = ""
    prompt_mod.LINK = ""
    prompt_mod.MERGE_PROMPT = "merge"
    utils_mod = types.ModuleType("app.gpt.utils")

    def _fix_markdown(text):
        # Identity: markdown post-processing is out of scope here.
        return text

    utils_mod.fix_markdown = _fix_markdown
    request_chunker_mod = types.ModuleType("app.gpt.request_chunker")

    class _RequestChunker:
        def __init__(self, *_args, **_kwargs):
            pass

        def group_texts_by_budget(self, texts, _builder, **_kwargs):
            # Single group: the stub performs no real budgeting.
            return [texts]

    request_chunker_mod.RequestChunker = _RequestChunker
    gpt_model_mod = types.ModuleType("app.models.gpt_model")

    class _GPTSource:
        pass

    gpt_model_mod.GPTSource = _GPTSource
    transcriber_model_mod = types.ModuleType("app.models.transcriber_model")

    class _TranscriptSegment:
        # Keyword-only construction with defaults mirroring an empty segment.
        def __init__(self, **kwargs):
            self.start = kwargs.get("start", 0)
            self.end = kwargs.get("end", 0)
            self.text = kwargs.get("text", "")

    transcriber_model_mod.TranscriptSegment = _TranscriptSegment
    # setdefault preserves any app/app.gpt/app.models entries installed earlier;
    # the leaf modules are overwritten unconditionally.
    sys.modules.setdefault("app", app_mod)
    sys.modules.setdefault("app.gpt", gpt_pkg)
    sys.modules.setdefault("app.models", models_pkg)
    sys.modules["app.gpt.base"] = base_mod
    sys.modules["app.gpt.prompt_builder"] = prompt_builder_mod
    sys.modules["app.gpt.prompt"] = prompt_mod
    sys.modules["app.gpt.utils"] = utils_mod
    sys.modules["app.gpt.request_chunker"] = request_chunker_mod
    sys.modules["app.models.gpt_model"] = gpt_model_mod
    sys.modules["app.models.transcriber_model"] = transcriber_model_mod
def _load_universal_gpt_class():
    """Install the app stubs, then load UniversalGPT from its source file."""
    _install_stubs()
    source = pathlib.Path(__file__).resolve().parents[1] / "app" / "gpt" / "universal_gpt.py"
    spec = importlib.util.spec_from_file_location("universal_gpt", source)
    if spec is None or spec.loader is None:
        raise ImportError("universal_gpt module spec not found")
    loaded = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(loaded)
    return loaded.UniversalGPT


UniversalGPT = _load_universal_gpt_class()
class _FailingCompletions:
def create(self, **_kwargs):
raise Exception("Error code: 524 - bad_response_status_code")
class _DummyChat:
def __init__(self):
self.completions = _FailingCompletions()
class _DummyModels:
@staticmethod
def list():
return []
class _DummyClient:
def __init__(self):
self.chat = _DummyChat()
self.models = _DummyModels()
class TestUniversalGPTCheckpoint(unittest.TestCase):
    """Checkpoint persistence when the merge step fails."""

    def test_merge_524_error_persists_checkpoint(self):
        """A 524 failure during merge must leave a resumable checkpoint on disk."""
        original_attempts = os.environ.get("OPENAI_RETRY_ATTEMPTS")
        os.environ["OPENAI_RETRY_ATTEMPTS"] = "1"  # single attempt so the test fails fast
        try:
            # Constructed inside the try block so the env var is restored even
            # if __init__ raises (the original leaked the override in that case).
            gpt = UniversalGPT(_DummyClient(), model="mock-model")
            with tempfile.TemporaryDirectory() as tmp_dir:
                gpt.checkpoint_dir = Path(tmp_dir)
                with self.assertRaises(Exception):
                    gpt._merge_partials(["part-a", "part-b"], "task-1", "sig-1")
                checkpoint_path = gpt._checkpoint_path("task-1")
                self.assertTrue(checkpoint_path.exists())
                payload = json.loads(checkpoint_path.read_text(encoding="utf-8"))
                # The checkpoint must record both the phase and the partials so
                # a retry can resume without redoing earlier work.
                self.assertEqual(payload["phase"], "merge")
                self.assertEqual(payload["partials"], ["part-a", "part-b"])
        finally:
            # Restore (or remove) the variable so later tests see the original env.
            if original_attempts is None:
                os.environ.pop("OPENAI_RETRY_ATTEMPTS", None)
            else:
                os.environ["OPENAI_RETRY_ATTEMPTS"] = original_attempts


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,142 @@
import importlib.util
import pathlib
import re
import sys
import tempfile
import types
import unittest
from unittest.mock import patch
def _install_stubs():
    """Register stub modules for everything video_reader.py imports.

    video_reader is executed from its source file (see
    _load_video_reader_module), so its app.utils, PIL and ffmpeg imports must
    already be satisfied in sys.modules. Each stub exposes only the names the
    module actually touches.
    """
    app_mod = types.ModuleType("app")
    utils_pkg = types.ModuleType("app.utils")
    logger_mod = types.ModuleType("app.utils.logger")

    class _Logger:
        # No-op logger: the tests never assert on log output.
        @staticmethod
        def info(*_args, **_kwargs):
            return None

        @staticmethod
        def warning(*_args, **_kwargs):
            return None

        @staticmethod
        def error(*_args, **_kwargs):
            return None

    def _get_logger(_name):
        return _Logger()

    logger_mod.get_logger = _get_logger
    path_helper_mod = types.ModuleType("app.utils.path_helper")
    ffmpeg_mod = types.ModuleType("ffmpeg")
    pil_mod = types.ModuleType("PIL")
    pil_image_mod = types.ModuleType("PIL.Image")
    pil_draw_mod = types.ModuleType("PIL.ImageDraw")
    pil_font_mod = types.ModuleType("PIL.ImageFont")

    class _FakeImage:
        pass

    class _FakeImageDraw:
        @staticmethod
        def Draw(*_args, **_kwargs):
            return None

    class _FakeImageFont:
        @staticmethod
        def truetype(*_args, **_kwargs):
            return None

        @staticmethod
        def load_default():
            return None

    pil_image_mod.Image = _FakeImage
    pil_draw_mod.ImageDraw = _FakeImageDraw
    pil_font_mod.ImageFont = _FakeImageFont

    def _get_app_dir(name):
        # Treat the app-dir name itself as a relative path.
        return name

    path_helper_mod.get_app_dir = _get_app_dir
    # Harmless zero-duration default; tests override probe via mock.patch.
    ffmpeg_mod.probe = lambda *_args, **_kwargs: {"format": {"duration": "0"}}
    # setdefault keeps any previously-installed app/app.utils stubs; the leaf
    # modules are overwritten unconditionally.
    sys.modules.setdefault("app", app_mod)
    sys.modules.setdefault("app.utils", utils_pkg)
    sys.modules["PIL"] = pil_mod
    sys.modules["PIL.Image"] = pil_image_mod
    sys.modules["PIL.ImageDraw"] = pil_draw_mod
    sys.modules["PIL.ImageFont"] = pil_font_mod
    sys.modules["ffmpeg"] = ffmpeg_mod
    sys.modules["app.utils.logger"] = logger_mod
    sys.modules["app.utils.path_helper"] = path_helper_mod
def _load_video_reader_module():
    """Install the stubs, then execute app/utils/video_reader.py and return it."""
    _install_stubs()
    source = pathlib.Path(__file__).resolve().parents[1] / "app" / "utils" / "video_reader.py"
    spec = importlib.util.spec_from_file_location("video_reader", source)
    if spec is None or spec.loader is None:
        raise ImportError("video_reader module spec not found")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


video_reader_module = _load_video_reader_module()
VideoReader = video_reader_module.VideoReader
def _make_fake_ffmpeg_runner(colors_by_second):
def _runner(cmd, check=True):
output_path = next((arg for arg in cmd if isinstance(arg, str) and arg.endswith(".jpg")), None)
if output_path is None:
raise AssertionError("Output path not found in ffmpeg cmd")
match = re.search(r"frame_(\d{2})_(\d{2})\.jpg$", output_path)
if match is None:
raise AssertionError("Unexpected output path")
sec = int(match.group(1)) * 60 + int(match.group(2))
payload = colors_by_second[sec]
with open(output_path, "wb") as f:
f.write(payload)
return 0
return _runner
class TestVideoReaderDeduplicateFrames(unittest.TestCase):
    """Frame extraction should drop consecutive identical frames."""

    def test_extract_frames_skips_adjacent_duplicates_when_enabled(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            frame_dir = pathlib.Path(tmp_dir) / "frames"
            grid_dir = pathlib.Path(tmp_dir) / "grids"
            reader = VideoReader(
                video_path="dummy.mp4",
                frame_interval=1,
                frame_dir=str(frame_dir),
                grid_dir=str(grid_dir),
            )
            # Seconds 0/1 share one payload and 2/3 share another, simulating
            # a static scene that changes once.
            fake_colors = {
                0: b"frame-a",
                1: b"frame-a",
                2: b"frame-b",
                3: b"frame-b",
            }
            # Patch probe to report a 4-second video and replace subprocess.run
            # with the fake ffmpeg that writes the canned frame bytes.
            with patch.object(video_reader_module.ffmpeg, "probe", return_value={"format": {"duration": "4"}}), \
                 patch.object(video_reader_module.subprocess, "run", side_effect=_make_fake_ffmpeg_runner(fake_colors)):
                paths = reader.extract_frames(max_frames=10)
            names = [pathlib.Path(p).name for p in paths]
            # Only the first frame of each identical run survives.
            self.assertEqual(names, ["frame_00_00.jpg", "frame_00_02.jpg"])


if __name__ == "__main__":
    unittest.main()