🐞 fix: 增加出错之后对已解析段落的缓存功能,再次重试时不再从头开始

解析长视频时,当附件过大时不再在调用后才报错,而是将附件分批次发送

在每篇笔记开头默认增加来源地址链接,便于对模糊之处溯源
This commit is contained in:
CyanAutumn
2026-02-12 18:28:11 +08:00
parent 7b45db2f59
commit d9a7b89e7d
67 changed files with 279293 additions and 64 deletions

View File

View File

@@ -0,0 +1,35 @@
import importlib.util
import pathlib
import unittest
# Load app/utils/note_helper.py straight from disk so the tests do not
# depend on the "app" package being importable.
_repo_root = pathlib.Path(__file__).resolve().parents[1]
_module_file = _repo_root / "app" / "utils" / "note_helper.py"
_spec = importlib.util.spec_from_file_location("note_helper", _module_file)
if _spec is None or _spec.loader is None:
    raise ImportError("note_helper module spec not found")
note_helper = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(note_helper)
class TestNoteHelper(unittest.TestCase):
    """Tests for note_helper.prepend_source_link."""

    def test_prepend_source_link_adds_header_at_top(self):
        # A note without a source header gets one prepended; the body is kept.
        url = "https://www.bilibili.com/video/BV1xx411c7mD"
        note = "## 标题\n\n内容"
        updated = note_helper.prepend_source_link(note, url)
        expected_header = f"> 来源链接:{url}\n\n"
        self.assertTrue(updated.startswith(expected_header))
        self.assertIn("## 标题", updated)

    def test_prepend_source_link_does_not_duplicate_when_header_exists(self):
        # Running the helper on already-annotated markdown is a no-op.
        url = "https://www.youtube.com/watch?v=abc123"
        note = f"> 来源链接:{url}\n\n## 标题\n\n内容"
        self.assertEqual(note_helper.prepend_source_link(note, url), note)


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,97 @@
import importlib.util
import pathlib
import unittest
from dataclasses import dataclass
def _import_request_chunker():
    """Execute app/gpt/request_chunker.py from its source file and return it."""
    source = pathlib.Path(__file__).resolve().parents[1] / "app" / "gpt" / "request_chunker.py"
    spec = importlib.util.spec_from_file_location("request_chunker", source)
    if spec is None or spec.loader is None:
        raise ImportError("request_chunker module spec not found")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


request_chunker = _import_request_chunker()
RequestChunker = request_chunker.RequestChunker
@dataclass
class DummySeg:
    """Minimal transcript-segment stand-in used to drive the chunker.

    start/end are timestamps in seconds; text is the segment's transcript.
    """

    start: float
    end: float
    text: str
def build_messages(segments, image_urls, **_):
    """Assemble one user message: joined segment text plus one image part per URL.

    Extra keyword arguments are accepted and ignored so the chunker can pass
    whatever options it likes.
    """
    text_part = {"type": "text", "text": "".join(seg.text for seg in segments)}
    image_parts = [
        {"type": "image_url", "image_url": {"url": url, "detail": "auto"}}
        for url in image_urls
    ]
    return [{"role": "user", "content": [text_part, *image_parts]}]
def size_estimator(messages):
    """Rough request size: total characters of text and image URLs in the first message."""

    def _part_len(part):
        # Text parts count their text; every other part is an image part and
        # counts its URL length.
        if part["type"] == "text":
            return len(part["text"])
        return len(part["image_url"]["url"])

    return sum(_part_len(part) for part in messages[0]["content"])
class TestRequestChunker(unittest.TestCase):
    """Behavioural tests for RequestChunker's size-budgeted batching."""

    def test_chunk_segments_preserves_order_and_content(self):
        # A budget of 8 chars forces multiple chunks; joined back together they
        # must reproduce the full transcript in order, with no empty chunk.
        segments = [
            DummySeg(0, 1, "aaaa"),
            DummySeg(1, 2, "bbbb"),
            DummySeg(2, 3, "cccc"),
        ]
        chunker = RequestChunker(build_messages, max_bytes=8, size_estimator=size_estimator)
        chunks = chunker.chunk(segments, [])
        texts = ["".join(seg.text for seg in c.segments) for c in chunks]
        self.assertEqual("".join(texts), "aaaabbbbcccc")
        self.assertTrue(all(texts))

    def test_chunk_images_distributed_across_batches(self):
        # Three 6-char URLs cannot all fit one 10-byte batch with the text;
        # every URL must still come back out, in its original order.
        segments = [DummySeg(0, 1, "aa")]
        images = ["i" * 6, "j" * 6, "k" * 6]
        chunker = RequestChunker(build_messages, max_bytes=10, size_estimator=size_estimator)
        chunks = chunker.chunk(segments, images)
        all_images = [img for c in chunks for img in c.image_urls]
        self.assertEqual(all_images, images)

    def test_chunk_images_are_not_front_loaded_when_multiple_segment_chunks(self):
        # With at least three segment chunks, later chunks must each carry at
        # least one image rather than all images landing in the first chunk.
        segments = [
            DummySeg(0, 1, "aaaaaa"),
            DummySeg(1, 2, "bbbbbb"),
            DummySeg(2, 3, "cccccc"),
        ]
        images = ["11111", "22222", "33333"]
        chunker = RequestChunker(build_messages, max_bytes=12, size_estimator=size_estimator)
        chunks = chunker.chunk(segments, images)
        self.assertGreaterEqual(len(chunks), 3)
        image_counts = [len(c.image_urls) for c in chunks]
        self.assertGreater(image_counts[1], 0)
        self.assertGreater(image_counts[2], 0)
        all_images = [img for c in chunks for img in c.image_urls]
        self.assertEqual(all_images, images)

    def test_split_oversized_segment(self):
        # A single segment larger than the budget must be split internally, yet
        # its text must survive intact once the chunks are re-joined.
        segments = [DummySeg(0, 1, "x" * 25)]
        chunker = RequestChunker(build_messages, max_bytes=10, size_estimator=size_estimator)
        chunks = chunker.chunk(segments, [])
        combined = "".join(seg.text for c in chunks for seg in c.segments)
        self.assertEqual(combined, "x" * 25)

    def test_group_texts_by_budget(self):
        # Texts are grouped greedily while the built message stays within
        # max_bytes: "aaaaa"+"bbbbb" fills 10 bytes, "ccccc" starts a new group.
        chunker = RequestChunker(build_messages, max_bytes=10, size_estimator=size_estimator)

        def build_text_messages(texts, *_args, **_kwargs):
            content = [{"type": "text", "text": "".join(texts)}]
            return [{"role": "user", "content": content}]

        groups = chunker.group_texts_by_budget(["aaaaa", "bbbbb", "ccccc"], build_text_messages)
        self.assertEqual(groups, [["aaaaa", "bbbbb"], ["ccccc"]])


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,35 @@
import importlib.util
import pathlib
import unittest
# Resolve and execute app/utils/screenshot_marker.py directly from source,
# bypassing normal package import.
_ROOT = pathlib.Path(__file__).resolve().parents[1]
_SPEC = importlib.util.spec_from_file_location(
    "screenshot_marker", _ROOT / "app" / "utils" / "screenshot_marker.py"
)
if _SPEC is None or _SPEC.loader is None:
    raise ImportError("screenshot_marker module spec not found")
screenshot_marker = importlib.util.module_from_spec(_SPEC)
_SPEC.loader.exec_module(screenshot_marker)
extract_screenshot_timestamps = screenshot_marker.extract_screenshot_timestamps
class TestScreenshotMarker(unittest.TestCase):
    """Tests for screenshot-marker extraction from note markdown."""

    def test_extract_accepts_star_bracket_format(self):
        # "*Screenshot-[MM:SS]" resolves to seconds: 01:02 -> 62.
        matches = extract_screenshot_timestamps("A\n*Screenshot-[01:02]\nB")
        self.assertEqual(matches, [("*Screenshot-[01:02]", 62)])

    def test_extract_accepts_legacy_formats(self):
        # Bare "MM:SS" and bracketed forms are both recognised, in order of
        # appearance: 03:04 -> 184 s, 05:06 -> 306 s.
        matches = extract_screenshot_timestamps("*Screenshot-03:04 and Screenshot-[05:06]")
        expected = [
            ("*Screenshot-03:04", 184),
            ("Screenshot-[05:06]", 306),
        ]
        self.assertEqual(matches, expected)


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,42 @@
import importlib.util
import pathlib
import threading
import time
import unittest
def _load_serial_executor_module():
    """Load app/services/task_serial_executor.py without importing the app package."""
    path = pathlib.Path(__file__).resolve().parents[1] / "app" / "services" / "task_serial_executor.py"
    spec = importlib.util.spec_from_file_location("task_serial_executor", path)
    if spec is None or spec.loader is None:
        raise ImportError("task_serial_executor module spec not found")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


task_serial_executor = _load_serial_executor_module()
SerialTaskExecutor = task_serial_executor.SerialTaskExecutor
class TestTaskSerialExecutor(unittest.TestCase):
    """Tests for SerialTaskExecutor's one-task-at-a-time guarantee."""

    def test_executor_runs_tasks_one_by_one(self):
        executor = SerialTaskExecutor()
        lock = threading.Lock()
        counters = {"active": 0, "peak_active": 0}

        def critical_work():
            # Record how many workers are inside the critical section at once.
            with lock:
                counters["active"] += 1
                counters["peak_active"] = max(counters["peak_active"], counters["active"])
            time.sleep(0.05)  # hold the section open long enough to expose overlap
            with lock:
                counters["active"] -= 1

        workers = [
            threading.Thread(target=executor.run, args=(critical_work,))
            for _ in range(2)
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        # If the executor serialises tasks, the sections never overlapped.
        self.assertEqual(counters["peak_active"], 1)


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,147 @@
import importlib.util
import json
import os
import pathlib
import sys
import tempfile
import types
import unittest
from pathlib import Path
def _install_stubs():
    """Register lightweight stand-ins for the app.* modules universal_gpt imports.

    universal_gpt.py is executed straight from its source file (see
    _load_universal_gpt_class), so its app.gpt / app.models imports must
    already be satisfied in sys.modules. Each stub exposes only the names the
    module references.
    """
    app_mod = types.ModuleType("app")
    gpt_pkg = types.ModuleType("app.gpt")
    models_pkg = types.ModuleType("app.models")
    base_mod = types.ModuleType("app.gpt.base")

    class _GPT:
        # Bare base class; the tests never call GPT behaviour.
        pass

    base_mod.GPT = _GPT
    prompt_builder_mod = types.ModuleType("app.gpt.prompt_builder")

    def _generate_base_prompt(**_kwargs):
        # Fixed prompt text; prompt content is irrelevant to these tests.
        return "prompt"

    prompt_builder_mod.generate_base_prompt = _generate_base_prompt
    prompt_mod = types.ModuleType("app.gpt.prompt")
    prompt_mod.BASE_PROMPT = ""
    prompt_mod.AI_SUM = ""
    prompt_mod.SCREENSHOT = ""
    prompt_mod.LINK = ""
    prompt_mod.MERGE_PROMPT = "merge"
    utils_mod = types.ModuleType("app.gpt.utils")

    def _fix_markdown(text):
        # Identity: markdown post-processing is out of scope here.
        return text

    utils_mod.fix_markdown = _fix_markdown
    request_chunker_mod = types.ModuleType("app.gpt.request_chunker")

    class _RequestChunker:
        def __init__(self, *_args, **_kwargs):
            pass

        def group_texts_by_budget(self, texts, _builder, **_kwargs):
            # Single group: the stub performs no real budgeting.
            return [texts]

    request_chunker_mod.RequestChunker = _RequestChunker
    gpt_model_mod = types.ModuleType("app.models.gpt_model")

    class _GPTSource:
        pass

    gpt_model_mod.GPTSource = _GPTSource
    transcriber_model_mod = types.ModuleType("app.models.transcriber_model")

    class _TranscriptSegment:
        # Keyword-only construction with defaults mirroring an empty segment.
        def __init__(self, **kwargs):
            self.start = kwargs.get("start", 0)
            self.end = kwargs.get("end", 0)
            self.text = kwargs.get("text", "")

    transcriber_model_mod.TranscriptSegment = _TranscriptSegment
    # setdefault preserves any app/app.gpt/app.models entries installed earlier;
    # the leaf modules are overwritten unconditionally.
    sys.modules.setdefault("app", app_mod)
    sys.modules.setdefault("app.gpt", gpt_pkg)
    sys.modules.setdefault("app.models", models_pkg)
    sys.modules["app.gpt.base"] = base_mod
    sys.modules["app.gpt.prompt_builder"] = prompt_builder_mod
    sys.modules["app.gpt.prompt"] = prompt_mod
    sys.modules["app.gpt.utils"] = utils_mod
    sys.modules["app.gpt.request_chunker"] = request_chunker_mod
    sys.modules["app.models.gpt_model"] = gpt_model_mod
    sys.modules["app.models.transcriber_model"] = transcriber_model_mod
def _load_universal_gpt_class():
    """Install the app stubs, then load UniversalGPT from its source file."""
    _install_stubs()
    source = pathlib.Path(__file__).resolve().parents[1] / "app" / "gpt" / "universal_gpt.py"
    spec = importlib.util.spec_from_file_location("universal_gpt", source)
    if spec is None or spec.loader is None:
        raise ImportError("universal_gpt module spec not found")
    loaded = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(loaded)
    return loaded.UniversalGPT


UniversalGPT = _load_universal_gpt_class()
class _FailingCompletions:
def create(self, **_kwargs):
raise Exception("Error code: 524 - bad_response_status_code")
class _DummyChat:
def __init__(self):
self.completions = _FailingCompletions()
class _DummyModels:
@staticmethod
def list():
return []
class _DummyClient:
def __init__(self):
self.chat = _DummyChat()
self.models = _DummyModels()
class TestUniversalGPTCheckpoint(unittest.TestCase):
    """Checkpoint persistence when the merge step fails."""

    def test_merge_524_error_persists_checkpoint(self):
        """A 524 failure during merge must leave a resumable checkpoint on disk."""
        original_attempts = os.environ.get("OPENAI_RETRY_ATTEMPTS")
        os.environ["OPENAI_RETRY_ATTEMPTS"] = "1"  # single attempt so the test fails fast
        try:
            # Constructed inside the try block so the env var is restored even
            # if __init__ raises (the original leaked the override in that case).
            gpt = UniversalGPT(_DummyClient(), model="mock-model")
            with tempfile.TemporaryDirectory() as tmp_dir:
                gpt.checkpoint_dir = Path(tmp_dir)
                with self.assertRaises(Exception):
                    gpt._merge_partials(["part-a", "part-b"], "task-1", "sig-1")
                checkpoint_path = gpt._checkpoint_path("task-1")
                self.assertTrue(checkpoint_path.exists())
                payload = json.loads(checkpoint_path.read_text(encoding="utf-8"))
                # The checkpoint must record both the phase and the partials so
                # a retry can resume without redoing earlier work.
                self.assertEqual(payload["phase"], "merge")
                self.assertEqual(payload["partials"], ["part-a", "part-b"])
        finally:
            # Restore (or remove) the variable so later tests see the original env.
            if original_attempts is None:
                os.environ.pop("OPENAI_RETRY_ATTEMPTS", None)
            else:
                os.environ["OPENAI_RETRY_ATTEMPTS"] = original_attempts


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,142 @@
import importlib.util
import pathlib
import re
import sys
import tempfile
import types
import unittest
from unittest.mock import patch
def _install_stubs():
    """Register stub modules for everything video_reader.py imports.

    video_reader is executed from its source file (see
    _load_video_reader_module), so its app.utils, PIL and ffmpeg imports must
    already be satisfied in sys.modules. Each stub exposes only the names the
    module actually touches.
    """
    app_mod = types.ModuleType("app")
    utils_pkg = types.ModuleType("app.utils")
    logger_mod = types.ModuleType("app.utils.logger")

    class _Logger:
        # No-op logger: the tests never assert on log output.
        @staticmethod
        def info(*_args, **_kwargs):
            return None

        @staticmethod
        def warning(*_args, **_kwargs):
            return None

        @staticmethod
        def error(*_args, **_kwargs):
            return None

    def _get_logger(_name):
        return _Logger()

    logger_mod.get_logger = _get_logger
    path_helper_mod = types.ModuleType("app.utils.path_helper")
    ffmpeg_mod = types.ModuleType("ffmpeg")
    pil_mod = types.ModuleType("PIL")
    pil_image_mod = types.ModuleType("PIL.Image")
    pil_draw_mod = types.ModuleType("PIL.ImageDraw")
    pil_font_mod = types.ModuleType("PIL.ImageFont")

    class _FakeImage:
        pass

    class _FakeImageDraw:
        @staticmethod
        def Draw(*_args, **_kwargs):
            return None

    class _FakeImageFont:
        @staticmethod
        def truetype(*_args, **_kwargs):
            return None

        @staticmethod
        def load_default():
            return None

    pil_image_mod.Image = _FakeImage
    pil_draw_mod.ImageDraw = _FakeImageDraw
    pil_font_mod.ImageFont = _FakeImageFont

    def _get_app_dir(name):
        # Treat the app-dir name itself as a relative path.
        return name

    path_helper_mod.get_app_dir = _get_app_dir
    # Harmless zero-duration default; tests override probe via mock.patch.
    ffmpeg_mod.probe = lambda *_args, **_kwargs: {"format": {"duration": "0"}}
    # setdefault keeps any previously-installed app/app.utils stubs; the leaf
    # modules are overwritten unconditionally.
    sys.modules.setdefault("app", app_mod)
    sys.modules.setdefault("app.utils", utils_pkg)
    sys.modules["PIL"] = pil_mod
    sys.modules["PIL.Image"] = pil_image_mod
    sys.modules["PIL.ImageDraw"] = pil_draw_mod
    sys.modules["PIL.ImageFont"] = pil_font_mod
    sys.modules["ffmpeg"] = ffmpeg_mod
    sys.modules["app.utils.logger"] = logger_mod
    sys.modules["app.utils.path_helper"] = path_helper_mod
def _load_video_reader_module():
    """Install the stubs, then execute app/utils/video_reader.py and return it."""
    _install_stubs()
    source = pathlib.Path(__file__).resolve().parents[1] / "app" / "utils" / "video_reader.py"
    spec = importlib.util.spec_from_file_location("video_reader", source)
    if spec is None or spec.loader is None:
        raise ImportError("video_reader module spec not found")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


video_reader_module = _load_video_reader_module()
VideoReader = video_reader_module.VideoReader
def _make_fake_ffmpeg_runner(colors_by_second):
def _runner(cmd, check=True):
output_path = next((arg for arg in cmd if isinstance(arg, str) and arg.endswith(".jpg")), None)
if output_path is None:
raise AssertionError("Output path not found in ffmpeg cmd")
match = re.search(r"frame_(\d{2})_(\d{2})\.jpg$", output_path)
if match is None:
raise AssertionError("Unexpected output path")
sec = int(match.group(1)) * 60 + int(match.group(2))
payload = colors_by_second[sec]
with open(output_path, "wb") as f:
f.write(payload)
return 0
return _runner
class TestVideoReaderDeduplicateFrames(unittest.TestCase):
    """Frame extraction should drop consecutive identical frames."""

    def test_extract_frames_skips_adjacent_duplicates_when_enabled(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            frame_dir = pathlib.Path(tmp_dir) / "frames"
            grid_dir = pathlib.Path(tmp_dir) / "grids"
            reader = VideoReader(
                video_path="dummy.mp4",
                frame_interval=1,
                frame_dir=str(frame_dir),
                grid_dir=str(grid_dir),
            )
            # Seconds 0/1 share one payload and 2/3 share another, simulating
            # a static scene that changes once.
            fake_colors = {
                0: b"frame-a",
                1: b"frame-a",
                2: b"frame-b",
                3: b"frame-b",
            }
            # Patch probe to report a 4-second video and replace subprocess.run
            # with the fake ffmpeg that writes the canned frame bytes.
            with patch.object(video_reader_module.ffmpeg, "probe", return_value={"format": {"duration": "4"}}), \
                 patch.object(video_reader_module.subprocess, "run", side_effect=_make_fake_ffmpeg_runner(fake_colors)):
                paths = reader.extract_frames(max_frames=10)
            names = [pathlib.Path(p).name for p in paths]
            # Only the first frame of each identical run survives.
            self.assertEqual(names, ["frame_00_00.jpg", "frame_00_02.jpg"])


if __name__ == "__main__":
    unittest.main()