🐞 fix: 增加出错后对已解析段落的缓存功能，再次重试时不再从头开始

解析长视频时，若附件过大，不再在调用后报错，而是将附件分批次发送；在每篇笔记开头默认增加来源地址链接，便于对模糊之处溯源
2026-07-23 05:27:52 +08:00 · 2026-02-12 18:28:11 +08:00
parent 7b45db2f59
commit d9a7b89e7d
67 changed files with 279293 additions and 64 deletions
--- a/backend/app/utils/note_helper.py
+++ b/backend/app/utils/note_helper.py
@@ -5,6 +5,37 @@ import re

 import re

+
+def prepend_source_link(markdown: str | None, source_url: str) -> str | None:
+    """
+    Place a "> 来源链接：<url>" quote line at the top of a note.
+
+    ``None`` markdown and empty/blank ``source_url`` return ``markdown`` as is.
+    When the first non-blank line already starts with a source-link marker,
+    that line is overwritten and the lines are re-joined with newlines,
+    so the header is never duplicated.
+    """
+    if markdown is None:
+        return None
+
+    url = (source_url or "").strip()
+    if not url:
+        return markdown
+
+    banner = f"> 来源链接：{url}"
+    rows = markdown.splitlines()
+    # Index of the first line that is not whitespace-only, or None if there is none.
+    target = next((i for i, row in enumerate(rows) if row.strip()), None)
+    if target is None:
+        # Nothing but whitespace: the header alone is the result.
+        return banner
+
+    if rows[target].strip().startswith(("> 来源链接：", "来源链接：")):
+        rows[target] = banner
+        return "\n".join(rows)
+    return f"{banner}\n\n{markdown}"
+
+
 def replace_content_markers(markdown: str, video_id: str, platform: str = 'bilibili') -> str:
    """
    替换 *Content-04:16*、Content-04:16 或 Content-[04:16] 为超链接，跳转到对应平台视频的时间位置
@@ -12,18 +43,20 @@ def replace_content_markers(markdown: str, video_id: str, platform: str = 'bilib
    # 匹配三种形式：*Content-04:16*、Content-04:16、Content-[04:16]
    pattern = r"(?:\*?)Content-(?:\[(\d{2}):(\d{2})\]|(\d{2}):(\d{2}))"

+    safe_video_id = video_id
+
    def replacer(match):
        mm = match.group(1) or match.group(3)
        ss = match.group(2) or match.group(4)
        total_seconds = int(mm) * 60 + int(ss)

        if platform == 'bilibili':
-            video_id = video_id.replace("_p", "?p=")
-            url = f"https://www.bilibili.com/video/{video_id}&t={total_seconds}"
+            parsed_video_id = safe_video_id.replace("_p", "?p=")
+            url = f"https://www.bilibili.com/video/{parsed_video_id}&t={total_seconds}"
        elif platform == 'youtube':
-            url = f"https://www.youtube.com/watch?v={video_id}&t={total_seconds}s"
+            url = f"https://www.youtube.com/watch?v={safe_video_id}&t={total_seconds}s"
        elif platform == 'douyin':
-            url = f"https://www.douyin.com/video/{video_id}"
+            url = f"https://www.douyin.com/video/{safe_video_id}"
            return f"[原片 @ {mm}:{ss}]({url})"
        else:
            return f"({mm}:{ss})"
--- a/backend/app/utils/screenshot_marker.py
+++ b/backend/app/utils/screenshot_marker.py
@@ -0,0 +1,13 @@
+import re
+from typing import List, Tuple
+
+
+def extract_screenshot_timestamps(markdown: str) -> List[Tuple[str, int]]:
+    """Collect Screenshot-mm:ss / Screenshot-[mm:ss] markers (optional leading "*") in text order.
+
+    Each result is (matched marker text, mm * 60 + ss).
+    """
+    marker_re = re.compile(r"(\*?Screenshot-(?:\[(\d{2}):(\d{2})\]|(\d{2}):(\d{2})))")
+    hits = (m.groups() for m in marker_re.finditer(markdown))
+    # Per match only one digit pair is set: bracketed (b_*) or bare (p_*).
+    return [(text, int(b_mm or p_mm) * 60 + int(b_ss or p_ss)) for text, b_mm, b_ss, p_mm, p_ss in hits]
--- a/backend/app/utils/video_reader.py
+++ b/backend/app/utils/video_reader.py
@@ -1,4 +1,5 @@
 import base64
+import hashlib
 import os
 import re
 import subprocess
@@ -14,6 +15,7 @@ class VideoReader:
                 video_path: str,
                 grid_size=(3, 3),
                 frame_interval=2,
+                 dedupe_enabled=True,
                 unit_width=960,
                 unit_height=540,
                 save_quality=90,
@@ -23,6 +25,7 @@ class VideoReader:
        self.video_path = video_path
        self.grid_size = grid_size
        self.frame_interval = frame_interval
+        self.dedupe_enabled = dedupe_enabled
        self.unit_width = unit_width
        self.unit_height = unit_height
        self.save_quality = save_quality
@@ -31,6 +34,14 @@ class VideoReader:
        print(f"视频路径：{video_path}",self.frame_dir,self.grid_dir)
        self.font_path = font_path

+    @staticmethod
+    def _calculate_file_md5(file_path: str) -> str:
+        """Return the hexadecimal MD5 digest of the bytes of the file at ``file_path``."""
+        digest = hashlib.md5()
+        with open(file_path, "rb") as stream:
+            # Feed the hash in 8192-byte reads so the file is never loaded whole.
+            while True:
+                block = stream.read(8192)
+                if not block:
+                    break
+                digest.update(block)
+        return digest.hexdigest()
+
    def format_time(self, seconds: float) -> str:
        mm = int(seconds // 60)
        ss = int(seconds % 60)
@@ -51,12 +62,21 @@ class VideoReader:
            timestamps = [i for i in range(0, int(duration), self.frame_interval)][:max_frames]

            image_paths = []
+            last_hash = None
            for ts in timestamps:
                time_label = self.format_time(ts)
                output_path = os.path.join(self.frame_dir, f"frame_{time_label}.jpg")
                cmd = ["ffmpeg", "-ss", str(ts), "-i", self.video_path, "-frames:v", "1", "-q:v", "2", "-y", output_path,
                       "-hide_banner", "-loglevel", "error"]
                subprocess.run(cmd, check=True)
+
+                if self.dedupe_enabled:
+                    frame_hash = self._calculate_file_md5(output_path)
+                    if frame_hash == last_hash:
+                        os.remove(output_path)
+                        continue
+                    last_hash = frame_hash
+
                image_paths.append(output_path)
            return image_paths
        except Exception as e: