🐞 fix: 增加出错后对已解析段落的缓存功能,再次重试时不再从头开始

解析长视频时,若附件过大,不再在调用后报错,而是将附件分批次发送

在每篇笔记开头默认添加来源链接,便于对模糊之处进行溯源
This commit is contained in:
CyanAutumn
2026-02-12 18:28:11 +08:00
parent 7b45db2f59
commit d9a7b89e7d
67 changed files with 279293 additions and 64 deletions

View File

@@ -5,6 +5,37 @@ import re
import re
def prepend_source_link(markdown: str | None, source_url: str) -> str | None:
"""
在笔记开头添加来源链接;若首个非空行已包含来源链接,则更新该行并避免重复。
"""
if markdown is None:
return None
source = (source_url or "").strip()
if not source:
return markdown
header = f"> 来源链接:{source}"
lines = markdown.splitlines()
first_non_empty_idx = None
for idx, line in enumerate(lines):
if line.strip():
first_non_empty_idx = idx
break
if first_non_empty_idx is not None:
first_line = lines[first_non_empty_idx].strip()
if first_line.startswith("> 来源链接:") or first_line.startswith("来源链接:"):
lines[first_non_empty_idx] = header
return "\n".join(lines)
if markdown.strip():
return f"{header}\n\n{markdown}"
return header
def replace_content_markers(markdown: str, video_id: str, platform: str = 'bilibili') -> str:
"""
替换 *Content-04:16*、Content-04:16 或 Content-[04:16] 为超链接,跳转到对应平台视频的时间位置
@@ -12,18 +43,20 @@ def replace_content_markers(markdown: str, video_id: str, platform: str = 'bilib
# 匹配三种形式:*Content-04:16*、Content-04:16、Content-[04:16]
pattern = r"(?:\*?)Content-(?:\[(\d{2}):(\d{2})\]|(\d{2}):(\d{2}))"
safe_video_id = video_id
def replacer(match):
mm = match.group(1) or match.group(3)
ss = match.group(2) or match.group(4)
total_seconds = int(mm) * 60 + int(ss)
if platform == 'bilibili':
video_id = video_id.replace("_p", "?p=")
url = f"https://www.bilibili.com/video/{video_id}&t={total_seconds}"
parsed_video_id = safe_video_id.replace("_p", "?p=")
url = f"https://www.bilibili.com/video/{parsed_video_id}&t={total_seconds}"
elif platform == 'youtube':
url = f"https://www.youtube.com/watch?v={video_id}&t={total_seconds}s"
url = f"https://www.youtube.com/watch?v={safe_video_id}&t={total_seconds}s"
elif platform == 'douyin':
url = f"https://www.douyin.com/video/{video_id}"
url = f"https://www.douyin.com/video/{safe_video_id}"
return f"[原片 @ {mm}:{ss}]({url})"
else:
return f"({mm}:{ss})"

View File

@@ -0,0 +1,13 @@
import re
from typing import List, Tuple
def extract_screenshot_timestamps(markdown: str) -> List[Tuple[str, int]]:
    """Locate screenshot markers in *markdown* and convert each to seconds.

    Recognised forms are ``Screenshot-MM:SS`` and ``Screenshot-[MM:SS]``,
    each optionally preceded by a single ``*``.

    Returns:
        One ``(marker_text, total_seconds)`` tuple per match, in order of
        appearance. ``marker_text`` is the matched text exactly as captured
        (including a leading ``*`` when present).
    """
    marker_re = r"(\*?Screenshot-(?:\[(\d{2}):(\d{2})\]|(\d{2}):(\d{2})))"

    def _as_pair(hit) -> Tuple[str, int]:
        # Only one of the bracketed / bare alternatives captures per match.
        marker, br_min, br_sec, bare_min, bare_sec = hit.groups()
        minutes = br_min or bare_min
        seconds = br_sec or bare_sec
        return marker, int(minutes) * 60 + int(seconds)

    return [_as_pair(hit) for hit in re.finditer(marker_re, markdown)]

View File

@@ -1,4 +1,5 @@
import base64
import hashlib
import os
import re
import subprocess
@@ -14,6 +15,7 @@ class VideoReader:
video_path: str,
grid_size=(3, 3),
frame_interval=2,
dedupe_enabled=True,
unit_width=960,
unit_height=540,
save_quality=90,
@@ -23,6 +25,7 @@ class VideoReader:
self.video_path = video_path
self.grid_size = grid_size
self.frame_interval = frame_interval
self.dedupe_enabled = dedupe_enabled
self.unit_width = unit_width
self.unit_height = unit_height
self.save_quality = save_quality
@@ -31,6 +34,14 @@ class VideoReader:
print(f"视频路径:{video_path}",self.frame_dir,self.grid_dir)
self.font_path = font_path
@staticmethod
def _calculate_file_md5(file_path: str) -> str:
    """Return the hexadecimal MD5 digest of the file at *file_path*.

    The file is opened in binary mode and fed to the hash in 8192-byte
    reads, so it is never loaded into memory in one piece.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        while True:
            block_bytes = stream.read(8192)
            if not block_bytes:
                # An empty read marks end of file.
                break
            digest.update(block_bytes)
    return digest.hexdigest()
def format_time(self, seconds: float) -> str:
mm = int(seconds // 60)
ss = int(seconds % 60)
@@ -51,12 +62,21 @@ class VideoReader:
timestamps = [i for i in range(0, int(duration), self.frame_interval)][:max_frames]
image_paths = []
last_hash = None
for ts in timestamps:
time_label = self.format_time(ts)
output_path = os.path.join(self.frame_dir, f"frame_{time_label}.jpg")
cmd = ["ffmpeg", "-ss", str(ts), "-i", self.video_path, "-frames:v", "1", "-q:v", "2", "-y", output_path,
"-hide_banner", "-loglevel", "error"]
subprocess.run(cmd, check=True)
if self.dedupe_enabled:
frame_hash = self._calculate_file_md5(output_path)
if frame_hash == last_hash:
os.remove(output_path)
continue
last_hash = frame_hash
image_paths.append(output_path)
return image_paths
except Exception as e: