feat(note): 添加视频理解功能

- 在 GPT 模型中增加 video_img_urls 字段用于存储视频截图

- 在笔记生成请求中添加视频理解相关参数
- 实现视频截图功能,支持按指定间隔生成截图
- 更新笔记生成逻辑,支持视频理解功能
- 在前端服务中添加视频理解相关参数
This commit is contained in:
黄建武
2025-05-02 23:47:15 +08:00
parent e4c1c0f7d1
commit 6e084f720d
10 changed files with 576 additions and 534 deletions

View File

@@ -40,7 +40,6 @@ def generate_base_prompt(title, segment_text, tags, _format=None, style=None, ex
# 添加额外内容
if extras:
prompt += f"\n{extras}"
print(prompt)
return prompt
@@ -104,9 +103,9 @@ def get_link_format():
def get_screenshot_format():
return '''
11. **原片截图**: 如果某个部分涉及**视觉演示**或任何能帮助理解的内容,插入截图提示
- 格式:`*Screenshot-[mm:ss]`
至少插入 1-3张截图
11. **原片截图**:你收到的截图一般是一个网格网格的每张图片就是一个时间点左上角会包含时间mm:ss的格式请你结合我发你的图片插入截图提示请你帮助用户更好的理解视频内容请你认真的分析每个图片和对应的转写文案插入最合适的内容来备注用户理解请一定按照这个格式 返回否则系统无法解析
- 格式:`*Screenshot-[mm:ss]`
'''

View File

@@ -7,6 +7,7 @@ from app.models.transcriber_model import TranscriptSegment
from datetime import timedelta
from typing import List
class UniversalGPT(GPT):
def __init__(self, client, model: str, temperature: float = 0.7):
self.client = client
@@ -28,21 +29,41 @@ class UniversalGPT(GPT):
def ensure_segments_type(self, segments) -> List[TranscriptSegment]:
return [TranscriptSegment(**seg) if isinstance(seg, dict) else seg for seg in segments]
def create_messages(self, segments: List[TranscriptSegment],**kwargs):
print("UniversalGPT",kwargs)
content =generate_base_prompt(
def create_messages(self, segments: List[TranscriptSegment], **kwargs):
content_text = generate_base_prompt(
title=kwargs.get('title'),
segment_text=self._build_segment_text(segments),
tags=kwargs.get('tags'),
_format=kwargs.get('_format'),
style=kwargs.get('style'),
extras=kwargs.get('extras')
extras=kwargs.get('extras'),
)
return [{"role": "user", "content": content }]
# ⛳ 组装 content 数组,支持 text + image_url 混合
content = [{"type": "text", "text": content_text}]
video_img_urls = kwargs.get('video_img_urls', [])
for url in video_img_urls:
content.append({
"type": "image_url",
"image_url": {
"url": url,
"detail": "auto"
}
})
# ✅ 正确格式:整体包在一个 message 里role + content array
messages = [{
"role": "user",
"content": content
}]
return messages
def list_models(self):
return self.client.models.list()
def summarize(self, source: GPTSource) -> str:
self.screenshot = source.screenshot
self.link = source.link
@@ -51,8 +72,8 @@ class UniversalGPT(GPT):
messages = self.create_messages(
source.segment,
title=source.title,
tags=source.tags
,
tags=source.tags,
video_img_urls=source.video_img_urls,
_format=source._format,
style=source.style,
extras=source.extras
@@ -63,7 +84,3 @@ class UniversalGPT(GPT):
temperature=0.7
)
return response.choices[0].message.content.strip()
if __name__ == '__main__':
print('s')

View File

@@ -14,4 +14,5 @@ class GPTSource:
style: Optional[str] = None
extras: Optional[str] = None
_format: Optional[list] = None
video_img_urls: Optional[list] = None

View File

@@ -37,12 +37,15 @@ class VideoRequest(BaseModel):
quality: DownloadQuality
screenshot: Optional[bool] = False
link: Optional[bool] = False
model_name:str
provider_id:str
model_name: str
provider_id: str
task_id: Optional[str] = None
format:Optional[list]=[]
style:str=None
extras:Optional[str]
format: Optional[list] = []
style: str = None
extras: Optional[str]
video_understanding: Optional[bool] = False
video_interval: Optional[int] = 0
grid_size: Optional[list] = []
@field_validator("video_url")
def validate_supported_url(cls, v):
@@ -59,6 +62,7 @@ class VideoRequest(BaseModel):
NOTE_OUTPUT_DIR = "note_results"
UPLOAD_DIR = "uploads"
def save_note_to_file(task_id: str, note):
os.makedirs(NOTE_OUTPUT_DIR, exist_ok=True)
with open(os.path.join(NOTE_OUTPUT_DIR, f"{task_id}.json"), "w", encoding="utf-8") as f:
@@ -66,8 +70,10 @@ def save_note_to_file(task_id: str, note):
def run_note_task(task_id: str, video_url: str, platform: str, quality: DownloadQuality,
link: bool = False,screenshot: bool = False,model_name:str=None,provider_id:str=None,
_format:list=None,style:str=None,extras:str=None):
link: bool = False, screenshot: bool = False, model_name: str = None, provider_id: str = None,
_format: list = None, style: str = None, extras: str = None, video_understanding: bool = False,
video_interval=0, grid_size=[]
):
try:
if not model_name or not provider_id:
raise HTTPException(status_code=400, detail="请选择模型和提供者")
@@ -84,25 +90,26 @@ def run_note_task(task_id: str, video_url: str, platform: str, quality: Download
style=style,
extras=extras,
screenshot=screenshot
, video_understanding=video_understanding,
video_interval=video_interval,
grid_size=grid_size
)
print('Note 结果',note)
logger.info(f"Note generated: {task_id}")
save_note_to_file(task_id, note)
except Exception as e:
save_note_to_file(task_id, {"error": str(e)})
@router.post('/delete_task')
def delete_task(data:RecordRequest):
def delete_task(data: RecordRequest):
try:
NoteGenerator().delete_note(video_id=data.video_id,platform=data.platform)
NoteGenerator().delete_note(video_id=data.video_id, platform=data.platform)
return R.success(msg='删除成功')
except Exception as e:
return R.error(msg=e)
@router.post("/upload")
async def upload(file: UploadFile = File(...)):
os.makedirs(UPLOAD_DIR, exist_ok=True)
@@ -114,6 +121,7 @@ async def upload(file: UploadFile = File(...)):
# 假设你静态目录挂载了 /uploads
return R.success({"url": f"/uploads/{file.filename}"})
@router.post("/generate_note")
def generate_note(data: VideoRequest, background_tasks: BackgroundTasks):
try:
@@ -128,7 +136,6 @@ def generate_note(data: VideoRequest, background_tasks: BackgroundTasks):
#
# )
if data.task_id:
# 如果传了task_id说明是重试
task_id = data.task_id
@@ -139,14 +146,14 @@ def generate_note(data: VideoRequest, background_tasks: BackgroundTasks):
# 正常新建任务
task_id = str(uuid.uuid4())
background_tasks.add_task(run_note_task, task_id, data.video_url, data.platform, data.quality,data.link ,data.screenshot,data.model_name,data.provider_id,data.format,data.style,data.extras)
background_tasks.add_task(run_note_task, task_id, data.video_url, data.platform, data.quality, data.link,
data.screenshot, data.model_name, data.provider_id, data.format, data.style,
data.extras, data.video_understanding, data.video_interval, data.grid_size)
return R.success({"task_id": task_id})
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@router.get("/task_status/{task_id}")
def get_task_status(task_id: str):
status_path = os.path.join(NOTE_OUTPUT_DIR, f"{task_id}.status.json")

View File

@@ -28,7 +28,7 @@ from app.services.constant import SUPPORT_PLATFORM_MAP
from app.services.provider import ProviderService
from app.transcriber.base import Transcriber
from app.transcriber.transcriber_provider import get_transcriber,_transcribers
from app.transcriber.transcriber_provider import get_transcriber, _transcribers
from app.transcriber.whisper import WhisperTranscriber
import re
@@ -39,12 +39,13 @@ from app.utils.video_helper import generate_screenshot
# from app.services.gpt import summarize_text
from dotenv import load_dotenv
from app.utils.logger import get_logger
from app.utils.video_reader import VideoReader
from events import transcription_finished
logger = get_logger(__name__)
load_dotenv()
api_path = os.getenv("API_BASE_URL", "http://localhost")
BACKEND_PORT= os.getenv("BACKEND_PORT", 8000)
BACKEND_PORT = os.getenv("BACKEND_PORT", 8000)
BACKEND_BASE_URL = f"{api_path}:{BACKEND_PORT}"
output_dir = os.getenv('OUT_DIR')
@@ -53,11 +54,12 @@ logger.info("starting up")
NOTE_OUTPUT_DIR = "note_results"
class NoteGenerator:
def __init__(self):
self.model_size: str = 'base'
self.device: Union[str, None] = None
self.transcriber_type = os.getenv('TRANSCRIBER_TYPE','fast-whisper')
self.transcriber_type = os.getenv('TRANSCRIBER_TYPE', 'fast-whisper')
self.transcriber = self.get_transcriber()
self.video_path = None
logger.info("初始化NoteGenerator")
@@ -94,7 +96,7 @@ class NoteGenerator:
return gpt
def get_downloader(self, platform: str) -> Downloader:
downloader =SUPPORT_PLATFORM_MAP[platform]
downloader = SUPPORT_PLATFORM_MAP[platform]
if downloader:
logger.info(f"使用{downloader}下载器")
return downloader
@@ -120,7 +122,7 @@ class NoteGenerator:
insert_video_task(video_id=video_id, platform=platform, task_id=task_id)
def insert_screenshots_into_markdown(self, markdown: str, video_path: str, image_base_url: str,
output_dir: str,_format:list) -> str:
output_dir: str, _format: list) -> str:
"""
扫描 markdown 中的 *Screenshot-xx:xx生成截图并插入 markdown 图片
:param markdown:
@@ -128,7 +130,7 @@ class NoteGenerator:
"""
matches = self.extract_screenshot_timestamps(markdown)
new_markdown = markdown
print(f"匹配到的截图:{matches}")
logger.info(f"开始为笔记生成截图")
try:
for idx, (marker, ts) in enumerate(matches):
@@ -137,7 +139,7 @@ class NoteGenerator:
image_url = f"{BACKEND_BASE_URL.rstrip('/')}/{image_relative_path.lstrip('/')}"
replacement = f"![]({image_url})"
new_markdown = new_markdown.replace(marker, replacement, 1)
print(f"替换后的 markdown{new_markdown}")
return new_markdown
except Exception as e:
@@ -180,14 +182,18 @@ class NoteGenerator:
_format: list = None,
style: str = None,
extras: str = None,
path: Union[str, None] = None
path: Union[str, None] = None,
video_understanding: bool = False,
video_interval=0,
grid_size=[]
) -> NoteResult:
try:
logger.info(f"🎯 开始解析并生成笔记task_id={task_id}")
self.update_task_status(task_id, TaskStatus.PARSING)
downloader = self.get_downloader(platform)
gpt = self.get_gpt(model_name=model_name, provider_id=provider_id)
video_img_urls = []
audio_cache_path = os.path.join(NOTE_OUTPUT_DIR, f"{task_id}_audio.json")
transcript_cache_path = os.path.join(NOTE_OUTPUT_DIR, f"{task_id}_transcript.json")
markdown_cache_path = os.path.join(NOTE_OUTPUT_DIR, f"{task_id}_markdown.md")
@@ -201,11 +207,20 @@ class NoteGenerator:
audio_data = json.load(f)
audio = AudioDownloadResult(**audio_data)
else:
if 'screenshot' in _format:
if 'screenshot' in _format or video_understanding:
video_path = downloader.download_video(video_url)
self.video_path = video_path
logger.info(f"成功下载视频文件: {video_path}")
screenshot= 'screenshot' in _format
video_img_urls = VideoReader(
video_path=video_path,
grid_size=tuple(grid_size),
frame_interval=video_interval,
unit_width=1280,
unit_height=720,
save_quality=90,
).run()
screenshot = 'screenshot' in _format
audio: AudioDownloadResult = downloader.download(
video_url=video_url,
quality=quality,
@@ -261,6 +276,7 @@ class NoteGenerator:
segment=transcript.segments,
tags=audio.raw_info.get('tags'),
screenshot=screenshot,
video_img_urls=video_img_urls,
link=link,
_format=_format,
style=style,
@@ -279,12 +295,13 @@ class NoteGenerator:
# -------- 4. 插入截图 --------
if _format and 'screenshot' in _format:
try:
markdown = self.insert_screenshots_into_markdown(markdown, self.video_path, image_base_url, output_dir,_format)
markdown = self.insert_screenshots_into_markdown(markdown, self.video_path, image_base_url,
output_dir, _format)
except Exception as e:
logger.warning(f"⚠️ 插入截图失败跳过处理task_id={task_id},错误信息:{e}")
if _format and 'link' in _format:
try:
markdown = replace_content_markers(markdown, video_id=audio.video_id,platform=platform)
markdown = replace_content_markers(markdown, video_id=audio.video_id, platform=platform)
except Exception as e:
logger.warning(f"⚠️ 插入链接失败跳过处理task_id={task_id},错误信息:{e}")
# 注意:截图失败不终止整体流程
@@ -296,7 +313,7 @@ class NoteGenerator:
# -------- 6. 完成 --------
self.update_task_status(task_id, TaskStatus.SUCCESS)
logger.info(f"✅ 笔记生成成功task_id={task_id}")
if platform != 'local':
if platform != 'local':
transcription_finished.send({
"file_path": audio.file_path,
})
@@ -310,7 +327,3 @@ class NoteGenerator:
logger.error(f"❌ 笔记生成流程异常终止task_id={task_id},错误信息:{e}")
self.update_task_status(task_id, TaskStatus.FAILED, message=str(e))
raise f'❌ 笔记生成流程异常终止task_id={task_id},错误信息:{e}'

View File

@@ -0,0 +1,134 @@
import base64
import os
import re
import subprocess
import ffmpeg
from PIL import Image, ImageDraw, ImageFont
from app.utils.logger import get_logger
logger = get_logger(__name__)
class VideoReader:
    """Extract evenly-spaced frames from a video, tile them into labelled
    grid images, and return the grids as base64 data URLs.

    Each extracted frame is stamped (in its filename and as a text overlay)
    with its `mm:ss` timestamp so a downstream vision model can reference
    moments in the video. Frames are grouped `cols * rows` at a time into a
    single grid JPEG.
    """

    def __init__(self,
                 video_path: str,
                 grid_size=(3, 3),
                 frame_interval=2,
                 unit_width=960,
                 unit_height=540,
                 save_quality=90,
                 font_path="fonts/arial.ttf",
                 frame_dir="data/output_frames",
                 grid_dir="data/grid_output"):
        """
        :param video_path: path of the video to process
        :param grid_size: (cols, rows) of each grid image
        :param frame_interval: seconds between extracted frames (values < 1
            are clamped to 1; callers may pass 0 as an "unset" default)
        :param unit_width: width of each cell in the grid, in pixels
        :param unit_height: height of each cell in the grid, in pixels
        :param save_quality: JPEG quality (1-95) for the saved grids
        :param font_path: TTF font used for the timestamp overlay; falls back
            to PIL's default font when the file does not exist
        :param frame_dir: working directory for individual frames
        :param grid_dir: output directory for assembled grid images
        """
        self.video_path = video_path
        self.grid_size = grid_size
        # Clamp here so range(..., step) in extract_frames never gets step=0.
        self.frame_interval = max(1, frame_interval)
        self.unit_width = unit_width
        self.unit_height = unit_height
        self.save_quality = save_quality
        self.font_path = font_path
        self.frame_dir = frame_dir
        self.grid_dir = grid_dir

    def format_time(self, seconds: float) -> str:
        """Render a second offset as a filename-safe ``mm_ss`` label.

        Minutes are not capped at 99: a 2-hour video yields e.g. ``119_59``.
        """
        mm = int(seconds // 60)
        ss = int(seconds % 60)
        return f"{mm:02d}_{ss:02d}"

    def extract_time_from_filename(self, filename: str) -> float:
        """Parse the second offset back out of a ``frame_mm_ss.jpg`` name.

        Returns ``inf`` for non-matching names so they sort last.
        Minutes are matched with ``\\d+`` (not ``\\d{2}``) so frames beyond
        99 minutes still parse.
        """
        match = re.search(r"frame_(\d+)_(\d{2})\.jpg", filename)
        if match:
            mm, ss = map(int, match.groups())
            return mm * 60 + ss
        return float('inf')

    def extract_frames(self, max_frames=1000) -> list[str]:
        """Dump one JPEG per ``frame_interval`` seconds into ``frame_dir``.

        :param max_frames: hard cap on the number of frames extracted
        :return: list of written frame paths, in chronological order
        :raises ValueError: when probing or extraction fails
        """
        try:
            os.makedirs(self.frame_dir, exist_ok=True)
            duration = float(ffmpeg.probe(self.video_path)["format"]["duration"])
            timestamps = list(range(0, int(duration), self.frame_interval))[:max_frames]
            image_paths = []
            for ts in timestamps:
                time_label = self.format_time(ts)
                output_path = os.path.join(self.frame_dir, f"frame_{time_label}.jpg")
                # -ss before -i seeks on the input (fast); -q:v 2 = high quality.
                cmd = ["ffmpeg", "-ss", str(ts), "-i", self.video_path, "-frames:v", "1", "-q:v", "2", "-y", output_path,
                       "-hide_banner", "-loglevel", "error"]
                subprocess.run(cmd, check=True)
                image_paths.append(output_path)
            return image_paths
        except Exception as e:
            logger.error(f"分割帧发生错误:{str(e)}")
            raise ValueError("视频处理失败")

    def group_images(self) -> list[list[str]]:
        """Collect the extracted frames, sort them chronologically, and chunk
        them into groups of ``cols * rows`` for gridding."""
        image_files = [os.path.join(self.frame_dir, f) for f in os.listdir(self.frame_dir) if
                       f.startswith("frame_") and f.endswith(".jpg")]
        image_files.sort(key=lambda f: self.extract_time_from_filename(os.path.basename(f)))
        group_size = self.grid_size[0] * self.grid_size[1]
        return [image_files[i:i + group_size] for i in range(0, len(image_files), group_size)]

    def concat_images(self, image_paths: list[str], name: str) -> str:
        """Tile one group of frames into a single grid JPEG.

        Each cell is resized to ``unit_width x unit_height`` and stamped with
        its ``mm:ss`` timestamp in the top-left corner.

        :param image_paths: the frame files for this grid, in order
        :param name: output basename (without extension)
        :return: path of the saved grid image
        """
        os.makedirs(self.grid_dir, exist_ok=True)
        font = ImageFont.truetype(self.font_path, 48) if os.path.exists(self.font_path) else ImageFont.load_default()
        images = []
        for path in image_paths:
            img = Image.open(path).convert("RGB").resize((self.unit_width, self.unit_height), Image.Resampling.LANCZOS)
            # \d+ (not \d{2}) so frames past the 99-minute mark keep their label.
            timestamp = re.search(r"frame_(\d+)_(\d{2})\.jpg", os.path.basename(path))
            time_text = f"{timestamp.group(1)}:{timestamp.group(2)}" if timestamp else ""
            draw = ImageDraw.Draw(img)
            draw.text((10, 10), time_text, fill="yellow", font=font, stroke_width=1, stroke_fill="black")
            images.append(img)
        cols, rows = self.grid_size
        grid_img = Image.new("RGB", (self.unit_width * cols, self.unit_height * rows), (255, 255, 255))
        for i, img in enumerate(images):
            x = (i % cols) * self.unit_width
            y = (i // cols) * self.unit_height
            grid_img.paste(img, (x, y))
        save_path = os.path.join(self.grid_dir, f"{name}.jpg")
        grid_img.save(save_path, quality=self.save_quality)
        return save_path

    def encode_images_to_base64(self, image_paths: list[str]) -> list[str]:
        """Encode each image file as a ``data:image/jpeg;base64,...`` URL."""
        base64_images = []
        for path in image_paths:
            with open(path, "rb") as img_file:
                encoded_string = base64.b64encode(img_file.read()).decode("utf-8")
                base64_images.append(f"data:image/jpeg;base64,{encoded_string}")
        return base64_images

    def run(self) -> list[str]:
        """Full pipeline: clean working dirs, extract frames, build grids,
        and return the grids as base64 data URLs.

        :raises ValueError: when any stage of the pipeline fails
        """
        logger.info("🚀 开始提取视频帧...")
        try:
            # Ensure the working dirs exist before listing them — on the very
            # first run they have not been created yet.
            os.makedirs(self.frame_dir, exist_ok=True)
            os.makedirs(self.grid_dir, exist_ok=True)
            # Purge leftovers from previous runs so stale frames are not
            # mixed into the new grids.
            for file in os.listdir(self.frame_dir):
                if file.startswith("frame_"):
                    os.remove(os.path.join(self.frame_dir, file))
            for file in os.listdir(self.grid_dir):
                if file.startswith("grid_"):
                    os.remove(os.path.join(self.grid_dir, file))
            self.extract_frames()
            logger.info("🧩 开始拼接网格图...")
            image_paths = []
            groups = self.group_images()
            for idx, group in enumerate(groups, start=1):
                # Drop trailing partial groups: the prompt promises a full grid
                # whose every cell carries a timestamp.
                if len(group) < self.grid_size[0] * self.grid_size[1]:
                    logger.warning(f"⚠️ 跳过第 {idx} 组,图片不足 {self.grid_size[0] * self.grid_size[1]}")
                    continue
                out_path = self.concat_images(group, f"grid_{idx}")
                image_paths.append(out_path)
            logger.info("📤 开始编码图像...")
            urls = self.encode_images_to_base64(image_paths)
            return urls
        except Exception as e:
            logger.error(f"发生错误:{str(e)}")
            raise ValueError("视频处理失败")