feat(note): 添加视频理解功能

- 在 GPT 模型中增加 video_img_urls 字段,用于存储视频截图

- 在笔记生成请求中添加视频理解相关参数
- 实现视频截图功能,支持按指定间隔生成截图
- 更新笔记生成逻辑,支持视频理解功能
- 在前端服务中添加视频理解相关参数
This commit is contained in:
黄建武
2025-05-02 23:47:15 +08:00
parent e4c1c0f7d1
commit 6e084f720d
10 changed files with 576 additions and 534 deletions

View File

@@ -1,238 +1,189 @@
/* NoteForm.tsx ---------------------------------------------------- */
import {
Form,
FormControl,
FormField,
FormItem,
FormLabel,
FormMessage,
Form, FormControl, FormField, FormItem, FormLabel, FormMessage,
} from '@/components/ui/form.tsx'
import { useEffect } from 'react'
import { Input } from '@/components/ui/input.tsx'
import {
Select,
SelectContent,
SelectItem,
SelectTrigger,
SelectValue,
} from '@/components/ui/select.tsx'
import { Button } from '@/components/ui/button.tsx'
import { Checkbox } from '@/components/ui/checkbox.tsx'
import { useForm } from 'react-hook-form'
import { z } from 'zod'
import { useForm, useWatch } from 'react-hook-form'
import { zodResolver } from '@hookform/resolvers/zod'
import { Info, Clock, Loader2 } from 'lucide-react'
import { message } from 'antd'
import {
Tooltip,
TooltipContent,
TooltipProvider,
TooltipTrigger,
} from '@/components/ui/tooltip.tsx'
import { z } from 'zod'
import { Info, Loader2 } from 'lucide-react'
import { message, Alert } from 'antd'
import { generateNote } from '@/services/note.ts'
import { useTaskStore } from '@/store/taskStore'
import NoteHistory from '@/pages/HomePage/components/NoteHistory.tsx'
import { useModelStore } from '@/store/modelStore'
import { Alert } from 'antd'
import { Textarea } from '@/components/ui/textarea.tsx'
import { ScrollArea } from '@/components/ui/scroll-area.tsx'
import { uploadFile } from '@/services/upload.ts'
// ✅ 定义表单 schema
const formSchema = z
.object({
import { useTaskStore } from '@/store/taskStore'
import { useModelStore } from '@/store/modelStore'
import {Tooltip, TooltipContent, TooltipProvider, TooltipTrigger} from "@/components/ui/tooltip.tsx";
import {Checkbox} from "@/components/ui/checkbox.tsx";
import {ScrollArea} from "@/components/ui/scroll-area.tsx";
import {Button} from "@/components/ui/button.tsx";
import {Select, SelectContent, SelectItem, SelectTrigger, SelectValue} from "@/components/ui/select.tsx";
import {Input} from "@/components/ui/input.tsx";
import {Textarea} from "@/components/ui/textarea.tsx";
/* -------------------- 常量 -------------------- */
/** Checkbox options for which elements to include in the generated note. */
const noteFormats = [
  { value: 'toc', label: '目录' },
  { value: 'link', label: '原片跳转' },
  { value: 'screenshot', label: '原片截图' },
  { value: 'summary', label: 'AI总结' },
] as const

/** Selectable writing styles for the generated note. */
const noteStyles = [
  { value: 'minimal', label: '精简' },
  { value: 'detailed', label: '详细' },
  { value: 'tutorial', label: '教程' },
  { value: 'academic', label: '学术' },
  { value: 'xiaohongshu', label: '小红书' },
  { value: 'life_journal', label: '生活向' },
  { value: 'task_oriented', label: '任务导向' },
  { value: 'business', label: '商业风格' },
  { value: 'meeting_minutes', label: '会议纪要' },
] as const
/* -------------------- 校验 Schema -------------------- */
const formSchema = z.object({
video_url: z.string(),
platform: z.string().nonempty('请选择平台'),
quality: z.enum(['fast', 'medium', 'slow'], {
required_error: '请选择音频质量',
}),
quality: z.enum(['fast', 'medium', 'slow']),
screenshot: z.boolean().optional(),
link: z.boolean().optional(),
model_name: z.string().nonempty('请选择模型'),
format: z.array(z.string()).default([]),
style: z.string().nonempty('请选择笔记生成风格'),
extras: z.string().optional(),
})
.superRefine((data, ctx) => {
const { video_url, platform } = data
video_understanding: z.boolean().optional(),
video_interval: z.coerce.number().min(1).max(30).default(4).optional(),
grid_size: z.tuple([
z.coerce.number().min(1).max(10),
z.coerce.number().min(1).max(10),
]).default([3, 3]).optional(),
}).superRefine(({ video_url, platform }, ctx) => {
if (platform === 'local' || platform === 'douyin') {
if (!video_url || typeof video_url !== 'string') {
ctx.addIssue({
code: z.ZodIssueCode.custom,
message: '本地视频路径不能为空',
path: ['video_url'],
})
if (!video_url) {
ctx.addIssue({ code: 'custom', message: '本地视频路径不能为空', path: ['video_url'] })
}
} else {
try {
const url = new URL(video_url)
if (!(url.protocol === 'http:' || url.protocol === 'https:')) {
throw new Error()
}
if (!['http:', 'https:'].includes(url.protocol)) throw new Error()
} catch {
ctx.addIssue({
code: z.ZodIssueCode.custom,
message: '请输入正确的视频链接',
path: ['video_url'],
})
ctx.addIssue({ code: 'custom', message: '请输入正确的视频链接', path: ['video_url'] })
}
}
})
})
type NoteFormValues = z.infer<typeof formSchema>
const noteFormats = [
{
label: '目录',
value: 'toc',
},
{ label: '原片跳转', value: 'link' },
{ label: '原片截图', value: 'screenshot' },
{ label: 'AI总结', value: 'summary' },
]
const noteStyles = [
{
label: '精简',
value: 'minimal', // 简洁、快速呈现要点
},
{
label: '详细',
value: 'detailed', // 详细记录,包含时间戳、关键点
},
{
label: '教程',
value: 'tutorial', // 详细记录,包含时间戳、关键点
},
{
label: '学术',
value: 'academic', // 适合学术报告,正式且结构化
},
{
label: '小红书',
value: 'xiaohongshu', // 适合社交平台分享,亲切、口语化
},
{
label: '生活向',
value: 'life_journal', // 记录个人生活感悟,情感化表达
},
{
label: '任务导向',
value: 'task_oriented', // 强调任务、目标,适合工作和待办事项
},
{
label: '商业风格',
value: 'business', // 适合商业报告、会议纪要,正式且精准
},
{
label: '会议纪要',
value: 'meeting_minutes', // 适合商业报告、会议纪要,正式且精准
},
]
const NoteForm = () => {
useTaskStore(state => state.tasks)
const setCurrentTask = useTaskStore(state => state.setCurrentTask)
const currentTaskId = useTaskStore(state => state.currentTaskId)
const getCurrentTask = useTaskStore(state => state.getCurrentTask)
const loadEnabledModels = useModelStore(state => state.loadEnabledModels)
const modelList = useModelStore(state => state.modelList)
const showFeatureHint = useModelStore(state => state.showFeatureHint)
const setShowFeatureHint = useModelStore(state => state.setShowFeatureHint)
const form = useForm<NoteFormValues>({
resolver: zodResolver(formSchema),
defaultValues: {
video_url: '',
platform: 'bilibili',
quality: 'medium', // 默认中等质量
screenshot: false,
model_name: modelList[0]?.model_name || '', // 确保有值
format: [], // 初始化为空数组
style: 'minimal', // 默认选择精简风格
extras: '', // 初始化为空字符串
},
})
const platform = form.watch('platform')
const onClose = () => {
setShowFeatureHint(false)
}
const isGenerating = () => {
console.log('🚀 isGenerating', getCurrentTask()?.status)
return (
getCurrentTask()?.status != 'SUCCESS' &&
getCurrentTask()?.status != 'FAILED' &&
getCurrentTask()?.status != undefined
)
}
const handleFileUpload = async (file: File, onSuccess: (url: string) => void) => {
if (!file) return
const formData = new FormData()
formData.append('file', file)
try {
const res = await uploadFile(formData)
if (res.data.code === 0) {
const uploadedUrl = res.data.data.url
console.log('✅ 上传成功', uploadedUrl)
onSuccess(uploadedUrl)
}
} catch (error) {
console.error('上传失败', error)
// 可以弹个 toast 或者提示上传失败
}
}
// TODO 修复选择其他视频平台以后再选择本地视频还可以选择 Link 的问题
const onSubmit = async (data: NoteFormValues) => {
console.log('🎯 提交内容:', data)
message.success('提交任务')
const payload = {
video_url: data.video_url,
platform: data.platform,
quality: data.quality,
model_name: data.model_name,
provider_id: modelList.find(model => model.model_name === data.model_name).provider_id,
format: data.format,
style: data.style,
extras: data.extras,
}
const res = await generateNote(payload)
const taskId = res.data.task_id
useTaskStore.getState().addPendingTask(taskId, data.platform, payload)
}
useEffect(() => {
loadEnabledModels()
}, [])
return (
<>
<ScrollArea className="sm:h-[400px] md:h-[800px]">
<Form {...form}>
<form onSubmit={form.handleSubmit(onSubmit)} className="space-y-2">
<div className="flex w-full items-center gap-2 py-1.5">
<Button
type="submit"
className="bg-primary w-full sm:w-full"
disabled={isGenerating()}
>
{isGenerating() && <Loader2 className="mr-2 h-4 w-4 animate-spin" />}
{isGenerating() ? '正在生成…' : '生成笔记'}
</Button>
</div>
<div className="space-y-2">
/* -------------------- 可复用子组件 -------------------- */
const SectionHeader = ({ title, tip }: { title: string; tip?: string }) => (
<div className="my-3 flex items-center justify-between">
<h2 className="block"></h2>
<h2 className="block">{title}</h2>
{tip && (
<TooltipProvider>
<Tooltip>
<TooltipTrigger asChild>
<Info className="hover:text-primary h-4 w-4 cursor-pointer text-neutral-400" />
</TooltipTrigger>
<TooltipContent>
<p className="text-xs">YouTube等平台</p>
</TooltipContent>
<TooltipContent className="text-xs">{tip}</TooltipContent>
</Tooltip>
</TooltipProvider>
)}
</div>
)
/**
 * Renders the note-format checkboxes.
 * `disabledMap` disables individual options by their `value` key;
 * toggling calls `onChange` with the full updated selection array.
 */
const CheckboxGroup = ({
  value = [], onChange, disabledMap,
}: {
  value?: string[]
  onChange: (v: string[]) => void
  disabledMap: Record<string, boolean>
}) => (
  <div className="flex flex-wrap space-x-1.5">
    {noteFormats.map(fmt => (
      <label key={fmt.value} className="flex items-center space-x-2">
        <Checkbox
          checked={value.includes(fmt.value)}
          disabled={disabledMap[fmt.value]}
          onCheckedChange={checked => {
            // add on check, remove on uncheck — never mutate the incoming array
            const next = checked
              ? [...value, fmt.value]
              : value.filter(item => item !== fmt.value)
            onChange(next)
          }}
        />
        <span>{fmt.label}</span>
      </label>
    ))}
  </div>
)
/* -------------------- 主组件 -------------------- */
const NoteForm = () => {
/* ---- 全局状态 ---- */
const { addPendingTask, currentTaskId, getCurrentTask } = useTaskStore()
const { loadEnabledModels, modelList, showFeatureHint, setShowFeatureHint } = useModelStore()
/* ---- 表单 ---- */
const form = useForm<NoteFormValues>({
resolver: zodResolver(formSchema),
defaultValues: {
platform: 'bilibili',
quality: 'medium',
model_name: modelList[0]?.model_name || '',
style: 'minimal',
video_interval: 4,
grid_size: [3, 3],
format: [],
},
})
/* ---- 派生状态(只 watch 一次,提高性能) ---- */
const platform = useWatch({ control: form.control, name: 'platform' }) as string
const videoUnderstandingEnabled = useWatch({ control: form.control, name: 'video_understanding' })
/* ---- 副作用 ---- */
useEffect(() => {
loadEnabledModels()
return}, [])
/* ---- 帮助函数 ---- */
const isGenerating = () =>
!['SUCCESS', 'FAILED', undefined].includes(getCurrentTask()?.status)
const handleFileUpload = async (file: File, cb: (url: string) => void) => {
const formData = new FormData()
formData.append('file', file)
try {
const { data } = await uploadFile(formData)
if (data.code === 0) cb(data.data.url)
} catch (err) {
console.error('上传失败:', err)
message.error('上传失败,请重试')
}
}
const onSubmit = async (values: NoteFormValues) => {
const payload:NoteFormValues = {
...values,
provider_id: modelList.find(m => m.model_name === values.model_name)!.provider_id,
}
message.success('已提交任务')
const { data } = await generateNote(payload)
addPendingTask(data.task_id, values.platform, payload)
}
/* -------------------- 渲染 -------------------- */
return (
<ScrollArea className="sm:h-[400px] md:h-[800px]">
<Form {...form}>
<form onSubmit={form.handleSubmit(onSubmit)} className="space-y-4">
{/* 顶部按钮 */}
<Button type="submit" className="w-full bg-primary" disabled={isGenerating()}>
{isGenerating() && <Loader2 className="mr-2 h-4 w-4 animate-spin" />}
{isGenerating() ? '正在生成…' : '生成笔记'}
</Button>
{/* 视频链接 & 平台 */}
<SectionHeader title="视频链接" tip="支持 B 站、YouTube 等平台" />
<div className="flex gap-2">
{/* 平台选择 */}
<FormField
@@ -242,13 +193,11 @@ const NoteForm = () => {
<FormItem>
<Select onValueChange={field.onChange} defaultValue={field.value}>
<FormControl>
<SelectTrigger className="w-32">
<SelectValue placeholder="选择平台" />
</SelectTrigger>
<SelectTrigger className="w-32"><SelectValue /></SelectTrigger>
</FormControl>
<SelectContent>
<SelectItem value="bilibili"></SelectItem>
<SelectItem value="youtube">Youtube</SelectItem>
<SelectItem value="youtube">YouTube</SelectItem>
<SelectItem value="douyin"></SelectItem>
<SelectItem value="local"></SelectItem>
</SelectContent>
@@ -257,274 +206,198 @@ const NoteForm = () => {
</FormItem>
)}
/>
{/* 视频地址 */}
{/* 链接输入 / 上传框 */}
<FormField
control={form.control}
name="video_url"
render={({ field }) => (
<FormItem className="flex-1">
<FormControl>
{form.watch('platform') === 'local' ? (
<div className="flex flex-col gap-2">
{/* 第一行:本地路径输入框 */}
<Input placeholder="请输入本地视频路径" {...field} className="w-full" />
</div>
) : (
<Input placeholder="请输入视频网站链接" {...field} />
)}
</FormControl>
<FormMessage />
</FormItem>
)}
/>
</div>
<FormField
control={form.control}
name="video_url"
render={({ field }) => (
<FormItem className="w-full">
<FormControl>
{form.watch('platform') === 'local' ? (
{platform === 'local' ? (
<>
<Input placeholder="请输入本地视频路径" {...field} />
<div
className="hover:border-primary flex h-40 w-full cursor-pointer items-center justify-center rounded-md border-2 border-dashed border-gray-300 transition-colors"
onDragOver={e => {
e.preventDefault()
e.stopPropagation()
}}
className="hover:border-primary mt-2 flex h-40 cursor-pointer items-center justify-center rounded-md border-2 border-dashed border-gray-300 transition-colors"
onDragOver={e => { e.preventDefault(); e.stopPropagation() }}
onDrop={e => {
e.preventDefault()
e.stopPropagation()
const file = e.dataTransfer.files?.[0]
if (file) {
handleFileUpload(file, uploadedUrl => {
field.onChange(uploadedUrl)
})
}
if (file) handleFileUpload(file, field.onChange)
}}
onClick={() => {
const input = document.createElement('input')
input.type = 'file'
input.accept = 'video/*'
input.onchange = (e: any) => {
const file = e.target.files?.[0]
if (file) {
handleFileUpload(file, uploadedUrl => {
field.onChange(uploadedUrl)
})
}
input.onchange = e => {
const file = (e.target as HTMLInputElement).files?.[0]
if (file) handleFileUpload(file, field.onChange)
}
input.click()
}}
>
<div className="text-center text-sm text-gray-500">
<p className="mb-2"></p>
<p className="text-xs text-gray-400"></p>
</div>
<p className="text-center text-sm text-gray-500">
<br />
<span className="text-xs text-gray-400"></span>
</p>
</div>
</>
) : (
<></>
<Input placeholder="请输入视频网站链接" {...field} />
)}
</FormControl>
{/* ❗可以不要FormMessage不然重复两次报错提示 */}
<FormMessage />
</FormItem>
)}
/>
{/*<p className="text-xs text-neutral-500">*/}
{/* 支持哔哩哔哩视频链接,例如:*/}
{/* https://www.bilibili.com/video/BV1vc25YQE9X/*/}
{/*</p>*/}
{/*<FormField*/}
{/* control={form.control}*/}
{/* name="quality"*/}
{/* render={({ field }) => (*/}
{/* <FormItem>*/}
{/* <div className="my-3 flex items-center justify-between">*/}
{/* <h2 className="block">音频质量</h2>*/}
{/* <TooltipProvider>*/}
{/* <Tooltip>*/}
{/* <TooltipTrigger asChild>*/}
{/* <Info className="hover:text-primary h-4 w-4 cursor-pointer text-neutral-400" />*/}
{/* </TooltipTrigger>*/}
{/* <TooltipContent>*/}
{/* <p className="max-w-[200px] text-xs">*/}
{/* 质量越高,下载体积越大,速度越慢*/}
{/* </p>*/}
{/* </TooltipContent>*/}
{/* </Tooltip>*/}
{/* </TooltipProvider>*/}
{/* </div>*/}
{/* <Select onValueChange={field.onChange} defaultValue={field.value}>*/}
{/* <FormControl>*/}
{/* <SelectTrigger className="w-full">*/}
{/* <SelectValue placeholder="选择质量" />*/}
{/* </SelectTrigger>*/}
{/* </FormControl>*/}
{/* <SelectContent>*/}
{/* <SelectItem value="fast">快速(压缩)</SelectItem>*/}
{/* <SelectItem value="medium">中等(推荐)</SelectItem>*/}
{/* <SelectItem value="slow">高质量(清晰)</SelectItem>*/}
{/* </SelectContent>*/}
{/* </Select>*/}
{/* /!*<FormDescription className="text-xs text-neutral-500">*!/*/}
{/* /!* 质量越高,下载体积越大,速度越慢*!/*/}
{/* /!*</FormDescription>*!/*/}
{/* <FormMessage />*/}
{/* </FormItem>*/}
{/* )}*/}
{/*/>*/}
</div>
{/* 模型选择 */}
<FormField
control={form.control}
name="model_name"
render={({ field }) => (
<FormItem>
<div className="my-3 flex items-center justify-between">
<h2 className="block"></h2>
<TooltipProvider>
<Tooltip>
<TooltipTrigger asChild>
<Info className="hover:text-primary h-4 w-4 cursor-pointer text-neutral-400" />
</TooltipTrigger>
<TooltipContent>
<p className="max-w-[200px] text-xs">
</p>
</TooltipContent>
</Tooltip>
</TooltipProvider>
</div>
<SectionHeader title="模型选择" tip="不同模型效果不同,建议自行测试" />
<Select onValueChange={field.onChange} defaultValue={field.value}>
<FormControl>
<SelectTrigger className="w-full">
<SelectValue placeholder="选择配置好的模型" />
</SelectTrigger>
<SelectTrigger className="w-full"><SelectValue /></SelectTrigger>
</FormControl>
<SelectContent>
{modelList.map(item => {
return <SelectItem value={item.model_name}>{item.model_name}</SelectItem>
})}
{modelList.map(m => (
<SelectItem key={m.model_name} value={m.model_name}>
{m.model_name}
</SelectItem>
))}
</SelectContent>
</Select>
{/*<FormDescription className="text-xs text-neutral-500">*/}
{/* 质量越高,下载体积越大,速度越慢*/}
{/*</FormDescription>*/}
<FormMessage />
</FormItem>
)}
/>
{/* 视频理解 */}
<SectionHeader title="视频理解" tip="将视频截图发给多模态模型辅助分析" />
<div className="flex flex-col gap-2">
<FormField
control={form.control}
name="video_understanding"
render={({ field }) => (
<FormItem>
<div className="flex items-center gap-2">
<FormLabel></FormLabel>
<Checkbox
checked={videoUnderstandingEnabled}
onCheckedChange={v => form.setValue('video_understanding', v)}
/>
</div>
<FormMessage />
</FormItem>
)}
/>
<Alert
type="info"
message="推荐多模态模型qwen2.5-vl-72b-instruct / gpt-4o"
className="text-sm"
/>
<div className="grid grid-cols-2 gap-4">
{/* 采样间隔 */}
<FormField
control={form.control}
name="video_interval"
render={({ field }) => (
<FormItem>
<FormLabel></FormLabel>
<Input
disabled={!videoUnderstandingEnabled}
type="number"
{...field}
/>
<FormMessage />
</FormItem>
)}
/>
{/* 拼图大小 */}
<FormField
control={form.control}
name="grid_size"
render={({ field }) => (
<FormItem>
<FormLabel> × </FormLabel>
<div className="flex items-center space-x-2">
<Input
disabled={!videoUnderstandingEnabled}
type="number"
value={field.value?.[0] || 3}
onChange={e =>
field.onChange([+e.target.value, field.value?.[1] || 3])}
className="w-16"
/>
<span>x</span>
<Input
disabled={!videoUnderstandingEnabled}
type="number"
value={field.value?.[1] || 3}
onChange={e =>
field.onChange([field.value?.[0] || 3, +e.target.value])}
className="w-16"
/>
</div>
<FormMessage />
</FormItem>
)}
/>
</div>
</div>
{/* 笔记风格 */}
<FormField
control={form.control}
name="style"
render={({ field }) => (
<FormItem>
<div className="my-3 flex items-center justify-between">
<h2 className="block"></h2>
<TooltipProvider>
<Tooltip>
<TooltipTrigger asChild>
<Info className="hover:text-primary h-4 w-4 cursor-pointer text-neutral-400" />
</TooltipTrigger>
<TooltipContent>
<p className="max-w-[200px] text-xs"></p>
</TooltipContent>
</Tooltip>
</TooltipProvider>
</div>
<SectionHeader title="笔记风格" tip="选择生成笔记的呈现风格" />
<Select onValueChange={field.onChange} defaultValue={field.value}>
<FormControl>
<SelectTrigger className="w-full">
<SelectValue placeholder="选择笔记风格" />
</SelectTrigger>
<SelectTrigger className="w-full"><SelectValue /></SelectTrigger>
</FormControl>
<SelectContent>
{noteStyles.map(item => (
<SelectItem key={item.value} value={item.value}>
{item.label}
</SelectItem>
{noteStyles.map(({ label, value }) => (
<SelectItem key={value} value={value}>{label}</SelectItem>
))}
</SelectContent>
</Select>
<FormMessage />
</FormItem>
)}
/>
{/* 笔记格式 */}
<FormField
control={form.control}
name="format"
render={({ field }) => (
<FormItem>
<div className="my-3 flex items-center justify-between">
<h2 className="block"></h2>
<TooltipProvider>
<Tooltip>
<TooltipTrigger asChild>
<Info className="hover:text-primary h-4 w-4 cursor-pointer text-neutral-400" />
</TooltipTrigger>
<TooltipContent>
<p className="text-xs">
</p>
</TooltipContent>
</Tooltip>
</TooltipProvider>
</div>
<FormControl>
<div className="flex flex-wrap space-x-1.5">
{noteFormats.map(item => (
<label key={item.value} className="flex items-center space-x-2">
<Checkbox
checked={field.value?.includes(item.value)}
disabled={item.value === 'link' && platform === 'local'}
onCheckedChange={checked => {
const currentValue = field.value ?? [] // ✨ 保底是数组
if (checked) {
field.onChange([...currentValue, item.value])
} else {
field.onChange(currentValue.filter(v => v !== item.value))
}
<SectionHeader title="笔记格式" tip="选择要包含的笔记元素" />
<CheckboxGroup
value={field.value}
onChange={field.onChange}
disabledMap={{
link: platform === 'local',
screenshot: !videoUnderstandingEnabled,
}}
/>
<span>{item.label}</span>
</label>
))}
</div>
</FormControl>
<FormField
control={form.control}
name="extras"
render={({ field }) => (
<FormItem>
<div className="my-3 flex items-center justify-between">
<h2 className="block"></h2>
<TooltipProvider>
<Tooltip>
<TooltipTrigger asChild>
<Info className="hover:text-primary h-4 w-4 cursor-pointer text-neutral-400" />
</TooltipTrigger>
<TooltipContent>
<p className="text-xs">Prompt最后 </p>
</TooltipContent>
</Tooltip>
</TooltipProvider>
</div>
<Textarea placeholder={'笔记需要罗列出 xxx 关键点'} {...field} />
{/*<FormDescription className="text-xs text-neutral-500">*/}
{/* 质量越高,下载体积越大,速度越慢*/}
{/*</FormDescription>*/}
<FormMessage />
</FormItem>
)}
/>
{/* 备注 */}
<FormField
control={form.control}
name="extras"
render={({ field }) => (
<FormItem>
<SectionHeader title="备注" tip="可在 Prompt 结尾附加自定义说明" />
<Textarea placeholder="笔记需要罗列出 xxx 关键点…" {...field} />
<FormMessage />
</FormItem>
)}
@@ -532,11 +405,6 @@ const NoteForm = () => {
</form>
</Form>
</ScrollArea>
{/* 添加一些额外的说明或功能介绍 */}
{/*<div className="bg-primary-light mt-6 rounded-lg p-4"></div>*/}
</>
)
}

View File

@@ -12,6 +12,9 @@ export const generateNote = async (data: {
format: Array<string>
style: string
extras?: string
video_understand?: boolean
video_interval?: number
grid_size:Array<number>
}) => {
try {
const response = await request.post('/generate_note', data)

View File

@@ -40,7 +40,6 @@ def generate_base_prompt(title, segment_text, tags, _format=None, style=None, ex
# 添加额外内容
if extras:
prompt += f"\n{extras}"
print(prompt)
return prompt
@@ -104,9 +103,9 @@ def get_link_format():
def get_screenshot_format():
return '''
11. **原片截图**: 如果某个部分涉及**视觉演示**或任何能帮助理解的内容,插入截图提示
- 格式:`*Screenshot-[mm:ss]`
至少插入 1-3张截图
11. **原片截图**:你收到的截图一般是一个网格网格的每张图片就是一个时间点左上角会包含时间mm:ss的格式请你结合我发你的图片插入截图提示请你帮助用户更好的理解视频内容请你认真的分析每个图片和对应的转写文案插入最合适的内容来备注用户理解请一定按照这个格式 返回否则系统无法解析
- 格式:`*Screenshot-[mm:ss]`
'''

View File

@@ -7,6 +7,7 @@ from app.models.transcriber_model import TranscriptSegment
from datetime import timedelta
from typing import List
class UniversalGPT(GPT):
def __init__(self, client, model: str, temperature: float = 0.7):
self.client = client
@@ -28,21 +29,41 @@ class UniversalGPT(GPT):
def ensure_segments_type(self, segments) -> List[TranscriptSegment]:
    """Normalize a mixed list of dicts / TranscriptSegment into TranscriptSegment objects.

    Dicts are unpacked into the model constructor; existing instances pass through.
    """
    normalized = []
    for seg in segments:
        if isinstance(seg, dict):
            normalized.append(TranscriptSegment(**seg))
        else:
            normalized.append(seg)
    return normalized
def create_messages(self, segments: List[TranscriptSegment],**kwargs):
print("UniversalGPT",kwargs)
content =generate_base_prompt(
def create_messages(self, segments: List[TranscriptSegment], **kwargs):
content_text = generate_base_prompt(
title=kwargs.get('title'),
segment_text=self._build_segment_text(segments),
tags=kwargs.get('tags'),
_format=kwargs.get('_format'),
style=kwargs.get('style'),
extras=kwargs.get('extras')
extras=kwargs.get('extras'),
)
return [{"role": "user", "content": content }]
# ⛳ 组装 content 数组,支持 text + image_url 混合
content = [{"type": "text", "text": content_text}]
video_img_urls = kwargs.get('video_img_urls', [])
for url in video_img_urls:
content.append({
"type": "image_url",
"image_url": {
"url": url,
"detail": "auto"
}
})
# ✅ 正确格式:整体包在一个 message 里role + content array
messages = [{
"role": "user",
"content": content
}]
return messages
def list_models(self):
return self.client.models.list()
def summarize(self, source: GPTSource) -> str:
self.screenshot = source.screenshot
self.link = source.link
@@ -51,8 +72,8 @@ class UniversalGPT(GPT):
messages = self.create_messages(
source.segment,
title=source.title,
tags=source.tags
,
tags=source.tags,
video_img_urls=source.video_img_urls,
_format=source._format,
style=source.style,
extras=source.extras
@@ -63,7 +84,3 @@ class UniversalGPT(GPT):
temperature=0.7
)
return response.choices[0].message.content.strip()
if __name__ == '__main__':
print('s')

View File

@@ -14,4 +14,5 @@ class GPTSource:
style: Optional[str] = None
extras: Optional[str] = None
_format: Optional[list] = None
video_img_urls: Optional[list] = None

View File

@@ -37,12 +37,15 @@ class VideoRequest(BaseModel):
quality: DownloadQuality
screenshot: Optional[bool] = False
link: Optional[bool] = False
model_name:str
provider_id:str
model_name: str
provider_id: str
task_id: Optional[str] = None
format:Optional[list]=[]
style:str=None
extras:Optional[str]
format: Optional[list] = []
style: str = None
extras: Optional[str]
video_understanding: Optional[bool] = False
video_interval: Optional[int] = 0
grid_size: Optional[list] = []
@field_validator("video_url")
def validate_supported_url(cls, v):
@@ -59,6 +62,7 @@ class VideoRequest(BaseModel):
NOTE_OUTPUT_DIR = "note_results"
UPLOAD_DIR = "uploads"
def save_note_to_file(task_id: str, note):
os.makedirs(NOTE_OUTPUT_DIR, exist_ok=True)
with open(os.path.join(NOTE_OUTPUT_DIR, f"{task_id}.json"), "w", encoding="utf-8") as f:
@@ -66,8 +70,10 @@ def save_note_to_file(task_id: str, note):
def run_note_task(task_id: str, video_url: str, platform: str, quality: DownloadQuality,
link: bool = False,screenshot: bool = False,model_name:str=None,provider_id:str=None,
_format:list=None,style:str=None,extras:str=None):
link: bool = False, screenshot: bool = False, model_name: str = None, provider_id: str = None,
_format: list = None, style: str = None, extras: str = None, video_understanding: bool = False,
video_interval=0, grid_size=[]
):
try:
if not model_name or not provider_id:
raise HTTPException(status_code=400, detail="请选择模型和提供者")
@@ -84,25 +90,26 @@ def run_note_task(task_id: str, video_url: str, platform: str, quality: Download
style=style,
extras=extras,
screenshot=screenshot
, video_understanding=video_understanding,
video_interval=video_interval,
grid_size=grid_size
)
print('Note 结果',note)
logger.info(f"Note generated: {task_id}")
save_note_to_file(task_id, note)
except Exception as e:
save_note_to_file(task_id, {"error": str(e)})
@router.post('/delete_task')
def delete_task(data:RecordRequest):
def delete_task(data: RecordRequest):
try:
NoteGenerator().delete_note(video_id=data.video_id,platform=data.platform)
NoteGenerator().delete_note(video_id=data.video_id, platform=data.platform)
return R.success(msg='删除成功')
except Exception as e:
return R.error(msg=e)
@router.post("/upload")
async def upload(file: UploadFile = File(...)):
os.makedirs(UPLOAD_DIR, exist_ok=True)
@@ -114,6 +121,7 @@ async def upload(file: UploadFile = File(...)):
# 假设你静态目录挂载了 /uploads
return R.success({"url": f"/uploads/{file.filename}"})
@router.post("/generate_note")
def generate_note(data: VideoRequest, background_tasks: BackgroundTasks):
try:
@@ -128,7 +136,6 @@ def generate_note(data: VideoRequest, background_tasks: BackgroundTasks):
#
# )
if data.task_id:
# 如果传了task_id说明是重试
task_id = data.task_id
@@ -139,14 +146,14 @@ def generate_note(data: VideoRequest, background_tasks: BackgroundTasks):
# 正常新建任务
task_id = str(uuid.uuid4())
background_tasks.add_task(run_note_task, task_id, data.video_url, data.platform, data.quality,data.link ,data.screenshot,data.model_name,data.provider_id,data.format,data.style,data.extras)
background_tasks.add_task(run_note_task, task_id, data.video_url, data.platform, data.quality, data.link,
data.screenshot, data.model_name, data.provider_id, data.format, data.style,
data.extras, data.video_understanding, data.video_interval, data.grid_size)
return R.success({"task_id": task_id})
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@router.get("/task_status/{task_id}")
def get_task_status(task_id: str):
status_path = os.path.join(NOTE_OUTPUT_DIR, f"{task_id}.status.json")

View File

@@ -28,7 +28,7 @@ from app.services.constant import SUPPORT_PLATFORM_MAP
from app.services.provider import ProviderService
from app.transcriber.base import Transcriber
from app.transcriber.transcriber_provider import get_transcriber,_transcribers
from app.transcriber.transcriber_provider import get_transcriber, _transcribers
from app.transcriber.whisper import WhisperTranscriber
import re
@@ -39,12 +39,13 @@ from app.utils.video_helper import generate_screenshot
# from app.services.gpt import summarize_text
from dotenv import load_dotenv
from app.utils.logger import get_logger
from app.utils.video_reader import VideoReader
from events import transcription_finished
logger = get_logger(__name__)
load_dotenv()
api_path = os.getenv("API_BASE_URL", "http://localhost")
BACKEND_PORT= os.getenv("BACKEND_PORT", 8000)
BACKEND_PORT = os.getenv("BACKEND_PORT", 8000)
BACKEND_BASE_URL = f"{api_path}:{BACKEND_PORT}"
output_dir = os.getenv('OUT_DIR')
@@ -53,11 +54,12 @@ logger.info("starting up")
NOTE_OUTPUT_DIR = "note_results"
class NoteGenerator:
def __init__(self):
self.model_size: str = 'base'
self.device: Union[str, None] = None
self.transcriber_type = os.getenv('TRANSCRIBER_TYPE','fast-whisper')
self.transcriber_type = os.getenv('TRANSCRIBER_TYPE', 'fast-whisper')
self.transcriber = self.get_transcriber()
self.video_path = None
logger.info("初始化NoteGenerator")
@@ -94,7 +96,7 @@ class NoteGenerator:
return gpt
def get_downloader(self, platform: str) -> Downloader:
downloader =SUPPORT_PLATFORM_MAP[platform]
downloader = SUPPORT_PLATFORM_MAP[platform]
if downloader:
logger.info(f"使用{downloader}下载器")
return downloader
@@ -120,7 +122,7 @@ class NoteGenerator:
insert_video_task(video_id=video_id, platform=platform, task_id=task_id)
def insert_screenshots_into_markdown(self, markdown: str, video_path: str, image_base_url: str,
output_dir: str,_format:list) -> str:
output_dir: str, _format: list) -> str:
"""
扫描 markdown 中的 *Screenshot-xx:xx生成截图并插入 markdown 图片
:param markdown:
@@ -128,7 +130,7 @@ class NoteGenerator:
"""
matches = self.extract_screenshot_timestamps(markdown)
new_markdown = markdown
print(f"匹配到的截图:{matches}")
logger.info(f"开始为笔记生成截图")
try:
for idx, (marker, ts) in enumerate(matches):
@@ -137,7 +139,7 @@ class NoteGenerator:
image_url = f"{BACKEND_BASE_URL.rstrip('/')}/{image_relative_path.lstrip('/')}"
replacement = f"![]({image_url})"
new_markdown = new_markdown.replace(marker, replacement, 1)
print(f"替换后的 markdown{new_markdown}")
return new_markdown
except Exception as e:
@@ -180,14 +182,18 @@ class NoteGenerator:
_format: list = None,
style: str = None,
extras: str = None,
path: Union[str, None] = None
path: Union[str, None] = None,
video_understanding: bool = False,
video_interval=0,
grid_size=[]
) -> NoteResult:
try:
logger.info(f"🎯 开始解析并生成笔记task_id={task_id}")
self.update_task_status(task_id, TaskStatus.PARSING)
downloader = self.get_downloader(platform)
gpt = self.get_gpt(model_name=model_name, provider_id=provider_id)
video_img_urls = []
audio_cache_path = os.path.join(NOTE_OUTPUT_DIR, f"{task_id}_audio.json")
transcript_cache_path = os.path.join(NOTE_OUTPUT_DIR, f"{task_id}_transcript.json")
markdown_cache_path = os.path.join(NOTE_OUTPUT_DIR, f"{task_id}_markdown.md")
@@ -201,11 +207,20 @@ class NoteGenerator:
audio_data = json.load(f)
audio = AudioDownloadResult(**audio_data)
else:
if 'screenshot' in _format:
if 'screenshot' in _format or video_understanding:
video_path = downloader.download_video(video_url)
self.video_path = video_path
logger.info(f"成功下载视频文件: {video_path}")
screenshot= 'screenshot' in _format
video_img_urls = VideoReader(
video_path=video_path,
grid_size=tuple(grid_size),
frame_interval=video_interval,
unit_width=1280,
unit_height=720,
save_quality=90,
).run()
screenshot = 'screenshot' in _format
audio: AudioDownloadResult = downloader.download(
video_url=video_url,
quality=quality,
@@ -261,6 +276,7 @@ class NoteGenerator:
segment=transcript.segments,
tags=audio.raw_info.get('tags'),
screenshot=screenshot,
video_img_urls=video_img_urls,
link=link,
_format=_format,
style=style,
@@ -279,12 +295,13 @@ class NoteGenerator:
# -------- 4. 插入截图 --------
if _format and 'screenshot' in _format:
try:
markdown = self.insert_screenshots_into_markdown(markdown, self.video_path, image_base_url, output_dir,_format)
markdown = self.insert_screenshots_into_markdown(markdown, self.video_path, image_base_url,
output_dir, _format)
except Exception as e:
logger.warning(f"⚠️ 插入截图失败跳过处理task_id={task_id},错误信息:{e}")
if _format and 'link' in _format:
try:
markdown = replace_content_markers(markdown, video_id=audio.video_id,platform=platform)
markdown = replace_content_markers(markdown, video_id=audio.video_id, platform=platform)
except Exception as e:
logger.warning(f"⚠️ 插入链接失败跳过处理task_id={task_id},错误信息:{e}")
# 注意:截图失败不终止整体流程
@@ -310,7 +327,3 @@ class NoteGenerator:
logger.error(f"❌ 笔记生成流程异常终止task_id={task_id},错误信息:{e}")
self.update_task_status(task_id, TaskStatus.FAILED, message=str(e))
raise f'❌ 笔记生成流程异常终止task_id={task_id},错误信息:{e}'

View File

@@ -0,0 +1,134 @@
import base64
import os
import re
import subprocess
import ffmpeg
from PIL import Image, ImageDraw, ImageFont
from app.utils.logger import get_logger
logger = get_logger(__name__)
class VideoReader:
    """Turn a video into a small set of base64-encoded "grid" mosaics.

    Pipeline (see :meth:`run`):
      1. extract one JPEG frame every ``frame_interval`` seconds via ffmpeg;
      2. tile the frames into ``grid_size`` (cols x rows) mosaics, stamping
         each cell with its MM:SS timestamp;
      3. return the mosaics as ``data:image/jpeg;base64,...`` URIs, suitable
         as image input for a multimodal model.
    """

    def __init__(self,
                 video_path: str,
                 grid_size=(3, 3),
                 frame_interval=2,
                 unit_width=960,
                 unit_height=540,
                 save_quality=90,
                 font_path="fonts/arial.ttf",
                 frame_dir="data/output_frames",
                 grid_dir="data/grid_output"):
        # Source video and layout/quality knobs; output directories are
        # created on demand (run/extract_frames/concat_images).
        self.video_path = video_path
        self.grid_size = grid_size            # (cols, rows) of each mosaic
        self.frame_interval = frame_interval  # seconds between sampled frames
        self.unit_width = unit_width          # per-cell width in px
        self.unit_height = unit_height        # per-cell height in px
        self.save_quality = save_quality      # JPEG quality of the mosaics
        self.font_path = font_path            # timestamp font; default font if missing
        self.frame_dir = frame_dir            # raw extracted frames
        self.grid_dir = grid_dir              # assembled mosaic images

    def format_time(self, seconds: float) -> str:
        """Format an absolute offset as ``MM_SS`` (minutes not capped at 99)."""
        mm = int(seconds // 60)
        ss = int(seconds % 60)
        return f"{mm:02d}_{ss:02d}"

    def extract_time_from_filename(self, filename: str) -> float:
        """Recover absolute seconds from a ``frame_MM_SS.jpg`` name.

        ``\\d{2,}`` accepts 3+ digit minutes for videos longer than 99 min
        (format_time emits them unpadded-beyond-two-digits).  Non-matching
        names return ``inf`` so they sort last.
        """
        match = re.search(r"frame_(\d{2,})_(\d{2})\.jpg", filename)
        if match:
            mm, ss = map(int, match.groups())
            return mm * 60 + ss
        return float('inf')

    def extract_frames(self, max_frames=1000) -> list[str]:
        """Dump one JPEG per ``frame_interval`` seconds into ``frame_dir``.

        Returns the list of written frame paths (at most ``max_frames``).
        Raises ValueError if probing or extraction fails.
        """
        try:
            os.makedirs(self.frame_dir, exist_ok=True)
            duration = float(ffmpeg.probe(self.video_path)["format"]["duration"])
            timestamps = [i for i in range(0, int(duration), self.frame_interval)][:max_frames]
            image_paths = []
            for ts in timestamps:
                time_label = self.format_time(ts)
                output_path = os.path.join(self.frame_dir, f"frame_{time_label}.jpg")
                # Global options (-hide_banner/-loglevel) must precede -i;
                # placed after the output file ffmpeg treats them as trailing
                # options and ignores or rejects them.  -ss before -i gives
                # fast keyframe seeking.
                cmd = ["ffmpeg", "-hide_banner", "-loglevel", "error",
                       "-ss", str(ts), "-i", self.video_path,
                       "-frames:v", "1", "-q:v", "2", "-y", output_path]
                subprocess.run(cmd, check=True)
                image_paths.append(output_path)
            return image_paths
        except Exception as e:
            logger.error(f"分割帧发生错误:{str(e)}")
            raise ValueError("视频处理失败") from e

    def group_images(self) -> list[list[str]]:
        """Chunk extracted frames, in chronological order, into groups of
        ``cols * rows`` for tiling."""
        image_files = [os.path.join(self.frame_dir, f) for f in os.listdir(self.frame_dir) if
                       f.startswith("frame_") and f.endswith(".jpg")]
        image_files.sort(key=lambda f: self.extract_time_from_filename(os.path.basename(f)))
        group_size = self.grid_size[0] * self.grid_size[1]
        return [image_files[i:i + group_size] for i in range(0, len(image_files), group_size)]

    def concat_images(self, image_paths: list[str], name: str) -> str:
        """Tile one group of frames into a single mosaic ``<name>.jpg``.

        Each cell is resized to (unit_width, unit_height) and stamped with
        its MM:SS timestamp parsed from the filename.  Returns the saved path.
        """
        os.makedirs(self.grid_dir, exist_ok=True)
        font = ImageFont.truetype(self.font_path, 48) if os.path.exists(self.font_path) else ImageFont.load_default()
        images = []
        for path in image_paths:
            img = Image.open(path).convert("RGB").resize((self.unit_width, self.unit_height), Image.Resampling.LANCZOS)
            # \d{2,} keeps timestamps working past the 99-minute mark.
            timestamp = re.search(r"frame_(\d{2,})_(\d{2})\.jpg", os.path.basename(path))
            time_text = f"{timestamp.group(1)}:{timestamp.group(2)}" if timestamp else ""
            draw = ImageDraw.Draw(img)
            draw.text((10, 10), time_text, fill="yellow", font=font, stroke_width=1, stroke_fill="black")
            images.append(img)
        cols, rows = self.grid_size
        grid_img = Image.new("RGB", (self.unit_width * cols, self.unit_height * rows), (255, 255, 255))
        for i, img in enumerate(images):
            x = (i % cols) * self.unit_width
            y = (i // cols) * self.unit_height
            grid_img.paste(img, (x, y))
        save_path = os.path.join(self.grid_dir, f"{name}.jpg")
        grid_img.save(save_path, quality=self.save_quality)
        return save_path

    def encode_images_to_base64(self, image_paths: list[str]) -> list[str]:
        """Encode each image file as a ``data:image/jpeg;base64,...`` URI."""
        base64_images = []
        for path in image_paths:
            with open(path, "rb") as img_file:
                encoded_string = base64.b64encode(img_file.read()).decode("utf-8")
                base64_images.append(f"data:image/jpeg;base64,{encoded_string}")
        return base64_images

    def run(self) -> list[str]:
        """Full pipeline: clean stale output, extract, tile, encode.

        Returns the list of base64 data URIs (one per complete mosaic;
        trailing partial groups are skipped with a warning).
        Raises ValueError on any failure.
        """
        logger.info("🚀 开始提取视频帧...")
        try:
            # Ensure both working directories exist BEFORE listing them:
            # on a first run neither exists yet and os.listdir would raise
            # FileNotFoundError.  Then purge artifacts of previous runs.
            os.makedirs(self.frame_dir, exist_ok=True)
            os.makedirs(self.grid_dir, exist_ok=True)
            for file in os.listdir(self.frame_dir):
                if file.startswith("frame_"):
                    os.remove(os.path.join(self.frame_dir, file))
            for file in os.listdir(self.grid_dir):
                if file.startswith("grid_"):
                    os.remove(os.path.join(self.grid_dir, file))
            self.extract_frames()
            logger.info("🧩 开始拼接网格图...")
            image_paths = []
            groups = self.group_images()
            for idx, group in enumerate(groups, start=1):
                if len(group) < self.grid_size[0] * self.grid_size[1]:
                    logger.warning(f"⚠️ 跳过第 {idx} 组,图片不足 {self.grid_size[0] * self.grid_size[1]}")
                    continue
                out_path = self.concat_images(group, f"grid_{idx}")
                image_paths.append(out_path)
            logger.info("📤 开始编码图像...")
            urls = self.encode_images_to_base64(image_paths)
            return urls
        except Exception as e:
            logger.error(f"发生错误:{str(e)}")
            raise ValueError("视频处理失败") from e

BIN
backend/fonts/arial.ttf Normal file

Binary file not shown.

Binary file not shown.