mirror of
https://github.com/JefferyHcool/BiliNote.git
synced 2026-06-10 10:09:44 +08:00
feat(extension): 多模态视频理解开关 + 抽帧/拼图参数(对齐 web NoteForm)
web 端 NoteForm 早就有 video_understanding / video_interval / grid_size 三件套,插件之前没有,导致用户在视觉模型上想用「画面理解」时只能去 web 端发任务。

新增字段(types.ts Settings 与 GenerateRequest 同步):
- video_understanding: boolean,默认 false(关)
- video_interval: number,1-30 秒,默认 6(与 web NoteForm 默认一致)
- grid_size: [number, number],1-10,默认 [2,2]

UI 落地:
- popup 「高级」折叠区:开关 + interval + grid_size 行/列三栏,启用时才显示后两个,并提示需要选视觉模型
- options General 页:单独一节「视频理解(多模态)」展开同样字段
- popup start() 与 background startTask() 在 generate_note 请求里带上这三个字段;关闭时不传(避免覆盖 backend 默认)

回归风险:默认 false,对现有用户行为不变。

依赖:feature/extension-form-parity(叠加在它之上,因为 Settings 是同一片字段域)。

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -87,6 +87,9 @@ async function startTask(url: string): Promise<{ ok: boolean, taskId?: string, e
|
||||
link: formats.includes('link'),
|
||||
style: settings.style || undefined,
|
||||
extras: settings.extras || undefined,
|
||||
video_understanding: settings.video_understanding || undefined,
|
||||
video_interval: settings.video_understanding ? settings.video_interval : undefined,
|
||||
grid_size: settings.video_understanding ? settings.grid_size : undefined,
|
||||
prefetched_transcript: prefetched ?? undefined,
|
||||
}),
|
||||
})
|
||||
|
||||
@@ -12,6 +12,9 @@ export const DEFAULT_SETTINGS: Settings = {
|
||||
link: false,
|
||||
style: 'minimal',
|
||||
extras: '',
|
||||
video_understanding: false,
|
||||
video_interval: 6,
|
||||
grid_size: [2, 2],
|
||||
}
|
||||
|
||||
export const MAX_TASKS = 30
|
||||
|
||||
@@ -40,6 +40,9 @@ export interface GenerateRequest {
|
||||
format?: string[]
|
||||
style?: string
|
||||
extras?: string
|
||||
video_understanding?: boolean
|
||||
video_interval?: number
|
||||
grid_size?: [number, number]
|
||||
// 客户端在浏览器里直接抓到的字幕,跳过后端的 download_subtitles + 音频转写
|
||||
prefetched_transcript?: {
|
||||
language: string
|
||||
@@ -117,6 +120,13 @@ export interface Settings {
|
||||
link: boolean
|
||||
style: NoteStyle
|
||||
extras: string
|
||||
// 多模态视频理解:抽帧拼图喂给视觉模型,提升画面相关问题的回答质量
|
||||
// 要求所选 model 是视觉模型(如 gpt-4o / gemini / claude-opus 系列),文字模型会忽略图片
|
||||
video_understanding: boolean
|
||||
// 抽帧间隔(秒),范围 1-30,默认 6
|
||||
video_interval: number
|
||||
// 拼图网格 [rows, cols],每张拼图最多 rows*cols 帧。默认 [2,2]
|
||||
grid_size: [number, number]
|
||||
}
|
||||
|
||||
export interface ProviderUpdatePayload {
|
||||
|
||||
@@ -165,5 +165,39 @@ onMounted(async () => {
|
||||
/>
|
||||
</label>
|
||||
</section>
|
||||
|
||||
<section class="section-card">
|
||||
<h2 class="font-semibold">视频理解(多模态)</h2>
|
||||
<p class="text-xs text-gray-500">
|
||||
启用后会按抽帧间隔截取视频帧拼成网格图,连同字幕一起喂给视觉模型,提升画面相关问题的回答质量。
|
||||
<strong class="text-amber-700">需要选择视觉模型</strong>(GPT-4o / Gemini / Claude 等),文字模型会忽略图片。
|
||||
</p>
|
||||
<label class="flex items-center gap-2 text-sm">
|
||||
<input v-model="settings.video_understanding" type="checkbox">
|
||||
启用视频理解
|
||||
</label>
|
||||
<div v-if="settings.video_understanding" class="grid grid-cols-3 gap-3 text-sm">
|
||||
<label class="flex flex-col gap-1">
|
||||
<span class="text-gray-600">抽帧间隔(秒, 1-30)</span>
|
||||
<input v-model.number="settings.video_interval" type="number" min="1" max="30" class="input">
|
||||
</label>
|
||||
<label class="flex flex-col gap-1">
|
||||
<span class="text-gray-600">拼图行 (1-10)</span>
|
||||
<input
|
||||
:value="settings.grid_size?.[0] ?? 2"
|
||||
type="number" min="1" max="10" class="input"
|
||||
@input="settings.grid_size = [Number(($event.target as HTMLInputElement).value) || 2, settings.grid_size?.[1] ?? 2]"
|
||||
>
|
||||
</label>
|
||||
<label class="flex flex-col gap-1">
|
||||
<span class="text-gray-600">拼图列 (1-10)</span>
|
||||
<input
|
||||
:value="settings.grid_size?.[1] ?? 2"
|
||||
type="number" min="1" max="10" class="input"
|
||||
@input="settings.grid_size = [settings.grid_size?.[0] ?? 2, Number(($event.target as HTMLInputElement).value) || 2]"
|
||||
>
|
||||
</label>
|
||||
</div>
|
||||
</section>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
@@ -80,6 +80,9 @@ async function start() {
|
||||
link: formats.includes('link'),
|
||||
style: settings.value.style || undefined,
|
||||
extras: settings.value.extras || undefined,
|
||||
video_understanding: settings.value.video_understanding || undefined,
|
||||
video_interval: settings.value.video_understanding ? settings.value.video_interval : undefined,
|
||||
grid_size: settings.value.video_understanding ? settings.value.grid_size : undefined,
|
||||
prefetched_transcript: prefetched ?? undefined,
|
||||
})
|
||||
activeTaskId.value = task_id
|
||||
@@ -225,6 +228,41 @@ onUnmounted(() => {
|
||||
placeholder="例如:重点关注游戏开发部分;保留所有专业术语原文"
|
||||
/>
|
||||
</label>
|
||||
<label class="flex items-center gap-2 mt-2">
|
||||
<input v-model="settings.video_understanding" type="checkbox">
|
||||
<span class="text-gray-600">启用视频理解(抽帧拼图喂视觉模型)</span>
|
||||
</label>
|
||||
<div v-if="settings.video_understanding" class="grid grid-cols-3 gap-2 mt-2">
|
||||
<label class="flex flex-col gap-1">
|
||||
<span class="text-gray-600">抽帧间隔(秒)</span>
|
||||
<input
|
||||
v-model.number="settings.video_interval"
|
||||
type="number" min="1" max="30"
|
||||
class="border rounded px-1 py-0.5"
|
||||
>
|
||||
</label>
|
||||
<label class="flex flex-col gap-1">
|
||||
<span class="text-gray-600">拼图行</span>
|
||||
<input
|
||||
:value="settings.grid_size?.[0] ?? 2"
|
||||
type="number" min="1" max="10"
|
||||
class="border rounded px-1 py-0.5"
|
||||
@input="settings.grid_size = [Number(($event.target as HTMLInputElement).value) || 2, settings.grid_size?.[1] ?? 2]"
|
||||
>
|
||||
</label>
|
||||
<label class="flex flex-col gap-1">
|
||||
<span class="text-gray-600">拼图列</span>
|
||||
<input
|
||||
:value="settings.grid_size?.[1] ?? 2"
|
||||
type="number" min="1" max="10"
|
||||
class="border rounded px-1 py-0.5"
|
||||
@input="settings.grid_size = [settings.grid_size?.[0] ?? 2, Number(($event.target as HTMLInputElement).value) || 2]"
|
||||
>
|
||||
</label>
|
||||
</div>
|
||||
<p v-if="settings.video_understanding" class="text-amber-700 mt-1">
|
||||
⚠ 需要选择视觉模型(GPT-4o / Gemini / Claude 等),文字模型会忽略图片
|
||||
</p>
|
||||
</details>
|
||||
|
||||
<div class="text-xs text-gray-600">
|
||||
|
||||
Reference in New Issue
Block a user