feat(chat): 索引视频元信息(标题、作者、简介、标签等)

- 新增 _build_meta_chunk,将 audio_meta 中的标题、UP主、
  简介、标签、时长、平台、链接等构建为可检索的 chunk
- query 时同时从 meta/markdown/transcript 三种来源检索
- is_indexed 检测旧索引缺少 meta 时返回 false,自动触发重建
- system prompt 新增 [视频信息] 来源说明

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
huangjianwu
2026-03-23 15:41:07 +08:00
parent a92c779dd6
commit 8a8e448e22
2 changed files with 67 additions and 8 deletions

View File

@@ -9,8 +9,9 @@ from app.utils.logger import get_logger
logger = get_logger(__name__)
SYSTEM_PROMPT = """你是一个视频笔记问答助手。你可以参考两种来源回答用户的问题:
1. [笔记] — AI 生成的视频摘要笔记
2. [转录] — 视频原始语音转录文本(含时间戳)
1. [视频信息] — 视频标题、作者、简介、标签等元信息
2. [笔记] — AI 生成的视频摘要笔记
3. [转录] — 视频原始语音转录文本(含时间戳)
以下是检索到的相关内容:
@@ -31,7 +32,9 @@ def _build_context(chunks: list[dict]) -> str:
for i, chunk in enumerate(chunks, 1):
meta = chunk.get("metadata", {})
source_type = meta.get("source_type", "unknown")
if source_type == "markdown":
if source_type == "meta":
label = "[视频信息]"
elif source_type == "markdown":
label = f"[笔记 - {meta.get('section_title', '')}]"
else:
start = meta.get("start_time", 0)
@@ -77,6 +80,9 @@ def chat(
# 1. 检索相关片段
chunks = vector_store.query(task_id, question, n_results=5)
print(
f"检索到 {len(chunks)} 个相关片段: {[c['metadata'].get('source_type') for c in chunks]}"
)
if not chunks:
return {
"answer": "暂未找到相关笔记内容,请确认笔记已生成并完成索引。",

View File

@@ -55,6 +55,52 @@ def _chunk_transcript(segments: list[dict], window_size: int = 15, overlap: int
return chunks
def _build_meta_chunk(audio_meta: dict) -> list[dict]:
"""将视频元信息(标题、作者、描述、标签等)构建为可检索的 chunk。"""
if not audio_meta:
return []
raw = audio_meta.get("raw_info", {}) or {}
parts = []
title = audio_meta.get("title") or raw.get("title", "")
if title:
parts.append(f"视频标题:{title}")
uploader = raw.get("uploader", "")
if uploader:
parts.append(f"视频作者/UP主{uploader}")
desc = raw.get("description", "")
if desc:
parts.append(f"视频简介:{desc[:500]}")
tags = raw.get("tags", [])
if tags and isinstance(tags, list):
parts.append(f"标签:{', '.join(str(t) for t in tags[:20])}")
duration = audio_meta.get("duration", 0)
if duration:
m, s = divmod(int(duration), 60)
parts.append(f"视频时长:{m}{s}")
platform = audio_meta.get("platform", "")
if platform:
parts.append(f"平台:{platform}")
url = raw.get("webpage_url", "")
if url:
parts.append(f"链接:{url}")
if not parts:
return []
return [{
"text": "\n".join(parts),
"metadata": {"source_type": "meta"},
}]
class VectorStoreManager:
"""基于 ChromaDB 的笔记向量存储管理器。"""
@@ -83,9 +129,12 @@ class VectorStoreManager:
transcript = note_data.get("transcript", {})
segments = transcript.get("segments", [])
audio_meta = note_data.get("audio_meta", {})
meta_chunks = _build_meta_chunk(audio_meta)
md_chunks = _chunk_markdown(markdown)
tr_chunks = _chunk_transcript(segments)
all_chunks = md_chunks + tr_chunks
all_chunks = meta_chunks + md_chunks + tr_chunks
if not all_chunks:
logger.warning(f"笔记内容为空,跳过索引: {task_id}")
@@ -138,8 +187,8 @@ class VectorStoreManager:
all_chunks = []
# 分别从两种来源检索 n_results 条
for source_type in ("markdown", "transcript"):
# 分别从三种来源(meta / markdown / transcript)检索
for source_type in ("meta", "markdown", "transcript"):
try:
results = collection.query(
query_texts=[query_text],
@@ -164,10 +213,14 @@ class VectorStoreManager:
pass
def is_indexed(self, task_id: str) -> bool:
"""检查指定任务是否已建立索引"""
"""检查指定任务是否已建立完整索引(含 meta 信息)"""
col_name = self._collection_name(task_id)
try:
col = self._client.get_collection(col_name)
return col.count() > 0
if col.count() == 0:
return False
# 检查是否包含 meta chunk(旧索引可能缺失)
meta = col.get(where={"source_type": "meta"}, limit=1)
return len(meta["ids"]) > 0
except Exception:
return False