mirror of
https://github.com/JefferyHcool/BiliNote.git
synced 2026-05-11 01:49:46 +08:00
feat(chat): 索引视频元信息(标题、作者、简介、标签等)
- 新增 _build_meta_chunk,将 audio_meta 中的标题、UP主、简介、标签、时长、平台、链接等构建为可检索的 chunk - query 时同时从 meta/markdown/transcript 三种来源检索 - is_indexed 检测旧索引缺少 meta 时返回 False,自动触发重建 - system prompt 新增 [视频信息] 来源说明 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -9,8 +9,9 @@ from app.utils.logger import get_logger
|
||||
logger = get_logger(__name__)
|
||||
|
||||
SYSTEM_PROMPT = """你是一个视频笔记问答助手。你可以参考两种来源回答用户的问题:
|
||||
1. [笔记] — AI 生成的视频摘要笔记
|
||||
2. [转录] — 视频原始语音转录文本(含时间戳)
|
||||
1. [视频信息] — 视频标题、作者、简介、标签等元信息
|
||||
2. [笔记] — AI 生成的视频摘要笔记
|
||||
3. [转录] — 视频原始语音转录文本(含时间戳)
|
||||
|
||||
以下是检索到的相关内容:
|
||||
|
||||
@@ -31,7 +32,9 @@ def _build_context(chunks: list[dict]) -> str:
|
||||
for i, chunk in enumerate(chunks, 1):
|
||||
meta = chunk.get("metadata", {})
|
||||
source_type = meta.get("source_type", "unknown")
|
||||
if source_type == "markdown":
|
||||
if source_type == "meta":
|
||||
label = "[视频信息]"
|
||||
elif source_type == "markdown":
|
||||
label = f"[笔记 - {meta.get('section_title', '')}]"
|
||||
else:
|
||||
start = meta.get("start_time", 0)
|
||||
@@ -77,6 +80,9 @@ def chat(
|
||||
|
||||
# 1. 检索相关片段
|
||||
chunks = vector_store.query(task_id, question, n_results=5)
|
||||
print(
|
||||
f"检索到 {len(chunks)} 个相关片段: {[c['metadata'].get('source_type') for c in chunks]}"
|
||||
)
|
||||
if not chunks:
|
||||
return {
|
||||
"answer": "暂未找到相关笔记内容,请确认笔记已生成并完成索引。",
|
||||
|
||||
@@ -55,6 +55,52 @@ def _chunk_transcript(segments: list[dict], window_size: int = 15, overlap: int
|
||||
return chunks
|
||||
|
||||
|
||||
def _build_meta_chunk(audio_meta: dict) -> list[dict]:
|
||||
"""将视频元信息(标题、作者、描述、标签等)构建为可检索的 chunk。"""
|
||||
if not audio_meta:
|
||||
return []
|
||||
|
||||
raw = audio_meta.get("raw_info", {}) or {}
|
||||
parts = []
|
||||
|
||||
title = audio_meta.get("title") or raw.get("title", "")
|
||||
if title:
|
||||
parts.append(f"视频标题:{title}")
|
||||
|
||||
uploader = raw.get("uploader", "")
|
||||
if uploader:
|
||||
parts.append(f"视频作者/UP主:{uploader}")
|
||||
|
||||
desc = raw.get("description", "")
|
||||
if desc:
|
||||
parts.append(f"视频简介:{desc[:500]}")
|
||||
|
||||
tags = raw.get("tags", [])
|
||||
if tags and isinstance(tags, list):
|
||||
parts.append(f"标签:{', '.join(str(t) for t in tags[:20])}")
|
||||
|
||||
duration = audio_meta.get("duration", 0)
|
||||
if duration:
|
||||
m, s = divmod(int(duration), 60)
|
||||
parts.append(f"视频时长:{m}分{s}秒")
|
||||
|
||||
platform = audio_meta.get("platform", "")
|
||||
if platform:
|
||||
parts.append(f"平台:{platform}")
|
||||
|
||||
url = raw.get("webpage_url", "")
|
||||
if url:
|
||||
parts.append(f"链接:{url}")
|
||||
|
||||
if not parts:
|
||||
return []
|
||||
|
||||
return [{
|
||||
"text": "\n".join(parts),
|
||||
"metadata": {"source_type": "meta"},
|
||||
}]
|
||||
|
||||
|
||||
class VectorStoreManager:
|
||||
"""基于 ChromaDB 的笔记向量存储管理器。"""
|
||||
|
||||
@@ -83,9 +129,12 @@ class VectorStoreManager:
|
||||
transcript = note_data.get("transcript", {})
|
||||
segments = transcript.get("segments", [])
|
||||
|
||||
audio_meta = note_data.get("audio_meta", {})
|
||||
|
||||
meta_chunks = _build_meta_chunk(audio_meta)
|
||||
md_chunks = _chunk_markdown(markdown)
|
||||
tr_chunks = _chunk_transcript(segments)
|
||||
all_chunks = md_chunks + tr_chunks
|
||||
all_chunks = meta_chunks + md_chunks + tr_chunks
|
||||
|
||||
if not all_chunks:
|
||||
logger.warning(f"笔记内容为空,跳过索引: {task_id}")
|
||||
@@ -138,8 +187,8 @@ class VectorStoreManager:
|
||||
|
||||
all_chunks = []
|
||||
|
||||
# 分别从两种来源各检索 n_results 条
|
||||
for source_type in ("markdown", "transcript"):
|
||||
# 分别从各来源检索
|
||||
for source_type in ("meta", "markdown", "transcript"):
|
||||
try:
|
||||
results = collection.query(
|
||||
query_texts=[query_text],
|
||||
@@ -164,10 +213,14 @@ class VectorStoreManager:
|
||||
pass
|
||||
|
||||
def is_indexed(self, task_id: str) -> bool:
|
||||
"""检查指定任务是否已建立索引。"""
|
||||
"""检查指定任务是否已建立完整索引(含 meta 信息)。"""
|
||||
col_name = self._collection_name(task_id)
|
||||
try:
|
||||
col = self._client.get_collection(col_name)
|
||||
return col.count() > 0
|
||||
if col.count() == 0:
|
||||
return False
|
||||
# 检查是否包含 meta chunk,旧索引可能缺失
|
||||
meta = col.get(where={"source_type": "meta"}, limit=1)
|
||||
return len(meta["ids"]) > 0
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
Reference in New Issue
Block a user