From a92c779dd6f814b40b2843a16cdde9d811978b09 Mon Sep 17 00:00:00 2001 From: huangjianwu Date: Mon, 23 Mar 2026 15:35:31 +0800 Subject: [PATCH] =?UTF-8?q?fix(chat):=20RAG=20=E6=A3=80=E7=B4=A2=E5=90=8C?= =?UTF-8?q?=E6=97=B6=E5=8F=AC=E5=9B=9E=E7=AC=94=E8=AE=B0=E5=92=8C=E8=BD=AC?= =?UTF-8?q?=E5=BD=95=E5=86=85=E5=AE=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 之前 query 只做一次全局检索,embedding 模型倾向匹配笔记, 导致转录原文几乎不会被召回。 - 改为分别对 markdown 和 transcript 各检索 n_results 条, 合并后按距离排序取 top-n - 更新 system prompt,明确区分笔记和转录两种来源, 引导 LLM 根据问题类型选择合适的来源回答 Co-Authored-By: Claude Opus 4.6 --- backend/app/services/chat_service.py | 15 +++++++--- backend/app/services/vector_store.py | 45 +++++++++++++++++++++------- 2 files changed, 45 insertions(+), 15 deletions(-) diff --git a/backend/app/services/chat_service.py b/backend/app/services/chat_service.py index c3ac2fb..7986625 100644 --- a/backend/app/services/chat_service.py +++ b/backend/app/services/chat_service.py @@ -8,14 +8,21 @@ from app.utils.logger import get_logger logger = get_logger(__name__) -SYSTEM_PROMPT = """你是一个视频笔记问答助手。根据以下笔记内容回答用户的问题。 -如果笔记内容中没有相关信息,请诚实告知用户。回答时尽量引用笔记中的具体内容。 +SYSTEM_PROMPT = """你是一个视频笔记问答助手。你可以参考两种来源回答用户的问题: +1. [笔记] — AI 生成的视频摘要笔记 +2. [转录] — 视频原始语音转录文本(含时间戳) ---- 相关笔记内容 --- +以下是检索到的相关内容: + +--- 相关内容 --- {context} --- -请用中文回答,保持简洁准确。""" +回答要求: +- 优先使用转录原文回答关于视频具体内容、原话、细节的问题 +- 优先使用笔记回答关于总结、要点、结构的问题 +- 如果确实没有相关信息,请诚实告知 +- 请用中文回答,保持简洁准确""" def _build_context(chunks: list[dict]) -> str: diff --git a/backend/app/services/vector_store.py b/backend/app/services/vector_store.py index e859a9b..10d813d 100644 --- a/backend/app/services/vector_store.py +++ b/backend/app/services/vector_store.py @@ -111,18 +111,11 @@ class VectorStoreManager: collection.add(documents=documents, metadatas=metadatas, ids=ids) logger.info(f"向量索引完成: task_id={task_id}, chunks={len(all_chunks)}") - def query(self, task_id: str, query_text: str, n_results: int = 5) -> list[dict]: - """检索与查询最相关的文档片段。""" - col_name = self._collection_name(task_id) - try: - collection = self._client.get_collection(col_name) - except Exception: - logger.warning(f"Collection 不存在: {col_name}") - return [] - - results = collection.query(query_texts=[query_text], n_results=n_results) - + def _parse_results(self, results: dict) -> list[dict]: + """将 ChromaDB query 结果转换为 chunk 列表。""" chunks = [] + if not results or not results.get("documents") or not results["documents"][0]: + return chunks for i in range(len(results["documents"][0])): chunks.append({ "text": results["documents"][0][i], @@ -131,6 +124,36 @@ class VectorStoreManager: }) return chunks + def query(self, task_id: str, query_text: str, n_results: int = 5) -> list[dict]: + """ + 分别从 markdown 和 transcript 各检索,确保两种来源都被召回, + 最后按距离排序返回 top-n。 + """ + col_name = self._collection_name(task_id) + try: + collection = self._client.get_collection(col_name) + except Exception: + logger.warning(f"Collection 不存在: {col_name}") + return [] + + all_chunks = [] + + # 分别从两种来源各检索 n_results 条 + for source_type in ("markdown", "transcript"): + try: + results = collection.query( + query_texts=[query_text], + n_results=n_results, + where={"source_type": source_type}, + ) + all_chunks.extend(self._parse_results(results)) + except Exception: + pass + + # 按距离排序,取 top-n + all_chunks.sort(key=lambda c: c.get("distance", 999)) + return all_chunks[:n_results] + def delete_index(self, task_id: str) -> None: """删除指定任务的向量索引。""" col_name = self._collection_name(task_id)