feat: Enhance vector database providers with source path handling and improved search functionality

2026-06-02 06:00:38 +08:00 · 2025-09-27 13:34:18 +08:00
parent ee6e570ccb
commit a4af9475ef
10 changed files with 1082 additions and 353 deletions
--- a/services/processors/vector_index.py
+++ b/services/processors/vector_index.py
@@ -1,11 +1,95 @@
-from typing import Dict, Any
+from typing import Dict, Any, List, Tuple
 from fastapi.responses import Response
 import base64
+import mimetypes
+import os
+from io import BytesIO
+
 from services.ai import describe_image_base64, get_text_embedding
 from services.vector_db import VectorDBService, DEFAULT_VECTOR_DIMENSION
 from services.logging import LogService
 from services.config import ConfigCenter

+try:  # Pillow is optional but bundled with the project dependencies
+    from PIL import Image
+except ImportError:  # pragma: no cover - fallback when pillow missing
+    Image = None
+
+
+# Default window length, in characters, used by _chunk_text.
+CHUNK_SIZE = 800
+# Default number of characters shared by consecutive windows in _chunk_text.
+CHUNK_OVERLAP = 200
+# Images whose longest edge exceeds this value are downscaled before embedding.
+MAX_IMAGE_EDGE = 1600
+# Quality setting passed to Pillow when images are re-encoded as JPEG.
+JPEG_QUALITY = 85
+
+
+def _chunk_text(content: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[Tuple[int, str, int, int]]:
+    """Split text into fixed-size, overlapping windows.
+
+    Args:
+        content: Text to split.
+        chunk_size: Window length in characters; non-positive values fall
+            back to CHUNK_SIZE.
+        overlap: Characters shared by consecutive windows; values that are
+            not smaller than chunk_size are replaced by
+            max(chunk_size // 4, 1).
+
+    Returns:
+        A list of (chunk_id, chunk_text, start, end) tuples. chunk_text is
+        the stripped window, while start/end are the offsets of the
+        unstripped window inside content. Windows that are empty after
+        stripping are skipped and do not consume a chunk_id.
+    """
+    if chunk_size <= 0:
+        chunk_size = CHUNK_SIZE
+    if overlap >= chunk_size:
+        overlap = max(chunk_size // 4, 1)
+
+    chunks: List[Tuple[int, str, int, int]] = []
+    # NOTE(review): a negative overlap is not clamped, so step would exceed
+    # chunk_size and the text between consecutive windows would be skipped.
+    step = chunk_size - overlap
+    idx = 0
+    start = 0
+    length = len(content)
+
+    while start < length:
+        end = min(length, start + chunk_size)
+        chunk = content[start:end].strip()
+        if chunk:
+            chunks.append((idx, chunk, start, end))
+            idx += 1
+        # This window already reached the end of the text; stop here so no
+        # extra window covering only the overlapping tail is produced.
+        if end >= length:
+            break
+        start += step
+    return chunks
+
+
+def _guess_mime(path: str) -> str:
+    """Return the MIME type guessed from path, or "application/octet-stream" when none is found."""
+    # guess_type also returns an encoding, which is not needed here.
+    mime, _ = mimetypes.guess_type(path)
+    return mime or "application/octet-stream"
+
+
+def _chunk_key(path: str, chunk_id: str) -> str:
+    """Build a per-chunk key by appending "#chunk=<chunk_id>" to path."""
+    return f"{path}#chunk={chunk_id}"
+
+
+def _compress_image_for_embedding(input_bytes: bytes) -> Tuple[bytes, Dict[str, Any] | None]:
+    """Compress an image to reduce the payload sent to the vision model.
+
+    The image is converted to RGB, downscaled so that its longest edge does
+    not exceed MAX_IMAGE_EDGE, and re-encoded as JPEG.
+
+    Args:
+        input_bytes: Raw bytes of the source image.
+
+    Returns:
+        A (data, info) tuple. When the re-encoded JPEG is smaller than the
+        input, data is the JPEG bytes and info is a dict with
+        "original_bytes", "compressed_bytes", "scaled", "width" and
+        "height" (dimensions of the processed image). Otherwise the
+        original bytes are returned with info set to None; this happens
+        when Pillow is unavailable, when image processing raises, or when
+        re-encoding does not shrink the data.
+    """
+    if Image is None:
+        return input_bytes, None
+
+    try:
+        with Image.open(BytesIO(input_bytes)) as img:
+            img = img.convert("RGB")
+            width, height = img.size
+            longest_edge = max(width, height)
+            scale = 1.0
+            if longest_edge > MAX_IMAGE_EDGE:
+                scale = MAX_IMAGE_EDGE / float(longest_edge)
+                # Keep each edge at least 1 so the target size stays valid.
+                new_size = (max(int(width * scale), 1), max(int(height * scale), 1))
+                # Use Image.Resampling.LANCZOS when that namespace exists,
+                # otherwise fall back to Image.LANCZOS.
+                resample_mode = getattr(getattr(Image, "Resampling", Image), "LANCZOS")
+                img = img.resize(new_size, resample=resample_mode)
+
+            buffer = BytesIO()
+            img.save(buffer, format="JPEG", quality=JPEG_QUALITY, optimize=True)
+            compressed = buffer.getvalue()
+
+            # Only use the re-encoded image when it is actually smaller.
+            if len(compressed) < len(input_bytes):
+                return compressed, {
+                    "original_bytes": len(input_bytes),
+                    "compressed_bytes": len(compressed),
+                    "scaled": scale < 1.0,
+                    "width": img.width,
+                    "height": img.height,
+                }
+    except Exception:  # pragma: no cover - fall back on any image-processing error
+        return input_bytes, None
+
+    return input_bytes, None
+

 class VectorIndexProcessor:
    name = "向量索引"
@@ -33,6 +117,7 @@ class VectorIndexProcessor:
        index_type = config.get("index_type", "vector")
        vector_db = VectorDBService()
        collection_name = "vector_collection"
+
        if action == "destroy":
            await vector_db.delete_vector(collection_name, path)
            await LogService.info(
@@ -42,9 +127,19 @@ class VectorIndexProcessor:
            )
            return Response(content=f"文件 {path} 的 {index_type} 索引已销毁", media_type="text/plain")

-        if index_type == 'simple':
+        mime_type = _guess_mime(path)
+
+        if index_type == "simple":
            await vector_db.ensure_collection(collection_name, vector=False)
-            await vector_db.upsert_vector(collection_name, {'path': path})
+            await vector_db.delete_vector(collection_name, path)
+            await vector_db.upsert_vector(collection_name, {
+                "path": path,
+                "source_path": path,
+                "chunk_id": "filename",
+                "mime": mime_type,
+                "type": "filename",
+                "name": os.path.basename(path),
+            })
            await LogService.info(
                "processor:vector_index",
                f"Created simple index for {path}",
@@ -53,24 +148,7 @@ class VectorIndexProcessor:
            return Response(content=f"文件 {path} 的普通索引已创建", media_type="text/plain")

        file_ext = path.split('.')[-1].lower()
-        description = ""
-        embedding = None
-
-        if file_ext in ["jpg", "jpeg", "png", "bmp"]:
-            base64_image = base64.b64encode(input_bytes).decode("utf-8")
-            description = await describe_image_base64(base64_image)
-            embedding = await get_text_embedding(description)
-            log_message = f"Indexed image {path}"
-            response_message = f"图片已索引，描述：{description}"
-        elif file_ext in ["txt", "md"]:
-            text = input_bytes.decode("utf-8")
-            embedding = await get_text_embedding(text)
-            description = text[:100] + "..." if len(text) > 100 else text
-            log_message = f"Indexed text file {path}"
-            response_message = f"文本文件已索引"
-        
-        if embedding is None:
-            return Response(content="不支持的文件类型", status_code=400)
+        details: Dict[str, Any] = {"path": path, "action": "create", "index_type": "vector"}

        raw_dim = await ConfigCenter.get('AI_EMBED_DIM', DEFAULT_VECTOR_DIMENSION)
        try:
@@ -81,15 +159,103 @@ class VectorIndexProcessor:
            vector_dim = DEFAULT_VECTOR_DIMENSION

        await vector_db.ensure_collection(collection_name, vector=True, dim=vector_dim)
-        await vector_db.upsert_vector(
-            collection_name, {'path': path, 'embedding': embedding})
-        
+        await vector_db.delete_vector(collection_name, path)
+
+        if file_ext in ["jpg", "jpeg", "png", "bmp"]:
+            processed_bytes, compression = _compress_image_for_embedding(input_bytes)
+            base64_image = base64.b64encode(processed_bytes).decode("utf-8")
+            description = await describe_image_base64(base64_image)
+            embedding = await get_text_embedding(description)
+            image_mime = "image/jpeg" if compression else mime_type
+            await vector_db.upsert_vector(collection_name, {
+                "path": _chunk_key(path, "image"),
+                "source_path": path,
+                "chunk_id": "image",
+                "embedding": embedding,
+                "text": description,
+                "mime": image_mime,
+                "type": "image",
+            })
+            details["description"] = description
+            if compression:
+                details["image_compression"] = compression
+            await LogService.info(
+                "processor:vector_index",
+                f"Indexed image {path}",
+                details=details,
+            )
+            return Response(content=f"图片已索引，描述：{description}", media_type="text/plain")
+
+        if file_ext in ["txt", "md"]:
+            try:
+                text = input_bytes.decode("utf-8")
+            except UnicodeDecodeError:
+                return Response(content="文本文件解码失败", status_code=400)
+
+            chunks = _chunk_text(text)
+            if not chunks:
+                await vector_db.upsert_vector(collection_name, {
+                    "path": _chunk_key(path, "0"),
+                    "source_path": path,
+                    "chunk_id": "0",
+                    "embedding": await get_text_embedding(text or path),
+                    "text": text,
+                    "mime": mime_type,
+                    "type": "text",
+                    "start_offset": 0,
+                    "end_offset": len(text),
+                })
+                details["chunks"] = 1
+                await LogService.info(
+                    "processor:vector_index",
+                    f"Indexed text file {path}",
+                    details=details,
+                )
+                return Response(content="文本文件已索引", media_type="text/plain")
+
+            chunk_count = 0
+            for chunk_id, chunk_text, start, end in chunks:
+                embedding = await get_text_embedding(chunk_text)
+                await vector_db.upsert_vector(collection_name, {
+                    "path": _chunk_key(path, str(chunk_id)),
+                    "source_path": path,
+                    "chunk_id": str(chunk_id),
+                    "embedding": embedding,
+                    "text": chunk_text,
+                    "mime": mime_type,
+                    "type": "text",
+                    "start_offset": start,
+                    "end_offset": end,
+                })
+                chunk_count += 1
+
+            details["chunks"] = chunk_count
+            sample = chunks[0][1]
+            details["sample"] = sample[:120]
+            await LogService.info(
+                "processor:vector_index",
+                f"Indexed text file {path}",
+                details=details,
+            )
+            return Response(content="文本文件已索引", media_type="text/plain")
+
+        # 其他类型暂未支持向量索引，回退为文件名索引
+        await vector_db.delete_vector(collection_name, path)
+        await vector_db.upsert_vector(collection_name, {
+            "path": _chunk_key(path, "fallback"),
+            "source_path": path,
+            "chunk_id": "filename",
+            "mime": mime_type,
+            "type": "filename",
+            "name": os.path.basename(path),
+            "embedding": [0.0] * vector_dim,
+        })
        await LogService.info(
            "processor:vector_index",
-            log_message,
-            details={"path": path, "description": description, "action": "create", "index_type": "vector"},
+            f"File type fallback to simple index for {path}",
+            details={"path": path, "action": "create", "index_type": "simple", "original_type": file_ext},
        )
-        return Response(content=response_message, media_type="text/plain")
+        return Response(content="暂不支持该类型的向量索引，已创建文件名索引", media_type="text/plain")


 PROCESSOR_TYPE = "vector_index"