feat: add vector and file collection constants, update vector index handling

Author: shiyu
Date: 2026-01-03 15:12:20 +08:00
parent ff7eb13187
commit 2fa93a1eeb
6 changed files with 55 additions and 32 deletions

View File

@@ -19,8 +19,8 @@ from domain.audit import router as audit
 def include_routers(app: FastAPI):
     app.include_router(adapters.router)
-    app.include_router(virtual_fs.router)
+    app.include_router(search_api.router)
+    app.include_router(virtual_fs.router)
     app.include_router(auth.router)
     app.include_router(config.router)
     app.include_router(processors.router)
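Note: FastAPI (via Starlette) matches routes in registration order, so moving search_api.router ahead of virtual_fs.router matters if virtual_fs exposes a greedy path parameter. A minimal sketch of the shadowing this ordering avoids; the route paths here are illustrative assumptions, not taken from this repo:

from fastapi import APIRouter, FastAPI

app = FastAPI()
fs_router = APIRouter()
search_router = APIRouter()

@fs_router.get("/files/{path:path}")  # hypothetical greedy catch-all over file paths
async def read_path(path: str):
    return {"path": path}

@search_router.get("/files/search")  # would be shadowed if registered after fs_router
async def search(q: str = ""):
    return {"q": q}

# First full match wins: register the specific router before the catch-all one.
app.include_router(search_router)
app.include_router(fs_router)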

View File

@@ -19,6 +19,8 @@ from .vector_providers import (
 )
 DEFAULT_VECTOR_DIMENSION = 4096
+VECTOR_COLLECTION_NAME = "vector_collection"
+FILE_COLLECTION_NAME = "file_collection"
 OPENAI_EMBEDDING_DIMS = {
     "text-embedding-3-large": 3072,

View File

@@ -9,7 +9,12 @@ from PIL import Image
 from ..base import BaseProcessor
 from domain.ai.inference import describe_image_base64, get_text_embedding, provider_service
-from domain.ai.service import VectorDBService, DEFAULT_VECTOR_DIMENSION
+from domain.ai.service import (
+    VectorDBService,
+    DEFAULT_VECTOR_DIMENSION,
+    VECTOR_COLLECTION_NAME,
+    FILE_COLLECTION_NAME,
+)
 CHUNK_SIZE = 800
@@ -112,18 +117,20 @@ class VectorIndexProcessor:
         action = config.get("action", "create")
         index_type = config.get("index_type", "vector")
         vector_db = VectorDBService()
-        collection_name = "vector_collection"
+        vector_collection = VECTOR_COLLECTION_NAME
+        file_collection = FILE_COLLECTION_NAME
         if action == "destroy":
-            await vector_db.delete_vector(collection_name, path)
+            target_collection = file_collection if index_type == "simple" else vector_collection
+            await vector_db.delete_vector(target_collection, path)
             return Response(content=f"文件 {path}{index_type} 索引已销毁", media_type="text/plain")
         mime_type = _guess_mime(path)
         if index_type == "simple":
-            await vector_db.ensure_collection(collection_name, vector=False)
-            await vector_db.delete_vector(collection_name, path)
-            await vector_db.upsert_vector(collection_name, {
+            await vector_db.ensure_collection(file_collection, vector=False)
+            await vector_db.delete_vector(file_collection, path)
+            await vector_db.upsert_vector(file_collection, {
                 "path": path,
                 "source_path": path,
                 "chunk_id": "filename",
@@ -146,8 +153,8 @@ class VectorIndexProcessor:
         if vector_dim <= 0:
             vector_dim = DEFAULT_VECTOR_DIMENSION
-        await vector_db.ensure_collection(collection_name, vector=True, dim=vector_dim)
-        await vector_db.delete_vector(collection_name, path)
+        await vector_db.ensure_collection(vector_collection, vector=True, dim=vector_dim)
+        await vector_db.delete_vector(vector_collection, path)
         if file_ext in ["jpg", "jpeg", "png", "bmp"]:
             processed_bytes, compression = _compress_image_for_embedding(input_bytes)
@@ -155,7 +162,7 @@ class VectorIndexProcessor:
             description = await describe_image_base64(base64_image)
             embedding = await get_text_embedding(description)
             image_mime = "image/jpeg" if compression else mime_type
-            await vector_db.upsert_vector(collection_name, {
+            await vector_db.upsert_vector(vector_collection, {
                 "path": _chunk_key(path, "image"),
                 "source_path": path,
                 "chunk_id": "image",
@@ -177,7 +184,7 @@ class VectorIndexProcessor:
             chunks = _chunk_text(text)
             if not chunks:
-                await vector_db.upsert_vector(collection_name, {
+                await vector_db.upsert_vector(vector_collection, {
                     "path": _chunk_key(path, "0"),
                     "source_path": path,
                     "chunk_id": "0",
@@ -194,7 +201,7 @@ class VectorIndexProcessor:
             chunk_count = 0
             for chunk_id, chunk_text, start, end in chunks:
                 embedding = await get_text_embedding(chunk_text)
-                await vector_db.upsert_vector(collection_name, {
+                await vector_db.upsert_vector(vector_collection, {
                     "path": _chunk_key(path, str(chunk_id)),
                     "source_path": path,
                     "chunk_id": str(chunk_id),
@@ -213,15 +220,15 @@ class VectorIndexProcessor:
             return Response(content="文本文件已索引", media_type="text/plain")
         # 其他类型暂未支持向量索引,回退为文件名索引
-        await vector_db.delete_vector(collection_name, path)
-        await vector_db.upsert_vector(collection_name, {
-            "path": _chunk_key(path, "fallback"),
+        await vector_db.ensure_collection(file_collection, vector=False)
+        await vector_db.delete_vector(file_collection, path)
+        await vector_db.upsert_vector(file_collection, {
+            "path": path,
             "source_path": path,
             "chunk_id": "filename",
             "mime": mime_type,
             "type": "filename",
             "name": os.path.basename(path),
-            "embedding": [0.0] * vector_dim,
         })
         return Response(content="暂不支持该类型的向量索引,已创建文件名索引", media_type="text/plain")

View File

@@ -4,7 +4,7 @@ from fastapi import HTTPException
 from api.response import page
 from domain.adapters.registry import runtime_registry
-from domain.ai.service import VectorDBService
+from domain.ai.service import VectorDBService, VECTOR_COLLECTION_NAME, FILE_COLLECTION_NAME
 from domain.virtual_fs.thumbnail import is_image_filename, is_video_filename
 from models import StorageAdapter
@@ -161,13 +161,19 @@ class VirtualFSListingMixin(VirtualFSResolverMixin):
     @classmethod
     async def _gather_vector_index(cls, full_path: str, limit: int = 20):
         vector_db = VectorDBService()
-        try:
-            raw_results = await vector_db.search_by_path("vector_collection", full_path, max(limit * 2, 20))
-        except Exception:
-            return None
         matched = []
-        if raw_results:
+        had_success = False
+        fetch_limit = max(limit * 2, 20)
+        for collection_name in (VECTOR_COLLECTION_NAME, FILE_COLLECTION_NAME):
+            try:
+                raw_results = await vector_db.search_by_path(collection_name, full_path, fetch_limit)
+            except Exception:
+                continue
+            if not raw_results:
+                had_success = True
+                continue
+            had_success = True
             buckets = raw_results if isinstance(raw_results, list) else [raw_results]
             for bucket in buckets:
                 if not bucket:
@@ -193,6 +199,9 @@ class VirtualFSListingMixin(VirtualFSResolverMixin):
                 entry["preview_truncated"] = len(text) > preview_limit
                 matched.append(entry)
+        if not had_success:
+            return None
         if not matched:
             return {"total": 0, "entries": [], "by_type": {}, "has_more": False}

View File

@@ -1,5 +1,6 @@
 from fastapi import APIRouter, Depends, Query
+from api.response import success
 from domain.auth.service import get_current_active_user
 from domain.auth.types import User
 from domain.virtual_fs.search.search_service import VirtualFSSearchService
@@ -17,10 +18,11 @@ async def search_files(
     user: User = Depends(get_current_active_user),
 ):
     if not q.strip():
-        return {"items": [], "query": q}
+        return success({"items": [], "query": q, "mode": mode})
     top_k = max(top_k, 1)
     page = max(page, 1)
     page_size = max(min(page_size, 100), 1)
-    return await VirtualFSSearchService.search(q, top_k, mode, page, page_size)
+    data = await VirtualFSSearchService.search(q, top_k, mode, page, page_size)
+    return success(data)
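Both return paths now go through the shared success envelope from api.response, and the empty-query short-circuit gains the mode field so its shape matches a real result. The diff does not show success itself; a conventional envelope of this kind looks like the sketch below (field names are assumptions, not this repo's actual contract):

def success(data):
    # assumed envelope shape; the real api.response.success may differ
    return {"code": 0, "message": "ok", "data": data}

print(success({"items": [], "query": "", "mode": "vector"}))
# {'code': 0, 'message': 'ok', 'data': {'items': [], 'query': '', 'mode': 'vector'}}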

View File

@@ -2,7 +2,7 @@ from typing import Any, Dict, List, Tuple
 from domain.virtual_fs.types import SearchResultItem
 from domain.ai.inference import get_text_embedding
-from domain.ai.service import VectorDBService
+from domain.ai.service import VectorDBService, VECTOR_COLLECTION_NAME, FILE_COLLECTION_NAME
 def _normalize_result(raw: Dict[str, Any], source: str, fallback_score: float = 0.0) -> SearchResultItem:
@@ -53,7 +53,7 @@ async def _vector_search(query: str, top_k: int) -> List[SearchResultItem]:
         return []
     try:
-        raw_results = await vector_db.search_vectors("vector_collection", embedding, max(top_k, 10))
+        raw_results = await vector_db.search_vectors(VECTOR_COLLECTION_NAME, embedding, max(top_k, 10))
     except Exception:
         return []
@@ -68,12 +68,15 @@ async def _filename_search(query: str, page: int, page_size: int) -> Tuple[List[
     vector_db = VectorDBService()
     limit = max(page * page_size + 1, page_size * (page + 2))
     limit = min(limit, 2000)
-    try:
-        raw_results = await vector_db.search_by_path("vector_collection", query, limit)
-    except Exception:
-        return [], False
-    records = raw_results[0] if raw_results else []
+    records: List[Dict[str, Any]] = []
+    for collection_name in (FILE_COLLECTION_NAME, VECTOR_COLLECTION_NAME):
+        try:
+            raw_results = await vector_db.search_by_path(collection_name, query, limit)
+        except Exception:
+            continue
+        if raw_results:
+            records.extend(raw_results[0] or [])
     deduped: List[SearchResultItem] = []
     seen_paths: set[str] = set()
     for record in records or []:
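Filename search now aggregates records from both collections instead of reading only the first result bucket of vector_collection, and a failure in one collection no longer aborts the whole search. Because FILE_COLLECTION_NAME is queried first, the first-seen dedupe that follows keeps its filename record whenever a vector-collection record normalizes to the same path. A runnable sketch of that precedence rule (record shapes simplified):

records = [
    {"path": "/docs/a.md", "type": "filename"},  # from FILE_COLLECTION_NAME, appended first
    {"path": "/docs/a.md", "type": "chunk"},     # from VECTOR_COLLECTION_NAME
    {"path": "/docs/b.md", "type": "chunk"},
]
deduped, seen_paths = [], set()
for record in records:
    if record["path"] in seen_paths:
        continue                 # first occurrence wins
    seen_paths.add(record["path"])
    deduped.append(record)
print([r["type"] for r in deduped])  # ['filename', 'chunk'] — filename record kept for /docs/a.md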