From 2fa93a1eeb6994e054b8707fd85e04b723896f98 Mon Sep 17 00:00:00 2001 From: shiyu Date: Sat, 3 Jan 2026 15:12:20 +0800 Subject: [PATCH] feat: add vector and file collection constants, update vector index handling --- api/routers.py | 2 +- domain/ai/service.py | 2 ++ domain/processors/builtin/vector_index.py | 37 +++++++++++++--------- domain/virtual_fs/listing.py | 23 ++++++++++---- domain/virtual_fs/search/search_api.py | 6 ++-- domain/virtual_fs/search/search_service.py | 17 ++++++---- 6 files changed, 55 insertions(+), 32 deletions(-) diff --git a/api/routers.py b/api/routers.py index e66e85c..5fa3d51 100644 --- a/api/routers.py +++ b/api/routers.py @@ -19,8 +19,8 @@ from domain.audit import router as audit def include_routers(app: FastAPI): app.include_router(adapters.router) - app.include_router(virtual_fs.router) app.include_router(search_api.router) + app.include_router(virtual_fs.router) app.include_router(auth.router) app.include_router(config.router) app.include_router(processors.router) diff --git a/domain/ai/service.py b/domain/ai/service.py index 5469cf8..9157894 100644 --- a/domain/ai/service.py +++ b/domain/ai/service.py @@ -19,6 +19,8 @@ from .vector_providers import ( ) DEFAULT_VECTOR_DIMENSION = 4096 +VECTOR_COLLECTION_NAME = "vector_collection" +FILE_COLLECTION_NAME = "file_collection" OPENAI_EMBEDDING_DIMS = { "text-embedding-3-large": 3072, diff --git a/domain/processors/builtin/vector_index.py b/domain/processors/builtin/vector_index.py index e0044ad..f5ba2b1 100644 --- a/domain/processors/builtin/vector_index.py +++ b/domain/processors/builtin/vector_index.py @@ -9,7 +9,12 @@ from PIL import Image from ..base import BaseProcessor from domain.ai.inference import describe_image_base64, get_text_embedding, provider_service -from domain.ai.service import VectorDBService, DEFAULT_VECTOR_DIMENSION +from domain.ai.service import ( + VectorDBService, + DEFAULT_VECTOR_DIMENSION, + VECTOR_COLLECTION_NAME, + FILE_COLLECTION_NAME, +) CHUNK_SIZE = 800 @@ -112,18 +117,20 @@ class VectorIndexProcessor: action = config.get("action", "create") index_type = config.get("index_type", "vector") vector_db = VectorDBService() - collection_name = "vector_collection" + vector_collection = VECTOR_COLLECTION_NAME + file_collection = FILE_COLLECTION_NAME if action == "destroy": - await vector_db.delete_vector(collection_name, path) + target_collection = file_collection if index_type == "simple" else vector_collection + await vector_db.delete_vector(target_collection, path) return Response(content=f"文件 {path} 的 {index_type} 索引已销毁", media_type="text/plain") mime_type = _guess_mime(path) if index_type == "simple": - await vector_db.ensure_collection(collection_name, vector=False) - await vector_db.delete_vector(collection_name, path) - await vector_db.upsert_vector(collection_name, { + await vector_db.ensure_collection(file_collection, vector=False) + await vector_db.delete_vector(file_collection, path) + await vector_db.upsert_vector(file_collection, { "path": path, "source_path": path, "chunk_id": "filename", @@ -146,8 +153,8 @@ class VectorIndexProcessor: if vector_dim <= 0: vector_dim = DEFAULT_VECTOR_DIMENSION - await vector_db.ensure_collection(collection_name, vector=True, dim=vector_dim) - await vector_db.delete_vector(collection_name, path) + await vector_db.ensure_collection(vector_collection, vector=True, dim=vector_dim) + await vector_db.delete_vector(vector_collection, path) if file_ext in ["jpg", "jpeg", "png", "bmp"]: processed_bytes, compression = _compress_image_for_embedding(input_bytes) @@ -155,7 +162,7 @@ class VectorIndexProcessor: description = await describe_image_base64(base64_image) embedding = await get_text_embedding(description) image_mime = "image/jpeg" if compression else mime_type - await vector_db.upsert_vector(collection_name, { + await vector_db.upsert_vector(vector_collection, { "path": _chunk_key(path, "image"), "source_path": path, "chunk_id": "image", @@ -177,7 +184,7 @@ class VectorIndexProcessor: chunks = _chunk_text(text) if not chunks: - await vector_db.upsert_vector(collection_name, { + await vector_db.upsert_vector(vector_collection, { "path": _chunk_key(path, "0"), "source_path": path, "chunk_id": "0", @@ -194,7 +201,7 @@ class VectorIndexProcessor: chunk_count = 0 for chunk_id, chunk_text, start, end in chunks: embedding = await get_text_embedding(chunk_text) - await vector_db.upsert_vector(collection_name, { + await vector_db.upsert_vector(vector_collection, { "path": _chunk_key(path, str(chunk_id)), "source_path": path, "chunk_id": str(chunk_id), @@ -213,15 +220,15 @@ class VectorIndexProcessor: return Response(content="文本文件已索引", media_type="text/plain") # 其他类型暂未支持向量索引,回退为文件名索引 - await vector_db.delete_vector(collection_name, path) - await vector_db.upsert_vector(collection_name, { - "path": _chunk_key(path, "fallback"), + await vector_db.ensure_collection(file_collection, vector=False) + await vector_db.delete_vector(file_collection, path) + await vector_db.upsert_vector(file_collection, { + "path": path, "source_path": path, "chunk_id": "filename", "mime": mime_type, "type": "filename", "name": os.path.basename(path), - "embedding": [0.0] * vector_dim, }) return Response(content="暂不支持该类型的向量索引,已创建文件名索引", media_type="text/plain") diff --git a/domain/virtual_fs/listing.py b/domain/virtual_fs/listing.py index 4d3362e..f0b78eb 100644 --- a/domain/virtual_fs/listing.py +++ b/domain/virtual_fs/listing.py @@ -4,7 +4,7 @@ from fastapi import HTTPException from api.response import page from domain.adapters.registry import runtime_registry -from domain.ai.service import VectorDBService +from domain.ai.service import VectorDBService, VECTOR_COLLECTION_NAME, FILE_COLLECTION_NAME from domain.virtual_fs.thumbnail import is_image_filename, is_video_filename from models import StorageAdapter @@ -161,13 +161,19 @@ class VirtualFSListingMixin(VirtualFSResolverMixin): @classmethod async def _gather_vector_index(cls, full_path: str, limit: int = 20): vector_db = VectorDBService() - try: - raw_results = await vector_db.search_by_path("vector_collection", full_path, max(limit * 2, 20)) - except Exception: - return None - matched = [] - if raw_results: + had_success = False + fetch_limit = max(limit * 2, 20) + for collection_name in (VECTOR_COLLECTION_NAME, FILE_COLLECTION_NAME): + try: + raw_results = await vector_db.search_by_path(collection_name, full_path, fetch_limit) + except Exception: + continue + + if not raw_results: + had_success = True + continue + had_success = True buckets = raw_results if isinstance(raw_results, list) else [raw_results] for bucket in buckets: if not bucket: @@ -193,6 +199,9 @@ class VirtualFSListingMixin(VirtualFSResolverMixin): entry["preview_truncated"] = len(text) > preview_limit matched.append(entry) + if not had_success: + return None + if not matched: return {"total": 0, "entries": [], "by_type": {}, "has_more": False} diff --git a/domain/virtual_fs/search/search_api.py b/domain/virtual_fs/search/search_api.py index f5aa7e9..c53ae8b 100644 --- a/domain/virtual_fs/search/search_api.py +++ b/domain/virtual_fs/search/search_api.py @@ -1,5 +1,6 @@ from fastapi import APIRouter, Depends, Query +from api.response import success from domain.auth.service import get_current_active_user from domain.auth.types import User from domain.virtual_fs.search.search_service import VirtualFSSearchService @@ -17,10 +18,11 @@ async def search_files( user: User = Depends(get_current_active_user), ): if not q.strip(): - return {"items": [], "query": q} + return success({"items": [], "query": q, "mode": mode}) top_k = max(top_k, 1) page = max(page, 1) page_size = max(min(page_size, 100), 1) - return await VirtualFSSearchService.search(q, top_k, mode, page, page_size) + data = await VirtualFSSearchService.search(q, top_k, mode, page, page_size) + return success(data) diff --git a/domain/virtual_fs/search/search_service.py b/domain/virtual_fs/search/search_service.py index 0e8fd47..53be215 100644 --- a/domain/virtual_fs/search/search_service.py +++ b/domain/virtual_fs/search/search_service.py @@ -2,7 +2,7 @@ from typing import Any, Dict, List, Tuple from domain.virtual_fs.types import SearchResultItem from domain.ai.inference import get_text_embedding -from domain.ai.service import VectorDBService +from domain.ai.service import VectorDBService, VECTOR_COLLECTION_NAME, FILE_COLLECTION_NAME def _normalize_result(raw: Dict[str, Any], source: str, fallback_score: float = 0.0) -> SearchResultItem: @@ -53,7 +53,7 @@ async def _vector_search(query: str, top_k: int) -> List[SearchResultItem]: return [] try: - raw_results = await vector_db.search_vectors("vector_collection", embedding, max(top_k, 10)) + raw_results = await vector_db.search_vectors(VECTOR_COLLECTION_NAME, embedding, max(top_k, 10)) except Exception: return [] @@ -68,12 +68,15 @@ async def _filename_search(query: str, page: int, page_size: int) -> Tuple[List[ vector_db = VectorDBService() limit = max(page * page_size + 1, page_size * (page + 2)) limit = min(limit, 2000) - try: - raw_results = await vector_db.search_by_path("vector_collection", query, limit) - except Exception: - return [], False + records: List[Dict[str, Any]] = [] + for collection_name in (FILE_COLLECTION_NAME, VECTOR_COLLECTION_NAME): + try: + raw_results = await vector_db.search_by_path(collection_name, query, limit) + except Exception: + continue + if raw_results: + records.extend(raw_results[0] or []) - records = raw_results[0] if raw_results else [] deduped: List[SearchResultItem] = [] seen_paths: set[str] = set() for record in records or []: