feat: add vector and file collection constants, update vector index handling

Author: shiyu
Date: 2026-01-03 15:12:20 +08:00
parent ff7eb13187
commit 2fa93a1eeb
6 changed files with 55 additions and 32 deletions

View File

@@ -19,8 +19,8 @@ from domain.audit import router as audit
 def include_routers(app: FastAPI):
     app.include_router(adapters.router)
-    app.include_router(virtual_fs.router)
+    app.include_router(search_api.router)
+    app.include_router(virtual_fs.router)
     app.include_router(auth.router)
     app.include_router(config.router)
     app.include_router(processors.router)
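Note: FastAPI (via Starlette) matches routes in registration order, so moving search_api.router ahead of virtual_fs.router matters if virtual_fs exposes a greedy path parameter. A minimal sketch of the shadowing this ordering avoids; the route paths here are illustrative assumptions, not taken from this repo:

from fastapi import APIRouter, FastAPI

app = FastAPI()
fs_router = APIRouter()
search_router = APIRouter()

@fs_router.get("/files/{path:path}")  # hypothetical greedy catch-all over file paths
async def read_path(path: str):
    return {"path": path}

@search_router.get("/files/search")  # would be shadowed if registered after fs_router
async def search(q: str = ""):
    return {"q": q}

# First full match wins: register the specific router before the catch-all one.
app.include_router(search_router)
app.include_router(fs_router)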

View File

@@ -19,6 +19,8 @@ from .vector_providers import (
 )
 DEFAULT_VECTOR_DIMENSION = 4096
+VECTOR_COLLECTION_NAME = "vector_collection"
+FILE_COLLECTION_NAME = "file_collection"
 OPENAI_EMBEDDING_DIMS = {
     "text-embedding-3-large": 3072,

View File

@@ -9,7 +9,12 @@ from PIL import Image
 from ..base import BaseProcessor
 from domain.ai.inference import describe_image_base64, get_text_embedding, provider_service
-from domain.ai.service import VectorDBService, DEFAULT_VECTOR_DIMENSION
+from domain.ai.service import (
+    VectorDBService,
+    DEFAULT_VECTOR_DIMENSION,
+    VECTOR_COLLECTION_NAME,
+    FILE_COLLECTION_NAME,
+)
 CHUNK_SIZE = 800
@@ -112,18 +117,20 @@ class VectorIndexProcessor:
         action = config.get("action", "create")
         index_type = config.get("index_type", "vector")
         vector_db = VectorDBService()
-        collection_name = "vector_collection"
+        vector_collection = VECTOR_COLLECTION_NAME
+        file_collection = FILE_COLLECTION_NAME
         if action == "destroy":
-            await vector_db.delete_vector(collection_name, path)
+            target_collection = file_collection if index_type == "simple" else vector_collection
+            await vector_db.delete_vector(target_collection, path)
             return Response(content=f"文件 {path}{index_type} 索引已销毁", media_type="text/plain")
         mime_type = _guess_mime(path)
         if index_type == "simple":
-            await vector_db.ensure_collection(collection_name, vector=False)
-            await vector_db.delete_vector(collection_name, path)
-            await vector_db.upsert_vector(collection_name, {
+            await vector_db.ensure_collection(file_collection, vector=False)
+            await vector_db.delete_vector(file_collection, path)
+            await vector_db.upsert_vector(file_collection, {
                 "path": path,
                 "source_path": path,
                 "chunk_id": "filename",
@@ -146,8 +153,8 @@ class VectorIndexProcessor:
         if vector_dim <= 0:
             vector_dim = DEFAULT_VECTOR_DIMENSION
-        await vector_db.ensure_collection(collection_name, vector=True, dim=vector_dim)
-        await vector_db.delete_vector(collection_name, path)
+        await vector_db.ensure_collection(vector_collection, vector=True, dim=vector_dim)
+        await vector_db.delete_vector(vector_collection, path)
         if file_ext in ["jpg", "jpeg", "png", "bmp"]:
             processed_bytes, compression = _compress_image_for_embedding(input_bytes)
@@ -155,7 +162,7 @@ class VectorIndexProcessor:
             description = await describe_image_base64(base64_image)
             embedding = await get_text_embedding(description)
             image_mime = "image/jpeg" if compression else mime_type
-            await vector_db.upsert_vector(collection_name, {
+            await vector_db.upsert_vector(vector_collection, {
                 "path": _chunk_key(path, "image"),
                 "source_path": path,
                 "chunk_id": "image",
@@ -177,7 +184,7 @@ class VectorIndexProcessor:
             chunks = _chunk_text(text)
             if not chunks:
-                await vector_db.upsert_vector(collection_name, {
+                await vector_db.upsert_vector(vector_collection, {
                     "path": _chunk_key(path, "0"),
                     "source_path": path,
                     "chunk_id": "0",
@@ -194,7 +201,7 @@ class VectorIndexProcessor:
             chunk_count = 0
             for chunk_id, chunk_text, start, end in chunks:
                 embedding = await get_text_embedding(chunk_text)
-                await vector_db.upsert_vector(collection_name, {
+                await vector_db.upsert_vector(vector_collection, {
                     "path": _chunk_key(path, str(chunk_id)),
                     "source_path": path,
                     "chunk_id": str(chunk_id),
@@ -213,15 +220,15 @@ class VectorIndexProcessor:
             return Response(content="文本文件已索引", media_type="text/plain")
         # 其他类型暂未支持向量索引,回退为文件名索引
-        await vector_db.delete_vector(collection_name, path)
-        await vector_db.upsert_vector(collection_name, {
-            "path": _chunk_key(path, "fallback"),
+        await vector_db.ensure_collection(file_collection, vector=False)
+        await vector_db.delete_vector(file_collection, path)
+        await vector_db.upsert_vector(file_collection, {
+            "path": path,
             "source_path": path,
             "chunk_id": "filename",
             "mime": mime_type,
             "type": "filename",
             "name": os.path.basename(path),
-            "embedding": [0.0] * vector_dim,
         })
         return Response(content="暂不支持该类型的向量索引,已创建文件名索引", media_type="text/plain")

View File

@@ -4,7 +4,7 @@ from fastapi import HTTPException
 from api.response import page
 from domain.adapters.registry import runtime_registry
-from domain.ai.service import VectorDBService
+from domain.ai.service import VectorDBService, VECTOR_COLLECTION_NAME, FILE_COLLECTION_NAME
 from domain.virtual_fs.thumbnail import is_image_filename, is_video_filename
 from models import StorageAdapter
@@ -161,13 +161,19 @@ class VirtualFSListingMixin(VirtualFSResolverMixin):
     @classmethod
     async def _gather_vector_index(cls, full_path: str, limit: int = 20):
         vector_db = VectorDBService()
-        try:
-            raw_results = await vector_db.search_by_path("vector_collection", full_path, max(limit * 2, 20))
-        except Exception:
-            return None
         matched = []
-        if raw_results:
+        had_success = False
+        fetch_limit = max(limit * 2, 20)
+        for collection_name in (VECTOR_COLLECTION_NAME, FILE_COLLECTION_NAME):
+            try:
+                raw_results = await vector_db.search_by_path(collection_name, full_path, fetch_limit)
+            except Exception:
+                continue
+            if not raw_results:
+                had_success = True
+                continue
+            had_success = True
             buckets = raw_results if isinstance(raw_results, list) else [raw_results]
             for bucket in buckets:
                 if not bucket:
@@ -193,6 +199,9 @@ class VirtualFSListingMixin(VirtualFSResolverMixin):
                 entry["preview_truncated"] = len(text) > preview_limit
                 matched.append(entry)
+        if not had_success:
+            return None
         if not matched:
             return {"total": 0, "entries": [], "by_type": {}, "has_more": False}

View File

@@ -1,5 +1,6 @@
 from fastapi import APIRouter, Depends, Query
+from api.response import success
 from domain.auth.service import get_current_active_user
 from domain.auth.types import User
 from domain.virtual_fs.search.search_service import VirtualFSSearchService
@@ -17,10 +18,11 @@ async def search_files(
     user: User = Depends(get_current_active_user),
 ):
     if not q.strip():
-        return {"items": [], "query": q}
+        return success({"items": [], "query": q, "mode": mode})
     top_k = max(top_k, 1)
     page = max(page, 1)
     page_size = max(min(page_size, 100), 1)
-    return await VirtualFSSearchService.search(q, top_k, mode, page, page_size)
+    data = await VirtualFSSearchService.search(q, top_k, mode, page, page_size)
+    return success(data)
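Both return paths now go through the shared success envelope from api.response, and the empty-query short-circuit gains the mode field so its shape matches a real result. The diff does not show success itself; a conventional envelope of this kind looks like the sketch below (field names are assumptions, not this repo's actual contract):

def success(data):
    # assumed envelope shape; the real api.response.success may differ
    return {"code": 0, "message": "ok", "data": data}

print(success({"items": [], "query": "", "mode": "vector"}))
# {'code': 0, 'message': 'ok', 'data': {'items': [], 'query': '', 'mode': 'vector'}}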

View File

@@ -2,7 +2,7 @@ from typing import Any, Dict, List, Tuple
 from domain.virtual_fs.types import SearchResultItem
 from domain.ai.inference import get_text_embedding
-from domain.ai.service import VectorDBService
+from domain.ai.service import VectorDBService, VECTOR_COLLECTION_NAME, FILE_COLLECTION_NAME
 def _normalize_result(raw: Dict[str, Any], source: str, fallback_score: float = 0.0) -> SearchResultItem:
@@ -53,7 +53,7 @@ async def _vector_search(query: str, top_k: int) -> List[SearchResultItem]:
         return []
     try:
-        raw_results = await vector_db.search_vectors("vector_collection", embedding, max(top_k, 10))
+        raw_results = await vector_db.search_vectors(VECTOR_COLLECTION_NAME, embedding, max(top_k, 10))
     except Exception:
         return []
@@ -68,12 +68,15 @@ async def _filename_search(query: str, page: int, page_size: int) -> Tuple[List[
     vector_db = VectorDBService()
     limit = max(page * page_size + 1, page_size * (page + 2))
     limit = min(limit, 2000)
-    try:
-        raw_results = await vector_db.search_by_path("vector_collection", query, limit)
-    except Exception:
-        return [], False
-    records = raw_results[0] if raw_results else []
+    records: List[Dict[str, Any]] = []
+    for collection_name in (FILE_COLLECTION_NAME, VECTOR_COLLECTION_NAME):
+        try:
+            raw_results = await vector_db.search_by_path(collection_name, query, limit)
+        except Exception:
+            continue
+        if raw_results:
+            records.extend(raw_results[0] or [])
     deduped: List[SearchResultItem] = []
     seen_paths: set[str] = set()
     for record in records or []:
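Filename search now aggregates records from both collections instead of reading only the first result bucket of vector_collection, and a failure in one collection no longer aborts the whole search. Because FILE_COLLECTION_NAME is queried first, the first-seen dedupe that follows keeps its filename record whenever a vector-collection record normalizes to the same path. A runnable sketch of that precedence rule (record shapes simplified):

records = [
    {"path": "/docs/a.md", "type": "filename"},  # from FILE_COLLECTION_NAME, appended first
    {"path": "/docs/a.md", "type": "chunk"},     # from VECTOR_COLLECTION_NAME
    {"path": "/docs/b.md", "type": "chunk"},
]
deduped, seen_paths = [], set()
for record in records:
    if record["path"] in seen_paths:
        continue                 # first occurrence wins
    seen_paths.add(record["path"])
    deduped.append(record)
print([r["type"] for r in deduped])  # ['filename', 'chunk'] — filename record kept for /docs/a.md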