mirror of
https://github.com/DrizzleTime/Foxel.git
synced 2026-05-06 18:22:44 +08:00
feat: add vector and file collection constants, update vector index handling
This commit is contained in:
@@ -19,8 +19,8 @@ from domain.audit import router as audit
|
||||
|
||||
def include_routers(app: FastAPI):
|
||||
app.include_router(adapters.router)
|
||||
app.include_router(virtual_fs.router)
|
||||
app.include_router(search_api.router)
|
||||
app.include_router(virtual_fs.router)
|
||||
app.include_router(auth.router)
|
||||
app.include_router(config.router)
|
||||
app.include_router(processors.router)
|
||||
|
||||
@@ -19,6 +19,8 @@ from .vector_providers import (
|
||||
)
|
||||
|
||||
DEFAULT_VECTOR_DIMENSION = 4096
|
||||
VECTOR_COLLECTION_NAME = "vector_collection"
|
||||
FILE_COLLECTION_NAME = "file_collection"
|
||||
|
||||
OPENAI_EMBEDDING_DIMS = {
|
||||
"text-embedding-3-large": 3072,
|
||||
|
||||
@@ -9,7 +9,12 @@ from PIL import Image
|
||||
|
||||
from ..base import BaseProcessor
|
||||
from domain.ai.inference import describe_image_base64, get_text_embedding, provider_service
|
||||
from domain.ai.service import VectorDBService, DEFAULT_VECTOR_DIMENSION
|
||||
from domain.ai.service import (
|
||||
VectorDBService,
|
||||
DEFAULT_VECTOR_DIMENSION,
|
||||
VECTOR_COLLECTION_NAME,
|
||||
FILE_COLLECTION_NAME,
|
||||
)
|
||||
|
||||
|
||||
CHUNK_SIZE = 800
|
||||
@@ -112,18 +117,20 @@ class VectorIndexProcessor:
|
||||
action = config.get("action", "create")
|
||||
index_type = config.get("index_type", "vector")
|
||||
vector_db = VectorDBService()
|
||||
collection_name = "vector_collection"
|
||||
vector_collection = VECTOR_COLLECTION_NAME
|
||||
file_collection = FILE_COLLECTION_NAME
|
||||
|
||||
if action == "destroy":
|
||||
await vector_db.delete_vector(collection_name, path)
|
||||
target_collection = file_collection if index_type == "simple" else vector_collection
|
||||
await vector_db.delete_vector(target_collection, path)
|
||||
return Response(content=f"文件 {path} 的 {index_type} 索引已销毁", media_type="text/plain")
|
||||
|
||||
mime_type = _guess_mime(path)
|
||||
|
||||
if index_type == "simple":
|
||||
await vector_db.ensure_collection(collection_name, vector=False)
|
||||
await vector_db.delete_vector(collection_name, path)
|
||||
await vector_db.upsert_vector(collection_name, {
|
||||
await vector_db.ensure_collection(file_collection, vector=False)
|
||||
await vector_db.delete_vector(file_collection, path)
|
||||
await vector_db.upsert_vector(file_collection, {
|
||||
"path": path,
|
||||
"source_path": path,
|
||||
"chunk_id": "filename",
|
||||
@@ -146,8 +153,8 @@ class VectorIndexProcessor:
|
||||
if vector_dim <= 0:
|
||||
vector_dim = DEFAULT_VECTOR_DIMENSION
|
||||
|
||||
await vector_db.ensure_collection(collection_name, vector=True, dim=vector_dim)
|
||||
await vector_db.delete_vector(collection_name, path)
|
||||
await vector_db.ensure_collection(vector_collection, vector=True, dim=vector_dim)
|
||||
await vector_db.delete_vector(vector_collection, path)
|
||||
|
||||
if file_ext in ["jpg", "jpeg", "png", "bmp"]:
|
||||
processed_bytes, compression = _compress_image_for_embedding(input_bytes)
|
||||
@@ -155,7 +162,7 @@ class VectorIndexProcessor:
|
||||
description = await describe_image_base64(base64_image)
|
||||
embedding = await get_text_embedding(description)
|
||||
image_mime = "image/jpeg" if compression else mime_type
|
||||
await vector_db.upsert_vector(collection_name, {
|
||||
await vector_db.upsert_vector(vector_collection, {
|
||||
"path": _chunk_key(path, "image"),
|
||||
"source_path": path,
|
||||
"chunk_id": "image",
|
||||
@@ -177,7 +184,7 @@ class VectorIndexProcessor:
|
||||
|
||||
chunks = _chunk_text(text)
|
||||
if not chunks:
|
||||
await vector_db.upsert_vector(collection_name, {
|
||||
await vector_db.upsert_vector(vector_collection, {
|
||||
"path": _chunk_key(path, "0"),
|
||||
"source_path": path,
|
||||
"chunk_id": "0",
|
||||
@@ -194,7 +201,7 @@ class VectorIndexProcessor:
|
||||
chunk_count = 0
|
||||
for chunk_id, chunk_text, start, end in chunks:
|
||||
embedding = await get_text_embedding(chunk_text)
|
||||
await vector_db.upsert_vector(collection_name, {
|
||||
await vector_db.upsert_vector(vector_collection, {
|
||||
"path": _chunk_key(path, str(chunk_id)),
|
||||
"source_path": path,
|
||||
"chunk_id": str(chunk_id),
|
||||
@@ -213,15 +220,15 @@ class VectorIndexProcessor:
|
||||
return Response(content="文本文件已索引", media_type="text/plain")
|
||||
|
||||
# 其他类型暂未支持向量索引,回退为文件名索引
|
||||
await vector_db.delete_vector(collection_name, path)
|
||||
await vector_db.upsert_vector(collection_name, {
|
||||
"path": _chunk_key(path, "fallback"),
|
||||
await vector_db.ensure_collection(file_collection, vector=False)
|
||||
await vector_db.delete_vector(file_collection, path)
|
||||
await vector_db.upsert_vector(file_collection, {
|
||||
"path": path,
|
||||
"source_path": path,
|
||||
"chunk_id": "filename",
|
||||
"mime": mime_type,
|
||||
"type": "filename",
|
||||
"name": os.path.basename(path),
|
||||
"embedding": [0.0] * vector_dim,
|
||||
})
|
||||
return Response(content="暂不支持该类型的向量索引,已创建文件名索引", media_type="text/plain")
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ from fastapi import HTTPException
|
||||
|
||||
from api.response import page
|
||||
from domain.adapters.registry import runtime_registry
|
||||
from domain.ai.service import VectorDBService
|
||||
from domain.ai.service import VectorDBService, VECTOR_COLLECTION_NAME, FILE_COLLECTION_NAME
|
||||
from domain.virtual_fs.thumbnail import is_image_filename, is_video_filename
|
||||
from models import StorageAdapter
|
||||
|
||||
@@ -161,13 +161,19 @@ class VirtualFSListingMixin(VirtualFSResolverMixin):
|
||||
@classmethod
|
||||
async def _gather_vector_index(cls, full_path: str, limit: int = 20):
|
||||
vector_db = VectorDBService()
|
||||
try:
|
||||
raw_results = await vector_db.search_by_path("vector_collection", full_path, max(limit * 2, 20))
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
matched = []
|
||||
if raw_results:
|
||||
had_success = False
|
||||
fetch_limit = max(limit * 2, 20)
|
||||
for collection_name in (VECTOR_COLLECTION_NAME, FILE_COLLECTION_NAME):
|
||||
try:
|
||||
raw_results = await vector_db.search_by_path(collection_name, full_path, fetch_limit)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
if not raw_results:
|
||||
had_success = True
|
||||
continue
|
||||
had_success = True
|
||||
buckets = raw_results if isinstance(raw_results, list) else [raw_results]
|
||||
for bucket in buckets:
|
||||
if not bucket:
|
||||
@@ -193,6 +199,9 @@ class VirtualFSListingMixin(VirtualFSResolverMixin):
|
||||
entry["preview_truncated"] = len(text) > preview_limit
|
||||
matched.append(entry)
|
||||
|
||||
if not had_success:
|
||||
return None
|
||||
|
||||
if not matched:
|
||||
return {"total": 0, "entries": [], "by_type": {}, "has_more": False}
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from fastapi import APIRouter, Depends, Query
|
||||
|
||||
from api.response import success
|
||||
from domain.auth.service import get_current_active_user
|
||||
from domain.auth.types import User
|
||||
from domain.virtual_fs.search.search_service import VirtualFSSearchService
|
||||
@@ -17,10 +18,11 @@ async def search_files(
|
||||
user: User = Depends(get_current_active_user),
|
||||
):
|
||||
if not q.strip():
|
||||
return {"items": [], "query": q}
|
||||
return success({"items": [], "query": q, "mode": mode})
|
||||
|
||||
top_k = max(top_k, 1)
|
||||
page = max(page, 1)
|
||||
page_size = max(min(page_size, 100), 1)
|
||||
|
||||
return await VirtualFSSearchService.search(q, top_k, mode, page, page_size)
|
||||
data = await VirtualFSSearchService.search(q, top_k, mode, page, page_size)
|
||||
return success(data)
|
||||
|
||||
@@ -2,7 +2,7 @@ from typing import Any, Dict, List, Tuple
|
||||
|
||||
from domain.virtual_fs.types import SearchResultItem
|
||||
from domain.ai.inference import get_text_embedding
|
||||
from domain.ai.service import VectorDBService
|
||||
from domain.ai.service import VectorDBService, VECTOR_COLLECTION_NAME, FILE_COLLECTION_NAME
|
||||
|
||||
|
||||
def _normalize_result(raw: Dict[str, Any], source: str, fallback_score: float = 0.0) -> SearchResultItem:
|
||||
@@ -53,7 +53,7 @@ async def _vector_search(query: str, top_k: int) -> List[SearchResultItem]:
|
||||
return []
|
||||
|
||||
try:
|
||||
raw_results = await vector_db.search_vectors("vector_collection", embedding, max(top_k, 10))
|
||||
raw_results = await vector_db.search_vectors(VECTOR_COLLECTION_NAME, embedding, max(top_k, 10))
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
@@ -68,12 +68,15 @@ async def _filename_search(query: str, page: int, page_size: int) -> Tuple[List[
|
||||
vector_db = VectorDBService()
|
||||
limit = max(page * page_size + 1, page_size * (page + 2))
|
||||
limit = min(limit, 2000)
|
||||
try:
|
||||
raw_results = await vector_db.search_by_path("vector_collection", query, limit)
|
||||
except Exception:
|
||||
return [], False
|
||||
records: List[Dict[str, Any]] = []
|
||||
for collection_name in (FILE_COLLECTION_NAME, VECTOR_COLLECTION_NAME):
|
||||
try:
|
||||
raw_results = await vector_db.search_by_path(collection_name, query, limit)
|
||||
except Exception:
|
||||
continue
|
||||
if raw_results:
|
||||
records.extend(raw_results[0] or [])
|
||||
|
||||
records = raw_results[0] if raw_results else []
|
||||
deduped: List[SearchResultItem] = []
|
||||
seen_paths: set[str] = set()
|
||||
for record in records or []:
|
||||
|
||||
Reference in New Issue
Block a user