mirror of
https://github.com/DrizzleTime/Foxel.git
synced 2026-06-01 05:30:31 +08:00
feat: Enhance vector database providers with source path handling and improved search functionality
This commit is contained in:
@@ -1,4 +1,7 @@
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
from fastapi import APIRouter, Depends, Query
|
||||
|
||||
from schemas.fs import SearchResultItem
|
||||
from services.auth import get_current_active_user, User
|
||||
from services.ai import get_text_embedding
|
||||
@@ -6,24 +9,96 @@ from services.vector_db import VectorDBService
|
||||
|
||||
router = APIRouter(prefix="/api/search", tags=["search"])
|
||||
|
||||
async def search_files_by_vector(q: str, top_k: int):
|
||||
embedding = await get_text_embedding(q)
|
||||
vector_db = VectorDBService()
|
||||
results = await vector_db.search_vectors("vector_collection", embedding, top_k)
|
||||
items = [
|
||||
SearchResultItem(id=res["id"], path=res["entity"]["path"], score=res["distance"])
|
||||
for res in results[0]
|
||||
]
|
||||
return {"items": items, "query": q}
|
||||
|
||||
async def search_files_by_name(q: str, top_k: int):
|
||||
def _normalize_result(raw: Dict[str, Any], source: str, fallback_score: float = 0.0) -> SearchResultItem:
|
||||
entity = dict(raw.get("entity") or {})
|
||||
source_path = entity.get("source_path")
|
||||
stored_path = entity.get("path")
|
||||
path = source_path or stored_path or ""
|
||||
chunk_id_value = entity.get("chunk_id")
|
||||
chunk_id = str(chunk_id_value) if chunk_id_value is not None else None
|
||||
snippet = entity.get("text") or entity.get("description") or entity.get("name")
|
||||
mime = entity.get("mime")
|
||||
start_offset = entity.get("start_offset")
|
||||
end_offset = entity.get("end_offset")
|
||||
raw_score = raw.get("distance")
|
||||
score = float(raw_score) if raw_score is not None else fallback_score
|
||||
|
||||
metadata = {
|
||||
"retrieval_source": source,
|
||||
"raw_distance": raw_score,
|
||||
}
|
||||
if stored_path and stored_path != path:
|
||||
metadata["stored_path"] = stored_path
|
||||
vector_id = entity.get("vector_id")
|
||||
if vector_id:
|
||||
metadata["vector_id"] = vector_id
|
||||
|
||||
return SearchResultItem(
|
||||
id=str(raw.get("id")),
|
||||
path=path,
|
||||
score=score,
|
||||
chunk_id=chunk_id,
|
||||
snippet=snippet,
|
||||
mime=mime,
|
||||
source_type=entity.get("type") or source,
|
||||
start_offset=start_offset,
|
||||
end_offset=end_offset,
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
|
||||
async def _vector_search(query: str, top_k: int) -> List[SearchResultItem]:
|
||||
vector_db = VectorDBService()
|
||||
results = await vector_db.search_by_path("vector_collection", q, top_k)
|
||||
items = [
|
||||
SearchResultItem(id=idx, path=res["entity"]["path"], score=res["distance"])
|
||||
for idx, res in enumerate(results[0])
|
||||
]
|
||||
return {"items": items, "query": q}
|
||||
try:
|
||||
embedding = await get_text_embedding(query)
|
||||
except Exception:
|
||||
embedding = None
|
||||
if not embedding:
|
||||
return []
|
||||
|
||||
try:
|
||||
raw_results = await vector_db.search_vectors("vector_collection", embedding, max(top_k, 10))
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
results: List[SearchResultItem] = []
|
||||
for bucket in raw_results or []:
|
||||
for record in bucket or []:
|
||||
results.append(_normalize_result(record, "vector"))
|
||||
return results
|
||||
|
||||
|
||||
async def _filename_search(query: str, page: int, page_size: int) -> Tuple[List[SearchResultItem], bool]:
|
||||
vector_db = VectorDBService()
|
||||
limit = max(page * page_size + 1, page_size * (page + 2))
|
||||
limit = min(limit, 2000)
|
||||
try:
|
||||
raw_results = await vector_db.search_by_path("vector_collection", query, limit)
|
||||
except Exception:
|
||||
return [], False
|
||||
|
||||
records = raw_results[0] if raw_results else []
|
||||
deduped: List[SearchResultItem] = []
|
||||
seen_paths: set[str] = set()
|
||||
for record in records or []:
|
||||
item = _normalize_result(record, "filename", fallback_score=1.0)
|
||||
stored_path = item.metadata.get("stored_path") if item.metadata else None
|
||||
key = item.path or stored_path or ""
|
||||
if key in seen_paths:
|
||||
continue
|
||||
seen_paths.add(key)
|
||||
deduped.append(item)
|
||||
|
||||
start = max(page - 1, 0) * page_size
|
||||
end = start + page_size
|
||||
page_items = deduped[start:end]
|
||||
for offset, item in enumerate(page_items):
|
||||
if item.metadata is None:
|
||||
item.metadata = {}
|
||||
item.metadata.setdefault("retrieval_rank", start + offset)
|
||||
has_more = len(deduped) > end
|
||||
return page_items, has_more
|
||||
|
||||
|
||||
@router.get("")
|
||||
@@ -31,11 +106,32 @@ async def search_files(
|
||||
q: str = Query(..., description="搜索查询"),
|
||||
top_k: int = Query(10, description="返回结果数量"),
|
||||
mode: str = Query("vector", description="搜索模式: 'vector' 或 'filename'"),
|
||||
page: int = Query(1, description="分页页码,仅在文件名搜索模式下生效"),
|
||||
page_size: int = Query(10, description="分页大小,仅在文件名搜索模式下生效"),
|
||||
user: User = Depends(get_current_active_user),
|
||||
):
|
||||
if not q.strip():
|
||||
return {"items": [], "query": q}
|
||||
|
||||
top_k = max(top_k, 1)
|
||||
page = max(page, 1)
|
||||
page_size = max(min(page_size, 100), 1)
|
||||
|
||||
if mode == "vector":
|
||||
return await search_files_by_vector(q, top_k)
|
||||
items = (await _vector_search(q, top_k))[:top_k]
|
||||
elif mode == "filename":
|
||||
return await search_files_by_name(q, top_k)
|
||||
items, has_more = await _filename_search(q, page, page_size)
|
||||
return {
|
||||
"items": items,
|
||||
"query": q,
|
||||
"mode": mode,
|
||||
"pagination": {
|
||||
"page": page,
|
||||
"page_size": page_size,
|
||||
"has_more": has_more,
|
||||
},
|
||||
}
|
||||
else:
|
||||
return {"items": [], "query": q, "error": "Invalid search mode"}
|
||||
items = (await _vector_search(q, top_k))[:top_k]
|
||||
|
||||
return {"items": items, "query": q, "mode": mode}
|
||||
|
||||
Reference in New Issue
Block a user