feat: Enhance vector database providers with source path handling and improved search functionality

This commit is contained in:
shiyu
2025-09-27 13:34:18 +08:00
parent ee6e570ccb
commit a4af9475ef
10 changed files with 1082 additions and 353 deletions

View File

@@ -50,15 +50,20 @@ class MilvusLiteProvider(BaseVectorProvider):
client = self._get_client()
if client.has_collection(collection_name):
return
common_fields = [
FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=512, is_primary=True, auto_id=False),
FieldSchema(name="source_path", dtype=DataType.VARCHAR, max_length=512, is_primary=False, auto_id=False),
]
if vector:
vector_dim = dim if isinstance(dim, int) and dim > 0 else 0
if vector_dim <= 0:
vector_dim = 4096
fields = [
FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=512, is_primary=True, auto_id=False),
*common_fields,
FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=vector_dim),
]
schema = CollectionSchema(fields, description="Image vector collection")
schema = CollectionSchema(fields, description="Vector collection", enable_dynamic_field=True)
client.create_collection(collection_name, schema=schema)
index_params = MilvusClient.prepare_index_params()
index_params.add_index(
@@ -70,38 +75,98 @@ class MilvusLiteProvider(BaseVectorProvider):
)
client.create_index(collection_name, index_params=index_params)
else:
fields = [
FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=512, is_primary=True, auto_id=False),
]
schema = CollectionSchema(fields, description="Simple file index")
schema = CollectionSchema(common_fields, description="Simple file index", enable_dynamic_field=True)
client.create_collection(collection_name, schema=schema)
def upsert_vector(self, collection_name: str, data: Dict[str, Any]) -> None:
self._get_client().upsert(collection_name, data)
payload = dict(data)
payload.setdefault("source_path", payload.get("path"))
payload.setdefault("vector_id", payload.get("path"))
self._get_client().upsert(collection_name, data=[payload])
def delete_vector(self, collection_name: str, path: str) -> None:
self._get_client().delete(collection_name, ids=[path])
client = self._get_client()
escaped = path.replace('"', '\\"')
client.delete(collection_name, filter=f'source_path == "{escaped}"')
def search_vectors(self, collection_name: str, query_embedding, top_k: int):
search_params = {"metric_type": "COSINE"}
return self._get_client().search(
output_fields = [
"path",
"source_path",
"chunk_id",
"mime",
"text",
"start_offset",
"end_offset",
"type",
"name",
]
raw_results = self._get_client().search(
collection_name,
data=[query_embedding],
anns_field="embedding",
search_params=search_params,
limit=top_k,
output_fields=["path"],
output_fields=output_fields,
)
formatted: List[List[Dict[str, Any]]] = []
for hits in raw_results:
bucket: List[Dict[str, Any]] = []
for hit in hits:
if hasattr(hit, "entity"):
entity = dict(getattr(hit, "entity", {}) or {})
hit_id = getattr(hit, "id", None)
distance = getattr(hit, "distance", None)
elif isinstance(hit, dict):
entity = dict((hit.get("entity") or {}))
hit_id = hit.get("id")
distance = hit.get("distance")
else:
entity = {}
hit_id = None
distance = None
entity.setdefault("path", entity.get("source_path"))
bucket.append({
"id": hit_id,
"distance": distance,
"entity": entity,
})
formatted.append(bucket)
return formatted
def search_by_path(self, collection_name: str, query_path: str, top_k: int):
filter_expr = f"path like '%{query_path}%'" if query_path else "path like '%%'"
if query_path:
escaped = query_path.replace('"', '\\"')
filter_expr = f'source_path like "%{escaped}%"'
else:
filter_expr = "source_path like '%%'"
results = self._get_client().query(
collection_name,
filter=filter_expr,
limit=top_k,
output_fields=["path"],
output_fields=[
"path",
"source_path",
"chunk_id",
"mime",
"text",
"start_offset",
"end_offset",
"type",
"name",
],
)
return [[{"id": r["path"], "distance": 1.0, "entity": {"path": r["path"]}} for r in results]]
formatted = []
for row in results:
entity = dict(row)
entity.setdefault("path", entity.get("source_path"))
formatted.append({
"id": entity.get("path"),
"distance": 1.0,
"entity": entity,
})
return [formatted]
def get_all_stats(self) -> Dict[str, Any]:
client = self._get_client()

View File

@@ -58,15 +58,19 @@ class MilvusServerProvider(BaseVectorProvider):
client = self._get_client()
if client.has_collection(collection_name):
return
common_fields = [
FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=512, is_primary=True, auto_id=False),
FieldSchema(name="source_path", dtype=DataType.VARCHAR, max_length=512, is_primary=False, auto_id=False),
]
if vector:
vector_dim = dim if isinstance(dim, int) and dim > 0 else 0
if vector_dim <= 0:
vector_dim = 4096
fields = [
FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=512, is_primary=True, auto_id=False),
*common_fields,
FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=vector_dim),
]
schema = CollectionSchema(fields, description="Image vector collection")
schema = CollectionSchema(fields, description="Vector collection", enable_dynamic_field=True)
client.create_collection(collection_name, schema=schema)
index_params = MilvusClient.prepare_index_params()
index_params.add_index(
@@ -78,38 +82,98 @@ class MilvusServerProvider(BaseVectorProvider):
)
client.create_index(collection_name, index_params=index_params)
else:
fields = [
FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=512, is_primary=True, auto_id=False),
]
schema = CollectionSchema(fields, description="Simple file index")
schema = CollectionSchema(common_fields, description="Simple file index", enable_dynamic_field=True)
client.create_collection(collection_name, schema=schema)
def upsert_vector(self, collection_name: str, data: Dict[str, Any]) -> None:
self._get_client().upsert(collection_name, data)
payload = dict(data)
payload.setdefault("source_path", payload.get("path"))
payload.setdefault("vector_id", payload.get("path"))
self._get_client().upsert(collection_name, data=[payload])
def delete_vector(self, collection_name: str, path: str) -> None:
self._get_client().delete(collection_name, ids=[path])
client = self._get_client()
escaped = path.replace('"', '\\"')
client.delete(collection_name, filter=f'source_path == "{escaped}"')
def search_vectors(self, collection_name: str, query_embedding, top_k: int):
search_params = {"metric_type": "COSINE"}
return self._get_client().search(
output_fields = [
"path",
"source_path",
"chunk_id",
"mime",
"text",
"start_offset",
"end_offset",
"type",
"name",
]
raw_results = self._get_client().search(
collection_name,
data=[query_embedding],
anns_field="embedding",
search_params=search_params,
limit=top_k,
output_fields=["path"],
output_fields=output_fields,
)
formatted: List[List[Dict[str, Any]]] = []
for hits in raw_results:
bucket: List[Dict[str, Any]] = []
for hit in hits:
if hasattr(hit, "entity"):
entity = dict(getattr(hit, "entity", {}) or {})
hit_id = getattr(hit, "id", None)
distance = getattr(hit, "distance", None)
elif isinstance(hit, dict):
entity = dict((hit.get("entity") or {}))
hit_id = hit.get("id")
distance = hit.get("distance")
else:
entity = {}
hit_id = None
distance = None
entity.setdefault("path", entity.get("source_path"))
bucket.append({
"id": hit_id,
"distance": distance,
"entity": entity,
})
formatted.append(bucket)
return formatted
def search_by_path(self, collection_name: str, query_path: str, top_k: int):
filter_expr = f"path like '%{query_path}%'" if query_path else "path like '%%'"
if query_path:
escaped = query_path.replace('"', '\\"')
filter_expr = f'source_path like "%{escaped}%"'
else:
filter_expr = "source_path like '%%'"
results = self._get_client().query(
collection_name,
filter=filter_expr,
limit=top_k,
output_fields=["path"],
output_fields=[
"path",
"source_path",
"chunk_id",
"mime",
"text",
"start_offset",
"end_offset",
"type",
"name",
],
)
return [[{"id": r["path"], "distance": 1.0, "entity": {"path": r["path"]}} for r in results]]
formatted = []
for row in results:
entity = dict(row)
entity.setdefault("path", entity.get("source_path"))
formatted.append({
"id": entity.get("path"),
"distance": 1.0,
"entity": entity,
})
return [formatted]
def get_all_stats(self) -> Dict[str, Any]:
client = self._get_client()

View File

@@ -58,29 +58,59 @@ class QdrantProvider(BaseVectorProvider):
size = dim if vector and isinstance(dim, int) and dim > 0 else 1
return qmodels.VectorParams(size=size, distance=qmodels.Distance.COSINE)
def _ensure_payload_indexes(self, client: QdrantClient, collection_name: str) -> None:
for field in ("path", "source_path"):
try:
client.create_payload_index(
collection_name=collection_name,
field_name=field,
field_schema="keyword",
)
except Exception as exc: # pragma: no cover - 依赖外部服务
message = str(exc).lower()
if "already exists" in message or "index exists" in message:
continue
# 旧版本 qdrant 可能返回带状态码的异常,这里容忍重复创建
raise
def ensure_collection(self, collection_name: str, vector: bool, dim: int) -> None:
client = self._get_client()
try:
if client.collection_exists(collection_name):
return
exists = client.collection_exists(collection_name)
except Exception as exc: # pragma: no cover - 依赖外部服务
raise RuntimeError(f"Failed to check Qdrant collection '{collection_name}': {exc}") from exc
if exists:
try:
self._ensure_payload_indexes(client, collection_name)
except Exception:
pass
return
vectors_config = self._vector_params(vector, dim)
try:
client.create_collection(collection_name=collection_name, vectors_config=vectors_config)
except Exception as exc: # pragma: no cover
if "already exists" in str(exc).lower():
try:
self._ensure_payload_indexes(client, collection_name)
except Exception:
pass
return
raise RuntimeError(f"Failed to create Qdrant collection '{collection_name}': {exc}") from exc
try:
self._ensure_payload_indexes(client, collection_name)
except Exception:
pass
@staticmethod
def _point_id(path: str) -> str:
return str(uuid5(NAMESPACE_URL, path))
def _point_id(uid: str) -> str:
return str(uuid5(NAMESPACE_URL, uid))
def _prepare_point(self, data: Dict[str, Any]) -> qmodels.PointStruct:
path = data.get("path")
if not path:
uid = data.get("path")
if not uid:
raise ValueError("Qdrant upsert requires 'path' in data")
embedding = data.get("embedding")
@@ -89,8 +119,11 @@ class QdrantProvider(BaseVectorProvider):
else:
vector = [float(x) for x in embedding]
payload = {"path": path}
return qmodels.PointStruct(id=self._point_id(path), vector=vector, payload=payload)
payload = {k: v for k, v in data.items() if k != "embedding"}
payload.setdefault("vector_id", uid)
source_path = payload.get("source_path") or payload.get("path")
payload["path"] = source_path
return qmodels.PointStruct(id=self._point_id(str(uid)), vector=vector, payload=payload)
def upsert_vector(self, collection_name: str, data: Dict[str, Any]) -> None:
client = self._get_client()
@@ -99,7 +132,12 @@ class QdrantProvider(BaseVectorProvider):
def delete_vector(self, collection_name: str, path: str) -> None:
client = self._get_client()
selector = qmodels.PointIdsList(points=[self._point_id(path)])
condition = qmodels.FieldCondition(
key="path",
match=qmodels.MatchValue(value=path),
)
flt = qmodels.Filter(must=[condition])
selector = qmodels.FilterSelector(filter=flt)
client.delete(collection_name=collection_name, points_selector=selector, wait=True)
def _format_search_results(self, points: Sequence[qmodels.ScoredPoint]):
@@ -107,7 +145,7 @@ class QdrantProvider(BaseVectorProvider):
{
"id": point.id,
"distance": point.score,
"entity": {"path": (point.payload or {}).get("path")},
"entity": point.payload or {},
}
for point in points
]
@@ -141,11 +179,11 @@ class QdrantProvider(BaseVectorProvider):
break
for record in records:
path = (record.payload or {}).get("path")
if query_path and path:
if query_path not in path:
continue
results.append({"id": record.id, "distance": 1.0, "entity": {"path": path}})
payload = record.payload or {}
path = payload.get("path")
if query_path and path and query_path not in path:
continue
results.append({"id": record.id, "distance": 1.0, "entity": payload})
if len(results) >= top_k:
break