Files
Foxel/services/processors/vector_index.py

100 lines
4.1 KiB
Python

from typing import Dict, Any
from fastapi.responses import Response
import base64
from services.ai import describe_image_base64, get_text_embedding
from services.vector_db import VectorDBService, DEFAULT_VECTOR_DIMENSION
from services.logging import LogService
from services.config import ConfigCenter
class VectorIndexProcessor:
name = "向量索引"
supported_exts = ["jpg", "jpeg", "png", "bmp", "txt", "md"]
config_schema = [
{
"key": "action", "label": "操作", "type": "select", "required": True, "default": "create",
"options": [
{"value": "create", "label": "创建索引"},
{"value": "destroy", "label": "销毁索引"},
]
},
{
"key": "index_type", "label": "索引类型", "type": "select", "required": True, "default": "vector",
"options": [
{"value": "vector", "label": "向量索引"},
{"value": "simple", "label": "普通索引"},
]
}
]
produces_file = False
async def process(self, input_bytes: bytes, path: str, config: Dict[str, Any]) -> Response:
action = config.get("action", "create")
index_type = config.get("index_type", "vector")
vector_db = VectorDBService()
collection_name = "vector_collection"
if action == "destroy":
vector_db.delete_vector(collection_name, path)
await LogService.info(
"processor:vector_index",
f"Destroyed {index_type} index for {path}",
details={"path": path, "action": "destroy", "index_type": index_type},
)
return Response(content=f"文件 {path}{index_type} 索引已销毁", media_type="text/plain")
if index_type == 'simple':
vector_db.ensure_collection(collection_name, vector=False)
vector_db.upsert_vector(collection_name, {'path': path})
await LogService.info(
"processor:vector_index",
f"Created simple index for {path}",
details={"path": path, "action": "create", "index_type": "simple"},
)
return Response(content=f"文件 {path} 的普通索引已创建", media_type="text/plain")
file_ext = path.split('.')[-1].lower()
description = ""
embedding = None
if file_ext in ["jpg", "jpeg", "png", "bmp"]:
base64_image = base64.b64encode(input_bytes).decode("utf-8")
description = await describe_image_base64(base64_image)
embedding = await get_text_embedding(description)
log_message = f"Indexed image {path}"
response_message = f"图片已索引,描述:{description}"
elif file_ext in ["txt", "md"]:
text = input_bytes.decode("utf-8")
embedding = await get_text_embedding(text)
description = text[:100] + "..." if len(text) > 100 else text
log_message = f"Indexed text file {path}"
response_message = f"文本文件已索引"
if embedding is None:
return Response(content="不支持的文件类型", status_code=400)
raw_dim = await ConfigCenter.get('AI_EMBED_DIM', DEFAULT_VECTOR_DIMENSION)
try:
vector_dim = int(raw_dim)
except (TypeError, ValueError):
vector_dim = DEFAULT_VECTOR_DIMENSION
if vector_dim <= 0:
vector_dim = DEFAULT_VECTOR_DIMENSION
vector_db.ensure_collection(collection_name, vector=True, dim=vector_dim)
vector_db.upsert_vector(
collection_name, {'path': path, 'embedding': embedding})
await LogService.info(
"processor:vector_index",
log_message,
details={"path": path, "description": description, "action": "create", "index_type": "vector"},
)
return Response(content=response_message, media_type="text/plain")
PROCESSOR_TYPE = "vector_index"
PROCESSOR_NAME = VectorIndexProcessor.name
SUPPORTED_EXTS = VectorIndexProcessor.supported_exts
CONFIG_SCHEMA = VectorIndexProcessor.config_schema
def PROCESSOR_FACTORY(): return VectorIndexProcessor()