Foxel/services/processors/vector_index.py

from typing import Dict, Any
from fastapi.responses import Response
import base64
from services.ai import describe_image_base64, get_text_embedding
from services.vector_db import VectorDBService, DEFAULT_VECTOR_DIMENSION
from services.logging import LogService
from services.config import ConfigCenter


class VectorIndexProcessor:
    name = "向量索引"
    supported_exts = ["jpg", "jpeg", "png", "bmp", "txt", "md"]
    config_schema = [
        {
            "key": "action", "label": "操作", "type": "select", "required": True, "default": "create",
            "options": [
                {"value": "create", "label": "创建索引"},
                {"value": "destroy", "label": "销毁索引"},
            ]
        },
        {
            "key": "index_type", "label": "索引类型", "type": "select", "required": True, "default": "vector",
            "options": [
                {"value": "vector", "label": "向量索引"},
                {"value": "simple", "label": "普通索引"},
            ]
        }
    ]
    produces_file = False

    async def process(self, input_bytes: bytes, path: str, config: Dict[str, Any]) -> Response:
        action = config.get("action", "create")
        index_type = config.get("index_type", "vector")
        vector_db = VectorDBService()
        collection_name = "vector_collection"
        if action == "destroy":
            vector_db.delete_vector(collection_name, path)
            await LogService.info(
                "processor:vector_index",
                f"Destroyed {index_type} index for {path}",
                details={"path": path, "action": "destroy", "index_type": index_type},
            )
            return Response(content=f"文件 {path} 的 {index_type} 索引已销毁", media_type="text/plain")

        if index_type == 'simple':
            vector_db.ensure_collection(collection_name, vector=False)
            vector_db.upsert_vector(collection_name, {'path': path})
            await LogService.info(
                "processor:vector_index",
                f"Created simple index for {path}",
                details={"path": path, "action": "create", "index_type": "simple"},
            )
            return Response(content=f"文件 {path} 的普通索引已创建", media_type="text/plain")

        file_ext = path.split('.')[-1].lower()
        description = ""
        embedding = None

        if file_ext in ["jpg", "jpeg", "png", "bmp"]:
            base64_image = base64.b64encode(input_bytes).decode("utf-8")
            description = await describe_image_base64(base64_image)
            embedding = await get_text_embedding(description)
            log_message = f"Indexed image {path}"
            response_message = f"图片已索引，描述：{description}"
        elif file_ext in ["txt", "md"]:
            text = input_bytes.decode("utf-8")
            embedding = await get_text_embedding(text)
            description = text[:100] + "..." if len(text) > 100 else text
            log_message = f"Indexed text file {path}"
            response_message = f"文本文件已索引"

        if embedding is None:
            return Response(content="不支持的文件类型", status_code=400)

        raw_dim = await ConfigCenter.get('AI_EMBED_DIM', DEFAULT_VECTOR_DIMENSION)
        try:
            vector_dim = int(raw_dim)
        except (TypeError, ValueError):
            vector_dim = DEFAULT_VECTOR_DIMENSION
        if vector_dim <= 0:
            vector_dim = DEFAULT_VECTOR_DIMENSION

        vector_db.ensure_collection(collection_name, vector=True, dim=vector_dim)
        vector_db.upsert_vector(
            collection_name, {'path': path, 'embedding': embedding})

        await LogService.info(
            "processor:vector_index",
            log_message,
            details={"path": path, "description": description, "action": "create", "index_type": "vector"},
        )
        return Response(content=response_message, media_type="text/plain")


PROCESSOR_TYPE = "vector_index"
PROCESSOR_NAME = VectorIndexProcessor.name
SUPPORTED_EXTS = VectorIndexProcessor.supported_exts
CONFIG_SCHEMA = VectorIndexProcessor.config_schema
def PROCESSOR_FACTORY(): return VectorIndexProcessor()