import base64 import mimetypes import os from io import BytesIO from typing import Dict, Any, List, Tuple from fastapi.responses import Response from PIL import Image from ..base import BaseProcessor from domain.ai import ( DEFAULT_VECTOR_DIMENSION, FILE_COLLECTION_NAME, VECTOR_COLLECTION_NAME, VectorDBService, describe_image_base64, get_text_embedding, provider_service, ) CHUNK_SIZE = 800 CHUNK_OVERLAP = 200 MAX_IMAGE_EDGE = 1600 JPEG_QUALITY = 85 def _chunk_text(content: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[Tuple[int, str, int, int]]: """按固定窗口拆分文本,返回(chunk_id, chunk_text, start, end)。""" if chunk_size <= 0: chunk_size = CHUNK_SIZE if overlap >= chunk_size: overlap = max(chunk_size // 4, 1) chunks: List[Tuple[int, str, int, int]] = [] step = chunk_size - overlap idx = 0 start = 0 length = len(content) while start < length: end = min(length, start + chunk_size) chunk = content[start:end].strip() if chunk: chunks.append((idx, chunk, start, end)) idx += 1 if end >= length: break start += step return chunks def _guess_mime(path: str) -> str: mime, _ = mimetypes.guess_type(path) return mime or "application/octet-stream" def _chunk_key(path: str, chunk_id: str) -> str: return f"{path}#chunk={chunk_id}" def _compress_image_for_embedding(input_bytes: bytes) -> Tuple[bytes, Dict[str, Any] | None]: """压缩图片,降低发送到视觉模型的体积。""" if Image is None: return input_bytes, None try: with Image.open(BytesIO(input_bytes)) as img: img = img.convert("RGB") width, height = img.size longest_edge = max(width, height) scale = 1.0 if longest_edge > MAX_IMAGE_EDGE: scale = MAX_IMAGE_EDGE / float(longest_edge) new_size = (max(int(width * scale), 1), max(int(height * scale), 1)) resample_mode = getattr(getattr(Image, "Resampling", Image), "LANCZOS") img = img.resize(new_size, resample=resample_mode) buffer = BytesIO() img.save(buffer, format="JPEG", quality=JPEG_QUALITY, optimize=True) compressed = buffer.getvalue() if len(compressed) < len(input_bytes): return compressed, { "original_bytes": len(input_bytes), "compressed_bytes": len(compressed), "scaled": scale < 1.0, "width": img.width, "height": img.height, } except Exception: # pragma: no cover - 任意图像处理异常时回退 return input_bytes, None return input_bytes, None class VectorIndexProcessor: name = "向量索引" supported_exts: List[str] = [] # 留空表示不限扩展名 config_schema = [ { "key": "action", "label": "操作", "type": "select", "required": True, "default": "create", "options": [ {"value": "create", "label": "创建索引"}, {"value": "destroy", "label": "销毁索引"}, ] }, { "key": "index_type", "label": "索引类型", "type": "select", "required": True, "default": "vector", "options": [ {"value": "vector", "label": "向量索引"}, {"value": "simple", "label": "普通索引"}, ] } ] produces_file = False requires_input_bytes = False async def process(self, input_bytes: bytes, path: str, config: Dict[str, Any]) -> Response: async def ensure_input_bytes() -> bytes: if input_bytes: return input_bytes from domain.virtual_fs import VirtualFSService return await VirtualFSService.read_file(path) action = config.get("action", "create") index_type = config.get("index_type", "vector") vector_db = VectorDBService() vector_collection = VECTOR_COLLECTION_NAME file_collection = FILE_COLLECTION_NAME if action == "destroy": target_collection = file_collection if index_type == "simple" else vector_collection await vector_db.delete_vector(target_collection, path) return Response(content=f"文件 {path} 的 {index_type} 索引已销毁", media_type="text/plain") mime_type = _guess_mime(path) if index_type == "simple": await vector_db.ensure_collection(file_collection, vector=False) await vector_db.delete_vector(file_collection, path) await vector_db.upsert_vector(file_collection, { "path": path, "source_path": path, "chunk_id": "filename", "mime": mime_type, "type": "filename", "name": os.path.basename(path), }) return Response(content=f"文件 {path} 的普通索引已创建", media_type="text/plain") file_ext = path.split('.')[-1].lower() details: Dict[str, Any] = {"path": path, "action": "create", "index_type": "vector"} embedding_model = await provider_service.get_default_model("embedding") vector_dim = DEFAULT_VECTOR_DIMENSION if embedding_model and getattr(embedding_model, "embedding_dimensions", None): try: vector_dim = int(embedding_model.embedding_dimensions) except (TypeError, ValueError): vector_dim = DEFAULT_VECTOR_DIMENSION if vector_dim <= 0: vector_dim = DEFAULT_VECTOR_DIMENSION await vector_db.ensure_collection(vector_collection, vector=True, dim=vector_dim) await vector_db.delete_vector(vector_collection, path) if file_ext in ["jpg", "jpeg", "png", "bmp"]: file_bytes = await ensure_input_bytes() processed_bytes, compression = _compress_image_for_embedding(file_bytes) base64_image = base64.b64encode(processed_bytes).decode("utf-8") description = await describe_image_base64(base64_image) embedding = await get_text_embedding(description) image_mime = "image/jpeg" if compression else mime_type await vector_db.upsert_vector(vector_collection, { "path": _chunk_key(path, "image"), "source_path": path, "chunk_id": "image", "embedding": embedding, "text": description, "mime": image_mime, "type": "image", }) details["description"] = description if compression: details["image_compression"] = compression return Response(content=f"图片已索引,描述:{description}", media_type="text/plain") if file_ext in ["txt", "md"]: try: file_bytes = await ensure_input_bytes() text = file_bytes.decode("utf-8") except UnicodeDecodeError: return Response(content="文本文件解码失败", status_code=400) chunks = _chunk_text(text) if not chunks: await vector_db.upsert_vector(vector_collection, { "path": _chunk_key(path, "0"), "source_path": path, "chunk_id": "0", "embedding": await get_text_embedding(text or path), "text": text, "mime": mime_type, "type": "text", "start_offset": 0, "end_offset": len(text), }) details["chunks"] = 1 return Response(content="文本文件已索引", media_type="text/plain") chunk_count = 0 for chunk_id, chunk_text, start, end in chunks: embedding = await get_text_embedding(chunk_text) await vector_db.upsert_vector(vector_collection, { "path": _chunk_key(path, str(chunk_id)), "source_path": path, "chunk_id": str(chunk_id), "embedding": embedding, "text": chunk_text, "mime": mime_type, "type": "text", "start_offset": start, "end_offset": end, }) chunk_count += 1 details["chunks"] = chunk_count sample = chunks[0][1] details["sample"] = sample[:120] return Response(content="文本文件已索引", media_type="text/plain") # 其他类型暂未支持向量索引,回退为文件名索引 await vector_db.ensure_collection(file_collection, vector=False) await vector_db.delete_vector(file_collection, path) await vector_db.upsert_vector(file_collection, { "path": path, "source_path": path, "chunk_id": "filename", "mime": mime_type, "type": "filename", "name": os.path.basename(path), }) return Response(content="暂不支持该类型的向量索引,已创建文件名索引", media_type="text/plain") PROCESSOR_TYPE = "vector_index" PROCESSOR_NAME = VectorIndexProcessor.name SUPPORTED_EXTS = VectorIndexProcessor.supported_exts CONFIG_SCHEMA = VectorIndexProcessor.config_schema def PROCESSOR_FACTORY(): return VectorIndexProcessor()