mirror of
https://github.com/DrizzleTime/Foxel.git
synced 2026-05-10 17:43:35 +08:00
264 lines
10 KiB
Python
264 lines
10 KiB
Python
from typing import Dict, Any, List, Tuple
|
|
from fastapi.responses import Response
|
|
import base64
|
|
import mimetypes
|
|
import os
|
|
from io import BytesIO
|
|
|
|
from services.ai import describe_image_base64, get_text_embedding, provider_service
|
|
from services.vector_db import VectorDBService, DEFAULT_VECTOR_DIMENSION
|
|
from services.logging import LogService
|
|
from PIL import Image
|
|
|
|
|
|
|
|
CHUNK_SIZE = 800
|
|
CHUNK_OVERLAP = 200
|
|
MAX_IMAGE_EDGE = 1600
|
|
JPEG_QUALITY = 85
|
|
|
|
|
|
def _chunk_text(content: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[Tuple[int, str, int, int]]:
|
|
"""按固定窗口拆分文本,返回(chunk_id, chunk_text, start, end)。"""
|
|
if chunk_size <= 0:
|
|
chunk_size = CHUNK_SIZE
|
|
if overlap >= chunk_size:
|
|
overlap = max(chunk_size // 4, 1)
|
|
|
|
chunks: List[Tuple[int, str, int, int]] = []
|
|
step = chunk_size - overlap
|
|
idx = 0
|
|
start = 0
|
|
length = len(content)
|
|
|
|
while start < length:
|
|
end = min(length, start + chunk_size)
|
|
chunk = content[start:end].strip()
|
|
if chunk:
|
|
chunks.append((idx, chunk, start, end))
|
|
idx += 1
|
|
if end >= length:
|
|
break
|
|
start += step
|
|
return chunks
|
|
|
|
|
|
def _guess_mime(path: str) -> str:
|
|
mime, _ = mimetypes.guess_type(path)
|
|
return mime or "application/octet-stream"
|
|
|
|
|
|
def _chunk_key(path: str, chunk_id: str) -> str:
|
|
return f"{path}#chunk={chunk_id}"
|
|
|
|
|
|
def _compress_image_for_embedding(input_bytes: bytes) -> Tuple[bytes, Dict[str, Any] | None]:
    """Shrink an image payload before sending it to the vision model.

    Returns the recompressed JPEG bytes plus a stats dict when that payload
    is strictly smaller than the input; otherwise returns the original bytes
    and ``None``. Any decoding/encoding failure also falls back to the
    original bytes.
    """
    # Pillow may be unavailable at runtime; skip compression entirely then.
    if Image is None:
        return input_bytes, None

    try:
        with Image.open(BytesIO(input_bytes)) as source:
            rgb = source.convert("RGB")
            width, height = rgb.size
            ratio = 1.0
            longest = max(width, height)
            if longest > MAX_IMAGE_EDGE:
                ratio = MAX_IMAGE_EDGE / float(longest)
                target = (max(int(width * ratio), 1), max(int(height * ratio), 1))
                # Pillow >= 9.1 moved LANCZOS to Image.Resampling; older
                # versions expose it directly on the Image module.
                lanczos = getattr(getattr(Image, "Resampling", Image), "LANCZOS")
                rgb = rgb.resize(target, resample=lanczos)

            sink = BytesIO()
            rgb.save(sink, format="JPEG", quality=JPEG_QUALITY, optimize=True)
            candidate = sink.getvalue()

        # Only adopt the re-encoded payload when it actually saves bytes.
        if len(candidate) < len(input_bytes):
            return candidate, {
                "original_bytes": len(input_bytes),
                "compressed_bytes": len(candidate),
                "scaled": ratio < 1.0,
                "width": rgb.width,
                "height": rgb.height,
            }
    except Exception:  # pragma: no cover - fall back on any imaging error
        return input_bytes, None

    return input_bytes, None
|
|
|
|
|
|
class VectorIndexProcessor:
    """File processor that creates or destroys search indexes for a file.

    Two index flavours are supported:

    * ``vector`` - embeds file content (an AI-generated image description,
      or per-chunk text embeddings) into the vector collection.
    * ``simple`` - stores only the file name for plain keyword lookup.

    Unsupported file types under ``vector`` fall back to a name-only record
    carrying a zero vector so the path remains searchable.
    """

    name = "向量索引"
    supported_exts: List[str] = []  # 留空表示不限扩展名
    config_schema = [
        {
            "key": "action", "label": "操作", "type": "select", "required": True, "default": "create",
            "options": [
                {"value": "create", "label": "创建索引"},
                {"value": "destroy", "label": "销毁索引"},
            ]
        },
        {
            "key": "index_type", "label": "索引类型", "type": "select", "required": True, "default": "vector",
            "options": [
                {"value": "vector", "label": "向量索引"},
                {"value": "simple", "label": "普通索引"},
            ]
        }
    ]
    produces_file = False

    async def process(self, input_bytes: bytes, path: str, config: Dict[str, Any]) -> Response:
        """Dispatch on action/index type and return a plain-text Response."""
        action = config.get("action", "create")
        index_type = config.get("index_type", "vector")
        vector_db = VectorDBService()
        collection_name = "vector_collection"

        if action == "destroy":
            return await self._destroy(vector_db, collection_name, path, index_type)

        mime_type = _guess_mime(path)

        if index_type == "simple":
            return await self._create_simple(vector_db, collection_name, path, mime_type)

        # os.path.splitext avoids misreading extension-less paths: the old
        # path.split('.')[-1] returned the whole file name (or even a
        # directory segment containing '/') when the base name had no dot.
        file_ext = os.path.splitext(path)[1].lstrip(".").lower()
        details: Dict[str, Any] = {"path": path, "action": "create", "index_type": "vector"}

        vector_dim = await self._resolve_vector_dim()
        await vector_db.ensure_collection(collection_name, vector=True, dim=vector_dim)
        await vector_db.delete_vector(collection_name, path)

        if file_ext in ("jpg", "jpeg", "png", "bmp"):
            return await self._index_image(vector_db, collection_name, path, mime_type, input_bytes, details)

        if file_ext in ("txt", "md"):
            return await self._index_text(vector_db, collection_name, path, mime_type, input_bytes, details)

        return await self._index_fallback(vector_db, collection_name, path, mime_type, file_ext, vector_dim)

    @staticmethod
    async def _resolve_vector_dim() -> int:
        """Pick the embedding dimension from the default model, with a safe fallback."""
        embedding_model = await provider_service.get_default_model("embedding")
        vector_dim = DEFAULT_VECTOR_DIMENSION
        if embedding_model and getattr(embedding_model, "embedding_dimensions", None):
            try:
                vector_dim = int(embedding_model.embedding_dimensions)
            except (TypeError, ValueError):
                vector_dim = DEFAULT_VECTOR_DIMENSION
        if vector_dim <= 0:
            vector_dim = DEFAULT_VECTOR_DIMENSION
        return vector_dim

    async def _destroy(self, vector_db: VectorDBService, collection_name: str,
                       path: str, index_type: str) -> Response:
        """Delete every record stored for *path* and log the removal."""
        await vector_db.delete_vector(collection_name, path)
        await LogService.info(
            "processor:vector_index",
            f"Destroyed {index_type} index for {path}",
            details={"path": path, "action": "destroy", "index_type": index_type},
        )
        return Response(content=f"文件 {path} 的 {index_type} 索引已销毁", media_type="text/plain")

    async def _create_simple(self, vector_db: VectorDBService, collection_name: str,
                             path: str, mime_type: str) -> Response:
        """Store a name-only record so the file is findable by keyword."""
        await vector_db.ensure_collection(collection_name, vector=False)
        await vector_db.delete_vector(collection_name, path)
        await vector_db.upsert_vector(collection_name, {
            "path": path,
            "source_path": path,
            "chunk_id": "filename",
            "mime": mime_type,
            "type": "filename",
            "name": os.path.basename(path),
        })
        await LogService.info(
            "processor:vector_index",
            f"Created simple index for {path}",
            details={"path": path, "action": "create", "index_type": "simple"},
        )
        return Response(content=f"文件 {path} 的普通索引已创建", media_type="text/plain")

    async def _index_image(self, vector_db: VectorDBService, collection_name: str,
                           path: str, mime_type: str, input_bytes: bytes,
                           details: Dict[str, Any]) -> Response:
        """Describe the image via the vision model and index the description's embedding."""
        processed_bytes, compression = _compress_image_for_embedding(input_bytes)
        base64_image = base64.b64encode(processed_bytes).decode("utf-8")
        description = await describe_image_base64(base64_image)
        embedding = await get_text_embedding(description)
        # When recompression happened the stored payload is JPEG regardless
        # of the source format.
        image_mime = "image/jpeg" if compression else mime_type
        await vector_db.upsert_vector(collection_name, {
            "path": _chunk_key(path, "image"),
            "source_path": path,
            "chunk_id": "image",
            "embedding": embedding,
            "text": description,
            "mime": image_mime,
            "type": "image",
        })
        details["description"] = description
        if compression:
            details["image_compression"] = compression
        await LogService.info(
            "processor:vector_index",
            f"Indexed image {path}",
            details=details,
        )
        return Response(content=f"图片已索引,描述:{description}", media_type="text/plain")

    async def _index_text(self, vector_db: VectorDBService, collection_name: str,
                          path: str, mime_type: str, input_bytes: bytes,
                          details: Dict[str, Any]) -> Response:
        """Decode, chunk, embed and store a text file; 400 on bad UTF-8."""
        try:
            text = input_bytes.decode("utf-8")
        except UnicodeDecodeError:
            return Response(content="文本文件解码失败", status_code=400)

        chunks = _chunk_text(text)
        if not chunks:
            # Whitespace-only file: store one record (embedding the path when
            # the text is empty) so the file stays searchable.
            await vector_db.upsert_vector(collection_name, {
                "path": _chunk_key(path, "0"),
                "source_path": path,
                "chunk_id": "0",
                "embedding": await get_text_embedding(text or path),
                "text": text,
                "mime": mime_type,
                "type": "text",
                "start_offset": 0,
                "end_offset": len(text),
            })
            details["chunks"] = 1
        else:
            chunk_count = 0
            for chunk_id, chunk_text, start, end in chunks:
                embedding = await get_text_embedding(chunk_text)
                await vector_db.upsert_vector(collection_name, {
                    "path": _chunk_key(path, str(chunk_id)),
                    "source_path": path,
                    "chunk_id": str(chunk_id),
                    "embedding": embedding,
                    "text": chunk_text,
                    "mime": mime_type,
                    "type": "text",
                    "start_offset": start,
                    "end_offset": end,
                })
                chunk_count += 1
            details["chunks"] = chunk_count
            details["sample"] = chunks[0][1][:120]
        await LogService.info(
            "processor:vector_index",
            f"Indexed text file {path}",
            details=details,
        )
        return Response(content="文本文件已索引", media_type="text/plain")

    async def _index_fallback(self, vector_db: VectorDBService, collection_name: str,
                              path: str, mime_type: str, file_ext: str,
                              vector_dim: int) -> Response:
        """Unsupported type: store a name-only record with a zero vector."""
        # NOTE(review): the caller already deleted records for this path; this
        # second delete is redundant but kept to match the original behaviour.
        await vector_db.delete_vector(collection_name, path)
        await vector_db.upsert_vector(collection_name, {
            "path": _chunk_key(path, "fallback"),
            "source_path": path,
            "chunk_id": "filename",
            "mime": mime_type,
            "type": "filename",
            "name": os.path.basename(path),
            "embedding": [0.0] * vector_dim,
        })
        await LogService.info(
            "processor:vector_index",
            f"File type fallback to simple index for {path}",
            details={"path": path, "action": "create", "index_type": "simple", "original_type": file_ext},
        )
        return Response(content="暂不支持该类型的向量索引,已创建文件名索引", media_type="text/plain")
|
|
|
|
|
|
# Plugin registration surface: the processor registry reads these names.
PROCESSOR_TYPE = "vector_index"
PROCESSOR_NAME = VectorIndexProcessor.name
SUPPORTED_EXTS = VectorIndexProcessor.supported_exts
CONFIG_SCHEMA = VectorIndexProcessor.config_schema


def PROCESSOR_FACTORY():
    """Return a fresh processor instance for the plugin registry."""
    # Reformatted from a single-line compound `def` (PEP 8 E704).
    return VectorIndexProcessor()
|