feat: enhance directory processing with task queuing and input handling

This commit is contained in:
shiyu
2026-01-20 11:34:09 +08:00
parent a394ffa46b
commit f22ca62902
7 changed files with 63 additions and 12 deletions

View File

@@ -114,8 +114,15 @@ class VectorIndexProcessor:
}
]
produces_file = False
requires_input_bytes = False
async def process(self, input_bytes: bytes, path: str, config: Dict[str, Any]) -> Response:
async def ensure_input_bytes() -> bytes:
if input_bytes:
return input_bytes
from domain.virtual_fs import VirtualFSService
return await VirtualFSService.read_file(path)
action = config.get("action", "create")
index_type = config.get("index_type", "vector")
vector_db = VectorDBService()
@@ -159,7 +166,8 @@ class VectorIndexProcessor:
await vector_db.delete_vector(vector_collection, path)
if file_ext in ["jpg", "jpeg", "png", "bmp"]:
processed_bytes, compression = _compress_image_for_embedding(input_bytes)
file_bytes = await ensure_input_bytes()
processed_bytes, compression = _compress_image_for_embedding(file_bytes)
base64_image = base64.b64encode(processed_bytes).decode("utf-8")
description = await describe_image_base64(base64_image)
embedding = await get_text_embedding(description)
@@ -180,7 +188,8 @@ class VectorIndexProcessor:
if file_ext in ["txt", "md"]:
try:
text = input_bytes.decode("utf-8")
file_bytes = await ensure_input_bytes()
text = file_bytes.decode("utf-8")
except UnicodeDecodeError:
return Response(content="文本文件解码失败", status_code=400)

View File

@@ -85,6 +85,44 @@ class ProcessorService:
suffix = raw_suffix
overwrite = req.overwrite
if produces_file:
if not overwrite and not suffix:
raise HTTPException(400, detail="Suffix is required when not overwriting files")
else:
overwrite = False
suffix = None
payload = {
"path": req.path,
"processor_type": req.processor_type,
"config": req.config,
"overwrite": overwrite,
"max_depth": req.max_depth,
"suffix": suffix,
}
task = await task_queue_service.add_task("process_directory_scan", payload)
return {"task_id": task.id}
@classmethod
async def scan_directory(cls, req: ProcessDirectoryRequest):
if req.max_depth is not None and req.max_depth < 0:
raise HTTPException(400, detail="max_depth must be >= 0")
is_dir = await VirtualFSService.path_is_directory(req.path)
if not is_dir:
raise HTTPException(400, detail="Path must be a directory")
schema = get_config_schema(req.processor_type)
_processor = get(req.processor_type)
if not schema or not _processor:
raise HTTPException(404, detail="Processor not found")
produces_file = bool(schema.get("produces_file"))
raw_suffix = req.suffix if req.suffix is not None else None
if raw_suffix is not None and raw_suffix.strip() == "":
raw_suffix = None
suffix = raw_suffix
overwrite = req.overwrite
if produces_file:
if not overwrite and not suffix:
raise HTTPException(400, detail="Suffix is required when not overwriting files")
@@ -133,7 +171,7 @@ class ProcessorService:
new_name = f"{name}{suffix_str}"
return str(path_obj.with_name(new_name))
scheduled_tasks: List[str] = []
scheduled_count = 0
stack: List[Tuple[str, int]] = [(rel, 0)]
page_size = 200
@@ -161,7 +199,7 @@ class ProcessorService:
save_to = None
if produces_file and not overwrite and suffix:
save_to = apply_suffix(absolute_path, suffix)
task = await task_queue_service.add_task(
await task_queue_service.add_task(
"process_file",
{
"path": absolute_path,
@@ -171,16 +209,13 @@ class ProcessorService:
"overwrite": overwrite,
},
)
scheduled_tasks.append(task.id)
scheduled_count += 1
if total is None or page * page_size >= total:
break
page += 1
return {
"task_ids": scheduled_tasks,
"scheduled": len(scheduled_tasks),
}
return {"scheduled": scheduled_count}
@classmethod
async def get_source(cls, processor_type: str):

View File

@@ -86,6 +86,12 @@ class TaskQueueService:
overwrite=params.get("overwrite", False),
)
task.result = result
elif task.name == "process_directory_scan":
from domain.processors import ProcessDirectoryRequest, ProcessorService
params = task.task_info or {}
req = ProcessDirectoryRequest(**params)
task.result = await ProcessorService.scan_directory(req)
elif task.name == "automation_task" or self._is_processor_task(task.name):
from models.database import AutomationTask

View File

@@ -43,7 +43,7 @@ export const processorsApi = {
max_depth?: number | null;
suffix?: string | null;
}) =>
request<{ task_ids: string[]; scheduled: number }>('/processors/process-directory', {
request<{ task_id: string }>('/processors/process-directory', {
method: 'POST',
json: params,
}),

View File

@@ -617,6 +617,7 @@
"Source Editor": "Source Editor",
"Module Path": "Module Path",
"Directory processing always overwrites original files": "Directory processing always overwrites original files",
"Directory execution will enqueue one task per file": "Directory execution will enqueue a scan task, then one task per file",
"No data": "No data",
"Select File": "Select File",
"Select Path": "Select Path",

View File

@@ -608,7 +608,7 @@
"Source Editor": "源码编辑",
"Module Path": "模块路径",
"Directory processing always overwrites original files": "选择目录时会强制覆盖原文件",
"Directory execution will enqueue one task per file": "目录模式会为每个文件单独创建任务",
"Directory execution will enqueue one task per file": "目录模式会先创建扫描任务,后台再为每个文件创建任务",
"Directory scope": "目录范围",
"Current level only": "仅当前层级",
"Include subdirectories": "包含子目录",

View File

@@ -276,7 +276,7 @@ const ProcessorsPage = memo(function ProcessorsPage() {
max_depth: maxDepth,
suffix: suffixValue,
});
messageApi.success(`${t('Task submitted')}: ${resp.scheduled}`);
messageApi.success(`${t('Task submitted')}: ${resp.task_id}`);
}
} else {
const payload: any = {