feat(processors): add processor management

This commit is contained in:
shiyu
2025-09-19 18:58:54 +08:00
parent fbeb673126
commit 7da49191aa
12 changed files with 863 additions and 43 deletions

View File

@@ -1,33 +1,53 @@
import pkgutil
import inspect
from importlib import import_module
from typing import Dict, Callable
import pkgutil
from importlib import import_module, reload
from pathlib import Path
from types import ModuleType
from typing import Callable, Dict, Optional
from .base import BaseProcessor
ProcessorFactory = Callable[[], BaseProcessor]
TYPE_MAP: Dict[str, ProcessorFactory] = {}
CONFIG_SCHEMAS: Dict[str, dict] = {}
MODULE_MAP: Dict[str, ModuleType] = {}
LAST_DISCOVERY_ERRORS: list[str] = []
def discover_processors(force_reload: bool = False) -> list[str]:
"""Discover available processor modules and cache their metadata."""
import services.processors # 延迟导入以避免循环
def discover_processors():
import services.processors
processors_pkg = services.processors
TYPE_MAP.clear()
CONFIG_SCHEMAS.clear()
MODULE_MAP.clear()
global LAST_DISCOVERY_ERRORS
LAST_DISCOVERY_ERRORS = []
for modinfo in pkgutil.iter_modules(processors_pkg.__path__):
if modinfo.name.startswith("_"):
continue
full_name = f"{processors_pkg.__name__}.{modinfo.name}"
try:
module = import_module(full_name)
except Exception:
if force_reload:
module = reload(module)
except Exception as exc:
LAST_DISCOVERY_ERRORS.append(f"Failed to import {full_name}: {exc}")
continue
processor_type = getattr(module, "PROCESSOR_TYPE", None)
processor_name = getattr(module, "PROCESSOR_NAME", None)
supported_exts = getattr(module, "SUPPORTED_EXTS", None)
schema = getattr(module, "CONFIG_SCHEMA", None)
factory = getattr(module, "PROCESSOR_FACTORY", None)
if not processor_type:
continue
if factory is None:
for attr in module.__dict__.values():
if inspect.isclass(attr) and attr.__name__.endswith("Processor"):
@@ -35,31 +55,85 @@ def discover_processors():
return lambda: cls()
factory = _mk()
break
if not callable(factory):
LAST_DISCOVERY_ERRORS.append(f"Processor {full_name} missing factory")
continue
try:
sample = factory()
except Exception as exc:
LAST_DISCOVERY_ERRORS.append(f"Failed to instantiate processor {processor_type}: {exc}")
continue
TYPE_MAP[processor_type] = factory
MODULE_MAP[processor_type] = module
produces_file = getattr(module, "produces_file", None)
if produces_file is None and hasattr(factory(), "produces_file"):
produces_file = getattr(factory(), "produces_file")
if produces_file is None and hasattr(sample, "produces_file"):
produces_file = getattr(sample, "produces_file")
module_file = getattr(module, "__file__", None)
module_path: Optional[str] = None
if module_file:
try:
module_path = str(Path(module_file).resolve())
except Exception:
module_path = module_file
if isinstance(supported_exts, list):
normalized_exts = [str(ext) for ext in supported_exts]
elif supported_exts:
normalized_exts = [str(supported_exts)]
else:
normalized_exts = []
if not normalized_exts and hasattr(sample, "supported_exts"):
sample_exts = getattr(sample, "supported_exts") or []
if isinstance(sample_exts, list):
normalized_exts = [str(ext) for ext in sample_exts]
if isinstance(schema, list):
CONFIG_SCHEMAS[processor_type] = {
"type": processor_type,
"name": processor_name or processor_type,
"supported_exts": supported_exts or [],
"supported_exts": normalized_exts,
"config_schema": schema,
"produces_file": produces_file if produces_file is not None else False
"produces_file": produces_file if produces_file is not None else False,
"module_path": module_path,
}
return LAST_DISCOVERY_ERRORS
def get_config_schemas() -> Dict[str, dict]:
return CONFIG_SCHEMAS
def get_config_schema(processor_type: str):
return CONFIG_SCHEMAS.get(processor_type)
def get(processor_type: str) -> BaseProcessor:
factory = TYPE_MAP.get(processor_type)
if factory:
return factory()
return None
def get_module_path(processor_type: str) -> Optional[str]:
meta = CONFIG_SCHEMAS.get(processor_type)
if not meta:
return None
return meta.get("module_path")
def get_last_discovery_errors() -> list[str]:
return LAST_DISCOVERY_ERRORS
def reload_processors() -> list[str]:
return discover_processors(force_reload=True)
discover_processors()

View File

@@ -54,7 +54,8 @@ class TaskQueueService:
path=params["path"],
processor_type=params["processor_type"],
config=params["config"],
save_to=params["save_to"]
save_to=params.get("save_to"),
overwrite=params.get("overwrite", False),
)
task.result = result
elif task.name == "automation_task":
@@ -119,4 +120,4 @@ class TaskQueueService:
await LogService.info("task_queue", "Task worker has been stopped.")
task_queue_service = TaskQueueService()
task_queue_service = TaskQueueService()

View File

@@ -1,4 +1,4 @@
from typing import Dict, Tuple, Any, Union, AsyncIterator
from typing import Dict, Tuple, Any, Union, AsyncIterator, List
from fastapi import HTTPException
import mimetypes
from fastapi.responses import Response
@@ -59,6 +59,24 @@ async def _ensure_method(adapter: Any, method: str):
return func
async def path_is_directory(path: str) -> bool:
"""判断给定路径是否为目录。"""
adapter_instance, _, root, rel = await resolve_adapter_and_rel(path)
rel = rel.rstrip('/')
if rel == '':
return True
stat_func = getattr(adapter_instance, "stat_file", None)
if not callable(stat_func):
raise HTTPException(501, detail="Adapter does not implement stat_file")
try:
info = await stat_func(root, rel)
except FileNotFoundError:
raise HTTPException(404, detail="Path not found")
if isinstance(info, dict):
return bool(info.get("is_dir"))
return False
async def list_virtual_dir(path: str, page_num: int = 1, page_size: int = 50, sort_by: str = "name", sort_order: str = "asc") -> Dict:
norm = (path if path.startswith('/') else '/' + path).rstrip('/') or '/'
adapters = await StorageAdapter.filter(enabled=True)
@@ -476,28 +494,110 @@ async def copy_path(src: str, dst: str, overwrite: bool = False, return_debug: b
return debug_info if return_debug else None
async def process_file(path: str, processor_type: str, config: dict, save_to: str = None):
"""
使用指定处理器处理文件,并可选择保存到新路径
:param path: 源文件路径
:param processor_type: 处理器类型
:param config: 处理器配置
:param save_to: 保存路径(可选),不指定则只返回处理结果
:return: 处理后的文件内容或保存结果
"""
data = await read_file(path)
async def process_file(
path: str,
processor_type: str,
config: dict,
save_to: str | None = None,
overwrite: bool = False,
) -> Any:
"""处理指定路径(文件或目录)。目录会递归处理其下所有文件。"""
processor = get_processor(processor_type)
if not processor:
raise HTTPException(
400, detail=f"Processor {processor_type} not found")
result = await processor.process(data, path, config)
if save_to and getattr(processor, "produces_file", False):
raise HTTPException(400, detail=f"Processor {processor_type} not found")
actual_is_dir = await path_is_directory(path)
supported_exts = getattr(processor, "supported_exts", None) or []
allowed_exts = {
str(ext).lower().lstrip('.')
for ext in supported_exts
if isinstance(ext, str)
}
def matches_extension(rel_path: str) -> bool:
if not allowed_exts:
return True
if '.' not in rel_path:
return '' in allowed_exts
ext = rel_path.rsplit('.', 1)[-1].lower()
return ext in allowed_exts or f'.{ext}' in allowed_exts
def coerce_result_bytes(result: Any) -> bytes:
if isinstance(result, Response):
result_bytes = result.body
else:
result_bytes = result
await write_file(save_to, result_bytes)
return {"saved_to": save_to}
return result.body
if isinstance(result, (bytes, bytearray)):
return bytes(result)
if isinstance(result, str):
return result.encode('utf-8')
raise HTTPException(500, detail="Processor must return bytes/Response when produces_file=True")
def build_absolute_path(mount_path: str, rel_path: str) -> str:
rel_norm = rel_path.lstrip('/')
mount_norm = mount_path.rstrip('/')
if not mount_norm:
return '/' + rel_norm if rel_norm else '/'
return f"{mount_norm}/{rel_norm}" if rel_norm else mount_norm
if actual_is_dir:
if save_to:
raise HTTPException(400, detail="Directory processing does not support custom save_to path")
if not overwrite:
raise HTTPException(400, detail="Directory processing requires overwrite")
adapter_instance, adapter_model, root, rel = await resolve_adapter_and_rel(path)
rel = rel.rstrip('/')
list_dir = await _ensure_method(adapter_instance, "list_dir")
processed_count = 0
stack: List[str] = [rel]
page_size = 200
while stack:
current = stack.pop()
page = 1
while True:
entries, total = await list_dir(root, current, page, page_size, "name", "asc")
if not entries and (total or 0) == 0:
break
for entry in entries:
name = entry.get("name")
if not name:
continue
child_rel = f"{current}/{name}" if current else name
if entry.get("is_dir"):
stack.append(child_rel)
continue
if not matches_extension(child_rel):
continue
absolute_path = build_absolute_path(adapter_model.path, child_rel)
data = await read_file(absolute_path)
result = await processor.process(data, absolute_path, config)
if getattr(processor, "produces_file", False):
result_bytes = coerce_result_bytes(result)
await write_file(absolute_path, result_bytes)
processed_count += 1
if total is None or page * page_size >= total:
break
page += 1
return {"processed_files": processed_count}
# 单文件处理
data = await read_file(path)
result = await processor.process(data, path, config)
target_path = save_to
if overwrite and not target_path:
target_path = path
if target_path and getattr(processor, "produces_file", False):
result_bytes = coerce_result_bytes(result)
await write_file(target_path, result_bytes)
return {"saved_to": target_path}
return result