feat: add support for audio and video input via base64

This commit adds configuration and conversion logic to handle audio and video inputs in Base64 format, similar to the existing image support. It includes: (1) supported formats and size limits in the config; (2) media validation and conversion in the message converter; (3) payload building updated to handle media parts; (4) improved error handling and logging for media processing.
2026-07-01 04:31:39 +08:00 · 2025-04-26 03:07:54 +00:00
parent cb40848c04
commit 775930edce
3 changed files with 253 additions and 6 deletions
--- a/app/config/config.py
+++ b/app/config/config.py
@@ -69,6 +69,12 @@ class Settings(BaseSettings):
    # 日志配置
    LOG_LEVEL: str = "INFO" # 默认日志级别

+    # Audio/Video Settings
+    SUPPORTED_AUDIO_FORMATS: List[str] = ["wav", "mp3", "flac", "ogg"] # Add formats Gemini supports
+    SUPPORTED_VIDEO_FORMATS: List[str] = ["mp4", "mov", "avi", "webm"] # Add formats Gemini supports
+    MAX_AUDIO_SIZE_BYTES: int = 50 * 1024 * 1024  # Example: 50MB limit for Base64 payload
+    MAX_VIDEO_SIZE_BYTES: int = 200 * 1024 * 1024 # Example: 200MB limit
+
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # 设置默认AUTH_TOKEN（如果未提供）
@@ -78,6 +84,24 @@ class Settings(BaseSettings):
 # 创建全局配置实例
 settings = Settings()

+# Format-name -> MIME-type lookup tables used by the message converter when it
+# builds `inlineData` parts. The keys mirror the default entries of
+# Settings.SUPPORTED_AUDIO_FORMATS / Settings.SUPPORTED_VIDEO_FORMATS; a format
+# that is allowed by the settings but missing here is reported by the converter
+# as a missing MIME mapping, so keep the lists and these tables in sync.
+# NOTE(review): confirm each MIME string against the types the Gemini API accepts.
+AUDIO_FORMAT_TO_MIMETYPE = {
+    "wav": "audio/wav",
+    "mp3": "audio/mpeg",
+    "flac": "audio/flac",
+    "ogg": "audio/ogg",
+    # Add further audio formats here together with the settings list.
+}
+
+VIDEO_FORMAT_TO_MIMETYPE = {
+    "mp4": "video/mp4",
+    "mov": "video/quicktime",
+    "avi": "video/x-msvideo",
+    "webm": "video/webm",
+    # Add further video formats here together with the settings list.
+}
+
+
 def _parse_db_value(key: str, db_value: str, target_type: Type) -> Any:
    """尝试将数据库字符串值解析为目标 Python 类型"""
    from app.log.logger import get_config_logger # 函数内导入
--- a/app/handler/message_converter.py
+++ b/app/handler/message_converter.py
@@ -5,9 +5,14 @@ import re
 from typing import Any, Dict, List, Optional
 import requests
 import base64
+import logging # Add logging

+# Import settings and mappings
+from app.config.config import settings, AUDIO_FORMAT_TO_MIMETYPE, VIDEO_FORMAT_TO_MIMETYPE
 from app.core.constants import DATA_URL_PATTERN, IMAGE_URL_PATTERN, SUPPORTED_ROLES

+logger = logging.getLogger(__name__) # Add a logger
+

 class MessageConverter(ABC):
    """消息转换器基类"""
@@ -170,4 +175,215 @@ class OpenAIMessageConverter(MessageConverter):
                "parts": system_instruction_parts,
            }
        )
-        return converted_messages, system_instruction
+        return converted_messages, system_instruction
+
+    def _validate_media_data(self, format: str, data: str, supported_formats: List[str], max_size: int) -> tuple[Optional[str], Optional[str]]:
+        """Validates format and size of Base64 media data."""
+        if format.lower() not in supported_formats:
+            logger.error(f"Unsupported media format: {format}. Supported: {supported_formats}")
+            raise ValueError(f"Unsupported media format: {format}")
+
+        try:
+            # Decode Base64 to check size
+            # Be careful with memory usage for very large files
+            # Consider streaming decoding or checking length heuristic first if memory is a concern
+            decoded_data = base64.b64decode(data, validate=True) # Use validate=True for stricter check
+            if len(decoded_data) > max_size:
+                logger.error(f"Media data size ({len(decoded_data)} bytes) exceeds limit ({max_size} bytes).")
+                raise ValueError(f"Media data size exceeds limit of {max_size // 1024 // 1024}MB")
+            # No need to return decoded_data, just the original base64 if valid
+            return data
+        except base64.binascii.Error as e:
+            logger.error(f"Invalid Base64 data provided: {e}")
+            raise ValueError("Invalid Base64 data")
+        except Exception as e:
+            logger.error(f"Error validating media data: {e}")
+            raise # Re-raise other potential errors
+
+    def convert(self, messages: List[Dict[str, Any]]) -> tuple[List[Dict[str, Any]], Optional[Dict[str, Any]]]:
+        converted_messages = []
+        system_instruction_parts = []
+
+        for idx, msg in enumerate(messages):
+            role = msg.get("role", "")
+            parts = []
+
+            # --- Start Modification ---
+            if "content" in msg and isinstance(msg["content"], list):
+                for content_item in msg["content"]:
+                    if not isinstance(content_item, dict):
+                        # Skip non-dict items if any unexpected format appears
+                        logger.warning(f"Skipping unexpected content item format: {type(content_item)}")
+                        continue
+
+                    content_type = content_item.get("type")
+
+                    if content_type == "text" and content_item.get("text"):
+                        parts.append({"text": content_item["text"]})
+                    elif content_type == "image_url" and content_item.get("image_url", {}).get("url"):
+                        try:
+                           parts.append(_convert_image(content_item["image_url"]["url"]))
+                        except Exception as e:
+                           logger.error(f"Failed to convert image URL {content_item['image_url']['url']}: {e}")
+                           # Decide how to handle: skip part, add error text, etc.
+                           parts.append({"text": f"[Error processing image: {content_item['image_url']['url']}]"})
+                    # --- Add handling for input_audio ---
+                    elif content_type == "input_audio" and content_item.get("input_audio"):
+                        audio_info = content_item["input_audio"]
+                        audio_data = audio_info.get("data")
+                        audio_format = audio_info.get("format", "").lower()
+
+                        if not audio_data or not audio_format:
+                            logger.warning("Skipping audio part due to missing data or format.")
+                            continue
+
+                        try:
+                            # Validate size and format
+                            validated_data = self._validate_media_data(
+                                audio_format,
+                                audio_data,
+                                settings.SUPPORTED_AUDIO_FORMATS,
+                                settings.MAX_AUDIO_SIZE_BYTES
+                            )
+
+                            # Get MIME type
+                            mime_type = AUDIO_FORMAT_TO_MIMETYPE.get(audio_format)
+                            if not mime_type:
+                                # Should not happen if format validation passed, but double-check
+                                logger.error(f"Could not find MIME type for supported format: {audio_format}")
+                                raise ValueError(f"Internal error: MIME type mapping missing for {audio_format}")
+
+                            parts.append({
+                                "inlineData": {
+                                    "mimeType": mime_type,
+                                    "data": validated_data  # Use the validated Base64 data
+                                }
+                            })
+                            logger.debug(f"Successfully added audio part (format: {audio_format})")
+
+                        except ValueError as e:
+                            logger.error(f"Skipping audio part due to validation error: {e}")
+                            # Add placeholder text indicating the error
+                            parts.append({"text": f"[Error processing audio: {e}]"})
+                        except Exception as e:
+                            logger.exception(f"Unexpected error processing audio part.")
+                            parts.append({"text": "[Unexpected error processing audio]"})
+
+                    # --- Add handling for input_video (similar pattern) ---
+                    elif content_type == "input_video" and content_item.get("input_video"):
+                        video_info = content_item["input_video"]
+                        video_data = video_info.get("data")
+                        video_format = video_info.get("format", "").lower()
+
+                        if not video_data or not video_format:
+                            logger.warning("Skipping video part due to missing data or format.")
+                            continue
+
+                        try:
+                            validated_data = self._validate_media_data(
+                                video_format,
+                                video_data,
+                                settings.SUPPORTED_VIDEO_FORMATS,
+                                settings.MAX_VIDEO_SIZE_BYTES
+                            )
+                            mime_type = VIDEO_FORMAT_TO_MIMETYPE.get(video_format)
+                            if not mime_type:
+                                raise ValueError(f"Internal error: MIME type mapping missing for {video_format}")
+
+                            parts.append({
+                                "inlineData": {
+                                    "mimeType": mime_type,
+                                    "data": validated_data
+                                }
+                            })
+                            logger.debug(f"Successfully added video part (format: {video_format})")
+
+                        except ValueError as e:
+                             logger.error(f"Skipping video part due to validation error: {e}")
+                             parts.append({"text": f"[Error processing video: {e}]"})
+                        except Exception as e:
+                             logger.exception(f"Unexpected error processing video part.")
+                             parts.append({"text": "[Unexpected error processing video]"})
+                    # --- End new media handling ---
+                    else:
+                        # Log unrecognized but present types
+                        if content_type:
+                            logger.warning(f"Unsupported content type or missing data in structured content: {content_type}")
+                        # Silently ignore items without a 'type' or if structure is unexpected
+
+            # --- End Modification for list content ---
+            # Keep processing for simple string content (might contain image markdown)
+            elif "content" in msg and isinstance(msg["content"], str) and msg["content"]:
+                # This path handles simple text or markdown images.
+                # If you expect audio/video ONLY via the structured list format,
+                # this part remains as is. If you might have URLs in plain text,
+                # you'd need more complex regex parsing here.
+                parts.extend(_process_text_with_image(msg["content"]))
+            elif "tool_calls" in msg and isinstance(msg["tool_calls"], list):
+                 # Keep existing tool call processing
+                 for tool_call in msg["tool_calls"]:
+                     function_call = tool_call.get("function",{})
+                     # Sanitize arguments loading
+                     arguments_str = function_call.get("arguments","{}")
+                     try:
+                         function_call["args"] = json.loads(arguments_str)
+                     except json.JSONDecodeError:
+                         logger.warning(f"Failed to decode tool call arguments: {arguments_str}")
+                         function_call["args"] = {} # Assign empty dict on error
+                     if "arguments" in function_call: # Check before deleting
+                         # Ensure 'arguments' key exists before attempting deletion
+                         # In some OpenAI versions, it might already be absent
+                         pass # No explicit delete needed if structure is {'function': {'name': '...', 'args': ...}}
+                     else:
+                         # If 'arguments' was the source key, delete it after parsing
+                         if 'arguments' in function_call: # Check again just in case
+                            del function_call["arguments"]
+
+                     parts.append({"functionCall": function_call})
+
+            # Role assignment and message appending logic (keep as is)
+            if role not in SUPPORTED_ROLES:
+                if role == "tool":
+                    role = "user" # Gemini uses 'user' role for function/tool responses
+                # ... (rest of role handling logic) ...
+                else:
+                   # Fallback role logic
+                   if idx == len(messages) - 1:
+                       role = "user"
+                   else:
+                       # Previous logic assigned 'model'. Check if this is always correct.
+                       # Tool/Function responses are usually 'model' in Gemini after the 'user' (tool result) turn.
+                       role = "model" # Stick to 'model' as the default fallback for non-user/system/tool
+
+            if parts:
+                 if role == "system":
+                     # Check if system instructions can contain media - unlikely based on Gemini docs
+                     # Filter out non-text parts for safety?
+                     text_only_parts = [p for p in parts if "text" in p]
+                     if len(text_only_parts) != len(parts):
+                         logger.warning("Non-text parts found in system message; discarding them.")
+                     if text_only_parts:
+                        system_instruction_parts.extend(text_only_parts)
+
+                 else:
+                     # Ensure role is mapped correctly ('model' for assistant turns, 'user' for tool result turns)
+                     gemini_role = "model" if role == "assistant" else role # 'tool' role already mapped to 'user'
+                     converted_messages.append({"role": gemini_role, "parts": parts})
+
+        system_instruction = (
+            None
+            if not system_instruction_parts
+            else {
+                "role": "system", # Gemini supports a dedicated system instruction
+                "parts": system_instruction_parts,
+            }
+        )
+        # Gemini expects 'model' for assistant turns, and 'user' for function/tool responses.
+        # The role mapping logic above should handle this correctly now.
+
+        # Debug: Log the final converted structure before returning
+        # logger.debug(f"Converted messages for Gemini: {json.dumps(converted_messages, indent=2)}")
+        # if system_instruction:
+        #     logger.debug(f"System instruction for Gemini: {json.dumps(system_instruction, indent=2)}")
+
+        return converted_messages, system_instruction
--- a/app/service/chat/openai_chat_service.py
+++ b/app/service/chat/openai_chat_service.py
@@ -21,13 +21,16 @@ from app.database.services import add_error_log, add_request_log # Import add_re
 logger = get_openai_logger()


-def _has_image_parts(contents: List[Dict[str, Any]]) -> bool:
-    """判断消息是否包含图片部分"""
+def _has_media_parts(contents: List[Dict[str, Any]]) -> bool:
+    """判断消息是否包含图片、音频或视频部分 (inlineData)"""
    for content in contents:
-        if "parts" in content:
+        if content and "parts" in content and isinstance(content["parts"], list):
            for part in content["parts"]:
-                if "image_url" in part or "inline_data" in part:
+                # Check if the part is a dictionary and contains 'inlineData'
+                if isinstance(part, dict) and "inlineData" in part:
+                    # Optionally, could check part["inlineData"].get("mimeType") prefix
                    return True
+                # Add checks here if Gemini uses other keys for media (e.g., 'fileData')
    return False


@@ -46,9 +49,13 @@ def _build_tools(
            or model.endswith("-image")
            or model.endswith("-image-generation")
        )
-        and not _has_image_parts(messages)
+        and not _has_media_parts(messages) # Use the updated check
    ):
        tool["codeExecution"] = {}
+        logger.debug("Code execution tool enabled.")
+    elif _has_media_parts(messages):
+         logger.debug("Code execution tool disabled due to media parts presence.")
+
    if model.endswith("-search"):
        tool["googleSearch"] = {}