feat: support MiniMax audio provider

This commit is contained in:
jxxghp
2026-05-25 11:42:57 +08:00
parent 9af61c4744
commit 766d2699ea
3 changed files with 221 additions and 3 deletions

View File

@@ -9,11 +9,12 @@ import subprocess
from abc import ABC
from io import BytesIO
from pathlib import Path
from typing import Dict, Optional
from typing import Any, Dict, Optional
from uuid import uuid4
from app.core.config import settings
from app.log import logger
from app.utils.http import RequestUtils
class AgentCapabilityProvider(ABC):
@@ -411,6 +412,160 @@ class MiMoAudioProvider(OpenAIChatAudioProvider):
return model
class MiniMaxAudioProvider(OpenAIChatAudioProvider):
"""MiniMax 音频 provider语音合成使用官方 T2A HTTP 接口。"""
name = "minimax"
DISPLAY_NAME = "MiniMax"
DEFAULT_BASE_URL = "https://api.minimaxi.com/v1"
DEFAULT_STT_MODEL = "MiniMax-M2.7"
DEFAULT_TTS_MODEL = "speech-2.8-turbo"
DEFAULT_VOICE = "Chinese (Mandarin)_Lyrical_Voice"
AUDIO_INPUT_DATA_URL = True
SUPPORTED_TTS_MODELS = frozenset(
{
"speech-2.8-hd",
"speech-2.8-turbo",
"speech-2.6-hd",
"speech-2.6-turbo",
"speech-02-hd",
"speech-02-turbo",
"speech-01-hd",
"speech-01-turbo",
}
)
def _build_client(self, api_key: str, base_url: Optional[str]):
"""构建 MiniMax OpenAI 兼容客户端,兼容用户误填 Anthropic 端点的情况。"""
from openai import OpenAI
return OpenAI(
api_key=api_key,
base_url=self._normalize_api_base_url(base_url),
max_retries=3,
)
@classmethod
def _normalize_api_base_url(cls, base_url: Optional[str]) -> str:
"""归一化 MiniMax API 基础 URL确保后续可以拼接 OpenAI/T2A 路径。"""
normalized = (base_url or cls.DEFAULT_BASE_URL).strip().rstrip("/")
if normalized.endswith("/t2a_v2"):
normalized = normalized[: -len("/t2a_v2")]
for suffix in ("/anthropic/v1", "/openai/v1"):
if normalized.endswith(suffix):
return normalized[: -len(suffix)] + "/v1"
if not normalized.endswith("/v1"):
normalized = f"{normalized}/v1"
return normalized
@classmethod
def _build_t2a_url(cls, base_url: Optional[str]) -> str:
"""生成 MiniMax 同步 T2A 接口地址。"""
return f"{cls._normalize_api_base_url(base_url)}/t2a_v2"
def _normalize_stt_model(self) -> str:
"""将非 MiniMax 的默认转写模型名兜底为 MiniMax 对话模型。"""
model = (settings.AUDIO_INPUT_MODEL or "").strip()
if not model or model.lower().startswith(("gpt-", "mimo-")):
return self.DEFAULT_STT_MODEL
return model
def _normalize_tts_model(self) -> str:
"""将非 MiniMax 语音模型兜底为官方 T2A 模型。"""
model = (settings.AUDIO_OUTPUT_MODEL or "").strip().lower()
if model in self.SUPPORTED_TTS_MODELS:
return model
return self.DEFAULT_TTS_MODEL
def _normalize_voice_id(self) -> str:
"""将其他 provider 的默认音色兜底为 MiniMax 中文系统音色。"""
voice_id = (settings.AUDIO_OUTPUT_VOICE or "").strip()
if not voice_id or voice_id in {"alloy", "mimo_default"}:
return self.DEFAULT_VOICE
return voice_id
@staticmethod
def _decode_audio_payload(audio_data: str) -> bytes:
"""解析 MiniMax T2A 返回的音频数据,优先按官方 hex 格式处理。"""
normalized = "".join((audio_data or "").split())
try:
return bytes.fromhex(normalized)
except ValueError:
return base64.b64decode(audio_data)
@staticmethod
def _extract_minimax_error(data: dict[str, Any]) -> Optional[str]:
"""提取 MiniMax base_resp 错误信息,成功响应返回 None。"""
base_resp = data.get("base_resp") or {}
status_code = base_resp.get("status_code")
if status_code in (None, 0, "0"):
return None
status_msg = base_resp.get("status_msg") or "unknown error"
return f"{status_code}: {status_msg}"
def synthesize_speech(self, text: str) -> Optional[Path]:
"""调用 MiniMax T2A HTTP 接口合成语音文件。"""
if not text:
return None
try:
api_key, base_url = self._output_credentials()
if not api_key:
raise ValueError("音频输出 provider 未配置 API Key")
response = RequestUtils(
headers={
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
"Accept": "application/json",
},
proxies=settings.PROXY or {},
timeout=60,
).post_res(
url=self._build_t2a_url(base_url),
json={
"model": self._normalize_tts_model(),
"text": text,
"stream": False,
"language_boost": "auto",
"output_format": "hex",
"voice_setting": {
"voice_id": self._normalize_voice_id(),
"speed": 1,
"vol": 1,
"pitch": 0,
},
"audio_setting": {
"sample_rate": 32000,
"bitrate": 128000,
"format": "opus",
"channel": 1,
},
},
)
if not response:
raise ValueError("MiniMax T2A 请求无响应")
if response.status_code >= 400:
raise ValueError(f"MiniMax T2A HTTP {response.status_code}")
result = response.json()
minimax_error = self._extract_minimax_error(result)
if minimax_error:
raise ValueError(f"MiniMax T2A 返回错误: {minimax_error}")
audio_data = ((result.get("data") or {}).get("audio") or "").strip()
if not audio_data:
raise ValueError("MiniMax T2A 响应中没有音频数据")
voice_dir = settings.TEMP_PATH / "voice"
voice_dir.mkdir(parents=True, exist_ok=True)
output_path = voice_dir / f"{uuid4().hex}.opus"
output_path.write_bytes(self._decode_audio_payload(audio_data))
return output_path
except Exception as err:
logger.error(f"音频输出合成失败: provider={self.name}, error={err}")
return None
class AgentCapabilityManager:
"""Agent 能力统一入口。"""
@@ -420,6 +575,7 @@ class AgentCapabilityManager:
OpenAIAudioProvider.name: OpenAIAudioProvider(),
OpenAIChatAudioProvider.name: OpenAIChatAudioProvider(),
MiMoAudioProvider.name: MiMoAudioProvider(),
MiniMaxAudioProvider.name: MiniMaxAudioProvider(),
}
@classmethod

View File

@@ -600,7 +600,7 @@ class ConfigModel(BaseModel):
# AI智能体自动重试整理失败记录开关
AI_AGENT_RETRY_TRANSFER: bool = False
# 音频输入提供商openai/openai_chat_audio/mimo
# 音频输入提供商openai/openai_chat_audio/mimo/minimax
AUDIO_INPUT_PROVIDER: str = "openai"
# 音频输入 API 密钥
AUDIO_INPUT_API_KEY: Optional[str] = None
@@ -610,7 +610,7 @@ class ConfigModel(BaseModel):
AUDIO_INPUT_MODEL: str = "gpt-4o-mini-transcribe"
# 音频输入识别语言
AUDIO_INPUT_LANGUAGE: str = "zh"
# 音频输出提供商openai/openai_chat_audio/mimo
# 音频输出提供商openai/openai_chat_audio/mimo/minimax
AUDIO_OUTPUT_PROVIDER: str = "openai"
# 音频输出 API 密钥
AUDIO_OUTPUT_API_KEY: Optional[str] = None

View File

@@ -21,6 +21,7 @@ spec.loader.exec_module(capability_module)
AgentCapabilityManager = capability_module.AgentCapabilityManager
MiMoAudioProvider = capability_module.MiMoAudioProvider
MiniMaxAudioProvider = capability_module.MiniMaxAudioProvider
OpenAIChatAudioProvider = capability_module.OpenAIChatAudioProvider
OpenAIAudioProvider = capability_module.OpenAIAudioProvider
@@ -32,6 +33,7 @@ class AgentCapabilityManagerTest(unittest.TestCase):
"openai_chat_audio", AgentCapabilityManager.get_registered_audio_providers()
)
self.assertIn("mimo", AgentCapabilityManager.get_registered_audio_providers())
self.assertIn("minimax", AgentCapabilityManager.get_registered_audio_providers())
def test_get_audio_provider_uses_separate_input_and_output_settings(self):
with patch.object(settings, "AUDIO_INPUT_PROVIDER", "openai"), patch.object(
@@ -230,6 +232,66 @@ class AgentCapabilityManagerTest(unittest.TestCase):
)
self.assertIn("只输出转写结果", content[1]["text"])
def test_minimax_stt_normalizes_openai_default_model(self):
"""校验 MiniMax 音频输入会把 OpenAI 默认模型兜底为 MiniMax 模型。"""
provider = MiniMaxAudioProvider()
with patch.object(settings, "AUDIO_INPUT_MODEL", "gpt-4o-mini-transcribe"):
self.assertEqual(provider._normalize_stt_model(), "MiniMax-M2.7")
def test_minimax_tts_uses_t2a_http_payload(self):
"""校验 MiniMax 音频输出会调用官方 T2A HTTP 接口并写入音频文件。"""
provider = MiniMaxAudioProvider()
fake_response = SimpleNamespace(
status_code=200,
json=Mock(
return_value={
"data": {"audio": b"opus-bytes".hex(), "status": 2},
"base_resp": {"status_code": 0, "status_msg": "success"},
}
),
)
request_utils = Mock()
request_utils.post_res.return_value = fake_response
with TemporaryDirectory() as temp_dir, patch.object(
capability_module, "RequestUtils", return_value=request_utils
) as request_utils_cls, patch.object(
capability_module,
"settings",
SimpleNamespace(
TEMP_PATH=Path(temp_dir),
PROXY={},
AUDIO_OUTPUT_MODEL="gpt-4o-mini-tts",
AUDIO_OUTPUT_VOICE="alloy",
AUDIO_OUTPUT_API_KEY="sk-test",
AUDIO_OUTPUT_BASE_URL="https://api.minimaxi.com/anthropic/v1",
),
):
output_path = provider.synthesize_speech("你好")
output_bytes = output_path.read_bytes() if output_path else None
self.assertIsNotNone(output_path)
self.assertEqual(output_bytes, b"opus-bytes")
request_utils_cls.assert_called_once()
request = request_utils.post_res.call_args.kwargs
self.assertEqual(request["url"], "https://api.minimaxi.com/v1/t2a_v2")
self.assertEqual(request["json"]["model"], "speech-2.8-turbo")
self.assertEqual(
request["json"]["voice_setting"]["voice_id"],
"Chinese (Mandarin)_Lyrical_Voice",
)
self.assertEqual(request["json"]["audio_setting"]["format"], "opus")
def test_minimax_tts_accepts_base64_audio_payload(self):
"""校验 MiniMax 音频解析兼容部分代理返回的 base64 音频数据。"""
provider = MiniMaxAudioProvider()
self.assertEqual(
provider._decode_audio_payload(b64encode(b"opus-bytes").decode("utf-8")),
b"opus-bytes",
)
if __name__ == "__main__":
unittest.main()