fix: default to text replies for voice input

This commit is contained in:
jxxghp
2026-04-29 18:54:58 +08:00
parent 9ed5018cc2
commit af35101774
5 changed files with 19 additions and 77 deletions

View File

@@ -34,7 +34,6 @@ from app.chain import ChainBase
from app.core.config import settings
from app.db.transferhistory_oper import TransferHistoryOper
from app.helper.llm import LLMHelper
from app.helper.voice import VoiceHelper
from app.log import logger
from app.schemas import Notification, NotificationType
from app.schemas.message import ChannelCapabilityManager, ChannelCapability
@@ -170,7 +169,6 @@ class MoviePilotAgent:
self.channel = channel
self.source = source
self.username = username
self.reply_with_voice = False
self._tool_context: Dict[str, object] = {}
self.output_callback: Optional[Callable[[str], None]] = None
self.force_streaming = False
@@ -280,8 +278,6 @@ class MoviePilotAgent:
"""
if self.is_background:
return self.force_streaming or callable(self.output_callback)
if self.reply_with_voice:
return False
if self.force_streaming or callable(self.output_callback):
return True
# 啰嗦模式下始终需要流式输出来捕获工具调用前的 Agent 文字
@@ -379,10 +375,7 @@ class MoviePilotAgent:
"""
try:
# 系统提示词
system_prompt = prompt_manager.get_agent_prompt(
channel=self.channel,
prefer_voice_reply=self.reply_with_voice,
)
system_prompt = prompt_manager.get_agent_prompt(channel=self.channel)
# LLM 模型(用于 agent 执行)
llm = self._initialize_llm(streaming=streaming)
@@ -460,7 +453,6 @@ class MoviePilotAgent:
f"images={len(images) if images else 0}, files={len(files) if files else 0}"
)
self._tool_context = {
"incoming_voice": self.reply_with_voice,
"user_reply_sent": False,
"reply_mode": None,
}
@@ -678,22 +670,6 @@ class MoviePilotAgent:
"""
通过原渠道发送消息给用户
"""
voice_path = None
if (
self.reply_with_voice
and VoiceHelper.resolve_reply_mode(
channel=self.channel,
source=self.source,
)
== VoiceHelper.REPLY_MODE_NATIVE
and VoiceHelper.is_available("tts")
):
# 当用户本轮发来语音且 Agent 未主动调用 send_voice_message 时,
# 这里补一层自动语音回复兜底,避免最终仍只返回纯文字。
voice_file = await asyncio.to_thread(VoiceHelper.synthesize_speech, message)
if voice_file:
voice_path = str(voice_file)
await AgentChain().async_post_message(
Notification(
channel=self.channel,
@@ -703,12 +679,6 @@ class MoviePilotAgent:
username=self.username,
title=title,
text=message,
voice_path=voice_path,
voice_caption=(
message
if voice_path and settings.AI_VOICE_REPLY_WITH_TEXT
else None
),
)
)
@@ -753,7 +723,6 @@ class _MessageTask:
channel: Optional[str] = None
source: Optional[str] = None
username: Optional[str] = None
reply_with_voice: bool = False
class AgentManager:
@@ -851,7 +820,6 @@ class AgentManager:
channel: str = None,
source: str = None,
username: str = None,
reply_with_voice: bool = False,
) -> str:
"""
处理用户消息:将消息放入会话队列,按顺序依次处理。
@@ -866,7 +834,6 @@ class AgentManager:
channel=channel,
source=source,
username=username,
reply_with_voice=reply_with_voice,
)
# 获取或创建会话队列
@@ -964,7 +931,6 @@ class AgentManager:
agent.source = task.source
if task.username:
agent.username = task.username
agent.reply_with_voice = task.reply_with_voice
return await agent.process(task.message, images=task.images, files=task.files)

View File

@@ -87,13 +87,10 @@ class PromptManager:
logger.error(f"加载提示词失败: {prompt_name}, 错误: {e}")
raise
def get_agent_prompt(
self, channel: str = None, prefer_voice_reply: bool = False
) -> str:
def get_agent_prompt(self, channel: str = None) -> str:
"""
获取智能体提示词
:param channel: 消息渠道Telegram、微信、Slack等
:param prefer_voice_reply: 是否优先使用语音回复
:return: 提示词内容
"""
# 基础提示词只保留 MoviePilot 运行时和渠道能力相关约束。
@@ -132,9 +129,7 @@ class PromptManager:
# MoviePilot系统信息
moviepilot_info = self._get_moviepilot_info()
voice_reply_spec = self._generate_voice_reply_instructions(
prefer_voice_reply=prefer_voice_reply
)
voice_reply_spec = self._generate_voice_reply_instructions()
# 始终替换占位符,避免后续 .format() 时因残留花括号报 KeyError
base_prompt = base_prompt.format(
@@ -326,17 +321,11 @@ class PromptManager:
return "\n".join(instructions)
@staticmethod
def _generate_voice_reply_instructions(prefer_voice_reply: bool) -> str:
if not prefer_voice_reply:
def _generate_voice_reply_instructions() -> str:
return (
"- Voice replies: Use normal text replies by default. "
"Only call `send_voice_message` when spoken playback is clearly better than plain text."
)
return (
"- Current message context: The user sent a voice message.\n"
"- Reply preference: Prioritize calling `send_voice_message` for the main user-facing reply.\n"
"- Fallback: If native voice is unavailable on the current channel, `send_voice_message` will fall back to text.\n"
"- Do not repeat the same full reply again after calling `send_voice_message`."
"Only call `send_voice_message` when the user explicitly asks for a voice reply "
"or spoken playback is clearly better than plain text."
)
@staticmethod

View File

@@ -28,9 +28,10 @@ class SendVoiceMessageInput(BaseModel):
class SendVoiceMessageTool(MoviePilotTool):
name: str = "send_voice_message"
description: str = (
"Send a voice reply to the current user. Prefer this when the user sent a voice message "
"or when spoken playback is more natural. On channels without voice support or when TTS "
"is unavailable, it automatically falls back to sending the same content as plain text."
"Send a voice reply to the current user. Use this only when the user explicitly asks for "
"a voice reply or when spoken playback is clearly better than plain text. On channels "
"without voice support or when TTS is unavailable, it automatically falls back to sending "
"the same content as plain text."
)
args_schema: Type[BaseModel] = SendVoiceMessageInput
require_admin: bool = False

View File

@@ -109,8 +109,8 @@ class MessageChain(ChainBase):
"""
images = CommingMessage.MessageImage.normalize_list(images)
# 识别语音为文本
reply_with_voice = bool(audio_refs)
# 语音输入只用于转写为文本,不默认改变回复形式。
has_audio_input = bool(audio_refs)
if audio_refs:
transcript = self._transcribe_audio_refs(audio_refs, channel, source)
merged_parts = []
@@ -198,13 +198,12 @@ class MessageChain(ChainBase):
username=username,
images=images,
files=files,
reply_with_voice=reply_with_voice,
)
return
if (
settings.AI_AGENT_ENABLE
and (settings.AI_AGENT_GLOBAL or images or files or reply_with_voice)
and (settings.AI_AGENT_GLOBAL or images or files or has_audio_input)
):
self._handle_ai_message(
text=text,
@@ -214,7 +213,6 @@ class MessageChain(ChainBase):
username=username,
images=images,
files=files,
reply_with_voice=reply_with_voice,
)
return
@@ -866,7 +864,6 @@ class MessageChain(ChainBase):
username: str,
images: Optional[List[CommingMessage.MessageImage]] = None,
files: Optional[List[CommingMessage.MessageAttachment]] = None,
reply_with_voice: bool = False,
session_id: Optional[str] = None,
) -> None:
"""
@@ -978,7 +975,6 @@ class MessageChain(ChainBase):
channel=channel.value if channel else None,
source=source,
username=username,
reply_with_voice=reply_with_voice,
),
global_vars.loop,
)

View File

@@ -2,7 +2,6 @@ import base64
import json
import tempfile
import unittest
from pathlib import Path
from types import SimpleNamespace
from unittest.mock import AsyncMock, Mock, patch
from urllib.parse import quote
@@ -15,7 +14,6 @@ from app.agent import MoviePilotAgent, AgentChain
from app.chain.message import MessageChain
from app.core.config import settings
from app.helper.llm import LLMHelper
from app.helper.voice import VoiceHelper
from app.modules.discord import DiscordModule
from app.modules.qqbot import QQBotModule
from app.modules.slack import SlackModule
@@ -218,7 +216,7 @@ class AgentImageSupportTest(unittest.TestCase):
handle_ai_message.assert_called_once()
def test_audio_message_routes_to_agent_with_voice_reply_flag(self):
def test_audio_message_routes_to_agent_without_forcing_voice_reply(self):
chain = MessageChain()
with patch.object(chain, "load_cache", return_value={}), patch.object(
@@ -240,7 +238,7 @@ class AgentImageSupportTest(unittest.TestCase):
handle_ai_message.assert_called_once()
self.assertEqual(handle_ai_message.call_args.kwargs["text"], "帮我推荐一部电影")
self.assertTrue(handle_ai_message.call_args.kwargs["reply_with_voice"])
self.assertNotIn("reply_with_voice", handle_ai_message.call_args.kwargs)
def test_file_message_routes_to_agent_even_when_global_agent_is_disabled(self):
chain = MessageChain()
@@ -322,7 +320,7 @@ class AgentImageSupportTest(unittest.TestCase):
],
)
def test_agent_send_agent_message_auto_converts_to_voice_when_supported(self):
def test_agent_send_agent_message_keeps_default_text_reply(self):
agent = MoviePilotAgent(
session_id="session-1",
user_id="user-1",
@@ -330,17 +328,8 @@ class AgentImageSupportTest(unittest.TestCase):
source="telegram-test",
username="tester",
)
agent.reply_with_voice = True
with patch.object(
VoiceHelper,
"resolve_reply_mode",
return_value=VoiceHelper.REPLY_MODE_NATIVE,
), patch.object(
VoiceHelper, "is_available", return_value=True
), patch.object(
VoiceHelper, "synthesize_speech", return_value=Path("/tmp/reply.opus")
), patch.object(
AgentChain, "async_post_message", new_callable=AsyncMock
) as async_post_message:
import asyncio
@@ -348,7 +337,8 @@ class AgentImageSupportTest(unittest.TestCase):
asyncio.run(agent.send_agent_message("这是语音回复"))
notification = async_post_message.await_args.args[0]
self.assertEqual(notification.voice_path, "/tmp/reply.opus")
self.assertIsNone(notification.voice_path)
self.assertIsNone(notification.voice_caption)
self.assertEqual(notification.text, "这是语音回复")
def test_agent_process_wraps_request_as_structured_json(self):