From af35101774c199741ef7c11391f7ac50e4dd3da7 Mon Sep 17 00:00:00 2001 From: jxxghp Date: Wed, 29 Apr 2026 18:54:58 +0800 Subject: [PATCH] fix: default to text replies for voice input --- app/agent/__init__.py | 36 +--------------------- app/agent/prompt/__init__.py | 23 ++++---------- app/agent/tools/impl/send_voice_message.py | 7 +++-- app/chain/message.py | 10 ++---- tests/test_agent_image_support.py | 20 +++--------- 5 files changed, 19 insertions(+), 77 deletions(-) diff --git a/app/agent/__init__.py b/app/agent/__init__.py index fb74ed08..2677f632 100644 --- a/app/agent/__init__.py +++ b/app/agent/__init__.py @@ -34,7 +34,6 @@ from app.chain import ChainBase from app.core.config import settings from app.db.transferhistory_oper import TransferHistoryOper from app.helper.llm import LLMHelper -from app.helper.voice import VoiceHelper from app.log import logger from app.schemas import Notification, NotificationType from app.schemas.message import ChannelCapabilityManager, ChannelCapability @@ -170,7 +169,6 @@ class MoviePilotAgent: self.channel = channel self.source = source self.username = username - self.reply_with_voice = False self._tool_context: Dict[str, object] = {} self.output_callback: Optional[Callable[[str], None]] = None self.force_streaming = False @@ -280,8 +278,6 @@ class MoviePilotAgent: """ if self.is_background: return self.force_streaming or callable(self.output_callback) - if self.reply_with_voice: - return False if self.force_streaming or callable(self.output_callback): return True # 啰嗦模式下始终需要流式输出来捕获工具调用前的 Agent 文字 @@ -379,10 +375,7 @@ class MoviePilotAgent: """ try: # 系统提示词 - system_prompt = prompt_manager.get_agent_prompt( - channel=self.channel, - prefer_voice_reply=self.reply_with_voice, - ) + system_prompt = prompt_manager.get_agent_prompt(channel=self.channel) # LLM 模型(用于 agent 执行) llm = self._initialize_llm(streaming=streaming) @@ -460,7 +453,6 @@ class MoviePilotAgent: f"images={len(images) if images else 0}, files={len(files) if files else 0}" ) self._tool_context = { - "incoming_voice": self.reply_with_voice, "user_reply_sent": False, "reply_mode": None, } @@ -678,22 +670,6 @@ class MoviePilotAgent: """ 通过原渠道发送消息给用户 """ - voice_path = None - if ( - self.reply_with_voice - and VoiceHelper.resolve_reply_mode( - channel=self.channel, - source=self.source, - ) - == VoiceHelper.REPLY_MODE_NATIVE - and VoiceHelper.is_available("tts") - ): - # 当用户本轮发来语音且 Agent 未主动调用 send_voice_message 时, - # 这里补一层自动语音回复兜底,避免最终仍只返回纯文字。 - voice_file = await asyncio.to_thread(VoiceHelper.synthesize_speech, message) - if voice_file: - voice_path = str(voice_file) - await AgentChain().async_post_message( Notification( channel=self.channel, @@ -703,12 +679,6 @@ class MoviePilotAgent: username=self.username, title=title, text=message, - voice_path=voice_path, - voice_caption=( - message - if voice_path and settings.AI_VOICE_REPLY_WITH_TEXT - else None - ), ) ) @@ -753,7 +723,6 @@ class _MessageTask: channel: Optional[str] = None source: Optional[str] = None username: Optional[str] = None - reply_with_voice: bool = False class AgentManager: @@ -851,7 +820,6 @@ class AgentManager: channel: str = None, source: str = None, username: str = None, - reply_with_voice: bool = False, ) -> str: """ 处理用户消息:将消息放入会话队列,按顺序依次处理。 @@ -866,7 +834,6 @@ class AgentManager: channel=channel, source=source, username=username, - reply_with_voice=reply_with_voice, ) # 获取或创建会话队列 @@ -964,7 +931,6 @@ class AgentManager: agent.source = task.source if task.username: agent.username = task.username - agent.reply_with_voice = task.reply_with_voice return await agent.process(task.message, images=task.images, files=task.files) diff --git a/app/agent/prompt/__init__.py b/app/agent/prompt/__init__.py index eb92e13e..c54771bd 100644 --- a/app/agent/prompt/__init__.py +++ b/app/agent/prompt/__init__.py @@ -87,13 +87,10 @@ class PromptManager: logger.error(f"加载提示词失败: {prompt_name}, 错误: {e}") raise - def get_agent_prompt( - self, channel: str = None, prefer_voice_reply: bool = False - ) -> str: + def get_agent_prompt(self, channel: str = None) -> str: """ 获取智能体提示词 :param channel: 消息渠道(Telegram、微信、Slack等) - :param prefer_voice_reply: 是否优先使用语音回复 :return: 提示词内容 """ # 基础提示词只保留 MoviePilot 运行时和渠道能力相关约束。 @@ -132,9 +129,7 @@ class PromptManager: # MoviePilot系统信息 moviepilot_info = self._get_moviepilot_info() - voice_reply_spec = self._generate_voice_reply_instructions( - prefer_voice_reply=prefer_voice_reply - ) + voice_reply_spec = self._generate_voice_reply_instructions() # 始终替换占位符,避免后续 .format() 时因残留花括号报 KeyError base_prompt = base_prompt.format( @@ -326,17 +321,11 @@ class PromptManager: return "\n".join(instructions) @staticmethod - def _generate_voice_reply_instructions(prefer_voice_reply: bool) -> str: - if not prefer_voice_reply: - return ( - "- Voice replies: Use normal text replies by default. " - "Only call `send_voice_message` when spoken playback is clearly better than plain text." - ) + def _generate_voice_reply_instructions() -> str: return ( - "- Current message context: The user sent a voice message.\n" - "- Reply preference: Prioritize calling `send_voice_message` for the main user-facing reply.\n" - "- Fallback: If native voice is unavailable on the current channel, `send_voice_message` will fall back to text.\n" - "- Do not repeat the same full reply again after calling `send_voice_message`." + "- Voice replies: Use normal text replies by default. " + "Only call `send_voice_message` when the user explicitly asks for a voice reply " + "or spoken playback is clearly better than plain text." ) @staticmethod diff --git a/app/agent/tools/impl/send_voice_message.py b/app/agent/tools/impl/send_voice_message.py index 08f18b4a..1ceef892 100644 --- a/app/agent/tools/impl/send_voice_message.py +++ b/app/agent/tools/impl/send_voice_message.py @@ -28,9 +28,10 @@ class SendVoiceMessageInput(BaseModel): class SendVoiceMessageTool(MoviePilotTool): name: str = "send_voice_message" description: str = ( - "Send a voice reply to the current user. Prefer this when the user sent a voice message " - "or when spoken playback is more natural. On channels without voice support or when TTS " - "is unavailable, it automatically falls back to sending the same content as plain text." + "Send a voice reply to the current user. Use this only when the user explicitly asks for " + "a voice reply or when spoken playback is clearly better than plain text. On channels " + "without voice support or when TTS is unavailable, it automatically falls back to sending " + "the same content as plain text." ) args_schema: Type[BaseModel] = SendVoiceMessageInput require_admin: bool = False diff --git a/app/chain/message.py b/app/chain/message.py index a1aac4df..306d988f 100644 --- a/app/chain/message.py +++ b/app/chain/message.py @@ -109,8 +109,8 @@ class MessageChain(ChainBase): """ images = CommingMessage.MessageImage.normalize_list(images) - # 识别语音为文本 - reply_with_voice = bool(audio_refs) + # 语音输入只用于转写为文本,不默认改变回复形式。 + has_audio_input = bool(audio_refs) if audio_refs: transcript = self._transcribe_audio_refs(audio_refs, channel, source) merged_parts = [] @@ -198,13 +198,12 @@ class MessageChain(ChainBase): username=username, images=images, files=files, - reply_with_voice=reply_with_voice, ) return if ( settings.AI_AGENT_ENABLE - and (settings.AI_AGENT_GLOBAL or images or files or reply_with_voice) + and (settings.AI_AGENT_GLOBAL or images or files or has_audio_input) ): self._handle_ai_message( text=text, @@ -214,7 +213,6 @@ class MessageChain(ChainBase): username=username, images=images, files=files, - reply_with_voice=reply_with_voice, ) return @@ -866,7 +864,6 @@ class MessageChain(ChainBase): username: str, images: Optional[List[CommingMessage.MessageImage]] = None, files: Optional[List[CommingMessage.MessageAttachment]] = None, - reply_with_voice: bool = False, session_id: Optional[str] = None, ) -> None: """ @@ -978,7 +975,6 @@ class MessageChain(ChainBase): channel=channel.value if channel else None, source=source, username=username, - reply_with_voice=reply_with_voice, ), global_vars.loop, ) diff --git a/tests/test_agent_image_support.py b/tests/test_agent_image_support.py index f9b43d57..6658c974 100644 --- a/tests/test_agent_image_support.py +++ b/tests/test_agent_image_support.py @@ -2,7 +2,6 @@ import base64 import json import tempfile import unittest -from pathlib import Path from types import SimpleNamespace from unittest.mock import AsyncMock, Mock, patch from urllib.parse import quote @@ -15,7 +14,6 @@ from app.agent import MoviePilotAgent, AgentChain from app.chain.message import MessageChain from app.core.config import settings from app.helper.llm import LLMHelper -from app.helper.voice import VoiceHelper from app.modules.discord import DiscordModule from app.modules.qqbot import QQBotModule from app.modules.slack import SlackModule @@ -218,7 +216,7 @@ class AgentImageSupportTest(unittest.TestCase): handle_ai_message.assert_called_once() - def test_audio_message_routes_to_agent_with_voice_reply_flag(self): + def test_audio_message_routes_to_agent_without_forcing_voice_reply(self): chain = MessageChain() with patch.object(chain, "load_cache", return_value={}), patch.object( @@ -240,7 +238,7 @@ class AgentImageSupportTest(unittest.TestCase): handle_ai_message.assert_called_once() self.assertEqual(handle_ai_message.call_args.kwargs["text"], "帮我推荐一部电影") - self.assertTrue(handle_ai_message.call_args.kwargs["reply_with_voice"]) + self.assertNotIn("reply_with_voice", handle_ai_message.call_args.kwargs) def test_file_message_routes_to_agent_even_when_global_agent_is_disabled(self): chain = MessageChain() @@ -322,7 +320,7 @@ class AgentImageSupportTest(unittest.TestCase): ], ) - def test_agent_send_agent_message_auto_converts_to_voice_when_supported(self): + def test_agent_send_agent_message_keeps_default_text_reply(self): agent = MoviePilotAgent( session_id="session-1", user_id="user-1", @@ -330,17 +328,8 @@ class AgentImageSupportTest(unittest.TestCase): source="telegram-test", username="tester", ) - agent.reply_with_voice = True with patch.object( - VoiceHelper, - "resolve_reply_mode", - return_value=VoiceHelper.REPLY_MODE_NATIVE, - ), patch.object( - VoiceHelper, "is_available", return_value=True - ), patch.object( - VoiceHelper, "synthesize_speech", return_value=Path("/tmp/reply.opus") - ), patch.object( AgentChain, "async_post_message", new_callable=AsyncMock ) as async_post_message: import asyncio @@ -348,7 +337,8 @@ class AgentImageSupportTest(unittest.TestCase): asyncio.run(agent.send_agent_message("这是语音回复")) notification = async_post_message.await_args.args[0] - self.assertEqual(notification.voice_path, "/tmp/reply.opus") + self.assertIsNone(notification.voice_path) + self.assertIsNone(notification.voice_caption) self.assertEqual(notification.text, "这是语音回复") def test_agent_process_wraps_request_as_structured_json(self):