From af35101774c199741ef7c11391f7ac50e4dd3da7 Mon Sep 17 00:00:00 2001
From: jxxghp <jxxghp@gmail.com>
Date: Wed, 29 Apr 2026 18:54:58 +0800
Subject: [PATCH] fix: default to text replies for voice input

---
 app/agent/__init__.py                      | 36 +---------------------
 app/agent/prompt/__init__.py               | 23 ++++----------
 app/agent/tools/impl/send_voice_message.py |  7 +++--
 app/chain/message.py                       | 10 ++----
 tests/test_agent_image_support.py          | 20 +++---------
 5 files changed, 19 insertions(+), 77 deletions(-)

diff --git a/app/agent/__init__.py b/app/agent/__init__.py
index fb74ed08..2677f632 100644
--- a/app/agent/__init__.py
+++ b/app/agent/__init__.py
@@ -34,7 +34,6 @@ from app.chain import ChainBase
 from app.core.config import settings
 from app.db.transferhistory_oper import TransferHistoryOper
 from app.helper.llm import LLMHelper
-from app.helper.voice import VoiceHelper
 from app.log import logger
 from app.schemas import Notification, NotificationType
 from app.schemas.message import ChannelCapabilityManager, ChannelCapability
@@ -170,7 +169,6 @@ class MoviePilotAgent:
         self.channel = channel
         self.source = source
         self.username = username
-        self.reply_with_voice = False
         self._tool_context: Dict[str, object] = {}
         self.output_callback: Optional[Callable[[str], None]] = None
         self.force_streaming = False
@@ -280,8 +278,6 @@ class MoviePilotAgent:
         """
         if self.is_background:
             return self.force_streaming or callable(self.output_callback)
-        if self.reply_with_voice:
-            return False
         if self.force_streaming or callable(self.output_callback):
             return True
         # 啰嗦模式下始终需要流式输出来捕获工具调用前的 Agent 文字
@@ -379,10 +375,7 @@ class MoviePilotAgent:
         """
         try:
             # 系统提示词
-            system_prompt = prompt_manager.get_agent_prompt(
-                channel=self.channel,
-                prefer_voice_reply=self.reply_with_voice,
-            )
+            system_prompt = prompt_manager.get_agent_prompt(channel=self.channel)
 
             # LLM 模型（用于 agent 执行）
             llm = self._initialize_llm(streaming=streaming)
@@ -460,7 +453,6 @@ class MoviePilotAgent:
                 f"images={len(images) if images else 0}, files={len(files) if files else 0}"
             )
             self._tool_context = {
-                "incoming_voice": self.reply_with_voice,
                 "user_reply_sent": False,
                 "reply_mode": None,
             }
@@ -678,22 +670,6 @@ class MoviePilotAgent:
         """
         通过原渠道发送消息给用户
         """
-        voice_path = None
-        if (
-            self.reply_with_voice
-            and VoiceHelper.resolve_reply_mode(
-                channel=self.channel,
-                source=self.source,
-            )
-            == VoiceHelper.REPLY_MODE_NATIVE
-            and VoiceHelper.is_available("tts")
-        ):
-            # 当用户本轮发来语音且 Agent 未主动调用 send_voice_message 时，
-            # 这里补一层自动语音回复兜底，避免最终仍只返回纯文字。
-            voice_file = await asyncio.to_thread(VoiceHelper.synthesize_speech, message)
-            if voice_file:
-                voice_path = str(voice_file)
-
         await AgentChain().async_post_message(
             Notification(
                 channel=self.channel,
@@ -703,12 +679,6 @@ class MoviePilotAgent:
                 username=self.username,
                 title=title,
                 text=message,
-                voice_path=voice_path,
-                voice_caption=(
-                    message
-                    if voice_path and settings.AI_VOICE_REPLY_WITH_TEXT
-                    else None
-                ),
             )
         )
 
@@ -753,7 +723,6 @@ class _MessageTask:
     channel: Optional[str] = None
     source: Optional[str] = None
     username: Optional[str] = None
-    reply_with_voice: bool = False
 
 
 class AgentManager:
@@ -851,7 +820,6 @@ class AgentManager:
         channel: str = None,
         source: str = None,
         username: str = None,
-        reply_with_voice: bool = False,
     ) -> str:
         """
         处理用户消息：将消息放入会话队列，按顺序依次处理。
@@ -866,7 +834,6 @@ class AgentManager:
             channel=channel,
             source=source,
             username=username,
-            reply_with_voice=reply_with_voice,
         )
 
         # 获取或创建会话队列
@@ -964,7 +931,6 @@ class AgentManager:
                 agent.source = task.source
             if task.username:
                 agent.username = task.username
-        agent.reply_with_voice = task.reply_with_voice
 
         return await agent.process(task.message, images=task.images, files=task.files)
 
diff --git a/app/agent/prompt/__init__.py b/app/agent/prompt/__init__.py
index eb92e13e..c54771bd 100644
--- a/app/agent/prompt/__init__.py
+++ b/app/agent/prompt/__init__.py
@@ -87,13 +87,10 @@ class PromptManager:
             logger.error(f"加载提示词失败: {prompt_name}, 错误: {e}")
             raise
 
-    def get_agent_prompt(
-        self, channel: str = None, prefer_voice_reply: bool = False
-    ) -> str:
+    def get_agent_prompt(self, channel: str = None) -> str:
         """
         获取智能体提示词
         :param channel: 消息渠道（Telegram、微信、Slack等）
-        :param prefer_voice_reply: 是否优先使用语音回复
         :return: 提示词内容
         """
         # 基础提示词只保留 MoviePilot 运行时和渠道能力相关约束。
@@ -132,9 +129,7 @@ class PromptManager:
 
         # MoviePilot系统信息
         moviepilot_info = self._get_moviepilot_info()
-        voice_reply_spec = self._generate_voice_reply_instructions(
-            prefer_voice_reply=prefer_voice_reply
-        )
+        voice_reply_spec = self._generate_voice_reply_instructions()
 
         # 始终替换占位符，避免后续 .format() 时因残留花括号报 KeyError
         base_prompt = base_prompt.format(
@@ -326,17 +321,11 @@ class PromptManager:
         return "\n".join(instructions)
 
     @staticmethod
-    def _generate_voice_reply_instructions(prefer_voice_reply: bool) -> str:
-        if not prefer_voice_reply:
-            return (
-                "- Voice replies: Use normal text replies by default. "
-                "Only call `send_voice_message` when spoken playback is clearly better than plain text."
-            )
+    def _generate_voice_reply_instructions() -> str:
         return (
-            "- Current message context: The user sent a voice message.\n"
-            "- Reply preference: Prioritize calling `send_voice_message` for the main user-facing reply.\n"
-            "- Fallback: If native voice is unavailable on the current channel, `send_voice_message` will fall back to text.\n"
-            "- Do not repeat the same full reply again after calling `send_voice_message`."
+            "- Voice replies: Use normal text replies by default. "
+            "Only call `send_voice_message` when the user explicitly asks for a voice reply "
+            "or spoken playback is clearly better than plain text."
         )
 
     @staticmethod
diff --git a/app/agent/tools/impl/send_voice_message.py b/app/agent/tools/impl/send_voice_message.py
index 08f18b4a..1ceef892 100644
--- a/app/agent/tools/impl/send_voice_message.py
+++ b/app/agent/tools/impl/send_voice_message.py
@@ -28,9 +28,10 @@ class SendVoiceMessageInput(BaseModel):
 class SendVoiceMessageTool(MoviePilotTool):
     name: str = "send_voice_message"
     description: str = (
-        "Send a voice reply to the current user. Prefer this when the user sent a voice message "
-        "or when spoken playback is more natural. On channels without voice support or when TTS "
-        "is unavailable, it automatically falls back to sending the same content as plain text."
+        "Send a voice reply to the current user. Use this only when the user explicitly asks for "
+        "a voice reply or when spoken playback is clearly better than plain text. On channels "
+        "without voice support or when TTS is unavailable, it automatically falls back to sending "
+        "the same content as plain text."
     )
     args_schema: Type[BaseModel] = SendVoiceMessageInput
     require_admin: bool = False
diff --git a/app/chain/message.py b/app/chain/message.py
index a1aac4df..306d988f 100644
--- a/app/chain/message.py
+++ b/app/chain/message.py
@@ -109,8 +109,8 @@ class MessageChain(ChainBase):
         """
         images = CommingMessage.MessageImage.normalize_list(images)
 
-        # 识别语音为文本
-        reply_with_voice = bool(audio_refs)
+        # 语音输入只用于转写为文本，不默认改变回复形式。
+        has_audio_input = bool(audio_refs)
         if audio_refs:
             transcript = self._transcribe_audio_refs(audio_refs, channel, source)
             merged_parts = []
@@ -198,13 +198,12 @@ class MessageChain(ChainBase):
                 username=username,
                 images=images,
                 files=files,
-                reply_with_voice=reply_with_voice,
             )
             return
 
         if (
             settings.AI_AGENT_ENABLE
-            and (settings.AI_AGENT_GLOBAL or images or files or reply_with_voice)
+            and (settings.AI_AGENT_GLOBAL or images or files or has_audio_input)
         ):
             self._handle_ai_message(
                 text=text,
@@ -214,7 +213,6 @@ class MessageChain(ChainBase):
                 username=username,
                 images=images,
                 files=files,
-                reply_with_voice=reply_with_voice,
             )
             return
 
@@ -866,7 +864,6 @@ class MessageChain(ChainBase):
         username: str,
         images: Optional[List[CommingMessage.MessageImage]] = None,
         files: Optional[List[CommingMessage.MessageAttachment]] = None,
-        reply_with_voice: bool = False,
         session_id: Optional[str] = None,
     ) -> None:
         """
@@ -978,7 +975,6 @@ class MessageChain(ChainBase):
                     channel=channel.value if channel else None,
                     source=source,
                     username=username,
-                    reply_with_voice=reply_with_voice,
                 ),
                 global_vars.loop,
             )
diff --git a/tests/test_agent_image_support.py b/tests/test_agent_image_support.py
index f9b43d57..6658c974 100644
--- a/tests/test_agent_image_support.py
+++ b/tests/test_agent_image_support.py
@@ -2,7 +2,6 @@ import base64
 import json
 import tempfile
 import unittest
-from pathlib import Path
 from types import SimpleNamespace
 from unittest.mock import AsyncMock, Mock, patch
 from urllib.parse import quote
@@ -15,7 +14,6 @@ from app.agent import MoviePilotAgent, AgentChain
 from app.chain.message import MessageChain
 from app.core.config import settings
 from app.helper.llm import LLMHelper
-from app.helper.voice import VoiceHelper
 from app.modules.discord import DiscordModule
 from app.modules.qqbot import QQBotModule
 from app.modules.slack import SlackModule
@@ -218,7 +216,7 @@ class AgentImageSupportTest(unittest.TestCase):
 
         handle_ai_message.assert_called_once()
 
-    def test_audio_message_routes_to_agent_with_voice_reply_flag(self):
+    def test_audio_message_routes_to_agent_without_forcing_voice_reply(self):
         chain = MessageChain()
 
         with patch.object(chain, "load_cache", return_value={}), patch.object(
@@ -240,7 +238,7 @@ class AgentImageSupportTest(unittest.TestCase):
 
         handle_ai_message.assert_called_once()
         self.assertEqual(handle_ai_message.call_args.kwargs["text"], "帮我推荐一部电影")
-        self.assertTrue(handle_ai_message.call_args.kwargs["reply_with_voice"])
+        self.assertNotIn("reply_with_voice", handle_ai_message.call_args.kwargs)
 
     def test_file_message_routes_to_agent_even_when_global_agent_is_disabled(self):
         chain = MessageChain()
@@ -322,7 +320,7 @@ class AgentImageSupportTest(unittest.TestCase):
             ],
         )
 
-    def test_agent_send_agent_message_auto_converts_to_voice_when_supported(self):
+    def test_agent_send_agent_message_keeps_default_text_reply(self):
         agent = MoviePilotAgent(
             session_id="session-1",
             user_id="user-1",
@@ -330,17 +328,8 @@ class AgentImageSupportTest(unittest.TestCase):
             source="telegram-test",
             username="tester",
         )
-        agent.reply_with_voice = True
 
         with patch.object(
-            VoiceHelper,
-            "resolve_reply_mode",
-            return_value=VoiceHelper.REPLY_MODE_NATIVE,
-        ), patch.object(
-            VoiceHelper, "is_available", return_value=True
-        ), patch.object(
-            VoiceHelper, "synthesize_speech", return_value=Path("/tmp/reply.opus")
-        ), patch.object(
             AgentChain, "async_post_message", new_callable=AsyncMock
         ) as async_post_message:
             import asyncio
@@ -348,7 +337,8 @@ class AgentImageSupportTest(unittest.TestCase):
             asyncio.run(agent.send_agent_message("这是语音回复"))
 
         notification = async_post_message.await_args.args[0]
-        self.assertEqual(notification.voice_path, "/tmp/reply.opus")
+        self.assertIsNone(notification.voice_path)
+        self.assertIsNone(notification.voice_caption)
         self.assertEqual(notification.text, "这是语音回复")
 
     def test_agent_process_wraps_request_as_structured_json(self):