fix: default to text replies for voice input

2026-05-06 20:42:43 +08:00 · 2026-04-29 18:54:58 +08:00
parent 9ed5018cc2
commit af35101774
5 changed files with 19 additions and 77 deletions
--- a/app/agent/__init__.py
+++ b/app/agent/__init__.py
@@ -34,7 +34,6 @@ from app.chain import ChainBase
 from app.core.config import settings
 from app.db.transferhistory_oper import TransferHistoryOper
 from app.helper.llm import LLMHelper
-from app.helper.voice import VoiceHelper
 from app.log import logger
 from app.schemas import Notification, NotificationType
 from app.schemas.message import ChannelCapabilityManager, ChannelCapability
@@ -170,7 +169,6 @@ class MoviePilotAgent:
        self.channel = channel
        self.source = source
        self.username = username
-        self.reply_with_voice = False
        self._tool_context: Dict[str, object] = {}
        self.output_callback: Optional[Callable[[str], None]] = None
        self.force_streaming = False
@@ -280,8 +278,6 @@ class MoviePilotAgent:
        """
        if self.is_background:
            return self.force_streaming or callable(self.output_callback)
-        if self.reply_with_voice:
-            return False
        if self.force_streaming or callable(self.output_callback):
            return True
        # 啰嗦模式下始终需要流式输出来捕获工具调用前的 Agent 文字
@@ -379,10 +375,7 @@ class MoviePilotAgent:
        """
        try:
            # 系统提示词
-            system_prompt = prompt_manager.get_agent_prompt(
-                channel=self.channel,
-                prefer_voice_reply=self.reply_with_voice,
-            )
+            system_prompt = prompt_manager.get_agent_prompt(channel=self.channel)

            # LLM 模型（用于 agent 执行）
            llm = self._initialize_llm(streaming=streaming)
@@ -460,7 +453,6 @@ class MoviePilotAgent:
                f"images={len(images) if images else 0}, files={len(files) if files else 0}"
            )
            self._tool_context = {
-                "incoming_voice": self.reply_with_voice,
                "user_reply_sent": False,
                "reply_mode": None,
            }
@@ -678,22 +670,6 @@ class MoviePilotAgent:
        """
        通过原渠道发送消息给用户
        """
-        voice_path = None
-        if (
-            self.reply_with_voice
-            and VoiceHelper.resolve_reply_mode(
-                channel=self.channel,
-                source=self.source,
-            )
-            == VoiceHelper.REPLY_MODE_NATIVE
-            and VoiceHelper.is_available("tts")
-        ):
-            # 当用户本轮发来语音且 Agent 未主动调用 send_voice_message 时，
-            # 这里补一层自动语音回复兜底，避免最终仍只返回纯文字。
-            voice_file = await asyncio.to_thread(VoiceHelper.synthesize_speech, message)
-            if voice_file:
-                voice_path = str(voice_file)
-
        await AgentChain().async_post_message(
            Notification(
                channel=self.channel,
@@ -703,12 +679,6 @@ class MoviePilotAgent:
                username=self.username,
                title=title,
                text=message,
-                voice_path=voice_path,
-                voice_caption=(
-                    message
-                    if voice_path and settings.AI_VOICE_REPLY_WITH_TEXT
-                    else None
-                ),
            )
        )

@@ -753,7 +723,6 @@ class _MessageTask:
    channel: Optional[str] = None
    source: Optional[str] = None
    username: Optional[str] = None
-    reply_with_voice: bool = False


 class AgentManager:
@@ -851,7 +820,6 @@ class AgentManager:
        channel: str = None,
        source: str = None,
        username: str = None,
-        reply_with_voice: bool = False,
    ) -> str:
        """
        处理用户消息：将消息放入会话队列，按顺序依次处理。
@@ -866,7 +834,6 @@ class AgentManager:
            channel=channel,
            source=source,
            username=username,
-            reply_with_voice=reply_with_voice,
        )

        # 获取或创建会话队列
@@ -964,7 +931,6 @@ class AgentManager:
                agent.source = task.source
            if task.username:
                agent.username = task.username
-        agent.reply_with_voice = task.reply_with_voice

        return await agent.process(task.message, images=task.images, files=task.files)

--- a/app/agent/prompt/__init__.py
+++ b/app/agent/prompt/__init__.py
@@ -87,13 +87,10 @@ class PromptManager:
            logger.error(f"加载提示词失败: {prompt_name}, 错误: {e}")
            raise

-    def get_agent_prompt(
-        self, channel: str = None, prefer_voice_reply: bool = False
-    ) -> str:
+    def get_agent_prompt(self, channel: str = None) -> str:
        """
        获取智能体提示词
        :param channel: 消息渠道（Telegram、微信、Slack等）
-        :param prefer_voice_reply: 是否优先使用语音回复
        :return: 提示词内容
        """
        # 基础提示词只保留 MoviePilot 运行时和渠道能力相关约束。
@@ -132,9 +129,7 @@ class PromptManager:

        # MoviePilot系统信息
        moviepilot_info = self._get_moviepilot_info()
-        voice_reply_spec = self._generate_voice_reply_instructions(
-            prefer_voice_reply=prefer_voice_reply
-        )
+        voice_reply_spec = self._generate_voice_reply_instructions()

        # 始终替换占位符，避免后续 .format() 时因残留花括号报 KeyError
        base_prompt = base_prompt.format(
@@ -326,17 +321,11 @@ class PromptManager:
        return "\n".join(instructions)

    @staticmethod
-    def _generate_voice_reply_instructions(prefer_voice_reply: bool) -> str:
-        if not prefer_voice_reply:
+    def _generate_voice_reply_instructions() -> str:
        return (
            "- Voice replies: Use normal text replies by default. "
-                "Only call `send_voice_message` when spoken playback is clearly better than plain text."
-            )
-        return (
-            "- Current message context: The user sent a voice message.\n"
-            "- Reply preference: Prioritize calling `send_voice_message` for the main user-facing reply.\n"
-            "- Fallback: If native voice is unavailable on the current channel, `send_voice_message` will fall back to text.\n"
-            "- Do not repeat the same full reply again after calling `send_voice_message`."
+            "Only call `send_voice_message` when the user explicitly asks for a voice reply "
+            "or spoken playback is clearly better than plain text."
        )

    @staticmethod
--- a/app/agent/tools/impl/send_voice_message.py
+++ b/app/agent/tools/impl/send_voice_message.py
@@ -28,9 +28,10 @@ class SendVoiceMessageInput(BaseModel):
 class SendVoiceMessageTool(MoviePilotTool):
    name: str = "send_voice_message"
    description: str = (
-        "Send a voice reply to the current user. Prefer this when the user sent a voice message "
-        "or when spoken playback is more natural. On channels without voice support or when TTS "
-        "is unavailable, it automatically falls back to sending the same content as plain text."
+        "Send a voice reply to the current user. Use this only when the user explicitly asks for "
+        "a voice reply or when spoken playback is clearly better than plain text. On channels "
+        "without voice support or when TTS is unavailable, it automatically falls back to sending "
+        "the same content as plain text."
    )
    args_schema: Type[BaseModel] = SendVoiceMessageInput
    require_admin: bool = False
--- a/app/chain/message.py
+++ b/app/chain/message.py
@@ -109,8 +109,8 @@ class MessageChain(ChainBase):
        """
        images = CommingMessage.MessageImage.normalize_list(images)

-        # 识别语音为文本
-        reply_with_voice = bool(audio_refs)
+        # 语音输入只用于转写为文本，不默认改变回复形式。
+        has_audio_input = bool(audio_refs)
        if audio_refs:
            transcript = self._transcribe_audio_refs(audio_refs, channel, source)
            merged_parts = []
@@ -198,13 +198,12 @@ class MessageChain(ChainBase):
                username=username,
                images=images,
                files=files,
-                reply_with_voice=reply_with_voice,
            )
            return

        if (
            settings.AI_AGENT_ENABLE
-            and (settings.AI_AGENT_GLOBAL or images or files or reply_with_voice)
+            and (settings.AI_AGENT_GLOBAL or images or files or has_audio_input)
        ):
            self._handle_ai_message(
                text=text,
@@ -214,7 +213,6 @@ class MessageChain(ChainBase):
                username=username,
                images=images,
                files=files,
-                reply_with_voice=reply_with_voice,
            )
            return

@@ -866,7 +864,6 @@ class MessageChain(ChainBase):
        username: str,
        images: Optional[List[CommingMessage.MessageImage]] = None,
        files: Optional[List[CommingMessage.MessageAttachment]] = None,
-        reply_with_voice: bool = False,
        session_id: Optional[str] = None,
    ) -> None:
        """
@@ -978,7 +975,6 @@ class MessageChain(ChainBase):
                    channel=channel.value if channel else None,
                    source=source,
                    username=username,
-                    reply_with_voice=reply_with_voice,
                ),
                global_vars.loop,
            )
--- a/tests/test_agent_image_support.py
+++ b/tests/test_agent_image_support.py
@@ -2,7 +2,6 @@ import base64
 import json
 import tempfile
 import unittest
-from pathlib import Path
 from types import SimpleNamespace
 from unittest.mock import AsyncMock, Mock, patch
 from urllib.parse import quote
@@ -15,7 +14,6 @@ from app.agent import MoviePilotAgent, AgentChain
 from app.chain.message import MessageChain
 from app.core.config import settings
 from app.helper.llm import LLMHelper
-from app.helper.voice import VoiceHelper
 from app.modules.discord import DiscordModule
 from app.modules.qqbot import QQBotModule
 from app.modules.slack import SlackModule
@@ -218,7 +216,7 @@ class AgentImageSupportTest(unittest.TestCase):

        handle_ai_message.assert_called_once()

-    def test_audio_message_routes_to_agent_with_voice_reply_flag(self):
+    def test_audio_message_routes_to_agent_without_forcing_voice_reply(self):
        chain = MessageChain()

        with patch.object(chain, "load_cache", return_value={}), patch.object(
@@ -240,7 +238,7 @@ class AgentImageSupportTest(unittest.TestCase):

        handle_ai_message.assert_called_once()
        self.assertEqual(handle_ai_message.call_args.kwargs["text"], "帮我推荐一部电影")
-        self.assertTrue(handle_ai_message.call_args.kwargs["reply_with_voice"])
+        self.assertNotIn("reply_with_voice", handle_ai_message.call_args.kwargs)

    def test_file_message_routes_to_agent_even_when_global_agent_is_disabled(self):
        chain = MessageChain()
@@ -322,7 +320,7 @@ class AgentImageSupportTest(unittest.TestCase):
            ],
        )

-    def test_agent_send_agent_message_auto_converts_to_voice_when_supported(self):
+    def test_agent_send_agent_message_keeps_default_text_reply(self):
        agent = MoviePilotAgent(
            session_id="session-1",
            user_id="user-1",
@@ -330,17 +328,8 @@ class AgentImageSupportTest(unittest.TestCase):
            source="telegram-test",
            username="tester",
        )
-        agent.reply_with_voice = True

        with patch.object(
-            VoiceHelper,
-            "resolve_reply_mode",
-            return_value=VoiceHelper.REPLY_MODE_NATIVE,
-        ), patch.object(
-            VoiceHelper, "is_available", return_value=True
-        ), patch.object(
-            VoiceHelper, "synthesize_speech", return_value=Path("/tmp/reply.opus")
-        ), patch.object(
            AgentChain, "async_post_message", new_callable=AsyncMock
        ) as async_post_message:
            import asyncio
@@ -348,7 +337,8 @@ class AgentImageSupportTest(unittest.TestCase):
            asyncio.run(agent.send_agent_message("这是语音回复"))

        notification = async_post_message.await_args.args[0]
-        self.assertEqual(notification.voice_path, "/tmp/reply.opus")
+        self.assertIsNone(notification.voice_path)
+        self.assertIsNone(notification.voice_caption)
        self.assertEqual(notification.text, "这是语音回复")

    def test_agent_process_wraps_request_as_structured_json(self):