fix: default to text replies for voice input

2026-05-06 20:42:43 +08:00 · 2026-04-29 18:54:58 +08:00
parent 9ed5018cc2
commit af35101774
5 changed files with 19 additions and 77 deletions
--- a/app/agent/init.py
+++ b/app/agent/init.py
@@ -34,7 +34,6 @@ from app.chain import ChainBase
 from app.core.config import settings
 from app.db.transferhistory_oper import TransferHistoryOper
 from app.helper.llm import LLMHelper
 from app.helper.voice import VoiceHelper
 from app.log import logger
 from app.schemas import Notification, NotificationType
 from app.schemas.message import ChannelCapabilityManager, ChannelCapability
@@ -170,7 +169,6 @@ class MoviePilotAgent:
        self.channel = channel
        self.source = source
        self.username = username
        self.reply_with_voice = False
        self._tool_context: Dict[str, object] = {}
        self.output_callback: Optional[Callable[[str], None]] = None
        self.force_streaming = False
@@ -280,8 +278,6 @@ class MoviePilotAgent:
        """
        if self.is_background:
            return self.force_streaming or callable(self.output_callback)
        if self.reply_with_voice:
            return False
        if self.force_streaming or callable(self.output_callback):
            return True
        # 啰嗦模式下始终需要流式输出来捕获工具调用前的 Agent 文字
@@ -379,10 +375,7 @@ class MoviePilotAgent:
        """
        try:
            # 系统提示词
-            system_prompt = prompt_manager.get_agent_prompt(
+            system_prompt = prompt_manager.get_agent_prompt(channel=self.channel)
                channel=self.channel,
                prefer_voice_reply=self.reply_with_voice,
            )
            # LLM 模型（用于 agent 执行）
            llm = self._initialize_llm(streaming=streaming)
@@ -460,7 +453,6 @@ class MoviePilotAgent:
                f"images={len(images) if images else 0}, files={len(files) if files else 0}"
            )
            self._tool_context = {
                "incoming_voice": self.reply_with_voice,
                "user_reply_sent": False,
                "reply_mode": None,
            }
@@ -678,22 +670,6 @@ class MoviePilotAgent:
        """
        通过原渠道发送消息给用户
        """
        voice_path = None
        if (
            self.reply_with_voice
            and VoiceHelper.resolve_reply_mode(
                channel=self.channel,
                source=self.source,
            )
            == VoiceHelper.REPLY_MODE_NATIVE
            and VoiceHelper.is_available("tts")
        ):
            # 当用户本轮发来语音且 Agent 未主动调用 send_voice_message 时，
            # 这里补一层自动语音回复兜底，避免最终仍只返回纯文字。
            voice_file = await asyncio.to_thread(VoiceHelper.synthesize_speech, message)
            if voice_file:
                voice_path = str(voice_file)
        await AgentChain().async_post_message(
            Notification(
                channel=self.channel,
@@ -703,12 +679,6 @@ class MoviePilotAgent:
                username=self.username,
                title=title,
                text=message,
                voice_path=voice_path,
                voice_caption=(
                    message
                    if voice_path and settings.AI_VOICE_REPLY_WITH_TEXT
                    else None
                ),
            )
        )
@@ -753,7 +723,6 @@ class _MessageTask:
    channel: Optional[str] = None
    source: Optional[str] = None
    username: Optional[str] = None
    reply_with_voice: bool = False
 class AgentManager:
@@ -851,7 +820,6 @@ class AgentManager:
        channel: str = None,
        source: str = None,
        username: str = None,
        reply_with_voice: bool = False,
    ) -> str:
        """
        处理用户消息：将消息放入会话队列，按顺序依次处理。
@@ -866,7 +834,6 @@ class AgentManager:
            channel=channel,
            source=source,
            username=username,
            reply_with_voice=reply_with_voice,
        )
        # 获取或创建会话队列
@@ -964,7 +931,6 @@ class AgentManager:
                agent.source = task.source
            if task.username:
                agent.username = task.username
        agent.reply_with_voice = task.reply_with_voice
        return await agent.process(task.message, images=task.images, files=task.files)
--- a/app/agent/prompt/init.py
+++ b/app/agent/prompt/init.py
@@ -87,13 +87,10 @@ class PromptManager:
            logger.error(f"加载提示词失败: {prompt_name}, 错误: {e}")
            raise
-    def get_agent_prompt(
+    def get_agent_prompt(self, channel: str = None) -> str:
        self, channel: str = None, prefer_voice_reply: bool = False
    ) -> str:
        """
        获取智能体提示词
        :param channel: 消息渠道（Telegram、微信、Slack等）
        :param prefer_voice_reply: 是否优先使用语音回复
        :return: 提示词内容
        """
        # 基础提示词只保留 MoviePilot 运行时和渠道能力相关约束。
@@ -132,9 +129,7 @@ class PromptManager:
        # MoviePilot系统信息
        moviepilot_info = self._get_moviepilot_info()
-        voice_reply_spec = self._generate_voice_reply_instructions(
+        voice_reply_spec = self._generate_voice_reply_instructions()
            prefer_voice_reply=prefer_voice_reply
        )
        # 始终替换占位符，避免后续 .format() 时因残留花括号报 KeyError
        base_prompt = base_prompt.format(
@@ -326,17 +321,11 @@ class PromptManager:
        return "\n".join(instructions)
    @staticmethod
-    def _generate_voice_reply_instructions(prefer_voice_reply: bool) -> str:
+    def _generate_voice_reply_instructions() -> str:
        if not prefer_voice_reply:
            return (
                "- Voice replies: Use normal text replies by default. "
                "Only call `send_voice_message` when spoken playback is clearly better than plain text."
            )
        return (
-            "- Current message context: The user sent a voice message.\n"
+            "- Voice replies: Use normal text replies by default. "
-            "- Reply preference: Prioritize calling `send_voice_message` for the main user-facing reply.\n"
+            "Only call `send_voice_message` when the user explicitly asks for a voice reply "
-            "- Fallback: If native voice is unavailable on the current channel, `send_voice_message` will fall back to text.\n"
+            "or spoken playback is clearly better than plain text."
            "- Do not repeat the same full reply again after calling `send_voice_message`."
        )
    @staticmethod
--- a/app/agent/tools/impl/send_voice_message.py
+++ b/app/agent/tools/impl/send_voice_message.py
@@ -28,9 +28,10 @@ class SendVoiceMessageInput(BaseModel):
 class SendVoiceMessageTool(MoviePilotTool):
    name: str = "send_voice_message"
    description: str = (
-        "Send a voice reply to the current user. Prefer this when the user sent a voice message "
+        "Send a voice reply to the current user. Use this only when the user explicitly asks for "
-        "or when spoken playback is more natural. On channels without voice support or when TTS "
+        "a voice reply or when spoken playback is clearly better than plain text. On channels "
-        "is unavailable, it automatically falls back to sending the same content as plain text."
+        "without voice support or when TTS is unavailable, it automatically falls back to sending "
        "the same content as plain text."
    )
    args_schema: Type[BaseModel] = SendVoiceMessageInput
    require_admin: bool = False
--- a/app/chain/message.py
+++ b/app/chain/message.py
@@ -109,8 +109,8 @@ class MessageChain(ChainBase):
        """
        images = CommingMessage.MessageImage.normalize_list(images)
-        # 识别语音为文本
+        # 语音输入只用于转写为文本，不默认改变回复形式。
-        reply_with_voice = bool(audio_refs)
+        has_audio_input = bool(audio_refs)
        if audio_refs:
            transcript = self._transcribe_audio_refs(audio_refs, channel, source)
            merged_parts = []
@@ -198,13 +198,12 @@ class MessageChain(ChainBase):
                username=username,
                images=images,
                files=files,
                reply_with_voice=reply_with_voice,
            )
            return
        if (
            settings.AI_AGENT_ENABLE
-            and (settings.AI_AGENT_GLOBAL or images or files or reply_with_voice)
+            and (settings.AI_AGENT_GLOBAL or images or files or has_audio_input)
        ):
            self._handle_ai_message(
                text=text,
@@ -214,7 +213,6 @@ class MessageChain(ChainBase):
                username=username,
                images=images,
                files=files,
                reply_with_voice=reply_with_voice,
            )
            return
@@ -866,7 +864,6 @@ class MessageChain(ChainBase):
        username: str,
        images: Optional[List[CommingMessage.MessageImage]] = None,
        files: Optional[List[CommingMessage.MessageAttachment]] = None,
        reply_with_voice: bool = False,
        session_id: Optional[str] = None,
    ) -> None:
        """
@@ -978,7 +975,6 @@ class MessageChain(ChainBase):
                    channel=channel.value if channel else None,
                    source=source,
                    username=username,
                    reply_with_voice=reply_with_voice,
                ),
                global_vars.loop,
            )
--- a/tests/test_agent_image_support.py
+++ b/tests/test_agent_image_support.py
@@ -2,7 +2,6 @@ import base64
 import json
 import tempfile
 import unittest
 from pathlib import Path
 from types import SimpleNamespace
 from unittest.mock import AsyncMock, Mock, patch
 from urllib.parse import quote
@@ -15,7 +14,6 @@ from app.agent import MoviePilotAgent, AgentChain
 from app.chain.message import MessageChain
 from app.core.config import settings
 from app.helper.llm import LLMHelper
 from app.helper.voice import VoiceHelper
 from app.modules.discord import DiscordModule
 from app.modules.qqbot import QQBotModule
 from app.modules.slack import SlackModule
@@ -218,7 +216,7 @@ class AgentImageSupportTest(unittest.TestCase):
        handle_ai_message.assert_called_once()
-    def test_audio_message_routes_to_agent_with_voice_reply_flag(self):
+    def test_audio_message_routes_to_agent_without_forcing_voice_reply(self):
        chain = MessageChain()
        with patch.object(chain, "load_cache", return_value={}), patch.object(
@@ -240,7 +238,7 @@ class AgentImageSupportTest(unittest.TestCase):
        handle_ai_message.assert_called_once()
        self.assertEqual(handle_ai_message.call_args.kwargs["text"], "帮我推荐一部电影")
-        self.assertTrue(handle_ai_message.call_args.kwargs["reply_with_voice"])
+        self.assertNotIn("reply_with_voice", handle_ai_message.call_args.kwargs)
    def test_file_message_routes_to_agent_even_when_global_agent_is_disabled(self):
        chain = MessageChain()
@@ -322,7 +320,7 @@ class AgentImageSupportTest(unittest.TestCase):
            ],
        )
-    def test_agent_send_agent_message_auto_converts_to_voice_when_supported(self):
+    def test_agent_send_agent_message_keeps_default_text_reply(self):
        agent = MoviePilotAgent(
            session_id="session-1",
            user_id="user-1",
@@ -330,17 +328,8 @@ class AgentImageSupportTest(unittest.TestCase):
            source="telegram-test",
            username="tester",
        )
        agent.reply_with_voice = True
        with patch.object(
            VoiceHelper,
            "resolve_reply_mode",
            return_value=VoiceHelper.REPLY_MODE_NATIVE,
        ), patch.object(
            VoiceHelper, "is_available", return_value=True
        ), patch.object(
            VoiceHelper, "synthesize_speech", return_value=Path("/tmp/reply.opus")
        ), patch.object(
            AgentChain, "async_post_message", new_callable=AsyncMock
        ) as async_post_message:
            import asyncio
@@ -348,7 +337,8 @@ class AgentImageSupportTest(unittest.TestCase):
            asyncio.run(agent.send_agent_message("这是语音回复"))
        notification = async_post_message.await_args.args[0]
-        self.assertEqual(notification.voice_path, "/tmp/reply.opus")
+        self.assertIsNone(notification.voice_path)
        self.assertIsNone(notification.voice_caption)
        self.assertEqual(notification.text, "这是语音回复")
    def test_agent_process_wraps_request_as_structured_json(self):