fix: default to text replies for voice input

This commit is contained in:
jxxghp
2026-04-29 18:54:58 +08:00
parent 9ed5018cc2
commit af35101774
5 changed files with 19 additions and 77 deletions

View File

@@ -34,7 +34,6 @@ from app.chain import ChainBase
from app.core.config import settings from app.core.config import settings
from app.db.transferhistory_oper import TransferHistoryOper from app.db.transferhistory_oper import TransferHistoryOper
from app.helper.llm import LLMHelper from app.helper.llm import LLMHelper
from app.helper.voice import VoiceHelper
from app.log import logger from app.log import logger
from app.schemas import Notification, NotificationType from app.schemas import Notification, NotificationType
from app.schemas.message import ChannelCapabilityManager, ChannelCapability from app.schemas.message import ChannelCapabilityManager, ChannelCapability
@@ -170,7 +169,6 @@ class MoviePilotAgent:
self.channel = channel self.channel = channel
self.source = source self.source = source
self.username = username self.username = username
self.reply_with_voice = False
self._tool_context: Dict[str, object] = {} self._tool_context: Dict[str, object] = {}
self.output_callback: Optional[Callable[[str], None]] = None self.output_callback: Optional[Callable[[str], None]] = None
self.force_streaming = False self.force_streaming = False
@@ -280,8 +278,6 @@ class MoviePilotAgent:
""" """
if self.is_background: if self.is_background:
return self.force_streaming or callable(self.output_callback) return self.force_streaming or callable(self.output_callback)
if self.reply_with_voice:
return False
if self.force_streaming or callable(self.output_callback): if self.force_streaming or callable(self.output_callback):
return True return True
# 啰嗦模式下始终需要流式输出来捕获工具调用前的 Agent 文字 # 啰嗦模式下始终需要流式输出来捕获工具调用前的 Agent 文字
@@ -379,10 +375,7 @@ class MoviePilotAgent:
""" """
try: try:
# 系统提示词 # 系统提示词
system_prompt = prompt_manager.get_agent_prompt( system_prompt = prompt_manager.get_agent_prompt(channel=self.channel)
channel=self.channel,
prefer_voice_reply=self.reply_with_voice,
)
# LLM 模型(用于 agent 执行) # LLM 模型(用于 agent 执行)
llm = self._initialize_llm(streaming=streaming) llm = self._initialize_llm(streaming=streaming)
@@ -460,7 +453,6 @@ class MoviePilotAgent:
f"images={len(images) if images else 0}, files={len(files) if files else 0}" f"images={len(images) if images else 0}, files={len(files) if files else 0}"
) )
self._tool_context = { self._tool_context = {
"incoming_voice": self.reply_with_voice,
"user_reply_sent": False, "user_reply_sent": False,
"reply_mode": None, "reply_mode": None,
} }
@@ -678,22 +670,6 @@ class MoviePilotAgent:
""" """
通过原渠道发送消息给用户 通过原渠道发送消息给用户
""" """
voice_path = None
if (
self.reply_with_voice
and VoiceHelper.resolve_reply_mode(
channel=self.channel,
source=self.source,
)
== VoiceHelper.REPLY_MODE_NATIVE
and VoiceHelper.is_available("tts")
):
# 当用户本轮发来语音且 Agent 未主动调用 send_voice_message 时,
# 这里补一层自动语音回复兜底,避免最终仍只返回纯文字。
voice_file = await asyncio.to_thread(VoiceHelper.synthesize_speech, message)
if voice_file:
voice_path = str(voice_file)
await AgentChain().async_post_message( await AgentChain().async_post_message(
Notification( Notification(
channel=self.channel, channel=self.channel,
@@ -703,12 +679,6 @@ class MoviePilotAgent:
username=self.username, username=self.username,
title=title, title=title,
text=message, text=message,
voice_path=voice_path,
voice_caption=(
message
if voice_path and settings.AI_VOICE_REPLY_WITH_TEXT
else None
),
) )
) )
@@ -753,7 +723,6 @@ class _MessageTask:
channel: Optional[str] = None channel: Optional[str] = None
source: Optional[str] = None source: Optional[str] = None
username: Optional[str] = None username: Optional[str] = None
reply_with_voice: bool = False
class AgentManager: class AgentManager:
@@ -851,7 +820,6 @@ class AgentManager:
channel: str = None, channel: str = None,
source: str = None, source: str = None,
username: str = None, username: str = None,
reply_with_voice: bool = False,
) -> str: ) -> str:
""" """
处理用户消息:将消息放入会话队列,按顺序依次处理。 处理用户消息:将消息放入会话队列,按顺序依次处理。
@@ -866,7 +834,6 @@ class AgentManager:
channel=channel, channel=channel,
source=source, source=source,
username=username, username=username,
reply_with_voice=reply_with_voice,
) )
# 获取或创建会话队列 # 获取或创建会话队列
@@ -964,7 +931,6 @@ class AgentManager:
agent.source = task.source agent.source = task.source
if task.username: if task.username:
agent.username = task.username agent.username = task.username
agent.reply_with_voice = task.reply_with_voice
return await agent.process(task.message, images=task.images, files=task.files) return await agent.process(task.message, images=task.images, files=task.files)

View File

@@ -87,13 +87,10 @@ class PromptManager:
logger.error(f"加载提示词失败: {prompt_name}, 错误: {e}") logger.error(f"加载提示词失败: {prompt_name}, 错误: {e}")
raise raise
def get_agent_prompt( def get_agent_prompt(self, channel: str = None) -> str:
self, channel: str = None, prefer_voice_reply: bool = False
) -> str:
""" """
获取智能体提示词 获取智能体提示词
:param channel: 消息渠道Telegram、微信、Slack等 :param channel: 消息渠道Telegram、微信、Slack等
:param prefer_voice_reply: 是否优先使用语音回复
:return: 提示词内容 :return: 提示词内容
""" """
# 基础提示词只保留 MoviePilot 运行时和渠道能力相关约束。 # 基础提示词只保留 MoviePilot 运行时和渠道能力相关约束。
@@ -132,9 +129,7 @@ class PromptManager:
# MoviePilot系统信息 # MoviePilot系统信息
moviepilot_info = self._get_moviepilot_info() moviepilot_info = self._get_moviepilot_info()
voice_reply_spec = self._generate_voice_reply_instructions( voice_reply_spec = self._generate_voice_reply_instructions()
prefer_voice_reply=prefer_voice_reply
)
# 始终替换占位符,避免后续 .format() 时因残留花括号报 KeyError # 始终替换占位符,避免后续 .format() 时因残留花括号报 KeyError
base_prompt = base_prompt.format( base_prompt = base_prompt.format(
@@ -326,17 +321,11 @@ class PromptManager:
return "\n".join(instructions) return "\n".join(instructions)
@staticmethod @staticmethod
def _generate_voice_reply_instructions(prefer_voice_reply: bool) -> str: def _generate_voice_reply_instructions() -> str:
if not prefer_voice_reply:
return (
"- Voice replies: Use normal text replies by default. "
"Only call `send_voice_message` when spoken playback is clearly better than plain text."
)
return ( return (
"- Current message context: The user sent a voice message.\n" "- Voice replies: Use normal text replies by default. "
"- Reply preference: Prioritize calling `send_voice_message` for the main user-facing reply.\n" "Only call `send_voice_message` when the user explicitly asks for a voice reply "
"- Fallback: If native voice is unavailable on the current channel, `send_voice_message` will fall back to text.\n" "or spoken playback is clearly better than plain text."
"- Do not repeat the same full reply again after calling `send_voice_message`."
) )
@staticmethod @staticmethod

View File

@@ -28,9 +28,10 @@ class SendVoiceMessageInput(BaseModel):
class SendVoiceMessageTool(MoviePilotTool): class SendVoiceMessageTool(MoviePilotTool):
name: str = "send_voice_message" name: str = "send_voice_message"
description: str = ( description: str = (
"Send a voice reply to the current user. Prefer this when the user sent a voice message " "Send a voice reply to the current user. Use this only when the user explicitly asks for "
"or when spoken playback is more natural. On channels without voice support or when TTS " "a voice reply or when spoken playback is clearly better than plain text. On channels "
"is unavailable, it automatically falls back to sending the same content as plain text." "without voice support or when TTS is unavailable, it automatically falls back to sending "
"the same content as plain text."
) )
args_schema: Type[BaseModel] = SendVoiceMessageInput args_schema: Type[BaseModel] = SendVoiceMessageInput
require_admin: bool = False require_admin: bool = False

View File

@@ -109,8 +109,8 @@ class MessageChain(ChainBase):
""" """
images = CommingMessage.MessageImage.normalize_list(images) images = CommingMessage.MessageImage.normalize_list(images)
# 识别语音为文本 # 语音输入只用于转写为文本,不默认改变回复形式。
reply_with_voice = bool(audio_refs) has_audio_input = bool(audio_refs)
if audio_refs: if audio_refs:
transcript = self._transcribe_audio_refs(audio_refs, channel, source) transcript = self._transcribe_audio_refs(audio_refs, channel, source)
merged_parts = [] merged_parts = []
@@ -198,13 +198,12 @@ class MessageChain(ChainBase):
username=username, username=username,
images=images, images=images,
files=files, files=files,
reply_with_voice=reply_with_voice,
) )
return return
if ( if (
settings.AI_AGENT_ENABLE settings.AI_AGENT_ENABLE
and (settings.AI_AGENT_GLOBAL or images or files or reply_with_voice) and (settings.AI_AGENT_GLOBAL or images or files or has_audio_input)
): ):
self._handle_ai_message( self._handle_ai_message(
text=text, text=text,
@@ -214,7 +213,6 @@ class MessageChain(ChainBase):
username=username, username=username,
images=images, images=images,
files=files, files=files,
reply_with_voice=reply_with_voice,
) )
return return
@@ -866,7 +864,6 @@ class MessageChain(ChainBase):
username: str, username: str,
images: Optional[List[CommingMessage.MessageImage]] = None, images: Optional[List[CommingMessage.MessageImage]] = None,
files: Optional[List[CommingMessage.MessageAttachment]] = None, files: Optional[List[CommingMessage.MessageAttachment]] = None,
reply_with_voice: bool = False,
session_id: Optional[str] = None, session_id: Optional[str] = None,
) -> None: ) -> None:
""" """
@@ -978,7 +975,6 @@ class MessageChain(ChainBase):
channel=channel.value if channel else None, channel=channel.value if channel else None,
source=source, source=source,
username=username, username=username,
reply_with_voice=reply_with_voice,
), ),
global_vars.loop, global_vars.loop,
) )

View File

@@ -2,7 +2,6 @@ import base64
import json import json
import tempfile import tempfile
import unittest import unittest
from pathlib import Path
from types import SimpleNamespace from types import SimpleNamespace
from unittest.mock import AsyncMock, Mock, patch from unittest.mock import AsyncMock, Mock, patch
from urllib.parse import quote from urllib.parse import quote
@@ -15,7 +14,6 @@ from app.agent import MoviePilotAgent, AgentChain
from app.chain.message import MessageChain from app.chain.message import MessageChain
from app.core.config import settings from app.core.config import settings
from app.helper.llm import LLMHelper from app.helper.llm import LLMHelper
from app.helper.voice import VoiceHelper
from app.modules.discord import DiscordModule from app.modules.discord import DiscordModule
from app.modules.qqbot import QQBotModule from app.modules.qqbot import QQBotModule
from app.modules.slack import SlackModule from app.modules.slack import SlackModule
@@ -218,7 +216,7 @@ class AgentImageSupportTest(unittest.TestCase):
handle_ai_message.assert_called_once() handle_ai_message.assert_called_once()
def test_audio_message_routes_to_agent_with_voice_reply_flag(self): def test_audio_message_routes_to_agent_without_forcing_voice_reply(self):
chain = MessageChain() chain = MessageChain()
with patch.object(chain, "load_cache", return_value={}), patch.object( with patch.object(chain, "load_cache", return_value={}), patch.object(
@@ -240,7 +238,7 @@ class AgentImageSupportTest(unittest.TestCase):
handle_ai_message.assert_called_once() handle_ai_message.assert_called_once()
self.assertEqual(handle_ai_message.call_args.kwargs["text"], "帮我推荐一部电影") self.assertEqual(handle_ai_message.call_args.kwargs["text"], "帮我推荐一部电影")
self.assertTrue(handle_ai_message.call_args.kwargs["reply_with_voice"]) self.assertNotIn("reply_with_voice", handle_ai_message.call_args.kwargs)
def test_file_message_routes_to_agent_even_when_global_agent_is_disabled(self): def test_file_message_routes_to_agent_even_when_global_agent_is_disabled(self):
chain = MessageChain() chain = MessageChain()
@@ -322,7 +320,7 @@ class AgentImageSupportTest(unittest.TestCase):
], ],
) )
def test_agent_send_agent_message_auto_converts_to_voice_when_supported(self): def test_agent_send_agent_message_keeps_default_text_reply(self):
agent = MoviePilotAgent( agent = MoviePilotAgent(
session_id="session-1", session_id="session-1",
user_id="user-1", user_id="user-1",
@@ -330,17 +328,8 @@ class AgentImageSupportTest(unittest.TestCase):
source="telegram-test", source="telegram-test",
username="tester", username="tester",
) )
agent.reply_with_voice = True
with patch.object( with patch.object(
VoiceHelper,
"resolve_reply_mode",
return_value=VoiceHelper.REPLY_MODE_NATIVE,
), patch.object(
VoiceHelper, "is_available", return_value=True
), patch.object(
VoiceHelper, "synthesize_speech", return_value=Path("/tmp/reply.opus")
), patch.object(
AgentChain, "async_post_message", new_callable=AsyncMock AgentChain, "async_post_message", new_callable=AsyncMock
) as async_post_message: ) as async_post_message:
import asyncio import asyncio
@@ -348,7 +337,8 @@ class AgentImageSupportTest(unittest.TestCase):
asyncio.run(agent.send_agent_message("这是语音回复")) asyncio.run(agent.send_agent_message("这是语音回复"))
notification = async_post_message.await_args.args[0] notification = async_post_message.await_args.args[0]
self.assertEqual(notification.voice_path, "/tmp/reply.opus") self.assertIsNone(notification.voice_path)
self.assertIsNone(notification.voice_caption)
self.assertEqual(notification.text, "这是语音回复") self.assertEqual(notification.text, "这是语音回复")
def test_agent_process_wraps_request_as_structured_json(self): def test_agent_process_wraps_request_as_structured_json(self):