mirror of
https://github.com/jxxghp/MoviePilot.git
synced 2026-05-06 20:42:43 +08:00
fix: default to text replies for voice input
This commit is contained in:
@@ -34,7 +34,6 @@ from app.chain import ChainBase
|
||||
from app.core.config import settings
|
||||
from app.db.transferhistory_oper import TransferHistoryOper
|
||||
from app.helper.llm import LLMHelper
|
||||
from app.helper.voice import VoiceHelper
|
||||
from app.log import logger
|
||||
from app.schemas import Notification, NotificationType
|
||||
from app.schemas.message import ChannelCapabilityManager, ChannelCapability
|
||||
@@ -170,7 +169,6 @@ class MoviePilotAgent:
|
||||
self.channel = channel
|
||||
self.source = source
|
||||
self.username = username
|
||||
self.reply_with_voice = False
|
||||
self._tool_context: Dict[str, object] = {}
|
||||
self.output_callback: Optional[Callable[[str], None]] = None
|
||||
self.force_streaming = False
|
||||
@@ -280,8 +278,6 @@ class MoviePilotAgent:
|
||||
"""
|
||||
if self.is_background:
|
||||
return self.force_streaming or callable(self.output_callback)
|
||||
if self.reply_with_voice:
|
||||
return False
|
||||
if self.force_streaming or callable(self.output_callback):
|
||||
return True
|
||||
# 啰嗦模式下始终需要流式输出来捕获工具调用前的 Agent 文字
|
||||
@@ -379,10 +375,7 @@ class MoviePilotAgent:
|
||||
"""
|
||||
try:
|
||||
# 系统提示词
|
||||
system_prompt = prompt_manager.get_agent_prompt(
|
||||
channel=self.channel,
|
||||
prefer_voice_reply=self.reply_with_voice,
|
||||
)
|
||||
system_prompt = prompt_manager.get_agent_prompt(channel=self.channel)
|
||||
|
||||
# LLM 模型(用于 agent 执行)
|
||||
llm = self._initialize_llm(streaming=streaming)
|
||||
@@ -460,7 +453,6 @@ class MoviePilotAgent:
|
||||
f"images={len(images) if images else 0}, files={len(files) if files else 0}"
|
||||
)
|
||||
self._tool_context = {
|
||||
"incoming_voice": self.reply_with_voice,
|
||||
"user_reply_sent": False,
|
||||
"reply_mode": None,
|
||||
}
|
||||
@@ -678,22 +670,6 @@ class MoviePilotAgent:
|
||||
"""
|
||||
通过原渠道发送消息给用户
|
||||
"""
|
||||
voice_path = None
|
||||
if (
|
||||
self.reply_with_voice
|
||||
and VoiceHelper.resolve_reply_mode(
|
||||
channel=self.channel,
|
||||
source=self.source,
|
||||
)
|
||||
== VoiceHelper.REPLY_MODE_NATIVE
|
||||
and VoiceHelper.is_available("tts")
|
||||
):
|
||||
# 当用户本轮发来语音且 Agent 未主动调用 send_voice_message 时,
|
||||
# 这里补一层自动语音回复兜底,避免最终仍只返回纯文字。
|
||||
voice_file = await asyncio.to_thread(VoiceHelper.synthesize_speech, message)
|
||||
if voice_file:
|
||||
voice_path = str(voice_file)
|
||||
|
||||
await AgentChain().async_post_message(
|
||||
Notification(
|
||||
channel=self.channel,
|
||||
@@ -703,12 +679,6 @@ class MoviePilotAgent:
|
||||
username=self.username,
|
||||
title=title,
|
||||
text=message,
|
||||
voice_path=voice_path,
|
||||
voice_caption=(
|
||||
message
|
||||
if voice_path and settings.AI_VOICE_REPLY_WITH_TEXT
|
||||
else None
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
@@ -753,7 +723,6 @@ class _MessageTask:
|
||||
channel: Optional[str] = None
|
||||
source: Optional[str] = None
|
||||
username: Optional[str] = None
|
||||
reply_with_voice: bool = False
|
||||
|
||||
|
||||
class AgentManager:
|
||||
@@ -851,7 +820,6 @@ class AgentManager:
|
||||
channel: str = None,
|
||||
source: str = None,
|
||||
username: str = None,
|
||||
reply_with_voice: bool = False,
|
||||
) -> str:
|
||||
"""
|
||||
处理用户消息:将消息放入会话队列,按顺序依次处理。
|
||||
@@ -866,7 +834,6 @@ class AgentManager:
|
||||
channel=channel,
|
||||
source=source,
|
||||
username=username,
|
||||
reply_with_voice=reply_with_voice,
|
||||
)
|
||||
|
||||
# 获取或创建会话队列
|
||||
@@ -964,7 +931,6 @@ class AgentManager:
|
||||
agent.source = task.source
|
||||
if task.username:
|
||||
agent.username = task.username
|
||||
agent.reply_with_voice = task.reply_with_voice
|
||||
|
||||
return await agent.process(task.message, images=task.images, files=task.files)
|
||||
|
||||
|
||||
@@ -87,13 +87,10 @@ class PromptManager:
|
||||
logger.error(f"加载提示词失败: {prompt_name}, 错误: {e}")
|
||||
raise
|
||||
|
||||
def get_agent_prompt(
|
||||
self, channel: str = None, prefer_voice_reply: bool = False
|
||||
) -> str:
|
||||
def get_agent_prompt(self, channel: str = None) -> str:
|
||||
"""
|
||||
获取智能体提示词
|
||||
:param channel: 消息渠道(Telegram、微信、Slack等)
|
||||
:param prefer_voice_reply: 是否优先使用语音回复
|
||||
:return: 提示词内容
|
||||
"""
|
||||
# 基础提示词只保留 MoviePilot 运行时和渠道能力相关约束。
|
||||
@@ -132,9 +129,7 @@ class PromptManager:
|
||||
|
||||
# MoviePilot系统信息
|
||||
moviepilot_info = self._get_moviepilot_info()
|
||||
voice_reply_spec = self._generate_voice_reply_instructions(
|
||||
prefer_voice_reply=prefer_voice_reply
|
||||
)
|
||||
voice_reply_spec = self._generate_voice_reply_instructions()
|
||||
|
||||
# 始终替换占位符,避免后续 .format() 时因残留花括号报 KeyError
|
||||
base_prompt = base_prompt.format(
|
||||
@@ -326,17 +321,11 @@ class PromptManager:
|
||||
return "\n".join(instructions)
|
||||
|
||||
@staticmethod
|
||||
def _generate_voice_reply_instructions(prefer_voice_reply: bool) -> str:
|
||||
if not prefer_voice_reply:
|
||||
def _generate_voice_reply_instructions() -> str:
|
||||
return (
|
||||
"- Voice replies: Use normal text replies by default. "
|
||||
"Only call `send_voice_message` when spoken playback is clearly better than plain text."
|
||||
)
|
||||
return (
|
||||
"- Current message context: The user sent a voice message.\n"
|
||||
"- Reply preference: Prioritize calling `send_voice_message` for the main user-facing reply.\n"
|
||||
"- Fallback: If native voice is unavailable on the current channel, `send_voice_message` will fall back to text.\n"
|
||||
"- Do not repeat the same full reply again after calling `send_voice_message`."
|
||||
"Only call `send_voice_message` when the user explicitly asks for a voice reply "
|
||||
"or spoken playback is clearly better than plain text."
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
|
||||
@@ -28,9 +28,10 @@ class SendVoiceMessageInput(BaseModel):
|
||||
class SendVoiceMessageTool(MoviePilotTool):
|
||||
name: str = "send_voice_message"
|
||||
description: str = (
|
||||
"Send a voice reply to the current user. Prefer this when the user sent a voice message "
|
||||
"or when spoken playback is more natural. On channels without voice support or when TTS "
|
||||
"is unavailable, it automatically falls back to sending the same content as plain text."
|
||||
"Send a voice reply to the current user. Use this only when the user explicitly asks for "
|
||||
"a voice reply or when spoken playback is clearly better than plain text. On channels "
|
||||
"without voice support or when TTS is unavailable, it automatically falls back to sending "
|
||||
"the same content as plain text."
|
||||
)
|
||||
args_schema: Type[BaseModel] = SendVoiceMessageInput
|
||||
require_admin: bool = False
|
||||
|
||||
@@ -109,8 +109,8 @@ class MessageChain(ChainBase):
|
||||
"""
|
||||
images = CommingMessage.MessageImage.normalize_list(images)
|
||||
|
||||
# 识别语音为文本
|
||||
reply_with_voice = bool(audio_refs)
|
||||
# 语音输入只用于转写为文本,不默认改变回复形式。
|
||||
has_audio_input = bool(audio_refs)
|
||||
if audio_refs:
|
||||
transcript = self._transcribe_audio_refs(audio_refs, channel, source)
|
||||
merged_parts = []
|
||||
@@ -198,13 +198,12 @@ class MessageChain(ChainBase):
|
||||
username=username,
|
||||
images=images,
|
||||
files=files,
|
||||
reply_with_voice=reply_with_voice,
|
||||
)
|
||||
return
|
||||
|
||||
if (
|
||||
settings.AI_AGENT_ENABLE
|
||||
and (settings.AI_AGENT_GLOBAL or images or files or reply_with_voice)
|
||||
and (settings.AI_AGENT_GLOBAL or images or files or has_audio_input)
|
||||
):
|
||||
self._handle_ai_message(
|
||||
text=text,
|
||||
@@ -214,7 +213,6 @@ class MessageChain(ChainBase):
|
||||
username=username,
|
||||
images=images,
|
||||
files=files,
|
||||
reply_with_voice=reply_with_voice,
|
||||
)
|
||||
return
|
||||
|
||||
@@ -866,7 +864,6 @@ class MessageChain(ChainBase):
|
||||
username: str,
|
||||
images: Optional[List[CommingMessage.MessageImage]] = None,
|
||||
files: Optional[List[CommingMessage.MessageAttachment]] = None,
|
||||
reply_with_voice: bool = False,
|
||||
session_id: Optional[str] = None,
|
||||
) -> None:
|
||||
"""
|
||||
@@ -978,7 +975,6 @@ class MessageChain(ChainBase):
|
||||
channel=channel.value if channel else None,
|
||||
source=source,
|
||||
username=username,
|
||||
reply_with_voice=reply_with_voice,
|
||||
),
|
||||
global_vars.loop,
|
||||
)
|
||||
|
||||
@@ -2,7 +2,6 @@ import base64
|
||||
import json
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
from unittest.mock import AsyncMock, Mock, patch
|
||||
from urllib.parse import quote
|
||||
@@ -15,7 +14,6 @@ from app.agent import MoviePilotAgent, AgentChain
|
||||
from app.chain.message import MessageChain
|
||||
from app.core.config import settings
|
||||
from app.helper.llm import LLMHelper
|
||||
from app.helper.voice import VoiceHelper
|
||||
from app.modules.discord import DiscordModule
|
||||
from app.modules.qqbot import QQBotModule
|
||||
from app.modules.slack import SlackModule
|
||||
@@ -218,7 +216,7 @@ class AgentImageSupportTest(unittest.TestCase):
|
||||
|
||||
handle_ai_message.assert_called_once()
|
||||
|
||||
def test_audio_message_routes_to_agent_with_voice_reply_flag(self):
|
||||
def test_audio_message_routes_to_agent_without_forcing_voice_reply(self):
|
||||
chain = MessageChain()
|
||||
|
||||
with patch.object(chain, "load_cache", return_value={}), patch.object(
|
||||
@@ -240,7 +238,7 @@ class AgentImageSupportTest(unittest.TestCase):
|
||||
|
||||
handle_ai_message.assert_called_once()
|
||||
self.assertEqual(handle_ai_message.call_args.kwargs["text"], "帮我推荐一部电影")
|
||||
self.assertTrue(handle_ai_message.call_args.kwargs["reply_with_voice"])
|
||||
self.assertNotIn("reply_with_voice", handle_ai_message.call_args.kwargs)
|
||||
|
||||
def test_file_message_routes_to_agent_even_when_global_agent_is_disabled(self):
|
||||
chain = MessageChain()
|
||||
@@ -322,7 +320,7 @@ class AgentImageSupportTest(unittest.TestCase):
|
||||
],
|
||||
)
|
||||
|
||||
def test_agent_send_agent_message_auto_converts_to_voice_when_supported(self):
|
||||
def test_agent_send_agent_message_keeps_default_text_reply(self):
|
||||
agent = MoviePilotAgent(
|
||||
session_id="session-1",
|
||||
user_id="user-1",
|
||||
@@ -330,17 +328,8 @@ class AgentImageSupportTest(unittest.TestCase):
|
||||
source="telegram-test",
|
||||
username="tester",
|
||||
)
|
||||
agent.reply_with_voice = True
|
||||
|
||||
with patch.object(
|
||||
VoiceHelper,
|
||||
"resolve_reply_mode",
|
||||
return_value=VoiceHelper.REPLY_MODE_NATIVE,
|
||||
), patch.object(
|
||||
VoiceHelper, "is_available", return_value=True
|
||||
), patch.object(
|
||||
VoiceHelper, "synthesize_speech", return_value=Path("/tmp/reply.opus")
|
||||
), patch.object(
|
||||
AgentChain, "async_post_message", new_callable=AsyncMock
|
||||
) as async_post_message:
|
||||
import asyncio
|
||||
@@ -348,7 +337,8 @@ class AgentImageSupportTest(unittest.TestCase):
|
||||
asyncio.run(agent.send_agent_message("这是语音回复"))
|
||||
|
||||
notification = async_post_message.await_args.args[0]
|
||||
self.assertEqual(notification.voice_path, "/tmp/reply.opus")
|
||||
self.assertIsNone(notification.voice_path)
|
||||
self.assertIsNone(notification.voice_caption)
|
||||
self.assertEqual(notification.text, "这是语音回复")
|
||||
|
||||
def test_agent_process_wraps_request_as_structured_json(self):
|
||||
|
||||
Reference in New Issue
Block a user