mirror of
https://github.com/jxxghp/MoviePilot.git
synced 2026-06-11 10:40:18 +08:00
feat(agent): mark and propagate voice input metadata in agent messages; clarify terminal tool usage in prompts
- Add `has_audio_input` flag to agent message handling and propagate it through the processing pipeline
- Structure agent input payloads to include `input.mode` and `input.transcribed` for voice messages
- Update prompts and tool descriptions to clarify that `send_voice_message` and `ask_user_choice` are terminal tools and should not be followed by redundant text replies
- Enhance tests to cover voice input metadata propagation and prompt updates
This commit is contained in:
@@ -10,6 +10,7 @@ from app.agent import (
|
||||
AgentManager,
|
||||
ReplyMode,
|
||||
UNSUPPORTED_IMAGE_INPUT_MESSAGE,
|
||||
_MessageTask,
|
||||
)
|
||||
from app.agent.memory import memory_manager
|
||||
from app.agent.tools.factory import MoviePilotToolFactory
|
||||
@@ -288,6 +289,28 @@ class AgentBackgroundOutputTest(unittest.IsolatedAsyncioTestCase):
|
||||
|
||||
process_message.assert_not_awaited()
|
||||
|
||||
async def test_agent_manager_preserves_voice_input_flag(self):
    """Check that the queued-task path forwards the voice-input flag to the agent.

    A task built with ``has_audio_input=True`` is run through
    ``AgentManager._process_message_internal`` and the agent's ``process``
    mock must be awaited exactly once with that flag still set.
    """
    session_key = "session-1"
    prompt_text = "帮我推荐一部电影"

    manager = AgentManager()
    stub_agent = MoviePilotAgent(session_id=session_key, user_id="user-1")
    # Register the agent under the task's session id; presumably the manager
    # then reuses it rather than building a new one -- confirm against AgentManager.
    manager.active_agents[session_key] = stub_agent
    # Swap the real coroutine for a mock so the call arguments can be inspected.
    stub_agent.process = AsyncMock(return_value="ok")

    queued = _MessageTask(
        session_id=session_key,
        user_id="user-1",
        message=prompt_text,
        has_audio_input=True,
    )
    await manager._process_message_internal(queued)

    expected_kwargs = {"images": None, "files": None, "has_audio_input": True}
    stub_agent.process.assert_awaited_once_with(prompt_text, **expected_kwargs)
|
||||
|
||||
async def test_create_agent_excludes_activity_log_for_heartbeat_session(self):
|
||||
agent = MoviePilotAgent(
|
||||
session_id=f"{HEARTBEAT_SESSION_PREFIX}test__",
|
||||
|
||||
@@ -242,6 +242,7 @@ class AgentImageSupportTest(unittest.TestCase):
|
||||
|
||||
handle_ai_message.assert_called_once()
|
||||
self.assertEqual(handle_ai_message.call_args.kwargs["text"], "帮我推荐一部电影")
|
||||
self.assertTrue(handle_ai_message.call_args.kwargs["has_audio_input"])
|
||||
self.assertNotIn("reply_with_voice", handle_ai_message.call_args.kwargs)
|
||||
|
||||
def test_file_message_routes_to_agent_even_when_global_agent_is_disabled(self):
|
||||
@@ -390,8 +391,36 @@ class AgentImageSupportTest(unittest.TestCase):
|
||||
self.assertIsInstance(content, list)
|
||||
payload = json.loads(content[0]["text"])
|
||||
self.assertEqual(payload["message"], "帮我总结这个文件")
|
||||
self.assertEqual(payload["input"]["mode"], "text")
|
||||
self.assertFalse(payload["input"]["transcribed"])
|
||||
self.assertEqual(payload["files"][0]["local_path"], "/tmp/report.txt")
|
||||
|
||||
def test_agent_process_marks_voice_input_in_structured_json(self):
    """Check that voice input is tagged as transcribed in the structured payload.

    ``agent.process`` is driven with ``has_audio_input=True`` while
    ``_execute_agent`` is mocked; the JSON text of the last message handed to
    it must carry ``input.mode == "voice"`` and a truthy ``input.transcribed``.
    """
    spoken_text = "帮我推荐一部电影"
    agent = MoviePilotAgent(
        session_id="session-1",
        user_id="user-1",
        channel=MessageChannel.Telegram.value,
        source="telegram-test",
        username="tester",
    )

    # Patch the memory lookup to return an empty list for this run.
    history_patch = patch(
        "app.agent.memory.memory_manager.get_agent_messages", return_value=[]
    )
    # Capture what would have been executed instead of running the agent.
    execute_patch = patch.object(agent, "_execute_agent", new_callable=AsyncMock)

    with history_patch:
        with execute_patch as execute_agent:
            asyncio.run(agent.process(spoken_text, has_audio_input=True))

    # First positional argument of the awaited call is the message list.
    sent_messages = execute_agent.await_args.args[0]
    last_message = sent_messages[-1]
    payload = json.loads(last_message.content[0]["text"])
    input_meta = payload["input"]

    self.assertEqual(payload["message"], spoken_text)
    self.assertEqual(input_meta["mode"], "voice")
    self.assertTrue(input_meta["transcribed"])
|
||||
|
||||
def test_llm_supports_image_input_respects_explicit_override(self):
|
||||
with patch.object(settings, "LLM_SUPPORT_IMAGE_INPUT", False):
|
||||
self.assertFalse(LLMHelper.supports_image_input())
|
||||
@@ -447,6 +476,29 @@ class AgentImageSupportTest(unittest.TestCase):
|
||||
"/tmp/image_1.jpg",
|
||||
)
|
||||
|
||||
def test_handle_ai_message_forwards_voice_input_to_agent_manager(self):
    """Check that the voice-input flag survives when an AI message is enqueued.

    ``MessageChain._handle_ai_message`` is called with ``has_audio_input=True``
    and the mocked ``agent_manager.process_message`` must have received that
    keyword with a truthy value.
    """

    def _discard_coroutine(coro, _loop):
        # Close the coroutine handed over for scheduling (it is never run here)
        # and return a placeholder in place of the future.
        coro.close()
        return Mock()

    chain = MessageChain()

    enable_agent = patch.object(settings, "AI_AGENT_ENABLE", True)
    fixed_session = patch.object(
        chain, "_get_or_create_session_id", return_value="session-1"
    )
    process_patch = patch(
        "app.chain.message.agent_manager.process_message", new_callable=AsyncMock
    )
    scheduler_patch = patch(
        "app.chain.message.asyncio.run_coroutine_threadsafe",
        side_effect=_discard_coroutine,
    )

    with enable_agent:
        with fixed_session:
            with process_patch as process_message:
                with scheduler_patch:
                    chain._handle_ai_message(
                        text="帮我推荐一部电影",
                        channel=MessageChannel.Telegram,
                        source="telegram-test",
                        userid="10001",
                        username="tester",
                        has_audio_input=True,
                    )

    forwarded_kwargs = process_message.call_args.kwargs
    self.assertTrue(forwarded_kwargs["has_audio_input"])
|
||||
|
||||
def test_slack_images_use_authenticated_data_url_download(self):
|
||||
chain = MessageChain()
|
||||
|
||||
|
||||
@@ -30,6 +30,8 @@ class TestAgentInteraction(unittest.TestCase):
|
||||
)
|
||||
|
||||
self.assertIn("ask_user_choice", telegram_prompt)
|
||||
self.assertIn("terminal interaction tool", telegram_prompt)
|
||||
self.assertIn("do not write a final text reply after it", telegram_prompt)
|
||||
self.assertNotIn("ask_user_choice", wechat_prompt)
|
||||
|
||||
def test_factory_injects_choice_tool_only_for_button_channels(self):
|
||||
@@ -60,6 +62,7 @@ class TestAgentInteraction(unittest.TestCase):
|
||||
tool = AskUserChoiceTool(session_id="session-1", user_id="10001")
|
||||
|
||||
self.assertTrue(tool.return_direct)
|
||||
self.assertIn("terminal interaction tool", tool.description)
|
||||
|
||||
def test_choice_tool_sends_buttons_and_registers_pending_request(self):
|
||||
tool = AskUserChoiceTool(session_id="session-1", user_id="10001")
|
||||
|
||||
@@ -244,6 +244,24 @@ class TestAgentPromptStyle(unittest.TestCase):
|
||||
prompt,
|
||||
)
|
||||
|
||||
def test_voice_prompt_marks_voice_tool_as_terminal_reply(self):
    """Check that the voice-reply prompt says the voice tool ends the current turn.

    With ``LLM_SUPPORT_AUDIO_OUTPUT`` patched to ``True``, the generated agent
    prompt must contain each of the expected phrases below.
    """
    required_fragments = (
        "send_voice_message",
        "terminal response tool",
        "do not write a final text reply after it",
        "text fallback and still completes the reply",
    )

    # Only prompt generation happens under the patched setting.
    with patch.object(settings, "LLM_SUPPORT_AUDIO_OUTPUT", True):
        prompt = prompt_manager.get_agent_prompt()

    # Checked in order; the first missing fragment fails the test.
    for fragment in required_fragments:
        self.assertIn(fragment, prompt)
|
||||
|
||||
def test_core_prompt_describes_voice_input_metadata(self):
    """Check that the core prompt explains the voice-input metadata fields.

    The agent prompt must mention ``input.mode``, the word ``voice``, and the
    statement that ``message`` holds the transcript.
    """
    required_fragments = (
        "input.mode",
        "voice",
        "`message` contains its transcript",
    )

    prompt = prompt_manager.get_agent_prompt()

    # Checked in order; the first missing fragment fails the test.
    for fragment in required_fragments:
        self.assertIn(fragment, prompt)
|
||||
|
||||
def test_verbose_prompt_does_not_inject_silence_until_tools_finish_rule(self):
|
||||
with patch.object(settings, "AI_AGENT_VERBOSE", True):
|
||||
prompt = prompt_manager.get_agent_prompt()
|
||||
|
||||
@@ -441,7 +441,9 @@ class TestAgentToolStreaming(unittest.TestCase):
|
||||
self.assertEqual(notification.channel, channel)
|
||||
self.assertEqual(notification.voice_path, "/tmp/reply.opus")
|
||||
self.assertEqual(notification.voice_caption, "你好")
|
||||
self.assertTrue(SendVoiceMessageTool.return_direct)
|
||||
voice_tool = SendVoiceMessageTool(session_id="session-1", user_id="10001")
|
||||
self.assertTrue(voice_tool.return_direct)
|
||||
self.assertIn("terminal response tool", voice_tool.description)
|
||||
|
||||
def test_send_voice_message_falls_back_for_unsupported_channels(self):
|
||||
"""校验不支持语音输出的渠道继续回退为文字消息。"""
|
||||
|
||||
Reference in New Issue
Block a user