feat(agent): mark and propagate voice input metadata in agent messages; clarify terminal tool usage in prompts

- Add `has_audio_input` flag to agent message handling and propagate it through the processing pipeline
- Structure agent input payloads to include `input.mode` and `input.transcribed` for voice messages
- Update prompts and tool descriptions to clarify that `send_voice_message` and `ask_user_choice` are terminal tools and should not be followed by redundant text replies
- Enhance tests to cover voice input metadata propagation and prompt updates
2026-06-13 03:30:51 +08:00 · 2026-05-31 18:04:02 +08:00
parent 13b2163788
commit 855681ff35
11 changed files with 139 additions and 10 deletions
--- a/app/agent/tools/impl/ask_user_choice.py
+++ b/app/agent/tools/impl/ask_user_choice.py
@@ -71,7 +71,9 @@ class AskUserChoiceTool(MoviePilotTool):
    return_direct: bool = True
    description: str = (
        "Ask the user to choose from button options on channels that support interactive buttons. "
-        "After the user clicks a button, the selected value will come back as the user's next message."
+        "This is a terminal interaction tool: put the full question and all options in this call, "
+        "then stop the current turn. After the user clicks a button, the selected value will come "
+        "back as the user's next message. Do not also send the same question as plain text."
    )
    args_schema: Type[BaseModel] = AskUserChoiceInput
    require_admin: bool = False
--- a/app/agent/tools/impl/send_voice_message.py
+++ b/app/agent/tools/impl/send_voice_message.py
@@ -35,7 +35,9 @@ class SendVoiceMessageTool(MoviePilotTool):
        "Send a voice reply to the current user. Use this only when the user explicitly asks for "
        "a voice reply or when spoken playback is clearly better than plain text. On channels "
        "without voice support or when TTS is unavailable, it automatically falls back to sending "
-        "the same content as plain text."
+        "the same content as plain text. This is a terminal response tool: put the complete "
+        "user-facing reply in `message`; after this tool runs, do not send another text reply "
+        "or call `send_message` with the same content."
    )
    args_schema: Type[BaseModel] = SendVoiceMessageInput
    require_admin: bool = False