mirror of
https://github.com/jxxghp/MoviePilot.git
synced 2026-06-12 19:21:05 +08:00
feat(agent): mark and propagate voice input metadata in agent messages; clarify terminal tool usage in prompts
- Add `has_audio_input` flag to agent message handling and propagate it through the processing pipeline
- Structure agent input payloads to include `input.mode` and `input.transcribed` for voice messages
- Update prompts and tool descriptions to clarify that `send_voice_message` and `ask_user_choice` are terminal tools and should not be followed by redundant text replies
- Enhance tests to cover voice input metadata propagation and prompt updates
This commit is contained in:
@@ -861,6 +861,7 @@ class MoviePilotAgent:
|
||||
message: str,
|
||||
images: List[str] = None,
|
||||
files: Optional[List[dict]] = None,
|
||||
has_audio_input: bool = False,
|
||||
) -> str:
|
||||
"""
|
||||
处理用户消息,流式推理并返回 Agent 回复
|
||||
@@ -868,7 +869,8 @@ class MoviePilotAgent:
|
||||
try:
|
||||
logger.info(
|
||||
f"Agent推理: session_id={self.session_id}, input={message}, "
|
||||
f"images={len(images) if images else 0}, files={len(files) if files else 0}"
|
||||
f"images={len(images) if images else 0}, files={len(files) if files else 0}, "
|
||||
f"audio_input={has_audio_input}"
|
||||
)
|
||||
self._tool_context = {
|
||||
"user_reply_sent": False,
|
||||
@@ -885,6 +887,10 @@ class MoviePilotAgent:
|
||||
# 构建结构化用户消息内容
|
||||
request_payload = {
|
||||
"message": message or "",
|
||||
"input": {
|
||||
"mode": "voice" if has_audio_input else "text",
|
||||
"transcribed": bool(has_audio_input),
|
||||
},
|
||||
"images": [
|
||||
{"index": index + 1, "type": "image"}
|
||||
for index, _ in enumerate(images or [])
|
||||
@@ -1187,6 +1193,7 @@ class _MessageTask:
|
||||
message: str
|
||||
images: Optional[List[str]] = None
|
||||
files: Optional[List[dict]] = None
|
||||
has_audio_input: bool = False
|
||||
channel: Optional[str] = None
|
||||
source: Optional[str] = None
|
||||
username: Optional[str] = None
|
||||
@@ -1333,6 +1340,7 @@ class AgentManager:
|
||||
message: str,
|
||||
images: List[str] = None,
|
||||
files: Optional[List[dict]] = None,
|
||||
has_audio_input: bool = False,
|
||||
channel: str = None,
|
||||
source: str = None,
|
||||
username: str = None,
|
||||
@@ -1352,6 +1360,7 @@ class AgentManager:
|
||||
message=message,
|
||||
images=images,
|
||||
files=files,
|
||||
has_audio_input=has_audio_input,
|
||||
channel=channel,
|
||||
source=source,
|
||||
username=username,
|
||||
@@ -1488,7 +1497,13 @@ class AgentManager:
|
||||
agent.persist_output_message = task.persist_output_message
|
||||
agent.allow_message_tools = task.allow_message_tools
|
||||
|
||||
return await agent.process(task.message, images=task.images, files=task.files)
|
||||
process_kwargs = {
|
||||
"images": task.images,
|
||||
"files": task.files,
|
||||
}
|
||||
if task.has_audio_input:
|
||||
process_kwargs["has_audio_input"] = True
|
||||
return await agent.process(task.message, **process_kwargs)
|
||||
|
||||
async def stop_current_task(self, session_id: str):
|
||||
"""
|
||||
|
||||
@@ -35,7 +35,7 @@ You act as a proactive agent. Your goal is to fully resolve the user's media-rel
|
||||
- Treat manual download and subscription automation as two execution modes of the same acquisition pipeline. Manual download is user-triggered immediate acquisition; subscription is persistent site-driven monitoring and acquisition.
|
||||
- Keep the user anchored to the operational step that matters now: site, search, recognition, download, subscription, transfer, or status/history.
|
||||
- Users may attach images from supported channels; analyze them together with the text when relevant.
|
||||
- User messages may arrive as structured JSON. Treat the `message` field as the user's text. Attachments appear in `files`; when `local_path` is present, use local file tools to inspect the uploaded file directly. When image input is disabled for the current model, user images may also be delivered through `files`.
|
||||
- User messages may arrive as structured JSON. Treat the `message` field as the user's text. Input metadata appears in `input`; when `input.mode` is `voice`, the user sent a voice message and `message` contains its transcript. Attachments appear in `files`; when `local_path` is present, use local file tools to inspect the uploaded file directly. When image input is disabled for the current model, user images may also be delivered through `files`.
|
||||
</moviepilot_domain_model>
|
||||
|
||||
<operating_principles>
|
||||
|
||||
@@ -396,7 +396,12 @@ class PromptManager:
|
||||
return (
|
||||
"Use normal text replies by default. Only call `send_voice_message` "
|
||||
"when the user explicitly asks for a voice reply or spoken playback "
|
||||
"is clearly better than plain text."
|
||||
"is clearly better than plain text. `send_voice_message` is a terminal "
|
||||
"response tool: put the complete user-facing reply in its `message` "
|
||||
"argument, then stop the turn. Do not also call `send_message`, do not "
|
||||
"write a final text reply after it, and do not repeat the same content "
|
||||
"as plain text. If native voice is unavailable, the tool sends the same "
|
||||
"content as a text fallback and still completes the reply."
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
@@ -410,9 +415,11 @@ class PromptManager:
|
||||
):
|
||||
return (
|
||||
"- User questions: If you need the user to choose from a few clear options, "
|
||||
"call `ask_user_choice` to send button options. After the user clicks a button, "
|
||||
"the selected value will come back as the user's next message. After calling this tool, "
|
||||
"wait for the user's selection instead of repeating the question in plain text."
|
||||
"call `ask_user_choice` to send button options. `ask_user_choice` is a terminal "
|
||||
"interaction tool: put the full question and all options in the tool call, then "
|
||||
"stop the turn and wait for the user's selection. The selected value will come back "
|
||||
"as the user's next message. Do not also call `send_message`, do not write a final "
|
||||
"text reply after it, and do not repeat the question in plain text."
|
||||
)
|
||||
return "- User questions: When you truly need user input, ask briefly in plain text."
|
||||
|
||||
|
||||
@@ -71,7 +71,9 @@ class AskUserChoiceTool(MoviePilotTool):
|
||||
return_direct: bool = True
|
||||
description: str = (
|
||||
"Ask the user to choose from button options on channels that support interactive buttons. "
|
||||
"After the user clicks a button, the selected value will come back as the user's next message."
|
||||
"This is a terminal interaction tool: put the full question and all options in this call, "
|
||||
"then stop the current turn. After the user clicks a button, the selected value will come "
|
||||
"back as the user's next message. Do not also send the same question as plain text."
|
||||
)
|
||||
args_schema: Type[BaseModel] = AskUserChoiceInput
|
||||
require_admin: bool = False
|
||||
|
||||
@@ -35,7 +35,9 @@ class SendVoiceMessageTool(MoviePilotTool):
|
||||
"Send a voice reply to the current user. Use this only when the user explicitly asks for "
|
||||
"a voice reply or when spoken playback is clearly better than plain text. On channels "
|
||||
"without voice support or when TTS is unavailable, it automatically falls back to sending "
|
||||
"the same content as plain text."
|
||||
"the same content as plain text. This is a terminal response tool: put the complete "
|
||||
"user-facing reply in `message`; after this tool runs, do not send another text reply "
|
||||
"or call `send_message` with the same content."
|
||||
)
|
||||
args_schema: Type[BaseModel] = SendVoiceMessageInput
|
||||
require_admin: bool = False
|
||||
|
||||
@@ -350,6 +350,7 @@ class MessageChain(ChainBase):
|
||||
original_chat_id=original_chat_id,
|
||||
images=images,
|
||||
files=files,
|
||||
has_audio_input=has_audio_input,
|
||||
)
|
||||
|
||||
if (
|
||||
@@ -366,6 +367,7 @@ class MessageChain(ChainBase):
|
||||
original_chat_id=original_chat_id,
|
||||
images=images,
|
||||
files=files,
|
||||
has_audio_input=has_audio_input,
|
||||
)
|
||||
|
||||
if MediaInteractionChain().handle_text_interaction(
|
||||
@@ -1204,6 +1206,7 @@ class MessageChain(ChainBase):
|
||||
images: Optional[List[CommingMessage.MessageImage]] = None,
|
||||
files: Optional[List[CommingMessage.MessageAttachment]] = None,
|
||||
session_id: Optional[str] = None,
|
||||
has_audio_input: bool = False,
|
||||
) -> bool:
|
||||
"""
|
||||
处理AI智能体消息
|
||||
@@ -1317,6 +1320,8 @@ class MessageChain(ChainBase):
|
||||
else None,
|
||||
"original_chat_id": original_chat_id,
|
||||
}
|
||||
if has_audio_input:
|
||||
process_kwargs["has_audio_input"] = True
|
||||
# 在事件循环中处理
|
||||
asyncio.run_coroutine_threadsafe(
|
||||
agent_manager.process_message(**process_kwargs),
|
||||
|
||||
Reference in New Issue
Block a user