Files
httprunner/uixt/ai/planner_prompts.go
lilong.129 0add3231ff refactor: merge ActionSummary and Thought fields to eliminate duplication
- Remove redundant ActionSummary field from PlanningResult struct
- Update parsers to use unified Thought field instead of duplicate fields
- Modify chat interface to display Thought instead of ActionSummary
- Update planner logging to use thought instead of summary
- Adjust prompt templates to use thought field consistently
- Switch test LLM service from UI-TARS to DoubaoVL
- Add default parameter handling for sleep tool
2025-06-05 14:19:09 +08:00

144 lines
6.3 KiB
Go

package ai
import "github.com/httprunner/httprunner/v5/uixt/option"
// system prompt for UITARSContentParser
// doubao-1.5-ui-tars on volcengine.com
// https://www.volcengine.com/docs/82379/1536429
const doubao_1_5_ui_tars_planning_prompt = `
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
## Output Format
` + "```" + `
Thought: ...
Action: ...
` + "```" + `
## Action Space
click(start_box='[x1, y1, x2, y2]')
left_double(start_box='[x1, y1, x2, y2]')
right_single(start_box='[x1, y1, x2, y2]')
drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
hotkey(key='')
type(content='') #If you want to submit your input, use "\n" at the end of ` + "`content`" + `.
scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
wait() #Sleep for 5s and take a screenshot to check for any changes.
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
## Note
- Use Chinese in ` + "`Thought`" + ` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in ` + "`Thought`" + ` part.
## User Instruction
`
var doubao_1_5_ui_tars_action_mapping = map[string]option.ActionName{
"click": option.ACTION_TapXY,
"left_double": option.ACTION_DoubleTapXY,
"right_single": option.ACTION_SecondaryClick,
"drag": option.ACTION_Drag,
"hotkey": option.ACTION_KeyCode,
"type": option.ACTION_Input,
"scroll": option.ACTION_Scroll,
"wait": option.ACTION_Sleep,
"finished": option.ACTION_Finished,
}
// system prompt for UITARSContentParser
// https://github.com/bytedance/UI-TARS/blob/main/codes/ui_tars/prompt.py
const _ = `
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
## Output Format
` + "```" + `
Thought: ...
Action: ...
` + "```" + `
## Action Space
click(point='<point>x1 y1</point>')
long_press(point='<point>x1 y1</point>')
type(content='') #If you want to submit your input, use "\\n" at the end of ` + "`content`" + `.
scroll(point='<point>x1 y1</point>', direction='down or up or right or left')
open_app(app_name=\'\')
drag(start_point='<point>x1 y1</point>', end_point='<point>x2 y2</point>')
press_home()
press_back()
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
## Note
- Use Chinese in ` + "`Thought`" + ` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in ` + "`Thought`" + ` part.
## User Instruction
`
// system prompt for JSONContentParser
// doubao-1.5-thinking-vision-pro on volcengine.com
const doubao_1_5_thinking_vision_pro_planning_prompt = `You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires.
Restriction:
- Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.
- Always give ONLY ONE action in ` + "`log`" + ` field (or null if no action should be done), instead of multiple actions. Supported actions are click, long_press, type, scroll, drag, press_home, press_back, wait, finished.
- Don't repeat actions in the previous logs.
- Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing [x1, y1, x2, y2] coordinates in 1000x1000 relative coordinates system.
Supporting actions:
- click: { action_type: "click", action_inputs: { start_box: [x1, y1, x2, y2] } }
- long_press: { action_type: "long_press", action_inputs: { start_box: [x1, y1, x2, y2] } }
- type: { action_type: "type", action_inputs: { content: string } } // If you want to submit your input, use "\\n" at the end of content.
- scroll: { action_type: "scroll", action_inputs: { start_box: [x1, y1, x2, y2], direction: "down" | "up" | "left" | "right" } }
- drag: { action_type: "drag", action_inputs: { start_box: [x1, y1, x2, y2], end_box: [x3, y3, x4, y4] } }
- press_home: { action_type: "press_home", action_inputs: {} }
- press_back: { action_type: "press_back", action_inputs: {} }
- wait: { action_type: "wait", action_inputs: {} } // Sleep for 5s and take a screenshot to check for any changes.
- finished: { action_type: "finished", action_inputs: { content: string } } // Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
Field description:
* The ` + "`start_box`" + ` and ` + "`end_box`" + ` fields represent the bounding box coordinates of the target element in 1000x1000 relative coordinate system.
* Use Chinese in log and thought fields.
Return in JSON format:
{
"actions": [
{
"action_type": "...",
"action_inputs": { ... }
}
],
"thought": "string", // Log what the next action you can do according to the screenshot and the instruction. Use Chinese.
"error": "string" | null, // Error messages about unexpected situations, if any. Use Chinese.
}
For example, when the instruction is "点击第二个帖子的作者头像", by viewing the screenshot, you should consider locating the second post's author avatar and output the JSON:
{
"actions": [
{
"action_type": "click",
"action_inputs": {
"start_box": [100, 200, 150, 250]
}
}
],
"thought": "点击第二个帖子的作者头像",
"error": null
}
## User Instruction
`
var doubao_1_5_thinking_vision_pro_action_mapping = map[string]option.ActionName{
"click": option.ACTION_TapXY,
"left_double": option.ACTION_DoubleTapXY,
"right_single": option.ACTION_SecondaryClick,
"drag": option.ACTION_Drag,
"hotkey": option.ACTION_KeyCode,
"type": option.ACTION_Input,
"scroll": option.ACTION_Scroll,
"wait": option.ACTION_Sleep,
"finished": option.ACTION_Finished,
}