fix: update AI parser to use doubao-1.5-thinking-vision-pro configuration

This commit is contained in:
lilong.129
2025-06-05 13:28:31 +08:00
parent c204542f1f
commit 0864f74021
4 changed files with 31 additions and 12 deletions

View File

@@ -1 +1 @@
v5.0.0-beta-2506042316
v5.0.0-beta-2506051328

View File

@@ -26,8 +26,8 @@ func NewLLMContentParser(modelType option.LLMServiceType) LLMContentParser {
}
default:
return &JSONContentParser{
systemPrompt: defaultPlanningResponseJsonFormat,
actionMapping: map[string]option.ActionName{},
systemPrompt: doubao_1_5_thinking_vision_pro_planning_prompt,
actionMapping: doubao_1_5_thinking_vision_pro_action_mapping,
}
}
}
@@ -80,8 +80,14 @@ func (p *JSONContentParser) Parse(content string, size types.Size) (*PlanningRes
if err != nil {
return nil, errors.Wrap(err, "failed to process action arguments")
}
action.ActionInputs = processedArgs
// Convert processedArgs based on action type and coordinate parameters
finalArgs, err := convertProcessedArgs(processedArgs, action.ActionType)
if err != nil {
return nil, err
}
action.ActionInputs = finalArgs
normalizedActions = append(normalizedActions, action)
}

View File

@@ -254,7 +254,8 @@ func processArgument(paramName string, paramValue interface{}, size types.Size)
// isCoordinateParameter reports whether paramName names a coordinate
// argument, i.e. whether it contains "box" or "point" in any letter case.
//
// The comparison is case-insensitive so that snake_case names (start_box,
// end_box) and camelCase names (startBox, endBox) are treated the same way.
func isCoordinateParameter(paramName string) bool {
	// Lowercase once and reuse the result for both substring checks.
	name := strings.ToLower(paramName)
	return strings.Contains(name, "box") || strings.Contains(name, "point")
}
// convertProcessedArgs converts processed arguments based on action type and coordinate parameters

View File

@@ -75,7 +75,7 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par
// system prompt for JSONContentParser
// doubao-1.5-thinking-vision-pro on volcengine.com
const defaultPlanningResponseJsonFormat = `You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
const doubao_1_5_thinking_vision_pro_planning_prompt = `You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires.
@@ -86,18 +86,18 @@ Restriction:
- Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing [x1, y1, x2, y2] coordinates in 1000x1000 relative coordinates system.
Supporting actions:
- click: { action_type: "click", action_inputs: { startBox: [x1, y1, x2, y2] } }
- long_press: { action_type: "long_press", action_inputs: { startBox: [x1, y1, x2, y2] } }
- click: { action_type: "click", action_inputs: { start_box: [x1, y1, x2, y2] } }
- long_press: { action_type: "long_press", action_inputs: { start_box: [x1, y1, x2, y2] } }
- type: { action_type: "type", action_inputs: { content: string } } // If you want to submit your input, use "\\n" at the end of content.
- scroll: { action_type: "scroll", action_inputs: { startBox: [x1, y1, x2, y2], direction: "down" | "up" | "left" | "right" } }
- drag: { action_type: "drag", action_inputs: { startBox: [x1, y1, x2, y2], endBox: [x3, y3, x4, y4] } }
- scroll: { action_type: "scroll", action_inputs: { start_box: [x1, y1, x2, y2], direction: "down" | "up" | "left" | "right" } }
- drag: { action_type: "drag", action_inputs: { start_box: [x1, y1, x2, y2], end_box: [x3, y3, x4, y4] } }
- press_home: { action_type: "press_home", action_inputs: {} }
- press_back: { action_type: "press_back", action_inputs: {} }
- wait: { action_type: "wait", action_inputs: {} } // Sleep for 5s and take a screenshot to check for any changes.
- finished: { action_type: "finished", action_inputs: { content: string } } // Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
Field description:
* The ` + "`startBox`" + ` and ` + "`endBox`" + ` fields represent the bounding box coordinates of the target element in 1000x1000 relative coordinate system.
* The ` + "`start_box`" + ` and ` + "`end_box`" + ` fields represent the bounding box coordinates of the target element in 1000x1000 relative coordinate system.
* Use Chinese in log and summary fields.
Return in JSON format:
@@ -119,7 +119,7 @@ For example, when the instruction is "点击第二个帖子的作者头像", by
{
"action_type": "click",
"action_inputs": {
"startBox": [100, 200, 150, 250]
"start_box": [100, 200, 150, 250]
}
}
],
@@ -129,3 +129,15 @@ For example, when the instruction is "点击第二个帖子的作者头像", by
## User Instruction
`
// doubao_1_5_thinking_vision_pro_action_mapping maps action_type strings to
// the option.ActionName values used for the doubao-1.5-thinking-vision-pro
// parser configuration.
//
// NOTE(review): the visible planning prompt also lists long_press, press_home
// and press_back, which have no entry here, while left_double, right_single
// and hotkey are mapped but do not appear in that list. Confirm how unmapped
// action types are handled by the parser.
var doubao_1_5_thinking_vision_pro_action_mapping = map[string]option.ActionName{
	// Pointer and gesture actions.
	"click":        option.ACTION_TapXY,
	"left_double":  option.ACTION_DoubleTapXY,
	"right_single": option.ACTION_SecondaryClick,
	"drag":         option.ACTION_Drag,
	"scroll":       option.ACTION_Scroll,

	// Keyboard and text input actions.
	"hotkey": option.ACTION_KeyCode,
	"type":   option.ACTION_Input,

	// Flow-control actions.
	"wait":     option.ACTION_Sleep,
	"finished": option.ACTION_Finished,
}