diff --git a/internal/version/VERSION b/internal/version/VERSION index edf71488..3b935b64 100644 --- a/internal/version/VERSION +++ b/internal/version/VERSION @@ -1 +1 @@ -v5.0.0-beta-2506042316 +v5.0.0-beta-2506051328 diff --git a/uixt/ai/parser_default.go b/uixt/ai/parser_default.go index 998b074e..b950b4ce 100644 --- a/uixt/ai/parser_default.go +++ b/uixt/ai/parser_default.go @@ -26,8 +26,8 @@ func NewLLMContentParser(modelType option.LLMServiceType) LLMContentParser { } default: return &JSONContentParser{ - systemPrompt: defaultPlanningResponseJsonFormat, - actionMapping: map[string]option.ActionName{}, + systemPrompt: doubao_1_5_thinking_vision_pro_planning_prompt, + actionMapping: doubao_1_5_thinking_vision_pro_action_mapping, } } } @@ -80,8 +80,14 @@ func (p *JSONContentParser) Parse(content string, size types.Size) (*PlanningRes if err != nil { return nil, errors.Wrap(err, "failed to process action arguments") } - action.ActionInputs = processedArgs + // Convert processedArgs based on action type and coordinate parameters + finalArgs, err := convertProcessedArgs(processedArgs, action.ActionType) + if err != nil { + return nil, err + } + + action.ActionInputs = finalArgs normalizedActions = append(normalizedActions, action) } diff --git a/uixt/ai/parser_ui_tars.go b/uixt/ai/parser_ui_tars.go index a603f2f5..5d72040b 100644 --- a/uixt/ai/parser_ui_tars.go +++ b/uixt/ai/parser_ui_tars.go @@ -254,7 +254,8 @@ func processArgument(paramName string, paramValue interface{}, size types.Size) // Check if a parameter is a coordinate parameter func isCoordinateParameter(paramName string) bool { - return strings.Contains(paramName, "box") || strings.Contains(paramName, "point") + return strings.Contains(strings.ToLower(paramName), "box") || + strings.Contains(strings.ToLower(paramName), "point") } // convertProcessedArgs converts processed arguments based on action type and coordinate parameters diff --git a/uixt/ai/planner_prompts.go b/uixt/ai/planner_prompts.go index 8b361926..d939522d 100644 --- a/uixt/ai/planner_prompts.go +++ b/uixt/ai/planner_prompts.go @@ -75,7 +75,7 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par // system prompt for JSONContentParser // doubao-1.5-thinking-vision-pro on volcengine.com -const defaultPlanningResponseJsonFormat = `You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. +const doubao_1_5_thinking_vision_pro_planning_prompt = `You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires. @@ -86,18 +86,18 @@ Restriction: - Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing [x1, y1, x2, y2] coordinates in 1000x1000 relative coordinates system. Supporting actions: -- click: { action_type: "click", action_inputs: { startBox: [x1, y1, x2, y2] } } -- long_press: { action_type: "long_press", action_inputs: { startBox: [x1, y1, x2, y2] } } +- click: { action_type: "click", action_inputs: { start_box: [x1, y1, x2, y2] } } +- long_press: { action_type: "long_press", action_inputs: { start_box: [x1, y1, x2, y2] } } - type: { action_type: "type", action_inputs: { content: string } } // If you want to submit your input, use "\\n" at the end of content. -- scroll: { action_type: "scroll", action_inputs: { startBox: [x1, y1, x2, y2], direction: "down" | "up" | "left" | "right" } } -- drag: { action_type: "drag", action_inputs: { startBox: [x1, y1, x2, y2], endBox: [x3, y3, x4, y4] } } +- scroll: { action_type: "scroll", action_inputs: { start_box: [x1, y1, x2, y2], direction: "down" | "up" | "left" | "right" } } +- drag: { action_type: "drag", action_inputs: { start_box: [x1, y1, x2, y2], end_box: [x3, y3, x4, y4] } } - press_home: { action_type: "press_home", action_inputs: {} } - press_back: { action_type: "press_back", action_inputs: {} } - wait: { action_type: "wait", action_inputs: {} } // Sleep for 5s and take a screenshot to check for any changes. - finished: { action_type: "finished", action_inputs: { content: string } } // Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. Field description: -* The ` + "`startBox`" + ` and ` + "`endBox`" + ` fields represent the bounding box coordinates of the target element in 1000x1000 relative coordinate system. +* The ` + "`start_box`" + ` and ` + "`end_box`" + ` fields represent the bounding box coordinates of the target element in 1000x1000 relative coordinate system. * Use Chinese in log and summary fields. Return in JSON format: @@ -119,7 +119,7 @@ For example, when the instruction is "点击第二个帖子的作者头像", by { "action_type": "click", "action_inputs": { - "startBox": [100, 200, 150, 250] + "start_box": [100, 200, 150, 250] } } ], @@ -129,3 +129,15 @@ For example, when the instruction is "点击第二个帖子的作者头像", by ## User Instruction ` + +var doubao_1_5_thinking_vision_pro_action_mapping = map[string]option.ActionName{ + "click": option.ACTION_TapXY, + "left_double": option.ACTION_DoubleTapXY, + "right_single": option.ACTION_SecondaryClick, + "drag": option.ACTION_Drag, + "hotkey": option.ACTION_KeyCode, + "type": option.ACTION_Input, + "scroll": option.ACTION_Scroll, + "wait": option.ACTION_Sleep, + "finished": option.ACTION_Finished, +}