fix: update AI parser to use doubao-1.5-thinking-vision-pro configuration

2026-06-26 10:01:28 +08:00 · 2025-06-05 13:28:31 +08:00
parent c204542f1f
commit 0864f74021
4 changed files with 31 additions and 12 deletions
--- a/internal/version/VERSION
+++ b/internal/version/VERSION
@@ -1 +1 @@
-v5.0.0-beta-2506042316
+v5.0.0-beta-2506051328
--- a/uixt/ai/parser_default.go
+++ b/uixt/ai/parser_default.go
@@ -26,8 +26,8 @@ func NewLLMContentParser(modelType option.LLMServiceType) LLMContentParser {
 		}
 	default:
 		return &JSONContentParser{
-			systemPrompt:  defaultPlanningResponseJsonFormat,
-			actionMapping: map[string]option.ActionName{},
+			systemPrompt:  doubao_1_5_thinking_vision_pro_planning_prompt,
+			actionMapping: doubao_1_5_thinking_vision_pro_action_mapping,
 		}
 	}
 }
@@ -80,8 +80,14 @@ func (p *JSONContentParser) Parse(content string, size types.Size) (*PlanningRes
 		if err != nil {
 			return nil, errors.Wrap(err, "failed to process action arguments")
 		}
-		action.ActionInputs = processedArgs

+		// Convert processedArgs based on action type and coordinate parameters
+		finalArgs, err := convertProcessedArgs(processedArgs, action.ActionType)
+		if err != nil {
+			return nil, err
+		}
+
+		action.ActionInputs = finalArgs
 		normalizedActions = append(normalizedActions, action)
 	}

--- a/uixt/ai/parser_ui_tars.go
+++ b/uixt/ai/parser_ui_tars.go
@@ -254,7 +254,8 @@ func processArgument(paramName string, paramValue interface{}, size types.Size)

 // Check if a parameter is a coordinate parameter
 func isCoordinateParameter(paramName string) bool {
-	return strings.Contains(paramName, "box") || strings.Contains(paramName, "point")
+	return strings.Contains(strings.ToLower(paramName), "box") ||
+		strings.Contains(strings.ToLower(paramName), "point")
 }

 // convertProcessedArgs converts processed arguments based on action type and coordinate parameters
--- a/uixt/ai/planner_prompts.go
+++ b/uixt/ai/planner_prompts.go
@@ -75,7 +75,7 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par

 // system prompt for JSONContentParser
 // doubao-1.5-thinking-vision-pro on volcengine.com
-const defaultPlanningResponseJsonFormat = `You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
+const doubao_1_5_thinking_vision_pro_planning_prompt = `You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.

 Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires.

@@ -86,18 +86,18 @@ Restriction:
 - Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing [x1, y1, x2, y2] coordinates in 1000x1000 relative coordinates system.

 Supporting actions:
- click: { action_type: "click", action_inputs: { startBox: [x1, y1, x2, y2] } }
- long_press: { action_type: "long_press", action_inputs: { startBox: [x1, y1, x2, y2] } }
+- click: { action_type: "click", action_inputs: { start_box: [x1, y1, x2, y2] } }
+- long_press: { action_type: "long_press", action_inputs: { start_box: [x1, y1, x2, y2] } }
 - type: { action_type: "type", action_inputs: { content: string } } // If you want to submit your input, use "\\n" at the end of content.
- scroll: { action_type: "scroll", action_inputs: { startBox: [x1, y1, x2, y2], direction: "down" | "up" | "left" | "right" } }
- drag: { action_type: "drag", action_inputs: { startBox: [x1, y1, x2, y2], endBox: [x3, y3, x4, y4] } }
+- scroll: { action_type: "scroll", action_inputs: { start_box: [x1, y1, x2, y2], direction: "down" | "up" | "left" | "right" } }
+- drag: { action_type: "drag", action_inputs: { start_box: [x1, y1, x2, y2], end_box: [x3, y3, x4, y4] } }
 - press_home: { action_type: "press_home", action_inputs: {} }
 - press_back: { action_type: "press_back", action_inputs: {} }
 - wait: { action_type: "wait", action_inputs: {} } // Sleep for 5s and take a screenshot to check for any changes.
 - finished: { action_type: "finished", action_inputs: { content: string } } // Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.

 Field description:
-* The ` + "`startBox`" + ` and ` + "`endBox`" + ` fields represent the bounding box coordinates of the target element in 1000x1000 relative coordinate system.
+* The ` + "`start_box`" + ` and ` + "`end_box`" + ` fields represent the bounding box coordinates of the target element in 1000x1000 relative coordinate system.
 * Use Chinese in log and summary fields.

 Return in JSON format:
@@ -119,7 +119,7 @@ For example, when the instruction is "点击第二个帖子的作者头像", by
    {
      "action_type": "click",
      "action_inputs": {
-        "startBox": [100, 200, 150, 250]
+        "start_box": [100, 200, 150, 250]
      }
    }
  ],
@@ -129,3 +129,15 @@ For example, when the instruction is "点击第二个帖子的作者头像", by

 ## User Instruction
 `
+
+var doubao_1_5_thinking_vision_pro_action_mapping = map[string]option.ActionName{
+	"click":        option.ACTION_TapXY,
+	"left_double":  option.ACTION_DoubleTapXY,
+	"right_single": option.ACTION_SecondaryClick,
+	"drag":         option.ACTION_Drag,
+	"hotkey":       option.ACTION_KeyCode,
+	"type":         option.ACTION_Input,
+	"scroll":       option.ACTION_Scroll,
+	"wait":         option.ACTION_Sleep,
+	"finished":     option.ACTION_Finished,
+}