diff --git a/code/code.go b/code/code.go
index 7f79326c..54da9ae0 100644
--- a/code/code.go
+++ b/code/code.go
@@ -127,6 +127,7 @@ var (
LLMRequestServiceError = errors.New("request LLM service error") // 112
LLMParsePlanningResponseError = errors.New("parse LLM planning response error") // 113
LLMParseAssertionResponseError = errors.New("parse LLM assertion response error") // 114
+ LLMParseQueryResponseError = errors.New("parse LLM query response error") // 115
)
var errorsMap = map[error]int{
@@ -217,6 +218,7 @@ var errorsMap = map[error]int{
LLMRequestServiceError: 112,
LLMParsePlanningResponseError: 113,
LLMParseAssertionResponseError: 114,
+ LLMParseQueryResponseError: 115,
// trackings related
TrackingGetError: 90,
diff --git a/internal/version/VERSION b/internal/version/VERSION
index 74353128..1d1e0f38 100644
--- a/internal/version/VERSION
+++ b/internal/version/VERSION
@@ -1 +1 @@
-v5.0.0-beta-2506101609
+v5.0.0-beta-2506101640
diff --git a/uixt/ai/parser_test.go b/uixt/ai/parser_test.go
index 405d4aec..e290a995 100644
--- a/uixt/ai/parser_test.go
+++ b/uixt/ai/parser_test.go
@@ -1276,3 +1276,83 @@ func TestNormalizeActionCoordinates_StringArray(t *testing.T) {
})
}
}
+
+// TestUITARSContentParser_ParseScrollAction checks that UI-TARS scroll(...)
+// output is parsed into one uixt__swipe tool call with direction and x/y set.
+func TestUITARSContentParser_ParseScrollAction(t *testing.T) {
+	parser := &UITARSContentParser{
+		modelType:     option.DOUBAO_1_5_UI_TARS_250328,
+		systemPrompt:  doubao_1_5_ui_tars_planning_prompt,
+		actionMapping: doubao_1_5_ui_tars_action_mapping,
+	}
+	size := types.Size{Width: 1080, Height: 1920}
+
+	tests := []struct {
+		name              string
+		content           string
+		expectedDirection string
+	}{
+		{
+			name: "scroll left with bbox format",
+			content: `Thought: 我需要向左滑动
+Action: scroll(direction='left', start_box='850 500 850 500')`,
+			expectedDirection: "left",
+		},
+		{
+			name: "scroll up with array format",
+			content: `Thought: 我需要向上滑动
+Action: scroll(direction='up', start_box='[400, 600]')`,
+			expectedDirection: "up",
+		},
+		{
+			name: "scroll down with array format",
+			content: `Thought: 我需要向下滑动
+Action: scroll(direction='down', start_box='[500, 800]')`,
+			expectedDirection: "down",
+		},
+		{
+			name: "real log example - scroll left",
+			content: `Thought: 我仔细观察了当前的游戏局面,发现两个2分别位于右下角和右中位置。之前尝试了几次滑动都没有成功,现在我需要重新思考策略。既然向上滑动没有效果,那我决定换个方向,尝试向左滑动看看。这样应该能让这两个2相遇并合并,为后续的游戏进展打下基础。
+Action: scroll(direction='left', start_box='850 500 850 500')`,
+			expectedDirection: "left",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result, err := parser.Parse(tt.content, size)
+			// Stop on failure: the lines below dereference result and index ToolCalls.
+			if !assert.NoError(t, err) || !assert.NotNil(t, result) || !assert.Len(t, result.ToolCalls, 1) {
+				return
+			}
+			toolCall := result.ToolCalls[0]
+			// Verify tool call structure
+			assert.Equal(t, "uixt__swipe", toolCall.Function.Name)
+			assert.Equal(t, "function", toolCall.Type)
+			assert.NotEmpty(t, toolCall.ID)
+
+			// Parse and verify arguments
+			var args map[string]interface{}
+			err = json.Unmarshal([]byte(toolCall.Function.Arguments), &args)
+			if !assert.NoError(t, err) {
+				return
+			}
+
+			// Verify direction parameter is present and correct
+			assert.Contains(t, args, "direction")
+			assert.Equal(t, tt.expectedDirection, args["direction"])
+
+			// Checked assertions: a missing or non-numeric coordinate fails instead of panicking.
+			x, okX := args["x"].(float64)
+			y, okY := args["y"].(float64)
+			if !assert.True(t, okX && okY, "x and y must be present as JSON numbers") {
+				return
+			}
+			// Verify coordinates are within screen bounds
+			assert.Greater(t, x, 0.0)
+			assert.Less(t, x, float64(size.Width))
+			assert.Greater(t, y, 0.0)
+			assert.Less(t, y, float64(size.Height))
+		})
+	}
+}
diff --git a/uixt/ai/parser_ui_tars.go b/uixt/ai/parser_ui_tars.go
index c37aef3f..22dff047 100644
--- a/uixt/ai/parser_ui_tars.go
+++ b/uixt/ai/parser_ui_tars.go
@@ -290,6 +290,26 @@ func convertProcessedArgs(processedArgs map[string]interface{}, actionType strin
return options.ToMap(), nil
}
+	// Scroll needs its start point and direction together, so build ActionOptions here.
+	if actionType == "scroll" && hasStartBox {
+		startCoords, ok := startBox.([]float64)
+		if !ok || len(startCoords) < 2 { // a short slice would panic on the indexing below
+			return nil, fmt.Errorf("invalid coordinate format for scroll operation")
+		}
+		options := option.ActionOptions{
+			X: builtin.RoundToOneDecimal(startCoords[0]),
+			Y: builtin.RoundToOneDecimal(startCoords[1]),
+		}
+		if direction, hasDirection := processedArgs["direction"]; hasDirection {
+			dir, isString := direction.(string)
+			if !isString { // fail cleanly rather than panic on an unchecked assertion
+				return nil, fmt.Errorf("invalid direction type for scroll operation: %T", direction)
+			}
+			options.Direction = dir
+		}
+		return options.ToMap(), nil
+	}
+
// For single coordinate operations, return the coordinate array directly
if hasStartBox {
startCoords, ok := startBox.([]float64)
diff --git a/uixt/ai/planner_prompts.go b/uixt/ai/planner_prompts.go
index 754e5398..57c1704f 100644
--- a/uixt/ai/planner_prompts.go
+++ b/uixt/ai/planner_prompts.go
@@ -39,7 +39,7 @@ var doubao_1_5_ui_tars_action_mapping = map[string]option.ActionName{
"drag": option.ACTION_Drag,
"hotkey": option.ACTION_KeyCode,
"type": option.ACTION_Input,
- "scroll": option.ACTION_Scroll,
+ "scroll": option.ACTION_Swipe, // swipe up/down/left/right
"wait": option.ACTION_Sleep,
"finished": option.ACTION_Finished,
}
@@ -136,7 +136,7 @@ var doubao_1_5_thinking_vision_pro_action_mapping = map[string]option.ActionName
"drag": option.ACTION_Drag,
"hotkey": option.ACTION_KeyCode,
"type": option.ACTION_Input,
- "scroll": option.ACTION_Scroll,
+ "scroll": option.ACTION_Swipe, // swipe up/down/left/right
"wait": option.ACTION_Sleep,
"finished": option.ACTION_Finished,
}