diff --git a/internal/version/VERSION b/internal/version/VERSION index 0aa278b1..d0e40a61 100644 --- a/internal/version/VERSION +++ b/internal/version/VERSION @@ -1 +1 @@ -v5.0.0-beta-2506062218 +v5.0.0-beta-2506071503 diff --git a/uixt/ai/parser_test.go b/uixt/ai/parser_test.go index 12925d4a..ad557355 100644 --- a/uixt/ai/parser_test.go +++ b/uixt/ai/parser_test.go @@ -15,21 +15,23 @@ func TestParseActionToStructureOutput(t *testing.T) { assert.Nil(t, err) function := result.ToolCalls[0].Function assert.Equal(t, function.Name, "uixt__click") - // ActionInputs is now directly a coordinate array - var coords []float64 - err = json.Unmarshal([]byte(function.Arguments), &coords) + + var arguments map[string]interface{} + err = json.Unmarshal([]byte(function.Arguments), &arguments) assert.Nil(t, err) - assert.Equal(t, 2, len(coords)) + assert.Contains(t, arguments, "x") + assert.Contains(t, arguments, "y") text = "Thought: 我看到页面上有几个帖子,第二个帖子的标题是\"字节四年,头发白了\"。要完成任务,我需要点击这个帖子下方的作者头像,这样就能进入作者的个人主页了。\nAction: click(start_point='550 450 550 450')" result, err = parser.Parse(text, types.Size{Height: 2341, Width: 1024}) assert.Nil(t, err) function = result.ToolCalls[0].Function assert.Equal(t, function.Name, "uixt__click") - // ActionInputs is now directly a coordinate array - err = json.Unmarshal([]byte(function.Arguments), &coords) + + err = json.Unmarshal([]byte(function.Arguments), &arguments) assert.Nil(t, err) - assert.Equal(t, 2, len(coords)) + assert.Contains(t, arguments, "x") + assert.Contains(t, arguments, "y") // Test new bracket format - should convert bounding box to center point text = "Thought: 我需要点击这个按钮\nAction: click(start_box='[100, 200, 150, 250]')" @@ -37,29 +39,27 @@ func TestParseActionToStructureOutput(t *testing.T) { assert.Nil(t, err) function = result.ToolCalls[0].Function assert.Equal(t, function.Name, "uixt__click") - // ActionInputs is now directly a coordinate array - err = json.Unmarshal([]byte(function.Arguments), &coords) - assert.Nil(t, err) - // Should be converted to center point [125, 225] from bounding box [100, 200, 150, 250] - assert.Equal(t, 2, len(coords)) - assert.Equal(t, 125.0, coords[0]) // (100 + 150) / 2 = 125 - assert.Equal(t, 225.0, coords[1]) // (200 + 250) / 2 = 225 - // Test drag operation with both start_box and end_box - should merge center points into single array + err = json.Unmarshal([]byte(function.Arguments), &arguments) + assert.Nil(t, err) + // Should be converted to center point x=125, y=225 from bounding box [100, 200, 150, 250] + assert.Equal(t, 125.0, arguments["x"]) // (100 + 150) / 2 = 125 + assert.Equal(t, 225.0, arguments["y"]) // (200 + 250) / 2 = 225 + + // Test drag operation with both start_box and end_box - should use from_x,from_y,to_x,to_y format text = "Thought: 我需要拖拽元素\nAction: drag(start_box='[100, 200, 150, 250]', end_box='[300, 400, 350, 450]')" result, err = parser.Parse(text, types.Size{Height: 1000, Width: 1000}) assert.Nil(t, err) function = result.ToolCalls[0].Function assert.Equal(t, function.Name, "uixt__drag") - // ActionInputs is now directly a coordinate array - err = json.Unmarshal([]byte(function.Arguments), &coords) + // ActionInputs is now in from_x,from_y,to_x,to_y format for drag operations + err = json.Unmarshal([]byte(function.Arguments), &arguments) assert.Nil(t, err) - // Should be merged into single array [start_center_x, start_center_y, end_center_x, end_center_y] - assert.Equal(t, 4, len(coords)) - assert.Equal(t, 125.0, coords[0]) // start center x: (100 + 150) / 2 = 125 - assert.Equal(t, 225.0, coords[1]) // start center y: (200 + 250) / 2 = 225 - assert.Equal(t, 325.0, coords[2]) // end center x: (300 + 350) / 2 = 325 - assert.Equal(t, 425.0, coords[3]) // end center y: (400 + 450) / 2 = 425 + // Should be converted to from_x,from_y,to_x,to_y format + assert.Equal(t, 125.0, arguments["from_x"]) // start center x: (100 + 150) / 2 = 125 + assert.Equal(t, 225.0, arguments["from_y"]) // start center y: (200 + 250) / 2 = 225 + assert.Equal(t, 325.0, arguments["to_x"]) // end center x: (300 + 350) / 2 = 325 + assert.Equal(t, 425.0, arguments["to_y"]) // end center y: (400 + 450) / 2 = 425 } // Test normalizeCoordinatesFormat function @@ -799,33 +799,30 @@ func TestNewCoordinateConversion(t *testing.T) { function := result.ToolCalls[0].Function assert.Equal(t, function.Name, "uixt__click") - // ActionInputs is now directly a coordinate array - var coords []float64 - err = json.Unmarshal([]byte(function.Arguments), &coords) + var arguments map[string]interface{} + err = json.Unmarshal([]byte(function.Arguments), &arguments) assert.Nil(t, err) - // Should convert bounding box [100,200,150,250] to center point [125.0, 225.0] - assert.Equal(t, 2, len(coords)) - assert.Equal(t, 125.0, coords[0]) // (100 + 150) / 2 = 125 - assert.Equal(t, 225.0, coords[1]) // (200 + 250) / 2 = 225 + // Should convert bounding box [100,200,150,250] to center point x=125.0, y=225.0 + assert.Equal(t, 125.0, arguments["x"]) // (100 + 150) / 2 = 125 + assert.Equal(t, 225.0, arguments["y"]) // (200 + 250) / 2 = 225 - // Test drag operation conversion to merged array + // Test drag operation conversion to from_x,from_y,to_x,to_y format text = "Thought: 我需要拖拽元素\nAction: drag(start_box='100,200,150,250', end_box='300,400,350,450')" result, err = parser.Parse(text, types.Size{Height: 1000, Width: 1000}) assert.Nil(t, err) function = result.ToolCalls[0].Function assert.Equal(t, function.Name, "uixt__drag") - // ActionInputs is now directly a coordinate array - err = json.Unmarshal([]byte(function.Arguments), &coords) + // ActionInputs is now in from_x,from_y,to_x,to_y format for drag operations + err = json.Unmarshal([]byte(function.Arguments), &arguments) assert.Nil(t, err) - // Should merge start_box and end_box center points into single array [125.0, 225.0, 325.0, 425.0] - assert.Equal(t, 4, len(coords)) - assert.Equal(t, 125.0, coords[0]) // start center x: (100 + 150) / 2 = 125 - assert.Equal(t, 225.0, coords[1]) // start center y: (200 + 250) / 2 = 225 - assert.Equal(t, 325.0, coords[2]) // end center x: (300 + 350) / 2 = 325 - assert.Equal(t, 425.0, coords[3]) // end center y: (400 + 450) / 2 = 425 + // Should convert to from_x,from_y,to_x,to_y format + assert.Equal(t, 125.0, arguments["from_x"]) // start center x: (100 + 150) / 2 = 125 + assert.Equal(t, 225.0, arguments["from_y"]) // start center y: (200 + 250) / 2 = 225 + assert.Equal(t, 325.0, arguments["to_x"]) // end center x: (300 + 350) / 2 = 325 + assert.Equal(t, 425.0, arguments["to_y"]) // end center y: (400 + 450) / 2 = 425 // Test non-coordinate operation (type action) text = "Thought: 我需要输入文本\nAction: type(content='Hello World')" @@ -834,9 +831,262 @@ func TestNewCoordinateConversion(t *testing.T) { function = result.ToolCalls[0].Function assert.Equal(t, function.Name, "uixt__type") - // ActionInputs should be a map for non-coordinate operations - var arguments map[string]interface{} + // ActionInputs should be a map for non-coordinate operations with parameter mapping err = json.Unmarshal([]byte(function.Arguments), &arguments) assert.Nil(t, err) - assert.Equal(t, "Hello World", arguments["content"]) + assert.Equal(t, "Hello World", arguments["text"]) // content should be mapped to text +} + +// Test convertProcessedArgs function +func TestConvertProcessedArgs(t *testing.T) { + tests := []struct { + name string + processedArgs map[string]interface{} + actionType string + expected map[string]interface{} + expectError bool + description string + }{ + // Single coordinate operation tests + { + name: "single_coordinate_operation", + processedArgs: map[string]interface{}{ + "start_box": []float64{125.0, 225.0}, + }, + actionType: "click", + expected: map[string]interface{}{ + "x": 125.0, + "y": 225.0, + }, + description: "Single coordinate operation should convert to x,y format", + }, + { + name: "single_coordinate_with_rounding", + processedArgs: map[string]interface{}{ + "start_box": []float64{125.123456, 225.987654}, + }, + actionType: "click", + expected: map[string]interface{}{ + "x": 125.1, + "y": 226.0, + }, + description: "Coordinates should be rounded to one decimal place", + }, + // Drag operation tests + { + name: "drag_operation_dual_coordinates", + processedArgs: map[string]interface{}{ + "start_box": []float64{125.0, 225.0}, + "end_box": []float64{325.0, 425.0}, + }, + actionType: "drag", + expected: map[string]interface{}{ + "from_x": 125.0, + "from_y": 225.0, + "to_x": 325.0, + "to_y": 425.0, + }, + description: "Drag operation should convert to from_x,from_y,to_x,to_y format", + }, + { + name: "drag_operation_with_rounding", + processedArgs: map[string]interface{}{ + "start_box": []float64{125.123456, 225.987654}, + "end_box": []float64{325.555555, 425.444444}, + }, + actionType: "drag", + expected: map[string]interface{}{ + "from_x": 125.1, + "from_y": 226.0, + "to_x": 325.6, + "to_y": 425.4, + }, + description: "Drag coordinates should be rounded to one decimal place", + }, + // Non-coordinate operation tests + { + name: "non_coordinate_operation_with_parameter_mapping", + processedArgs: map[string]interface{}{ + "content": "Hello World", + "direction": "down", + }, + actionType: "type", + expected: map[string]interface{}{ + "text": "Hello World", // content should be mapped to text + "direction": "down", + }, + description: "Non-coordinate operation should apply parameter name mapping", + }, + { + name: "non_coordinate_operation_key_mapping", + processedArgs: map[string]interface{}{ + "key": "enter", + }, + actionType: "hotkey", + expected: map[string]interface{}{ + "keycode": "enter", // key should be mapped to keycode + }, + description: "Key parameter should be mapped to keycode", + }, + { + name: "non_coordinate_operation_mixed_parameters", + processedArgs: map[string]interface{}{ + "content": "Test input", + "key": "ctrl+c", + "direction": "up", + "timeout": 5, + }, + actionType: "mixed", + expected: map[string]interface{}{ + "text": "Test input", // content -> text + "keycode": "ctrl+c", // key -> keycode + "direction": "up", // unchanged + "timeout": 5, // unchanged + }, + description: "Mixed parameters should apply correct mappings", + }, + { + name: "empty_arguments", + processedArgs: map[string]interface{}{}, + actionType: "empty", + expected: map[string]interface{}{}, + description: "Empty arguments should return empty map", + }, + // Error cases + { + name: "invalid_single_coordinate_format", + processedArgs: map[string]interface{}{ + "start_box": "invalid", + }, + actionType: "click", + expectError: true, + description: "Invalid coordinate format should cause error", + }, + { + name: "invalid_drag_start_coordinate", + processedArgs: map[string]interface{}{ + "start_box": "invalid", + "end_box": []float64{325.0, 425.0}, + }, + actionType: "drag", + expectError: true, + description: "Invalid start coordinate in drag should cause error", + }, + { + name: "invalid_drag_end_coordinate", + processedArgs: map[string]interface{}{ + "start_box": []float64{125.0, 225.0}, + "end_box": "invalid", + }, + actionType: "drag", + expectError: true, + description: "Invalid end coordinate in drag should cause error", + }, + { + name: "drag_insufficient_start_coordinates", + processedArgs: map[string]interface{}{ + "start_box": []float64{125.0}, // Only one coordinate + "end_box": []float64{325.0, 425.0}, + }, + actionType: "drag", + expectError: true, + description: "Insufficient start coordinates in drag should cause error", + }, + { + name: "drag_insufficient_end_coordinates", + processedArgs: map[string]interface{}{ + "start_box": []float64{125.0, 225.0}, + "end_box": []float64{325.0}, // Only one coordinate + }, + actionType: "drag", + expectError: true, + description: "Insufficient end coordinates in drag should cause error", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := convertProcessedArgs(tt.processedArgs, tt.actionType) + + if tt.expectError { + assert.Error(t, err, "Test case: %s", tt.description) + return + } + + assert.NoError(t, err, "Test case: %s", tt.description) + assert.Equal(t, len(tt.expected), len(result), "Test case: %s", tt.description) + + for key, expectedValue := range tt.expected { + actualValue, exists := result[key] + assert.True(t, exists, "Key %s should exist in result for test: %s", key, tt.description) + assert.Equal(t, expectedValue, actualValue, "Value for key %s should match for test: %s", key, tt.description) + } + }) + } +} + +// Test mapParameterName function +func TestMapParameterName(t *testing.T) { + tests := []struct { + name string + paramName string + expected string + description string + }{ + { + name: "content_to_text", + paramName: "content", + expected: "text", + description: "content parameter should be mapped to text", + }, + { + name: "key_to_keycode", + paramName: "key", + expected: "keycode", + description: "key parameter should be mapped to keycode", + }, + { + name: "unchanged_parameter_direction", + paramName: "direction", + expected: "direction", + description: "direction parameter should remain unchanged", + }, + { + name: "unchanged_parameter_start_box", + paramName: "start_box", + expected: "start_box", + description: "start_box parameter should remain unchanged", + }, + { + name: "unchanged_parameter_end_box", + paramName: "end_box", + expected: "end_box", + description: "end_box parameter should remain unchanged", + }, + { + name: "unchanged_parameter_timeout", + paramName: "timeout", + expected: "timeout", + description: "timeout parameter should remain unchanged", + }, + { + name: "unchanged_parameter_custom", + paramName: "custom_param", + expected: "custom_param", + description: "custom parameter should remain unchanged", + }, + { + name: "empty_parameter_name", + paramName: "", + expected: "", + description: "empty parameter name should remain empty", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := mapParameterName(tt.paramName) + assert.Equal(t, tt.expected, result, "Test case: %s", tt.description) + }) + } } diff --git a/uixt/ai/parser_ui_tars.go b/uixt/ai/parser_ui_tars.go index 9e5a3ed2..8eb154da 100644 --- a/uixt/ai/parser_ui_tars.go +++ b/uixt/ai/parser_ui_tars.go @@ -301,15 +301,28 @@ func convertProcessedArgs(processedArgs map[string]interface{}, actionType strin return options.ToMap(), nil } - // For non-coordinate operations, return the original arguments map - // TODO + // For non-coordinate operations, apply parameter name mapping and return the arguments map finalArgs := make(map[string]interface{}) for key, value := range processedArgs { - finalArgs[key] = value + // Map parameter names to match ActionOptions field names + mappedKey := mapParameterName(key) + finalArgs[mappedKey] = value } return finalArgs, nil } +// mapParameterName maps UI-TARS parameter names to ActionOptions field names +func mapParameterName(paramName string) string { + switch paramName { + case "content": + return "text" // Map content to text for input operations + case "key": + return "keycode" // Map key to keycode for hotkey operations + default: + return paramName + } +} + // normalizeActionCoordinates normalizes coordinates from various formats to actual pixel coordinates func normalizeActionCoordinates(coordData interface{}, size types.Size) ([]float64, error) { switch v := coordData.(type) { diff --git a/uixt/driver_ext_ai_test.go b/uixt/driver_ext_ai_test.go index 4c0ff4f3..0f374a6a 100644 --- a/uixt/driver_ext_ai_test.go +++ b/uixt/driver_ext_ai_test.go @@ -1,3 +1,5 @@ +//go:build localtest + package uixt import ( @@ -11,6 +13,50 @@ import ( "github.com/stretchr/testify/assert" ) +func TestDriverExt_TapByLLM(t *testing.T) { + driver := setupDriverExt(t) + err := driver.AIAction(context.Background(), "点击第一个帖子的作者头像") + assert.Nil(t, err) + + err = driver.AIAssert("当前在个人介绍页") + assert.Nil(t, err) +} + +func TestDriverExt_StartToGoal(t *testing.T) { + driver := setupDriverExt(t) + + userInstruction := `连连看是一款经典的益智消除类小游戏,通常以图案或图标为主要元素。以下是连连看的基本规则说明: + 1. 游戏目标: 玩家需要在规定时间内,通过连接相同的图案或图标,将它们从游戏界面中消除。 + 2. 连接规则: + - 两个相同的图案可以通过不超过三条直线连接。 + - 连接线可以水平或垂直,但不能斜线,也不能跨过其他图案。 + - 连接线的转折次数不能超过两次。 + 3. 游戏界面: + - 游戏界面通常是一个矩形区域,内含多个图案或图标,排列成行和列。 + - 图案或图标在未选中状态下背景为白色,选中状态下背景为绿色。 + 4. 时间限制: 游戏通常设有时间限制,玩家需要在时间耗尽前完成所有图案的消除。 + 5. 得分机制: 每成功连接并消除一对图案,玩家会获得相应的分数。完成游戏后,根据剩余时间和消除效率计算总分。 + 6. 关卡设计: 游戏可能包含多个关卡,随着关卡的推进,图案的复杂度和数量会增加。 + + 注意事项: + 1、当连接错误时,顶部的红心会减少一个,需及时调整策略,避免红心变为0个后游戏失败 + 2、不要连续 2 次点击同一个图案 + 3、不要犯重复的错误 + ` + + userInstruction += "\n\n请严格按照以上游戏规则,开始游戏;注意,请只做点击操作" + + err := driver.StartToGoal(context.Background(), userInstruction) + assert.Nil(t, err) +} + +func TestDriverExt_PlanNextAction(t *testing.T) { + driver := setupDriverExt(t) + result, err := driver.PlanNextAction(context.Background(), "启动抖音") + assert.Nil(t, err) + t.Log(result) +} + func TestXTDriver_isTaskFinished(t *testing.T) { driver := &XTDriver{} diff --git a/uixt/driver_ext_test.go b/uixt/driver_ext_test.go index 6302bb53..112139dc 100644 --- a/uixt/driver_ext_test.go +++ b/uixt/driver_ext_test.go @@ -4,7 +4,6 @@ package uixt import ( "bytes" - "context" "image" "os" "testing" @@ -129,50 +128,6 @@ func TestDriverExt_TapByOCR(t *testing.T) { assert.Nil(t, err) } -func TestDriverExt_TapByLLM(t *testing.T) { - driver := setupDriverExt(t) - err := driver.AIAction(context.Background(), "点击第一个帖子的作者头像") - assert.Nil(t, err) - - err = driver.AIAssert("当前在个人介绍页") - assert.Nil(t, err) -} - -func TestDriverExt_StartToGoal(t *testing.T) { - driver := setupDriverExt(t) - - userInstruction := `连连看是一款经典的益智消除类小游戏,通常以图案或图标为主要元素。以下是连连看的基本规则说明: - 1. 游戏目标: 玩家需要在规定时间内,通过连接相同的图案或图标,将它们从游戏界面中消除。 - 2. 连接规则: - - 两个相同的图案可以通过不超过三条直线连接。 - - 连接线可以水平或垂直,但不能斜线,也不能跨过其他图案。 - - 连接线的转折次数不能超过两次。 - 3. 游戏界面: - - 游戏界面通常是一个矩形区域,内含多个图案或图标,排列成行和列。 - - 图案或图标在未选中状态下背景为白色,选中状态下背景为绿色。 - 4. 时间限制: 游戏通常设有时间限制,玩家需要在时间耗尽前完成所有图案的消除。 - 5. 得分机制: 每成功连接并消除一对图案,玩家会获得相应的分数。完成游戏后,根据剩余时间和消除效率计算总分。 - 6. 关卡设计: 游戏可能包含多个关卡,随着关卡的推进,图案的复杂度和数量会增加。 - - 注意事项: - 1、当连接错误时,顶部的红心会减少一个,需及时调整策略,避免红心变为0个后游戏失败 - 2、不要连续 2 次点击同一个图案 - 3、不要犯重复的错误 - ` - - userInstruction += "\n\n请严格按照以上游戏规则,开始游戏;注意,请只做点击操作" - - err := driver.StartToGoal(context.Background(), userInstruction) - assert.Nil(t, err) -} - -func TestDriverExt_PlanNextAction(t *testing.T) { - driver := setupDriverExt(t) - result, err := driver.PlanNextAction(context.Background(), "启动抖音") - assert.Nil(t, err) - t.Log(result) -} - func TestDriverExt_prepareSwipeAction(t *testing.T) { driver := setupDriverExt(t)