diff --git a/internal/version/VERSION b/internal/version/VERSION
index 9d8a0b32..edf71488 100644
--- a/internal/version/VERSION
+++ b/internal/version/VERSION
@@ -1 +1 @@
-v5.0.0-beta-2506031820
+v5.0.0-beta-2506042316
diff --git a/server/main.go b/server/main.go
index e881caf2..71258846 100644
--- a/server/main.go
+++ b/server/main.go
@@ -25,7 +25,7 @@ type Router struct {
}
func (r *Router) InitMCPHost(configPath string) error {
- mcpHost, err := mcphost.NewMCPHost(configPath, false)
+ mcpHost, err := mcphost.NewMCPHost(configPath, true)
if err != nil {
log.Error().Err(err).Msg("init MCP host failed")
return err
diff --git a/uixt/ai/parser_default.go b/uixt/ai/parser_default.go
index 979b811d..998b074e 100644
--- a/uixt/ai/parser_default.go
+++ b/uixt/ai/parser_default.go
@@ -21,18 +21,21 @@ func NewLLMContentParser(modelType option.LLMServiceType) LLMContentParser {
switch modelType {
case option.LLMServiceTypeUITARS:
return &UITARSContentParser{
- systemPrompt: doubao_1_5_ui_tars_planning_prompt,
+ systemPrompt: doubao_1_5_ui_tars_planning_prompt,
+ actionMapping: doubao_1_5_ui_tars_action_mapping,
}
default:
return &JSONContentParser{
- systemPrompt: defaultPlanningResponseJsonFormat,
+ systemPrompt: defaultPlanningResponseJsonFormat,
+ actionMapping: map[string]option.ActionName{},
}
}
}
// JSONContentParser parses the response as JSON string format
type JSONContentParser struct {
- systemPrompt string
+ systemPrompt string
+ actionMapping map[string]option.ActionName
}
func (p *JSONContentParser) SystemPrompt() string {
@@ -83,7 +86,7 @@ func (p *JSONContentParser) Parse(content string, size types.Size) (*PlanningRes
}
// Convert actions to tool calls using function from parser_ui_tars.go
- toolCalls := convertActionsToToolCalls(normalizedActions)
+ toolCalls := convertActionsToToolCalls(normalizedActions, p.actionMapping)
return &PlanningResult{
ToolCalls: toolCalls,
diff --git a/uixt/ai/parser_test.go b/uixt/ai/parser_test.go
index b35f5bc5..12925d4a 100644
--- a/uixt/ai/parser_test.go
+++ b/uixt/ai/parser_test.go
@@ -14,62 +14,52 @@ func TestParseActionToStructureOutput(t *testing.T) {
result, err := parser.Parse(text, types.Size{Height: 224, Width: 224})
assert.Nil(t, err)
function := result.ToolCalls[0].Function
- assert.Equal(t, function.Name, "click")
- assert.Contains(t, function.Arguments, "start_box")
+ assert.Equal(t, function.Name, "uixt__click")
+ // ActionInputs is now directly a coordinate array
+ var coords []float64
+ err = json.Unmarshal([]byte(function.Arguments), &coords)
+ assert.Nil(t, err)
+ assert.Equal(t, 2, len(coords))
text = "Thought: 我看到页面上有几个帖子,第二个帖子的标题是\"字节四年,头发白了\"。要完成任务,我需要点击这个帖子下方的作者头像,这样就能进入作者的个人主页了。\nAction: click(start_point='550 450 550 450')"
result, err = parser.Parse(text, types.Size{Height: 2341, Width: 1024})
assert.Nil(t, err)
function = result.ToolCalls[0].Function
- assert.Equal(t, function.Name, "click")
- assert.Contains(t, function.Arguments, "start_box")
+ assert.Equal(t, function.Name, "uixt__click")
+ // ActionInputs is now directly a coordinate array
+ err = json.Unmarshal([]byte(function.Arguments), &coords)
+ assert.Nil(t, err)
+ assert.Equal(t, 2, len(coords))
- // Test new bracket format
+ // Test new bracket format - should convert bounding box to center point
text = "Thought: 我需要点击这个按钮\nAction: click(start_box='[100, 200, 150, 250]')"
result, err = parser.Parse(text, types.Size{Height: 1000, Width: 1000})
assert.Nil(t, err)
function = result.ToolCalls[0].Function
- assert.Equal(t, function.Name, "click")
- assert.Contains(t, function.Arguments, "start_box")
- arguments := make(map[string]interface{})
- err = json.Unmarshal([]byte(function.Arguments), &arguments)
+ assert.Equal(t, function.Name, "uixt__click")
+ // ActionInputs is now directly a coordinate array
+ err = json.Unmarshal([]byte(function.Arguments), &coords)
assert.Nil(t, err)
- coordsInterface := arguments["start_box"].([]interface{})
- coords := make([]float64, len(coordsInterface))
- for i, v := range coordsInterface {
- coords[i] = v.(float64)
- }
- assert.Equal(t, 4, len(coords))
- assert.Equal(t, 100.0, coords[0])
- assert.Equal(t, 200.0, coords[1])
- assert.Equal(t, 150.0, coords[2])
- assert.Equal(t, 250.0, coords[3])
+ // Should be converted to center point [125, 225] from bounding box [100, 200, 150, 250]
+ assert.Equal(t, 2, len(coords))
+ assert.Equal(t, 125.0, coords[0]) // (100 + 150) / 2 = 125
+ assert.Equal(t, 225.0, coords[1]) // (200 + 250) / 2 = 225
- // Test drag operation with both start_box and end_box
+ // Test drag operation with both start_box and end_box - should merge center points into single array
text = "Thought: 我需要拖拽元素\nAction: drag(start_box='[100, 200, 150, 250]', end_box='[300, 400, 350, 450]')"
result, err = parser.Parse(text, types.Size{Height: 1000, Width: 1000})
assert.Nil(t, err)
function = result.ToolCalls[0].Function
- assert.Equal(t, function.Name, "drag")
- assert.Contains(t, function.Arguments, "start_box")
- assert.Contains(t, function.Arguments, "end_box")
- arguments = make(map[string]interface{})
- err = json.Unmarshal([]byte(function.Arguments), &arguments)
+ assert.Equal(t, function.Name, "uixt__drag")
+ // ActionInputs is now directly a coordinate array
+ err = json.Unmarshal([]byte(function.Arguments), &coords)
assert.Nil(t, err)
- startCoordsInterface := arguments["start_box"].([]interface{})
- endCoordsInterface := arguments["end_box"].([]interface{})
- startCoords := make([]float64, len(startCoordsInterface))
- endCoords := make([]float64, len(endCoordsInterface))
- for i, v := range startCoordsInterface {
- startCoords[i] = v.(float64)
- }
- for i, v := range endCoordsInterface {
- endCoords[i] = v.(float64)
- }
- assert.Equal(t, 4, len(startCoords))
- assert.Equal(t, 4, len(endCoords))
- assert.Equal(t, 100.0, startCoords[0])
- assert.Equal(t, 300.0, endCoords[0])
+ // Should be merged into single array [start_center_x, start_center_y, end_center_x, end_center_y]
+ assert.Equal(t, 4, len(coords))
+ assert.Equal(t, 125.0, coords[0]) // start center x: (100 + 150) / 2 = 125
+ assert.Equal(t, 225.0, coords[1]) // start center y: (200 + 250) / 2 = 225
+ assert.Equal(t, 325.0, coords[2]) // end center x: (300 + 350) / 2 = 325
+ assert.Equal(t, 425.0, coords[3]) // end center y: (400 + 450) / 2 = 425
}
// Test normalizeCoordinatesFormat function
@@ -79,159 +69,59 @@ func TestNormalizeCoordinatesFormat(t *testing.T) {
input string
expected string
}{
+ // Basic format conversions
{
- name: "point tag with 2 numbers",
+ name: "point_tag_2_numbers",
input: "100 200",
expected: "(100,200)",
},
{
- name: "point tag with 4 numbers",
+ name: "point_tag_4_numbers",
input: "100 200 150 250",
expected: "(100,200,150,250)",
},
{
- name: "bbox tag",
+ name: "bbox_tag",
input: "100 200 150 250",
expected: "(100,200,150,250)",
},
{
- name: "bracket format with spaces",
+ name: "bracket_format_4_coords",
input: "[100, 200, 150, 250]",
expected: "(100,200,150,250)",
},
+ // Edge cases
{
- name: "bracket format without spaces",
- input: "[100,200,150,250]",
- expected: "(100,200,150,250)",
- },
- {
- name: "bracket format with irregular spaces",
- input: "[100, 200, 150, 250]",
- expected: "(100,200,150,250)",
- },
- {
- name: "multiple point tags",
- input: "100 200 and 300 400",
- expected: "(100,200) and (300,400)",
- },
- {
- name: "mixed formats",
- input: "100 200 and [300, 400, 350, 450]",
- expected: "(100,200) and (300,400,350,450)",
- },
- {
- name: "documentation_example_coordinates",
- input: "235 512",
- expected: "(235,512)",
- },
- {
- name: "documentation_example_bbox",
- input: "235 512 451 553",
- expected: "(235,512,451,553)",
- },
- {
- name: "mobile_coordinates_point",
- input: "200 600",
- expected: "(200,600)",
- },
- {
- name: "tablet_coordinates_bbox",
- input: "750 400 800 450",
- expected: "(750,400,800,450)",
- },
- // Note: Bracket format with 2 coordinates is NOT supported by the function
- // Only 4-coordinate bracket format is supported
- {
- name: "bracket_format_two_coordinates_not_converted",
- input: "[100, 200]",
- expected: "[100, 200]", // Function doesn't convert this format
- },
- // Note: Decimal coordinates are NOT supported by the regex (only \d+ is matched)
- {
- name: "point_tag_with_decimals_not_converted",
- input: "100.5 200.7",
- expected: "100.5 200.7", // Function doesn't convert decimals
- },
- {
- name: "bbox_tag_with_decimals_not_converted",
- input: "100.5 200.7 150.3 250.9",
- expected: "100.5 200.7 150.3 250.9", // Function doesn't convert decimals
- },
- {
- name: "bracket_format_with_decimals_not_converted",
- input: "[100.5, 200.7, 150.3, 250.9]",
- expected: "[100.5, 200.7, 150.3, 250.9]", // Function doesn't convert decimals
- },
- {
- name: "multiple_bracket_formats",
- input: "[100, 200] and [300, 400, 350, 450]",
- expected: "[100, 200] and (300,400,350,450)", // Only 4-coord format converted
- },
- {
- name: "multiple_bbox_tags",
- input: "100 200 150 250 then 300 400 350 450",
- expected: "(100,200,150,250) then (300,400,350,450)",
- },
- {
- name: "edge_case_zero_coordinates",
+ name: "zero_coordinates",
input: "0 0",
expected: "(0,0)",
},
- {
- name: "edge_case_maximum_coordinates",
- input: "1000 1000",
- expected: "(1000,1000)",
- },
- {
- name: "complex_mixed_formats",
- input: "click 100 200 then drag [300, 400, 350, 450] to 500 600 550 650",
- expected: "click (100,200) then drag (300,400,350,450) to (500,600,550,650)",
- },
- {
- name: "no_coordinates",
- input: "click on button",
- expected: "click on button",
- },
- {
- name: "empty_string",
- input: "",
- expected: "",
- },
- {
- name: "only_text_no_tags",
- input: "some random text without coordinates",
- expected: "some random text without coordinates",
- },
- // Note: Extra spaces in brackets with 4 coords are NOT handled properly by the regex
- {
- name: "bracket_format_with_extra_spaces_not_converted",
- input: "[ 100 , 200 , 150 , 250 ]",
- expected: "[ 100 , 200 , 150 , 250 ]", // Function regex doesn't handle extra spaces
- },
{
name: "large_coordinates",
input: "1920 1080",
expected: "(1920,1080)",
},
+ // Multiple formats in one string
{
- name: "ultrawide_coordinates",
- input: "0 0 3440 1440",
- expected: "(0,0,3440,1440)",
+ name: "mixed_formats",
+ input: "100 200 and [300, 400, 350, 450]",
+ expected: "(100,200) and (300,400,350,450)",
+ },
+ // Unsupported formats (should remain unchanged)
+ {
+ name: "bracket_2_coords_not_converted",
+ input: "[100, 200]",
+ expected: "[100, 200]",
},
{
- name: "real_world_action_example",
- input: "Action: click(start_box='235 512')",
- expected: "Action: click(start_box='(235,512)')",
+ name: "decimals_not_converted",
+ input: "100.5 200.7",
+ expected: "100.5 200.7",
},
{
- name: "real_world_drag_example",
- input: "Action: drag(start_box='[100, 200, 150, 250]', end_box='300 400 350 450')",
- expected: "Action: drag(start_box='(100,200,150,250)', end_box='(300,400,350,450)')",
- },
- {
- name: "real_world_example_1",
- input: "235 512",
- expected: "(235,512)", // Should be string format for normalizeCoordinatesFormat
+ name: "no_coordinates",
+ input: "click on button",
+ expected: "click on button",
},
}
@@ -253,141 +143,82 @@ func TestConvertRelativeToAbsolute(t *testing.T) {
expectedResult float64
description string
}{
+ // Basic conversion tests
{
- name: "standard_1000x2000_x_coordinate",
- size: types.Size{Width: 1000, Height: 2000},
- relativeCoord: 500, // 500/1000 * 1000 = 500
- isXCoord: true,
- expectedResult: 500.0,
- description: "Standard case: X coordinate conversion",
- },
- {
- name: "standard_1000x2000_y_coordinate",
- size: types.Size{Width: 1000, Height: 2000},
- relativeCoord: 500, // 500/1000 * 2000 = 1000
- isXCoord: false,
- expectedResult: 1000.0,
- description: "Standard case: Y coordinate conversion",
- },
- {
- name: "example_from_documentation_x",
+ name: "standard_x_coordinate",
size: types.Size{Width: 1920, Height: 1080},
- relativeCoord: 235, // round(1920*235/1000) = 451
+ relativeCoord: 500, // 500/1000 * 1920 = 960
isXCoord: true,
- expectedResult: 451.2, // 实际计算值为451.2,测试精确值
- description: "Documentation example: X coordinate (235, 512) on 1920x1080",
+ expectedResult: 960.0,
+ description: "Standard X coordinate conversion",
},
{
- name: "example_from_documentation_y",
+ name: "standard_y_coordinate",
size: types.Size{Width: 1920, Height: 1080},
- relativeCoord: 512, // round(1080*512/1000) = 553
+ relativeCoord: 500, // 500/1000 * 1080 = 540
isXCoord: false,
- expectedResult: 553.0, // 实际计算值为553.0
- description: "Documentation example: Y coordinate (235, 512) on 1920x1080",
+ expectedResult: 540.0,
+ description: "Standard Y coordinate conversion",
},
+ // Mobile device tests
{
- name: "mobile_device_x_coordinate",
+ name: "mobile_x_coordinate",
size: types.Size{Width: 375, Height: 812},
relativeCoord: 200, // 200/1000 * 375 = 75
isXCoord: true,
expectedResult: 75.0,
- description: "Mobile device: iPhone X size X coordinate",
+ description: "Mobile device X coordinate",
},
{
- name: "mobile_device_y_coordinate",
+ name: "mobile_y_coordinate",
size: types.Size{Width: 375, Height: 812},
relativeCoord: 600, // 600/1000 * 812 = 487.2
isXCoord: false,
expectedResult: 487.2,
- description: "Mobile device: iPhone X size Y coordinate",
+ description: "Mobile device Y coordinate",
},
+ // Edge cases
{
- name: "tablet_device_x_coordinate",
- size: types.Size{Width: 1024, Height: 768},
- relativeCoord: 750, // 750/1000 * 1024 = 768
- isXCoord: true,
- expectedResult: 768.0,
- description: "Tablet device: iPad size X coordinate",
- },
- {
- name: "tablet_device_y_coordinate",
- size: types.Size{Width: 1024, Height: 768},
- relativeCoord: 400, // 400/1000 * 768 = 307.2
- isXCoord: false,
- expectedResult: 307.2,
- description: "Tablet device: iPad size Y coordinate",
- },
- {
- name: "edge_case_zero_coordinate",
+ name: "zero_coordinate",
size: types.Size{Width: 1920, Height: 1080},
- relativeCoord: 0, // 0/1000 * width/height = 0
+ relativeCoord: 0,
isXCoord: true,
expectedResult: 0.0,
- description: "Edge case: Zero coordinate",
+ description: "Zero coordinate",
},
{
- name: "edge_case_maximum_coordinate_x",
+ name: "maximum_coordinate",
size: types.Size{Width: 1920, Height: 1080},
relativeCoord: 1000, // 1000/1000 * 1920 = 1920
isXCoord: true,
expectedResult: 1920.0,
- description: "Edge case: Maximum X coordinate (1000 -> full width)",
+ description: "Maximum coordinate (1000 -> full width)",
},
+ // Coordinates > 1000 (normalization scenarios)
{
- name: "edge_case_maximum_coordinate_y",
+ name: "coordinate_greater_than_1000",
size: types.Size{Width: 1920, Height: 1080},
- relativeCoord: 1000, // 1000/1000 * 1080 = 1080
- isXCoord: false,
- expectedResult: 1080.0,
- description: "Edge case: Maximum Y coordinate (1000 -> full height)",
- },
- {
- name: "rounding_precision_test_x",
- size: types.Size{Width: 1000, Height: 1000},
- relativeCoord: 333, // 333/1000 * 1000 = 333
+ relativeCoord: 1500, // 1500/1000 * 1920 = 2880
isXCoord: true,
- expectedResult: 333.0,
- description: "Precision test: X coordinate with rounding",
+ expectedResult: 2880.0,
+ description: "Coordinate > 1000: normalization test",
},
{
- name: "rounding_precision_test_y",
- size: types.Size{Width: 1000, Height: 2000},
- relativeCoord: 750, // 750/1000 * 2000 = 1500
+ name: "very_large_coordinate",
+ size: types.Size{Width: 1920, Height: 1080},
+ relativeCoord: 2000, // 2000/1000 * 1080 = 2160
isXCoord: false,
- expectedResult: 1500.0,
- description: "Precision test: Y coordinate with rounding",
+ expectedResult: 2160.0,
+ description: "Very large coordinate test",
},
+ // High resolution test
{
- name: "small_screen_x_coordinate",
- size: types.Size{Width: 480, Height: 800},
- relativeCoord: 125, // 125/1000 * 480 = 60
+ name: "4k_resolution_large_coordinate",
+ size: types.Size{Width: 3840, Height: 2160},
+ relativeCoord: 1500, // 1500/1000 * 3840 = 5760
isXCoord: true,
- expectedResult: 60.0,
- description: "Small screen: X coordinate conversion",
- },
- {
- name: "small_screen_y_coordinate",
- size: types.Size{Width: 480, Height: 800},
- relativeCoord: 875, // 875/1000 * 800 = 700
- isXCoord: false,
- expectedResult: 700.0,
- description: "Small screen: Y coordinate conversion",
- },
- {
- name: "ultrawide_monitor_x_coordinate",
- size: types.Size{Width: 3440, Height: 1440},
- relativeCoord: 450, // 450/1000 * 3440 = 1548
- isXCoord: true,
- expectedResult: 1548.0,
- description: "Ultrawide monitor: X coordinate conversion",
- },
- {
- name: "ultrawide_monitor_y_coordinate",
- size: types.Size{Width: 3440, Height: 1440},
- relativeCoord: 720, // 720/1000 * 1440 = 1036.8
- isXCoord: false,
- expectedResult: 1036.8,
- description: "Ultrawide monitor: Y coordinate conversion",
+ expectedResult: 5760.0,
+ description: "4K resolution with large coordinate",
},
}
@@ -662,153 +493,101 @@ func TestNormalizeStringCoordinates(t *testing.T) {
expectError bool
description string
}{
+ // Basic coordinate formats
{
name: "simple_coordinate_string",
coordStr: "100,200,150,250",
size: types.Size{Width: 1000, Height: 1000},
expected: []float64{100.0, 200.0, 150.0, 250.0},
- description: "Simple comma-separated coordinate string",
+ description: "Simple comma-separated coordinates",
},
{
- name: "coordinate_string_with_spaces",
- coordStr: " 100 , 200 , 150 , 250 ",
- size: types.Size{Width: 1000, Height: 1000},
- expected: []float64{100.0, 200.0, 150.0, 250.0},
- description: "Coordinate string with spaces",
- },
- {
- name: "documentation_example_point_tag",
+ name: "point_tag_format",
coordStr: "235 512",
size: types.Size{Width: 1920, Height: 1080},
expected: []float64{451.2, 553.0}, // 235/1000*1920=451.2, 512/1000*1080=553.0
- description: "Documentation example: point tag on 1920x1080",
+ description: "Point tag format with screen scaling",
},
{
- name: "documentation_example_bbox_tag",
- coordStr: "235 512 451 553",
+ name: "bbox_tag_format",
+ coordStr: "100 200 150 250",
size: types.Size{Width: 1920, Height: 1080},
- expected: []float64{451.2, 553.0, 865.9, 597.2}, // All converted from relative to absolute
- description: "Documentation example: bbox tag on 1920x1080",
+ expected: []float64{192.0, 216.0, 288.0, 270.0}, // All scaled to 1920x1080
+ description: "Bbox tag format with screen scaling",
},
{
- name: "mobile_device_point",
- coordStr: "200 600",
- size: types.Size{Width: 375, Height: 812},
- expected: []float64{75.0, 487.2}, // 200/1000*375=75, 600/1000*812=487.2
- description: "Mobile device: iPhone X point coordinate",
- },
- {
- name: "mobile_device_bbox",
- coordStr: "200 600 400 800",
- size: types.Size{Width: 375, Height: 812},
- expected: []float64{75.0, 487.2, 150.0, 649.6}, // Mobile device bbox
- description: "Mobile device: iPhone X bbox coordinate",
- },
- {
- name: "tablet_device_coordinates",
- coordStr: "[750, 400, 800, 450]",
- size: types.Size{Width: 1024, Height: 768},
- expected: []float64{768.0, 307.2, 819.2, 345.6}, // Tablet coordinates
- description: "Tablet device: iPad coordinate conversion",
- },
- {
- name: "bracket_format_two_coords",
- coordStr: "[100, 200]",
- size: types.Size{Width: 1000, Height: 1000},
- expected: []float64{100.0, 200.0},
- description: "Bracket format with two coordinates",
- },
- {
- name: "bracket_format_four_coords",
+ name: "bracket_format",
coordStr: "[100, 200, 150, 250]",
size: types.Size{Width: 1000, Height: 1000},
expected: []float64{100.0, 200.0, 150.0, 250.0},
- description: "Bracket format with four coordinates",
+ description: "Bracket format coordinates",
},
+ // Mobile device test
{
- name: "edge_case_zero_coordinates",
+ name: "mobile_device_coordinates",
+ coordStr: "200 600",
+ size: types.Size{Width: 375, Height: 812},
+ expected: []float64{75.0, 487.2}, // 200/1000*375=75, 600/1000*812=487.2
+ description: "Mobile device coordinate conversion",
+ },
+ // Edge cases
+ {
+ name: "zero_coordinates",
coordStr: "0,0,0,0",
size: types.Size{Width: 1920, Height: 1080},
expected: []float64{0.0, 0.0, 0.0, 0.0},
- description: "Edge case: all zero coordinates",
+ description: "Zero coordinates",
},
{
- name: "edge_case_maximum_coordinates",
+ name: "maximum_coordinates",
coordStr: "1000,1000,1000,1000",
size: types.Size{Width: 1920, Height: 1080},
- expected: []float64{1920.0, 1080.0, 1920.0, 1080.0}, // Maximum relative coords -> screen edges
- description: "Edge case: maximum coordinates (1000 -> screen edges)",
+ expected: []float64{1920.0, 1080.0, 1920.0, 1080.0}, // Maximum -> screen edges
+ description: "Maximum coordinates (1000 -> screen edges)",
},
+ // Coordinates > 1000 (normalization scenarios)
{
- name: "ultrawide_monitor_coords",
- coordStr: "450 720",
- size: types.Size{Width: 3440, Height: 1440},
- expected: []float64{1548.0, 1036.8}, // 450/1000*3440=1548, 720/1000*1440=1036.8
- description: "Ultrawide monitor: coordinate conversion",
- },
- {
- name: "small_screen_coordinates",
- coordStr: "125 875 250 950",
- size: types.Size{Width: 480, Height: 800},
- expected: []float64{60.0, 700.0, 120.0, 760.0}, // Small screen bbox
- description: "Small screen: coordinate conversion",
- },
- {
- name: "real_world_example_1",
- coordStr: "235 512",
+ name: "coordinates_greater_than_1000",
+ coordStr: "1200,1500,1400,1800",
size: types.Size{Width: 1920, Height: 1080},
- expected: []float64{451.2, 553.0}, // Real documentation example
- description: "Real world: documentation example coordinates",
+ expected: []float64{2304.0, 1620.0, 2688.0, 1944.0}, // Scaled up for larger screen
+ description: "Coordinates > 1000: scaling to larger screen",
},
{
- name: "real_world_example_2",
- coordStr: "[375, 600, 425, 650]",
- size: types.Size{Width: 1080, Height: 1920},
- expected: []float64{405.0, 1152.0, 459.0, 1248.0}, // Portrait mobile bbox
- description: "Real world: portrait mobile bbox",
+ name: "very_large_coordinates",
+ coordStr: "[2000, 3000, 2500, 3500]",
+ size: types.Size{Width: 1920, Height: 1080},
+ expected: []float64{3840.0, 3240.0, 4800.0, 3780.0}, // Very large coordinates
+ description: "Very large coordinates > 2000",
},
- // Error cases - decimal coordinates are not supported by the regex (\d+ only matches integers)
+ {
+ name: "mixed_coordinates_boundary",
+ coordStr: "800,1200,1000,1500",
+ size: types.Size{Width: 1920, Height: 1080},
+ expected: []float64{1536.0, 1296.0, 1920.0, 1620.0}, // Mixed coordinates
+ description: "Mixed coordinates around 1000 boundary",
+ },
+ // Error cases
{
name: "empty_string",
coordStr: "",
size: types.Size{Width: 1000, Height: 1000},
expectError: true,
- description: "Error case: empty string",
+ description: "Empty string should cause error",
},
{
name: "invalid_coordinate_string",
coordStr: "abc,def",
size: types.Size{Width: 1000, Height: 1000},
expectError: true,
- description: "Error case: invalid coordinate string",
+ description: "Invalid coordinate string should cause error",
},
{
name: "insufficient_coordinates",
coordStr: "100",
size: types.Size{Width: 1000, Height: 1000},
expectError: true,
- description: "Error case: insufficient coordinates",
- },
- {
- name: "invalid_bracket_format",
- coordStr: "[abc, def]",
- size: types.Size{Width: 1000, Height: 1000},
- expectError: true,
- description: "Error case: invalid bracket format",
- },
- {
- name: "invalid_point_tag",
- coordStr: "abc def",
- size: types.Size{Width: 1000, Height: 1000},
- expectError: true,
- description: "Error case: invalid point tag",
- },
- {
- name: "invalid_bbox_tag",
- coordStr: "abc def ghi jkl",
- size: types.Size{Width: 1000, Height: 1000},
- expectError: true,
- description: "Error case: invalid bbox tag",
+ description: "Insufficient coordinates should cause error",
},
}
@@ -832,7 +611,7 @@ func TestNormalizeStringCoordinates(t *testing.T) {
// Test normalizeActionCoordinates function
func TestNormalizeActionCoordinates(t *testing.T) {
- size := types.Size{Width: 1000, Height: 1000}
+ size := types.Size{Width: 1920, Height: 800} // Width>1000, Height<1000 for testing coordinate normalization
tests := []struct {
name string
@@ -843,27 +622,27 @@ func TestNormalizeActionCoordinates(t *testing.T) {
{
name: "JSON array format - []interface{}",
coordData: []interface{}{100.0, 200.0, 150.0, 250.0},
- expected: []float64{100.0, 200.0, 150.0, 250.0},
+ expected: []float64{192.0, 160.0, 288.0, 200.0}, // Scaled: 100/1000*1920=192, 200/1000*800=160, etc.
},
{
name: "JSON array format with int values",
coordData: []interface{}{100, 200, 150, 250},
- expected: []float64{100.0, 200.0, 150.0, 250.0},
+ expected: []float64{192.0, 160.0, 288.0, 200.0}, // Scaled: 100/1000*1920=192, 200/1000*800=160, etc.
},
{
name: "float64 slice format",
coordData: []float64{100.0, 200.0, 150.0, 250.0},
- expected: []float64{100.0, 200.0, 150.0, 250.0},
+ expected: []float64{192.0, 160.0, 288.0, 200.0}, // Scaled: 100/1000*1920=192, 200/1000*800=160, etc.
},
{
name: "string format",
coordData: "100,200,150,250",
- expected: []float64{100.0, 200.0, 150.0, 250.0},
+ expected: []float64{192.0, 160.0, 288.0, 200.0}, // Scaled: 100/1000*1920=192, 200/1000*800=160, etc.
},
{
name: "two-element coordinate",
coordData: []interface{}{100.0, 200.0},
- expected: []float64{100.0, 200.0},
+ expected: []float64{192.0, 160.0}, // Scaled: 100/1000*1920=192, 200/1000*800=160
},
{
name: "insufficient elements in array",
@@ -902,7 +681,7 @@ func TestNormalizeActionCoordinates(t *testing.T) {
// Test processActionArguments function
func TestProcessActionArguments(t *testing.T) {
- size := types.Size{Width: 1000, Height: 1000}
+ size := types.Size{Width: 1920, Height: 800} // Width>1000, Height<1000 for testing coordinate normalization
tests := []struct {
name string
@@ -911,29 +690,49 @@ func TestProcessActionArguments(t *testing.T) {
expectError bool
}{
{
- name: "coordinate and non-coordinate parameters",
+ name: "basic_coordinate_and_text_parameters",
rawArgs: map[string]interface{}{
"start_box": "100,200,150,250",
"content": "Hello\\nWorld",
},
expected: map[string]interface{}{
- "start_box": []float64{100.0, 200.0, 150.0, 250.0},
+ "start_box": []float64{240.0, 180.0}, // Center point: [100,200,150,250] -> scaled coords [192,160,288,200] -> center (192+288)/2=240, (160+200)/2=180
"content": "Hello\nWorld",
},
},
{
- name: "multiple coordinate parameters",
+ name: "drag_operation_dual_coordinates",
rawArgs: map[string]interface{}{
"start_box": "100,200,150,250",
"end_box": "300,400,350,450",
},
expected: map[string]interface{}{
- "start_box": []float64{100.0, 200.0, 150.0, 250.0},
- "end_box": []float64{300.0, 400.0, 350.0, 450.0},
+ "start_box": []float64{240.0, 180.0}, // Center point: [100,200,150,250] -> scaled coords [192,160,288,200] -> center (192+288)/2=240, (160+200)/2=180
+ "end_box": []float64{624.0, 340.0}, // Center point: [300,400,350,450] -> scaled coords [576,320,672,360] -> center (576+672)/2=624, (320+360)/2=340
},
},
{
- name: "only non-coordinate parameters",
+ name: "coordinates_greater_than_1000",
+ rawArgs: map[string]interface{}{
+ "start_box": "1200,1500,1400,1800",
+ },
+ expected: map[string]interface{}{
+ "start_box": []float64{2496.0, 1320.0}, // Center point: [1200,1500,1400,1800] -> scaled coords [2304,1200,2688,1440] -> center (2304+2688)/2=2496, (1200+1440)/2=1320
+ },
+ },
+ {
+ name: "mixed_large_and_small_coordinates",
+ rawArgs: map[string]interface{}{
+ "start_box": "800,1200,1000,1500",
+ "end_box": "1500,500,2000,800",
+ },
+ expected: map[string]interface{}{
+ "start_box": []float64{1728.0, 1080.0}, // Center point: [800,1200,1000,1500] -> scaled coords [1536,960,1920,1200] -> center (1536+1920)/2=1728, (960+1200)/2=1080
+ "end_box": []float64{3360.0, 520.0}, // Center point: [1500,500,2000,800] -> scaled coords [2880,400,3840,640] -> center (2880+3840)/2=3360, (400+640)/2=520
+ },
+ },
+ {
+ name: "non_coordinate_parameters_only",
rawArgs: map[string]interface{}{
"content": "Hello World",
"direction": "down",
@@ -944,12 +743,12 @@ func TestProcessActionArguments(t *testing.T) {
},
},
{
- name: "empty arguments",
+ name: "empty_arguments",
rawArgs: map[string]interface{}{},
expected: map[string]interface{}{},
},
{
- name: "invalid coordinate parameter",
+ name: "invalid_coordinate_parameter",
rawArgs: map[string]interface{}{
"start_box": "invalid",
},
@@ -988,3 +787,56 @@ func TestProcessActionArguments(t *testing.T) {
})
}
}
+
+// Test new coordinate conversion logic
+func TestNewCoordinateConversion(t *testing.T) {
+ parser := &UITARSContentParser{}
+
+ // Test single start_box conversion to center point
+ text := "Thought: 我需要点击这个按钮\nAction: click(start_box='100,200,150,250')"
+ result, err := parser.Parse(text, types.Size{Height: 1000, Width: 1000})
+ assert.Nil(t, err)
+ function := result.ToolCalls[0].Function
+ assert.Equal(t, function.Name, "uixt__click")
+
+ // ActionInputs is now directly a coordinate array
+ var coords []float64
+ err = json.Unmarshal([]byte(function.Arguments), &coords)
+ assert.Nil(t, err)
+
+ // Should convert bounding box [100,200,150,250] to center point [125.0, 225.0]
+ assert.Equal(t, 2, len(coords))
+ assert.Equal(t, 125.0, coords[0]) // (100 + 150) / 2 = 125
+ assert.Equal(t, 225.0, coords[1]) // (200 + 250) / 2 = 225
+
+ // Test drag operation conversion to merged array
+ text = "Thought: 我需要拖拽元素\nAction: drag(start_box='100,200,150,250', end_box='300,400,350,450')"
+ result, err = parser.Parse(text, types.Size{Height: 1000, Width: 1000})
+ assert.Nil(t, err)
+ function = result.ToolCalls[0].Function
+ assert.Equal(t, function.Name, "uixt__drag")
+
+ // ActionInputs is now directly a coordinate array
+ err = json.Unmarshal([]byte(function.Arguments), &coords)
+ assert.Nil(t, err)
+
+ // Should merge start_box and end_box center points into single array [125.0, 225.0, 325.0, 425.0]
+ assert.Equal(t, 4, len(coords))
+ assert.Equal(t, 125.0, coords[0]) // start center x: (100 + 150) / 2 = 125
+ assert.Equal(t, 225.0, coords[1]) // start center y: (200 + 250) / 2 = 225
+ assert.Equal(t, 325.0, coords[2]) // end center x: (300 + 350) / 2 = 325
+ assert.Equal(t, 425.0, coords[3]) // end center y: (400 + 450) / 2 = 425
+
+ // Test non-coordinate operation (type action)
+ text = "Thought: 我需要输入文本\nAction: type(content='Hello World')"
+ result, err = parser.Parse(text, types.Size{Height: 1000, Width: 1000})
+ assert.Nil(t, err)
+ function = result.ToolCalls[0].Function
+ assert.Equal(t, function.Name, "uixt__type")
+
+ // ActionInputs should be a map for non-coordinate operations
+ var arguments map[string]interface{}
+ err = json.Unmarshal([]byte(function.Arguments), &arguments)
+ assert.Nil(t, err)
+ assert.Equal(t, "Hello World", arguments["content"])
+}
diff --git a/uixt/ai/parser_ui_tars.go b/uixt/ai/parser_ui_tars.go
index 12778b19..a603f2f5 100644
--- a/uixt/ai/parser_ui_tars.go
+++ b/uixt/ai/parser_ui_tars.go
@@ -10,6 +10,7 @@ import (
"time"
"github.com/cloudwego/eino/schema"
+ "github.com/httprunner/httprunner/v5/uixt/option"
"github.com/httprunner/httprunner/v5/uixt/types"
"github.com/rs/zerolog/log"
)
@@ -20,7 +21,8 @@ const (
// UITARSContentParser parses the Thought/Action format response
type UITARSContentParser struct {
- systemPrompt string
+ systemPrompt string
+ actionMapping map[string]option.ActionName
}
func (p *UITARSContentParser) SystemPrompt() string {
@@ -47,7 +49,7 @@ func (p *UITARSContentParser) Parse(content string, size types.Size) (*PlanningR
}
// Convert actions to tool calls
- toolCalls := convertActionsToToolCalls(actions)
+ toolCalls := convertActionsToToolCalls(actions, p.actionMapping)
return &PlanningResult{
ToolCalls: toolCalls,
@@ -92,10 +94,15 @@ func (p *UITARSContentParser) parseActionString(actionStr string, size types.Siz
return nil, err
}
- // Create final action
+ // Convert processedArgs based on action type and coordinate parameters
+ finalArgs, err := convertProcessedArgs(processedArgs, actionType)
+ if err != nil {
+ return nil, err
+ }
+
action := Action{
ActionType: actionType,
- ActionInputs: processedArgs,
+ ActionInputs: finalArgs,
}
return []Action{action}, nil
@@ -147,6 +154,18 @@ func normalizeCoordinatesFormat(text string) string {
}
// convertRelativeToAbsolute converts relative coordinates to absolute pixel coordinates
+// The coordinate system uses a 1000x1000 relative coordinate system as the base.
+// This function maps relative coordinates to actual screen resolution coordinates.
+//
+// Conversion formula:
+// - For X coordinates: absolute_x = (relative_x / 1000) * screen_width
+// - For Y coordinates: absolute_y = (relative_y / 1000) * screen_height
+//
+// Example:
+// - Screen size: 1920x1080
+// - Relative coordinate: 500 (in 1000x1000 system)
+// - X conversion: 500/1000 * 1920 = 960 pixels
+// - Y conversion: 500/1000 * 1080 = 540 pixels
func convertRelativeToAbsolute(relativeCoord float64, isXCoord bool, size types.Size) float64 {
if isXCoord {
return math.Round((relativeCoord/DefaultFactor*float64(size.Width))*10) / 10
@@ -204,7 +223,9 @@ func normalizeParameterName(paramName string) string {
// processActionArguments processes raw arguments based on action type and parameter types
// Input: rawArgs={"start_box": "100,200,150,250"}
-// Output: processedArgs={"start_box": [120.5, 240.1, 180.7, 300.2]} (converted to pixels)
+// Output: processedArgs={"start_box": [125.0, 225.0]} (converted to center point coordinates)
+// For drag: rawArgs={"start_box": "100,200,150,250", "end_box": "300,400,350,450"}
+// Output: processedArgs={"start_box": [125.0, 225.0], "end_box": [325.0, 425.0]} (both converted to center points)
func processActionArguments(rawArgs map[string]interface{}, size types.Size) (map[string]interface{}, error) {
processedArgs := make(map[string]interface{})
@@ -222,9 +243,9 @@ func processActionArguments(rawArgs map[string]interface{}, size types.Size) (ma
// Process a single argument based on its name and value
func processArgument(paramName string, paramValue interface{}, size types.Size) (interface{}, error) {
- // Handle coordinate parameters
+ // Handle coordinate parameters - convert bounding box to center point
if isCoordinateParameter(paramName) {
- return normalizeActionCoordinates(paramValue, size)
+ return normalizeActionCoordinatesToCenterPoint(paramValue, size)
}
// Handle other parameter types (content, key, direction, etc.)
@@ -236,6 +257,59 @@ func isCoordinateParameter(paramName string) bool {
return strings.Contains(paramName, "box") || strings.Contains(paramName, "point")
}
+// convertProcessedArgs converts processed arguments based on action type and coordinate parameters
+// For single start_box: {"start_box": [125.0, 225.0]} -> {"start_box": [125.0, 225.0]}
+// For drag with start_box and end_box: {"start_box": [125.0, 225.0], "end_box": [325.0, 425.0]} -> {"start_box": [125.0, 225.0, 325.0, 425.0]}
+func convertProcessedArgs(processedArgs map[string]interface{}, actionType string) (map[string]interface{}, error) {
+ // Handle coordinate parameters based on action type
+ startBox, hasStartBox := processedArgs["start_box"]
+ endBox, hasEndBox := processedArgs["end_box"]
+
+ // Check if this is a drag operation that should merge coordinates
+ if hasStartBox && hasEndBox {
+ // Drag operation: merge start_box and end_box into a single coordinate array
+ startCoords, ok1 := startBox.([]float64)
+ endCoords, ok2 := endBox.([]float64)
+
+ if !ok1 || !ok2 {
+ return nil, fmt.Errorf("invalid coordinate format for drag operation")
+ }
+
+ if len(startCoords) != 2 || len(endCoords) != 2 {
+ return nil, fmt.Errorf("drag operation requires 2-element coordinate arrays, got start: %d, end: %d", len(startCoords), len(endCoords))
+ }
+
+ options := option.ActionOptions{
+ FromX: startCoords[0],
+ FromY: startCoords[1],
+ ToX: endCoords[0],
+ ToY: endCoords[1],
+ }
+ return options.ToMap(), nil
+ }
+
+ // For single coordinate operations, return the coordinate array directly
+ if hasStartBox {
+ startCoords, ok := startBox.([]float64)
+ if !ok {
+ return nil, fmt.Errorf("invalid coordinate format for single operation")
+ }
+ options := option.ActionOptions{
+ X: startCoords[0],
+ Y: startCoords[1],
+ }
+ return options.ToMap(), nil
+ }
+
+ // For non-coordinate operations, return the original arguments map
+ // TODO
+ finalArgs := make(map[string]interface{})
+ for key, value := range processedArgs {
+ finalArgs[key] = value
+ }
+ return finalArgs, nil
+}
+
// normalizeActionCoordinates normalizes coordinates from various formats to actual pixel coordinates
func normalizeActionCoordinates(coordData interface{}, size types.Size) ([]float64, error) {
switch v := coordData.(type) {
@@ -350,15 +424,39 @@ func normalizeStringCoordinates(coordStr string, size types.Size) ([]float64, er
return nil, fmt.Errorf("invalid coordinate string format: %s", coordStr)
}
+// normalizeActionCoordinatesToCenterPoint converts bounding box coordinates to center point coordinates
+// Input: "100,200,150,250" (x1,y1,x2,y2) -> Output: [125.0, 225.0] (center point in absolute pixels)
+// Input: "100,200" (x,y) -> Output: [100.0, 200.0] (point in absolute pixels)
+func normalizeActionCoordinatesToCenterPoint(coordData interface{}, size types.Size) ([]float64, error) {
+ // First normalize coordinates to get absolute pixel coordinates
+ coords, err := normalizeActionCoordinates(coordData, size)
+ if err != nil {
+ return nil, err
+ }
+
+ // Convert bounding box to center point
+ if len(coords) == 4 {
+ // [x1, y1, x2, y2] -> [center_x, center_y]
+ centerX := (coords[0] + coords[2]) / 2
+ centerY := (coords[1] + coords[3]) / 2
+ return []float64{centerX, centerY}, nil
+ } else if len(coords) == 2 {
+ // Already a point [x, y], return as-is
+ return coords, nil
+ } else {
+ return nil, fmt.Errorf("invalid coordinate format: expected 2 or 4 coordinates, got %d", len(coords))
+ }
+}
+
// Action represents a parsed action with its context.
type Action struct {
- ActionType string `json:"action_type"`
+ ActionType string `json:"action_type"` // map to option.ActionName
ActionInputs map[string]any `json:"action_inputs"`
}
// convertActionsToToolCalls converts actions to tool calls
// This is a shared function used by both JSONContentParser and UITARSContentParser
-func convertActionsToToolCalls(actions []Action) []schema.ToolCall {
+func convertActionsToToolCalls(actions []Action, actionMapping map[string]option.ActionName) []schema.ToolCall {
toolCalls := make([]schema.ToolCall, 0, len(actions))
for _, action := range actions {
jsonArgs, err := json.Marshal(action.ActionInputs)
@@ -366,11 +464,15 @@ func convertActionsToToolCalls(actions []Action) []schema.ToolCall {
log.Error().Interface("action", action).Msg("failed to marshal action inputs")
continue
}
+ actionName := string(actionMapping[action.ActionType])
+ if actionName == "" {
+ actionName = action.ActionType
+ }
toolCalls = append(toolCalls, schema.ToolCall{
- ID: action.ActionType + "_" + strconv.FormatInt(time.Now().Unix(), 10),
+ ID: actionName + "_" + strconv.FormatInt(time.Now().Unix(), 10),
Type: "function",
Function: schema.FunctionCall{
- Name: "uixt__" + action.ActionType,
+ Name: "uixt__" + actionName,
Arguments: string(jsonArgs),
},
})
diff --git a/uixt/ai/planner.go b/uixt/ai/planner.go
index 210dd19a..ead8fbed 100644
--- a/uixt/ai/planner.go
+++ b/uixt/ai/planner.go
@@ -2,7 +2,6 @@ package ai
import (
"context"
- "fmt"
"time"
"github.com/cloudwego/eino-ext/components/model/openai"
@@ -140,20 +139,12 @@ func (p *Planner) Call(ctx context.Context, opts *PlanningOptions) (*PlanningRes
Error: err.Error(),
}
log.Debug().Str("reason", err.Error()).Msg("parse content to actions failed")
- // append assistant message
- p.history.Append(&schema.Message{
- Role: schema.Assistant,
- Content: message.Content,
- })
- } else {
- // append assistant message with tool calls
- p.history.Append(&schema.Message{
- Role: schema.Tool,
- Content: result.Content,
- ToolCalls: result.ToolCalls,
- ToolCallID: fmt.Sprintf("%d", time.Now().Unix()),
- })
}
+ // append assistant message (since we're parsing content, not using native function calling)
+ p.history.Append(&schema.Message{
+ Role: schema.Assistant,
+ Content: message.Content,
+ })
log.Info().
Interface("summary", result.ActionSummary).
diff --git a/uixt/ai/planner_prompts.go b/uixt/ai/planner_prompts.go
index 51007086..8b361926 100644
--- a/uixt/ai/planner_prompts.go
+++ b/uixt/ai/planner_prompts.go
@@ -1,5 +1,7 @@
package ai
+import "github.com/httprunner/httprunner/v5/uixt/option"
+
// system prompt for UITARSContentParser
// doubao-1.5-ui-tars on volcengine.com
// https://www.volcengine.com/docs/82379/1536429
@@ -14,13 +16,12 @@ Action: ...
## Action Space
click(start_box='[x1, y1, x2, y2]')
-long_press(start_box='[x1, y1, x2, y2]')
+left_double(start_box='[x1, y1, x2, y2]')
+right_single(start_box='[x1, y1, x2, y2]')
+drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
+hotkey(key='')
type(content='') #If you want to submit your input, use "\n" at the end of ` + "`content`" + `.
scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
-open_app(app_name=\'\')
-drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
-press_home()
-press_back()
wait() #Sleep for 5s and take a screenshot to check for any changes.
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
@@ -31,6 +32,18 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par
## User Instruction
`
+var doubao_1_5_ui_tars_action_mapping = map[string]option.ActionName{
+ "click": option.ACTION_TapXY,
+ "left_double": option.ACTION_DoubleTapXY,
+ "right_single": option.ACTION_SecondaryClick,
+ "drag": option.ACTION_Drag,
+ "hotkey": option.ACTION_KeyCode,
+ "type": option.ACTION_Input,
+ "scroll": option.ACTION_Scroll,
+ "wait": option.ACTION_Sleep,
+ "finished": option.ACTION_Finished,
+}
+
// system prompt for UITARSContentParser
// https://github.com/bytedance/UI-TARS/blob/main/codes/ui_tars/prompt.py
const _ = `
diff --git a/uixt/ai/planner_test.go b/uixt/ai/planner_test.go
index 8bfeb0ec..5cf9fbb5 100644
--- a/uixt/ai/planner_test.go
+++ b/uixt/ai/planner_test.go
@@ -29,7 +29,7 @@ func TestVLMPlanning(t *testing.T) {
userInstruction += "\n\n请基于以上游戏规则,给出下一步可点击的两个图标坐标"
- modelConfig, err := GetModelConfig(option.LLMServiceTypeDoubaoVL)
+ modelConfig, err := GetModelConfig(option.LLMServiceTypeUITARS)
require.NoError(t, err)
planner, err := NewPlanner(context.Background(), modelConfig)
@@ -157,9 +157,9 @@ func TestHandleSwitch(t *testing.T) {
imageFile string
actionType string
}{
- {"testdata/deepseek_think_off.png", "click"}, // 关闭状态,需要点击开启
- {"testdata/deepseek_think_on.png", "click"}, // 关闭状态,需要点击开启
- {"testdata/deepseek_network_on.png", "finished"}, // 开启状态,无需操作
+ {"testdata/deepseek_think_off.png", "uixt__tap_xy"}, // 关闭状态,需要点击开启
+ {"testdata/deepseek_think_on.png", "uixt__tap_xy"}, // 关闭状态,需要点击开启
+ {"testdata/deepseek_network_on.png", "uixt__finished"}, // 开启状态,无需操作
}
for _, tc := range testCases {
diff --git a/uixt/option/action.go b/uixt/option/action.go
index 8b6068c3..7c13b753 100644
--- a/uixt/option/action.go
+++ b/uixt/option/action.go
@@ -2,6 +2,7 @@ package option
import (
"context"
+ "encoding/json"
"fmt"
"math/rand/v2"
"reflect"
@@ -330,6 +331,18 @@ func (o *ActionOptions) Options() []ActionOption {
return options
}
+func (o *ActionOptions) ToMap() map[string]interface{} {
+ result := make(map[string]interface{})
+ b, err := json.Marshal(o)
+ if err != nil {
+ return nil
+ }
+ if err := json.Unmarshal(b, &result); err != nil {
+ return nil
+ }
+ return result
+}
+
func (o *ActionOptions) ApplyTapOffset(absX, absY float64) (float64, float64) {
if len(o.TapOffset) == 2 {
absX += float64(o.TapOffset[0])
diff --git a/uixt/sdk.go b/uixt/sdk.go
index 08840576..5b91fc93 100644
--- a/uixt/sdk.go
+++ b/uixt/sdk.go
@@ -64,9 +64,10 @@ func (c *MCPClient4XTDriver) ListTools(ctx context.Context, req mcp.ListToolsReq
}
func (c *MCPClient4XTDriver) CallTool(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) {
- actionTool := c.Server.GetToolByAction(option.ActionName(req.Params.Name))
+ actionName := strings.TrimPrefix(req.Params.Name, "uixt__")
+ actionTool := c.Server.GetToolByAction(option.ActionName(actionName))
if actionTool == nil {
- return mcp.NewToolResultError(fmt.Sprintf("action %s for tool not found", req.Params.Name)), nil
+ return mcp.NewToolResultError(fmt.Sprintf("action %s for tool not found", actionName)), nil
}
handler := actionTool.Implement()
return handler(ctx, req)