From c204542f1f652c0aba7a59f1e32cc5add25cb1a7 Mon Sep 17 00:00:00 2001 From: "lilong.129" Date: Wed, 4 Jun 2025 22:39:17 +0800 Subject: [PATCH] feat: optimize UI-TARS parser with coordinate conversion and action mapping - Add action mapping for UI-TARS parser to convert action names to option.ActionName - Implement bounding box to center point coordinate conversion for better accuracy - Update coordinate normalization to handle coordinates > 1000 properly - Enhance test cases to verify coordinate scaling and center point conversion - Improve action argument processing with proper coordinate transformation - Add comprehensive test coverage for coordinate conversion edge cases Key improvements: - Bounding box [x1,y1,x2,y2] now converts to center point [cx,cy] for actions - Coordinate scaling properly handles different screen resolutions - Action names are mapped through doubao_1_5_ui_tars_action_mapping - Enhanced error handling for invalid coordinate formats --- internal/version/VERSION | 2 +- server/main.go | 2 +- uixt/ai/parser_default.go | 11 +- uixt/ai/parser_test.go | 590 ++++++++++++++----------------------- uixt/ai/parser_ui_tars.go | 124 +++++++- uixt/ai/planner.go | 19 +- uixt/ai/planner_prompts.go | 23 +- uixt/ai/planner_test.go | 8 +- uixt/option/action.go | 13 + uixt/sdk.go | 5 +- 10 files changed, 386 insertions(+), 411 deletions(-) diff --git a/internal/version/VERSION b/internal/version/VERSION index 9d8a0b32..edf71488 100644 --- a/internal/version/VERSION +++ b/internal/version/VERSION @@ -1 +1 @@ -v5.0.0-beta-2506031820 +v5.0.0-beta-2506042316 diff --git a/server/main.go b/server/main.go index e881caf2..71258846 100644 --- a/server/main.go +++ b/server/main.go @@ -25,7 +25,7 @@ type Router struct { } func (r *Router) InitMCPHost(configPath string) error { - mcpHost, err := mcphost.NewMCPHost(configPath, false) + mcpHost, err := mcphost.NewMCPHost(configPath, true) if err != nil { log.Error().Err(err).Msg("init MCP host failed") return err diff --git 
a/uixt/ai/parser_default.go b/uixt/ai/parser_default.go index 979b811d..998b074e 100644 --- a/uixt/ai/parser_default.go +++ b/uixt/ai/parser_default.go @@ -21,18 +21,21 @@ func NewLLMContentParser(modelType option.LLMServiceType) LLMContentParser { switch modelType { case option.LLMServiceTypeUITARS: return &UITARSContentParser{ - systemPrompt: doubao_1_5_ui_tars_planning_prompt, + systemPrompt: doubao_1_5_ui_tars_planning_prompt, + actionMapping: doubao_1_5_ui_tars_action_mapping, } default: return &JSONContentParser{ - systemPrompt: defaultPlanningResponseJsonFormat, + systemPrompt: defaultPlanningResponseJsonFormat, + actionMapping: map[string]option.ActionName{}, } } } // JSONContentParser parses the response as JSON string format type JSONContentParser struct { - systemPrompt string + systemPrompt string + actionMapping map[string]option.ActionName } func (p *JSONContentParser) SystemPrompt() string { @@ -83,7 +86,7 @@ func (p *JSONContentParser) Parse(content string, size types.Size) (*PlanningRes } // Convert actions to tool calls using function from parser_ui_tars.go - toolCalls := convertActionsToToolCalls(normalizedActions) + toolCalls := convertActionsToToolCalls(normalizedActions, p.actionMapping) return &PlanningResult{ ToolCalls: toolCalls, diff --git a/uixt/ai/parser_test.go b/uixt/ai/parser_test.go index b35f5bc5..12925d4a 100644 --- a/uixt/ai/parser_test.go +++ b/uixt/ai/parser_test.go @@ -14,62 +14,52 @@ func TestParseActionToStructureOutput(t *testing.T) { result, err := parser.Parse(text, types.Size{Height: 224, Width: 224}) assert.Nil(t, err) function := result.ToolCalls[0].Function - assert.Equal(t, function.Name, "click") - assert.Contains(t, function.Arguments, "start_box") + assert.Equal(t, function.Name, "uixt__click") + // ActionInputs is now directly a coordinate array + var coords []float64 + err = json.Unmarshal([]byte(function.Arguments), &coords) + assert.Nil(t, err) + assert.Equal(t, 2, len(coords)) text = "Thought: 
我看到页面上有几个帖子,第二个帖子的标题是\"字节四年,头发白了\"。要完成任务,我需要点击这个帖子下方的作者头像,这样就能进入作者的个人主页了。\nAction: click(start_point='550 450 550 450')" result, err = parser.Parse(text, types.Size{Height: 2341, Width: 1024}) assert.Nil(t, err) function = result.ToolCalls[0].Function - assert.Equal(t, function.Name, "click") - assert.Contains(t, function.Arguments, "start_box") + assert.Equal(t, function.Name, "uixt__click") + // ActionInputs is now directly a coordinate array + err = json.Unmarshal([]byte(function.Arguments), &coords) + assert.Nil(t, err) + assert.Equal(t, 2, len(coords)) - // Test new bracket format + // Test new bracket format - should convert bounding box to center point text = "Thought: 我需要点击这个按钮\nAction: click(start_box='[100, 200, 150, 250]')" result, err = parser.Parse(text, types.Size{Height: 1000, Width: 1000}) assert.Nil(t, err) function = result.ToolCalls[0].Function - assert.Equal(t, function.Name, "click") - assert.Contains(t, function.Arguments, "start_box") - arguments := make(map[string]interface{}) - err = json.Unmarshal([]byte(function.Arguments), &arguments) + assert.Equal(t, function.Name, "uixt__click") + // ActionInputs is now directly a coordinate array + err = json.Unmarshal([]byte(function.Arguments), &coords) assert.Nil(t, err) - coordsInterface := arguments["start_box"].([]interface{}) - coords := make([]float64, len(coordsInterface)) - for i, v := range coordsInterface { - coords[i] = v.(float64) - } - assert.Equal(t, 4, len(coords)) - assert.Equal(t, 100.0, coords[0]) - assert.Equal(t, 200.0, coords[1]) - assert.Equal(t, 150.0, coords[2]) - assert.Equal(t, 250.0, coords[3]) + // Should be converted to center point [125, 225] from bounding box [100, 200, 150, 250] + assert.Equal(t, 2, len(coords)) + assert.Equal(t, 125.0, coords[0]) // (100 + 150) / 2 = 125 + assert.Equal(t, 225.0, coords[1]) // (200 + 250) / 2 = 225 - // Test drag operation with both start_box and end_box + // Test drag operation with both start_box and end_box - should merge center 
points into single array text = "Thought: 我需要拖拽元素\nAction: drag(start_box='[100, 200, 150, 250]', end_box='[300, 400, 350, 450]')" result, err = parser.Parse(text, types.Size{Height: 1000, Width: 1000}) assert.Nil(t, err) function = result.ToolCalls[0].Function - assert.Equal(t, function.Name, "drag") - assert.Contains(t, function.Arguments, "start_box") - assert.Contains(t, function.Arguments, "end_box") - arguments = make(map[string]interface{}) - err = json.Unmarshal([]byte(function.Arguments), &arguments) + assert.Equal(t, function.Name, "uixt__drag") + // ActionInputs is now directly a coordinate array + err = json.Unmarshal([]byte(function.Arguments), &coords) assert.Nil(t, err) - startCoordsInterface := arguments["start_box"].([]interface{}) - endCoordsInterface := arguments["end_box"].([]interface{}) - startCoords := make([]float64, len(startCoordsInterface)) - endCoords := make([]float64, len(endCoordsInterface)) - for i, v := range startCoordsInterface { - startCoords[i] = v.(float64) - } - for i, v := range endCoordsInterface { - endCoords[i] = v.(float64) - } - assert.Equal(t, 4, len(startCoords)) - assert.Equal(t, 4, len(endCoords)) - assert.Equal(t, 100.0, startCoords[0]) - assert.Equal(t, 300.0, endCoords[0]) + // Should be merged into single array [start_center_x, start_center_y, end_center_x, end_center_y] + assert.Equal(t, 4, len(coords)) + assert.Equal(t, 125.0, coords[0]) // start center x: (100 + 150) / 2 = 125 + assert.Equal(t, 225.0, coords[1]) // start center y: (200 + 250) / 2 = 225 + assert.Equal(t, 325.0, coords[2]) // end center x: (300 + 350) / 2 = 325 + assert.Equal(t, 425.0, coords[3]) // end center y: (400 + 450) / 2 = 425 } // Test normalizeCoordinatesFormat function @@ -79,159 +69,59 @@ func TestNormalizeCoordinatesFormat(t *testing.T) { input string expected string }{ + // Basic format conversions { - name: "point tag with 2 numbers", + name: "point_tag_2_numbers", input: "100 200", expected: "(100,200)", }, { - name: "point tag 
with 4 numbers", + name: "point_tag_4_numbers", input: "100 200 150 250", expected: "(100,200,150,250)", }, { - name: "bbox tag", + name: "bbox_tag", input: "100 200 150 250", expected: "(100,200,150,250)", }, { - name: "bracket format with spaces", + name: "bracket_format_4_coords", input: "[100, 200, 150, 250]", expected: "(100,200,150,250)", }, + // Edge cases { - name: "bracket format without spaces", - input: "[100,200,150,250]", - expected: "(100,200,150,250)", - }, - { - name: "bracket format with irregular spaces", - input: "[100, 200, 150, 250]", - expected: "(100,200,150,250)", - }, - { - name: "multiple point tags", - input: "100 200 and 300 400", - expected: "(100,200) and (300,400)", - }, - { - name: "mixed formats", - input: "100 200 and [300, 400, 350, 450]", - expected: "(100,200) and (300,400,350,450)", - }, - { - name: "documentation_example_coordinates", - input: "235 512", - expected: "(235,512)", - }, - { - name: "documentation_example_bbox", - input: "235 512 451 553", - expected: "(235,512,451,553)", - }, - { - name: "mobile_coordinates_point", - input: "200 600", - expected: "(200,600)", - }, - { - name: "tablet_coordinates_bbox", - input: "750 400 800 450", - expected: "(750,400,800,450)", - }, - // Note: Bracket format with 2 coordinates is NOT supported by the function - // Only 4-coordinate bracket format is supported - { - name: "bracket_format_two_coordinates_not_converted", - input: "[100, 200]", - expected: "[100, 200]", // Function doesn't convert this format - }, - // Note: Decimal coordinates are NOT supported by the regex (only \d+ is matched) - { - name: "point_tag_with_decimals_not_converted", - input: "100.5 200.7", - expected: "100.5 200.7", // Function doesn't convert decimals - }, - { - name: "bbox_tag_with_decimals_not_converted", - input: "100.5 200.7 150.3 250.9", - expected: "100.5 200.7 150.3 250.9", // Function doesn't convert decimals - }, - { - name: "bracket_format_with_decimals_not_converted", - input: "[100.5, 
200.7, 150.3, 250.9]", - expected: "[100.5, 200.7, 150.3, 250.9]", // Function doesn't convert decimals - }, - { - name: "multiple_bracket_formats", - input: "[100, 200] and [300, 400, 350, 450]", - expected: "[100, 200] and (300,400,350,450)", // Only 4-coord format converted - }, - { - name: "multiple_bbox_tags", - input: "100 200 150 250 then 300 400 350 450", - expected: "(100,200,150,250) then (300,400,350,450)", - }, - { - name: "edge_case_zero_coordinates", + name: "zero_coordinates", input: "0 0", expected: "(0,0)", }, - { - name: "edge_case_maximum_coordinates", - input: "1000 1000", - expected: "(1000,1000)", - }, - { - name: "complex_mixed_formats", - input: "click 100 200 then drag [300, 400, 350, 450] to 500 600 550 650", - expected: "click (100,200) then drag (300,400,350,450) to (500,600,550,650)", - }, - { - name: "no_coordinates", - input: "click on button", - expected: "click on button", - }, - { - name: "empty_string", - input: "", - expected: "", - }, - { - name: "only_text_no_tags", - input: "some random text without coordinates", - expected: "some random text without coordinates", - }, - // Note: Extra spaces in brackets with 4 coords are NOT handled properly by the regex - { - name: "bracket_format_with_extra_spaces_not_converted", - input: "[ 100 , 200 , 150 , 250 ]", - expected: "[ 100 , 200 , 150 , 250 ]", // Function regex doesn't handle extra spaces - }, { name: "large_coordinates", input: "1920 1080", expected: "(1920,1080)", }, + // Multiple formats in one string { - name: "ultrawide_coordinates", - input: "0 0 3440 1440", - expected: "(0,0,3440,1440)", + name: "mixed_formats", + input: "100 200 and [300, 400, 350, 450]", + expected: "(100,200) and (300,400,350,450)", + }, + // Unsupported formats (should remain unchanged) + { + name: "bracket_2_coords_not_converted", + input: "[100, 200]", + expected: "[100, 200]", }, { - name: "real_world_action_example", - input: "Action: click(start_box='235 512')", - expected: "Action: 
click(start_box='(235,512)')", + name: "decimals_not_converted", + input: "100.5 200.7", + expected: "100.5 200.7", }, { - name: "real_world_drag_example", - input: "Action: drag(start_box='[100, 200, 150, 250]', end_box='300 400 350 450')", - expected: "Action: drag(start_box='(100,200,150,250)', end_box='(300,400,350,450)')", - }, - { - name: "real_world_example_1", - input: "235 512", - expected: "(235,512)", // Should be string format for normalizeCoordinatesFormat + name: "no_coordinates", + input: "click on button", + expected: "click on button", }, } @@ -253,141 +143,82 @@ func TestConvertRelativeToAbsolute(t *testing.T) { expectedResult float64 description string }{ + // Basic conversion tests { - name: "standard_1000x2000_x_coordinate", - size: types.Size{Width: 1000, Height: 2000}, - relativeCoord: 500, // 500/1000 * 1000 = 500 - isXCoord: true, - expectedResult: 500.0, - description: "Standard case: X coordinate conversion", - }, - { - name: "standard_1000x2000_y_coordinate", - size: types.Size{Width: 1000, Height: 2000}, - relativeCoord: 500, // 500/1000 * 2000 = 1000 - isXCoord: false, - expectedResult: 1000.0, - description: "Standard case: Y coordinate conversion", - }, - { - name: "example_from_documentation_x", + name: "standard_x_coordinate", size: types.Size{Width: 1920, Height: 1080}, - relativeCoord: 235, // round(1920*235/1000) = 451 + relativeCoord: 500, // 500/1000 * 1920 = 960 isXCoord: true, - expectedResult: 451.2, // 实际计算值为451.2,测试精确值 - description: "Documentation example: X coordinate (235, 512) on 1920x1080", + expectedResult: 960.0, + description: "Standard X coordinate conversion", }, { - name: "example_from_documentation_y", + name: "standard_y_coordinate", size: types.Size{Width: 1920, Height: 1080}, - relativeCoord: 512, // round(1080*512/1000) = 553 + relativeCoord: 500, // 500/1000 * 1080 = 540 isXCoord: false, - expectedResult: 553.0, // 实际计算值为553.0 - description: "Documentation example: Y coordinate (235, 512) on 1920x1080", + 
expectedResult: 540.0, + description: "Standard Y coordinate conversion", }, + // Mobile device tests { - name: "mobile_device_x_coordinate", + name: "mobile_x_coordinate", size: types.Size{Width: 375, Height: 812}, relativeCoord: 200, // 200/1000 * 375 = 75 isXCoord: true, expectedResult: 75.0, - description: "Mobile device: iPhone X size X coordinate", + description: "Mobile device X coordinate", }, { - name: "mobile_device_y_coordinate", + name: "mobile_y_coordinate", size: types.Size{Width: 375, Height: 812}, relativeCoord: 600, // 600/1000 * 812 = 487.2 isXCoord: false, expectedResult: 487.2, - description: "Mobile device: iPhone X size Y coordinate", + description: "Mobile device Y coordinate", }, + // Edge cases { - name: "tablet_device_x_coordinate", - size: types.Size{Width: 1024, Height: 768}, - relativeCoord: 750, // 750/1000 * 1024 = 768 - isXCoord: true, - expectedResult: 768.0, - description: "Tablet device: iPad size X coordinate", - }, - { - name: "tablet_device_y_coordinate", - size: types.Size{Width: 1024, Height: 768}, - relativeCoord: 400, // 400/1000 * 768 = 307.2 - isXCoord: false, - expectedResult: 307.2, - description: "Tablet device: iPad size Y coordinate", - }, - { - name: "edge_case_zero_coordinate", + name: "zero_coordinate", size: types.Size{Width: 1920, Height: 1080}, - relativeCoord: 0, // 0/1000 * width/height = 0 + relativeCoord: 0, isXCoord: true, expectedResult: 0.0, - description: "Edge case: Zero coordinate", + description: "Zero coordinate", }, { - name: "edge_case_maximum_coordinate_x", + name: "maximum_coordinate", size: types.Size{Width: 1920, Height: 1080}, relativeCoord: 1000, // 1000/1000 * 1920 = 1920 isXCoord: true, expectedResult: 1920.0, - description: "Edge case: Maximum X coordinate (1000 -> full width)", + description: "Maximum coordinate (1000 -> full width)", }, + // Coordinates > 1000 (normalization scenarios) { - name: "edge_case_maximum_coordinate_y", + name: "coordinate_greater_than_1000", size: 
types.Size{Width: 1920, Height: 1080}, - relativeCoord: 1000, // 1000/1000 * 1080 = 1080 - isXCoord: false, - expectedResult: 1080.0, - description: "Edge case: Maximum Y coordinate (1000 -> full height)", - }, - { - name: "rounding_precision_test_x", - size: types.Size{Width: 1000, Height: 1000}, - relativeCoord: 333, // 333/1000 * 1000 = 333 + relativeCoord: 1500, // 1500/1000 * 1920 = 2880 isXCoord: true, - expectedResult: 333.0, - description: "Precision test: X coordinate with rounding", + expectedResult: 2880.0, + description: "Coordinate > 1000: normalization test", }, { - name: "rounding_precision_test_y", - size: types.Size{Width: 1000, Height: 2000}, - relativeCoord: 750, // 750/1000 * 2000 = 1500 + name: "very_large_coordinate", + size: types.Size{Width: 1920, Height: 1080}, + relativeCoord: 2000, // 2000/1000 * 1080 = 2160 isXCoord: false, - expectedResult: 1500.0, - description: "Precision test: Y coordinate with rounding", + expectedResult: 2160.0, + description: "Very large coordinate test", }, + // High resolution test { - name: "small_screen_x_coordinate", - size: types.Size{Width: 480, Height: 800}, - relativeCoord: 125, // 125/1000 * 480 = 60 + name: "4k_resolution_large_coordinate", + size: types.Size{Width: 3840, Height: 2160}, + relativeCoord: 1500, // 1500/1000 * 3840 = 5760 isXCoord: true, - expectedResult: 60.0, - description: "Small screen: X coordinate conversion", - }, - { - name: "small_screen_y_coordinate", - size: types.Size{Width: 480, Height: 800}, - relativeCoord: 875, // 875/1000 * 800 = 700 - isXCoord: false, - expectedResult: 700.0, - description: "Small screen: Y coordinate conversion", - }, - { - name: "ultrawide_monitor_x_coordinate", - size: types.Size{Width: 3440, Height: 1440}, - relativeCoord: 450, // 450/1000 * 3440 = 1548 - isXCoord: true, - expectedResult: 1548.0, - description: "Ultrawide monitor: X coordinate conversion", - }, - { - name: "ultrawide_monitor_y_coordinate", - size: types.Size{Width: 3440, Height: 
1440}, - relativeCoord: 720, // 720/1000 * 1440 = 1036.8 - isXCoord: false, - expectedResult: 1036.8, - description: "Ultrawide monitor: Y coordinate conversion", + expectedResult: 5760.0, + description: "4K resolution with large coordinate", }, } @@ -662,153 +493,101 @@ func TestNormalizeStringCoordinates(t *testing.T) { expectError bool description string }{ + // Basic coordinate formats { name: "simple_coordinate_string", coordStr: "100,200,150,250", size: types.Size{Width: 1000, Height: 1000}, expected: []float64{100.0, 200.0, 150.0, 250.0}, - description: "Simple comma-separated coordinate string", + description: "Simple comma-separated coordinates", }, { - name: "coordinate_string_with_spaces", - coordStr: " 100 , 200 , 150 , 250 ", - size: types.Size{Width: 1000, Height: 1000}, - expected: []float64{100.0, 200.0, 150.0, 250.0}, - description: "Coordinate string with spaces", - }, - { - name: "documentation_example_point_tag", + name: "point_tag_format", coordStr: "235 512", size: types.Size{Width: 1920, Height: 1080}, expected: []float64{451.2, 553.0}, // 235/1000*1920=451.2, 512/1000*1080=553.0 - description: "Documentation example: point tag on 1920x1080", + description: "Point tag format with screen scaling", }, { - name: "documentation_example_bbox_tag", - coordStr: "235 512 451 553", + name: "bbox_tag_format", + coordStr: "100 200 150 250", size: types.Size{Width: 1920, Height: 1080}, - expected: []float64{451.2, 553.0, 865.9, 597.2}, // All converted from relative to absolute - description: "Documentation example: bbox tag on 1920x1080", + expected: []float64{192.0, 216.0, 288.0, 270.0}, // All scaled to 1920x1080 + description: "Bbox tag format with screen scaling", }, { - name: "mobile_device_point", - coordStr: "200 600", - size: types.Size{Width: 375, Height: 812}, - expected: []float64{75.0, 487.2}, // 200/1000*375=75, 600/1000*812=487.2 - description: "Mobile device: iPhone X point coordinate", - }, - { - name: "mobile_device_bbox", - coordStr: 
"200 600 400 800", - size: types.Size{Width: 375, Height: 812}, - expected: []float64{75.0, 487.2, 150.0, 649.6}, // Mobile device bbox - description: "Mobile device: iPhone X bbox coordinate", - }, - { - name: "tablet_device_coordinates", - coordStr: "[750, 400, 800, 450]", - size: types.Size{Width: 1024, Height: 768}, - expected: []float64{768.0, 307.2, 819.2, 345.6}, // Tablet coordinates - description: "Tablet device: iPad coordinate conversion", - }, - { - name: "bracket_format_two_coords", - coordStr: "[100, 200]", - size: types.Size{Width: 1000, Height: 1000}, - expected: []float64{100.0, 200.0}, - description: "Bracket format with two coordinates", - }, - { - name: "bracket_format_four_coords", + name: "bracket_format", coordStr: "[100, 200, 150, 250]", size: types.Size{Width: 1000, Height: 1000}, expected: []float64{100.0, 200.0, 150.0, 250.0}, - description: "Bracket format with four coordinates", + description: "Bracket format coordinates", }, + // Mobile device test { - name: "edge_case_zero_coordinates", + name: "mobile_device_coordinates", + coordStr: "200 600", + size: types.Size{Width: 375, Height: 812}, + expected: []float64{75.0, 487.2}, // 200/1000*375=75, 600/1000*812=487.2 + description: "Mobile device coordinate conversion", + }, + // Edge cases + { + name: "zero_coordinates", coordStr: "0,0,0,0", size: types.Size{Width: 1920, Height: 1080}, expected: []float64{0.0, 0.0, 0.0, 0.0}, - description: "Edge case: all zero coordinates", + description: "Zero coordinates", }, { - name: "edge_case_maximum_coordinates", + name: "maximum_coordinates", coordStr: "1000,1000,1000,1000", size: types.Size{Width: 1920, Height: 1080}, - expected: []float64{1920.0, 1080.0, 1920.0, 1080.0}, // Maximum relative coords -> screen edges - description: "Edge case: maximum coordinates (1000 -> screen edges)", + expected: []float64{1920.0, 1080.0, 1920.0, 1080.0}, // Maximum -> screen edges + description: "Maximum coordinates (1000 -> screen edges)", }, + // Coordinates 
> 1000 (normalization scenarios) { - name: "ultrawide_monitor_coords", - coordStr: "450 720", - size: types.Size{Width: 3440, Height: 1440}, - expected: []float64{1548.0, 1036.8}, // 450/1000*3440=1548, 720/1000*1440=1036.8 - description: "Ultrawide monitor: coordinate conversion", - }, - { - name: "small_screen_coordinates", - coordStr: "125 875 250 950", - size: types.Size{Width: 480, Height: 800}, - expected: []float64{60.0, 700.0, 120.0, 760.0}, // Small screen bbox - description: "Small screen: coordinate conversion", - }, - { - name: "real_world_example_1", - coordStr: "235 512", + name: "coordinates_greater_than_1000", + coordStr: "1200,1500,1400,1800", size: types.Size{Width: 1920, Height: 1080}, - expected: []float64{451.2, 553.0}, // Real documentation example - description: "Real world: documentation example coordinates", + expected: []float64{2304.0, 1620.0, 2688.0, 1944.0}, // Scaled up for larger screen + description: "Coordinates > 1000: scaling to larger screen", }, { - name: "real_world_example_2", - coordStr: "[375, 600, 425, 650]", - size: types.Size{Width: 1080, Height: 1920}, - expected: []float64{405.0, 1152.0, 459.0, 1248.0}, // Portrait mobile bbox - description: "Real world: portrait mobile bbox", + name: "very_large_coordinates", + coordStr: "[2000, 3000, 2500, 3500]", + size: types.Size{Width: 1920, Height: 1080}, + expected: []float64{3840.0, 3240.0, 4800.0, 3780.0}, // Very large coordinates + description: "Very large coordinates > 2000", }, - // Error cases - decimal coordinates are not supported by the regex (\d+ only matches integers) + { + name: "mixed_coordinates_boundary", + coordStr: "800,1200,1000,1500", + size: types.Size{Width: 1920, Height: 1080}, + expected: []float64{1536.0, 1296.0, 1920.0, 1620.0}, // Mixed coordinates + description: "Mixed coordinates around 1000 boundary", + }, + // Error cases { name: "empty_string", coordStr: "", size: types.Size{Width: 1000, Height: 1000}, expectError: true, - description: "Error 
case: empty string", + description: "Empty string should cause error", }, { name: "invalid_coordinate_string", coordStr: "abc,def", size: types.Size{Width: 1000, Height: 1000}, expectError: true, - description: "Error case: invalid coordinate string", + description: "Invalid coordinate string should cause error", }, { name: "insufficient_coordinates", coordStr: "100", size: types.Size{Width: 1000, Height: 1000}, expectError: true, - description: "Error case: insufficient coordinates", - }, - { - name: "invalid_bracket_format", - coordStr: "[abc, def]", - size: types.Size{Width: 1000, Height: 1000}, - expectError: true, - description: "Error case: invalid bracket format", - }, - { - name: "invalid_point_tag", - coordStr: "abc def", - size: types.Size{Width: 1000, Height: 1000}, - expectError: true, - description: "Error case: invalid point tag", - }, - { - name: "invalid_bbox_tag", - coordStr: "abc def ghi jkl", - size: types.Size{Width: 1000, Height: 1000}, - expectError: true, - description: "Error case: invalid bbox tag", + description: "Insufficient coordinates should cause error", }, } @@ -832,7 +611,7 @@ func TestNormalizeStringCoordinates(t *testing.T) { // Test normalizeActionCoordinates function func TestNormalizeActionCoordinates(t *testing.T) { - size := types.Size{Width: 1000, Height: 1000} + size := types.Size{Width: 1920, Height: 800} // Width>1000, Height<1000 for testing coordinate normalization tests := []struct { name string @@ -843,27 +622,27 @@ func TestNormalizeActionCoordinates(t *testing.T) { { name: "JSON array format - []interface{}", coordData: []interface{}{100.0, 200.0, 150.0, 250.0}, - expected: []float64{100.0, 200.0, 150.0, 250.0}, + expected: []float64{192.0, 160.0, 288.0, 200.0}, // Scaled: 100/1000*1920=192, 200/1000*800=160, etc. 
}, { name: "JSON array format with int values", coordData: []interface{}{100, 200, 150, 250}, - expected: []float64{100.0, 200.0, 150.0, 250.0}, + expected: []float64{192.0, 160.0, 288.0, 200.0}, // Scaled: 100/1000*1920=192, 200/1000*800=160, etc. }, { name: "float64 slice format", coordData: []float64{100.0, 200.0, 150.0, 250.0}, - expected: []float64{100.0, 200.0, 150.0, 250.0}, + expected: []float64{192.0, 160.0, 288.0, 200.0}, // Scaled: 100/1000*1920=192, 200/1000*800=160, etc. }, { name: "string format", coordData: "100,200,150,250", - expected: []float64{100.0, 200.0, 150.0, 250.0}, + expected: []float64{192.0, 160.0, 288.0, 200.0}, // Scaled: 100/1000*1920=192, 200/1000*800=160, etc. }, { name: "two-element coordinate", coordData: []interface{}{100.0, 200.0}, - expected: []float64{100.0, 200.0}, + expected: []float64{192.0, 160.0}, // Scaled: 100/1000*1920=192, 200/1000*800=160 }, { name: "insufficient elements in array", @@ -902,7 +681,7 @@ func TestNormalizeActionCoordinates(t *testing.T) { // Test processActionArguments function func TestProcessActionArguments(t *testing.T) { - size := types.Size{Width: 1000, Height: 1000} + size := types.Size{Width: 1920, Height: 800} // Width>1000, Height<1000 for testing coordinate normalization tests := []struct { name string @@ -911,29 +690,49 @@ func TestProcessActionArguments(t *testing.T) { expectError bool }{ { - name: "coordinate and non-coordinate parameters", + name: "basic_coordinate_and_text_parameters", rawArgs: map[string]interface{}{ "start_box": "100,200,150,250", "content": "Hello\\nWorld", }, expected: map[string]interface{}{ - "start_box": []float64{100.0, 200.0, 150.0, 250.0}, + "start_box": []float64{240.0, 180.0}, // Center point: [100,200,150,250] -> scaled coords [192,160,288,200] -> center (192+288)/2=240, (160+200)/2=180 "content": "Hello\nWorld", }, }, { - name: "multiple coordinate parameters", + name: "drag_operation_dual_coordinates", rawArgs: map[string]interface{}{ "start_box": 
"100,200,150,250", "end_box": "300,400,350,450", }, expected: map[string]interface{}{ - "start_box": []float64{100.0, 200.0, 150.0, 250.0}, - "end_box": []float64{300.0, 400.0, 350.0, 450.0}, + "start_box": []float64{240.0, 180.0}, // Center point: [100,200,150,250] -> scaled coords [192,160,288,200] -> center (192+288)/2=240, (160+200)/2=180 + "end_box": []float64{624.0, 340.0}, // Center point: [300,400,350,450] -> scaled coords [576,320,672,360] -> center (576+672)/2=624, (320+360)/2=340 }, }, { - name: "only non-coordinate parameters", + name: "coordinates_greater_than_1000", + rawArgs: map[string]interface{}{ + "start_box": "1200,1500,1400,1800", + }, + expected: map[string]interface{}{ + "start_box": []float64{2496.0, 1320.0}, // Center point: [1200,1500,1400,1800] -> scaled coords [2304,1200,2688,1440] -> center (2304+2688)/2=2496, (1200+1440)/2=1320 + }, + }, + { + name: "mixed_large_and_small_coordinates", + rawArgs: map[string]interface{}{ + "start_box": "800,1200,1000,1500", + "end_box": "1500,500,2000,800", + }, + expected: map[string]interface{}{ + "start_box": []float64{1728.0, 1080.0}, // Center point: [800,1200,1000,1500] -> scaled coords [1536,960,1920,1200] -> center (1536+1920)/2=1728, (960+1200)/2=1080 + "end_box": []float64{3360.0, 520.0}, // Center point: [1500,500,2000,800] -> scaled coords [2880,400,3840,640] -> center (2880+3840)/2=3360, (400+640)/2=520 + }, + }, + { + name: "non_coordinate_parameters_only", rawArgs: map[string]interface{}{ "content": "Hello World", "direction": "down", @@ -944,12 +743,12 @@ func TestProcessActionArguments(t *testing.T) { }, }, { - name: "empty arguments", + name: "empty_arguments", rawArgs: map[string]interface{}{}, expected: map[string]interface{}{}, }, { - name: "invalid coordinate parameter", + name: "invalid_coordinate_parameter", rawArgs: map[string]interface{}{ "start_box": "invalid", }, @@ -988,3 +787,56 @@ func TestProcessActionArguments(t *testing.T) { }) } } + +// Test new coordinate conversion 
logic +func TestNewCoordinateConversion(t *testing.T) { + parser := &UITARSContentParser{} + + // Test single start_box conversion to center point + text := "Thought: 我需要点击这个按钮\nAction: click(start_box='100,200,150,250')" + result, err := parser.Parse(text, types.Size{Height: 1000, Width: 1000}) + assert.Nil(t, err) + function := result.ToolCalls[0].Function + assert.Equal(t, function.Name, "uixt__click") + + // ActionInputs is now directly a coordinate array + var coords []float64 + err = json.Unmarshal([]byte(function.Arguments), &coords) + assert.Nil(t, err) + + // Should convert bounding box [100,200,150,250] to center point [125.0, 225.0] + assert.Equal(t, 2, len(coords)) + assert.Equal(t, 125.0, coords[0]) // (100 + 150) / 2 = 125 + assert.Equal(t, 225.0, coords[1]) // (200 + 250) / 2 = 225 + + // Test drag operation conversion to merged array + text = "Thought: 我需要拖拽元素\nAction: drag(start_box='100,200,150,250', end_box='300,400,350,450')" + result, err = parser.Parse(text, types.Size{Height: 1000, Width: 1000}) + assert.Nil(t, err) + function = result.ToolCalls[0].Function + assert.Equal(t, function.Name, "uixt__drag") + + // ActionInputs is now directly a coordinate array + err = json.Unmarshal([]byte(function.Arguments), &coords) + assert.Nil(t, err) + + // Should merge start_box and end_box center points into single array [125.0, 225.0, 325.0, 425.0] + assert.Equal(t, 4, len(coords)) + assert.Equal(t, 125.0, coords[0]) // start center x: (100 + 150) / 2 = 125 + assert.Equal(t, 225.0, coords[1]) // start center y: (200 + 250) / 2 = 225 + assert.Equal(t, 325.0, coords[2]) // end center x: (300 + 350) / 2 = 325 + assert.Equal(t, 425.0, coords[3]) // end center y: (400 + 450) / 2 = 425 + + // Test non-coordinate operation (type action) + text = "Thought: 我需要输入文本\nAction: type(content='Hello World')" + result, err = parser.Parse(text, types.Size{Height: 1000, Width: 1000}) + assert.Nil(t, err) + function = result.ToolCalls[0].Function + assert.Equal(t, 
function.Name, "uixt__type") + + // ActionInputs should be a map for non-coordinate operations + var arguments map[string]interface{} + err = json.Unmarshal([]byte(function.Arguments), &arguments) + assert.Nil(t, err) + assert.Equal(t, "Hello World", arguments["content"]) +} diff --git a/uixt/ai/parser_ui_tars.go b/uixt/ai/parser_ui_tars.go index 12778b19..a603f2f5 100644 --- a/uixt/ai/parser_ui_tars.go +++ b/uixt/ai/parser_ui_tars.go @@ -10,6 +10,7 @@ import ( "time" "github.com/cloudwego/eino/schema" + "github.com/httprunner/httprunner/v5/uixt/option" "github.com/httprunner/httprunner/v5/uixt/types" "github.com/rs/zerolog/log" ) @@ -20,7 +21,8 @@ const ( // UITARSContentParser parses the Thought/Action format response type UITARSContentParser struct { - systemPrompt string + systemPrompt string + actionMapping map[string]option.ActionName } func (p *UITARSContentParser) SystemPrompt() string { @@ -47,7 +49,7 @@ func (p *UITARSContentParser) Parse(content string, size types.Size) (*PlanningR } // Convert actions to tool calls - toolCalls := convertActionsToToolCalls(actions) + toolCalls := convertActionsToToolCalls(actions, p.actionMapping) return &PlanningResult{ ToolCalls: toolCalls, @@ -92,10 +94,15 @@ func (p *UITARSContentParser) parseActionString(actionStr string, size types.Siz return nil, err } - // Create final action + // Convert processedArgs based on action type and coordinate parameters + finalArgs, err := convertProcessedArgs(processedArgs, actionType) + if err != nil { + return nil, err + } + action := Action{ ActionType: actionType, - ActionInputs: processedArgs, + ActionInputs: finalArgs, } return []Action{action}, nil @@ -147,6 +154,18 @@ func normalizeCoordinatesFormat(text string) string { } // convertRelativeToAbsolute converts relative coordinates to absolute pixel coordinates +// The coordinate system uses a 1000x1000 relative coordinate system as the base. +// This function maps relative coordinates to actual screen resolution coordinates. 
+// +// Conversion formula: +// - For X coordinates: absolute_x = (relative_x / 1000) * screen_width +// - For Y coordinates: absolute_y = (relative_y / 1000) * screen_height +// +// Example: +// - Screen size: 1920x1080 +// - Relative coordinate: 500 (in 1000x1000 system) +// - X conversion: 500/1000 * 1920 = 960 pixels +// - Y conversion: 500/1000 * 1080 = 540 pixels func convertRelativeToAbsolute(relativeCoord float64, isXCoord bool, size types.Size) float64 { if isXCoord { return math.Round((relativeCoord/DefaultFactor*float64(size.Width))*10) / 10 @@ -204,7 +223,9 @@ func normalizeParameterName(paramName string) string { // processActionArguments processes raw arguments based on action type and parameter types // Input: rawArgs={"start_box": "100,200,150,250"} -// Output: processedArgs={"start_box": [120.5, 240.1, 180.7, 300.2]} (converted to pixels) +// Output: processedArgs={"start_box": [125.0, 225.0]} (converted to center point coordinates) +// For drag: rawArgs={"start_box": "100,200,150,250", "end_box": "300,400,350,450"} +// Output: processedArgs={"start_box": [125.0, 225.0], "end_box": [325.0, 425.0]} (both converted to center points) func processActionArguments(rawArgs map[string]interface{}, size types.Size) (map[string]interface{}, error) { processedArgs := make(map[string]interface{}) @@ -222,9 +243,9 @@ func processActionArguments(rawArgs map[string]interface{}, size types.Size) (ma // Process a single argument based on its name and value func processArgument(paramName string, paramValue interface{}, size types.Size) (interface{}, error) { - // Handle coordinate parameters + // Handle coordinate parameters - convert bounding box to center point if isCoordinateParameter(paramName) { - return normalizeActionCoordinates(paramValue, size) + return normalizeActionCoordinatesToCenterPoint(paramValue, size) } // Handle other parameter types (content, key, direction, etc.) 
@@ -236,6 +257,59 @@ func isCoordinateParameter(paramName string) bool { return strings.Contains(paramName, "box") || strings.Contains(paramName, "point") } +// convertProcessedArgs converts processed arguments into the final action inputs based on which coordinate parameters are present (actionType is currently unused) +// For single start_box: {"start_box": [125.0, 225.0]} -> option.ActionOptions{X: 125.0, Y: 225.0}.ToMap() +// For drag with start_box and end_box: {"start_box": [125.0, 225.0], "end_box": [325.0, 425.0]} -> option.ActionOptions{FromX: 125.0, FromY: 225.0, ToX: 325.0, ToY: 425.0}.ToMap() +func convertProcessedArgs(processedArgs map[string]interface{}, actionType string) (map[string]interface{}, error) { + // Look up the coordinate parameters + startBox, hasStartBox := processedArgs["start_box"] + endBox, hasEndBox := processedArgs["end_box"] + + // Check if this is a drag operation (both start_box and end_box are present) + if hasStartBox && hasEndBox { + // Drag operation: map start_box and end_box to the FromX/FromY and ToX/ToY options + startCoords, ok1 := startBox.([]float64) + endCoords, ok2 := endBox.([]float64) + + if !ok1 || !ok2 { + return nil, fmt.Errorf("invalid coordinate format for drag operation") + } + + if len(startCoords) != 2 || len(endCoords) != 2 { + return nil, fmt.Errorf("drag operation requires 2-element coordinate arrays, got start: %d, end: %d", len(startCoords), len(endCoords)) + } + + options := option.ActionOptions{ + FromX: startCoords[0], + FromY: startCoords[1], + ToX: endCoords[0], + ToY: endCoords[1], + } + return options.ToMap(), nil + } + + // For single coordinate operations, map start_box to the X/Y options + if hasStartBox { + startCoords, ok := startBox.([]float64) + if !ok { + return nil, fmt.Errorf("invalid coordinate format for single operation") + } + options := option.ActionOptions{ + X: startCoords[0], + Y: startCoords[1], + } + return options.ToMap(), nil + } + + // For non-coordinate operations, return a copy of the original arguments map + // TODO + finalArgs := make(map[string]interface{}) + for key, value := range
processedArgs { + finalArgs[key] = value + } + return finalArgs, nil +} + // normalizeActionCoordinates normalizes coordinates from various formats to actual pixel coordinates func normalizeActionCoordinates(coordData interface{}, size types.Size) ([]float64, error) { switch v := coordData.(type) { @@ -350,15 +424,39 @@ func normalizeStringCoordinates(coordStr string, size types.Size) ([]float64, er return nil, fmt.Errorf("invalid coordinate string format: %s", coordStr) } +// normalizeActionCoordinatesToCenterPoint converts bounding box coordinates to center point coordinates +// Input: "100,200,150,250" (x1,y1,x2,y2) -> Output: [125.0, 225.0] (center point in absolute pixels) +// Input: "100,200" (x,y) -> Output: [100.0, 200.0] (point in absolute pixels) +func normalizeActionCoordinatesToCenterPoint(coordData interface{}, size types.Size) ([]float64, error) { + // First normalize coordinates to get absolute pixel coordinates + coords, err := normalizeActionCoordinates(coordData, size) + if err != nil { + return nil, err + } + + // Convert bounding box to center point + if len(coords) == 4 { + // [x1, y1, x2, y2] -> [center_x, center_y] + centerX := (coords[0] + coords[2]) / 2 + centerY := (coords[1] + coords[3]) / 2 + return []float64{centerX, centerY}, nil + } else if len(coords) == 2 { + // Already a point [x, y], return as-is + return coords, nil + } else { + return nil, fmt.Errorf("invalid coordinate format: expected 2 or 4 coordinates, got %d", len(coords)) + } +} + // Action represents a parsed action with its context. 
type Action struct { - ActionType string `json:"action_type"` + ActionType string `json:"action_type"` // map to option.ActionName ActionInputs map[string]any `json:"action_inputs"` } // convertActionsToToolCalls converts actions to tool calls // This is a shared function used by both JSONContentParser and UITARSContentParser -func convertActionsToToolCalls(actions []Action) []schema.ToolCall { +func convertActionsToToolCalls(actions []Action, actionMapping map[string]option.ActionName) []schema.ToolCall { toolCalls := make([]schema.ToolCall, 0, len(actions)) for _, action := range actions { jsonArgs, err := json.Marshal(action.ActionInputs) @@ -366,11 +464,15 @@ func convertActionsToToolCalls(actions []Action) []schema.ToolCall { log.Error().Interface("action", action).Msg("failed to marshal action inputs") continue } + actionName := string(actionMapping[action.ActionType]) + if actionName == "" { + actionName = action.ActionType + } toolCalls = append(toolCalls, schema.ToolCall{ - ID: action.ActionType + "_" + strconv.FormatInt(time.Now().Unix(), 10), + ID: actionName + "_" + strconv.FormatInt(time.Now().Unix(), 10), Type: "function", Function: schema.FunctionCall{ - Name: "uixt__" + action.ActionType, + Name: "uixt__" + actionName, Arguments: string(jsonArgs), }, }) diff --git a/uixt/ai/planner.go b/uixt/ai/planner.go index 210dd19a..ead8fbed 100644 --- a/uixt/ai/planner.go +++ b/uixt/ai/planner.go @@ -2,7 +2,6 @@ package ai import ( "context" - "fmt" "time" "github.com/cloudwego/eino-ext/components/model/openai" @@ -140,20 +139,12 @@ func (p *Planner) Call(ctx context.Context, opts *PlanningOptions) (*PlanningRes Error: err.Error(), } log.Debug().Str("reason", err.Error()).Msg("parse content to actions failed") - // append assistant message - p.history.Append(&schema.Message{ - Role: schema.Assistant, - Content: message.Content, - }) - } else { - // append assistant message with tool calls - p.history.Append(&schema.Message{ - Role: schema.Tool, - Content: 
result.Content, - ToolCalls: result.ToolCalls, - ToolCallID: fmt.Sprintf("%d", time.Now().Unix()), - }) } + // append assistant message (since we're parsing content, not using native function calling) + p.history.Append(&schema.Message{ + Role: schema.Assistant, + Content: message.Content, + }) log.Info(). Interface("summary", result.ActionSummary). diff --git a/uixt/ai/planner_prompts.go b/uixt/ai/planner_prompts.go index 51007086..8b361926 100644 --- a/uixt/ai/planner_prompts.go +++ b/uixt/ai/planner_prompts.go @@ -1,5 +1,7 @@ package ai +import "github.com/httprunner/httprunner/v5/uixt/option" + // system prompt for UITARSContentParser // doubao-1.5-ui-tars on volcengine.com // https://www.volcengine.com/docs/82379/1536429 @@ -14,13 +16,12 @@ Action: ... ## Action Space click(start_box='[x1, y1, x2, y2]') -long_press(start_box='[x1, y1, x2, y2]') +left_double(start_box='[x1, y1, x2, y2]') +right_single(start_box='[x1, y1, x2, y2]') +drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]') +hotkey(key='') type(content='') #If you want to submit your input, use "\n" at the end of ` + "`content`" + `. scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left') -open_app(app_name=\'\') -drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]') -press_home() -press_back() wait() #Sleep for 5s and take a screenshot to check for any changes. finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. 
@@ -31,6 +32,18 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par ## User Instruction ` +var doubao_1_5_ui_tars_action_mapping = map[string]option.ActionName{ + "click": option.ACTION_TapXY, + "left_double": option.ACTION_DoubleTapXY, + "right_single": option.ACTION_SecondaryClick, + "drag": option.ACTION_Drag, + "hotkey": option.ACTION_KeyCode, + "type": option.ACTION_Input, + "scroll": option.ACTION_Scroll, + "wait": option.ACTION_Sleep, + "finished": option.ACTION_Finished, +} + // system prompt for UITARSContentParser // https://github.com/bytedance/UI-TARS/blob/main/codes/ui_tars/prompt.py const _ = ` diff --git a/uixt/ai/planner_test.go b/uixt/ai/planner_test.go index 8bfeb0ec..5cf9fbb5 100644 --- a/uixt/ai/planner_test.go +++ b/uixt/ai/planner_test.go @@ -29,7 +29,7 @@ func TestVLMPlanning(t *testing.T) { userInstruction += "\n\n请基于以上游戏规则,给出下一步可点击的两个图标坐标" - modelConfig, err := GetModelConfig(option.LLMServiceTypeDoubaoVL) + modelConfig, err := GetModelConfig(option.LLMServiceTypeUITARS) require.NoError(t, err) planner, err := NewPlanner(context.Background(), modelConfig) @@ -157,9 +157,9 @@ func TestHandleSwitch(t *testing.T) { imageFile string actionType string }{ - {"testdata/deepseek_think_off.png", "click"}, // 关闭状态,需要点击开启 - {"testdata/deepseek_think_on.png", "click"}, // 关闭状态,需要点击开启 - {"testdata/deepseek_network_on.png", "finished"}, // 开启状态,无需操作 + {"testdata/deepseek_think_off.png", "uixt__tap_xy"}, // 关闭状态,需要点击开启 + {"testdata/deepseek_think_on.png", "uixt__tap_xy"}, // 关闭状态,需要点击开启 + {"testdata/deepseek_network_on.png", "uixt__finished"}, // 开启状态,无需操作 } for _, tc := range testCases { diff --git a/uixt/option/action.go b/uixt/option/action.go index 8b6068c3..7c13b753 100644 --- a/uixt/option/action.go +++ b/uixt/option/action.go @@ -2,6 +2,7 @@ package option import ( "context" + "encoding/json" "fmt" "math/rand/v2" "reflect" @@ -330,6 +331,18 @@ func (o *ActionOptions) Options() []ActionOption { return options } +func 
(o *ActionOptions) ToMap() map[string]interface{} { + result := make(map[string]interface{}) + b, err := json.Marshal(o) + if err != nil { + return nil + } + if err := json.Unmarshal(b, &result); err != nil { + return nil + } + return result +} + func (o *ActionOptions) ApplyTapOffset(absX, absY float64) (float64, float64) { if len(o.TapOffset) == 2 { absX += float64(o.TapOffset[0]) diff --git a/uixt/sdk.go b/uixt/sdk.go index 08840576..5b91fc93 100644 --- a/uixt/sdk.go +++ b/uixt/sdk.go @@ -64,9 +64,10 @@ func (c *MCPClient4XTDriver) ListTools(ctx context.Context, req mcp.ListToolsReq } func (c *MCPClient4XTDriver) CallTool(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { - actionTool := c.Server.GetToolByAction(option.ActionName(req.Params.Name)) + actionName := strings.TrimPrefix(req.Params.Name, "uixt__") + actionTool := c.Server.GetToolByAction(option.ActionName(actionName)) if actionTool == nil { - return mcp.NewToolResultError(fmt.Sprintf("action %s for tool not found", req.Params.Name)), nil + return mcp.NewToolResultError(fmt.Sprintf("action %s for tool not found", actionName)), nil } handler := actionTool.Implement() return handler(ctx, req)