mirror of
https://github.com/httprunner/httprunner.git
synced 2026-06-06 08:19:45 +08:00
feat: optimize UI-TARS parser with coordinate conversion and action mapping
- Add action mapping for UI-TARS parser to convert action names to option.ActionName - Implement bounding box to center point coordinate conversion for better accuracy - Update coordinate normalization to handle coordinates > 1000 properly - Enhance test cases to verify coordinate scaling and center point conversion - Improve action argument processing with proper coordinate transformation - Add comprehensive test coverage for coordinate conversion edge cases Key improvements: - Bounding box [x1,y1,x2,y2] now converts to center point [cx,cy] for actions - Coordinate scaling properly handles different screen resolutions - Action names are mapped through doubao_1_5_ui_tars_action_mapping - Enhanced error handling for invalid coordinate formats
This commit is contained in:
@@ -21,18 +21,21 @@ func NewLLMContentParser(modelType option.LLMServiceType) LLMContentParser {
|
||||
switch modelType {
|
||||
case option.LLMServiceTypeUITARS:
|
||||
return &UITARSContentParser{
|
||||
systemPrompt: doubao_1_5_ui_tars_planning_prompt,
|
||||
systemPrompt: doubao_1_5_ui_tars_planning_prompt,
|
||||
actionMapping: doubao_1_5_ui_tars_action_mapping,
|
||||
}
|
||||
default:
|
||||
return &JSONContentParser{
|
||||
systemPrompt: defaultPlanningResponseJsonFormat,
|
||||
systemPrompt: defaultPlanningResponseJsonFormat,
|
||||
actionMapping: map[string]option.ActionName{},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// JSONContentParser parses the response as JSON string format
|
||||
type JSONContentParser struct {
|
||||
systemPrompt string
|
||||
systemPrompt string
|
||||
actionMapping map[string]option.ActionName
|
||||
}
|
||||
|
||||
func (p *JSONContentParser) SystemPrompt() string {
|
||||
@@ -83,7 +86,7 @@ func (p *JSONContentParser) Parse(content string, size types.Size) (*PlanningRes
|
||||
}
|
||||
|
||||
// Convert actions to tool calls using function from parser_ui_tars.go
|
||||
toolCalls := convertActionsToToolCalls(normalizedActions)
|
||||
toolCalls := convertActionsToToolCalls(normalizedActions, p.actionMapping)
|
||||
|
||||
return &PlanningResult{
|
||||
ToolCalls: toolCalls,
|
||||
|
||||
@@ -14,62 +14,52 @@ func TestParseActionToStructureOutput(t *testing.T) {
|
||||
result, err := parser.Parse(text, types.Size{Height: 224, Width: 224})
|
||||
assert.Nil(t, err)
|
||||
function := result.ToolCalls[0].Function
|
||||
assert.Equal(t, function.Name, "click")
|
||||
assert.Contains(t, function.Arguments, "start_box")
|
||||
assert.Equal(t, function.Name, "uixt__click")
|
||||
// ActionInputs is now directly a coordinate array
|
||||
var coords []float64
|
||||
err = json.Unmarshal([]byte(function.Arguments), &coords)
|
||||
assert.Nil(t, err)
|
||||
assert.Equal(t, 2, len(coords))
|
||||
|
||||
text = "Thought: 我看到页面上有几个帖子,第二个帖子的标题是\"字节四年,头发白了\"。要完成任务,我需要点击这个帖子下方的作者头像,这样就能进入作者的个人主页了。\nAction: click(start_point='<point>550 450 550 450</point>')"
|
||||
result, err = parser.Parse(text, types.Size{Height: 2341, Width: 1024})
|
||||
assert.Nil(t, err)
|
||||
function = result.ToolCalls[0].Function
|
||||
assert.Equal(t, function.Name, "click")
|
||||
assert.Contains(t, function.Arguments, "start_box")
|
||||
assert.Equal(t, function.Name, "uixt__click")
|
||||
// ActionInputs is now directly a coordinate array
|
||||
err = json.Unmarshal([]byte(function.Arguments), &coords)
|
||||
assert.Nil(t, err)
|
||||
assert.Equal(t, 2, len(coords))
|
||||
|
||||
// Test new bracket format
|
||||
// Test new bracket format - should convert bounding box to center point
|
||||
text = "Thought: 我需要点击这个按钮\nAction: click(start_box='[100, 200, 150, 250]')"
|
||||
result, err = parser.Parse(text, types.Size{Height: 1000, Width: 1000})
|
||||
assert.Nil(t, err)
|
||||
function = result.ToolCalls[0].Function
|
||||
assert.Equal(t, function.Name, "click")
|
||||
assert.Contains(t, function.Arguments, "start_box")
|
||||
arguments := make(map[string]interface{})
|
||||
err = json.Unmarshal([]byte(function.Arguments), &arguments)
|
||||
assert.Equal(t, function.Name, "uixt__click")
|
||||
// ActionInputs is now directly a coordinate array
|
||||
err = json.Unmarshal([]byte(function.Arguments), &coords)
|
||||
assert.Nil(t, err)
|
||||
coordsInterface := arguments["start_box"].([]interface{})
|
||||
coords := make([]float64, len(coordsInterface))
|
||||
for i, v := range coordsInterface {
|
||||
coords[i] = v.(float64)
|
||||
}
|
||||
assert.Equal(t, 4, len(coords))
|
||||
assert.Equal(t, 100.0, coords[0])
|
||||
assert.Equal(t, 200.0, coords[1])
|
||||
assert.Equal(t, 150.0, coords[2])
|
||||
assert.Equal(t, 250.0, coords[3])
|
||||
// Should be converted to center point [125, 225] from bounding box [100, 200, 150, 250]
|
||||
assert.Equal(t, 2, len(coords))
|
||||
assert.Equal(t, 125.0, coords[0]) // (100 + 150) / 2 = 125
|
||||
assert.Equal(t, 225.0, coords[1]) // (200 + 250) / 2 = 225
|
||||
|
||||
// Test drag operation with both start_box and end_box
|
||||
// Test drag operation with both start_box and end_box - should merge center points into single array
|
||||
text = "Thought: 我需要拖拽元素\nAction: drag(start_box='[100, 200, 150, 250]', end_box='[300, 400, 350, 450]')"
|
||||
result, err = parser.Parse(text, types.Size{Height: 1000, Width: 1000})
|
||||
assert.Nil(t, err)
|
||||
function = result.ToolCalls[0].Function
|
||||
assert.Equal(t, function.Name, "drag")
|
||||
assert.Contains(t, function.Arguments, "start_box")
|
||||
assert.Contains(t, function.Arguments, "end_box")
|
||||
arguments = make(map[string]interface{})
|
||||
err = json.Unmarshal([]byte(function.Arguments), &arguments)
|
||||
assert.Equal(t, function.Name, "uixt__drag")
|
||||
// ActionInputs is now directly a coordinate array
|
||||
err = json.Unmarshal([]byte(function.Arguments), &coords)
|
||||
assert.Nil(t, err)
|
||||
startCoordsInterface := arguments["start_box"].([]interface{})
|
||||
endCoordsInterface := arguments["end_box"].([]interface{})
|
||||
startCoords := make([]float64, len(startCoordsInterface))
|
||||
endCoords := make([]float64, len(endCoordsInterface))
|
||||
for i, v := range startCoordsInterface {
|
||||
startCoords[i] = v.(float64)
|
||||
}
|
||||
for i, v := range endCoordsInterface {
|
||||
endCoords[i] = v.(float64)
|
||||
}
|
||||
assert.Equal(t, 4, len(startCoords))
|
||||
assert.Equal(t, 4, len(endCoords))
|
||||
assert.Equal(t, 100.0, startCoords[0])
|
||||
assert.Equal(t, 300.0, endCoords[0])
|
||||
// Should be merged into single array [start_center_x, start_center_y, end_center_x, end_center_y]
|
||||
assert.Equal(t, 4, len(coords))
|
||||
assert.Equal(t, 125.0, coords[0]) // start center x: (100 + 150) / 2 = 125
|
||||
assert.Equal(t, 225.0, coords[1]) // start center y: (200 + 250) / 2 = 225
|
||||
assert.Equal(t, 325.0, coords[2]) // end center x: (300 + 350) / 2 = 325
|
||||
assert.Equal(t, 425.0, coords[3]) // end center y: (400 + 450) / 2 = 425
|
||||
}
|
||||
|
||||
// Test normalizeCoordinatesFormat function
|
||||
@@ -79,159 +69,59 @@ func TestNormalizeCoordinatesFormat(t *testing.T) {
|
||||
input string
|
||||
expected string
|
||||
}{
|
||||
// Basic format conversions
|
||||
{
|
||||
name: "point tag with 2 numbers",
|
||||
name: "point_tag_2_numbers",
|
||||
input: "<point>100 200</point>",
|
||||
expected: "(100,200)",
|
||||
},
|
||||
{
|
||||
name: "point tag with 4 numbers",
|
||||
name: "point_tag_4_numbers",
|
||||
input: "<point>100 200 150 250</point>",
|
||||
expected: "(100,200,150,250)",
|
||||
},
|
||||
{
|
||||
name: "bbox tag",
|
||||
name: "bbox_tag",
|
||||
input: "<bbox>100 200 150 250</bbox>",
|
||||
expected: "(100,200,150,250)",
|
||||
},
|
||||
{
|
||||
name: "bracket format with spaces",
|
||||
name: "bracket_format_4_coords",
|
||||
input: "[100, 200, 150, 250]",
|
||||
expected: "(100,200,150,250)",
|
||||
},
|
||||
// Edge cases
|
||||
{
|
||||
name: "bracket format without spaces",
|
||||
input: "[100,200,150,250]",
|
||||
expected: "(100,200,150,250)",
|
||||
},
|
||||
{
|
||||
name: "bracket format with irregular spaces",
|
||||
input: "[100, 200, 150, 250]",
|
||||
expected: "(100,200,150,250)",
|
||||
},
|
||||
{
|
||||
name: "multiple point tags",
|
||||
input: "<point>100 200</point> and <point>300 400</point>",
|
||||
expected: "(100,200) and (300,400)",
|
||||
},
|
||||
{
|
||||
name: "mixed formats",
|
||||
input: "<point>100 200</point> and [300, 400, 350, 450]",
|
||||
expected: "(100,200) and (300,400,350,450)",
|
||||
},
|
||||
{
|
||||
name: "documentation_example_coordinates",
|
||||
input: "<point>235 512</point>",
|
||||
expected: "(235,512)",
|
||||
},
|
||||
{
|
||||
name: "documentation_example_bbox",
|
||||
input: "<bbox>235 512 451 553</bbox>",
|
||||
expected: "(235,512,451,553)",
|
||||
},
|
||||
{
|
||||
name: "mobile_coordinates_point",
|
||||
input: "<point>200 600</point>",
|
||||
expected: "(200,600)",
|
||||
},
|
||||
{
|
||||
name: "tablet_coordinates_bbox",
|
||||
input: "<bbox>750 400 800 450</bbox>",
|
||||
expected: "(750,400,800,450)",
|
||||
},
|
||||
// Note: Bracket format with 2 coordinates is NOT supported by the function
|
||||
// Only 4-coordinate bracket format is supported
|
||||
{
|
||||
name: "bracket_format_two_coordinates_not_converted",
|
||||
input: "[100, 200]",
|
||||
expected: "[100, 200]", // Function doesn't convert this format
|
||||
},
|
||||
// Note: Decimal coordinates are NOT supported by the regex (only \d+ is matched)
|
||||
{
|
||||
name: "point_tag_with_decimals_not_converted",
|
||||
input: "<point>100.5 200.7</point>",
|
||||
expected: "<point>100.5 200.7</point>", // Function doesn't convert decimals
|
||||
},
|
||||
{
|
||||
name: "bbox_tag_with_decimals_not_converted",
|
||||
input: "<bbox>100.5 200.7 150.3 250.9</bbox>",
|
||||
expected: "<bbox>100.5 200.7 150.3 250.9</bbox>", // Function doesn't convert decimals
|
||||
},
|
||||
{
|
||||
name: "bracket_format_with_decimals_not_converted",
|
||||
input: "[100.5, 200.7, 150.3, 250.9]",
|
||||
expected: "[100.5, 200.7, 150.3, 250.9]", // Function doesn't convert decimals
|
||||
},
|
||||
{
|
||||
name: "multiple_bracket_formats",
|
||||
input: "[100, 200] and [300, 400, 350, 450]",
|
||||
expected: "[100, 200] and (300,400,350,450)", // Only 4-coord format converted
|
||||
},
|
||||
{
|
||||
name: "multiple_bbox_tags",
|
||||
input: "<bbox>100 200 150 250</bbox> then <bbox>300 400 350 450</bbox>",
|
||||
expected: "(100,200,150,250) then (300,400,350,450)",
|
||||
},
|
||||
{
|
||||
name: "edge_case_zero_coordinates",
|
||||
name: "zero_coordinates",
|
||||
input: "<point>0 0</point>",
|
||||
expected: "(0,0)",
|
||||
},
|
||||
{
|
||||
name: "edge_case_maximum_coordinates",
|
||||
input: "<point>1000 1000</point>",
|
||||
expected: "(1000,1000)",
|
||||
},
|
||||
{
|
||||
name: "complex_mixed_formats",
|
||||
input: "click <point>100 200</point> then drag [300, 400, 350, 450] to <bbox>500 600 550 650</bbox>",
|
||||
expected: "click (100,200) then drag (300,400,350,450) to (500,600,550,650)",
|
||||
},
|
||||
{
|
||||
name: "no_coordinates",
|
||||
input: "click on button",
|
||||
expected: "click on button",
|
||||
},
|
||||
{
|
||||
name: "empty_string",
|
||||
input: "",
|
||||
expected: "",
|
||||
},
|
||||
{
|
||||
name: "only_text_no_tags",
|
||||
input: "some random text without coordinates",
|
||||
expected: "some random text without coordinates",
|
||||
},
|
||||
// Note: Extra spaces in brackets with 4 coords are NOT handled properly by the regex
|
||||
{
|
||||
name: "bracket_format_with_extra_spaces_not_converted",
|
||||
input: "[ 100 , 200 , 150 , 250 ]",
|
||||
expected: "[ 100 , 200 , 150 , 250 ]", // Function regex doesn't handle extra spaces
|
||||
},
|
||||
{
|
||||
name: "large_coordinates",
|
||||
input: "<point>1920 1080</point>",
|
||||
expected: "(1920,1080)",
|
||||
},
|
||||
// Multiple formats in one string
|
||||
{
|
||||
name: "ultrawide_coordinates",
|
||||
input: "<bbox>0 0 3440 1440</bbox>",
|
||||
expected: "(0,0,3440,1440)",
|
||||
name: "mixed_formats",
|
||||
input: "<point>100 200</point> and [300, 400, 350, 450]",
|
||||
expected: "(100,200) and (300,400,350,450)",
|
||||
},
|
||||
// Unsupported formats (should remain unchanged)
|
||||
{
|
||||
name: "bracket_2_coords_not_converted",
|
||||
input: "[100, 200]",
|
||||
expected: "[100, 200]",
|
||||
},
|
||||
{
|
||||
name: "real_world_action_example",
|
||||
input: "Action: click(start_box='<point>235 512</point>')",
|
||||
expected: "Action: click(start_box='(235,512)')",
|
||||
name: "decimals_not_converted",
|
||||
input: "<point>100.5 200.7</point>",
|
||||
expected: "<point>100.5 200.7</point>",
|
||||
},
|
||||
{
|
||||
name: "real_world_drag_example",
|
||||
input: "Action: drag(start_box='[100, 200, 150, 250]', end_box='<bbox>300 400 350 450</bbox>')",
|
||||
expected: "Action: drag(start_box='(100,200,150,250)', end_box='(300,400,350,450)')",
|
||||
},
|
||||
{
|
||||
name: "real_world_example_1",
|
||||
input: "<point>235 512</point>",
|
||||
expected: "(235,512)", // Should be string format for normalizeCoordinatesFormat
|
||||
name: "no_coordinates",
|
||||
input: "click on button",
|
||||
expected: "click on button",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -253,141 +143,82 @@ func TestConvertRelativeToAbsolute(t *testing.T) {
|
||||
expectedResult float64
|
||||
description string
|
||||
}{
|
||||
// Basic conversion tests
|
||||
{
|
||||
name: "standard_1000x2000_x_coordinate",
|
||||
size: types.Size{Width: 1000, Height: 2000},
|
||||
relativeCoord: 500, // 500/1000 * 1000 = 500
|
||||
isXCoord: true,
|
||||
expectedResult: 500.0,
|
||||
description: "Standard case: X coordinate conversion",
|
||||
},
|
||||
{
|
||||
name: "standard_1000x2000_y_coordinate",
|
||||
size: types.Size{Width: 1000, Height: 2000},
|
||||
relativeCoord: 500, // 500/1000 * 2000 = 1000
|
||||
isXCoord: false,
|
||||
expectedResult: 1000.0,
|
||||
description: "Standard case: Y coordinate conversion",
|
||||
},
|
||||
{
|
||||
name: "example_from_documentation_x",
|
||||
name: "standard_x_coordinate",
|
||||
size: types.Size{Width: 1920, Height: 1080},
|
||||
relativeCoord: 235, // round(1920*235/1000) = 451
|
||||
relativeCoord: 500, // 500/1000 * 1920 = 960
|
||||
isXCoord: true,
|
||||
expectedResult: 451.2, // 实际计算值为451.2,测试精确值
|
||||
description: "Documentation example: X coordinate (235, 512) on 1920x1080",
|
||||
expectedResult: 960.0,
|
||||
description: "Standard X coordinate conversion",
|
||||
},
|
||||
{
|
||||
name: "example_from_documentation_y",
|
||||
name: "standard_y_coordinate",
|
||||
size: types.Size{Width: 1920, Height: 1080},
|
||||
relativeCoord: 512, // round(1080*512/1000) = 553
|
||||
relativeCoord: 500, // 500/1000 * 1080 = 540
|
||||
isXCoord: false,
|
||||
expectedResult: 553.0, // 实际计算值为553.0
|
||||
description: "Documentation example: Y coordinate (235, 512) on 1920x1080",
|
||||
expectedResult: 540.0,
|
||||
description: "Standard Y coordinate conversion",
|
||||
},
|
||||
// Mobile device tests
|
||||
{
|
||||
name: "mobile_device_x_coordinate",
|
||||
name: "mobile_x_coordinate",
|
||||
size: types.Size{Width: 375, Height: 812},
|
||||
relativeCoord: 200, // 200/1000 * 375 = 75
|
||||
isXCoord: true,
|
||||
expectedResult: 75.0,
|
||||
description: "Mobile device: iPhone X size X coordinate",
|
||||
description: "Mobile device X coordinate",
|
||||
},
|
||||
{
|
||||
name: "mobile_device_y_coordinate",
|
||||
name: "mobile_y_coordinate",
|
||||
size: types.Size{Width: 375, Height: 812},
|
||||
relativeCoord: 600, // 600/1000 * 812 = 487.2
|
||||
isXCoord: false,
|
||||
expectedResult: 487.2,
|
||||
description: "Mobile device: iPhone X size Y coordinate",
|
||||
description: "Mobile device Y coordinate",
|
||||
},
|
||||
// Edge cases
|
||||
{
|
||||
name: "tablet_device_x_coordinate",
|
||||
size: types.Size{Width: 1024, Height: 768},
|
||||
relativeCoord: 750, // 750/1000 * 1024 = 768
|
||||
isXCoord: true,
|
||||
expectedResult: 768.0,
|
||||
description: "Tablet device: iPad size X coordinate",
|
||||
},
|
||||
{
|
||||
name: "tablet_device_y_coordinate",
|
||||
size: types.Size{Width: 1024, Height: 768},
|
||||
relativeCoord: 400, // 400/1000 * 768 = 307.2
|
||||
isXCoord: false,
|
||||
expectedResult: 307.2,
|
||||
description: "Tablet device: iPad size Y coordinate",
|
||||
},
|
||||
{
|
||||
name: "edge_case_zero_coordinate",
|
||||
name: "zero_coordinate",
|
||||
size: types.Size{Width: 1920, Height: 1080},
|
||||
relativeCoord: 0, // 0/1000 * width/height = 0
|
||||
relativeCoord: 0,
|
||||
isXCoord: true,
|
||||
expectedResult: 0.0,
|
||||
description: "Edge case: Zero coordinate",
|
||||
description: "Zero coordinate",
|
||||
},
|
||||
{
|
||||
name: "edge_case_maximum_coordinate_x",
|
||||
name: "maximum_coordinate",
|
||||
size: types.Size{Width: 1920, Height: 1080},
|
||||
relativeCoord: 1000, // 1000/1000 * 1920 = 1920
|
||||
isXCoord: true,
|
||||
expectedResult: 1920.0,
|
||||
description: "Edge case: Maximum X coordinate (1000 -> full width)",
|
||||
description: "Maximum coordinate (1000 -> full width)",
|
||||
},
|
||||
// Coordinates > 1000 (normalization scenarios)
|
||||
{
|
||||
name: "edge_case_maximum_coordinate_y",
|
||||
name: "coordinate_greater_than_1000",
|
||||
size: types.Size{Width: 1920, Height: 1080},
|
||||
relativeCoord: 1000, // 1000/1000 * 1080 = 1080
|
||||
isXCoord: false,
|
||||
expectedResult: 1080.0,
|
||||
description: "Edge case: Maximum Y coordinate (1000 -> full height)",
|
||||
},
|
||||
{
|
||||
name: "rounding_precision_test_x",
|
||||
size: types.Size{Width: 1000, Height: 1000},
|
||||
relativeCoord: 333, // 333/1000 * 1000 = 333
|
||||
relativeCoord: 1500, // 1500/1000 * 1920 = 2880
|
||||
isXCoord: true,
|
||||
expectedResult: 333.0,
|
||||
description: "Precision test: X coordinate with rounding",
|
||||
expectedResult: 2880.0,
|
||||
description: "Coordinate > 1000: normalization test",
|
||||
},
|
||||
{
|
||||
name: "rounding_precision_test_y",
|
||||
size: types.Size{Width: 1000, Height: 2000},
|
||||
relativeCoord: 750, // 750/1000 * 2000 = 1500
|
||||
name: "very_large_coordinate",
|
||||
size: types.Size{Width: 1920, Height: 1080},
|
||||
relativeCoord: 2000, // 2000/1000 * 1080 = 2160
|
||||
isXCoord: false,
|
||||
expectedResult: 1500.0,
|
||||
description: "Precision test: Y coordinate with rounding",
|
||||
expectedResult: 2160.0,
|
||||
description: "Very large coordinate test",
|
||||
},
|
||||
// High resolution test
|
||||
{
|
||||
name: "small_screen_x_coordinate",
|
||||
size: types.Size{Width: 480, Height: 800},
|
||||
relativeCoord: 125, // 125/1000 * 480 = 60
|
||||
name: "4k_resolution_large_coordinate",
|
||||
size: types.Size{Width: 3840, Height: 2160},
|
||||
relativeCoord: 1500, // 1500/1000 * 3840 = 5760
|
||||
isXCoord: true,
|
||||
expectedResult: 60.0,
|
||||
description: "Small screen: X coordinate conversion",
|
||||
},
|
||||
{
|
||||
name: "small_screen_y_coordinate",
|
||||
size: types.Size{Width: 480, Height: 800},
|
||||
relativeCoord: 875, // 875/1000 * 800 = 700
|
||||
isXCoord: false,
|
||||
expectedResult: 700.0,
|
||||
description: "Small screen: Y coordinate conversion",
|
||||
},
|
||||
{
|
||||
name: "ultrawide_monitor_x_coordinate",
|
||||
size: types.Size{Width: 3440, Height: 1440},
|
||||
relativeCoord: 450, // 450/1000 * 3440 = 1548
|
||||
isXCoord: true,
|
||||
expectedResult: 1548.0,
|
||||
description: "Ultrawide monitor: X coordinate conversion",
|
||||
},
|
||||
{
|
||||
name: "ultrawide_monitor_y_coordinate",
|
||||
size: types.Size{Width: 3440, Height: 1440},
|
||||
relativeCoord: 720, // 720/1000 * 1440 = 1036.8
|
||||
isXCoord: false,
|
||||
expectedResult: 1036.8,
|
||||
description: "Ultrawide monitor: Y coordinate conversion",
|
||||
expectedResult: 5760.0,
|
||||
description: "4K resolution with large coordinate",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -662,153 +493,101 @@ func TestNormalizeStringCoordinates(t *testing.T) {
|
||||
expectError bool
|
||||
description string
|
||||
}{
|
||||
// Basic coordinate formats
|
||||
{
|
||||
name: "simple_coordinate_string",
|
||||
coordStr: "100,200,150,250",
|
||||
size: types.Size{Width: 1000, Height: 1000},
|
||||
expected: []float64{100.0, 200.0, 150.0, 250.0},
|
||||
description: "Simple comma-separated coordinate string",
|
||||
description: "Simple comma-separated coordinates",
|
||||
},
|
||||
{
|
||||
name: "coordinate_string_with_spaces",
|
||||
coordStr: " 100 , 200 , 150 , 250 ",
|
||||
size: types.Size{Width: 1000, Height: 1000},
|
||||
expected: []float64{100.0, 200.0, 150.0, 250.0},
|
||||
description: "Coordinate string with spaces",
|
||||
},
|
||||
{
|
||||
name: "documentation_example_point_tag",
|
||||
name: "point_tag_format",
|
||||
coordStr: "<point>235 512</point>",
|
||||
size: types.Size{Width: 1920, Height: 1080},
|
||||
expected: []float64{451.2, 553.0}, // 235/1000*1920=451.2, 512/1000*1080=553.0
|
||||
description: "Documentation example: point tag on 1920x1080",
|
||||
description: "Point tag format with screen scaling",
|
||||
},
|
||||
{
|
||||
name: "documentation_example_bbox_tag",
|
||||
coordStr: "<bbox>235 512 451 553</bbox>",
|
||||
name: "bbox_tag_format",
|
||||
coordStr: "<bbox>100 200 150 250</bbox>",
|
||||
size: types.Size{Width: 1920, Height: 1080},
|
||||
expected: []float64{451.2, 553.0, 865.9, 597.2}, // All converted from relative to absolute
|
||||
description: "Documentation example: bbox tag on 1920x1080",
|
||||
expected: []float64{192.0, 216.0, 288.0, 270.0}, // All scaled to 1920x1080
|
||||
description: "Bbox tag format with screen scaling",
|
||||
},
|
||||
{
|
||||
name: "mobile_device_point",
|
||||
coordStr: "<point>200 600</point>",
|
||||
size: types.Size{Width: 375, Height: 812},
|
||||
expected: []float64{75.0, 487.2}, // 200/1000*375=75, 600/1000*812=487.2
|
||||
description: "Mobile device: iPhone X point coordinate",
|
||||
},
|
||||
{
|
||||
name: "mobile_device_bbox",
|
||||
coordStr: "<bbox>200 600 400 800</bbox>",
|
||||
size: types.Size{Width: 375, Height: 812},
|
||||
expected: []float64{75.0, 487.2, 150.0, 649.6}, // Mobile device bbox
|
||||
description: "Mobile device: iPhone X bbox coordinate",
|
||||
},
|
||||
{
|
||||
name: "tablet_device_coordinates",
|
||||
coordStr: "[750, 400, 800, 450]",
|
||||
size: types.Size{Width: 1024, Height: 768},
|
||||
expected: []float64{768.0, 307.2, 819.2, 345.6}, // Tablet coordinates
|
||||
description: "Tablet device: iPad coordinate conversion",
|
||||
},
|
||||
{
|
||||
name: "bracket_format_two_coords",
|
||||
coordStr: "[100, 200]",
|
||||
size: types.Size{Width: 1000, Height: 1000},
|
||||
expected: []float64{100.0, 200.0},
|
||||
description: "Bracket format with two coordinates",
|
||||
},
|
||||
{
|
||||
name: "bracket_format_four_coords",
|
||||
name: "bracket_format",
|
||||
coordStr: "[100, 200, 150, 250]",
|
||||
size: types.Size{Width: 1000, Height: 1000},
|
||||
expected: []float64{100.0, 200.0, 150.0, 250.0},
|
||||
description: "Bracket format with four coordinates",
|
||||
description: "Bracket format coordinates",
|
||||
},
|
||||
// Mobile device test
|
||||
{
|
||||
name: "edge_case_zero_coordinates",
|
||||
name: "mobile_device_coordinates",
|
||||
coordStr: "<point>200 600</point>",
|
||||
size: types.Size{Width: 375, Height: 812},
|
||||
expected: []float64{75.0, 487.2}, // 200/1000*375=75, 600/1000*812=487.2
|
||||
description: "Mobile device coordinate conversion",
|
||||
},
|
||||
// Edge cases
|
||||
{
|
||||
name: "zero_coordinates",
|
||||
coordStr: "0,0,0,0",
|
||||
size: types.Size{Width: 1920, Height: 1080},
|
||||
expected: []float64{0.0, 0.0, 0.0, 0.0},
|
||||
description: "Edge case: all zero coordinates",
|
||||
description: "Zero coordinates",
|
||||
},
|
||||
{
|
||||
name: "edge_case_maximum_coordinates",
|
||||
name: "maximum_coordinates",
|
||||
coordStr: "1000,1000,1000,1000",
|
||||
size: types.Size{Width: 1920, Height: 1080},
|
||||
expected: []float64{1920.0, 1080.0, 1920.0, 1080.0}, // Maximum relative coords -> screen edges
|
||||
description: "Edge case: maximum coordinates (1000 -> screen edges)",
|
||||
expected: []float64{1920.0, 1080.0, 1920.0, 1080.0}, // Maximum -> screen edges
|
||||
description: "Maximum coordinates (1000 -> screen edges)",
|
||||
},
|
||||
// Coordinates > 1000 (normalization scenarios)
|
||||
{
|
||||
name: "ultrawide_monitor_coords",
|
||||
coordStr: "<point>450 720</point>",
|
||||
size: types.Size{Width: 3440, Height: 1440},
|
||||
expected: []float64{1548.0, 1036.8}, // 450/1000*3440=1548, 720/1000*1440=1036.8
|
||||
description: "Ultrawide monitor: coordinate conversion",
|
||||
},
|
||||
{
|
||||
name: "small_screen_coordinates",
|
||||
coordStr: "<bbox>125 875 250 950</bbox>",
|
||||
size: types.Size{Width: 480, Height: 800},
|
||||
expected: []float64{60.0, 700.0, 120.0, 760.0}, // Small screen bbox
|
||||
description: "Small screen: coordinate conversion",
|
||||
},
|
||||
{
|
||||
name: "real_world_example_1",
|
||||
coordStr: "<point>235 512</point>",
|
||||
name: "coordinates_greater_than_1000",
|
||||
coordStr: "1200,1500,1400,1800",
|
||||
size: types.Size{Width: 1920, Height: 1080},
|
||||
expected: []float64{451.2, 553.0}, // Real documentation example
|
||||
description: "Real world: documentation example coordinates",
|
||||
expected: []float64{2304.0, 1620.0, 2688.0, 1944.0}, // Scaled up for larger screen
|
||||
description: "Coordinates > 1000: scaling to larger screen",
|
||||
},
|
||||
{
|
||||
name: "real_world_example_2",
|
||||
coordStr: "[375, 600, 425, 650]",
|
||||
size: types.Size{Width: 1080, Height: 1920},
|
||||
expected: []float64{405.0, 1152.0, 459.0, 1248.0}, // Portrait mobile bbox
|
||||
description: "Real world: portrait mobile bbox",
|
||||
name: "very_large_coordinates",
|
||||
coordStr: "[2000, 3000, 2500, 3500]",
|
||||
size: types.Size{Width: 1920, Height: 1080},
|
||||
expected: []float64{3840.0, 3240.0, 4800.0, 3780.0}, // Very large coordinates
|
||||
description: "Very large coordinates > 2000",
|
||||
},
|
||||
// Error cases - decimal coordinates are not supported by the regex (\d+ only matches integers)
|
||||
{
|
||||
name: "mixed_coordinates_boundary",
|
||||
coordStr: "800,1200,1000,1500",
|
||||
size: types.Size{Width: 1920, Height: 1080},
|
||||
expected: []float64{1536.0, 1296.0, 1920.0, 1620.0}, // Mixed coordinates
|
||||
description: "Mixed coordinates around 1000 boundary",
|
||||
},
|
||||
// Error cases
|
||||
{
|
||||
name: "empty_string",
|
||||
coordStr: "",
|
||||
size: types.Size{Width: 1000, Height: 1000},
|
||||
expectError: true,
|
||||
description: "Error case: empty string",
|
||||
description: "Empty string should cause error",
|
||||
},
|
||||
{
|
||||
name: "invalid_coordinate_string",
|
||||
coordStr: "abc,def",
|
||||
size: types.Size{Width: 1000, Height: 1000},
|
||||
expectError: true,
|
||||
description: "Error case: invalid coordinate string",
|
||||
description: "Invalid coordinate string should cause error",
|
||||
},
|
||||
{
|
||||
name: "insufficient_coordinates",
|
||||
coordStr: "100",
|
||||
size: types.Size{Width: 1000, Height: 1000},
|
||||
expectError: true,
|
||||
description: "Error case: insufficient coordinates",
|
||||
},
|
||||
{
|
||||
name: "invalid_bracket_format",
|
||||
coordStr: "[abc, def]",
|
||||
size: types.Size{Width: 1000, Height: 1000},
|
||||
expectError: true,
|
||||
description: "Error case: invalid bracket format",
|
||||
},
|
||||
{
|
||||
name: "invalid_point_tag",
|
||||
coordStr: "<point>abc def</point>",
|
||||
size: types.Size{Width: 1000, Height: 1000},
|
||||
expectError: true,
|
||||
description: "Error case: invalid point tag",
|
||||
},
|
||||
{
|
||||
name: "invalid_bbox_tag",
|
||||
coordStr: "<bbox>abc def ghi jkl</bbox>",
|
||||
size: types.Size{Width: 1000, Height: 1000},
|
||||
expectError: true,
|
||||
description: "Error case: invalid bbox tag",
|
||||
description: "Insufficient coordinates should cause error",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -832,7 +611,7 @@ func TestNormalizeStringCoordinates(t *testing.T) {
|
||||
|
||||
// Test normalizeActionCoordinates function
|
||||
func TestNormalizeActionCoordinates(t *testing.T) {
|
||||
size := types.Size{Width: 1000, Height: 1000}
|
||||
size := types.Size{Width: 1920, Height: 800} // Width>1000, Height<1000 for testing coordinate normalization
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
@@ -843,27 +622,27 @@ func TestNormalizeActionCoordinates(t *testing.T) {
|
||||
{
|
||||
name: "JSON array format - []interface{}",
|
||||
coordData: []interface{}{100.0, 200.0, 150.0, 250.0},
|
||||
expected: []float64{100.0, 200.0, 150.0, 250.0},
|
||||
expected: []float64{192.0, 160.0, 288.0, 200.0}, // Scaled: 100/1000*1920=192, 200/1000*800=160, etc.
|
||||
},
|
||||
{
|
||||
name: "JSON array format with int values",
|
||||
coordData: []interface{}{100, 200, 150, 250},
|
||||
expected: []float64{100.0, 200.0, 150.0, 250.0},
|
||||
expected: []float64{192.0, 160.0, 288.0, 200.0}, // Scaled: 100/1000*1920=192, 200/1000*800=160, etc.
|
||||
},
|
||||
{
|
||||
name: "float64 slice format",
|
||||
coordData: []float64{100.0, 200.0, 150.0, 250.0},
|
||||
expected: []float64{100.0, 200.0, 150.0, 250.0},
|
||||
expected: []float64{192.0, 160.0, 288.0, 200.0}, // Scaled: 100/1000*1920=192, 200/1000*800=160, etc.
|
||||
},
|
||||
{
|
||||
name: "string format",
|
||||
coordData: "100,200,150,250",
|
||||
expected: []float64{100.0, 200.0, 150.0, 250.0},
|
||||
expected: []float64{192.0, 160.0, 288.0, 200.0}, // Scaled: 100/1000*1920=192, 200/1000*800=160, etc.
|
||||
},
|
||||
{
|
||||
name: "two-element coordinate",
|
||||
coordData: []interface{}{100.0, 200.0},
|
||||
expected: []float64{100.0, 200.0},
|
||||
expected: []float64{192.0, 160.0}, // Scaled: 100/1000*1920=192, 200/1000*800=160
|
||||
},
|
||||
{
|
||||
name: "insufficient elements in array",
|
||||
@@ -902,7 +681,7 @@ func TestNormalizeActionCoordinates(t *testing.T) {
|
||||
|
||||
// Test processActionArguments function
|
||||
func TestProcessActionArguments(t *testing.T) {
|
||||
size := types.Size{Width: 1000, Height: 1000}
|
||||
size := types.Size{Width: 1920, Height: 800} // Width>1000, Height<1000 for testing coordinate normalization
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
@@ -911,29 +690,49 @@ func TestProcessActionArguments(t *testing.T) {
|
||||
expectError bool
|
||||
}{
|
||||
{
|
||||
name: "coordinate and non-coordinate parameters",
|
||||
name: "basic_coordinate_and_text_parameters",
|
||||
rawArgs: map[string]interface{}{
|
||||
"start_box": "100,200,150,250",
|
||||
"content": "Hello\\nWorld",
|
||||
},
|
||||
expected: map[string]interface{}{
|
||||
"start_box": []float64{100.0, 200.0, 150.0, 250.0},
|
||||
"start_box": []float64{240.0, 180.0}, // Center point: [100,200,150,250] -> scaled coords [192,160,288,200] -> center (192+288)/2=240, (160+200)/2=180
|
||||
"content": "Hello\nWorld",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "multiple coordinate parameters",
|
||||
name: "drag_operation_dual_coordinates",
|
||||
rawArgs: map[string]interface{}{
|
||||
"start_box": "100,200,150,250",
|
||||
"end_box": "300,400,350,450",
|
||||
},
|
||||
expected: map[string]interface{}{
|
||||
"start_box": []float64{100.0, 200.0, 150.0, 250.0},
|
||||
"end_box": []float64{300.0, 400.0, 350.0, 450.0},
|
||||
"start_box": []float64{240.0, 180.0}, // Center point: [100,200,150,250] -> scaled coords [192,160,288,200] -> center (192+288)/2=240, (160+200)/2=180
|
||||
"end_box": []float64{624.0, 340.0}, // Center point: [300,400,350,450] -> scaled coords [576,320,672,360] -> center (576+672)/2=624, (320+360)/2=340
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "only non-coordinate parameters",
|
||||
name: "coordinates_greater_than_1000",
|
||||
rawArgs: map[string]interface{}{
|
||||
"start_box": "1200,1500,1400,1800",
|
||||
},
|
||||
expected: map[string]interface{}{
|
||||
"start_box": []float64{2496.0, 1320.0}, // Center point: [1200,1500,1400,1800] -> scaled coords [2304,1200,2688,1440] -> center (2304+2688)/2=2496, (1200+1440)/2=1320
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "mixed_large_and_small_coordinates",
|
||||
rawArgs: map[string]interface{}{
|
||||
"start_box": "800,1200,1000,1500",
|
||||
"end_box": "1500,500,2000,800",
|
||||
},
|
||||
expected: map[string]interface{}{
|
||||
"start_box": []float64{1728.0, 1080.0}, // Center point: [800,1200,1000,1500] -> scaled coords [1536,960,1920,1200] -> center (1536+1920)/2=1728, (960+1200)/2=1080
|
||||
"end_box": []float64{3360.0, 520.0}, // Center point: [1500,500,2000,800] -> scaled coords [2880,400,3840,640] -> center (2880+3840)/2=3360, (400+640)/2=520
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "non_coordinate_parameters_only",
|
||||
rawArgs: map[string]interface{}{
|
||||
"content": "Hello World",
|
||||
"direction": "down",
|
||||
@@ -944,12 +743,12 @@ func TestProcessActionArguments(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "empty arguments",
|
||||
name: "empty_arguments",
|
||||
rawArgs: map[string]interface{}{},
|
||||
expected: map[string]interface{}{},
|
||||
},
|
||||
{
|
||||
name: "invalid coordinate parameter",
|
||||
name: "invalid_coordinate_parameter",
|
||||
rawArgs: map[string]interface{}{
|
||||
"start_box": "invalid",
|
||||
},
|
||||
@@ -988,3 +787,56 @@ func TestProcessActionArguments(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Test new coordinate conversion logic
|
||||
func TestNewCoordinateConversion(t *testing.T) {
|
||||
parser := &UITARSContentParser{}
|
||||
|
||||
// Test single start_box conversion to center point
|
||||
text := "Thought: 我需要点击这个按钮\nAction: click(start_box='100,200,150,250')"
|
||||
result, err := parser.Parse(text, types.Size{Height: 1000, Width: 1000})
|
||||
assert.Nil(t, err)
|
||||
function := result.ToolCalls[0].Function
|
||||
assert.Equal(t, function.Name, "uixt__click")
|
||||
|
||||
// ActionInputs is now directly a coordinate array
|
||||
var coords []float64
|
||||
err = json.Unmarshal([]byte(function.Arguments), &coords)
|
||||
assert.Nil(t, err)
|
||||
|
||||
// Should convert bounding box [100,200,150,250] to center point [125.0, 225.0]
|
||||
assert.Equal(t, 2, len(coords))
|
||||
assert.Equal(t, 125.0, coords[0]) // (100 + 150) / 2 = 125
|
||||
assert.Equal(t, 225.0, coords[1]) // (200 + 250) / 2 = 225
|
||||
|
||||
// Test drag operation conversion to merged array
|
||||
text = "Thought: 我需要拖拽元素\nAction: drag(start_box='100,200,150,250', end_box='300,400,350,450')"
|
||||
result, err = parser.Parse(text, types.Size{Height: 1000, Width: 1000})
|
||||
assert.Nil(t, err)
|
||||
function = result.ToolCalls[0].Function
|
||||
assert.Equal(t, function.Name, "uixt__drag")
|
||||
|
||||
// ActionInputs is now directly a coordinate array
|
||||
err = json.Unmarshal([]byte(function.Arguments), &coords)
|
||||
assert.Nil(t, err)
|
||||
|
||||
// Should merge start_box and end_box center points into single array [125.0, 225.0, 325.0, 425.0]
|
||||
assert.Equal(t, 4, len(coords))
|
||||
assert.Equal(t, 125.0, coords[0]) // start center x: (100 + 150) / 2 = 125
|
||||
assert.Equal(t, 225.0, coords[1]) // start center y: (200 + 250) / 2 = 225
|
||||
assert.Equal(t, 325.0, coords[2]) // end center x: (300 + 350) / 2 = 325
|
||||
assert.Equal(t, 425.0, coords[3]) // end center y: (400 + 450) / 2 = 425
|
||||
|
||||
// Test non-coordinate operation (type action)
|
||||
text = "Thought: 我需要输入文本\nAction: type(content='Hello World')"
|
||||
result, err = parser.Parse(text, types.Size{Height: 1000, Width: 1000})
|
||||
assert.Nil(t, err)
|
||||
function = result.ToolCalls[0].Function
|
||||
assert.Equal(t, function.Name, "uixt__type")
|
||||
|
||||
// ActionInputs should be a map for non-coordinate operations
|
||||
var arguments map[string]interface{}
|
||||
err = json.Unmarshal([]byte(function.Arguments), &arguments)
|
||||
assert.Nil(t, err)
|
||||
assert.Equal(t, "Hello World", arguments["content"])
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/cloudwego/eino/schema"
|
||||
"github.com/httprunner/httprunner/v5/uixt/option"
|
||||
"github.com/httprunner/httprunner/v5/uixt/types"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
@@ -20,7 +21,8 @@ const (
|
||||
|
||||
// UITARSContentParser parses the Thought/Action format response
|
||||
type UITARSContentParser struct {
|
||||
systemPrompt string
|
||||
systemPrompt string
|
||||
actionMapping map[string]option.ActionName
|
||||
}
|
||||
|
||||
func (p *UITARSContentParser) SystemPrompt() string {
|
||||
@@ -47,7 +49,7 @@ func (p *UITARSContentParser) Parse(content string, size types.Size) (*PlanningR
|
||||
}
|
||||
|
||||
// Convert actions to tool calls
|
||||
toolCalls := convertActionsToToolCalls(actions)
|
||||
toolCalls := convertActionsToToolCalls(actions, p.actionMapping)
|
||||
|
||||
return &PlanningResult{
|
||||
ToolCalls: toolCalls,
|
||||
@@ -92,10 +94,15 @@ func (p *UITARSContentParser) parseActionString(actionStr string, size types.Siz
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Create final action
|
||||
// Convert processedArgs based on action type and coordinate parameters
|
||||
finalArgs, err := convertProcessedArgs(processedArgs, actionType)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
action := Action{
|
||||
ActionType: actionType,
|
||||
ActionInputs: processedArgs,
|
||||
ActionInputs: finalArgs,
|
||||
}
|
||||
|
||||
return []Action{action}, nil
|
||||
@@ -147,6 +154,18 @@ func normalizeCoordinatesFormat(text string) string {
|
||||
}
|
||||
|
||||
// convertRelativeToAbsolute converts relative coordinates to absolute pixel coordinates
|
||||
// The coordinate system uses a 1000x1000 relative coordinate system as the base.
|
||||
// This function maps relative coordinates to actual screen resolution coordinates.
|
||||
//
|
||||
// Conversion formula:
|
||||
// - For X coordinates: absolute_x = (relative_x / 1000) * screen_width
|
||||
// - For Y coordinates: absolute_y = (relative_y / 1000) * screen_height
|
||||
//
|
||||
// Example:
|
||||
// - Screen size: 1920x1080
|
||||
// - Relative coordinate: 500 (in 1000x1000 system)
|
||||
// - X conversion: 500/1000 * 1920 = 960 pixels
|
||||
// - Y conversion: 500/1000 * 1080 = 540 pixels
|
||||
func convertRelativeToAbsolute(relativeCoord float64, isXCoord bool, size types.Size) float64 {
|
||||
if isXCoord {
|
||||
return math.Round((relativeCoord/DefaultFactor*float64(size.Width))*10) / 10
|
||||
@@ -204,7 +223,9 @@ func normalizeParameterName(paramName string) string {
|
||||
|
||||
// processActionArguments processes raw arguments based on action type and parameter types
|
||||
// Input: rawArgs={"start_box": "100,200,150,250"}
|
||||
// Output: processedArgs={"start_box": [120.5, 240.1, 180.7, 300.2]} (converted to pixels)
|
||||
// Output: processedArgs={"start_box": [125.0, 225.0]} (converted to center point coordinates)
|
||||
// For drag: rawArgs={"start_box": "100,200,150,250", "end_box": "300,400,350,450"}
|
||||
// Output: processedArgs={"start_box": [125.0, 225.0], "end_box": [325.0, 425.0]} (both converted to center points)
|
||||
func processActionArguments(rawArgs map[string]interface{}, size types.Size) (map[string]interface{}, error) {
|
||||
processedArgs := make(map[string]interface{})
|
||||
|
||||
@@ -222,9 +243,9 @@ func processActionArguments(rawArgs map[string]interface{}, size types.Size) (ma
|
||||
|
||||
// Process a single argument based on its name and value
|
||||
func processArgument(paramName string, paramValue interface{}, size types.Size) (interface{}, error) {
|
||||
// Handle coordinate parameters
|
||||
// Handle coordinate parameters - convert bounding box to center point
|
||||
if isCoordinateParameter(paramName) {
|
||||
return normalizeActionCoordinates(paramValue, size)
|
||||
return normalizeActionCoordinatesToCenterPoint(paramValue, size)
|
||||
}
|
||||
|
||||
// Handle other parameter types (content, key, direction, etc.)
|
||||
@@ -236,6 +257,59 @@ func isCoordinateParameter(paramName string) bool {
|
||||
return strings.Contains(paramName, "box") || strings.Contains(paramName, "point")
|
||||
}
|
||||
|
||||
// convertProcessedArgs converts processed arguments based on action type and coordinate parameters
|
||||
// For single start_box: {"start_box": [125.0, 225.0]} -> {"start_box": [125.0, 225.0]}
|
||||
// For drag with start_box and end_box: {"start_box": [125.0, 225.0], "end_box": [325.0, 425.0]} -> {"start_box": [125.0, 225.0, 325.0, 425.0]}
|
||||
func convertProcessedArgs(processedArgs map[string]interface{}, actionType string) (map[string]interface{}, error) {
|
||||
// Handle coordinate parameters based on action type
|
||||
startBox, hasStartBox := processedArgs["start_box"]
|
||||
endBox, hasEndBox := processedArgs["end_box"]
|
||||
|
||||
// Check if this is a drag operation that should merge coordinates
|
||||
if hasStartBox && hasEndBox {
|
||||
// Drag operation: merge start_box and end_box into a single coordinate array
|
||||
startCoords, ok1 := startBox.([]float64)
|
||||
endCoords, ok2 := endBox.([]float64)
|
||||
|
||||
if !ok1 || !ok2 {
|
||||
return nil, fmt.Errorf("invalid coordinate format for drag operation")
|
||||
}
|
||||
|
||||
if len(startCoords) != 2 || len(endCoords) != 2 {
|
||||
return nil, fmt.Errorf("drag operation requires 2-element coordinate arrays, got start: %d, end: %d", len(startCoords), len(endCoords))
|
||||
}
|
||||
|
||||
options := option.ActionOptions{
|
||||
FromX: startCoords[0],
|
||||
FromY: startCoords[1],
|
||||
ToX: endCoords[0],
|
||||
ToY: endCoords[1],
|
||||
}
|
||||
return options.ToMap(), nil
|
||||
}
|
||||
|
||||
// For single coordinate operations, return the coordinate array directly
|
||||
if hasStartBox {
|
||||
startCoords, ok := startBox.([]float64)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("invalid coordinate format for single operation")
|
||||
}
|
||||
options := option.ActionOptions{
|
||||
X: startCoords[0],
|
||||
Y: startCoords[1],
|
||||
}
|
||||
return options.ToMap(), nil
|
||||
}
|
||||
|
||||
// For non-coordinate operations, return the original arguments map
|
||||
// TODO
|
||||
finalArgs := make(map[string]interface{})
|
||||
for key, value := range processedArgs {
|
||||
finalArgs[key] = value
|
||||
}
|
||||
return finalArgs, nil
|
||||
}
|
||||
|
||||
// normalizeActionCoordinates normalizes coordinates from various formats to actual pixel coordinates
|
||||
func normalizeActionCoordinates(coordData interface{}, size types.Size) ([]float64, error) {
|
||||
switch v := coordData.(type) {
|
||||
@@ -350,15 +424,39 @@ func normalizeStringCoordinates(coordStr string, size types.Size) ([]float64, er
|
||||
return nil, fmt.Errorf("invalid coordinate string format: %s", coordStr)
|
||||
}
|
||||
|
||||
// normalizeActionCoordinatesToCenterPoint converts bounding box coordinates to center point coordinates
|
||||
// Input: "100,200,150,250" (x1,y1,x2,y2) -> Output: [125.0, 225.0] (center point in absolute pixels)
|
||||
// Input: "100,200" (x,y) -> Output: [100.0, 200.0] (point in absolute pixels)
|
||||
func normalizeActionCoordinatesToCenterPoint(coordData interface{}, size types.Size) ([]float64, error) {
|
||||
// First normalize coordinates to get absolute pixel coordinates
|
||||
coords, err := normalizeActionCoordinates(coordData, size)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Convert bounding box to center point
|
||||
if len(coords) == 4 {
|
||||
// [x1, y1, x2, y2] -> [center_x, center_y]
|
||||
centerX := (coords[0] + coords[2]) / 2
|
||||
centerY := (coords[1] + coords[3]) / 2
|
||||
return []float64{centerX, centerY}, nil
|
||||
} else if len(coords) == 2 {
|
||||
// Already a point [x, y], return as-is
|
||||
return coords, nil
|
||||
} else {
|
||||
return nil, fmt.Errorf("invalid coordinate format: expected 2 or 4 coordinates, got %d", len(coords))
|
||||
}
|
||||
}
|
||||
|
||||
// Action represents a parsed action with its context.
|
||||
type Action struct {
|
||||
ActionType string `json:"action_type"`
|
||||
ActionType string `json:"action_type"` // map to option.ActionName
|
||||
ActionInputs map[string]any `json:"action_inputs"`
|
||||
}
|
||||
|
||||
// convertActionsToToolCalls converts actions to tool calls
|
||||
// This is a shared function used by both JSONContentParser and UITARSContentParser
|
||||
func convertActionsToToolCalls(actions []Action) []schema.ToolCall {
|
||||
func convertActionsToToolCalls(actions []Action, actionMapping map[string]option.ActionName) []schema.ToolCall {
|
||||
toolCalls := make([]schema.ToolCall, 0, len(actions))
|
||||
for _, action := range actions {
|
||||
jsonArgs, err := json.Marshal(action.ActionInputs)
|
||||
@@ -366,11 +464,15 @@ func convertActionsToToolCalls(actions []Action) []schema.ToolCall {
|
||||
log.Error().Interface("action", action).Msg("failed to marshal action inputs")
|
||||
continue
|
||||
}
|
||||
actionName := string(actionMapping[action.ActionType])
|
||||
if actionName == "" {
|
||||
actionName = action.ActionType
|
||||
}
|
||||
toolCalls = append(toolCalls, schema.ToolCall{
|
||||
ID: action.ActionType + "_" + strconv.FormatInt(time.Now().Unix(), 10),
|
||||
ID: actionName + "_" + strconv.FormatInt(time.Now().Unix(), 10),
|
||||
Type: "function",
|
||||
Function: schema.FunctionCall{
|
||||
Name: "uixt__" + action.ActionType,
|
||||
Name: "uixt__" + actionName,
|
||||
Arguments: string(jsonArgs),
|
||||
},
|
||||
})
|
||||
|
||||
@@ -2,7 +2,6 @@ package ai
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/cloudwego/eino-ext/components/model/openai"
|
||||
@@ -140,20 +139,12 @@ func (p *Planner) Call(ctx context.Context, opts *PlanningOptions) (*PlanningRes
|
||||
Error: err.Error(),
|
||||
}
|
||||
log.Debug().Str("reason", err.Error()).Msg("parse content to actions failed")
|
||||
// append assistant message
|
||||
p.history.Append(&schema.Message{
|
||||
Role: schema.Assistant,
|
||||
Content: message.Content,
|
||||
})
|
||||
} else {
|
||||
// append assistant message with tool calls
|
||||
p.history.Append(&schema.Message{
|
||||
Role: schema.Tool,
|
||||
Content: result.Content,
|
||||
ToolCalls: result.ToolCalls,
|
||||
ToolCallID: fmt.Sprintf("%d", time.Now().Unix()),
|
||||
})
|
||||
}
|
||||
// append assistant message (since we're parsing content, not using native function calling)
|
||||
p.history.Append(&schema.Message{
|
||||
Role: schema.Assistant,
|
||||
Content: message.Content,
|
||||
})
|
||||
|
||||
log.Info().
|
||||
Interface("summary", result.ActionSummary).
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
package ai
|
||||
|
||||
import "github.com/httprunner/httprunner/v5/uixt/option"
|
||||
|
||||
// system prompt for UITARSContentParser
|
||||
// doubao-1.5-ui-tars on volcengine.com
|
||||
// https://www.volcengine.com/docs/82379/1536429
|
||||
@@ -14,13 +16,12 @@ Action: ...
|
||||
|
||||
## Action Space
|
||||
click(start_box='[x1, y1, x2, y2]')
|
||||
long_press(start_box='[x1, y1, x2, y2]')
|
||||
left_double(start_box='[x1, y1, x2, y2]')
|
||||
right_single(start_box='[x1, y1, x2, y2]')
|
||||
drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
|
||||
hotkey(key='')
|
||||
type(content='') #If you want to submit your input, use "\n" at the end of ` + "`content`" + `.
|
||||
scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
|
||||
open_app(app_name=\'\')
|
||||
drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
|
||||
press_home()
|
||||
press_back()
|
||||
wait() #Sleep for 5s and take a screenshot to check for any changes.
|
||||
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
|
||||
|
||||
@@ -31,6 +32,18 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par
|
||||
## User Instruction
|
||||
`
|
||||
|
||||
var doubao_1_5_ui_tars_action_mapping = map[string]option.ActionName{
|
||||
"click": option.ACTION_TapXY,
|
||||
"left_double": option.ACTION_DoubleTapXY,
|
||||
"right_single": option.ACTION_SecondaryClick,
|
||||
"drag": option.ACTION_Drag,
|
||||
"hotkey": option.ACTION_KeyCode,
|
||||
"type": option.ACTION_Input,
|
||||
"scroll": option.ACTION_Scroll,
|
||||
"wait": option.ACTION_Sleep,
|
||||
"finished": option.ACTION_Finished,
|
||||
}
|
||||
|
||||
// system prompt for UITARSContentParser
|
||||
// https://github.com/bytedance/UI-TARS/blob/main/codes/ui_tars/prompt.py
|
||||
const _ = `
|
||||
|
||||
@@ -29,7 +29,7 @@ func TestVLMPlanning(t *testing.T) {
|
||||
|
||||
userInstruction += "\n\n请基于以上游戏规则,给出下一步可点击的两个图标坐标"
|
||||
|
||||
modelConfig, err := GetModelConfig(option.LLMServiceTypeDoubaoVL)
|
||||
modelConfig, err := GetModelConfig(option.LLMServiceTypeUITARS)
|
||||
require.NoError(t, err)
|
||||
|
||||
planner, err := NewPlanner(context.Background(), modelConfig)
|
||||
@@ -157,9 +157,9 @@ func TestHandleSwitch(t *testing.T) {
|
||||
imageFile string
|
||||
actionType string
|
||||
}{
|
||||
{"testdata/deepseek_think_off.png", "click"}, // 关闭状态,需要点击开启
|
||||
{"testdata/deepseek_think_on.png", "click"}, // 关闭状态,需要点击开启
|
||||
{"testdata/deepseek_network_on.png", "finished"}, // 开启状态,无需操作
|
||||
{"testdata/deepseek_think_off.png", "uixt__tap_xy"}, // 关闭状态,需要点击开启
|
||||
{"testdata/deepseek_think_on.png", "uixt__tap_xy"}, // 关闭状态,需要点击开启
|
||||
{"testdata/deepseek_network_on.png", "uixt__finished"}, // 开启状态,无需操作
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
|
||||
@@ -2,6 +2,7 @@ package option
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math/rand/v2"
|
||||
"reflect"
|
||||
@@ -330,6 +331,18 @@ func (o *ActionOptions) Options() []ActionOption {
|
||||
return options
|
||||
}
|
||||
|
||||
func (o *ActionOptions) ToMap() map[string]interface{} {
|
||||
result := make(map[string]interface{})
|
||||
b, err := json.Marshal(o)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
if err := json.Unmarshal(b, &result); err != nil {
|
||||
return nil
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func (o *ActionOptions) ApplyTapOffset(absX, absY float64) (float64, float64) {
|
||||
if len(o.TapOffset) == 2 {
|
||||
absX += float64(o.TapOffset[0])
|
||||
|
||||
@@ -64,9 +64,10 @@ func (c *MCPClient4XTDriver) ListTools(ctx context.Context, req mcp.ListToolsReq
|
||||
}
|
||||
|
||||
func (c *MCPClient4XTDriver) CallTool(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) {
|
||||
actionTool := c.Server.GetToolByAction(option.ActionName(req.Params.Name))
|
||||
actionName := strings.TrimPrefix(req.Params.Name, "uixt__")
|
||||
actionTool := c.Server.GetToolByAction(option.ActionName(actionName))
|
||||
if actionTool == nil {
|
||||
return mcp.NewToolResultError(fmt.Sprintf("action %s for tool not found", req.Params.Name)), nil
|
||||
return mcp.NewToolResultError(fmt.Sprintf("action %s for tool not found", actionName)), nil
|
||||
}
|
||||
handler := actionTool.Implement()
|
||||
return handler(ctx, req)
|
||||
|
||||
Reference in New Issue
Block a user