fix: add direction parameter support for scroll operations in UI-TARS parser

- Handle direction parameter in convertProcessedArgs for scroll actions
- Ensure scroll operations map to swipe with both coordinates and direction
- Add comprehensive test coverage for scroll action parsing
- Fix issue where scroll direction was missing from tool call arguments
This commit is contained in:
lilong.129
2025-06-10 16:34:44 +08:00
parent c322d7c36c
commit 98bd41ff33
5 changed files with 105 additions and 3 deletions

View File

@@ -127,6 +127,7 @@ var (
LLMRequestServiceError = errors.New("request LLM service error") // 112
LLMParsePlanningResponseError = errors.New("parse LLM planning response error") // 113
LLMParseAssertionResponseError = errors.New("parse LLM assertion response error") // 114
LLMParseQueryResponseError = errors.New("parse LLM query response error") // 115
)
var errorsMap = map[error]int{
@@ -217,6 +218,7 @@ var errorsMap = map[error]int{
LLMRequestServiceError: 112,
LLMParsePlanningResponseError: 113,
LLMParseAssertionResponseError: 114,
LLMParseQueryResponseError: 115,
// trackings related
TrackingGetError: 90,

View File

@@ -1 +1 @@
v5.0.0-beta-2506101609
v5.0.0-beta-2506101640

View File

@@ -1276,3 +1276,83 @@ func TestNormalizeActionCoordinates_StringArray(t *testing.T) {
})
}
}
// TestUITARSContentParser_ParseScrollAction checks that a UI-TARS `scroll`
// action is parsed into exactly one "uixt__swipe" tool call whose JSON
// arguments contain the requested direction together with x/y start
// coordinates that lie strictly inside the screen.
//
// The assert helpers only record a failure and return false; they do not stop
// the test. Every step that depends on an earlier check therefore bails out
// explicitly, so a parser regression is reported as a readable assertion
// failure instead of a nil-dereference, index-out-of-range or type-assertion
// panic.
func TestUITARSContentParser_ParseScrollAction(t *testing.T) {
	parser := &UITARSContentParser{
		modelType:     option.DOUBAO_1_5_UI_TARS_250328,
		systemPrompt:  doubao_1_5_ui_tars_planning_prompt,
		actionMapping: doubao_1_5_ui_tars_action_mapping,
	}
	size := types.Size{Width: 1080, Height: 1920}

	tests := []struct {
		name              string
		content           string
		expectedDirection string
	}{
		{
			name: "scroll left with bbox format",
			content: `Thought: 我需要向左滑动
Action: scroll(direction='left', start_box='<bbox>850 500 850 500</bbox>')`,
			expectedDirection: "left",
		},
		{
			name: "scroll up with array format",
			content: `Thought: 我需要向上滑动
Action: scroll(direction='up', start_box='[400, 600]')`,
			expectedDirection: "up",
		},
		{
			name: "scroll down with array format",
			content: `Thought: 我需要向下滑动
Action: scroll(direction='down', start_box='[500, 800]')`,
			expectedDirection: "down",
		},
		{
			name: "real log example - scroll left",
			content: `Thought: 我仔细观察了当前的游戏局面发现两个2分别位于右下角和右中位置。之前尝试了几次滑动都没有成功现在我需要重新思考策略。既然向上滑动没有效果那我决定换个方向尝试向左滑动看看。这样应该能让这两个2相遇并合并为后续的游戏进展打下基础。
Action: scroll(direction='left', start_box='<bbox>850 500 850 500</bbox>')`,
			expectedDirection: "left",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result, err := parser.Parse(tt.content, size)
			if !assert.NoError(t, err) || !assert.NotNil(t, result) {
				return // nothing to inspect without a parsed result
			}
			if !assert.Len(t, result.ToolCalls, 1) {
				return // indexing ToolCalls[0] below would panic
			}
			toolCall := result.ToolCalls[0]

			// Verify tool call structure
			assert.Equal(t, "uixt__swipe", toolCall.Function.Name)
			assert.Equal(t, "function", toolCall.Type)
			assert.NotEmpty(t, toolCall.ID)

			// Parse and verify arguments
			var args map[string]interface{}
			err = json.Unmarshal([]byte(toolCall.Function.Arguments), &args)
			if !assert.NoError(t, err) {
				return // args is unusable if the payload is not valid JSON
			}

			// Verify direction parameter is present and correct
			assert.Contains(t, args, "direction")
			assert.Equal(t, tt.expectedDirection, args["direction"])

			// Verify coordinates are present and numeric. encoding/json
			// decodes JSON numbers into float64 when the target is
			// interface{}, so that is the only acceptable dynamic type.
			assert.Contains(t, args, "x")
			assert.Contains(t, args, "y")
			x, xIsFloat := args["x"].(float64)
			y, yIsFloat := args["y"].(float64)
			assert.IsType(t, float64(0), args["x"])
			assert.IsType(t, float64(0), args["y"])
			if !xIsFloat || !yIsFloat {
				return // bounds checks are meaningless without numeric values
			}

			// Verify coordinates are within screen bounds
			assert.Greater(t, x, 0.0)
			assert.Less(t, x, float64(size.Width))
			assert.Greater(t, y, 0.0)
			assert.Less(t, y, float64(size.Height))
		})
	}
}

View File

@@ -290,6 +290,26 @@ func convertProcessedArgs(processedArgs map[string]interface{}, actionType strin
return options.ToMap(), nil
}
// For scroll operations, handle both coordinates and direction
if actionType == "scroll" && hasStartBox {
startCoords, ok := startBox.([]float64)
if !ok {
return nil, fmt.Errorf("invalid coordinate format for scroll operation")
}
options := option.ActionOptions{
X: builtin.RoundToOneDecimal(startCoords[0]),
Y: builtin.RoundToOneDecimal(startCoords[1]),
}
// Add direction parameter if present
if direction, hasDirection := processedArgs["direction"]; hasDirection {
options.Direction = direction.(string)
}
return options.ToMap(), nil
}
// For single coordinate operations, return the coordinate array directly
if hasStartBox {
startCoords, ok := startBox.([]float64)

View File

@@ -39,7 +39,7 @@ var doubao_1_5_ui_tars_action_mapping = map[string]option.ActionName{
"drag": option.ACTION_Drag,
"hotkey": option.ACTION_KeyCode,
"type": option.ACTION_Input,
"scroll": option.ACTION_Scroll,
"scroll": option.ACTION_Swipe, // swipe up/down/left/right
"wait": option.ACTION_Sleep,
"finished": option.ACTION_Finished,
}
@@ -136,7 +136,7 @@ var doubao_1_5_thinking_vision_pro_action_mapping = map[string]option.ActionName
"drag": option.ACTION_Drag,
"hotkey": option.ACTION_KeyCode,
"type": option.ACTION_Input,
"scroll": option.ACTION_Scroll,
"scroll": option.ACTION_Swipe, // swipe up/down/left/right
"wait": option.ACTION_Sleep,
"finished": option.ACTION_Finished,
}