mirror of
https://github.com/httprunner/httprunner.git
synced 2026-05-11 10:00:23 +08:00
- Handle direction parameter in convertProcessedArgs for scroll actions - Ensure scroll operations map to swipe with both coordinates and direction - Add comprehensive test coverage for scroll action parsing - Fix issue where scroll direction was missing from tool call arguments
530 lines
18 KiB
Go
530 lines
18 KiB
Go
package ai
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/cloudwego/eino/schema"
|
|
"github.com/httprunner/httprunner/v5/internal/builtin"
|
|
"github.com/httprunner/httprunner/v5/uixt/option"
|
|
"github.com/httprunner/httprunner/v5/uixt/types"
|
|
"github.com/rs/zerolog/log"
|
|
)
|
|
|
|
// DefaultFactor is the divisor applied to model-emitted coordinates before
// they are multiplied by the screen width or height in
// convertRelativeToAbsolute.
const DefaultFactor = 1000
|
|
|
|
// UITARSContentParser parses the Thought/Action format response
|
|
type UITARSContentParser struct {
|
|
modelType option.LLMServiceType
|
|
systemPrompt string
|
|
actionMapping map[string]option.ActionName
|
|
}
|
|
|
|
func (p *UITARSContentParser) SystemPrompt() string {
|
|
return p.systemPrompt
|
|
}
|
|
|
|
// ParseActionToStructureOutput parses the model output text into structured actions.
|
|
func (p *UITARSContentParser) Parse(content string, size types.Size) (*PlanningResult, error) {
|
|
content = strings.TrimSpace(content)
|
|
|
|
// Extract thought string
|
|
thought := p.extractThought(content)
|
|
|
|
// Extract action string
|
|
actionStr, err := p.extractActionString(content)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Parse and process actions
|
|
actions, err := p.parseActionString(actionStr, size)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Convert actions to tool calls
|
|
toolCalls := convertActionsToToolCalls(actions, p.actionMapping)
|
|
|
|
return &PlanningResult{
|
|
ToolCalls: toolCalls,
|
|
Thought: thought,
|
|
Content: content,
|
|
ModelName: string(p.modelType),
|
|
}, nil
|
|
}
|
|
|
|
// extractThought extracts thought from the text
|
|
func (p *UITARSContentParser) extractThought(text string) string {
|
|
re := regexp.MustCompile(`Thought:(.*?)\nAction:`)
|
|
matches := re.FindStringSubmatch(text)
|
|
if len(matches) > 1 {
|
|
return strings.TrimSpace(matches[1])
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// extractActionString extracts the action string from the text
|
|
func (p *UITARSContentParser) extractActionString(text string) (string, error) {
|
|
// Extract Action part using regex
|
|
re := regexp.MustCompile(`Action:(.*?)(?:\n|$)`)
|
|
matches := re.FindStringSubmatch(text)
|
|
if len(matches) > 1 {
|
|
return strings.TrimSpace(matches[1]), nil
|
|
}
|
|
return "", fmt.Errorf("no Action: found")
|
|
}
|
|
|
|
// parseActionString parse and process actions
|
|
func (p *UITARSContentParser) parseActionString(actionStr string, size types.Size) ([]Action, error) {
|
|
// Parse action type and raw arguments
|
|
actionType, rawArgs, err := parseActionTypeAndArguments(actionStr)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Process and normalize arguments
|
|
processedArgs, err := processActionArguments(rawArgs, size)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Convert processedArgs based on action type and coordinate parameters
|
|
finalArgs, err := convertProcessedArgs(processedArgs, actionType)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
action := Action{
|
|
ActionType: actionType,
|
|
ActionInputs: finalArgs,
|
|
}
|
|
|
|
return []Action{action}, nil
|
|
}
|
|
|
|
// normalizeCoordinatesFormat rewrites the coordinate notations found in text
// into parenthesized, comma-separated tuples. The numbers themselves are left
// untouched (no pixel conversion happens here).
//
// Recognized notations:
//   - <point>x y</point> and <point>x1 y1 x2 y2</point>
//   - <bbox>x1 y1 x2 y2</bbox>
//   - [x1, y1, x2, y2]
func normalizeCoordinatesFormat(text string) string {
	// toTuple replaces every match of re in s with "(a,b)" or "(a,b,c,d)",
	// depending on whether the third and fourth capture groups matched.
	toTuple := func(re *regexp.Regexp, s string) string {
		return re.ReplaceAllStringFunc(s, func(hit string) string {
			g := re.FindStringSubmatch(hit)
			if g[3] != "" && g[4] != "" {
				return fmt.Sprintf("(%s,%s,%s,%s)", g[1], g[2], g[3], g[4])
			}
			return fmt.Sprintf("(%s,%s)", g[1], g[2])
		})
	}

	// Each cheap substring probe skips compiling a regex that cannot match.
	if strings.Contains(text, "<point>") {
		text = toTuple(regexp.MustCompile(`<point>(\d+)\s+(\d+)(?:\s+(\d+)\s+(\d+))?</point>`), text)
	}
	if strings.Contains(text, "<bbox>") {
		text = toTuple(regexp.MustCompile(`<bbox>(\d+)\s+(\d+)\s+(\d+)\s+(\d+)</bbox>`), text)
	}
	if strings.Contains(text, "[") && strings.Contains(text, "]") {
		text = toTuple(regexp.MustCompile(`\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]`), text)
	}
	return text
}
|
|
|
|
// convertRelativeToAbsolute converts relative coordinates to absolute pixel coordinates
|
|
// The coordinate system uses a 1000x1000 relative coordinate system as the base.
|
|
// This function maps relative coordinates to actual screen resolution coordinates.
|
|
//
|
|
// Conversion formula:
|
|
// - For X coordinates: absolute_x = (relative_x / 1000) * screen_width
|
|
// - For Y coordinates: absolute_y = (relative_y / 1000) * screen_height
|
|
//
|
|
// Example:
|
|
// - Screen size: 1920x1080
|
|
// - Relative coordinate: 500 (in 1000x1000 system)
|
|
// - X conversion: 500/1000 * 1920 = 960 pixels
|
|
// - Y conversion: 500/1000 * 1080 = 540 pixels
|
|
func convertRelativeToAbsolute(relativeCoord float64, isXCoord bool, size types.Size) float64 {
|
|
if isXCoord {
|
|
return builtin.RoundToOneDecimal(relativeCoord / DefaultFactor * float64(size.Width))
|
|
}
|
|
return builtin.RoundToOneDecimal(relativeCoord / DefaultFactor * float64(size.Height))
|
|
}
|
|
|
|
// parseActionTypeAndArguments splits an action call into its function name
// and a map of its quoted keyword arguments.
//
// Input:  click(start_box='100,200,150,250') or click(start_point='100,200,150,250')
// Output: actionType="click", rawArgs={"start_box": "100,200,150,250"}
//
// Values may be delimited by single or double quotes. A value ends at the
// matching, unescaped quote, so it may contain the other quote character or a
// backslash-escaped quote (for example content='it\'s "ok"'). Escape sequences
// are kept verbatim in the returned value. Keys are passed through
// normalizeParameterName and values are whitespace-trimmed.
//
// An error is returned when actionStr contains no "(" and therefore is not a
// function call. Arguments that are not key='value' pairs are ignored.
func parseActionTypeAndArguments(actionStr string) (actionType string, rawArgs map[string]interface{}, err error) {
	actionParts := strings.SplitN(actionStr, "(", 2)
	if len(actionParts) < 2 {
		return "", nil, fmt.Errorf("not a function call")
	}

	actionType = strings.TrimSpace(actionParts[0])
	paramsText := strings.TrimSuffix(strings.TrimSpace(actionParts[1]), ")")

	rawArgs = make(map[string]interface{})
	if paramsText == "" {
		return actionType, rawArgs, nil
	}

	// Group 1 is the key; group 2 is a single-quoted value and group 3 a
	// double-quoted one. Inside a value, "\\." consumes a backslash together
	// with the character it escapes so an escaped quote does not end the value.
	re := regexp.MustCompile(`(\w+)\s*=\s*(?:'((?:[^'\\]|\\.)*)'|"((?:[^"\\]|\\.)*)")`)
	for _, m := range re.FindAllStringSubmatch(paramsText, -1) {
		key := normalizeParameterName(strings.TrimSpace(m[1]))
		// Exactly one of m[2] / m[3] participated in the match; the other is "".
		rawArgs[key] = strings.TrimSpace(m[2] + m[3])
	}

	return actionType, rawArgs, nil
}

// normalizeParameterName maps legacy parameter names onto the names used by
// the rest of the parser: "start_point" and "point" become "start_box",
// "end_point" becomes "end_box". Any other name is returned unchanged.
func normalizeParameterName(paramName string) string {
	switch paramName {
	case "start_point", "point":
		return "start_box"
	case "end_point":
		return "end_box"
	default:
		return paramName
	}
}
|
|
|
|
// processActionArguments processes raw arguments based on action type and parameter types
|
|
// Input: rawArgs={"start_box": "100,200,150,250"}
|
|
// Output: processedArgs={"start_box": [125.0, 225.0]} (converted to center point coordinates)
|
|
// For drag: rawArgs={"start_box": "100,200,150,250", "end_box": "300,400,350,450"}
|
|
// Output: processedArgs={"start_box": [125.0, 225.0], "end_box": [325.0, 425.0]} (both converted to center points)
|
|
func processActionArguments(rawArgs map[string]interface{}, size types.Size) (map[string]interface{}, error) {
|
|
processedArgs := make(map[string]interface{})
|
|
|
|
// Process each argument based on its type and context
|
|
for paramName, paramValue := range rawArgs {
|
|
processed, err := processArgument(paramName, paramValue, size)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to process argument %s: %w", paramName, err)
|
|
}
|
|
processedArgs[paramName] = processed
|
|
}
|
|
|
|
return processedArgs, nil
|
|
}
|
|
|
|
// Process a single argument based on its name and value
|
|
func processArgument(paramName string, paramValue interface{}, size types.Size) (interface{}, error) {
|
|
// Handle coordinate parameters - convert bounding box to center point
|
|
if isCoordinateParameter(paramName) {
|
|
return normalizeActionCoordinatesToCenterPoint(paramValue, size)
|
|
}
|
|
|
|
// Handle other parameter types (content, key, direction, etc.)
|
|
return normalizeStringParam(paramName, paramValue), nil
|
|
}
|
|
|
|
// isCoordinateParameter reports whether paramName, compared
// case-insensitively, contains "box" or "point".
func isCoordinateParameter(paramName string) bool {
	lowered := strings.ToLower(paramName)
	for _, marker := range []string{"box", "point"} {
		if strings.Contains(lowered, marker) {
			return true
		}
	}
	return false
}
|
|
|
|
// convertProcessedArgs converts processed arguments based on action type and coordinate parameters
|
|
// For single start_box: {"start_box": [125.0, 225.0]} -> {"start_box": [125.0, 225.0]}
|
|
// For drag with start_box and end_box: {"start_box": [125.0, 225.0], "end_box": [325.0, 425.0]} -> {"start_box": [125.0, 225.0, 325.0, 425.0]}
|
|
func convertProcessedArgs(processedArgs map[string]interface{}, actionType string) (map[string]interface{}, error) {
|
|
// Handle coordinate parameters based on action type
|
|
startBox, hasStartBox := processedArgs["start_box"]
|
|
endBox, hasEndBox := processedArgs["end_box"]
|
|
|
|
// Check if this is a drag operation that should merge coordinates
|
|
if hasStartBox && hasEndBox {
|
|
// Drag operation: merge start_box and end_box into a single coordinate array
|
|
startCoords, ok1 := startBox.([]float64)
|
|
endCoords, ok2 := endBox.([]float64)
|
|
|
|
if !ok1 || !ok2 {
|
|
return nil, fmt.Errorf("invalid coordinate format for drag operation")
|
|
}
|
|
|
|
if len(startCoords) != 2 || len(endCoords) != 2 {
|
|
return nil, fmt.Errorf("drag operation requires 2-element coordinate arrays, got start: %d, end: %d", len(startCoords), len(endCoords))
|
|
}
|
|
|
|
options := option.ActionOptions{
|
|
FromX: builtin.RoundToOneDecimal(startCoords[0]),
|
|
FromY: builtin.RoundToOneDecimal(startCoords[1]),
|
|
ToX: builtin.RoundToOneDecimal(endCoords[0]),
|
|
ToY: builtin.RoundToOneDecimal(endCoords[1]),
|
|
}
|
|
return options.ToMap(), nil
|
|
}
|
|
|
|
// For scroll operations, handle both coordinates and direction
|
|
if actionType == "scroll" && hasStartBox {
|
|
startCoords, ok := startBox.([]float64)
|
|
if !ok {
|
|
return nil, fmt.Errorf("invalid coordinate format for scroll operation")
|
|
}
|
|
|
|
options := option.ActionOptions{
|
|
X: builtin.RoundToOneDecimal(startCoords[0]),
|
|
Y: builtin.RoundToOneDecimal(startCoords[1]),
|
|
}
|
|
|
|
// Add direction parameter if present
|
|
if direction, hasDirection := processedArgs["direction"]; hasDirection {
|
|
options.Direction = direction.(string)
|
|
}
|
|
|
|
return options.ToMap(), nil
|
|
}
|
|
|
|
// For single coordinate operations, return the coordinate array directly
|
|
if hasStartBox {
|
|
startCoords, ok := startBox.([]float64)
|
|
if !ok {
|
|
return nil, fmt.Errorf("invalid coordinate format for single operation")
|
|
}
|
|
options := option.ActionOptions{
|
|
X: builtin.RoundToOneDecimal(startCoords[0]),
|
|
Y: builtin.RoundToOneDecimal(startCoords[1]),
|
|
}
|
|
return options.ToMap(), nil
|
|
}
|
|
|
|
// For non-coordinate operations, apply parameter name mapping and return the arguments map
|
|
finalArgs := make(map[string]interface{})
|
|
for key, value := range processedArgs {
|
|
// Map parameter names to match ActionOptions field names
|
|
mappedKey := mapParameterName(key)
|
|
finalArgs[mappedKey] = value
|
|
}
|
|
return finalArgs, nil
|
|
}
|
|
|
|
// mapParameterName renames UI-TARS parameter names to the names expected in
// the final argument map: "content" becomes "text" (input operations) and
// "key" becomes "keycode" (hotkey operations). Other names are returned as-is.
func mapParameterName(paramName string) string {
	if paramName == "content" {
		return "text"
	}
	if paramName == "key" {
		return "keycode"
	}
	return paramName
}
|
|
|
|
// normalizeActionCoordinates normalizes coordinates from various formats to actual pixel coordinates
|
|
func normalizeActionCoordinates(coordData interface{}, size types.Size) ([]float64, error) {
|
|
switch v := coordData.(type) {
|
|
case []interface{}:
|
|
// Handle JSON array format: [x1, y1, x2, y2] or [x1, y1] or ["229 389", "229 439"]
|
|
if len(v) == 0 {
|
|
return nil, fmt.Errorf("coordinate array cannot be empty")
|
|
}
|
|
|
|
coords := make([]float64, 0)
|
|
for i, val := range v {
|
|
switch num := val.(type) {
|
|
case float64:
|
|
// Convert relative coordinates to absolute coordinates using DefaultFactor
|
|
if i%2 == 0 { // x coordinates
|
|
coords = append(coords, convertRelativeToAbsolute(num, true, size))
|
|
} else { // y coordinates
|
|
coords = append(coords, convertRelativeToAbsolute(num, false, size))
|
|
}
|
|
case int:
|
|
numFloat := float64(num)
|
|
// Convert relative coordinates to absolute coordinates using DefaultFactor
|
|
if i%2 == 0 { // x coordinates
|
|
coords = append(coords, convertRelativeToAbsolute(numFloat, true, size))
|
|
} else { // y coordinates
|
|
coords = append(coords, convertRelativeToAbsolute(numFloat, false, size))
|
|
}
|
|
case string:
|
|
// Handle string coordinates like "229 389"
|
|
stringCoords, err := normalizeStringCoordinates(num, size)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to parse string coordinate '%s': %v", num, err)
|
|
}
|
|
coords = append(coords, stringCoords...)
|
|
default:
|
|
return nil, fmt.Errorf("coordinate value must be a number or string, got %T", val)
|
|
}
|
|
}
|
|
|
|
// Check if we have at least 2 coordinates after processing
|
|
if len(coords) < 2 {
|
|
return nil, fmt.Errorf("coordinate array must result in at least 2 coordinates, got %d", len(coords))
|
|
}
|
|
|
|
return coords, nil
|
|
|
|
case []float64:
|
|
// Handle already parsed float64 slice
|
|
coords := make([]float64, len(v))
|
|
for i, val := range v {
|
|
if i%2 == 0 { // x coordinates
|
|
coords[i] = convertRelativeToAbsolute(val, true, size)
|
|
} else { // y coordinates
|
|
coords[i] = convertRelativeToAbsolute(val, false, size)
|
|
}
|
|
}
|
|
return coords, nil
|
|
|
|
case string:
|
|
// Handle string format (from UI-TARS or string coordinates)
|
|
return normalizeStringCoordinates(v, size)
|
|
|
|
default:
|
|
return nil, fmt.Errorf("unsupported coordinate format: %T", coordData)
|
|
}
|
|
}
|
|
|
|
// normalizeStringParam cleans up a non-coordinate parameter value.
//
// Non-string values (including nil) are returned untouched. Strings are
// whitespace-trimmed; for the "content" parameter the escape sequences \n, \"
// and \' are additionally replaced by the characters they denote.
func normalizeStringParam(paramName string, paramValue interface{}) interface{} {
	text, isString := paramValue.(string)
	if !isString {
		// Covers nil as well: asserting on a nil interface yields false.
		return paramValue
	}

	text = strings.TrimSpace(text)
	if text != "" && paramName == "content" {
		// Applied in this order, one full pass per sequence.
		unescape := [][2]string{
			{"\\n", "\n"},
			{"\\\"", "\""},
			{"\\'", "'"},
		}
		for _, pair := range unescape {
			text = strings.ReplaceAll(text, pair[0], pair[1])
		}
	}
	return text
}
|
|
|
|
// normalizeStringCoordinates normalizes coordinates from string format
|
|
func normalizeStringCoordinates(coordStr string, size types.Size) ([]float64, error) {
|
|
// check empty string
|
|
if coordStr == "" {
|
|
return nil, fmt.Errorf("empty coordinate string")
|
|
}
|
|
|
|
// Apply coordinate format normalization using the shared function
|
|
normalizedStr := normalizeCoordinatesFormat(coordStr)
|
|
|
|
// Extract numbers from the normalized string using regex
|
|
re := regexp.MustCompile(`\d+`)
|
|
numbers := re.FindAllString(normalizedStr, -1)
|
|
if len(numbers) >= 2 {
|
|
coords := make([]float64, len(numbers))
|
|
for i, numStr := range numbers {
|
|
num, err := strconv.ParseFloat(numStr, 64)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("invalid coordinate: %s", numStr)
|
|
}
|
|
// Convert relative coordinates to absolute coordinates
|
|
if i%2 == 0 { // x coordinates
|
|
coords[i] = convertRelativeToAbsolute(num, true, size)
|
|
} else { // y coordinates
|
|
coords[i] = convertRelativeToAbsolute(num, false, size)
|
|
}
|
|
}
|
|
return coords, nil
|
|
}
|
|
|
|
return nil, fmt.Errorf("invalid coordinate string format: %s", coordStr)
|
|
}
|
|
|
|
// normalizeActionCoordinatesToCenterPoint converts bounding box coordinates to center point coordinates
|
|
// Input: "100,200,150,250" (x1,y1,x2,y2) -> Output: [125.0, 225.0] (center point in absolute pixels)
|
|
// Input: "100,200" (x,y) -> Output: [100.0, 200.0] (point in absolute pixels)
|
|
func normalizeActionCoordinatesToCenterPoint(coordData interface{}, size types.Size) ([]float64, error) {
|
|
// First normalize coordinates to get absolute pixel coordinates
|
|
coords, err := normalizeActionCoordinates(coordData, size)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Convert bounding box to center point
|
|
if len(coords) == 4 {
|
|
// [x1, y1, x2, y2] -> [center_x, center_y]
|
|
centerX := (coords[0] + coords[2]) / 2
|
|
centerY := (coords[1] + coords[3]) / 2
|
|
return []float64{centerX, centerY}, nil
|
|
} else if len(coords) == 2 {
|
|
// Already a point [x, y], return as-is
|
|
return coords, nil
|
|
} else {
|
|
return nil, fmt.Errorf("invalid coordinate format: expected 2 or 4 coordinates, got %d", len(coords))
|
|
}
|
|
}
|
|
|
|
// Action is one parsed model action: the action name as emitted by the model
// and the normalized inputs that accompany it.
type Action struct {
	// ActionType is the action name; it is looked up in the action mapping to
	// obtain an option.ActionName when tool calls are built.
	ActionType string `json:"action_type"`
	// ActionInputs holds the final arguments, marshalled to JSON as the tool
	// call arguments.
	ActionInputs map[string]any `json:"action_inputs"`
}
|
|
|
|
// convertActionsToToolCalls converts actions to tool calls
|
|
// This is a shared function used by both JSONContentParser and UITARSContentParser
|
|
func convertActionsToToolCalls(actions []Action, actionMapping map[string]option.ActionName) []schema.ToolCall {
|
|
toolCalls := make([]schema.ToolCall, 0, len(actions))
|
|
for _, action := range actions {
|
|
jsonArgs, err := json.Marshal(action.ActionInputs)
|
|
if err != nil {
|
|
log.Error().Interface("action", action).Msg("failed to marshal action inputs")
|
|
continue
|
|
}
|
|
actionName := string(actionMapping[action.ActionType])
|
|
if actionName == "" {
|
|
actionName = action.ActionType
|
|
}
|
|
toolCalls = append(toolCalls, schema.ToolCall{
|
|
ID: actionName + "_" + strconv.FormatInt(time.Now().Unix(), 10),
|
|
Type: "function",
|
|
Function: schema.FunctionCall{
|
|
Name: "uixt__" + actionName,
|
|
Arguments: string(jsonArgs),
|
|
},
|
|
})
|
|
}
|
|
return toolCalls
|
|
}
|