@@ -1,12 +1,15 @@
package ai
import (
"encoding/json"
"fmt"
"math"
"regexp"
"strconv"
"strings"
"time"
"github.com/cloudwego/eino/schema"
"github.com/httprunner/httprunner/v5/uixt/types"
"github.com/rs/zerolog/log"
)
@@ -30,178 +33,257 @@ func (p *UITARSContentParser) SystemPrompt() string {
// ParseActionToStructureOutput parses the model output text into structured actions.
func ( p * UITARSContentParser ) Parse ( content string , size types . Size ) ( * PlanningResult , error ) {
text := strings . TrimSpace ( content )
if strings . Contains ( text , "<point>" ) {
text = convertPointToCoordinates ( text )
// Extract thought/reflection
thought := p . extractThought ( text )
// Normalize text first
normalizedText := p . normalizeCoordinates ( text )
// Get action string from normalized text
actionStr , err := p . extractActionString ( normalizedText )
if err != nil {
return nil , err
}
// Parse actions directly
actions , err := p . parseActionString ( actionStr , size )
if err != nil {
return nil , err
}
// Convert actions to tool calls
toolCalls := p . convertActionsToToolCalls ( actions )
return & PlanningResult {
ToolCalls : toolCalls ,
Actions : actions ,
ActionSummary : thought ,
Thought : thought ,
Text : normalizedText ,
} , nil
}
// thoughtRe captures everything between "Thought:" and the first following
// "Action:" line. The (?s) flag lets the thought span several lines.
var thoughtRe = regexp.MustCompile(`(?s)Thought:(.*?)\nAction:`)

// extractThought returns the trimmed text found between "Thought:" and the
// line that starts the action. It returns an empty string when the text does
// not contain that structure.
func (p *UITARSContentParser) extractThought(text string) string {
	if m := thoughtRe.FindStringSubmatch(text); m != nil {
		return strings.TrimSpace(m[1])
	}
	return ""
}
// actionLineRe captures the remainder of the line that follows "Action:".
var actionLineRe = regexp.MustCompile(`Action:(.*?)(?:\n|$)`)

// extractActionString returns the trimmed action call that follows the first
// "Action:" marker, up to the end of that line. It returns an error when the
// text contains no "Action:" marker.
func (p *UITARSContentParser) extractActionString(text string) (string, error) {
	m := actionLineRe.FindStringSubmatch(text)
	if m == nil {
		return "", fmt.Errorf("no Action: found")
	}
	return strings.TrimSpace(m[1]), nil
}
// normalizeCoordinates normalizes the text by converting points to coordinates and replacing keywords
func ( p * UITARSContentParser ) normalizeCoordinates ( text string ) string {
// Convert point tags to coordinate format
if strings . Contains ( text , "<point>" ) {
// support <point>x1 y1 x2 y2</point> or <point>x y</point>
re := regexp . MustCompile ( ` <point>(\d+)\s+(\d+)(?:\s+(\d+)\s+(\d+))?</point> ` )
text = re . ReplaceAllStringFunc ( text , func ( match string ) string {
submatches := re . FindStringSubmatch ( match )
if submatches [ 3 ] != "" && submatches [ 4 ] != "" {
// 4 numbers
return fmt . Sprintf ( "(%s,%s,%s,%s)" ,
submatches [ 1 ] , submatches [ 2 ] , submatches [ 3 ] , submatches [ 4 ] )
}
// 2 numbers
return fmt . Sprintf ( "(%s,%s)" , submatches [ 1 ] , submatches [ 2 ] )
} )
}
// Convert bbox tags to coordinate format
if strings . Contains ( text , "<bbox>" ) {
// support <bbox>x1 y1 x2 y2</bbox>
re := regexp . MustCompile ( ` <bbox>(\d+)\s+(\d+)\s+(\d+)\s+(\d+)</bbox> ` )
text = re . ReplaceAllStringFunc ( text , func ( match string ) string {
submatches := re . FindStringSubmatch ( match )
// 4 numbers for bbox
return fmt . Sprintf ( "(%s,%s,%s,%s)" ,
submatches [ 1 ] , submatches [ 2 ] , submatches [ 3 ] , submatches [ 4 ] )
} )
}
// Convert bracket format [x1, y1, x2, y2] to coordinate format
if strings . Contains ( text , "[" ) && strings . Contains ( text , "]" ) {
// support [x1, y1, x2, y2] format
re := regexp . MustCompile ( ` \[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\] ` )
text = re . ReplaceAllStringFunc ( text , func ( match string ) string {
submatches := re . FindStringSubmatch ( match )
// 4 numbers for bracket format
return fmt . Sprintf ( "(%s,%s,%s,%s)" ,
submatches [ 1 ] , submatches [ 2 ] , submatches [ 3 ] , submatches [ 4 ] )
} )
}
// Legacy parameter name replacements (keep for backward compatibility)
text = strings . ReplaceAll ( text , "start_point=" , "start_box=" )
text = strings . ReplaceAll ( text , "end_point=" , "end_box=" )
text = strings . ReplaceAll ( text , "point=" , "start_box=" )
// Extract context (thought/reflection)
var thought , reflection string
actionIdx := strings . Index ( text , "Action:" )
prefix := ""
if actionIdx != - 1 {
prefix = text [ : actionIdx ]
}
if strings . HasPrefix ( prefix , "Thought:" ) {
thought = strings . TrimSpace ( strings . TrimPrefix ( prefix , "Thought:" ) )
} else if strings . HasPrefix ( prefix , "Reflection:" ) {
refIdx := strings . Index ( prefix , "Action_Summary:" )
if refIdx != - 1 {
reflection = strings . TrimSpace ( strings . TrimPrefix ( prefix [ : refIdx ] , "Reflection:" ) )
thought = strings . TrimSpace ( strings . TrimPrefix ( prefix [ refIdx : ] , "Action_Summary:" ) )
}
} else if strings . HasPrefix ( prefix , "Action_Summary:" ) {
thought = strings . TrimSpace ( strings . TrimPrefix ( prefix , "Action_Summary:" ) )
}
if ! strings . Contains ( text , "Action:" ) {
return nil , fmt . Errorf ( "no Action: found" )
}
actionStr := strings . SplitN ( text , "Action: " , 2 ) [ 1 ]
return text
}
rawActions := strings . Split ( actionStr , ")\n\n" )
normalizedActions := make ( [ ] string , 0 , len ( rawActions ) )
for _ , act := range rawActions {
actionStr := act
if strings . Contains ( actionStr , "type(content" ) {
if ! strings . HasSuffix ( strings . TrimSpace ( actionStr ) , ") " ) {
actionStr = strings . TrimSpace ( actionStr ) + ")"
}
pattern := regexp . MustCompile ( ` type\(content='(.*?)'\) ` )
m := pattern . FindStringSubmatch ( actionStr )
if len ( m ) > 1 {
content := m [ 1 ]
actionStr = "type(content='" + escapeSingleQuotes ( content ) + "')"
} else {
return nil , fmt . Errorf ( "pattern not found in the input string" )
}
}
if ! strings . HasSuffix ( strings . TrimSpace ( actionStr ) , ")" ) {
actionStr = strings . TrimSpace ( actionStr ) + ")"
}
normalizedActions = append ( normalizedActions , actionStr )
// parseActionString parses the action string directly
func ( p * UITARSContentParser ) parseActionString ( actionStr string , size types . Size ) ( [ ] Action , error ) {
actions := make ( [ ] Action , 0 , 1 )
// Parse action type and parameters
actionParts := strings . SplitN ( actionStr , "( " , 2 )
if len ( actionParts ) < 2 {
return nil , fmt . Errorf ( "not a function call" )
}
actions := make ( [ ] Action , 0 , len ( normalizedActions ) )
for _ , action := range normalizedActions {
parsed , err := ParseAction ( strings . ReplaceAll ( action , "\n" , "\\n" ) )
if err ! = nil {
return nil , fmt . Errorf ( "Action can't parse: %s" , action )
}
actionTyp e := parsed . Function
param s := parsed . Args
actionInputs := make ( map [ string ] any )
imageWidth : = size . Width
imageHeight := size . Height
for paramName , param := range params {
if p aram == "" {
continue
funcName := strings . TrimSpace ( actionParts [ 0 ] )
paramsText := strings . TrimSuffix ( strings . TrimSpace ( actionParts [ 1 ] ) , ")" )
args : = make ( map [ string ] string )
if paramsText != "" {
// Use regex to extract key=value pairs, handling quoted values properly
r e := regexp . MustCompile ( ` (\w+)\s*=\s*['"]([^'"]*?)['"] ` )
matche s := re . FindAllStringSubmatch ( paramsText , - 1 )
for _ , match := range matches {
if len ( match ) > = 3 {
key := strings . TrimSpace ( match [ 1 ] )
value := strings . TrimSpace ( match [ 2 ] )
args [ key ] = value
}
param = strings . TrimLeft ( param , " " )
actionInputs [ paramName ] = param
if strings . Contains ( paramName , "start_box" ) || strings . Contains ( paramName , "end_box" ) {
oriBox := param
parameters := strings . Split ( strings . ReplaceAll ( strings . ReplaceAll ( oriBox , "(" , "" ) , ")" , "" ) , "," )
floatNumbers := make ( [ ] float64 , 0 , len ( parameters ) )
for _ , numStr := range parameters {
num , err := strconv . ParseFloat ( strings . TrimSpace ( numStr ) , 64 )
}
}
actionInputs , err := p . parseActionInputs ( args , size )
if err != nil {
return nil , err
}
actions = append ( actions , Action {
ActionType : funcName ,
ActionInputs : actionInputs ,
} )
return actions , nil
}
// parseActionInputs parses action parameters and converts coordinates
func ( p * UITARSContentParser ) parseActionInputs ( args map [ string ] string , size types . Size ) ( map [ string ] any , error ) {
actionInputs := make ( map [ string ] any )
imageWidth := size . Width
imageHeight := size . Height
for paramName , param := range args {
if param == "" {
continue
}
param = strings . TrimSpace ( param )
// Convert box coordinates
if strings . Contains ( paramName , "box" ) || strings . Contains ( paramName , "point" ) {
// Extract numbers from the parameter value using regex
re := regexp . MustCompile ( ` \d+ ` )
numbers := re . FindAllString ( param , - 1 )
if len ( numbers ) >= 2 {
coords := make ( [ ] float64 , len ( numbers ) )
for i , numStr := range numbers {
num , err := strconv . ParseFloat ( numStr , 64 )
if err != nil {
log . Error ( ) . Interface ( "parameters" , parameters ) . Msg ( "invalid float action parameters" )
return nil , fmt . Errorf ( "invalid action parameters" )
return nil , fmt . Errorf ( "invalid coordinate: %s" , numStr )
}
// Convert relative coordinates to absolute coordinates
if i % 2 == 0 { // x coordinates
coords [ i ] = math . Round ( ( num / DefaultFactor * float64 ( imageWidth ) ) * 10 ) / 10
} else { // y coordinates
coords [ i ] = math . Round ( ( num / DefaultFactor * float64 ( imageHeight ) ) * 10 ) / 10
}
floatNumbers = append ( floatNumbers , num )
}
// The model generates a 2D coordinate output that represents relative positions.
// To convert these values to image-relative coordinates, divide each component by 1000 to obtain values in the range [0,1].
// The absolute coordinates required by the Action can be calculated by:
// - X absolute = X relative × image width / 1000
// - Y absolute = Y relative × image height / 1000
if len ( floatNumbers ) == 2 {
floatNumbers [ 0 ] = math . Round ( ( floatNumbers [ 0 ] / DefaultFactor * float64 ( imageWidth ) ) * 10 ) / 10
floatNumbers [ 1 ] = math . Round ( ( floatNumbers [ 1 ] / DefaultFactor * float64 ( imageHeight ) ) * 10 ) / 10
} else if len ( floatNumbers ) == 4 {
floatNumbers [ 0 ] = math . Round ( ( floatNumbers [ 0 ] / DefaultFactor * float64 ( imageWidth ) ) * 10 ) / 10
floatNumbers [ 1 ] = math . Round ( ( floatNumbers [ 1 ] / DefaultFactor * float64 ( imageHeight ) ) * 10 ) / 10
floatNumbers [ 2 ] = math . Round ( ( floatNumbers [ 2 ] / DefaultFactor * float64 ( imageWidth ) ) * 10 ) / 10
floatNumbers [ 3 ] = math . Round ( ( floatNumbers [ 3 ] / DefaultFactor * float64 ( imageHeight ) ) * 10 ) / 10
} else {
log . Error ( ) . Interface ( "parameters" , floatNumbers ) . Msg ( "invalid float action parameters" )
return nil , fmt . Errorf ( "invalid action parameters" )
}
actionInputs [ paramName ] = floatNumbers
actionInputs [ paramName ] = coords
} else {
actionInputs [ paramName ] = param
}
} else {
// Handle other parameter types (content, key, direction, etc.)
if paramName == "content" {
// Handle escape characters
param = strings . ReplaceAll ( param , "\\n" , "\n" )
param = strings . ReplaceAll ( param , "\\\"" , "\"" )
param = strings . ReplaceAll ( param , "\\'" , "'" )
}
actionInputs [ paramName ] = param
}
actions = append ( actions , Action {
Reflection : reflection ,
Thought : thought ,
ActionType : actionType ,
ActionInputs : actionInputs ,
Text : text ,
}
return actionInputs , nil
}
// convertActionsToToolCalls builds one function-type tool call per action.
//
// The call name is the action type and the arguments are the JSON encoding of
// the action inputs. The ID is the action type followed by "_" and the current
// Unix time in seconds, so two actions of the same type converted within the
// same second receive the same ID. An action whose inputs cannot be marshalled
// is logged and skipped.
func (p *UITARSContentParser) convertActionsToToolCalls(actions []Action) []schema.ToolCall {
	toolCalls := make([]schema.ToolCall, 0, len(actions))
	for _, action := range actions {
		jsonArgs, err := json.Marshal(action.ActionInputs)
		if err != nil {
			log.Error().Err(err).Interface("action", action).Msg("failed to marshal action inputs")
			continue
		}
		toolCalls = append(toolCalls, schema.ToolCall{
			ID:   action.ActionType + "_" + strconv.FormatInt(time.Now().Unix(), 10),
			Type: "function",
			Function: schema.FunctionCall{
				Name:      action.ActionType,
				Arguments: string(jsonArgs),
			},
		})
	}
	return toolCalls
}
// Action represents a parsed action with its context.
type Action struct {
	// Reflection is the reflection text that accompanied the action, if any.
	Reflection string `json:"reflection"`
	// Thought is the thought text that accompanied the action, if any.
	Thought string `json:"thought"`
	// ActionType is the name of the action call, e.g. the part before "(".
	ActionType string `json:"action_type"`
	// ActionInputs maps each parameter name to its parsed value.
	ActionInputs map[string]any `json:"action_inputs"`
	// Text is the text the action was parsed from.
	Text string `json:"text"`
}
// ParsedActionArgs holds the outcome of parsing an action string: the name of
// the called function and its arguments keyed by parameter name.
type ParsedActionArgs struct {
	// Function is the name of the action function.
	Function string
	// Args maps each parameter name to its raw string value.
	Args map[string]string
}
// pointTagRe matches <point>x y</point> and <point>x1 y1 x2 y2</point>.
var pointTagRe = regexp.MustCompile(`<point>(\d+)\s+(\d+)(?:\s+(\d+)\s+(\d+))?</point>`)

// convertPointToCoordinates replaces every <point>x y</point> tag in text with
// (x,y) and every <point>x1 y1 x2 y2</point> tag with (x1,y1,x2,y2). Text
// outside the tags is returned unchanged.
func convertPointToCoordinates(text string) string {
	return pointTagRe.ReplaceAllStringFunc(text, func(match string) string {
		sub := pointTagRe.FindStringSubmatch(match)
		if sub[3] != "" && sub[4] != "" {
			// Four numbers.
			return fmt.Sprintf("(%s,%s,%s,%s)", sub[1], sub[2], sub[3], sub[4])
		}
		// Two numbers.
		return fmt.Sprintf("(%s,%s)", sub[1], sub[2])
	})
}
// escapeSingleQuotes returns text with a backslash inserted before every
// single quote that is not already directly preceded by a backslash. All other
// bytes are copied through unchanged.
func escapeSingleQuotes(text string) string {
	var out strings.Builder
	for idx := 0; idx < len(text); idx++ {
		ch := text[idx]
		alreadyEscaped := idx > 0 && text[idx-1] == '\\'
		if ch != '\'' || alreadyEscaped {
			out.WriteByte(ch)
			continue
		}
		out.WriteString("\\'")
	}
	return out.String()
}
// ParseAction parses an action string into function name and arguments.
func ParseAction ( actionStr string ) ( * ParsedActionArgs , error ) {
re := regexp . MustCompile ( ` ^(\w+)\((.*)\)$ ` )
matche s := re . FindStringSubmatch ( actionStr )
if len ( matche s) < 3 {
func ParseAction ( actionStr string ) ( * ParsedAction , error ) {
// Parse action type and parameters
actionPart s := strings . SplitN ( actionStr , "(" , 2 )
if len ( actionPart s) < 2 {
return nil , fmt . Errorf ( "not a function call" )
}
funcName := matches [ 1 ]
argsStr := matche s[ 2 ]
funcName := strings . TrimSpace ( actionPart s[ 0 ] )
paramsText := strings . TrimSuffix ( strings . TrimSpace ( actionParts [ 1 ] ) , ")" )
args := make ( map [ string ] string )
argRe : = regexp . MustCompile ( ` (\w+)\s*=\s*'([^']*)' ` )
for _ , m := range argRe . FindAllStringSubmatch ( argsStr , - 1 ) {
args [ m [ 1 ] ] = m [ 2 ]
if paramsText ! = "" {
// Split parameters by comma and parse key=value pairs
for _ , param := range strings . Split ( paramsText , "," ) {
param = strings . TrimSpace ( param )
if strings . Contains ( param , "=" ) {
parts := strings . SplitN ( param , "=" , 2 )
key := strings . TrimSpace ( parts [ 0 ] )
value := strings . TrimSpace ( parts [ 1 ] )
// Remove surrounding quotes
value = strings . Trim ( value , "'\"" )
args [ key ] = value
}
}
}
return & ParsedActionArgs { Function : funcName , Args : args } , nil
return & ParsedAction { Function : funcName , Args : args } , nil
}
// ParsedAction holds the outcome of parsing an action string: the name of the
// called function and its arguments keyed by parameter name.
type ParsedAction struct {
	// Function is the name of the action function.
	Function string
	// Args maps each parameter name to its raw string value.
	Args map[string]string
}