This commit is contained in:
lilong.129
2025-05-22 22:52:47 +08:00
parent 3b77ade24f
commit 009bfa4ecb
9 changed files with 421 additions and 571 deletions

157
uixt/ai/parser_default.go Normal file
View File

@@ -0,0 +1,157 @@
package ai
import (
"fmt"
"regexp"
"strconv"
"strings"
"github.com/httprunner/httprunner/v5/internal/json"
"github.com/httprunner/httprunner/v5/uixt/option"
"github.com/httprunner/httprunner/v5/uixt/types"
"github.com/pkg/errors"
)
// LLMContentParser turns the raw text content of an LLM response into a
// PlanningResult. Each implementation corresponds to a model type and to the
// system prompt that asks the model to answer in the format the parser reads.
type LLMContentParser interface {
// SystemPrompt returns the system prompt associated with this parser.
SystemPrompt() string
// Parse converts the response content into a PlanningResult.
// NOTE(review): size is presumably the size of the screenshot sent to the
// model; how (and whether) it is used depends on the implementation.
Parse(content string, size types.Size) (*PlanningResult, error)
}
// NewLLMContentParser returns the content parser that matches modelType.
// UI-TARS services get a UITARSContentParser; every other service type falls
// back to the JSONContentParser. Each parser is created with its own system
// prompt.
func NewLLMContentParser(modelType option.LLMServiceType) LLMContentParser {
	if modelType == option.LLMServiceTypeUITARS {
		return &UITARSContentParser{systemPrompt: uiTarsPlanningPrompt}
	}
	return &JSONContentParser{systemPrompt: defaultPlanningResponseJsonFormat}
}
// JSONContentParser parses model responses that are formatted as a JSON string.
type JSONContentParser struct {
// systemPrompt is the prompt handed back by SystemPrompt.
systemPrompt string
}
// SystemPrompt returns the system prompt this parser was created with.
func (p *JSONContentParser) SystemPrompt() string {
return p.systemPrompt
}
// Parse decodes a JSON-formatted model response into a PlanningResult.
//
// The content may be wrapped in a Markdown code fence, either "```json ... ```"
// or a bare "``` ... ```"; the fence is removed before decoding. An error is
// returned when the JSON cannot be decoded, when the response carries a
// non-empty "error" field, when it contains no actions, or when an action's
// coordinates cannot be normalized. Only Actions and ActionSummary are copied
// into the returned result.
//
// size is not used by this parser.
func (p *JSONContentParser) Parse(content string, size types.Size) (*PlanningResult, error) {
	content = strings.TrimSpace(content)
	if strings.HasPrefix(content, "```") && strings.HasSuffix(content, "```") {
		content = strings.TrimSuffix(strings.TrimPrefix(content, "```"), "```")
		// drop the optional language tag that follows the opening fence
		content = strings.TrimPrefix(content, "json")
	}
	content = strings.TrimSpace(content)

	var response PlanningResult
	if err := json.Unmarshal([]byte(content), &response); err != nil {
		// %w keeps the decoding error available to errors.Is / errors.As
		return nil, fmt.Errorf("failed to parse VLM response: %w", err)
	}
	if response.Error != "" {
		return nil, errors.New(response.Error)
	}
	if len(response.Actions) == 0 {
		return nil, errors.New("no actions returned from VLM")
	}

	// normalize actions
	normalizedActions := make([]Action, 0, len(response.Actions))
	for i := range response.Actions {
		// copy the element so that no pointer to the loop variable is taken
		action := response.Actions[i]
		if err := normalizeAction(&action); err != nil {
			return nil, errors.Wrap(err, "failed to normalize action")
		}
		normalizedActions = append(normalizedActions, action)
	}

	return &PlanningResult{
		Actions:       normalizedActions,
		ActionSummary: response.ActionSummary,
	}, nil
}
// normalizeAction converts the string-encoded "startBox" and "endBox" inputs of
// a click or drag action into numeric coordinates, updating ActionInputs in
// place. Inputs that are absent or not strings are left untouched, and other
// action types are ignored.
func normalizeAction(action *Action) error {
	if action.ActionType != "click" && action.ActionType != "drag" {
		return nil
	}

	inputs := action.ActionInputs
	if raw, isString := inputs["startBox"].(string); isString {
		coords, err := normalizeCoordinates(raw)
		if err != nil {
			return fmt.Errorf("failed to normalize startBox: %w", err)
		}
		inputs["startBox"] = coords
	}
	if raw, isString := inputs["endBox"].(string); isString {
		coords, err := normalizeCoordinates(raw)
		if err != nil {
			return fmt.Errorf("failed to normalize endBox: %w", err)
		}
		inputs["endBox"] = coords
	}
	return nil
}
// bboxCoordPattern matches "<bbox>x1 y1 x2 y2</bbox>" with non-negative
// integer components. It is compiled once instead of on every call.
var bboxCoordPattern = regexp.MustCompile(`<bbox>(\d+)\s+(\d+)\s+(\d+)\s+(\d+)</bbox>`)

// normalizeCoordinates converts a coordinate string into numbers.
//
// Two input shapes are recognised:
//   - "<bbox>x1 y1 x2 y2</bbox>": the centre [x, y] of the box is returned;
//   - a comma-separated list, optionally wrapped in brackets or parentheses,
//     e.g. "[100, 200]" or "(100, 200)": the numbers are returned unchanged.
//
// No scaling is applied to the values. An error is returned for an empty
// string, for a string in neither format, and for a component that
// strconv.ParseFloat cannot parse.
func normalizeCoordinates(coordStr string) (coords []float64, err error) {
	if coordStr == "" {
		return nil, fmt.Errorf("empty coordinate string")
	}

	// handle BBox format: <bbox>x1 y1 x2 y2</bbox>
	if m := bboxCoordPattern.FindStringSubmatch(coordStr); m != nil {
		box := make([]float64, 4)
		for i, part := range m[1:] {
			val, e := strconv.ParseFloat(part, 64)
			if e != nil {
				return nil, fmt.Errorf("failed to parse coordinate value '%s': %w", part, e)
			}
			box[i] = val
		}
		// reduce the box to its centre point
		return []float64{(box[0] + box[2]) / 2, (box[1] + box[3]) / 2}, nil
	}

	// handle coordinate string, e.g. "[100, 200]", "(100, 200)"
	if strings.Contains(coordStr, ",") {
		parts := strings.Split(strings.Trim(coordStr, "[]() \t"), ",")
		coords = make([]float64, 0, len(parts))
		for _, part := range parts {
			val, e := strconv.ParseFloat(strings.TrimSpace(part), 64)
			if e != nil {
				return nil, fmt.Errorf("failed to parse coordinate string: %w", e)
			}
			coords = append(coords, val)
		}
		return coords, nil
	}

	return nil, fmt.Errorf("invalid coordinate string format: %s", coordStr)
}

33
uixt/ai/parser_test.go Normal file
View File

@@ -0,0 +1,33 @@
package ai
import (
"testing"
"github.com/httprunner/httprunner/v5/uixt/types"
"github.com/stretchr/testify/assert"
)
// TestParseAction checks that ParseAction splits an action call into its
// function name and its quoted keyword arguments.
func TestParseAction(t *testing.T) {
	actionStr := "click(point='<point>200 300</point>')"
	result, err := ParseAction(actionStr)
	if err != nil {
		t.Fatal(err)
	}
	// testify takes (expected, actual); the reverse order yields misleading
	// failure messages.
	assert.Equal(t, "click", result.Function)
	assert.Equal(t, "<point>200 300</point>", result.Args["point"])
}
// TestParseActionToStructureOutput checks that UITARSContentParser.Parse turns
// "point" style parameters into a "start_box" input, for both the two-number
// and the four-number <point> forms.
func TestParseActionToStructureOutput(t *testing.T) {
	parser := &UITARSContentParser{}

	text := "Thought: test\nAction: click(point='<point>200 300</point>')"
	result, err := parser.Parse(text, types.Size{Height: 224, Width: 224})
	// stop here on failure: result is nil when err is set, indexing it would panic
	if err != nil {
		t.Fatalf("parse two-number point: %v", err)
	}
	if len(result.Actions) == 0 {
		t.Fatal("no actions returned")
	}
	assert.Equal(t, "click", result.Actions[0].ActionType)
	assert.Contains(t, result.Actions[0].ActionInputs, "start_box")
	// relative (200, 300) on a 224x224 image: 200/1000*224 = 44.8, 300/1000*224 = 67.2
	assert.Equal(t, []float64{44.8, 67.2}, result.Actions[0].ActionInputs["start_box"])

	text = "Thought: 我看到页面上有几个帖子,第二个帖子的标题是\"字节四年,头发白了\"。要完成任务,我需要点击这个帖子下方的作者头像,这样就能进入作者的个人主页了。\nAction: click(start_point='<point>550 450 550 450</point>')"
	result, err = parser.Parse(text, types.Size{Height: 2341, Width: 1024})
	if err != nil {
		t.Fatalf("parse four-number point: %v", err)
	}
	if len(result.Actions) == 0 {
		t.Fatal("no actions returned")
	}
	assert.Equal(t, "click", result.Actions[0].ActionType)
	assert.Contains(t, result.Actions[0].ActionInputs, "start_box")
}

207
uixt/ai/parser_ui_tars.go Normal file
View File

@@ -0,0 +1,207 @@
package ai
import (
"fmt"
"math"
"regexp"
"strconv"
"strings"
"github.com/httprunner/httprunner/v5/uixt/types"
"github.com/rs/zerolog/log"
)
// reference:
// https://github.com/bytedance/UI-TARS/blob/main/codes/ui_tars/action_parser.py
const (
// DefaultFactor is the divisor applied to the coordinates emitted by the
// model before they are multiplied by the image width or height in
// UITARSContentParser.Parse.
DefaultFactor = 1000
)
// UITARSContentParser parses responses written in the Thought/Action format.
type UITARSContentParser struct {
// systemPrompt is the prompt handed back by SystemPrompt.
systemPrompt string
}
// SystemPrompt returns the system prompt this parser was created with.
func (p *UITARSContentParser) SystemPrompt() string {
return p.systemPrompt
}
// uiTarsTypeContentPattern extracts the content argument of a
// type(content='...') action; (?s) lets the content span several lines.
var uiTarsTypeContentPattern = regexp.MustCompile(`(?s)type\(content='(.*?)'\)`)

// uiTarsBoxCleaner removes the characters that may wrap a coordinate list such
// as "(x,y)" or "[x1, y1, x2, y2]".
var uiTarsBoxCleaner = strings.NewReplacer("(", "", ")", "", "[", "", "]", "")

// Parse converts a UI-TARS response of the form
//
//	Thought: ...
//	Action: click(start_box='(x,y)')
//
// into structured actions. The text before "Action:" may start with
// "Thought:", "Reflection:" (optionally followed by "Action_Summary:") or
// "Action_Summary:"; it is copied into every returned action. Several actions
// may follow "Action:", separated by a blank line.
//
// <point>...</point> values and the point=, start_point= and end_point=
// parameters are rewritten to start_box / end_box. Box parameters are returned
// as []float64 scaled from the model's 0..DefaultFactor range to size; all
// other non-empty parameters are returned as strings.
//
// An error is returned when the response has no "Action:" section, when an
// action is not a function call, or when a box parameter is not a list of two
// or four numbers.
func (p *UITARSContentParser) Parse(content string, size types.Size) (*PlanningResult, error) {
	text := convertPointToCoordinates(strings.TrimSpace(content))
	// "point=" must be replaced last, otherwise it would also rewrite the tail
	// of "start_point=" and "end_point=".
	text = strings.ReplaceAll(text, "start_point=", "start_box=")
	text = strings.ReplaceAll(text, "end_point=", "end_box=")
	text = strings.ReplaceAll(text, "point=", "start_box=")

	// Split on "Action:" without requiring a trailing space, so that
	// "Action:click(...)" or "Action:\nclick(...)" is parsed instead of
	// causing an out-of-range panic.
	prefix, actionPart, found := strings.Cut(text, "Action:")
	if !found {
		return nil, fmt.Errorf("no Action: found")
	}
	thought, reflection := splitUITARSContext(prefix)

	rawActions := strings.Split(strings.TrimSpace(actionPart), ")\n\n")
	actions := make([]Action, 0, len(rawActions))
	for _, raw := range rawActions {
		actionStr := strings.TrimSpace(raw)
		// the split above consumes the closing parenthesis of all but the last action
		if !strings.HasSuffix(actionStr, ")") {
			actionStr += ")"
		}
		if strings.Contains(actionStr, "type(content") {
			m := uiTarsTypeContentPattern.FindStringSubmatch(actionStr)
			if m == nil {
				return nil, fmt.Errorf("pattern not found in the input string")
			}
			// the typed text may itself contain single quotes
			actionStr = "type(content='" + escapeSingleQuotes(m[1]) + "')"
		}

		parsed, err := ParseAction(strings.ReplaceAll(actionStr, "\n", "\\n"))
		if err != nil {
			return nil, fmt.Errorf("action can't be parsed: %q: %w", actionStr, err)
		}

		actionInputs := make(map[string]any, len(parsed.Args))
		for name, value := range parsed.Args {
			if value == "" {
				continue
			}
			value = strings.TrimLeft(value, " ")
			if strings.Contains(name, "start_box") || strings.Contains(name, "end_box") {
				box, err := scaleUITARSBox(value, size)
				if err != nil {
					return nil, err
				}
				actionInputs[name] = box
				continue
			}
			actionInputs[name] = value
		}

		actions = append(actions, Action{
			Reflection:   reflection,
			Thought:      thought,
			ActionType:   parsed.Function,
			ActionInputs: actionInputs,
			Text:         text,
		})
	}

	return &PlanningResult{
		Actions: actions,
	}, nil
}

// splitUITARSContext extracts the thought and reflection from the text that
// precedes "Action:". A Reflection without an Action_Summary is kept as the
// reflection instead of being dropped.
func splitUITARSContext(prefix string) (thought, reflection string) {
	switch {
	case strings.HasPrefix(prefix, "Thought:"):
		thought = strings.TrimSpace(strings.TrimPrefix(prefix, "Thought:"))
	case strings.HasPrefix(prefix, "Reflection:"):
		before, after, hasSummary := strings.Cut(strings.TrimPrefix(prefix, "Reflection:"), "Action_Summary:")
		reflection = strings.TrimSpace(before)
		if hasSummary {
			thought = strings.TrimSpace(after)
		}
	case strings.HasPrefix(prefix, "Action_Summary:"):
		thought = strings.TrimSpace(strings.TrimPrefix(prefix, "Action_Summary:"))
	}
	return thought, reflection
}

// scaleUITARSBox parses a box such as "(x,y)" or "(x1,y1,x2,y2)" and converts
// it to image coordinates, rounded to one decimal place:
//
//	X absolute = X relative × image width / DefaultFactor
//	Y absolute = Y relative × image height / DefaultFactor
func scaleUITARSBox(box string, size types.Size) ([]float64, error) {
	parts := strings.Split(uiTarsBoxCleaner.Replace(box), ",")
	coords := make([]float64, 0, len(parts))
	for _, part := range parts {
		num, err := strconv.ParseFloat(strings.TrimSpace(part), 64)
		if err != nil {
			log.Error().Interface("parameters", parts).Msg("invalid float action parameters")
			return nil, fmt.Errorf("invalid action parameters %q: %w", box, err)
		}
		coords = append(coords, num)
	}
	if len(coords) != 2 && len(coords) != 4 {
		log.Error().Interface("parameters", coords).Msg("invalid float action parameters")
		return nil, fmt.Errorf("invalid action parameters %q: want 2 or 4 numbers, got %d", box, len(coords))
	}

	width, height := float64(size.Width), float64(size.Height)
	for i := range coords {
		// even indexes are x values, odd indexes are y values
		dim := width
		if i%2 == 1 {
			dim = height
		}
		coords[i] = math.Round((coords[i]/DefaultFactor*dim)*10) / 10
	}
	return coords, nil
}
// Action represents a parsed action with its context.
type Action struct {
// Reflection is the text of the "Reflection:" section of the response, if any.
Reflection string `json:"reflection"`
// Thought is the text of the "Thought:" or "Action_Summary:" section, if any.
Thought string `json:"thought"`
// ActionType is the name of the called action, e.g. "click".
ActionType string `json:"action_type"`
// ActionInputs maps parameter names to their values. UITARSContentParser
// stores box parameters as []float64 and other parameters as strings.
ActionInputs map[string]any `json:"action_inputs"`
// Text is the response text the action was parsed from, as set by
// UITARSContentParser after its parameter rewrites.
Text string `json:"text"`
}
// ParsedActionArgs represents the result of parsing an action string.
type ParsedActionArgs struct {
// Function is the name of the called function, e.g. "click".
Function string
// Args maps each keyword argument name to its single-quoted string value.
Args map[string]string
}
// pointTagPattern matches <point>x y</point> and <point>x1 y1 x2 y2</point>
// with non-negative integer components. It is compiled once instead of on
// every call.
var pointTagPattern = regexp.MustCompile(`<point>(\d+)\s+(\d+)(?:\s+(\d+)\s+(\d+))?</point>`)

// convertPointToCoordinates replaces every <point>x y</point> in text with
// "(x,y)" and every <point>x1 y1 x2 y2</point> with "(x1,y1,x2,y2)". Text that
// contains no such tag is returned unchanged.
func convertPointToCoordinates(text string) string {
	return pointTagPattern.ReplaceAllStringFunc(text, func(match string) string {
		sub := pointTagPattern.FindStringSubmatch(match)
		if sub[3] != "" && sub[4] != "" {
			// four numbers
			return fmt.Sprintf("(%s,%s,%s,%s)", sub[1], sub[2], sub[3], sub[4])
		}
		// two numbers
		return fmt.Sprintf("(%s,%s)", sub[1], sub[2])
	})
}
// escapeSingleQuotes returns text with a backslash inserted in front of every
// single quote that is not already directly preceded by a backslash.
func escapeSingleQuotes(text string) string {
	var out strings.Builder
	var prev byte // byte that precedes the current one in text; 0 at the start
	for i := 0; i < len(text); i++ {
		cur := text[i]
		if cur == '\'' && prev != '\\' {
			out.WriteString("\\'")
		} else {
			out.WriteByte(cur)
		}
		prev = cur
	}
	return out.String()
}
var (
	// actionCallPattern matches a whole call of the form name(args).
	actionCallPattern = regexp.MustCompile(`^(\w+)\((.*)\)$`)
	// actionArgPattern matches name='value'. Inside the value a backslash
	// escapes the following character, so \' does not end the value; a lone
	// trailing backslash is still accepted as a literal character.
	actionArgPattern = regexp.MustCompile(`(\w+)\s*=\s*'((?:[^'\\]|\\.|\\)*)'`)
)

// ParseAction parses an action string such as
//
//	drag(start_box='(1,2)', end_box='(3,4)')
//
// into its function name and its single-quoted keyword arguments. Leading and
// trailing whitespace is ignored. Escaped quotes (\') inside a value are kept
// as part of the value and returned unescaped; other escape sequences are
// returned as written. An error is returned when actionStr is not of the form
// name(...).
func ParseAction(actionStr string) (*ParsedActionArgs, error) {
	matches := actionCallPattern.FindStringSubmatch(strings.TrimSpace(actionStr))
	if matches == nil {
		return nil, fmt.Errorf("not a function call")
	}

	args := make(map[string]string)
	for _, m := range actionArgPattern.FindAllStringSubmatch(matches[2], -1) {
		args[m[1]] = strings.ReplaceAll(m[2], `\'`, "'")
	}
	return &ParsedActionArgs{Function: matches[1], Args: args}, nil
}

View File

@@ -28,7 +28,7 @@ type PlanningOptions struct {
// PlanningResult represents the result of planning
type PlanningResult struct {
ToolCalls []schema.ToolCall `json:"tool_calls"` // TODO: merge to NextActions
NextActions []ParsedAction `json:"actions"`
Actions []Action `json:"actions"`
ActionSummary string `json:"summary"`
Error string `json:"error,omitempty"`
}
@@ -138,7 +138,7 @@ func (p *Planner) Call(ctx context.Context, opts *PlanningOptions) (*PlanningRes
log.Info().
Interface("summary", result.ActionSummary).
Interface("actions", result.NextActions).
Interface("actions", result.Actions).
Msg("get VLM planning result")
return result, nil
}

View File

@@ -1,431 +0,0 @@
package ai
import (
"fmt"
"math"
"regexp"
"strconv"
"strings"
"github.com/httprunner/httprunner/v5/internal/json"
"github.com/httprunner/httprunner/v5/uixt/option"
"github.com/httprunner/httprunner/v5/uixt/types"
"github.com/pkg/errors"
"github.com/rs/zerolog/log"
)
// LLMContentParser turns the raw text content of an LLM response into a
// PlanningResult. Each implementation corresponds to a model type and to the
// system prompt that asks the model to answer in the format the parser reads.
type LLMContentParser interface {
// SystemPrompt returns the system prompt associated with this parser.
SystemPrompt() string
// Parse converts the response content into a PlanningResult.
// NOTE(review): size is presumably the size of the screenshot sent to the
// model; how (and whether) it is used depends on the implementation.
Parse(content string, size types.Size) (*PlanningResult, error)
}
// NewLLMContentParser returns the content parser that matches modelType.
// UI-TARS services get a UITARSContentParser; every other service type falls
// back to the JSONContentParser. Each parser is created with its own system
// prompt.
func NewLLMContentParser(modelType option.LLMServiceType) LLMContentParser {
	if modelType == option.LLMServiceTypeUITARS {
		return &UITARSContentParser{systemPrompt: uiTarsPlanningPrompt}
	}
	return &JSONContentParser{systemPrompt: defaultPlanningResponseJsonFormat}
}
// ParsedAction represents a parsed action from the VLM response
type ParsedAction struct {
// ActionType is the kind of action, e.g. ActionTypeClick.
ActionType ActionType `json:"actionType"`
// ActionInputs holds the action parameters keyed by name, such as
// "startBox", "endBox", "content" or "direction".
ActionInputs map[string]interface{} `json:"actionInputs"`
// Thought is the model's reasoning that accompanied the action.
Thought string `json:"thought"`
}
// ActionType names the kind of UI action carried by a ParsedAction.
type ActionType string
// Action types known to this package.
const (
ActionTypeClick ActionType = "click"
ActionTypeTap ActionType = "tap"
ActionTypeDrag ActionType = "drag"
ActionTypeSwipe ActionType = "swipe"
ActionTypeWait ActionType = "wait"
ActionTypeFinished ActionType = "finished"
ActionTypeCallUser ActionType = "call_user"
ActionTypeType ActionType = "type"
ActionTypeScroll ActionType = "scroll"
)
// UITARSContentParser parses responses written in the Thought/Action format.
type UITARSContentParser struct {
// systemPrompt is the prompt handed back by SystemPrompt.
systemPrompt string
}
// SystemPrompt returns the system prompt this parser was created with.
func (p *UITARSContentParser) SystemPrompt() string {
return p.systemPrompt
}
// Parse converts a "Thought: ... Action: ..." response into a PlanningResult.
// The Thought section is optional; the Action section is mandatory and is
// handed to parseActionText, whose result is post-processed by
// processVLMResponse using size.
func (p *UITARSContentParser) Parse(content string, size types.Size) (*PlanningResult, error) {
	// the thought is everything between "Thought:" and the first "Action:"
	var thought string
	if m := regexp.MustCompile(`(?is)Thought:(.+?)Action:`).FindStringSubmatch(content); len(m) > 1 {
		thought = strings.TrimSpace(m[1])
	}

	// the action text is everything after "Action:", e.g. "click(start_box='(552,454)')"
	actionMatch := regexp.MustCompile(`(?is)Action:(.+)`).FindStringSubmatch(content)
	if len(actionMatch) < 2 {
		return nil, errors.New("no action found in the response")
	}

	parsedActions, err := parseActionText(strings.TrimSpace(actionMatch[1]), thought)
	if err != nil {
		return nil, err
	}

	result, err := processVLMResponse(parsedActions, size)
	if err != nil {
		return nil, errors.Wrap(err, "process VLM response failed")
	}
	return result, nil
}
// parseActionText parses the action text to extract the action type and parameters.
//
// actionsText is split into lines and every line is matched against one regular
// expression per supported action; each match yields a ParsedAction carrying
// thought. An error is returned when a coordinate cannot be normalized or when
// no line matches any action.
//
// NOTE(review): the expressions are kept in a map, so when a single line
// matches several of them the order of the resulting actions follows Go's
// unspecified map iteration order.
func parseActionText(actionsText, thought string) ([]ParsedAction, error) {
// remove trailing comments
// NOTE(review): this cuts at the first '#' of the whole text, so a '#' inside
// a quoted argument, or any action after a commented line, is dropped as well.
if idx := strings.Index(actionsText, "#"); idx > 0 {
actionsText = strings.TrimSpace(actionsText[:idx])
}
// supported action types and regexes
actionRegexes := map[ActionType]*regexp.Regexp{
"click": regexp.MustCompile(`click\(start_box='([^']+)'\)`),
"left_double": regexp.MustCompile(`left_double\(start_box='([^']+)'\)`),
"right_single": regexp.MustCompile(`right_single\(start_box='([^']+)'\)`),
"drag": regexp.MustCompile(`drag\(start_box='([^']+)', end_box='([^']+)'\)`),
"type": regexp.MustCompile(`type\(content='([^']+)'\)`),
"scroll": regexp.MustCompile(`scroll\(start_box='([^']+)', direction='([^']+)'\)`),
"wait": regexp.MustCompile(`wait\(\)`),
"finished": regexp.MustCompile(`finished\(content='([^']+)'\)`),
"call_user": regexp.MustCompile(`call_user\(\)`),
}
// one or multiple actions, separated by newline
// "click(start_box='<bbox>229 379 229 379</bbox>')
// "click(start_box='<bbox>229 379 229 379</bbox>')\n\nclick(start_box='<bbox>769 519 769 519</bbox>')"
parsedActions := make([]ParsedAction, 0)
for _, actionText := range strings.Split(actionsText, "\n") {
actionText = strings.TrimSpace(actionText)
for actionType, regex := range actionRegexes {
matches := regex.FindStringSubmatch(actionText)
if len(matches) == 0 {
continue
}
var action ParsedAction
action.ActionType = actionType
action.ActionInputs = make(map[string]interface{})
action.Thought = thought
// parse parameters based on action type
// NOTE(review): "left_double" and "right_single" have no case below, and the
// content captured for "finished" is not stored, so those actions are
// appended with empty inputs.
switch actionType {
case ActionTypeClick:
if len(matches) > 1 {
coord, err := normalizeCoordinates(matches[1])
if err != nil {
return nil, errors.Wrapf(err, "normalize point failed: %s", matches[1])
}
action.ActionInputs["startBox"] = coord
}
case ActionTypeDrag:
if len(matches) > 2 {
// handle start point
startBox, err := normalizeCoordinates(matches[1])
if err != nil {
return nil, errors.Wrapf(err, "normalize startBox failed: %s", matches[1])
}
action.ActionInputs["startBox"] = startBox
// handle end point
endBox, err := normalizeCoordinates(matches[2])
if err != nil {
return nil, errors.Wrapf(err, "normalize endBox failed: %s", matches[2])
}
action.ActionInputs["endBox"] = endBox
}
case ActionTypeType:
if len(matches) > 1 {
action.ActionInputs["content"] = matches[1]
}
case ActionTypeScroll:
if len(matches) > 2 {
startBox, err := normalizeCoordinates(matches[1])
if err != nil {
return nil, errors.Wrapf(err, "normalize startBox failed: %s", matches[1])
}
action.ActionInputs["startBox"] = startBox
action.ActionInputs["direction"] = matches[2]
}
case ActionTypeWait, ActionTypeFinished, ActionTypeCallUser:
// no parameters are stored for these actions
}
parsedActions = append(parsedActions, action)
}
}
if len(parsedActions) == 0 {
return nil, fmt.Errorf("no valid actions returned from VLM")
}
return parsedActions, nil
}
// normalizeCoordinates converts a coordinate string into numbers.
//
// "<bbox>x1 y1 x2 y2</bbox>" yields the centre [x, y] of the box. A
// comma-separated list, optionally wrapped in brackets or parentheses such as
// "[100, 200]" or "(100, 200)", is decoded as a JSON array and returned as-is.
// An empty string or any other format is an error.
func normalizeCoordinates(coordStr string) (coords []float64, err error) {
	if coordStr == "" {
		return nil, fmt.Errorf("empty coordinate string")
	}

	// BBox format: <bbox>x1 y1 x2 y2</bbox>
	bboxPattern := regexp.MustCompile(`<bbox>(\d+\s+\d+\s+\d+\s+\d+)</bbox>`)
	if m := bboxPattern.FindStringSubmatch(coordStr); len(m) > 1 {
		if fields := strings.Fields(m[1]); len(fields) == 4 {
			box := make([]float64, len(fields))
			for idx, field := range fields {
				num, parseErr := strconv.ParseFloat(field, 64)
				if parseErr != nil {
					return nil, fmt.Errorf("failed to parse coordinate value '%s': %w", field, parseErr)
				}
				box[idx] = num
			}
			// reduce the box to its centre point
			return []float64{(box[0] + box[2]) / 2, (box[1] + box[3]) / 2}, nil
		}
	}

	if !strings.Contains(coordStr, ",") {
		return nil, fmt.Errorf("invalid coordinate string format: %s", coordStr)
	}

	// the trim removes any wrapping bracket, so the list is always re-wrapped
	// in "[...]" before it is decoded as a JSON array
	coordStr = strings.Trim(coordStr, "[]() \t")
	if err = json.Unmarshal([]byte("["+coordStr+"]"), &coords); err != nil {
		return nil, fmt.Errorf("failed to parse coordinate string: %w", err)
	}
	return coords, nil
}
// processVLMResponse validates and post-processes the parsed actions and wraps
// them in a PlanningResult. Click and drag coordinates are converted with
// convertCoordinateAction, type actions get their content checked, and unknown
// action types are only logged. An empty action list is an error.
func processVLMResponse(actions []ParsedAction, size types.Size) (*PlanningResult, error) {
	log.Info().Msg("processing VLM response...")
	if len(actions) == 0 {
		return nil, fmt.Errorf("no actions returned from VLM")
	}

	for idx := range actions {
		current := &actions[idx]
		switch current.ActionType {
		case "click":
			if err := convertCoordinateAction(current, "startBox", size); err != nil {
				return nil, errors.Wrap(err, "convert coordinate action failed")
			}
		case "drag":
			// the start box is converted before the end box
			for _, field := range []string{"startBox", "endBox"} {
				if err := convertCoordinateAction(current, field, size); err != nil {
					return nil, errors.Wrap(err, "convert coordinate action failed")
				}
			}
		case "type":
			validateTypeContent(current)
		case "wait", "finished", "call_user":
			// nothing to post-process for these actions
		default:
			log.Printf("warning: unknown action type: %s, will try to continue processing", current.ActionType)
		}
	}

	return &PlanningResult{
		NextActions:   actions,
		ActionSummary: extractActionSummary(actions),
	}, nil
}
// extractActionSummary returns a one-line summary for a list of actions.
//
// The Thought of the first action is used when it is set. Otherwise a summary
// is generated from the type of the first action; for a type action the typed
// text is included, shortened to its first 20 characters followed by "...".
// An empty list yields an empty string.
func extractActionSummary(actions []ParsedAction) string {
	if len(actions) == 0 {
		return ""
	}
	// use the Thought of the first action as summary
	action := actions[0]
	if action.Thought != "" {
		return action.Thought
	}
	// if no Thought, generate summary from action type
	switch action.ActionType {
	case "click":
		return "点击操作"
	case "drag":
		return "拖拽操作"
	case "type":
		content, _ := action.ActionInputs["content"].(string)
		// truncate on rune boundaries: slicing bytes could cut a multi-byte
		// character in half and produce invalid UTF-8
		if runes := []rune(content); len(runes) > 20 {
			content = string(runes[:20]) + "..."
		}
		return fmt.Sprintf("输入文本: %s", content)
	case "wait":
		return "等待操作"
	case "finished":
		return "完成操作"
	case "call_user":
		return "请求用户协助"
	default:
		return fmt.Sprintf("执行 %s 操作", action.ActionType)
	}
}
// convertCoordinateAction rescales, in place, the coordinates stored under
// boxField in action.ActionInputs from the model's relative range to the image.
//
// The model generates coordinates that represent relative positions; dividing
// each component by 1000 gives a value in the range [0,1]. The absolute
// coordinates are therefore:
//   - X absolute = X relative × image width / 1000
//   - Y absolute = Y relative × image height / 1000
//
// Results are rounded to one decimal place. The input must be a []float64 of
// length 2 or 4, otherwise an error is logged and returned.
func convertCoordinateAction(action *ParsedAction, boxField string, size types.Size) error {
	coords, isFloats := action.ActionInputs[boxField].([]float64)
	if !isFloats || (len(coords) != 2 && len(coords) != 4) {
		log.Error().Interface("inputs", action.ActionInputs).Msg("invalid action inputs")
		return fmt.Errorf("invalid action inputs")
	}

	// even indexes are x values (scaled by width), odd indexes are y values
	dims := [2]float64{float64(size.Width), float64(size.Height)}
	for i := range coords {
		coords[i] = math.Round((coords[i]/1000*dims[i%2])*10) / 10
	}
	return nil
}
// validateTypeContent makes sure a type action has a "content" input: when the
// input is missing or is an empty string it is set to "" and a warning is
// logged.
func validateTypeContent(action *ParsedAction) {
	content, present := action.ActionInputs["content"]
	if present && content != "" {
		return
	}
	// default to empty string
	action.ActionInputs["content"] = ""
	log.Warn().Msg("type action missing content parameter, set to default")
}
// JSONContentParser parses model responses that are formatted as a JSON string.
type JSONContentParser struct {
// systemPrompt is the prompt handed back by SystemPrompt.
systemPrompt string
}
// SystemPrompt returns the system prompt this parser was created with.
func (p *JSONContentParser) SystemPrompt() string {
return p.systemPrompt
}
// Parse decodes a JSON-formatted model response into a PlanningResult.
//
// The content may be wrapped in a Markdown code fence, either "```json ... ```"
// or a bare "``` ... ```"; the fence is removed before decoding. An error is
// returned when the JSON cannot be decoded, when the response carries a
// non-empty "error" field, when it contains no actions, or when an action's
// coordinates cannot be normalized. Only NextActions and ActionSummary are
// copied into the returned result.
//
// size is not used by this parser.
func (p *JSONContentParser) Parse(content string, size types.Size) (*PlanningResult, error) {
	content = strings.TrimSpace(content)
	if strings.HasPrefix(content, "```") && strings.HasSuffix(content, "```") {
		content = strings.TrimSuffix(strings.TrimPrefix(content, "```"), "```")
		// drop the optional language tag that follows the opening fence
		content = strings.TrimPrefix(content, "json")
	}
	content = strings.TrimSpace(content)

	var response PlanningResult
	if err := json.Unmarshal([]byte(content), &response); err != nil {
		// %w keeps the decoding error available to errors.Is / errors.As
		return nil, fmt.Errorf("failed to parse VLM response: %w", err)
	}
	if response.Error != "" {
		return nil, errors.New(response.Error)
	}
	if len(response.NextActions) == 0 {
		return nil, errors.New("no actions returned from VLM")
	}

	// normalize actions
	normalizedActions := make([]ParsedAction, 0, len(response.NextActions))
	for i := range response.NextActions {
		// copy the element so that no pointer to the loop variable is taken
		action := response.NextActions[i]
		if err := normalizeAction(&action); err != nil {
			return nil, errors.Wrap(err, "failed to normalize action")
		}
		normalizedActions = append(normalizedActions, action)
	}

	return &PlanningResult{
		NextActions:   normalizedActions,
		ActionSummary: response.ActionSummary,
	}, nil
}
// normalizeAction converts the string-encoded "startBox" and "endBox" inputs of
// a click or drag action into numeric coordinates, updating ActionInputs in
// place. Inputs that are absent or not strings are left untouched, and other
// action types are ignored.
func normalizeAction(action *ParsedAction) error {
	if action.ActionType != "click" && action.ActionType != "drag" {
		return nil
	}

	inputs := action.ActionInputs
	if raw, isString := inputs["startBox"].(string); isString {
		coords, err := normalizeCoordinates(raw)
		if err != nil {
			return fmt.Errorf("failed to normalize startBox: %w", err)
		}
		inputs["startBox"] = coords
	}
	if raw, isString := inputs["endBox"].(string); isString {
		coords, err := normalizeCoordinates(raw)
		if err != nil {
			return fmt.Errorf("failed to normalize endBox: %w", err)
		}
		inputs["endBox"] = coords
	}
	return nil
}

View File

@@ -12,14 +12,14 @@ Action: ...
` + "```" + `
## Action Space
click(start_box='[x1, y1, x2, y2]')
left_double(start_box='[x1, y1, x2, y2]')
right_single(start_box='[x1, y1, x2, y2]')
drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
hotkey(key='')
type(content='') #If you want to submit your input, use "\n" at the end of ` + "`content`" + `.
scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
wait() #Sleep for 5s and take a screenshot to check for any changes.
click(point='<point>x1 y1</point>')
long_press(point='<point>x1 y1</point>')
type(content='') #If you want to submit your input, use "\\n" at the end of ` + "`content`" + `.
scroll(point='<point>x1 y1</point>', direction='down or up or right or left')
open_app(app_name=\'\')
drag(start_point='<point>x1 y1</point>', end_point='<point>x2 y2</point>')
press_home()
press_back()
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
## Note
@@ -30,11 +30,4 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par
`
// system prompt for JSONContentParser
const defaultPlanningResponseJsonFormat = `You are a versatile professional in software UI automation.
## Output Format
` + "```" + `
Thought: ...
Action: ...
` + "```" + `
`
const defaultPlanningResponseJsonFormat = `You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.`

View File

@@ -8,7 +8,6 @@ import (
"github.com/httprunner/httprunner/v5/code"
"github.com/httprunner/httprunner/v5/internal/builtin"
"github.com/httprunner/httprunner/v5/uixt/option"
"github.com/httprunner/httprunner/v5/uixt/types"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
@@ -58,43 +57,12 @@ func TestVLMPlanning(t *testing.T) {
// 验证结果
require.NoError(t, err)
require.NotNil(t, result)
require.NotEmpty(t, result.NextActions)
require.NotEmpty(t, result.Actions)
// 验证动作
action := result.NextActions[0]
action := result.Actions[0]
assert.NotEmpty(t, action.ActionType)
assert.NotEmpty(t, action.Thought)
// 根据动作类型验证参数
switch action.ActionType {
case "click", "drag", "left_double", "right_single", "scroll":
// 这些动作需要验证坐标
assert.NotEmpty(t, action.ActionInputs["startBox"])
// 验证坐标格式
coords, ok := action.ActionInputs["startBox"].([]float64)
require.True(t, ok)
require.True(t, len(coords) >= 2) // 至少有 x, y 坐标
// 验证坐标范围
for _, coord := range coords {
assert.GreaterOrEqual(t, coord, float64(0))
}
case "type":
// 验证文本内容
assert.NotEmpty(t, action.ActionInputs["content"])
case "hotkey":
// 验证按键
assert.NotEmpty(t, action.ActionInputs["key"])
case "wait", "finished", "call_user":
// 这些动作不需要额外参数
default:
t.Fatalf("未知的动作类型: %s", action.ActionType)
}
}
func TestXHSPlanning(t *testing.T) {
@@ -131,43 +99,12 @@ func TestXHSPlanning(t *testing.T) {
// 验证结果
require.NoError(t, err)
require.NotNil(t, result)
require.NotEmpty(t, result.NextActions)
require.NotEmpty(t, result.Actions)
// 验证动作
action := result.NextActions[0]
action := result.Actions[0]
assert.NotEmpty(t, action.ActionType)
assert.NotEmpty(t, action.Thought)
// 根据动作类型验证参数
switch action.ActionType {
case "click", "drag", "left_double", "right_single", "scroll":
// 这些动作需要验证坐标
assert.NotEmpty(t, action.ActionInputs["startBox"])
// 验证坐标格式
coords, ok := action.ActionInputs["startBox"].([]float64)
require.True(t, ok)
require.True(t, len(coords) >= 2) // 至少有 x, y 坐标
// 验证坐标范围
for _, coord := range coords {
assert.GreaterOrEqual(t, coord, float64(0))
}
case "type":
// 验证文本内容
assert.NotEmpty(t, action.ActionInputs["content"])
case "hotkey":
// 验证按键
assert.NotEmpty(t, action.ActionInputs["key"])
case "wait", "finished", "call_user":
// 这些动作不需要额外参数
default:
t.Fatalf("未知的动作类型: %s", action.ActionType)
}
}
func TestChatList(t *testing.T) {
@@ -218,11 +155,11 @@ func TestHandleSwitch(t *testing.T) {
testCases := []struct {
imageFile string
actionType ActionType
actionType string
}{
{"testdata/deepseek_think_off.png", ActionTypeClick},
{"testdata/deepseek_think_on.png", ActionTypeFinished},
{"testdata/deepseek_network_on.png", ActionTypeFinished},
{"testdata/deepseek_think_off.png", "finished"},
{"testdata/deepseek_think_on.png", "finished"},
{"testdata/deepseek_network_on.png", "finished"},
}
for _, tc := range testCases {
@@ -251,7 +188,7 @@ func TestHandleSwitch(t *testing.T) {
// Validate results
require.NoError(t, err)
require.NotNil(t, result)
require.Equal(t, result.NextActions[0].ActionType, tc.actionType,
require.Equal(t, result.Actions[0].ActionType, tc.actionType,
"Unexpected action type for image file: %s", tc.imageFile)
}
}
@@ -336,52 +273,6 @@ func TestValidateInput(t *testing.T) {
}
}
func TestProcessVLMResponse(t *testing.T) {
tests := []struct {
name string
actions []ParsedAction
wantErr bool
}{
{
name: "valid response",
actions: []ParsedAction{
{
ActionType: "click",
ActionInputs: map[string]interface{}{
"startBox": []float64{0.5, 0.5},
},
Thought: "点击中心位置",
},
},
wantErr: false,
},
{
name: "empty actions",
actions: []ParsedAction{},
wantErr: true,
},
}
size := types.Size{
Width: 1000,
Height: 1000,
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result, err := processVLMResponse(tt.actions, size)
if tt.wantErr {
assert.Error(t, err)
assert.Nil(t, result)
return
}
assert.NoError(t, err)
assert.NotNil(t, result)
assert.Equal(t, tt.actions, result.NextActions)
})
}
}
func TestLoadImage(t *testing.T) {
// Test PNG image
pngBase64, pngSize, err := builtin.LoadImage("testdata/llk_1.png")

View File

@@ -40,14 +40,14 @@ func (dExt *XTDriver) AIAction(text string, opts ...option.ActionOption) error {
}
// do actions
for _, action := range result.NextActions {
for _, action := range result.Actions {
switch action.ActionType {
case ai.ActionTypeClick:
case "click":
point := action.ActionInputs["startBox"].([]float64)
if err := dExt.TapAbsXY(point[0], point[1], opts...); err != nil {
return err
}
case ai.ActionTypeFinished:
case "finished":
log.Info().Msg("ai action done")
return nil
}