mirror of
https://github.com/httprunner/httprunner.git
synced 2026-05-24 09:50:00 +08:00
refactor: replace ui-tars parser with https://github.com/bytedance/UI-TARS/blob/main/codes/ui_tars/action_parser.py
This commit is contained in:
157
uixt/ai/parser_default.go
Normal file
157
uixt/ai/parser_default.go
Normal file
@@ -0,0 +1,157 @@
|
||||
package ai
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/httprunner/httprunner/v5/internal/json"
|
||||
"github.com/httprunner/httprunner/v5/uixt/option"
|
||||
"github.com/httprunner/httprunner/v5/uixt/types"
|
||||
"github.com/pkg/errors"
|
||||
)
|
||||
|
||||
// LLMContentParser converts the text content of an LLM response into a
// PlanningResult.
// Each implementation belongs to one model type and carries the system prompt
// that goes with the output format it parses.
type LLMContentParser interface {
	// SystemPrompt returns the system prompt stored in the parser.
	SystemPrompt() string
	// Parse converts the response content into a PlanningResult. size is
	// supplied by the caller; the UI-TARS parser uses it to scale coordinates.
	Parse(content string, size types.Size) (*PlanningResult, error)
}
|
||||
|
||||
func NewLLMContentParser(modelType option.LLMServiceType) LLMContentParser {
|
||||
switch modelType {
|
||||
case option.LLMServiceTypeUITARS:
|
||||
return &UITARSContentParser{
|
||||
systemPrompt: uiTarsPlanningPrompt,
|
||||
}
|
||||
default:
|
||||
return &JSONContentParser{
|
||||
systemPrompt: defaultPlanningResponseJsonFormat,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// JSONContentParser parses the response as JSON string format.
type JSONContentParser struct {
	// systemPrompt is the prompt returned by SystemPrompt.
	systemPrompt string
}

// SystemPrompt returns the system prompt the parser was created with.
func (p *JSONContentParser) SystemPrompt() string {
	return p.systemPrompt
}
|
||||
|
||||
func (p *JSONContentParser) Parse(content string, size types.Size) (*PlanningResult, error) {
|
||||
content = strings.TrimSpace(content)
|
||||
if strings.HasPrefix(content, "```json") && strings.HasSuffix(content, "```") {
|
||||
content = strings.TrimPrefix(content, "```json")
|
||||
content = strings.TrimSuffix(content, "```")
|
||||
}
|
||||
content = strings.TrimSpace(content)
|
||||
|
||||
var response PlanningResult
|
||||
if err := json.Unmarshal([]byte(content), &response); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse VLM response: %v", err)
|
||||
}
|
||||
|
||||
if response.Error != "" {
|
||||
return nil, errors.New(response.Error)
|
||||
}
|
||||
|
||||
if len(response.Actions) == 0 {
|
||||
return nil, errors.New("no actions returned from VLM")
|
||||
}
|
||||
|
||||
// normalize actions
|
||||
var normalizedActions []Action
|
||||
for i := range response.Actions {
|
||||
// create a new variable, avoid implicit memory aliasing in for loop.
|
||||
action := response.Actions[i]
|
||||
if err := normalizeAction(&action); err != nil {
|
||||
return nil, errors.Wrap(err, "failed to normalize action")
|
||||
}
|
||||
normalizedActions = append(normalizedActions, action)
|
||||
}
|
||||
|
||||
return &PlanningResult{
|
||||
Actions: normalizedActions,
|
||||
ActionSummary: response.ActionSummary,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// normalizeAction normalizes the coordinates in the action
|
||||
func normalizeAction(action *Action) error {
|
||||
switch action.ActionType {
|
||||
case "click", "drag":
|
||||
// handle click and drag action coordinates
|
||||
if startBox, ok := action.ActionInputs["startBox"].(string); ok {
|
||||
normalized, err := normalizeCoordinates(startBox)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to normalize startBox: %w", err)
|
||||
}
|
||||
action.ActionInputs["startBox"] = normalized
|
||||
}
|
||||
|
||||
if endBox, ok := action.ActionInputs["endBox"].(string); ok {
|
||||
normalized, err := normalizeCoordinates(endBox)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to normalize endBox: %w", err)
|
||||
}
|
||||
action.ActionInputs["endBox"] = normalized
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// bboxCoordPattern matches a bounding box written as
// "<bbox>x1 y1 x2 y2</bbox>" with unsigned integer values.
var bboxCoordPattern = regexp.MustCompile(`<bbox>(\d+\s+\d+\s+\d+\s+\d+)</bbox>`)

// normalizeCoordinates parses a coordinate string into numeric values.
// The values are returned as written; no scaling is applied.
//
// Two formats are accepted:
//   - "<bbox>x1 y1 x2 y2</bbox>": the center point [x, y] of the box is returned.
//   - a comma-separated list, optionally wrapped in brackets or parentheses,
//     e.g. "[100, 200]" or "(100, 200)": the listed values are returned.
//
// An error is returned for an empty string, for values that are not numbers
// and for any other format.
func normalizeCoordinates(coordStr string) (coords []float64, err error) {
	if coordStr == "" {
		return nil, fmt.Errorf("empty coordinate string")
	}

	if m := bboxCoordPattern.FindStringSubmatch(coordStr); m != nil {
		// The pattern guarantees exactly four whitespace-separated fields.
		var box [4]float64
		for i, field := range strings.Fields(m[1]) {
			val, perr := strconv.ParseFloat(field, 64)
			if perr != nil {
				return nil, fmt.Errorf("failed to parse coordinate value '%s': %w", field, perr)
			}
			box[i] = val
		}
		// Reduce the box to its center point.
		return []float64{(box[0] + box[2]) / 2, (box[1] + box[3]) / 2}, nil
	}

	// handle coordinate string, e.g. "[100, 200]", "(100, 200)"
	if strings.Contains(coordStr, ",") {
		// Trim removes every enclosing bracket, so the list is always re-wrapped
		// in a single pair before it is decoded as a JSON array.
		inner := strings.Trim(coordStr, "[]() \t")
		if err = json.Unmarshal([]byte("["+inner+"]"), &coords); err != nil {
			return nil, fmt.Errorf("failed to parse coordinate string: %w", err)
		}
		return coords, nil
	}

	return nil, fmt.Errorf("invalid coordinate string format: %s", coordStr)
}
|
||||
33
uixt/ai/parser_test.go
Normal file
33
uixt/ai/parser_test.go
Normal file
@@ -0,0 +1,33 @@
|
||||
package ai
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/httprunner/httprunner/v5/uixt/types"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestParseAction(t *testing.T) {
|
||||
actionStr := "click(point='<point>200 300</point>')"
|
||||
result, err := ParseAction(actionStr)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
assert.Equal(t, result.Function, "click")
|
||||
assert.Equal(t, result.Args["point"], "<point>200 300</point>")
|
||||
}
|
||||
|
||||
func TestParseActionToStructureOutput(t *testing.T) {
|
||||
text := "Thought: test\nAction: click(point='<point>200 300</point>')"
|
||||
parser := &UITARSContentParser{}
|
||||
result, err := parser.Parse(text, types.Size{Height: 224, Width: 224})
|
||||
assert.Nil(t, err)
|
||||
assert.Equal(t, result.Actions[0].ActionType, "click")
|
||||
assert.Contains(t, result.Actions[0].ActionInputs, "start_box")
|
||||
|
||||
text = "Thought: 我看到页面上有几个帖子,第二个帖子的标题是\"字节四年,头发白了\"。要完成任务,我需要点击这个帖子下方的作者头像,这样就能进入作者的个人主页了。\nAction: click(start_point='<point>550 450 550 450</point>')"
|
||||
result, err = parser.Parse(text, types.Size{Height: 2341, Width: 1024})
|
||||
assert.Nil(t, err)
|
||||
assert.Equal(t, result.Actions[0].ActionType, "click")
|
||||
assert.Contains(t, result.Actions[0].ActionInputs, "start_box")
|
||||
}
|
||||
207
uixt/ai/parser_ui_tars.go
Normal file
207
uixt/ai/parser_ui_tars.go
Normal file
@@ -0,0 +1,207 @@
|
||||
package ai
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/httprunner/httprunner/v5/uixt/types"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
// reference:
|
||||
// https://github.com/bytedance/UI-TARS/blob/main/codes/ui_tars/action_parser.py
|
||||
|
||||
const (
	// DefaultFactor is the divisor applied to model coordinates before they
	// are multiplied by the image width or height.
	DefaultFactor = 1000
)
|
||||
|
||||
// UITARSContentParser parses the Thought/Action format response.
type UITARSContentParser struct {
	// systemPrompt is the prompt returned by SystemPrompt.
	systemPrompt string
}

// SystemPrompt returns the system prompt the parser was created with.
func (p *UITARSContentParser) SystemPrompt() string {
	return p.systemPrompt
}
|
||||
|
||||
// ParseActionToStructureOutput parses the model output text into structured actions.
|
||||
func (p *UITARSContentParser) Parse(content string, size types.Size) (*PlanningResult, error) {
|
||||
text := strings.TrimSpace(content)
|
||||
if strings.Contains(text, "<point>") {
|
||||
text = convertPointToCoordinates(text)
|
||||
}
|
||||
text = strings.ReplaceAll(text, "start_point=", "start_box=")
|
||||
text = strings.ReplaceAll(text, "end_point=", "end_box=")
|
||||
text = strings.ReplaceAll(text, "point=", "start_box=")
|
||||
|
||||
// Extract context (thought/reflection)
|
||||
var thought, reflection string
|
||||
actionIdx := strings.Index(text, "Action:")
|
||||
prefix := ""
|
||||
if actionIdx != -1 {
|
||||
prefix = text[:actionIdx]
|
||||
}
|
||||
if strings.HasPrefix(prefix, "Thought:") {
|
||||
thought = strings.TrimSpace(strings.TrimPrefix(prefix, "Thought:"))
|
||||
} else if strings.HasPrefix(prefix, "Reflection:") {
|
||||
refIdx := strings.Index(prefix, "Action_Summary:")
|
||||
if refIdx != -1 {
|
||||
reflection = strings.TrimSpace(strings.TrimPrefix(prefix[:refIdx], "Reflection:"))
|
||||
thought = strings.TrimSpace(strings.TrimPrefix(prefix[refIdx:], "Action_Summary:"))
|
||||
}
|
||||
} else if strings.HasPrefix(prefix, "Action_Summary:") {
|
||||
thought = strings.TrimSpace(strings.TrimPrefix(prefix, "Action_Summary:"))
|
||||
}
|
||||
if !strings.Contains(text, "Action:") {
|
||||
return nil, fmt.Errorf("no Action: found")
|
||||
}
|
||||
actionStr := strings.SplitN(text, "Action: ", 2)[1]
|
||||
|
||||
rawActions := strings.Split(actionStr, ")\n\n")
|
||||
normalizedActions := make([]string, 0, len(rawActions))
|
||||
for _, act := range rawActions {
|
||||
actionStr := act
|
||||
if strings.Contains(actionStr, "type(content") {
|
||||
if !strings.HasSuffix(strings.TrimSpace(actionStr), ")") {
|
||||
actionStr = strings.TrimSpace(actionStr) + ")"
|
||||
}
|
||||
pattern := regexp.MustCompile(`type\(content='(.*?)'\)`)
|
||||
m := pattern.FindStringSubmatch(actionStr)
|
||||
if len(m) > 1 {
|
||||
content := m[1]
|
||||
actionStr = "type(content='" + escapeSingleQuotes(content) + "')"
|
||||
} else {
|
||||
return nil, fmt.Errorf("pattern not found in the input string")
|
||||
}
|
||||
}
|
||||
if !strings.HasSuffix(strings.TrimSpace(actionStr), ")") {
|
||||
actionStr = strings.TrimSpace(actionStr) + ")"
|
||||
}
|
||||
normalizedActions = append(normalizedActions, actionStr)
|
||||
}
|
||||
|
||||
actions := make([]Action, 0, len(normalizedActions))
|
||||
for _, action := range normalizedActions {
|
||||
parsed, err := ParseAction(strings.ReplaceAll(action, "\n", "\\n"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Action can't parse: %s", action)
|
||||
}
|
||||
actionType := parsed.Function
|
||||
params := parsed.Args
|
||||
actionInputs := make(map[string]any)
|
||||
imageWidth := size.Width
|
||||
imageHeight := size.Height
|
||||
for paramName, param := range params {
|
||||
if param == "" {
|
||||
continue
|
||||
}
|
||||
param = strings.TrimLeft(param, " ")
|
||||
actionInputs[paramName] = param
|
||||
if strings.Contains(paramName, "start_box") || strings.Contains(paramName, "end_box") {
|
||||
oriBox := param
|
||||
parameters := strings.Split(strings.ReplaceAll(strings.ReplaceAll(oriBox, "(", ""), ")", ""), ",")
|
||||
floatNumbers := make([]float64, 0, len(parameters))
|
||||
for _, numStr := range parameters {
|
||||
num, err := strconv.ParseFloat(strings.TrimSpace(numStr), 64)
|
||||
if err != nil {
|
||||
log.Error().Interface("parameters", parameters).Msg("invalid float action parameters")
|
||||
return nil, fmt.Errorf("invalid action parameters")
|
||||
}
|
||||
floatNumbers = append(floatNumbers, num)
|
||||
}
|
||||
// The model generates a 2D coordinate output that represents relative positions.
|
||||
// To convert these values to image-relative coordinates, divide each component by 1000 to obtain values in the range [0,1].
|
||||
// The absolute coordinates required by the Action can be calculated by:
|
||||
// - X absolute = X relative × image width / 1000
|
||||
// - Y absolute = Y relative × image height / 1000
|
||||
if len(floatNumbers) == 2 {
|
||||
floatNumbers[0] = math.Round((floatNumbers[0]/DefaultFactor*float64(imageWidth))*10) / 10
|
||||
floatNumbers[1] = math.Round((floatNumbers[1]/DefaultFactor*float64(imageHeight))*10) / 10
|
||||
} else if len(floatNumbers) == 4 {
|
||||
floatNumbers[0] = math.Round((floatNumbers[0]/DefaultFactor*float64(imageWidth))*10) / 10
|
||||
floatNumbers[1] = math.Round((floatNumbers[1]/DefaultFactor*float64(imageHeight))*10) / 10
|
||||
floatNumbers[2] = math.Round((floatNumbers[2]/DefaultFactor*float64(imageWidth))*10) / 10
|
||||
floatNumbers[3] = math.Round((floatNumbers[3]/DefaultFactor*float64(imageHeight))*10) / 10
|
||||
} else {
|
||||
log.Error().Interface("parameters", floatNumbers).Msg("invalid float action parameters")
|
||||
return nil, fmt.Errorf("invalid action parameters")
|
||||
}
|
||||
actionInputs[paramName] = floatNumbers
|
||||
}
|
||||
}
|
||||
actions = append(actions, Action{
|
||||
Reflection: reflection,
|
||||
Thought: thought,
|
||||
ActionType: actionType,
|
||||
ActionInputs: actionInputs,
|
||||
Text: text,
|
||||
})
|
||||
}
|
||||
return &PlanningResult{
|
||||
Actions: actions,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Action represents a parsed action with its context.
// The field notes below describe how UITARSContentParser.Parse fills the struct.
type Action struct {
	// Reflection is the text of the "Reflection:" section, if the response has one.
	Reflection string `json:"reflection"`
	// Thought is the text of the "Thought:" or "Action_Summary:" section.
	Thought string `json:"thought"`
	// ActionType is the name of the called function, e.g. "click".
	ActionType string `json:"action_type"`
	// ActionInputs maps argument names to values: scaled []float64 for
	// start_box / end_box arguments, the argument string otherwise.
	ActionInputs map[string]any `json:"action_inputs"`
	// Text is the response text the action was parsed from, after the
	// point-to-box renaming.
	Text string `json:"text"`
}
|
||||
|
||||
// ParsedActionArgs represents the result of parsing an action string.
type ParsedActionArgs struct {
	// Function is the name in front of the opening parenthesis.
	Function string
	// Args maps each name='value' argument name to its value.
	Args map[string]string
}
|
||||
|
||||
// pointTagPattern matches "<point>x y</point>" and "<point>x1 y1 x2 y2</point>".
// Each value is an unsigned integer or decimal number.
var pointTagPattern = regexp.MustCompile(
	`<point>\d+(?:\.\d+)?\s+\d+(?:\.\d+)?(?:\s+\d+(?:\.\d+)?\s+\d+(?:\.\d+)?)?</point>`)

// convertPointToCoordinates replaces every <point>x y</point> tag in text with
// "(x,y)" and every <point>x1 y1 x2 y2</point> tag with "(x1,y1,x2,y2)".
// Tags holding any other number of values, and all other text, are returned
// unchanged.
func convertPointToCoordinates(text string) string {
	return pointTagPattern.ReplaceAllStringFunc(text, func(tag string) string {
		// The pattern guarantees that the tag body is two or four numbers
		// separated by whitespace, so splitting and re-joining is sufficient.
		body := strings.TrimSuffix(strings.TrimPrefix(tag, "<point>"), "</point>")
		return "(" + strings.Join(strings.Fields(body), ",") + ")"
	})
}
|
||||
|
||||
// escapeSingleQuotes returns text with a backslash inserted in front of every
// single quote that is not already escaped.
//
// A quote counts as escaped when it is preceded by an odd number of
// backslashes. An even run (for example `\\'`) is made of escaped backslashes,
// so the quote after it is unescaped and gets a backslash of its own.
func escapeSingleQuotes(text string) string {
	if !strings.Contains(text, "'") {
		return text // nothing to escape, avoid the copy
	}
	var b strings.Builder
	b.Grow(len(text) + 4)
	backslashes := 0 // length of the backslash run directly before text[i]
	for i := 0; i < len(text); i++ {
		c := text[i]
		if c == '\'' && backslashes%2 == 0 {
			b.WriteByte('\\')
		}
		b.WriteByte(c)
		if c == '\\' {
			backslashes++
		} else {
			backslashes = 0
		}
	}
	return b.String()
}
|
||||
|
||||
// ParseAction parses an action string into function name and arguments.
|
||||
func ParseAction(actionStr string) (*ParsedActionArgs, error) {
|
||||
re := regexp.MustCompile(`^(\w+)\((.*)\)$`)
|
||||
matches := re.FindStringSubmatch(actionStr)
|
||||
if len(matches) < 3 {
|
||||
return nil, fmt.Errorf("not a function call")
|
||||
}
|
||||
funcName := matches[1]
|
||||
argsStr := matches[2]
|
||||
args := make(map[string]string)
|
||||
argRe := regexp.MustCompile(`(\w+)\s*=\s*'([^']*)'`)
|
||||
for _, m := range argRe.FindAllStringSubmatch(argsStr, -1) {
|
||||
args[m[1]] = m[2]
|
||||
}
|
||||
return &ParsedActionArgs{Function: funcName, Args: args}, nil
|
||||
}
|
||||
@@ -28,7 +28,7 @@ type PlanningOptions struct {
|
||||
// PlanningResult represents the result of planning
|
||||
type PlanningResult struct {
|
||||
ToolCalls []schema.ToolCall `json:"tool_calls"` // TODO: merge to NextActions
|
||||
NextActions []ParsedAction `json:"actions"`
|
||||
Actions []Action `json:"actions"`
|
||||
ActionSummary string `json:"summary"`
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
@@ -138,7 +138,7 @@ func (p *Planner) Call(ctx context.Context, opts *PlanningOptions) (*PlanningRes
|
||||
|
||||
log.Info().
|
||||
Interface("summary", result.ActionSummary).
|
||||
Interface("actions", result.NextActions).
|
||||
Interface("actions", result.Actions).
|
||||
Msg("get VLM planning result")
|
||||
return result, nil
|
||||
}
|
||||
|
||||
@@ -1,431 +0,0 @@
|
||||
package ai
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/httprunner/httprunner/v5/internal/json"
|
||||
"github.com/httprunner/httprunner/v5/uixt/option"
|
||||
"github.com/httprunner/httprunner/v5/uixt/types"
|
||||
"github.com/pkg/errors"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
// LLMContentParser converts the text content of an LLM response into a
// PlanningResult.
// Each implementation belongs to one model type and carries the system prompt
// that goes with the output format it parses.
type LLMContentParser interface {
	// SystemPrompt returns the system prompt stored in the parser.
	SystemPrompt() string
	// Parse converts the response content into a PlanningResult. size is
	// supplied by the caller; the UI-TARS parser uses it to scale coordinates.
	Parse(content string, size types.Size) (*PlanningResult, error)
}
|
||||
|
||||
func NewLLMContentParser(modelType option.LLMServiceType) LLMContentParser {
|
||||
switch modelType {
|
||||
case option.LLMServiceTypeUITARS:
|
||||
return &UITARSContentParser{
|
||||
systemPrompt: uiTarsPlanningPrompt,
|
||||
}
|
||||
default:
|
||||
return &JSONContentParser{
|
||||
systemPrompt: defaultPlanningResponseJsonFormat,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ParsedAction represents a parsed action from the VLM response.
type ParsedAction struct {
	// ActionType names the action, e.g. ActionTypeClick.
	ActionType ActionType `json:"actionType"`
	// ActionInputs holds the action's arguments keyed by name, such as
	// "startBox", "endBox", "content" or "direction".
	ActionInputs map[string]interface{} `json:"actionInputs"`
	// Thought is the model's reasoning text that accompanied the action.
	Thought string `json:"thought"`
}
|
||||
|
||||
// ActionType is the name of an action as it appears in the model output.
type ActionType string

// Action type names used by the parsers in this file.
const (
	ActionTypeClick    ActionType = "click"
	ActionTypeTap      ActionType = "tap"
	ActionTypeDrag     ActionType = "drag"
	ActionTypeSwipe    ActionType = "swipe"
	ActionTypeWait     ActionType = "wait"
	ActionTypeFinished ActionType = "finished"
	ActionTypeCallUser ActionType = "call_user"
	ActionTypeType     ActionType = "type"
	ActionTypeScroll   ActionType = "scroll"
)
|
||||
|
||||
// UITARSContentParser parses the Thought/Action format response.
type UITARSContentParser struct {
	// systemPrompt is the prompt returned by SystemPrompt.
	systemPrompt string
}

// SystemPrompt returns the system prompt the parser was created with.
func (p *UITARSContentParser) SystemPrompt() string {
	return p.systemPrompt
}
|
||||
|
||||
func (p *UITARSContentParser) Parse(content string, size types.Size) (*PlanningResult, error) {
|
||||
thoughtRegex := regexp.MustCompile(`(?is)Thought:(.+?)Action:`)
|
||||
actionRegex := regexp.MustCompile(`(?is)Action:(.+)`)
|
||||
|
||||
// extract Thought part
|
||||
thoughtMatch := thoughtRegex.FindStringSubmatch(content)
|
||||
var thought string
|
||||
if len(thoughtMatch) > 1 {
|
||||
thought = strings.TrimSpace(thoughtMatch[1])
|
||||
}
|
||||
|
||||
// extract Action part, e.g. "click(start_box='(552,454)')"
|
||||
actionMatch := actionRegex.FindStringSubmatch(content)
|
||||
if len(actionMatch) < 2 {
|
||||
return nil, errors.New("no action found in the response")
|
||||
}
|
||||
|
||||
actionsText := strings.TrimSpace(actionMatch[1])
|
||||
|
||||
// parse action type and parameters
|
||||
parseActions, err := parseActionText(actionsText, thought)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// process response
|
||||
result, err := processVLMResponse(parseActions, size)
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "process VLM response failed")
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// parseActionText parses the action text to extract the action type and parameters
|
||||
func parseActionText(actionsText, thought string) ([]ParsedAction, error) {
|
||||
// remove trailing comments
|
||||
if idx := strings.Index(actionsText, "#"); idx > 0 {
|
||||
actionsText = strings.TrimSpace(actionsText[:idx])
|
||||
}
|
||||
|
||||
// supported action types and regexes
|
||||
actionRegexes := map[ActionType]*regexp.Regexp{
|
||||
"click": regexp.MustCompile(`click\(start_box='([^']+)'\)`),
|
||||
"left_double": regexp.MustCompile(`left_double\(start_box='([^']+)'\)`),
|
||||
"right_single": regexp.MustCompile(`right_single\(start_box='([^']+)'\)`),
|
||||
"drag": regexp.MustCompile(`drag\(start_box='([^']+)', end_box='([^']+)'\)`),
|
||||
"type": regexp.MustCompile(`type\(content='([^']+)'\)`),
|
||||
"scroll": regexp.MustCompile(`scroll\(start_box='([^']+)', direction='([^']+)'\)`),
|
||||
"wait": regexp.MustCompile(`wait\(\)`),
|
||||
"finished": regexp.MustCompile(`finished\(content='([^']+)'\)`),
|
||||
"call_user": regexp.MustCompile(`call_user\(\)`),
|
||||
}
|
||||
|
||||
// one or multiple actions, separated by newline
|
||||
// "click(start_box='<bbox>229 379 229 379</bbox>')
|
||||
// "click(start_box='<bbox>229 379 229 379</bbox>')\n\nclick(start_box='<bbox>769 519 769 519</bbox>')"
|
||||
parsedActions := make([]ParsedAction, 0)
|
||||
for _, actionText := range strings.Split(actionsText, "\n") {
|
||||
actionText = strings.TrimSpace(actionText)
|
||||
for actionType, regex := range actionRegexes {
|
||||
matches := regex.FindStringSubmatch(actionText)
|
||||
if len(matches) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
var action ParsedAction
|
||||
action.ActionType = actionType
|
||||
action.ActionInputs = make(map[string]interface{})
|
||||
action.Thought = thought
|
||||
|
||||
// parse parameters based on action type
|
||||
switch actionType {
|
||||
case ActionTypeClick:
|
||||
if len(matches) > 1 {
|
||||
coord, err := normalizeCoordinates(matches[1])
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "normalize point failed: %s", matches[1])
|
||||
}
|
||||
action.ActionInputs["startBox"] = coord
|
||||
}
|
||||
case ActionTypeDrag:
|
||||
if len(matches) > 2 {
|
||||
// handle start point
|
||||
startBox, err := normalizeCoordinates(matches[1])
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "normalize startBox failed: %s", matches[1])
|
||||
}
|
||||
action.ActionInputs["startBox"] = startBox
|
||||
|
||||
// handle end point
|
||||
endBox, err := normalizeCoordinates(matches[2])
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "normalize endBox failed: %s", matches[2])
|
||||
}
|
||||
action.ActionInputs["endBox"] = endBox
|
||||
}
|
||||
case ActionTypeType:
|
||||
if len(matches) > 1 {
|
||||
action.ActionInputs["content"] = matches[1]
|
||||
}
|
||||
case ActionTypeScroll:
|
||||
if len(matches) > 2 {
|
||||
startBox, err := normalizeCoordinates(matches[1])
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "normalize startBox failed: %s", matches[1])
|
||||
}
|
||||
action.ActionInputs["startBox"] = startBox
|
||||
action.ActionInputs["direction"] = matches[2]
|
||||
}
|
||||
case ActionTypeWait, ActionTypeFinished, ActionTypeCallUser:
|
||||
// 这些动作没有额外参数
|
||||
}
|
||||
|
||||
parsedActions = append(parsedActions, action)
|
||||
}
|
||||
}
|
||||
|
||||
if len(parsedActions) == 0 {
|
||||
return nil, fmt.Errorf("no valid actions returned from VLM")
|
||||
}
|
||||
return parsedActions, nil
|
||||
}
|
||||
|
||||
// bboxTagPattern matches a bounding box written as
// "<bbox>x1 y1 x2 y2</bbox>" with unsigned integer values.
var bboxTagPattern = regexp.MustCompile(`<bbox>(\d+\s+\d+\s+\d+\s+\d+)</bbox>`)

// normalizeCoordinates parses a coordinate string into numeric values.
// The values are returned as written; no scaling is applied.
//
// Two formats are accepted:
//   - "<bbox>x1 y1 x2 y2</bbox>": the center point [x, y] of the box is returned.
//   - a comma-separated list, optionally wrapped in brackets or parentheses,
//     e.g. "[100, 200]" or "(100, 200)": the listed values are returned.
//
// An error is returned for an empty string, for values that are not numbers
// and for any other format.
func normalizeCoordinates(coordStr string) (coords []float64, err error) {
	if coordStr == "" {
		return nil, fmt.Errorf("empty coordinate string")
	}

	if m := bboxTagPattern.FindStringSubmatch(coordStr); m != nil {
		// The pattern guarantees exactly four whitespace-separated fields.
		var box [4]float64
		for i, field := range strings.Fields(m[1]) {
			val, perr := strconv.ParseFloat(field, 64)
			if perr != nil {
				return nil, fmt.Errorf("failed to parse coordinate value '%s': %w", field, perr)
			}
			box[i] = val
		}
		// Reduce the box to its center point.
		return []float64{(box[0] + box[2]) / 2, (box[1] + box[3]) / 2}, nil
	}

	// handle coordinate string, e.g. "[100, 200]", "(100, 200)"
	if strings.Contains(coordStr, ",") {
		// Trim removes every enclosing bracket, so the list is always re-wrapped
		// in a single pair before it is decoded as a JSON array.
		inner := strings.Trim(coordStr, "[]() \t")
		if err = json.Unmarshal([]byte("["+inner+"]"), &coords); err != nil {
			return nil, fmt.Errorf("failed to parse coordinate string: %w", err)
		}
		return coords, nil
	}

	return nil, fmt.Errorf("invalid coordinate string format: %s", coordStr)
}
|
||||
|
||||
// processVLMResponse processes the VLM response and converts it to PlanningResult
|
||||
func processVLMResponse(actions []ParsedAction, size types.Size) (*PlanningResult, error) {
|
||||
log.Info().Msg("processing VLM response...")
|
||||
|
||||
if len(actions) == 0 {
|
||||
return nil, fmt.Errorf("no actions returned from VLM")
|
||||
}
|
||||
|
||||
// validate and post-process each action
|
||||
for i := range actions {
|
||||
// validate action type
|
||||
switch actions[i].ActionType {
|
||||
case "click":
|
||||
if err := convertCoordinateAction(&actions[i], "startBox", size); err != nil {
|
||||
return nil, errors.Wrap(err, "convert coordinate action failed")
|
||||
}
|
||||
case "drag":
|
||||
if err := convertCoordinateAction(&actions[i], "startBox", size); err != nil {
|
||||
return nil, errors.Wrap(err, "convert coordinate action failed")
|
||||
}
|
||||
if err := convertCoordinateAction(&actions[i], "endBox", size); err != nil {
|
||||
return nil, errors.Wrap(err, "convert coordinate action failed")
|
||||
}
|
||||
case "type":
|
||||
validateTypeContent(&actions[i])
|
||||
case "wait", "finished", "call_user":
|
||||
// these actions do not need extra parameters
|
||||
default:
|
||||
log.Printf("warning: unknown action type: %s, will try to continue processing", actions[i].ActionType)
|
||||
}
|
||||
}
|
||||
|
||||
// extract action summary
|
||||
actionSummary := extractActionSummary(actions)
|
||||
|
||||
return &PlanningResult{
|
||||
NextActions: actions,
|
||||
ActionSummary: actionSummary,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// extractActionSummary extracts the summary from the actions
|
||||
func extractActionSummary(actions []ParsedAction) string {
|
||||
if len(actions) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
// use the Thought of the first action as summary
|
||||
if actions[0].Thought != "" {
|
||||
return actions[0].Thought
|
||||
}
|
||||
|
||||
// if no Thought, generate summary from action type
|
||||
action := actions[0]
|
||||
switch action.ActionType {
|
||||
case "click":
|
||||
return "点击操作"
|
||||
case "drag":
|
||||
return "拖拽操作"
|
||||
case "type":
|
||||
content, _ := action.ActionInputs["content"].(string)
|
||||
if len(content) > 20 {
|
||||
content = content[:20] + "..."
|
||||
}
|
||||
return fmt.Sprintf("输入文本: %s", content)
|
||||
case "wait":
|
||||
return "等待操作"
|
||||
case "finished":
|
||||
return "完成操作"
|
||||
case "call_user":
|
||||
return "请求用户协助"
|
||||
default:
|
||||
return fmt.Sprintf("执行 %s 操作", action.ActionType)
|
||||
}
|
||||
}
|
||||
|
||||
func convertCoordinateAction(action *ParsedAction, boxField string, size types.Size) error {
|
||||
// The model generates a 2D coordinate output that represents relative positions.
|
||||
// To convert these values to image-relative coordinates, divide each component by 1000 to obtain values in the range [0,1].
|
||||
// The absolute coordinates required by the Action can be calculated by:
|
||||
// - X absolute = X relative × image width / 1000
|
||||
// - Y absolute = Y relative × image height / 1000
|
||||
|
||||
// get image width and height
|
||||
imageWidth := size.Width
|
||||
imageHeight := size.Height
|
||||
|
||||
box := action.ActionInputs[boxField]
|
||||
coords, ok := box.([]float64)
|
||||
if !ok {
|
||||
log.Error().Interface("inputs", action.ActionInputs).Msg("invalid action inputs")
|
||||
return fmt.Errorf("invalid action inputs")
|
||||
}
|
||||
|
||||
if len(coords) == 2 {
|
||||
coords[0] = math.Round((coords[0]/1000*float64(imageWidth))*10) / 10
|
||||
coords[1] = math.Round((coords[1]/1000*float64(imageHeight))*10) / 10
|
||||
} else if len(coords) == 4 {
|
||||
coords[0] = math.Round((coords[0]/1000*float64(imageWidth))*10) / 10
|
||||
coords[1] = math.Round((coords[1]/1000*float64(imageHeight))*10) / 10
|
||||
coords[2] = math.Round((coords[2]/1000*float64(imageWidth))*10) / 10
|
||||
coords[3] = math.Round((coords[3]/1000*float64(imageHeight))*10) / 10
|
||||
} else {
|
||||
log.Error().Interface("inputs", action.ActionInputs).Msg("invalid action inputs")
|
||||
return fmt.Errorf("invalid action inputs")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// validateTypeContent 验证输入文本内容
|
||||
func validateTypeContent(action *ParsedAction) {
|
||||
if content, ok := action.ActionInputs["content"]; !ok || content == "" {
|
||||
// default to empty string
|
||||
action.ActionInputs["content"] = ""
|
||||
log.Warn().Msg("type action missing content parameter, set to default")
|
||||
}
|
||||
}
|
||||
|
||||
// JSONContentParser parses the response as JSON string format.
type JSONContentParser struct {
	// systemPrompt is the prompt returned by SystemPrompt.
	systemPrompt string
}

// SystemPrompt returns the system prompt the parser was created with.
func (p *JSONContentParser) SystemPrompt() string {
	return p.systemPrompt
}
|
||||
|
||||
func (p *JSONContentParser) Parse(content string, size types.Size) (*PlanningResult, error) {
|
||||
content = strings.TrimSpace(content)
|
||||
if strings.HasPrefix(content, "```json") && strings.HasSuffix(content, "```") {
|
||||
content = strings.TrimPrefix(content, "```json")
|
||||
content = strings.TrimSuffix(content, "```")
|
||||
}
|
||||
content = strings.TrimSpace(content)
|
||||
|
||||
var response PlanningResult
|
||||
if err := json.Unmarshal([]byte(content), &response); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse VLM response: %v", err)
|
||||
}
|
||||
|
||||
if response.Error != "" {
|
||||
return nil, errors.New(response.Error)
|
||||
}
|
||||
|
||||
if len(response.NextActions) == 0 {
|
||||
return nil, errors.New("no actions returned from VLM")
|
||||
}
|
||||
|
||||
// normalize actions
|
||||
var normalizedActions []ParsedAction
|
||||
for i := range response.NextActions {
|
||||
// create a new variable, avoid implicit memory aliasing in for loop.
|
||||
action := response.NextActions[i]
|
||||
if err := normalizeAction(&action); err != nil {
|
||||
return nil, errors.Wrap(err, "failed to normalize action")
|
||||
}
|
||||
normalizedActions = append(normalizedActions, action)
|
||||
}
|
||||
|
||||
return &PlanningResult{
|
||||
NextActions: normalizedActions,
|
||||
ActionSummary: response.ActionSummary,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// normalizeAction normalizes the coordinates in the action
|
||||
func normalizeAction(action *ParsedAction) error {
|
||||
switch action.ActionType {
|
||||
case "click", "drag":
|
||||
// handle click and drag action coordinates
|
||||
if startBox, ok := action.ActionInputs["startBox"].(string); ok {
|
||||
normalized, err := normalizeCoordinates(startBox)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to normalize startBox: %w", err)
|
||||
}
|
||||
action.ActionInputs["startBox"] = normalized
|
||||
}
|
||||
|
||||
if endBox, ok := action.ActionInputs["endBox"].(string); ok {
|
||||
normalized, err := normalizeCoordinates(endBox)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to normalize endBox: %w", err)
|
||||
}
|
||||
action.ActionInputs["endBox"] = normalized
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
@@ -12,14 +12,14 @@ Action: ...
|
||||
` + "```" + `
|
||||
|
||||
## Action Space
|
||||
click(start_box='[x1, y1, x2, y2]')
|
||||
left_double(start_box='[x1, y1, x2, y2]')
|
||||
right_single(start_box='[x1, y1, x2, y2]')
|
||||
drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
|
||||
hotkey(key='')
|
||||
type(content='') #If you want to submit your input, use "\n" at the end of ` + "`content`" + `.
|
||||
scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
|
||||
wait() #Sleep for 5s and take a screenshot to check for any changes.
|
||||
click(point='<point>x1 y1</point>')
|
||||
long_press(point='<point>x1 y1</point>')
|
||||
type(content='') #If you want to submit your input, use "\\n" at the end of ` + "`content`" + `.
|
||||
scroll(point='<point>x1 y1</point>', direction='down or up or right or left')
|
||||
open_app(app_name=\'\')
|
||||
drag(start_point='<point>x1 y1</point>', end_point='<point>x2 y2</point>')
|
||||
press_home()
|
||||
press_back()
|
||||
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
|
||||
|
||||
## Note
|
||||
@@ -30,11 +30,4 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par
|
||||
`
|
||||
|
||||
// system prompt for JSONContentParser
|
||||
const defaultPlanningResponseJsonFormat = `You are a versatile professional in software UI automation.
|
||||
|
||||
## Output Format
|
||||
` + "```" + `
|
||||
Thought: ...
|
||||
Action: ...
|
||||
` + "```" + `
|
||||
`
|
||||
const defaultPlanningResponseJsonFormat = `You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.`
|
||||
|
||||
@@ -8,7 +8,6 @@ import (
|
||||
"github.com/httprunner/httprunner/v5/code"
|
||||
"github.com/httprunner/httprunner/v5/internal/builtin"
|
||||
"github.com/httprunner/httprunner/v5/uixt/option"
|
||||
"github.com/httprunner/httprunner/v5/uixt/types"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
@@ -58,43 +57,12 @@ func TestVLMPlanning(t *testing.T) {
|
||||
// 验证结果
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, result)
|
||||
require.NotEmpty(t, result.NextActions)
|
||||
require.NotEmpty(t, result.Actions)
|
||||
|
||||
// 验证动作
|
||||
action := result.NextActions[0]
|
||||
action := result.Actions[0]
|
||||
assert.NotEmpty(t, action.ActionType)
|
||||
assert.NotEmpty(t, action.Thought)
|
||||
|
||||
// 根据动作类型验证参数
|
||||
switch action.ActionType {
|
||||
case "click", "drag", "left_double", "right_single", "scroll":
|
||||
// 这些动作需要验证坐标
|
||||
assert.NotEmpty(t, action.ActionInputs["startBox"])
|
||||
|
||||
// 验证坐标格式
|
||||
coords, ok := action.ActionInputs["startBox"].([]float64)
|
||||
require.True(t, ok)
|
||||
require.True(t, len(coords) >= 2) // 至少有 x, y 坐标
|
||||
|
||||
// 验证坐标范围
|
||||
for _, coord := range coords {
|
||||
assert.GreaterOrEqual(t, coord, float64(0))
|
||||
}
|
||||
|
||||
case "type":
|
||||
// 验证文本内容
|
||||
assert.NotEmpty(t, action.ActionInputs["content"])
|
||||
|
||||
case "hotkey":
|
||||
// 验证按键
|
||||
assert.NotEmpty(t, action.ActionInputs["key"])
|
||||
|
||||
case "wait", "finished", "call_user":
|
||||
// 这些动作不需要额外参数
|
||||
|
||||
default:
|
||||
t.Fatalf("未知的动作类型: %s", action.ActionType)
|
||||
}
|
||||
}
|
||||
|
||||
func TestXHSPlanning(t *testing.T) {
|
||||
@@ -131,43 +99,12 @@ func TestXHSPlanning(t *testing.T) {
|
||||
// 验证结果
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, result)
|
||||
require.NotEmpty(t, result.NextActions)
|
||||
require.NotEmpty(t, result.Actions)
|
||||
|
||||
// 验证动作
|
||||
action := result.NextActions[0]
|
||||
action := result.Actions[0]
|
||||
assert.NotEmpty(t, action.ActionType)
|
||||
assert.NotEmpty(t, action.Thought)
|
||||
|
||||
// 根据动作类型验证参数
|
||||
switch action.ActionType {
|
||||
case "click", "drag", "left_double", "right_single", "scroll":
|
||||
// 这些动作需要验证坐标
|
||||
assert.NotEmpty(t, action.ActionInputs["startBox"])
|
||||
|
||||
// 验证坐标格式
|
||||
coords, ok := action.ActionInputs["startBox"].([]float64)
|
||||
require.True(t, ok)
|
||||
require.True(t, len(coords) >= 2) // 至少有 x, y 坐标
|
||||
|
||||
// 验证坐标范围
|
||||
for _, coord := range coords {
|
||||
assert.GreaterOrEqual(t, coord, float64(0))
|
||||
}
|
||||
|
||||
case "type":
|
||||
// 验证文本内容
|
||||
assert.NotEmpty(t, action.ActionInputs["content"])
|
||||
|
||||
case "hotkey":
|
||||
// 验证按键
|
||||
assert.NotEmpty(t, action.ActionInputs["key"])
|
||||
|
||||
case "wait", "finished", "call_user":
|
||||
// 这些动作不需要额外参数
|
||||
|
||||
default:
|
||||
t.Fatalf("未知的动作类型: %s", action.ActionType)
|
||||
}
|
||||
}
|
||||
|
||||
func TestChatList(t *testing.T) {
|
||||
@@ -218,11 +155,11 @@ func TestHandleSwitch(t *testing.T) {
|
||||
|
||||
testCases := []struct {
|
||||
imageFile string
|
||||
actionType ActionType
|
||||
actionType string
|
||||
}{
|
||||
{"testdata/deepseek_think_off.png", ActionTypeClick},
|
||||
{"testdata/deepseek_think_on.png", ActionTypeFinished},
|
||||
{"testdata/deepseek_network_on.png", ActionTypeFinished},
|
||||
{"testdata/deepseek_think_off.png", "finished"},
|
||||
{"testdata/deepseek_think_on.png", "finished"},
|
||||
{"testdata/deepseek_network_on.png", "finished"},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
@@ -251,7 +188,7 @@ func TestHandleSwitch(t *testing.T) {
|
||||
// Validate results
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, result)
|
||||
require.Equal(t, result.NextActions[0].ActionType, tc.actionType,
|
||||
require.Equal(t, result.Actions[0].ActionType, tc.actionType,
|
||||
"Unexpected action type for image file: %s", tc.imageFile)
|
||||
}
|
||||
}
|
||||
@@ -336,52 +273,6 @@ func TestValidateInput(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestProcessVLMResponse(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
actions []ParsedAction
|
||||
wantErr bool
|
||||
}{
|
||||
{
|
||||
name: "valid response",
|
||||
actions: []ParsedAction{
|
||||
{
|
||||
ActionType: "click",
|
||||
ActionInputs: map[string]interface{}{
|
||||
"startBox": []float64{0.5, 0.5},
|
||||
},
|
||||
Thought: "点击中心位置",
|
||||
},
|
||||
},
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "empty actions",
|
||||
actions: []ParsedAction{},
|
||||
wantErr: true,
|
||||
},
|
||||
}
|
||||
|
||||
size := types.Size{
|
||||
Width: 1000,
|
||||
Height: 1000,
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result, err := processVLMResponse(tt.actions, size)
|
||||
if tt.wantErr {
|
||||
assert.Error(t, err)
|
||||
assert.Nil(t, result)
|
||||
return
|
||||
}
|
||||
|
||||
assert.NoError(t, err)
|
||||
assert.NotNil(t, result)
|
||||
assert.Equal(t, tt.actions, result.NextActions)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadImage(t *testing.T) {
|
||||
// Test PNG image
|
||||
pngBase64, pngSize, err := builtin.LoadImage("testdata/llk_1.png")
|
||||
|
||||
@@ -40,14 +40,14 @@ func (dExt *XTDriver) AIAction(text string, opts ...option.ActionOption) error {
|
||||
}
|
||||
|
||||
// do actions
|
||||
for _, action := range result.NextActions {
|
||||
for _, action := range result.Actions {
|
||||
switch action.ActionType {
|
||||
case ai.ActionTypeClick:
|
||||
case "click":
|
||||
point := action.ActionInputs["startBox"].([]float64)
|
||||
if err := dExt.TapAbsXY(point[0], point[1], opts...); err != nil {
|
||||
return err
|
||||
}
|
||||
case ai.ActionTypeFinished:
|
||||
case "finished":
|
||||
log.Info().Msg("ai action done")
|
||||
return nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user