mirror of
https://github.com/httprunner/httprunner.git
synced 2026-06-09 17:59:36 +08:00
refactor: replace ui-tars parser with https://github.com/bytedance/UI-TARS/blob/main/codes/ui_tars/action_parser.py
This commit is contained in:
@@ -1 +1 @@
|
|||||||
v5.0.0-beta-2505221822
|
v5.0.0-beta-2505222252
|
||||||
|
|||||||
157
uixt/ai/parser_default.go
Normal file
157
uixt/ai/parser_default.go
Normal file
@@ -0,0 +1,157 @@
|
|||||||
|
package ai
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"regexp"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/httprunner/httprunner/v5/internal/json"
|
||||||
|
"github.com/httprunner/httprunner/v5/uixt/option"
|
||||||
|
"github.com/httprunner/httprunner/v5/uixt/types"
|
||||||
|
"github.com/pkg/errors"
|
||||||
|
)
|
||||||
|
|
||||||
|
// LLMContentParser parses the content from the LLM response
|
||||||
|
// parser is corresponding to the model type and system prompt
|
||||||
|
type LLMContentParser interface {
|
||||||
|
SystemPrompt() string
|
||||||
|
Parse(content string, size types.Size) (*PlanningResult, error)
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewLLMContentParser(modelType option.LLMServiceType) LLMContentParser {
|
||||||
|
switch modelType {
|
||||||
|
case option.LLMServiceTypeUITARS:
|
||||||
|
return &UITARSContentParser{
|
||||||
|
systemPrompt: uiTarsPlanningPrompt,
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
return &JSONContentParser{
|
||||||
|
systemPrompt: defaultPlanningResponseJsonFormat,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// JSONContentParser handles responses that are formatted as a JSON string.
type JSONContentParser struct {
	systemPrompt string // prompt handed back by SystemPrompt
}

// SystemPrompt returns the system prompt stored in the parser.
func (p *JSONContentParser) SystemPrompt() string {
	return p.systemPrompt
}
|
||||||
|
|
||||||
|
func (p *JSONContentParser) Parse(content string, size types.Size) (*PlanningResult, error) {
|
||||||
|
content = strings.TrimSpace(content)
|
||||||
|
if strings.HasPrefix(content, "```json") && strings.HasSuffix(content, "```") {
|
||||||
|
content = strings.TrimPrefix(content, "```json")
|
||||||
|
content = strings.TrimSuffix(content, "```")
|
||||||
|
}
|
||||||
|
content = strings.TrimSpace(content)
|
||||||
|
|
||||||
|
var response PlanningResult
|
||||||
|
if err := json.Unmarshal([]byte(content), &response); err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to parse VLM response: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if response.Error != "" {
|
||||||
|
return nil, errors.New(response.Error)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(response.Actions) == 0 {
|
||||||
|
return nil, errors.New("no actions returned from VLM")
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalize actions
|
||||||
|
var normalizedActions []Action
|
||||||
|
for i := range response.Actions {
|
||||||
|
// create a new variable, avoid implicit memory aliasing in for loop.
|
||||||
|
action := response.Actions[i]
|
||||||
|
if err := normalizeAction(&action); err != nil {
|
||||||
|
return nil, errors.Wrap(err, "failed to normalize action")
|
||||||
|
}
|
||||||
|
normalizedActions = append(normalizedActions, action)
|
||||||
|
}
|
||||||
|
|
||||||
|
return &PlanningResult{
|
||||||
|
Actions: normalizedActions,
|
||||||
|
ActionSummary: response.ActionSummary,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalizeAction normalizes the coordinates in the action
|
||||||
|
func normalizeAction(action *Action) error {
|
||||||
|
switch action.ActionType {
|
||||||
|
case "click", "drag":
|
||||||
|
// handle click and drag action coordinates
|
||||||
|
if startBox, ok := action.ActionInputs["startBox"].(string); ok {
|
||||||
|
normalized, err := normalizeCoordinates(startBox)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to normalize startBox: %w", err)
|
||||||
|
}
|
||||||
|
action.ActionInputs["startBox"] = normalized
|
||||||
|
}
|
||||||
|
|
||||||
|
if endBox, ok := action.ActionInputs["endBox"].(string); ok {
|
||||||
|
normalized, err := normalizeCoordinates(endBox)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to normalize endBox: %w", err)
|
||||||
|
}
|
||||||
|
action.ActionInputs["endBox"] = normalized
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalizeCoordinates parses a coordinate string returned by the model.
//
// Two formats are accepted:
//   - "<bbox>x1 y1 x2 y2</bbox>" with non-negative integers: the center
//     [(x1+x2)/2, (y1+y2)/2] of the box is returned.
//   - a comma-separated number list, optionally wrapped in [] or (), e.g.
//     "[100, 200]" or "(100, 200)": the numbers are returned as parsed.
//
// The values are not scaled. An error is returned for an empty string, for a
// list that cannot be decoded, and for any other format.
func normalizeCoordinates(coordStr string) (coords []float64, err error) {
	if coordStr == "" {
		return nil, fmt.Errorf("empty coordinate string")
	}

	// handle BBox format: <bbox>x1 y1 x2 y2</bbox>
	bboxRegex := regexp.MustCompile(`<bbox>(\d+\s+\d+\s+\d+\s+\d+)</bbox>`)
	if bboxMatches := bboxRegex.FindStringSubmatch(coordStr); len(bboxMatches) > 1 {
		parts := strings.Fields(bboxMatches[1])
		if len(parts) == 4 {
			box := make([]float64, 4)
			for i, part := range parts {
				val, e := strconv.ParseFloat(part, 64)
				if e != nil {
					return nil, fmt.Errorf("failed to parse coordinate value '%s': %w", part, e)
				}
				box[i] = val
			}
			// reduce the box to its center point [x, y]
			return []float64{(box[0] + box[2]) / 2, (box[1] + box[3]) / 2}, nil
		}
	}

	// handle coordinate string, e.g. "[100, 200]", "(100, 200)"
	if strings.Contains(coordStr, ",") {
		// Trimming strips every surrounding bracket, so the list always has
		// to be wrapped again before it can be decoded as a JSON array.
		jsonStr := "[" + strings.Trim(coordStr, "[]() \t") + "]"
		if err = json.Unmarshal([]byte(jsonStr), &coords); err != nil {
			return nil, fmt.Errorf("failed to parse coordinate string: %w", err)
		}
		return coords, nil
	}

	return nil, fmt.Errorf("invalid coordinate string format: %s", coordStr)
}
|
||||||
33
uixt/ai/parser_test.go
Normal file
33
uixt/ai/parser_test.go
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
package ai
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/httprunner/httprunner/v5/uixt/types"
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestParseAction(t *testing.T) {
|
||||||
|
actionStr := "click(point='<point>200 300</point>')"
|
||||||
|
result, err := ParseAction(actionStr)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
assert.Equal(t, result.Function, "click")
|
||||||
|
assert.Equal(t, result.Args["point"], "<point>200 300</point>")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseActionToStructureOutput(t *testing.T) {
|
||||||
|
text := "Thought: test\nAction: click(point='<point>200 300</point>')"
|
||||||
|
parser := &UITARSContentParser{}
|
||||||
|
result, err := parser.Parse(text, types.Size{Height: 224, Width: 224})
|
||||||
|
assert.Nil(t, err)
|
||||||
|
assert.Equal(t, result.Actions[0].ActionType, "click")
|
||||||
|
assert.Contains(t, result.Actions[0].ActionInputs, "start_box")
|
||||||
|
|
||||||
|
text = "Thought: 我看到页面上有几个帖子,第二个帖子的标题是\"字节四年,头发白了\"。要完成任务,我需要点击这个帖子下方的作者头像,这样就能进入作者的个人主页了。\nAction: click(start_point='<point>550 450 550 450</point>')"
|
||||||
|
result, err = parser.Parse(text, types.Size{Height: 2341, Width: 1024})
|
||||||
|
assert.Nil(t, err)
|
||||||
|
assert.Equal(t, result.Actions[0].ActionType, "click")
|
||||||
|
assert.Contains(t, result.Actions[0].ActionInputs, "start_box")
|
||||||
|
}
|
||||||
207
uixt/ai/parser_ui_tars.go
Normal file
207
uixt/ai/parser_ui_tars.go
Normal file
@@ -0,0 +1,207 @@
|
|||||||
|
package ai
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"regexp"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/httprunner/httprunner/v5/uixt/types"
|
||||||
|
"github.com/rs/zerolog/log"
|
||||||
|
)
|
||||||
|
|
||||||
|
// reference:
|
||||||
|
// https://github.com/bytedance/UI-TARS/blob/main/codes/ui_tars/action_parser.py
|
||||||
|
|
||||||
|
// DefaultFactor is the divisor applied to the coordinates produced by the
// model before they are multiplied by the image width or height.
const DefaultFactor = 1000
|
||||||
|
|
||||||
|
// UITARSContentParser handles responses written in the Thought/Action format.
type UITARSContentParser struct {
	systemPrompt string // prompt handed back by SystemPrompt
}

// SystemPrompt returns the system prompt stored in the parser.
func (p *UITARSContentParser) SystemPrompt() string {
	return p.systemPrompt
}
|
||||||
|
|
||||||
|
// ParseActionToStructureOutput parses the model output text into structured actions.
|
||||||
|
func (p *UITARSContentParser) Parse(content string, size types.Size) (*PlanningResult, error) {
|
||||||
|
text := strings.TrimSpace(content)
|
||||||
|
if strings.Contains(text, "<point>") {
|
||||||
|
text = convertPointToCoordinates(text)
|
||||||
|
}
|
||||||
|
text = strings.ReplaceAll(text, "start_point=", "start_box=")
|
||||||
|
text = strings.ReplaceAll(text, "end_point=", "end_box=")
|
||||||
|
text = strings.ReplaceAll(text, "point=", "start_box=")
|
||||||
|
|
||||||
|
// Extract context (thought/reflection)
|
||||||
|
var thought, reflection string
|
||||||
|
actionIdx := strings.Index(text, "Action:")
|
||||||
|
prefix := ""
|
||||||
|
if actionIdx != -1 {
|
||||||
|
prefix = text[:actionIdx]
|
||||||
|
}
|
||||||
|
if strings.HasPrefix(prefix, "Thought:") {
|
||||||
|
thought = strings.TrimSpace(strings.TrimPrefix(prefix, "Thought:"))
|
||||||
|
} else if strings.HasPrefix(prefix, "Reflection:") {
|
||||||
|
refIdx := strings.Index(prefix, "Action_Summary:")
|
||||||
|
if refIdx != -1 {
|
||||||
|
reflection = strings.TrimSpace(strings.TrimPrefix(prefix[:refIdx], "Reflection:"))
|
||||||
|
thought = strings.TrimSpace(strings.TrimPrefix(prefix[refIdx:], "Action_Summary:"))
|
||||||
|
}
|
||||||
|
} else if strings.HasPrefix(prefix, "Action_Summary:") {
|
||||||
|
thought = strings.TrimSpace(strings.TrimPrefix(prefix, "Action_Summary:"))
|
||||||
|
}
|
||||||
|
if !strings.Contains(text, "Action:") {
|
||||||
|
return nil, fmt.Errorf("no Action: found")
|
||||||
|
}
|
||||||
|
actionStr := strings.SplitN(text, "Action: ", 2)[1]
|
||||||
|
|
||||||
|
rawActions := strings.Split(actionStr, ")\n\n")
|
||||||
|
normalizedActions := make([]string, 0, len(rawActions))
|
||||||
|
for _, act := range rawActions {
|
||||||
|
actionStr := act
|
||||||
|
if strings.Contains(actionStr, "type(content") {
|
||||||
|
if !strings.HasSuffix(strings.TrimSpace(actionStr), ")") {
|
||||||
|
actionStr = strings.TrimSpace(actionStr) + ")"
|
||||||
|
}
|
||||||
|
pattern := regexp.MustCompile(`type\(content='(.*?)'\)`)
|
||||||
|
m := pattern.FindStringSubmatch(actionStr)
|
||||||
|
if len(m) > 1 {
|
||||||
|
content := m[1]
|
||||||
|
actionStr = "type(content='" + escapeSingleQuotes(content) + "')"
|
||||||
|
} else {
|
||||||
|
return nil, fmt.Errorf("pattern not found in the input string")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !strings.HasSuffix(strings.TrimSpace(actionStr), ")") {
|
||||||
|
actionStr = strings.TrimSpace(actionStr) + ")"
|
||||||
|
}
|
||||||
|
normalizedActions = append(normalizedActions, actionStr)
|
||||||
|
}
|
||||||
|
|
||||||
|
actions := make([]Action, 0, len(normalizedActions))
|
||||||
|
for _, action := range normalizedActions {
|
||||||
|
parsed, err := ParseAction(strings.ReplaceAll(action, "\n", "\\n"))
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("Action can't parse: %s", action)
|
||||||
|
}
|
||||||
|
actionType := parsed.Function
|
||||||
|
params := parsed.Args
|
||||||
|
actionInputs := make(map[string]any)
|
||||||
|
imageWidth := size.Width
|
||||||
|
imageHeight := size.Height
|
||||||
|
for paramName, param := range params {
|
||||||
|
if param == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
param = strings.TrimLeft(param, " ")
|
||||||
|
actionInputs[paramName] = param
|
||||||
|
if strings.Contains(paramName, "start_box") || strings.Contains(paramName, "end_box") {
|
||||||
|
oriBox := param
|
||||||
|
parameters := strings.Split(strings.ReplaceAll(strings.ReplaceAll(oriBox, "(", ""), ")", ""), ",")
|
||||||
|
floatNumbers := make([]float64, 0, len(parameters))
|
||||||
|
for _, numStr := range parameters {
|
||||||
|
num, err := strconv.ParseFloat(strings.TrimSpace(numStr), 64)
|
||||||
|
if err != nil {
|
||||||
|
log.Error().Interface("parameters", parameters).Msg("invalid float action parameters")
|
||||||
|
return nil, fmt.Errorf("invalid action parameters")
|
||||||
|
}
|
||||||
|
floatNumbers = append(floatNumbers, num)
|
||||||
|
}
|
||||||
|
// The model generates a 2D coordinate output that represents relative positions.
|
||||||
|
// To convert these values to image-relative coordinates, divide each component by 1000 to obtain values in the range [0,1].
|
||||||
|
// The absolute coordinates required by the Action can be calculated by:
|
||||||
|
// - X absolute = X relative × image width / 1000
|
||||||
|
// - Y absolute = Y relative × image height / 1000
|
||||||
|
if len(floatNumbers) == 2 {
|
||||||
|
floatNumbers[0] = math.Round((floatNumbers[0]/DefaultFactor*float64(imageWidth))*10) / 10
|
||||||
|
floatNumbers[1] = math.Round((floatNumbers[1]/DefaultFactor*float64(imageHeight))*10) / 10
|
||||||
|
} else if len(floatNumbers) == 4 {
|
||||||
|
floatNumbers[0] = math.Round((floatNumbers[0]/DefaultFactor*float64(imageWidth))*10) / 10
|
||||||
|
floatNumbers[1] = math.Round((floatNumbers[1]/DefaultFactor*float64(imageHeight))*10) / 10
|
||||||
|
floatNumbers[2] = math.Round((floatNumbers[2]/DefaultFactor*float64(imageWidth))*10) / 10
|
||||||
|
floatNumbers[3] = math.Round((floatNumbers[3]/DefaultFactor*float64(imageHeight))*10) / 10
|
||||||
|
} else {
|
||||||
|
log.Error().Interface("parameters", floatNumbers).Msg("invalid float action parameters")
|
||||||
|
return nil, fmt.Errorf("invalid action parameters")
|
||||||
|
}
|
||||||
|
actionInputs[paramName] = floatNumbers
|
||||||
|
}
|
||||||
|
}
|
||||||
|
actions = append(actions, Action{
|
||||||
|
Reflection: reflection,
|
||||||
|
Thought: thought,
|
||||||
|
ActionType: actionType,
|
||||||
|
ActionInputs: actionInputs,
|
||||||
|
Text: text,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return &PlanningResult{
|
||||||
|
Actions: actions,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Action is one parsed action together with the context it was found in.
type Action struct {
	Reflection   string         `json:"reflection"`    // reflection text, if any
	Thought      string         `json:"thought"`       // thought text, if any
	ActionType   string         `json:"action_type"`   // name of the action, e.g. "click"
	ActionInputs map[string]any `json:"action_inputs"` // arguments of the action keyed by name
	Text         string         `json:"text"`          // text the action was parsed from
}
|
||||||
|
|
||||||
|
// ParsedActionArgs is what ParseAction returns for one action string.
type ParsedActionArgs struct {
	Function string            // name of the called function
	Args     map[string]string // keyword arguments by name
}
|
||||||
|
|
||||||
|
// convertPointToCoordinates rewrites every <point>…</point> tag in text as a
// parenthesised, comma-separated list: <point>x y</point> becomes (x,y) and
// <point>x1 y1 x2 y2</point> becomes (x1,y1,x2,y2). Text that does not match
// either form is returned unchanged.
func convertPointToCoordinates(text string) string {
	// two numbers, optionally followed by two more
	pointRe := regexp.MustCompile(`<point>(\d+)\s+(\d+)(?:\s+(\d+)\s+(\d+))?</point>`)
	return pointRe.ReplaceAllStringFunc(text, func(tag string) string {
		groups := pointRe.FindStringSubmatch(tag)
		numbers := groups[1:3]
		if groups[3] != "" && groups[4] != "" {
			// four-number form
			numbers = groups[1:5]
		}
		return "(" + strings.Join(numbers, ",") + ")"
	})
}
|
||||||
|
|
||||||
|
// escapeSingleQuotes returns text with a backslash inserted in front of every
// single quote that is not already directly preceded by a backslash.
func escapeSingleQuotes(text string) string {
	var out strings.Builder
	var prev byte // previous input byte; zero before the first one
	for i := 0; i < len(text); i++ {
		ch := text[i]
		if ch == '\'' && prev != '\\' {
			out.WriteByte('\\')
		}
		out.WriteByte(ch)
		prev = ch
	}
	return out.String()
}
|
||||||
|
|
||||||
|
// ParseAction parses an action string into function name and arguments.
|
||||||
|
func ParseAction(actionStr string) (*ParsedActionArgs, error) {
|
||||||
|
re := regexp.MustCompile(`^(\w+)\((.*)\)$`)
|
||||||
|
matches := re.FindStringSubmatch(actionStr)
|
||||||
|
if len(matches) < 3 {
|
||||||
|
return nil, fmt.Errorf("not a function call")
|
||||||
|
}
|
||||||
|
funcName := matches[1]
|
||||||
|
argsStr := matches[2]
|
||||||
|
args := make(map[string]string)
|
||||||
|
argRe := regexp.MustCompile(`(\w+)\s*=\s*'([^']*)'`)
|
||||||
|
for _, m := range argRe.FindAllStringSubmatch(argsStr, -1) {
|
||||||
|
args[m[1]] = m[2]
|
||||||
|
}
|
||||||
|
return &ParsedActionArgs{Function: funcName, Args: args}, nil
|
||||||
|
}
|
||||||
@@ -28,7 +28,7 @@ type PlanningOptions struct {
|
|||||||
// PlanningResult represents the result of planning
|
// PlanningResult represents the result of planning
|
||||||
type PlanningResult struct {
|
type PlanningResult struct {
|
||||||
ToolCalls []schema.ToolCall `json:"tool_calls"` // TODO: merge to NextActions
|
ToolCalls []schema.ToolCall `json:"tool_calls"` // TODO: merge to NextActions
|
||||||
NextActions []ParsedAction `json:"actions"`
|
Actions []Action `json:"actions"`
|
||||||
ActionSummary string `json:"summary"`
|
ActionSummary string `json:"summary"`
|
||||||
Error string `json:"error,omitempty"`
|
Error string `json:"error,omitempty"`
|
||||||
}
|
}
|
||||||
@@ -138,7 +138,7 @@ func (p *Planner) Call(ctx context.Context, opts *PlanningOptions) (*PlanningRes
|
|||||||
|
|
||||||
log.Info().
|
log.Info().
|
||||||
Interface("summary", result.ActionSummary).
|
Interface("summary", result.ActionSummary).
|
||||||
Interface("actions", result.NextActions).
|
Interface("actions", result.Actions).
|
||||||
Msg("get VLM planning result")
|
Msg("get VLM planning result")
|
||||||
return result, nil
|
return result, nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,431 +0,0 @@
|
|||||||
package ai
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"math"
|
|
||||||
"regexp"
|
|
||||||
"strconv"
|
|
||||||
"strings"
|
|
||||||
|
|
||||||
"github.com/httprunner/httprunner/v5/internal/json"
|
|
||||||
"github.com/httprunner/httprunner/v5/uixt/option"
|
|
||||||
"github.com/httprunner/httprunner/v5/uixt/types"
|
|
||||||
"github.com/pkg/errors"
|
|
||||||
"github.com/rs/zerolog/log"
|
|
||||||
)
|
|
||||||
|
|
||||||
// LLMContentParser parses the content from the LLM response
|
|
||||||
// parser is corresponding to the model type and system prompt
|
|
||||||
type LLMContentParser interface {
|
|
||||||
SystemPrompt() string
|
|
||||||
Parse(content string, size types.Size) (*PlanningResult, error)
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewLLMContentParser(modelType option.LLMServiceType) LLMContentParser {
|
|
||||||
switch modelType {
|
|
||||||
case option.LLMServiceTypeUITARS:
|
|
||||||
return &UITARSContentParser{
|
|
||||||
systemPrompt: uiTarsPlanningPrompt,
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
return &JSONContentParser{
|
|
||||||
systemPrompt: defaultPlanningResponseJsonFormat,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ParsedAction represents a parsed action from the VLM response
|
|
||||||
type ParsedAction struct {
|
|
||||||
ActionType ActionType `json:"actionType"`
|
|
||||||
ActionInputs map[string]interface{} `json:"actionInputs"`
|
|
||||||
Thought string `json:"thought"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// ActionType names the kind of an action found in a VLM response.
type ActionType string

// Action types with a dedicated constant.
const (
	ActionTypeClick    ActionType = "click"
	ActionTypeTap      ActionType = "tap"
	ActionTypeDrag     ActionType = "drag"
	ActionTypeSwipe    ActionType = "swipe"
	ActionTypeWait     ActionType = "wait"
	ActionTypeFinished ActionType = "finished"
	ActionTypeCallUser ActionType = "call_user"
	ActionTypeType     ActionType = "type"
	ActionTypeScroll   ActionType = "scroll"
)
|
|
||||||
|
|
||||||
// UITARSContentParser handles responses written in the Thought/Action format.
type UITARSContentParser struct {
	systemPrompt string // prompt handed back by SystemPrompt
}

// SystemPrompt returns the system prompt stored in the parser.
func (p *UITARSContentParser) SystemPrompt() string {
	return p.systemPrompt
}
|
|
||||||
|
|
||||||
func (p *UITARSContentParser) Parse(content string, size types.Size) (*PlanningResult, error) {
|
|
||||||
thoughtRegex := regexp.MustCompile(`(?is)Thought:(.+?)Action:`)
|
|
||||||
actionRegex := regexp.MustCompile(`(?is)Action:(.+)`)
|
|
||||||
|
|
||||||
// extract Thought part
|
|
||||||
thoughtMatch := thoughtRegex.FindStringSubmatch(content)
|
|
||||||
var thought string
|
|
||||||
if len(thoughtMatch) > 1 {
|
|
||||||
thought = strings.TrimSpace(thoughtMatch[1])
|
|
||||||
}
|
|
||||||
|
|
||||||
// extract Action part, e.g. "click(start_box='(552,454)')"
|
|
||||||
actionMatch := actionRegex.FindStringSubmatch(content)
|
|
||||||
if len(actionMatch) < 2 {
|
|
||||||
return nil, errors.New("no action found in the response")
|
|
||||||
}
|
|
||||||
|
|
||||||
actionsText := strings.TrimSpace(actionMatch[1])
|
|
||||||
|
|
||||||
// parse action type and parameters
|
|
||||||
parseActions, err := parseActionText(actionsText, thought)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// process response
|
|
||||||
result, err := processVLMResponse(parseActions, size)
|
|
||||||
if err != nil {
|
|
||||||
return nil, errors.Wrap(err, "process VLM response failed")
|
|
||||||
}
|
|
||||||
return result, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// parseActionText parses the action text to extract the action type and parameters
|
|
||||||
func parseActionText(actionsText, thought string) ([]ParsedAction, error) {
|
|
||||||
// remove trailing comments
|
|
||||||
if idx := strings.Index(actionsText, "#"); idx > 0 {
|
|
||||||
actionsText = strings.TrimSpace(actionsText[:idx])
|
|
||||||
}
|
|
||||||
|
|
||||||
// supported action types and regexes
|
|
||||||
actionRegexes := map[ActionType]*regexp.Regexp{
|
|
||||||
"click": regexp.MustCompile(`click\(start_box='([^']+)'\)`),
|
|
||||||
"left_double": regexp.MustCompile(`left_double\(start_box='([^']+)'\)`),
|
|
||||||
"right_single": regexp.MustCompile(`right_single\(start_box='([^']+)'\)`),
|
|
||||||
"drag": regexp.MustCompile(`drag\(start_box='([^']+)', end_box='([^']+)'\)`),
|
|
||||||
"type": regexp.MustCompile(`type\(content='([^']+)'\)`),
|
|
||||||
"scroll": regexp.MustCompile(`scroll\(start_box='([^']+)', direction='([^']+)'\)`),
|
|
||||||
"wait": regexp.MustCompile(`wait\(\)`),
|
|
||||||
"finished": regexp.MustCompile(`finished\(content='([^']+)'\)`),
|
|
||||||
"call_user": regexp.MustCompile(`call_user\(\)`),
|
|
||||||
}
|
|
||||||
|
|
||||||
// one or multiple actions, separated by newline
|
|
||||||
// "click(start_box='<bbox>229 379 229 379</bbox>')
|
|
||||||
// "click(start_box='<bbox>229 379 229 379</bbox>')\n\nclick(start_box='<bbox>769 519 769 519</bbox>')"
|
|
||||||
parsedActions := make([]ParsedAction, 0)
|
|
||||||
for _, actionText := range strings.Split(actionsText, "\n") {
|
|
||||||
actionText = strings.TrimSpace(actionText)
|
|
||||||
for actionType, regex := range actionRegexes {
|
|
||||||
matches := regex.FindStringSubmatch(actionText)
|
|
||||||
if len(matches) == 0 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
var action ParsedAction
|
|
||||||
action.ActionType = actionType
|
|
||||||
action.ActionInputs = make(map[string]interface{})
|
|
||||||
action.Thought = thought
|
|
||||||
|
|
||||||
// parse parameters based on action type
|
|
||||||
switch actionType {
|
|
||||||
case ActionTypeClick:
|
|
||||||
if len(matches) > 1 {
|
|
||||||
coord, err := normalizeCoordinates(matches[1])
|
|
||||||
if err != nil {
|
|
||||||
return nil, errors.Wrapf(err, "normalize point failed: %s", matches[1])
|
|
||||||
}
|
|
||||||
action.ActionInputs["startBox"] = coord
|
|
||||||
}
|
|
||||||
case ActionTypeDrag:
|
|
||||||
if len(matches) > 2 {
|
|
||||||
// handle start point
|
|
||||||
startBox, err := normalizeCoordinates(matches[1])
|
|
||||||
if err != nil {
|
|
||||||
return nil, errors.Wrapf(err, "normalize startBox failed: %s", matches[1])
|
|
||||||
}
|
|
||||||
action.ActionInputs["startBox"] = startBox
|
|
||||||
|
|
||||||
// handle end point
|
|
||||||
endBox, err := normalizeCoordinates(matches[2])
|
|
||||||
if err != nil {
|
|
||||||
return nil, errors.Wrapf(err, "normalize endBox failed: %s", matches[2])
|
|
||||||
}
|
|
||||||
action.ActionInputs["endBox"] = endBox
|
|
||||||
}
|
|
||||||
case ActionTypeType:
|
|
||||||
if len(matches) > 1 {
|
|
||||||
action.ActionInputs["content"] = matches[1]
|
|
||||||
}
|
|
||||||
case ActionTypeScroll:
|
|
||||||
if len(matches) > 2 {
|
|
||||||
startBox, err := normalizeCoordinates(matches[1])
|
|
||||||
if err != nil {
|
|
||||||
return nil, errors.Wrapf(err, "normalize startBox failed: %s", matches[1])
|
|
||||||
}
|
|
||||||
action.ActionInputs["startBox"] = startBox
|
|
||||||
action.ActionInputs["direction"] = matches[2]
|
|
||||||
}
|
|
||||||
case ActionTypeWait, ActionTypeFinished, ActionTypeCallUser:
|
|
||||||
// 这些动作没有额外参数
|
|
||||||
}
|
|
||||||
|
|
||||||
parsedActions = append(parsedActions, action)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(parsedActions) == 0 {
|
|
||||||
return nil, fmt.Errorf("no valid actions returned from VLM")
|
|
||||||
}
|
|
||||||
return parsedActions, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// normalizeCoordinates parses a coordinate string returned by the model.
//
// Two formats are accepted:
//   - "<bbox>x1 y1 x2 y2</bbox>" with non-negative integers: the center
//     [(x1+x2)/2, (y1+y2)/2] of the box is returned.
//   - a comma-separated number list, optionally wrapped in [] or (), e.g.
//     "[100, 200]" or "(100, 200)": the numbers are returned as parsed.
//
// The values are not scaled. An error is returned for an empty string, for a
// list that cannot be decoded, and for any other format.
func normalizeCoordinates(coordStr string) (coords []float64, err error) {
	if coordStr == "" {
		return nil, fmt.Errorf("empty coordinate string")
	}

	// handle BBox format: <bbox>x1 y1 x2 y2</bbox>
	bboxRegex := regexp.MustCompile(`<bbox>(\d+\s+\d+\s+\d+\s+\d+)</bbox>`)
	if bboxMatches := bboxRegex.FindStringSubmatch(coordStr); len(bboxMatches) > 1 {
		parts := strings.Fields(bboxMatches[1])
		if len(parts) == 4 {
			box := make([]float64, 4)
			for i, part := range parts {
				val, e := strconv.ParseFloat(part, 64)
				if e != nil {
					return nil, fmt.Errorf("failed to parse coordinate value '%s': %w", part, e)
				}
				box[i] = val
			}
			// reduce the box to its center point [x, y]
			return []float64{(box[0] + box[2]) / 2, (box[1] + box[3]) / 2}, nil
		}
	}

	// handle coordinate string, e.g. "[100, 200]", "(100, 200)"
	if strings.Contains(coordStr, ",") {
		// Trimming strips every surrounding bracket, so the list always has
		// to be wrapped again before it can be decoded as a JSON array.
		jsonStr := "[" + strings.Trim(coordStr, "[]() \t") + "]"
		if err = json.Unmarshal([]byte(jsonStr), &coords); err != nil {
			return nil, fmt.Errorf("failed to parse coordinate string: %w", err)
		}
		return coords, nil
	}

	return nil, fmt.Errorf("invalid coordinate string format: %s", coordStr)
}
|
|
||||||
|
|
||||||
// processVLMResponse processes the VLM response and converts it to PlanningResult
|
|
||||||
func processVLMResponse(actions []ParsedAction, size types.Size) (*PlanningResult, error) {
|
|
||||||
log.Info().Msg("processing VLM response...")
|
|
||||||
|
|
||||||
if len(actions) == 0 {
|
|
||||||
return nil, fmt.Errorf("no actions returned from VLM")
|
|
||||||
}
|
|
||||||
|
|
||||||
// validate and post-process each action
|
|
||||||
for i := range actions {
|
|
||||||
// validate action type
|
|
||||||
switch actions[i].ActionType {
|
|
||||||
case "click":
|
|
||||||
if err := convertCoordinateAction(&actions[i], "startBox", size); err != nil {
|
|
||||||
return nil, errors.Wrap(err, "convert coordinate action failed")
|
|
||||||
}
|
|
||||||
case "drag":
|
|
||||||
if err := convertCoordinateAction(&actions[i], "startBox", size); err != nil {
|
|
||||||
return nil, errors.Wrap(err, "convert coordinate action failed")
|
|
||||||
}
|
|
||||||
if err := convertCoordinateAction(&actions[i], "endBox", size); err != nil {
|
|
||||||
return nil, errors.Wrap(err, "convert coordinate action failed")
|
|
||||||
}
|
|
||||||
case "type":
|
|
||||||
validateTypeContent(&actions[i])
|
|
||||||
case "wait", "finished", "call_user":
|
|
||||||
// these actions do not need extra parameters
|
|
||||||
default:
|
|
||||||
log.Printf("warning: unknown action type: %s, will try to continue processing", actions[i].ActionType)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// extract action summary
|
|
||||||
actionSummary := extractActionSummary(actions)
|
|
||||||
|
|
||||||
return &PlanningResult{
|
|
||||||
NextActions: actions,
|
|
||||||
ActionSummary: actionSummary,
|
|
||||||
}, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// extractActionSummary extracts the summary from the actions
|
|
||||||
func extractActionSummary(actions []ParsedAction) string {
|
|
||||||
if len(actions) == 0 {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
|
|
||||||
// use the Thought of the first action as summary
|
|
||||||
if actions[0].Thought != "" {
|
|
||||||
return actions[0].Thought
|
|
||||||
}
|
|
||||||
|
|
||||||
// if no Thought, generate summary from action type
|
|
||||||
action := actions[0]
|
|
||||||
switch action.ActionType {
|
|
||||||
case "click":
|
|
||||||
return "点击操作"
|
|
||||||
case "drag":
|
|
||||||
return "拖拽操作"
|
|
||||||
case "type":
|
|
||||||
content, _ := action.ActionInputs["content"].(string)
|
|
||||||
if len(content) > 20 {
|
|
||||||
content = content[:20] + "..."
|
|
||||||
}
|
|
||||||
return fmt.Sprintf("输入文本: %s", content)
|
|
||||||
case "wait":
|
|
||||||
return "等待操作"
|
|
||||||
case "finished":
|
|
||||||
return "完成操作"
|
|
||||||
case "call_user":
|
|
||||||
return "请求用户协助"
|
|
||||||
default:
|
|
||||||
return fmt.Sprintf("执行 %s 操作", action.ActionType)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func convertCoordinateAction(action *ParsedAction, boxField string, size types.Size) error {
|
|
||||||
// The model generates a 2D coordinate output that represents relative positions.
|
|
||||||
// To convert these values to image-relative coordinates, divide each component by 1000 to obtain values in the range [0,1].
|
|
||||||
// The absolute coordinates required by the Action can be calculated by:
|
|
||||||
// - X absolute = X relative × image width / 1000
|
|
||||||
// - Y absolute = Y relative × image height / 1000
|
|
||||||
|
|
||||||
// get image width and height
|
|
||||||
imageWidth := size.Width
|
|
||||||
imageHeight := size.Height
|
|
||||||
|
|
||||||
box := action.ActionInputs[boxField]
|
|
||||||
coords, ok := box.([]float64)
|
|
||||||
if !ok {
|
|
||||||
log.Error().Interface("inputs", action.ActionInputs).Msg("invalid action inputs")
|
|
||||||
return fmt.Errorf("invalid action inputs")
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(coords) == 2 {
|
|
||||||
coords[0] = math.Round((coords[0]/1000*float64(imageWidth))*10) / 10
|
|
||||||
coords[1] = math.Round((coords[1]/1000*float64(imageHeight))*10) / 10
|
|
||||||
} else if len(coords) == 4 {
|
|
||||||
coords[0] = math.Round((coords[0]/1000*float64(imageWidth))*10) / 10
|
|
||||||
coords[1] = math.Round((coords[1]/1000*float64(imageHeight))*10) / 10
|
|
||||||
coords[2] = math.Round((coords[2]/1000*float64(imageWidth))*10) / 10
|
|
||||||
coords[3] = math.Round((coords[3]/1000*float64(imageHeight))*10) / 10
|
|
||||||
} else {
|
|
||||||
log.Error().Interface("inputs", action.ActionInputs).Msg("invalid action inputs")
|
|
||||||
return fmt.Errorf("invalid action inputs")
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// validateTypeContent 验证输入文本内容
|
|
||||||
func validateTypeContent(action *ParsedAction) {
|
|
||||||
if content, ok := action.ActionInputs["content"]; !ok || content == "" {
|
|
||||||
// default to empty string
|
|
||||||
action.ActionInputs["content"] = ""
|
|
||||||
log.Warn().Msg("type action missing content parameter, set to default")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// JSONContentParser is the content parser for LLM replies that are formatted
// as a JSON string.
type JSONContentParser struct {
	// systemPrompt is the prompt text handed back by SystemPrompt.
	systemPrompt string
}
|
|
||||||
|
|
||||||
func (p *JSONContentParser) SystemPrompt() string {
|
|
||||||
return p.systemPrompt
|
|
||||||
}
|
|
||||||
|
|
||||||
func (p *JSONContentParser) Parse(content string, size types.Size) (*PlanningResult, error) {
|
|
||||||
content = strings.TrimSpace(content)
|
|
||||||
if strings.HasPrefix(content, "```json") && strings.HasSuffix(content, "```") {
|
|
||||||
content = strings.TrimPrefix(content, "```json")
|
|
||||||
content = strings.TrimSuffix(content, "```")
|
|
||||||
}
|
|
||||||
content = strings.TrimSpace(content)
|
|
||||||
|
|
||||||
var response PlanningResult
|
|
||||||
if err := json.Unmarshal([]byte(content), &response); err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to parse VLM response: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if response.Error != "" {
|
|
||||||
return nil, errors.New(response.Error)
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(response.NextActions) == 0 {
|
|
||||||
return nil, errors.New("no actions returned from VLM")
|
|
||||||
}
|
|
||||||
|
|
||||||
// normalize actions
|
|
||||||
var normalizedActions []ParsedAction
|
|
||||||
for i := range response.NextActions {
|
|
||||||
// create a new variable, avoid implicit memory aliasing in for loop.
|
|
||||||
action := response.NextActions[i]
|
|
||||||
if err := normalizeAction(&action); err != nil {
|
|
||||||
return nil, errors.Wrap(err, "failed to normalize action")
|
|
||||||
}
|
|
||||||
normalizedActions = append(normalizedActions, action)
|
|
||||||
}
|
|
||||||
|
|
||||||
return &PlanningResult{
|
|
||||||
NextActions: normalizedActions,
|
|
||||||
ActionSummary: response.ActionSummary,
|
|
||||||
}, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// normalizeAction normalizes the coordinates in the action
|
|
||||||
func normalizeAction(action *ParsedAction) error {
|
|
||||||
switch action.ActionType {
|
|
||||||
case "click", "drag":
|
|
||||||
// handle click and drag action coordinates
|
|
||||||
if startBox, ok := action.ActionInputs["startBox"].(string); ok {
|
|
||||||
normalized, err := normalizeCoordinates(startBox)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("failed to normalize startBox: %w", err)
|
|
||||||
}
|
|
||||||
action.ActionInputs["startBox"] = normalized
|
|
||||||
}
|
|
||||||
|
|
||||||
if endBox, ok := action.ActionInputs["endBox"].(string); ok {
|
|
||||||
normalized, err := normalizeCoordinates(endBox)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("failed to normalize endBox: %w", err)
|
|
||||||
}
|
|
||||||
action.ActionInputs["endBox"] = normalized
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
@@ -12,14 +12,14 @@ Action: ...
|
|||||||
` + "```" + `
|
` + "```" + `
|
||||||
|
|
||||||
## Action Space
|
## Action Space
|
||||||
click(start_box='[x1, y1, x2, y2]')
|
click(point='<point>x1 y1</point>')
|
||||||
left_double(start_box='[x1, y1, x2, y2]')
|
long_press(point='<point>x1 y1</point>')
|
||||||
right_single(start_box='[x1, y1, x2, y2]')
|
type(content='') #If you want to submit your input, use "\\n" at the end of ` + "`content`" + `.
|
||||||
drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
|
scroll(point='<point>x1 y1</point>', direction='down or up or right or left')
|
||||||
hotkey(key='')
|
open_app(app_name=\'\')
|
||||||
type(content='') #If you want to submit your input, use "\n" at the end of ` + "`content`" + `.
|
drag(start_point='<point>x1 y1</point>', end_point='<point>x2 y2</point>')
|
||||||
scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
|
press_home()
|
||||||
wait() #Sleep for 5s and take a screenshot to check for any changes.
|
press_back()
|
||||||
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
|
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
|
||||||
|
|
||||||
## Note
|
## Note
|
||||||
@@ -30,11 +30,4 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par
|
|||||||
`
|
`
|
||||||
|
|
||||||
// system prompt for JSONContentParser
|
// system prompt for JSONContentParser
|
||||||
const defaultPlanningResponseJsonFormat = `You are a versatile professional in software UI automation.
|
const defaultPlanningResponseJsonFormat = `You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.`
|
||||||
|
|
||||||
## Output Format
|
|
||||||
` + "```" + `
|
|
||||||
Thought: ...
|
|
||||||
Action: ...
|
|
||||||
` + "```" + `
|
|
||||||
`
|
|
||||||
|
|||||||
@@ -8,7 +8,6 @@ import (
|
|||||||
"github.com/httprunner/httprunner/v5/code"
|
"github.com/httprunner/httprunner/v5/code"
|
||||||
"github.com/httprunner/httprunner/v5/internal/builtin"
|
"github.com/httprunner/httprunner/v5/internal/builtin"
|
||||||
"github.com/httprunner/httprunner/v5/uixt/option"
|
"github.com/httprunner/httprunner/v5/uixt/option"
|
||||||
"github.com/httprunner/httprunner/v5/uixt/types"
|
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
)
|
)
|
||||||
@@ -58,43 +57,12 @@ func TestVLMPlanning(t *testing.T) {
|
|||||||
// 验证结果
|
// 验证结果
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
require.NotNil(t, result)
|
require.NotNil(t, result)
|
||||||
require.NotEmpty(t, result.NextActions)
|
require.NotEmpty(t, result.Actions)
|
||||||
|
|
||||||
// 验证动作
|
// 验证动作
|
||||||
action := result.NextActions[0]
|
action := result.Actions[0]
|
||||||
assert.NotEmpty(t, action.ActionType)
|
assert.NotEmpty(t, action.ActionType)
|
||||||
assert.NotEmpty(t, action.Thought)
|
assert.NotEmpty(t, action.Thought)
|
||||||
|
|
||||||
// 根据动作类型验证参数
|
|
||||||
switch action.ActionType {
|
|
||||||
case "click", "drag", "left_double", "right_single", "scroll":
|
|
||||||
// 这些动作需要验证坐标
|
|
||||||
assert.NotEmpty(t, action.ActionInputs["startBox"])
|
|
||||||
|
|
||||||
// 验证坐标格式
|
|
||||||
coords, ok := action.ActionInputs["startBox"].([]float64)
|
|
||||||
require.True(t, ok)
|
|
||||||
require.True(t, len(coords) >= 2) // 至少有 x, y 坐标
|
|
||||||
|
|
||||||
// 验证坐标范围
|
|
||||||
for _, coord := range coords {
|
|
||||||
assert.GreaterOrEqual(t, coord, float64(0))
|
|
||||||
}
|
|
||||||
|
|
||||||
case "type":
|
|
||||||
// 验证文本内容
|
|
||||||
assert.NotEmpty(t, action.ActionInputs["content"])
|
|
||||||
|
|
||||||
case "hotkey":
|
|
||||||
// 验证按键
|
|
||||||
assert.NotEmpty(t, action.ActionInputs["key"])
|
|
||||||
|
|
||||||
case "wait", "finished", "call_user":
|
|
||||||
// 这些动作不需要额外参数
|
|
||||||
|
|
||||||
default:
|
|
||||||
t.Fatalf("未知的动作类型: %s", action.ActionType)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestXHSPlanning(t *testing.T) {
|
func TestXHSPlanning(t *testing.T) {
|
||||||
@@ -131,43 +99,12 @@ func TestXHSPlanning(t *testing.T) {
|
|||||||
// 验证结果
|
// 验证结果
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
require.NotNil(t, result)
|
require.NotNil(t, result)
|
||||||
require.NotEmpty(t, result.NextActions)
|
require.NotEmpty(t, result.Actions)
|
||||||
|
|
||||||
// 验证动作
|
// 验证动作
|
||||||
action := result.NextActions[0]
|
action := result.Actions[0]
|
||||||
assert.NotEmpty(t, action.ActionType)
|
assert.NotEmpty(t, action.ActionType)
|
||||||
assert.NotEmpty(t, action.Thought)
|
assert.NotEmpty(t, action.Thought)
|
||||||
|
|
||||||
// 根据动作类型验证参数
|
|
||||||
switch action.ActionType {
|
|
||||||
case "click", "drag", "left_double", "right_single", "scroll":
|
|
||||||
// 这些动作需要验证坐标
|
|
||||||
assert.NotEmpty(t, action.ActionInputs["startBox"])
|
|
||||||
|
|
||||||
// 验证坐标格式
|
|
||||||
coords, ok := action.ActionInputs["startBox"].([]float64)
|
|
||||||
require.True(t, ok)
|
|
||||||
require.True(t, len(coords) >= 2) // 至少有 x, y 坐标
|
|
||||||
|
|
||||||
// 验证坐标范围
|
|
||||||
for _, coord := range coords {
|
|
||||||
assert.GreaterOrEqual(t, coord, float64(0))
|
|
||||||
}
|
|
||||||
|
|
||||||
case "type":
|
|
||||||
// 验证文本内容
|
|
||||||
assert.NotEmpty(t, action.ActionInputs["content"])
|
|
||||||
|
|
||||||
case "hotkey":
|
|
||||||
// 验证按键
|
|
||||||
assert.NotEmpty(t, action.ActionInputs["key"])
|
|
||||||
|
|
||||||
case "wait", "finished", "call_user":
|
|
||||||
// 这些动作不需要额外参数
|
|
||||||
|
|
||||||
default:
|
|
||||||
t.Fatalf("未知的动作类型: %s", action.ActionType)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestChatList(t *testing.T) {
|
func TestChatList(t *testing.T) {
|
||||||
@@ -218,11 +155,11 @@ func TestHandleSwitch(t *testing.T) {
|
|||||||
|
|
||||||
testCases := []struct {
|
testCases := []struct {
|
||||||
imageFile string
|
imageFile string
|
||||||
actionType ActionType
|
actionType string
|
||||||
}{
|
}{
|
||||||
{"testdata/deepseek_think_off.png", ActionTypeClick},
|
{"testdata/deepseek_think_off.png", "finished"},
|
||||||
{"testdata/deepseek_think_on.png", ActionTypeFinished},
|
{"testdata/deepseek_think_on.png", "finished"},
|
||||||
{"testdata/deepseek_network_on.png", ActionTypeFinished},
|
{"testdata/deepseek_network_on.png", "finished"},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tc := range testCases {
|
for _, tc := range testCases {
|
||||||
@@ -251,7 +188,7 @@ func TestHandleSwitch(t *testing.T) {
|
|||||||
// Validate results
|
// Validate results
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
require.NotNil(t, result)
|
require.NotNil(t, result)
|
||||||
require.Equal(t, result.NextActions[0].ActionType, tc.actionType,
|
require.Equal(t, result.Actions[0].ActionType, tc.actionType,
|
||||||
"Unexpected action type for image file: %s", tc.imageFile)
|
"Unexpected action type for image file: %s", tc.imageFile)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -336,52 +273,6 @@ func TestValidateInput(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestProcessVLMResponse(t *testing.T) {
|
|
||||||
tests := []struct {
|
|
||||||
name string
|
|
||||||
actions []ParsedAction
|
|
||||||
wantErr bool
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
name: "valid response",
|
|
||||||
actions: []ParsedAction{
|
|
||||||
{
|
|
||||||
ActionType: "click",
|
|
||||||
ActionInputs: map[string]interface{}{
|
|
||||||
"startBox": []float64{0.5, 0.5},
|
|
||||||
},
|
|
||||||
Thought: "点击中心位置",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
wantErr: false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "empty actions",
|
|
||||||
actions: []ParsedAction{},
|
|
||||||
wantErr: true,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
size := types.Size{
|
|
||||||
Width: 1000,
|
|
||||||
Height: 1000,
|
|
||||||
}
|
|
||||||
for _, tt := range tests {
|
|
||||||
t.Run(tt.name, func(t *testing.T) {
|
|
||||||
result, err := processVLMResponse(tt.actions, size)
|
|
||||||
if tt.wantErr {
|
|
||||||
assert.Error(t, err)
|
|
||||||
assert.Nil(t, result)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
assert.NoError(t, err)
|
|
||||||
assert.NotNil(t, result)
|
|
||||||
assert.Equal(t, tt.actions, result.NextActions)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestLoadImage(t *testing.T) {
|
func TestLoadImage(t *testing.T) {
|
||||||
// Test PNG image
|
// Test PNG image
|
||||||
pngBase64, pngSize, err := builtin.LoadImage("testdata/llk_1.png")
|
pngBase64, pngSize, err := builtin.LoadImage("testdata/llk_1.png")
|
||||||
|
|||||||
@@ -40,14 +40,14 @@ func (dExt *XTDriver) AIAction(text string, opts ...option.ActionOption) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// do actions
|
// do actions
|
||||||
for _, action := range result.NextActions {
|
for _, action := range result.Actions {
|
||||||
switch action.ActionType {
|
switch action.ActionType {
|
||||||
case ai.ActionTypeClick:
|
case "click":
|
||||||
point := action.ActionInputs["startBox"].([]float64)
|
point := action.ActionInputs["startBox"].([]float64)
|
||||||
if err := dExt.TapAbsXY(point[0], point[1], opts...); err != nil {
|
if err := dExt.TapAbsXY(point[0], point[1], opts...); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
case ai.ActionTypeFinished:
|
case "finished":
|
||||||
log.Info().Msg("ai action done")
|
log.Info().Msg("ai action done")
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user