feat: add ui-tars planner

This commit is contained in:
lilong.129
2025-03-18 23:55:10 +08:00
parent d63e5e0c1a
commit 6c74727c44
12 changed files with 1323 additions and 1 deletions

1
.gitignore vendored
View File

@@ -7,6 +7,7 @@
# Test binary, built with `go test -c`
*.test
.env
# Output of the go coverage tool, specifically when used with LiteIDE
*.out

View File

@@ -1 +1 @@
v5.0.0-beta-2503172040
v5.0.0-beta-2503190009

149
planner/env.go Normal file
View File

@@ -0,0 +1,149 @@
package planner
import (
"encoding/json"
"fmt"
"os"
"strconv"
"time"
"github.com/cloudwego/eino-ext/components/model/openai"
"github.com/joho/godotenv"
"github.com/rs/zerolog/log"
)
// defaultTimeout is the timeout value that GetModelConfig places on the chat
// model configuration it builds.
const (
defaultTimeout = 60 * time.Second
)
// OpenAIInitConfig is the shape GetModelConfig decodes the JSON value of the
// MIDSCENE_OPENAI_INIT_CONFIG_JSON environment variable into.
type OpenAIInitConfig struct {
// ReportURL is populated from the "REPORT_SERVER_URL" JSON key.
ReportURL string `json:"REPORT_SERVER_URL"`
// Headers is populated from the "defaultHeaders" JSON key.
Headers map[string]string `json:"defaultHeaders"`
}
// Names of the environment variables read by this package.
const (
// EnvOpenAIBaseURL names the model endpoint base URL; GetModelConfig requires it.
EnvOpenAIBaseURL = "OPENAI_BASE_URL"
// EnvOpenAIAPIKey names the API key; GetModelConfig requires it.
EnvOpenAIAPIKey = "OPENAI_API_KEY"
// EnvModelName names the model to use; GetModelConfig requires it.
EnvModelName = "MIDSCENE_MODEL_NAME"
// EnvOpenAIInitConfigJSON names an optional JSON document decoded into OpenAIInitConfig.
EnvOpenAIInitConfigJSON = "MIDSCENE_OPENAI_INIT_CONFIG_JSON"
// EnvUseVLMUITars names the boolean flag reported by IsUseVLMUITars.
EnvUseVLMUITars = "MIDSCENE_USE_VLM_UI_TARS"
)
// loadEnv reads the dotenv file at envPath into the process environment via
// godotenv.Load. It returns the loader's error unchanged, and logs the path
// once loading has succeeded.
func loadEnv(envPath string) error {
	if loadErr := godotenv.Load(envPath); loadErr != nil {
		return loadErr
	}

	log.Info().Str("path", envPath).Msg("load env success")
	return nil
}
// GetEnvConfig returns the value of the environment variable named key.
// An unset variable yields the empty string, the same as a variable set to "".
func GetEnvConfig(key string) string {
	value := os.Getenv(key)
	return value
}
// GetEnvConfigInJSON decodes the environment variable named key as a JSON
// object.
//
// It returns (nil, nil) when the variable is unset or empty, and (nil, err)
// when the value is not valid JSON for a map.
func GetEnvConfigInJSON(key string) (map[string]interface{}, error) {
	raw := GetEnvConfig(key)
	if raw == "" {
		return nil, nil
	}

	var decoded map[string]interface{}
	decodeErr := json.Unmarshal([]byte(raw), &decoded)
	if decodeErr != nil {
		return nil, decodeErr
	}
	return decoded, nil
}
// GetEnvConfigInBool interprets the environment variable named key as a
// boolean using strconv.ParseBool. An unset, empty, or unparseable value is
// reported as false.
func GetEnvConfigInBool(key string) bool {
	raw := GetEnvConfig(key)
	if raw == "" {
		return false
	}

	enabled, parseErr := strconv.ParseBool(raw)
	if parseErr != nil {
		// Unrecognized spellings are deliberately treated as "off".
		return false
	}
	return enabled
}
// GetEnvConfigOrDefault returns the environment variable named key, or
// defaultValue when that variable is unset or empty.
func GetEnvConfigOrDefault(key, defaultValue string) string {
	if configured := GetEnvConfig(key); configured != "" {
		return configured
	}
	return defaultValue
}
// GetEnvConfigInInt interprets the environment variable named key as a
// base-10 integer. It returns defaultValue when the variable is unset, empty,
// or not a valid integer.
func GetEnvConfigInInt(key string, defaultValue int) int {
	// strconv.Atoi rejects the empty string, so the unset/empty case and the
	// malformed case share the same fallback.
	parsed, parseErr := strconv.Atoi(GetEnvConfig(key))
	if parseErr != nil {
		return defaultValue
	}
	return parsed
}
// GetModelConfig builds the chat model configuration from the environment.
//
// OPENAI_BASE_URL, OPENAI_API_KEY and MIDSCENE_MODEL_NAME are all required; a
// missing one produces a "miss env <NAME>" error. If
// MIDSCENE_OPENAI_INIT_CONFIG_JSON is set it must be valid JSON, otherwise the
// decode error is returned. The resulting configuration always carries
// defaultTimeout.
//
// NOTE(review): the decoded OpenAIInitConfig (report URL, default headers) is
// validated here but not copied into the returned configuration — confirm
// whether that is intended.
func GetModelConfig() (*openai.ChatModelConfig, error) {
	// Decode the optional JSON settings first so malformed input fails early.
	initConfig := &OpenAIInitConfig{Headers: map[string]string{}}
	if rawJSON := GetEnvConfig(EnvOpenAIInitConfigJSON); rawJSON != "" {
		if err := json.Unmarshal([]byte(rawJSON), initConfig); err != nil {
			return nil, err
		}
	}

	baseURL := GetEnvConfig(EnvOpenAIBaseURL)
	if baseURL == "" {
		return nil, fmt.Errorf("miss env %s", EnvOpenAIBaseURL)
	}

	apiKey := GetEnvConfig(EnvOpenAIAPIKey)
	if apiKey == "" {
		return nil, fmt.Errorf("miss env %s", EnvOpenAIAPIKey)
	}

	modelName := GetEnvConfig(EnvModelName)
	if modelName == "" {
		return nil, fmt.Errorf("miss env %s", EnvModelName)
	}

	modelConfig := &openai.ChatModelConfig{
		Timeout: defaultTimeout,
		BaseURL: baseURL,
		APIKey:  apiKey,
		Model:   modelName,
	}

	// The API key is masked so the secret never reaches the log in full.
	log.Info().
		Str("model", modelConfig.Model).
		Str("baseURL", modelConfig.BaseURL).
		Str("apiKey", maskAPIKey(modelConfig.APIKey)).
		Str("timeout", defaultTimeout.String()).
		Msg("get model config")

	return modelConfig, nil
}
// maskAPIKey hides the middle of an API key for logging. Keys longer than
// eight bytes keep their first and last four bytes around a "******" filler;
// shorter keys are replaced entirely by "******".
func maskAPIKey(key string) string {
	const (
		filler  = "******"
		visible = 4
	)

	if n := len(key); n > 2*visible {
		return key[:visible] + filler + key[n-visible:]
	}
	return filler
}
// IsUseVLMUITars reports whether the MIDSCENE_USE_VLM_UI_TARS environment
// variable holds a true boolean value.
func IsUseVLMUITars() bool {
	useUITars := GetEnvConfigInBool(EnvUseVLMUITars)
	return useUITars
}

50
planner/go.mod Normal file
View File

@@ -0,0 +1,50 @@
module github.com/httprunner/httprunner/v5/planner
go 1.24.1
require (
github.com/cloudwego/eino v0.3.16
github.com/cloudwego/eino-ext/components/model/openai v0.0.0-20250314110024-9e89ba18146c
github.com/joho/godotenv v1.5.1
github.com/pkg/errors v0.9.1
github.com/rs/zerolog v1.33.0
github.com/stretchr/testify v1.10.0
)
require (
github.com/bytedance/sonic v1.12.2 // indirect
github.com/bytedance/sonic/loader v0.2.0 // indirect
github.com/cloudwego/base64x v0.1.4 // indirect
github.com/cloudwego/eino-ext/libs/acl/openai v0.0.0-20250305023926-469de0301955 // indirect
github.com/cloudwego/iasm v0.2.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/getkin/kin-openapi v0.118.0 // indirect
github.com/go-openapi/jsonpointer v0.19.5 // indirect
github.com/go-openapi/swag v0.19.5 // indirect
github.com/goph/emperror v0.17.2 // indirect
github.com/invopop/yaml v0.1.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/cpuid/v2 v2.0.9 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.19 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 // indirect
github.com/nikolalohinski/gonja v1.5.3 // indirect
github.com/pelletier/go-toml/v2 v2.0.9 // indirect
github.com/perimeterx/marshmallow v1.1.4 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/sashabaranov/go-openai v1.32.5 // indirect
github.com/sirupsen/logrus v1.9.3 // indirect
github.com/slongfield/pyfmt v0.0.0-20220222012616-ea85ff4c361f // indirect
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/yargevad/filepathx v1.0.0 // indirect
golang.org/x/arch v0.11.0 // indirect
golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1 // indirect
golang.org/x/sys v0.28.0 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)

175
planner/go.sum Normal file
View File

@@ -0,0 +1,175 @@
github.com/airbrake/gobrake v3.6.1+incompatible/go.mod h1:wM4gu3Cn0W0K7GUuVWnlXZU11AGBXMILnrdOU8Kn00o=
github.com/bitly/go-simplejson v0.5.0/go.mod h1:cXHtHw4XUPsvGaxgjIAn8PhEWG9NfngEKAMDJEczWVA=
github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869/go.mod h1:Ekp36dRnpXw/yCqJaO+ZrUyxD+3VXMFFr56k5XYrpB4=
github.com/bugsnag/bugsnag-go v1.4.0/go.mod h1:2oa8nejYd4cQ/b0hMIopN0lCRxU0bueqREvZLWFrtK8=
github.com/bugsnag/panicwrap v1.2.0/go.mod h1:D/8v3kj0zr8ZAKg1AQ6crr+5VwKN5eIywRkfhyM/+dE=
github.com/bytedance/mockey v1.2.13 h1:jokWZAm/pUEbD939Rhznz615MKUCZNuvCFQlJ2+ntoo=
github.com/bytedance/mockey v1.2.13/go.mod h1:1BPHF9sol5R1ud/+0VEHGQq/+i2lN+GTsr3O2Q9IENY=
github.com/bytedance/sonic v1.12.2 h1:oaMFuRTpMHYLpCntGca65YWt5ny+wAceDERTkT2L9lg=
github.com/bytedance/sonic v1.12.2/go.mod h1:B8Gt/XvtZ3Fqj+iSKMypzymZxw/FVwgIGKzMzT9r/rk=
github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU=
github.com/bytedance/sonic/loader v0.2.0 h1:zNprn+lsIP06C/IqCHs3gPQIvnvpKbbxyXQP1iU4kWM=
github.com/bytedance/sonic/loader v0.2.0/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU=
github.com/certifi/gocertifi v0.0.0-20190105021004-abcd57078448/go.mod h1:GJKEexRPVJrBSOjoqN5VNOIKJ5Q3RViH6eu3puDRwx4=
github.com/cloudwego/base64x v0.1.4 h1:jwCgWpFanWmN8xoIUHa2rtzmkd5J2plF/dnLS6Xd/0Y=
github.com/cloudwego/base64x v0.1.4/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJgA0rcu/8w=
github.com/cloudwego/eino v0.3.16 h1:ASN8zISyoEdjEsPnIw5GazSHtbNY97NDthQ2B69yiZw=
github.com/cloudwego/eino v0.3.16/go.mod h1:+kmJimGEcKuSI6OKhet7kBedkm1WUZS3H1QRazxgWUo=
github.com/cloudwego/eino-ext/components/model/openai v0.0.0-20250314110024-9e89ba18146c h1:04WQpGikdQv6fh5wzMYSQhO0SJraV8+xcb9VQ00+HX4=
github.com/cloudwego/eino-ext/components/model/openai v0.0.0-20250314110024-9e89ba18146c/go.mod h1:YGP4q3uspj5qhkv3CnvlEPSo0YGeWpvkkTUHHpLExas=
github.com/cloudwego/eino-ext/libs/acl/openai v0.0.0-20250305023926-469de0301955 h1:fgvkmTqAalDfjdy3b6Ur2mh/KEwB9L2uvqS4MFgTOqc=
github.com/cloudwego/eino-ext/libs/acl/openai v0.0.0-20250305023926-469de0301955/go.mod h1:6CThw1XQx/ASXNt31yuvp0X4Yp4GprknQuIvP9VKDpw=
github.com/cloudwego/iasm v0.2.0 h1:1KNIy1I1H9hNNFEEH3DVnI4UujN+1zjpuk6gwHLTssg=
github.com/cloudwego/iasm v0.2.0/go.mod h1:8rXZaNYT2n95jn+zTI1sDr+IgcD2GVs0nlbbQPiEFhY=
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
github.com/getkin/kin-openapi v0.118.0 h1:z43njxPmJ7TaPpMSCQb7PN0dEYno4tyBPQcrFdHoLuM=
github.com/getkin/kin-openapi v0.118.0/go.mod h1:l5e9PaFUo9fyLJCPGQeXI2ML8c3P8BHOEV2VaAVf/pc=
github.com/getsentry/raven-go v0.2.0/go.mod h1:KungGk8q33+aIAZUIVWZDr2OfAEBsO49PX4NzFV5kcQ=
github.com/go-check/check v0.0.0-20180628173108-788fd7840127 h1:0gkP6mzaMqkmpcJYCFOLkIBwI7xFExG03bbkOkCvUPI=
github.com/go-check/check v0.0.0-20180628173108-788fd7840127/go.mod h1:9ES+weclKsC9YodN5RgxqK/VD9HM9JsCSh7rNhMZE98=
github.com/go-openapi/jsonpointer v0.19.5 h1:gZr+CIYByUqjcgeLXnQu2gHYQC9o73G2XUeOFYEICuY=
github.com/go-openapi/jsonpointer v0.19.5/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg=
github.com/go-openapi/swag v0.19.5 h1:lTz6Ys4CmqqCQmZPBlbQENR1/GucA2bzYTE12Pw4tFY=
github.com/go-openapi/swag v0.19.5/go.mod h1:POnQmlKehdgb5mhVOsnJFsivZCEZ/vjK9gh66Z9tfKk=
github.com/go-test/deep v1.0.8 h1:TDsG77qcSprGbC6vTN8OuXp5g+J+b5Pcguhf7Zt61VM=
github.com/go-test/deep v1.0.8/go.mod h1:5C2ZWiW0ErCdrYzpqxLbTX7MG14M9iiw8DgHncVwcsE=
github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
github.com/gofrs/uuid v3.2.0+incompatible/go.mod h1:b2aQJv3Z4Fp6yNu3cdSllBxTCLRxnplIgP/c0N/04lM=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/goph/emperror v0.17.2 h1:yLapQcmEsO0ipe9p5TaN22djm3OFV/TfM/fcYP0/J18=
github.com/goph/emperror v0.17.2/go.mod h1:+ZbQ+fUNO/6FNiUo0ujtMjhgad9Xa6fQL9KhH4LNHic=
github.com/gopherjs/gopherjs v1.17.2 h1:fQnZVsXk8uxXIStYb0N4bGk7jeyTalG/wsZjQ25dO0g=
github.com/gopherjs/gopherjs v1.17.2/go.mod h1:pRRIvn/QzFLrKfvEz3qUuEhtE/zLCWfreZ6J5gM2i+k=
github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So=
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
github.com/invopop/yaml v0.1.0 h1:YW3WGUoJEXYfzWBjn00zIlrw7brGVD0fUKRYDPAPhrc=
github.com/invopop/yaml v0.1.0/go.mod h1:2XuRLgs/ouIrW3XNzuNj7J3Nvu/Dig5MXvbCEdiBN3Q=
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo=
github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
github.com/kardianos/osext v0.0.0-20190222173326-2bc1f35cddc0/go.mod h1:1NbS8ALrpOvjt0rHPNLyCIeMtbizbir8U//inJ+zuB8=
github.com/klauspost/cpuid/v2 v2.0.9 h1:lgaqFMSdTdQYdZ04uHyN2d/eKdOMyi2YLSvlQIBFYa4=
github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
github.com/knz/go-libedit v1.10.1/go.mod h1:MZTVkCWyz0oBc7JOWP3wNAzd002ZbM/5hgShxwh4x8M=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA=
github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mgutz/ansi v0.0.0-20170206155736-9520e82c474b h1:j7+1HpAFS1zy5+Q4qx1fWh90gTKwiN4QCGoY9TWyyO4=
github.com/mgutz/ansi v0.0.0-20170206155736-9520e82c474b/go.mod h1:01TrycV0kFyexm33Z7vhZRXopbI8J3TDReVlkTgMUxE=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 h1:RWengNIwukTxcDr9M+97sNutRR1RKhG96O6jWumTTnw=
github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826/go.mod h1:TaXosZuwdSHYgviHp1DAtfrULt5eUgsSMsZf+YrPgl8=
github.com/nikolalohinski/gonja v1.5.3 h1:GsA+EEaZDZPGJ8JtpeGN78jidhOlxeJROpqMT9fTj9c=
github.com/nikolalohinski/gonja v1.5.3/go.mod h1:RmjwxNiXAEqcq1HeK5SSMmqFJvKOfTfXhkJv6YBtPa4=
github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
github.com/onsi/ginkgo v1.8.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
github.com/onsi/gomega v1.5.0/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY=
github.com/pelletier/go-toml/v2 v2.0.9 h1:uH2qQXheeefCCkuBBSLi7jCiSmj3VRh2+Goq2N7Xxu0=
github.com/pelletier/go-toml/v2 v2.0.9/go.mod h1:tJU2Z3ZkXwnxa4DPO899bsyIoywizdUvyaeZurnPPDc=
github.com/perimeterx/marshmallow v1.1.4 h1:pZLDH9RjlLGGorbXhcaQLhfuV0pFMNfPO55FuFkxqLw=
github.com/perimeterx/marshmallow v1.1.4/go.mod h1:dsXbUu8CRzfYP5a87xpp0xq9S3u0Vchtcl8we9tYaXw=
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rollbar/rollbar-go v1.0.2/go.mod h1:AcFs5f0I+c71bpHlXNNDbOWJiKwjFDtISeXco0L5PKQ=
github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
github.com/rs/zerolog v1.33.0 h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8=
github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss=
github.com/sashabaranov/go-openai v1.32.5 h1:/eNVa8KzlE7mJdKPZDj6886MUzZQjoVHyn0sLvIt5qA=
github.com/sashabaranov/go-openai v1.32.5/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
github.com/slongfield/pyfmt v0.0.0-20220222012616-ea85ff4c361f h1:Z2cODYsUxQPofhpYRMQVwWz4yUVpHF+vPi+eUdruUYI=
github.com/slongfield/pyfmt v0.0.0-20220222012616-ea85ff4c361f/go.mod h1:JqzWyvTuI2X4+9wOHmKSQCYxybB/8j6Ko43qVmXDuZg=
github.com/smarty/assertions v1.15.0 h1:cR//PqUBUiQRakZWqBiFFQ9wb8emQGDb0HeGdqGByCY=
github.com/smarty/assertions v1.15.0/go.mod h1:yABtdzeQs6l1brC900WlRNwj6ZR55d7B+E8C6HtKdec=
github.com/smartystreets/goconvey v1.8.1 h1:qGjIddxOk4grTu9JPOU31tVfq3cNdBlNa5sSznIX1xY=
github.com/smartystreets/goconvey v1.8.1/go.mod h1:+/u4qLyY6x1jReYOp7GOM2FSt8aP9CzCZL03bI28W60=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI=
github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
github.com/ugorji/go v1.2.7 h1:qYhyWUUd6WbiM+C6JZAUkIJt/1WrjzNHY9+KCIjVqTo=
github.com/ugorji/go v1.2.7/go.mod h1:nF9osbDWLy6bDVv/Rtoh6QgnvNDpmCalQV5urGCCS6M=
github.com/ugorji/go/codec v1.2.7 h1:YPXUKf7fYbp/y8xloBqZOw2qaVggbfwMlI8WM3wZUJ0=
github.com/ugorji/go/codec v1.2.7/go.mod h1:WGN1fab3R1fzQlVQTkfxVtIBhWDRqOviHU95kRgeqEY=
github.com/x-cray/logrus-prefixed-formatter v0.5.2 h1:00txxvfBM9muc0jiLIEAkAcIMJzfthRT6usrui8uGmg=
github.com/x-cray/logrus-prefixed-formatter v0.5.2/go.mod h1:2duySbKsL6M18s5GU7VPsoEPHyzalCE06qoARUCeBBE=
github.com/yargevad/filepathx v1.0.0 h1:SYcT+N3tYGi+NvazubCNlvgIPbzAk7i7y2dwg3I5FYc=
github.com/yargevad/filepathx v1.0.0/go.mod h1:BprfX/gpYNJHJfc35GjRRpVcwWXS89gGulUIU5tK3tA=
golang.org/x/arch v0.11.0 h1:KXV8WWKCXm6tRpLirl2szsO5j/oOODwZf4hATmGVNs4=
golang.org/x/arch v0.11.0/go.mod h1:FEVrYAQjsQXMVJ1nsMoVVXPZg6p2JE2mx8psSWTDQys=
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U=
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1 h1:MGwJjxBy0HJshjDNfLsYO8xppfqWlA5ZT9OhtUUhTNw=
golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1/go.mod h1:FXUEEKJgO7OQYeo8N01OfiKP8RXMtf6e8aTskBGqWdc=
golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA=
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q=
golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.0/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
nullprogram.com/x/optparse v1.0.0/go.mod h1:KdyPE+Igbe0jQUrVfMqDMeJQIJZEuyV7pjYmp6pbG50=

250
planner/parser.go Normal file
View File

@@ -0,0 +1,250 @@
package planner
import (
"encoding/json"
"fmt"
"regexp"
"strings"
"github.com/pkg/errors"
)
// ActionParser turns the text of a VLM response into structured actions.
type ActionParser struct {
	// Prediction is the raw model output handed to the constructor.
	Prediction string
	// Factor is the coordinate factor handed to the constructor.
	// NOTE(review): none of the parsing methods visible here read it — confirm
	// where it is meant to be applied.
	Factor float64
}

// NewActionParser returns an ActionParser holding the given prediction text
// and coordinate factor.
func NewActionParser(prediction string, factor float64) *ActionParser {
	parser := new(ActionParser)
	parser.Prediction = prediction
	parser.Factor = factor
	return parser
}
// Parse extracts the actions described by predictionText.
//
// The JSON format is tried first; if it yields no actions, the
// "Thought: ... Action: ..." format is tried. When both attempts fail, the
// returned error wraps both underlying errors so callers can inspect them
// with errors.Is / errors.As. If neither attempt produced actions but at
// least one reported no error, a generic "no actions" error is returned.
func (p *ActionParser) Parse(predictionText string) ([]ParsedAction, error) {
	// try parsing JSON format
	jsonActions, jsonErr := p.parseJSON(predictionText)
	if jsonErr == nil && len(jsonActions) > 0 {
		return jsonActions, nil
	}

	// if JSON parsing fails, try parsing Thought/Action format
	thoughtActions, thoughtErr := p.parseThoughtAction(predictionText)
	if thoughtErr == nil && len(thoughtActions) > 0 {
		return thoughtActions, nil
	}

	// both parsing methods failed: keep both causes in the error chain
	if jsonErr != nil && thoughtErr != nil {
		return nil, fmt.Errorf("failed to parse VLM response: %w; %w", jsonErr, thoughtErr)
	}
	return nil, fmt.Errorf("no actions returned from VLM")
}
// parseJSON decodes predictionText as a JSON VLMResponse and returns its
// normalized actions.
//
// Surrounding whitespace is ignored and a "```json" ... "```" code fence is
// stripped if present. An error is returned when the text is not valid JSON
// (the decode error is wrapped), when the response carries a non-empty Error
// field, when it contains no actions, or when an action cannot be normalized.
func (p *ActionParser) parseJSON(predictionText string) ([]ParsedAction, error) {
	payload := strings.TrimSpace(predictionText)
	if strings.HasPrefix(payload, "```json") && strings.HasSuffix(payload, "```") {
		payload = strings.TrimSuffix(strings.TrimPrefix(payload, "```json"), "```")
	}
	payload = strings.TrimSpace(payload)

	var response VLMResponse
	if err := json.Unmarshal([]byte(payload), &response); err != nil {
		// %w keeps the decode error reachable through the error chain.
		return nil, fmt.Errorf("failed to parse VLM response: %w", err)
	}
	if response.Error != "" {
		return nil, errors.New(response.Error)
	}
	if len(response.Actions) == 0 {
		return nil, errors.New("no actions returned from VLM")
	}

	// Normalize in place; response is local, so nothing else observes it.
	for i := range response.Actions {
		if err := p.normalizeAction(&response.Actions[i]); err != nil {
			return nil, errors.Wrap(err, "failed to normalize action")
		}
	}
	return response.Actions, nil
}
// Regexes for the "Thought: ... Action: ..." response layout, compiled once
// at package initialization instead of on every call. Both are
// case-insensitive and let "." span newlines.
var (
	// thoughtSectionRegex captures the text between "Thought:" and the first
	// following "Action:".
	thoughtSectionRegex = regexp.MustCompile(`(?is)Thought:(.+?)Action:`)
	// actionSectionRegex captures everything after the first "Action:".
	actionSectionRegex = regexp.MustCompile(`(?is)Action:(.+)`)
)

// parseThoughtAction parses a response written in the Thought/Action layout.
//
// The Thought section is optional and defaults to the empty string. The
// Action section is required; without it an error is returned. The trimmed
// action text and thought are then handed to parseActionText.
func (p *ActionParser) parseThoughtAction(predictionText string) ([]ParsedAction, error) {
	// extract Action part
	actionMatch := actionSectionRegex.FindStringSubmatch(predictionText)
	if len(actionMatch) < 2 {
		return nil, fmt.Errorf("no action found in the response")
	}

	// extract Thought part
	var thought string
	if thoughtMatch := thoughtSectionRegex.FindStringSubmatch(predictionText); len(thoughtMatch) > 1 {
		thought = strings.TrimSpace(thoughtMatch[1])
	}

	// parse action type and parameters
	return p.parseActionText(strings.TrimSpace(actionMatch[1]), thought)
}
// uiTarsActionPattern pairs an action name with the regex that recognizes its
// call syntax in the model output.
type uiTarsActionPattern struct {
	actionType string
	regex      *regexp.Regexp
}

// uiTarsActionPatterns lists the supported action call syntaxes. It is an
// ordered slice rather than a map so that matching is deterministic, and it
// is compiled once at package initialization.
var uiTarsActionPatterns = []uiTarsActionPattern{
	{"click", regexp.MustCompile(`click\(start_box='([^']+)'\)`)},
	{"left_double", regexp.MustCompile(`left_double\(start_box='([^']+)'\)`)},
	{"right_single", regexp.MustCompile(`right_single\(start_box='([^']+)'\)`)},
	{"drag", regexp.MustCompile(`drag\(start_box='([^']+)', end_box='([^']+)'\)`)},
	{"hotkey", regexp.MustCompile(`hotkey\(key='([^']+)'\)`)},
	{"type", regexp.MustCompile(`type\(content='([^']+)'\)`)},
	{"scroll", regexp.MustCompile(`scroll\(start_box='([^']+)', direction='([^']+)'\)`)},
	{"wait", regexp.MustCompile(`wait\(\)`)},
	{"finished", regexp.MustCompile(`finished\(\)`)},
	{"call_user", regexp.MustCompile(`call_user\(\)`)},
}

// stripActionComment removes a trailing "# ..." comment from actionText. A
// '#' inside a single-quoted argument is part of the argument, not a comment,
// and a '#' in the very first position leaves the text untouched.
func stripActionComment(actionText string) string {
	inQuote := false
	for i := 0; i < len(actionText); i++ {
		switch actionText[i] {
		case '\\':
			i++ // skip the escaped character so \' does not toggle quoting
		case '\'':
			inQuote = !inQuote
		case '#':
			if inQuote {
				continue
			}
			if i == 0 {
				return actionText
			}
			return strings.TrimSpace(actionText[:i])
		}
	}
	return actionText
}

// parseActionText parses the action text to extract the action type and
// parameters.
//
// A trailing "# ..." comment is dropped first. The text is then matched
// against every supported action syntax, and the match that starts earliest
// in the text wins, so an action name that merely appears inside another
// action's argument is never selected. Coordinate arguments are passed
// through normalizeCoordinates. The result is a single-element slice whose
// action carries the given thought; an error is returned when no syntax
// matches or a coordinate cannot be normalized.
func (p *ActionParser) parseActionText(actionText, thought string) ([]ParsedAction, error) {
	// remove trailing comments
	actionText = stripActionComment(actionText)

	// select the leftmost matching action
	var matched *uiTarsActionPattern
	matchStart := -1
	for i := range uiTarsActionPatterns {
		loc := uiTarsActionPatterns[i].regex.FindStringIndex(actionText)
		if loc == nil {
			continue
		}
		if matched == nil || loc[0] < matchStart {
			matched = &uiTarsActionPatterns[i]
			matchStart = loc[0]
		}
	}
	if matched == nil {
		return nil, fmt.Errorf("unknown action format: %s", actionText)
	}

	// A successful match always yields one entry per capture group, so the
	// indexes used below are in range for the matched pattern.
	matches := matched.regex.FindStringSubmatch(actionText)

	action := ParsedAction{
		ActionType:   matched.actionType,
		ActionInputs: make(map[string]interface{}),
		Thought:      thought,
	}

	// parse parameters based on action type
	switch matched.actionType {
	case "click", "left_double", "right_single":
		coord, err := p.normalizeCoordinates(matches[1])
		if err != nil {
			return nil, errors.Wrapf(err, "normalize point failed: %s", matches[1])
		}
		action.ActionInputs["startBox"] = coord
	case "drag":
		// handle start point
		startBox, err := p.normalizeCoordinates(matches[1])
		if err != nil {
			return nil, errors.Wrapf(err, "normalize startBox failed: %s", matches[1])
		}
		action.ActionInputs["startBox"] = startBox
		// handle end point
		endBox, err := p.normalizeCoordinates(matches[2])
		if err != nil {
			return nil, errors.Wrapf(err, "normalize endBox failed: %s", matches[2])
		}
		action.ActionInputs["endBox"] = endBox
	case "hotkey":
		action.ActionInputs["key"] = matches[1]
	case "type":
		action.ActionInputs["content"] = matches[1]
	case "scroll":
		startBox, err := p.normalizeCoordinates(matches[1])
		if err != nil {
			return nil, errors.Wrapf(err, "normalize startBox failed: %s", matches[1])
		}
		action.ActionInputs["startBox"] = startBox
		action.ActionInputs["direction"] = matches[2]
	case "wait", "finished", "call_user":
		// these actions take no parameters
	}

	return []ParsedAction{action}, nil
}
// normalizeAction normalizes the coordinate inputs of action in place.
//
// It covers every action type that carries a box — click, left_double,
// right_single, drag and scroll — matching what parseActionText does for the
// Thought/Action format. For those types, a "startBox" or "endBox" input that
// is a string is replaced by the output of normalizeCoordinates; inputs that
// are absent or not strings are left untouched. Other action types are
// returned unchanged.
func (p *ActionParser) normalizeAction(action *ParsedAction) error {
	switch action.ActionType {
	case "click", "left_double", "right_single", "drag", "scroll":
		// coordinate-bearing actions, handled below
	default:
		return nil
	}

	for _, key := range []string{"startBox", "endBox"} {
		box, ok := action.ActionInputs[key].(string)
		if !ok {
			continue
		}
		normalized, err := p.normalizeCoordinates(box)
		if err != nil {
			return fmt.Errorf("failed to normalize %s: %w", key, err)
		}
		action.ActionInputs[key] = normalized
	}
	return nil
}
// normalizeCoordinates re-serializes a comma-separated coordinate list as a
// compact JSON array of numbers.
//
// Leading and trailing brackets, parentheses, spaces and tabs are ignored, so
// "[1, 2, 3, 4]" and "(1,2,3,4)" both yield "[1,2,3,4]". An error is returned
// for an empty string, a string without a comma, or content that does not
// decode as a list of numbers.
//
// NOTE(review): p.Factor is not applied here — the values are passed through
// unscaled. Confirm whether scaling is expected to happen elsewhere.
func (p *ActionParser) normalizeCoordinates(coordStr string) (string, error) {
	switch {
	case coordStr == "":
		return "", fmt.Errorf("empty coordinate string")
	case !strings.Contains(coordStr, ","):
		return "", fmt.Errorf("invalid coordinate string: %s", coordStr)
	}

	// Trim removes every leading '[' so the remainder can never start with
	// one; the list is therefore always re-wrapped before decoding.
	inner := strings.Trim(coordStr, "[]() \t")

	var values []float64
	if decodeErr := json.Unmarshal([]byte("["+inner+"]"), &values); decodeErr != nil {
		return "", fmt.Errorf("failed to parse coordinate string: %w", decodeErr)
	}

	encoded, encodeErr := json.Marshal(values)
	if encodeErr != nil {
		return "", fmt.Errorf("failed to marshal normalized coordinates: %w", encodeErr)
	}
	return string(encoded), nil
}

396
planner/planner.go Normal file
View File

@@ -0,0 +1,396 @@
package planner
import (
"bytes"
"context"
"encoding/base64"
"fmt"
"image"
"image/color"
"image/draw"
"image/png"
"os"
"strings"
"github.com/cloudwego/eino-ext/components/model/openai"
"github.com/cloudwego/eino/schema"
"github.com/pkg/errors"
"github.com/rs/zerolog/log"
)
// Sentinel errors for planner input validation.
var (
// ErrInvalidInput is returned by validateInput for a non-positive screen size
// or when no user message carries an image URL.
ErrInvalidInput = fmt.Errorf("invalid input parameters")
// ErrEmptyInstruction is returned by validateInput when the user instruction is empty.
ErrEmptyInstruction = fmt.Errorf("user instruction is empty")
// ErrNoConversationHistory is returned by validateInput when no messages are supplied.
ErrNoConversationHistory = fmt.Errorf("conversation history is empty")
// ErrInvalidImageData: presumably reported for malformed screenshot data —
// TODO confirm against its callers.
ErrInvalidImageData = fmt.Errorf("invalid image data")
)
// uiTarsPlanningPrompt is the system prompt prefix sent to the model. It
// describes the Thought/Action output format and the available actions;
// callVLMService appends the user instruction directly after it.
const uiTarsPlanningPrompt = `
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
## Output Format
Thought: ...
Action: ...
## Action Space
click(start_box='[x1, y1, x2, y2]')
left_double(start_box='[x1, y1, x2, y2]')
right_single(start_box='[x1, y1, x2, y2]')
drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
hotkey(key='')
type(content='') #If you want to submit your input, use "\n" at the end of content.
scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
wait() #Sleep for 5s and take a screenshot to check for any changes.
finished()
call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
## Note
- Use Chinese in Thought part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in Thought part.
## User Instruction
`
// NewPlanner builds a Planner backed by an OpenAI-compatible chat model.
//
// The model configuration comes from GetModelConfig. Both a configuration
// failure and a model initialization failure are returned wrapped. The given
// ctx is used to create the model and is kept on the Planner for later
// requests.
func NewPlanner(ctx context.Context) (*Planner, error) {
	modelConfig, configErr := GetModelConfig()
	if configErr != nil {
		return nil, fmt.Errorf("failed to create OpenAI config: %w", configErr)
	}

	chatModel, modelErr := openai.NewChatModel(ctx, modelConfig)
	if modelErr != nil {
		return nil, fmt.Errorf("failed to initialize OpenAI model: %w", modelErr)
	}

	planner := &Planner{ctx: ctx, model: chatModel}
	return planner, nil
}
// Planner plans UI actions by querying a chat model. Create one with NewPlanner.
type Planner struct {
// ctx is the context given to NewPlanner; callVLMService passes it to the model.
ctx context.Context
// model is the chat model created by NewPlanner.
model *openai.ChatModel
}
// Start runs one planning round: it validates opts, queries the model, and
// converts the model's answer into a PlanningResult.
//
// A failure in any of the three stages is returned wrapped with the name of
// the stage. The start of the round and the final summary and actions are
// logged.
func (p *Planner) Start(opts PlanningOptions) (*PlanningResult, error) {
	log.Info().Str("user_instruction", opts.UserInstruction).Msg("start VLM planning")

	// Stage 1: reject incomplete requests before calling the model.
	if validationErr := validateInput(opts); validationErr != nil {
		return nil, errors.Wrap(validationErr, "validate input parameters failed")
	}

	// Stage 2: ask the model for the next action.
	vlmResponse, callErr := p.callVLMService(opts)
	if callErr != nil {
		return nil, errors.Wrap(callErr, "call VLM service failed")
	}

	// Stage 3: turn the raw response into a planning result.
	planning, processErr := processVLMResponse(vlmResponse)
	if processErr != nil {
		return nil, errors.Wrap(processErr, "process VLM response failed")
	}

	log.Info().
		Interface("summary", planning.ActionSummary).
		Interface("actions", planning.Actions).
		Msg("VLM planning completed")
	return planning, nil
}
// validateInput checks that opts carries everything a planning request needs.
//
// It returns ErrEmptyInstruction when the user instruction is empty,
// ErrNoConversationHistory when there are no messages, and ErrInvalidInput
// when the screen size is not strictly positive or when no user message
// contains an "image_url" part with a non-nil ImageURL.
func validateInput(opts PlanningOptions) error {
	if opts.UserInstruction == "" {
		return ErrEmptyInstruction
	}
	if len(opts.ConversationHistory) == 0 {
		return ErrNoConversationHistory
	}
	if opts.Size.Width <= 0 || opts.Size.Height <= 0 {
		return ErrInvalidInput
	}

	// At least one user message must carry an image for the model to look at.
	for _, msg := range opts.ConversationHistory {
		// The history is a slice of pointers; skip nil entries instead of
		// dereferencing them and panicking.
		if msg == nil || msg.Role != "user" {
			continue
		}
		for _, part := range msg.MultiContent {
			if part.Type == "image_url" && part.ImageURL != nil {
				return nil
			}
		}
	}
	return ErrInvalidInput
}
// callVLMService sends the planning prompt plus the conversation history to
// the model and parses the reply text into actions.
//
// The system prompt is uiTarsPlanningPrompt with the user instruction
// appended. Request and parse failures are wrapped and returned.
func (p *Planner) callVLMService(opts PlanningOptions) (*VLMResponse, error) {
	log.Info().Msg("calling VLM service...")

	// The system message always comes first, followed by the caller's history.
	systemMsg := &schema.Message{
		Role:    schema.System,
		Content: uiTarsPlanningPrompt + opts.UserInstruction,
	}
	messages := []*schema.Message{systemMsg}
	messages = append(messages, opts.ConversationHistory...)

	reply, genErr := p.model.Generate(p.ctx, messages)
	if genErr != nil {
		return nil, fmt.Errorf("OpenAI API request failed: %w", genErr)
	}

	// Use the same factor (1000) as the TypeScript version.
	text := reply.Content
	parser := NewActionParser(text, 1000)
	parsed, parseErr := parser.Parse(text)
	if parseErr != nil {
		return nil, fmt.Errorf("failed to parse actions: %w", parseErr)
	}

	return &VLMResponse{Actions: parsed}, nil
}
// processVLMResponse validates the parsed actions in resp and converts them
// into a PlanningResult.
//
// It fails when resp reports an error or contains no actions. Each action is
// passed to the validator matching its type; unknown action types are only
// logged and processing continues.
func processVLMResponse(resp *VLMResponse) (*PlanningResult, error) {
	log.Info().Msg("processing VLM response...")

	if resp.Error != "" {
		return nil, fmt.Errorf("VLM error: %s", resp.Error)
	}
	if len(resp.Actions) == 0 {
		return nil, fmt.Errorf("no actions returned from VLM")
	}

	// Validate and post-process every action in place.
	for idx := range resp.Actions {
		current := &resp.Actions[idx]
		switch current.ActionType {
		case "click", "left_double", "right_single":
			validateCoordinateAction(current, "startBox")
		case "drag":
			validateCoordinateAction(current, "startBox")
			validateCoordinateAction(current, "endBox")
		case "scroll":
			validateCoordinateAction(current, "startBox")
			validateScrollDirection(current)
		case "type":
			validateTypeContent(current)
		case "hotkey":
			validateHotkeyAction(current)
		case "wait", "finished", "call_user":
			// These actions take no extra parameters.
		default:
			log.Printf("警告: 未知的动作类型: %s, 将尝试继续处理", current.ActionType)
		}
	}

	// Derive a short summary from the actions.
	summary := extractActionSummary(resp.Actions)

	// Mirror each typed action as a generic map for the Actions field.
	var generic []interface{}
	for idx := range resp.Actions {
		current := resp.Actions[idx]
		generic = append(generic, map[string]interface{}{
			"actionType":   current.ActionType,
			"actionInputs": current.ActionInputs,
			"thought":      current.Thought,
		})
	}

	return &PlanningResult{
		Actions:       generic,
		RealActions:   resp.Actions,
		ActionSummary: summary,
	}, nil
}
// extractActionSummary returns a one-line summary of the first action.
//
// The first action's Thought is used when it is non-empty. Otherwise a
// summary is generated from the action type and, for scroll, type and hotkey
// actions, from the relevant input. An empty action list yields "".
func extractActionSummary(actions []ParsedAction) string {
	if len(actions) == 0 {
		return ""
	}

	first := actions[0]
	// Prefer the model's own thought as the summary.
	if first.Thought != "" {
		return first.Thought
	}

	// No thought available: describe the action by its type. For the inputs
	// read below, a missing or non-string value deliberately falls back to "".
	switch first.ActionType {
	case "click":
		return "点击操作"
	case "drag":
		return "拖拽操作"
	case "left_double":
		return "双击操作"
	case "right_single":
		return "右键点击操作"
	case "scroll":
		direction, _ := first.ActionInputs["direction"].(string)
		return fmt.Sprintf("滚动操作 (%s)", direction)
	case "type":
		content, _ := first.ActionInputs["content"].(string)
		// Truncate by runes rather than bytes so a multi-byte character
		// (e.g. CJK text) is never cut in half, which would produce invalid
		// UTF-8 in the summary.
		if runes := []rune(content); len(runes) > 20 {
			content = string(runes[:20]) + "..."
		}
		return fmt.Sprintf("输入文本: %s", content)
	case "hotkey":
		key, _ := first.ActionInputs["key"].(string)
		return fmt.Sprintf("快捷键: %s", key)
	case "wait":
		return "等待操作"
	case "finished":
		return "完成操作"
	case "call_user":
		return "请求用户协助"
	default:
		return fmt.Sprintf("执行 %s 操作", first.ActionType)
	}
}
// validateCoordinateAction makes sure a coordinate-based action carries the
// input named by boxField. When the input is missing or is an empty string it
// is set to the default "[0.5, 0.5]" and a warning is logged.
func validateCoordinateAction(action *ParsedAction, boxField string) {
	if box, ok := action.ActionInputs[boxField]; ok && box != "" {
		return
	}

	// Writing to a nil map panics, so allocate one before storing the default.
	if action.ActionInputs == nil {
		action.ActionInputs = make(map[string]interface{})
	}
	action.ActionInputs[boxField] = "[0.5, 0.5]"
	log.Printf("警告: %s动作缺少%s参数, 已设置默认值", action.ActionType, boxField)
}
// validateScrollDirection makes sure a scroll action has a usable direction.
//
// A missing, non-string or empty direction is replaced with "down". A
// recognised direction ("up", "down", "left", "right", in any letter case) is
// stored in lower case so later consumers see one canonical spelling. Any
// other value is replaced with "down". Replacements are logged as warnings.
func validateScrollDirection(action *ParsedAction) {
	// Writing to a nil map panics, so allocate one before storing anything.
	if action.ActionInputs == nil {
		action.ActionInputs = make(map[string]interface{})
	}

	direction, ok := action.ActionInputs["direction"].(string)
	if !ok || direction == "" {
		action.ActionInputs["direction"] = "down"
		log.Printf("警告: scroll动作缺少direction参数, 已设置默认值")
		return
	}

	switch normalized := strings.ToLower(direction); normalized {
	case "up", "down", "left", "right":
		action.ActionInputs["direction"] = normalized
	default:
		action.ActionInputs["direction"] = "down"
		log.Printf("警告: 非标准滚动方向: %s, 已设置为down", direction)
	}
}
// validateTypeContent makes sure a type action has a "content" input. When
// the input is missing or is an empty string it is set to "" and a warning is
// logged; any other value is left untouched.
func validateTypeContent(action *ParsedAction) {
	if content, ok := action.ActionInputs["content"]; ok && content != "" {
		return
	}

	// Writing to a nil map panics, so allocate one before storing the default.
	if action.ActionInputs == nil {
		action.ActionInputs = make(map[string]interface{})
	}
	action.ActionInputs["content"] = ""
	log.Printf("警告: type动作缺少content参数, 已设置为空字符串")
}
// validateHotkeyAction makes sure a hotkey action has a "key" input. When the
// input is missing or is an empty string it is set to the default "Enter" and
// a warning is logged; any other value is left untouched.
func validateHotkeyAction(action *ParsedAction) {
	if key, ok := action.ActionInputs["key"]; ok && key != "" {
		return
	}

	// Writing to a nil map panics, so allocate one before storing the default.
	if action.ActionInputs == nil {
		action.ActionInputs = make(map[string]interface{})
	}
	action.ActionInputs["key"] = "Enter"
	log.Printf("警告: hotkey动作缺少key参数, 已设置默认值")
}
// SavePositionImg decodes a base64 image, draws a filled red circle of radius
// 10 pixels centred on (Rect.X, Rect.Y), and writes the result as a PNG file.
//
// InputImgBase64 may be raw base64 or a "data:image/...;base64,<data>" URL.
// Rect.X and Rect.Y are truncated to integer pixel coordinates; parts of the
// marker that fall outside the image are skipped. OutputPath is created or
// truncated.
//
// An error is returned when the input cannot be base64- or image-decoded, or
// when the output file cannot be created, written or closed.
func SavePositionImg(params struct {
	InputImgBase64 string
	Rect           struct {
		X float64
		Y float64
	}
	OutputPath string
}) (err error) {
	// Drop a data-URL header ("data:image/png;base64,") when present.
	encoded := params.InputImgBase64
	if strings.HasPrefix(encoded, "data:image/") {
		if comma := strings.IndexByte(encoded, ','); comma >= 0 {
			encoded = encoded[comma+1:]
		}
	}

	raw, err := base64.StdEncoding.DecodeString(encoded)
	if err != nil {
		return fmt.Errorf("无法解码Base64图像: %w", err)
	}

	img, _, err := image.Decode(bytes.NewReader(raw))
	if err != nil {
		return fmt.Errorf("无法解码图像数据: %w", err)
	}

	// Copy into an RGBA canvas that can be drawn on.
	bounds := img.Bounds()
	canvas := image.NewRGBA(bounds)
	draw.Draw(canvas, bounds, img, bounds.Min, draw.Src)

	// Paint the marker: every pixel within markRadius of the centre.
	const markRadius = 10
	marker := color.RGBA{R: 255, G: 0, B: 0, A: 255}
	cx, cy := int(params.Rect.X), int(params.Rect.Y)
	for dx := -markRadius; dx <= markRadius; dx++ {
		for dy := -markRadius; dy <= markRadius; dy++ {
			if dx*dx+dy*dy > markRadius*markRadius {
				continue
			}
			// Point.In honours both bounds.Min and bounds.Max.
			if pt := image.Pt(cx+dx, cy+dy); pt.In(bounds) {
				canvas.Set(pt.X, pt.Y, marker)
			}
		}
	}

	outFile, err := os.Create(params.OutputPath)
	if err != nil {
		return fmt.Errorf("无法创建输出文件: %w", err)
	}
	// A failed Close on a written file can mean data never reached disk, so
	// surface it unless an earlier error is already being returned.
	defer func() {
		if closeErr := outFile.Close(); closeErr != nil && err == nil {
			err = fmt.Errorf("无法关闭输出文件: %w", closeErr)
		}
	}()

	if err = png.Encode(outFile, canvas); err != nil {
		return fmt.Errorf("无法编码和保存图像: %w", err)
	}
	return nil
}
// loadImage reads the file at imagePath and returns its bytes as a base64
// data URL. The URL is always labelled "image/png"; the file's actual format
// is not inspected. A read failure is returned unchanged with an empty string.
func loadImage(imagePath string) (base64Str string, err error) {
	raw, readErr := os.ReadFile(imagePath)
	if readErr != nil {
		return "", readErr
	}
	encoded := base64.StdEncoding.EncodeToString(raw)
	return "data:image/png;base64," + encoded, nil
}

263
planner/planner_test.go Normal file
View File

@@ -0,0 +1,263 @@
package planner
import (
"context"
"encoding/json"
"testing"
"github.com/cloudwego/eino/schema"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestVLMPlanning runs a full planning round against the configured model
// using a game screenshot and checks the shape of the first returned action.
// It needs testdata/.env and testdata/llk_1.png to be present.
func TestVLMPlanning(t *testing.T) {
	err := loadEnv("testdata/.env")
	require.NoError(t, err)

	// imageBase64, err := loadImage("testdata/popup_risk_warning.png")
	imageBase64, err := loadImage("testdata/llk_1.png")
	require.NoError(t, err)

	userInstruction := `连连看是一款经典的益智消除类小游戏,通常以图案或图标为主要元素。以下是连连看的基本规则说明:
1. 游戏目标: 玩家需要在规定时间内,通过连接相同的图案或图标,将它们从游戏界面中消除。
2. 连接规则:
- 两个相同的图案可以通过不超过三条直线连接。
- 连接线可以水平或垂直,但不能穿过其他图案。
- 连接线的转折次数不能超过两次。
3. 游戏界面: 游戏界面通常是一个矩形区域,内含多个图案或图标,排列成行和列。
4. 时间限制: 游戏通常设有时间限制,玩家需要在时间耗尽前完成所有图案的消除。
5. 得分机制: 每成功连接并消除一对图案,玩家会获得相应的分数。完成游戏后,根据剩余时间和消除效率计算总分。
6. 关卡设计: 游戏可能包含多个关卡,随着关卡的推进,图案的复杂度和数量会增加。`
	userInstruction += "\n\n请基于以上游戏规则给出下一步可点击的两个图标坐标"

	planner, err := NewPlanner(context.Background())
	require.NoError(t, err)

	opts := PlanningOptions{
		UserInstruction: userInstruction,
		ConversationHistory: []*schema.Message{
			{
				Role: schema.User,
				MultiContent: []schema.ChatMessagePart{
					{
						Type: "image_url",
						ImageURL: &schema.ChatMessageImageURL{
							URL: imageBase64,
						},
					},
				},
			},
		},
		Size: Size{
			Width:  1920,
			Height: 1080,
		},
	}

	// Run the planning round.
	result, err := planner.Start(opts)

	// Check the overall result.
	require.NoError(t, err)
	require.NotNil(t, result)
	require.NotEmpty(t, result.RealActions)

	// Check the first action.
	action := result.RealActions[0]
	assert.NotEmpty(t, action.ActionType)
	assert.NotEmpty(t, action.Thought)

	// Check the inputs required by the action type.
	switch action.ActionType {
	case "click", "drag", "left_double", "right_single", "scroll":
		// These actions need coordinates.
		assert.NotEmpty(t, action.ActionInputs["startBox"])

		// The box must be a string holding a JSON array of numbers. Check the
		// type explicitly so a mismatch fails the test instead of panicking.
		startBox, ok := action.ActionInputs["startBox"].(string)
		require.True(t, ok, "startBox should be a string, got %T", action.ActionInputs["startBox"])

		var coords []float64
		err = json.Unmarshal([]byte(startBox), &coords)
		require.NoError(t, err)
		require.True(t, len(coords) >= 2) // at least x and y

		// Check the coordinate range.
		for _, coord := range coords {
			assert.GreaterOrEqual(t, coord, float64(0))
			assert.LessOrEqual(t, coord, float64(1920)) // maximum screen width
		}
	case "type":
		// Typed text must be present.
		assert.NotEmpty(t, action.ActionInputs["content"])
	case "hotkey":
		// The key must be present.
		assert.NotEmpty(t, action.ActionInputs["key"])
	case "wait", "finished", "call_user":
		// These actions take no extra parameters.
	default:
		t.Fatalf("未知的动作类型: %s", action.ActionType)
	}
}
// TestValidateInput checks that validateInput accepts a well-formed request
// and returns the expected sentinel error for each kind of malformed one.
func TestValidateInput(t *testing.T) {
	imageBase64, err := loadImage("testdata/popup_risk_warning.png")
	require.NoError(t, err)

	tests := []struct {
		name    string
		opts    PlanningOptions
		wantErr error
	}{
		{
			name: "valid input",
			opts: PlanningOptions{
				UserInstruction: "点击继续使用按钮",
				ConversationHistory: []*schema.Message{
					{
						Role: schema.User,
						MultiContent: []schema.ChatMessagePart{
							{
								Type: "image_url",
								ImageURL: &schema.ChatMessageImageURL{
									URL: imageBase64,
								},
							},
						},
					},
				},
				Size: Size{Width: 100, Height: 100},
			},
			wantErr: nil,
		},
		{
			name: "empty instruction",
			opts: PlanningOptions{
				UserInstruction: "",
				ConversationHistory: []*schema.Message{
					{
						Role:    schema.User,
						Content: "",
					},
				},
				Size: Size{Width: 100, Height: 100},
			},
			wantErr: ErrEmptyInstruction,
		},
		{
			name: "empty conversation history",
			opts: PlanningOptions{
				UserInstruction:     "点击立即卸载按钮",
				ConversationHistory: []*schema.Message{},
				Size:                Size{Width: 100, Height: 100},
			},
			wantErr: ErrNoConversationHistory,
		},
		{
			name: "invalid size",
			opts: PlanningOptions{
				UserInstruction: "勾选不再提示选项",
				ConversationHistory: []*schema.Message{
					{
						Role:    schema.User,
						Content: "",
					},
				},
				Size: Size{Width: 0, Height: 0},
			},
			wantErr: ErrInvalidInput,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			err := validateInput(tt.opts)
			if tt.wantErr == nil {
				assert.NoError(t, err)
				return
			}
			// Compare against the specific sentinel so a case that fails for
			// the wrong reason is caught.
			assert.ErrorIs(t, err, tt.wantErr)
		})
	}
}
// TestProcessVLMResponse checks that processVLMResponse passes through a
// valid response and rejects responses that report an error or carry no
// actions.
func TestProcessVLMResponse(t *testing.T) {
	type testCase struct {
		name    string
		resp    *VLMResponse
		wantErr bool
	}

	validResp := &VLMResponse{
		Actions: []ParsedAction{
			{
				ActionType: "click",
				ActionInputs: map[string]interface{}{
					"startBox": "[0.5, 0.5]",
				},
			},
		},
	}

	cases := []testCase{
		{name: "valid response", resp: validResp, wantErr: false},
		{name: "error response", resp: &VLMResponse{Error: "test error"}, wantErr: true},
		{name: "empty actions", resp: &VLMResponse{}, wantErr: true},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, err := processVLMResponse(tc.resp)
			if !tc.wantErr {
				assert.NoError(t, err)
				assert.NotNil(t, got)
				assert.Equal(t, tc.resp.Actions, got.RealActions)
				return
			}
			assert.Error(t, err)
			assert.Nil(t, got)
		})
	}
}
// TestSavePositionImg checks that SavePositionImg accepts a data-URL image
// and writes the marked-up output file.
func TestSavePositionImg(t *testing.T) {
	imageBase64, err := loadImage("testdata/popup_risk_warning.png")
	require.NoError(t, err)

	tempFile := t.TempDir() + "/test.png"
	params := struct {
		InputImgBase64 string
		Rect           struct {
			X float64
			Y float64
		}
		OutputPath string
	}{
		InputImgBase64: imageBase64,
		Rect: struct {
			X float64
			Y float64
		}{
			X: 100,
			Y: 100,
		},
		OutputPath: tempFile,
	}

	err = SavePositionImg(params)
	require.NoError(t, err)

	// The call must actually produce the output file, not just return nil.
	assert.FileExists(t, tempFile)
}

BIN
planner/testdata/llk_1.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 437 KiB

BIN
planner/testdata/llk_2.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 407 KiB

BIN
planner/testdata/popup_risk_warning.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.3 MiB

38
planner/types.go Normal file
View File

@@ -0,0 +1,38 @@
package planner
import (
"github.com/cloudwego/eino/schema"
)
// PlanningOptions represents the input options for planning.
type PlanningOptions struct {
	// UserInstruction is the task description; it is appended to the planning
	// prompt and must not be empty.
	UserInstruction string `json:"user_instruction"`
	// ConversationHistory holds the messages sent after the system prompt.
	// Validation requires at least one user message with an image_url part.
	ConversationHistory []*schema.Message `json:"conversation_history"`
	// Size is the screen size; both dimensions must be positive.
	Size Size `json:"size"`
}
// Size represents the dimensions of a screen.
// NOTE(review): the unit is presumably pixels — confirm against callers.
type Size struct {
	Width  int `json:"width"`
	Height int `json:"height"`
}
// PlanningResult represents the result of planning.
type PlanningResult struct {
	// Actions mirrors RealActions as generic values; the planner fills it with
	// maps keyed by "actionType", "actionInputs" and "thought".
	Actions []interface{} `json:"actions"`
	// RealActions holds the typed actions parsed from the model reply.
	RealActions []ParsedAction `json:"real_actions"`
	// ActionSummary is a short description derived from the first action.
	ActionSummary string `json:"action_summary"`
}
// VLMResponse represents the response from the Vision Language Model.
type VLMResponse struct {
	// Actions are the actions parsed from the model reply.
	Actions []ParsedAction `json:"actions"`
	// Error, when non-empty, marks the response as failed; response
	// processing rejects such a response.
	Error string `json:"error,omitempty"`
}
// ParsedAction represents a parsed action from the VLM response.
type ParsedAction struct {
	// ActionType names the action, e.g. "click", "drag", "scroll", "type",
	// "hotkey", "wait", "finished" or "call_user".
	ActionType string `json:"actionType"`
	// ActionInputs holds the action's arguments. The planner reads the keys
	// "startBox", "endBox", "direction", "content" and "key".
	ActionInputs map[string]interface{} `json:"actionInputs"`
	// Thought is the model's reasoning text for this action; it may be empty.
	Thought string `json:"thought"`
}