mirror of
https://github.com/httprunner/httprunner.git
synced 2026-05-07 08:02:42 +08:00
refactor: merge planner
This commit is contained in:
@@ -1 +1 @@
|
||||
v5.0.0-beta-2504292314
|
||||
v5.0.0-beta-2504301407
|
||||
|
||||
@@ -3,92 +3,36 @@ package ai
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/cloudwego/eino-ext/components/model/openai"
|
||||
"github.com/httprunner/httprunner/v5/code"
|
||||
"github.com/httprunner/httprunner/v5/internal/config"
|
||||
"github.com/httprunner/httprunner/v5/uixt/option"
|
||||
"github.com/pkg/errors"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
func NewAIService(opts ...AIServiceOption) *AIServices {
|
||||
services := &AIServices{}
|
||||
for _, option := range opts {
|
||||
option(services)
|
||||
}
|
||||
return services
|
||||
}
|
||||
|
||||
type AIServices struct {
|
||||
ICVService
|
||||
ILLMService
|
||||
}
|
||||
|
||||
type AIServiceOption func(*AIServices)
|
||||
|
||||
type CVServiceType string
|
||||
|
||||
const (
|
||||
CVServiceTypeVEDEM CVServiceType = "vedem"
|
||||
CVServiceTypeOpenCV CVServiceType = "opencv"
|
||||
)
|
||||
|
||||
func WithCVService(service CVServiceType) AIServiceOption {
|
||||
return func(opts *AIServices) {
|
||||
if service == CVServiceTypeVEDEM {
|
||||
var err error
|
||||
opts.ICVService, err = NewVEDEMImageService()
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msg("init vedem image service failed")
|
||||
os.Exit(code.GetErrorCode(err))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type LLMServiceType string
|
||||
|
||||
const (
|
||||
LLMServiceTypeUITARS LLMServiceType = "ui-tars"
|
||||
LLMServiceTypeGPT LLMServiceType = "gpt"
|
||||
LLMServiceTypeQwenVL LLMServiceType = "qwen-vl"
|
||||
)
|
||||
|
||||
// ILLMService 定义了 LLM 服务接口,包括规划和断言功能
|
||||
type ILLMService interface {
|
||||
Call(opts *PlanningOptions) (*PlanningResult, error)
|
||||
Assert(opts *AssertOptions) (*AssertionResponse, error)
|
||||
}
|
||||
|
||||
func WithLLMService(modelType LLMServiceType) AIServiceOption {
|
||||
return func(opts *AIServices) {
|
||||
// init planner
|
||||
var planner IPlanner
|
||||
var err error
|
||||
switch modelType {
|
||||
case LLMServiceTypeGPT:
|
||||
// TODO: implement gpt-4o planner and asserter
|
||||
planner, err = NewPlanner(context.Background())
|
||||
case LLMServiceTypeUITARS:
|
||||
planner, err = NewUITarsPlanner(context.Background())
|
||||
}
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msgf("init %s planner failed", modelType)
|
||||
os.Exit(code.GetErrorCode(err))
|
||||
}
|
||||
|
||||
// init asserter
|
||||
asserter, err := NewAsserter(context.Background())
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msg("init asserter failed")
|
||||
os.Exit(code.GetErrorCode(err))
|
||||
}
|
||||
|
||||
opts.ILLMService = &combinedLLMService{
|
||||
planner: planner,
|
||||
asserter: asserter,
|
||||
}
|
||||
func NewLLMService(modelType option.LLMServiceType) (ILLMService, error) {
|
||||
planner, err := NewPlanner(context.Background(), modelType)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
asserter, err := NewAsserter(context.Background())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &combinedLLMService{
|
||||
planner: planner,
|
||||
asserter: asserter,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// combinedLLMService 实现了 ILLMService 接口,组合了规划和断言功能
|
||||
@@ -116,6 +60,10 @@ const (
|
||||
|
||||
var EnvModelUse string
|
||||
|
||||
const (
|
||||
defaultTimeout = 30 * time.Second
|
||||
)
|
||||
|
||||
// GetOpenAIModelConfig get OpenAI config
|
||||
func GetOpenAIModelConfig() (*openai.ChatModelConfig, error) {
|
||||
if err := config.LoadEnv(); err != nil {
|
||||
@@ -157,3 +105,12 @@ func GetOpenAIModelConfig() (*openai.ChatModelConfig, error) {
|
||||
|
||||
return modelConfig, nil
|
||||
}
|
||||
|
||||
// maskAPIKey masks the API key
|
||||
func maskAPIKey(key string) string {
|
||||
if len(key) <= 8 {
|
||||
return "******"
|
||||
}
|
||||
|
||||
return key[:4] + "******" + key[len(key)-4:]
|
||||
}
|
||||
|
||||
@@ -1,21 +1,18 @@
|
||||
package ai
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/base64"
|
||||
"context"
|
||||
"fmt"
|
||||
"image"
|
||||
"image/color"
|
||||
"image/draw"
|
||||
"image/png"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/cloudwego/eino-ext/components/model/openai"
|
||||
"github.com/cloudwego/eino/components/model"
|
||||
"github.com/cloudwego/eino/schema"
|
||||
"github.com/httprunner/httprunner/v5/code"
|
||||
"github.com/httprunner/httprunner/v5/uixt/option"
|
||||
"github.com/httprunner/httprunner/v5/uixt/types"
|
||||
"github.com/pkg/errors"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
type IPlanner interface {
|
||||
@@ -36,30 +33,115 @@ type PlanningResult struct {
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
// ParsedAction represents a parsed action from the VLM response
|
||||
type ParsedAction struct {
|
||||
ActionType ActionType `json:"actionType"`
|
||||
ActionInputs map[string]interface{} `json:"actionInputs"`
|
||||
Thought string `json:"thought"`
|
||||
func NewPlanner(ctx context.Context, modelType option.LLMServiceType) (*Planner, error) {
|
||||
planner := &Planner{
|
||||
ctx: ctx,
|
||||
modelType: modelType,
|
||||
}
|
||||
|
||||
config, err := GetOpenAIModelConfig()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create OpenAI config: %w", err)
|
||||
}
|
||||
|
||||
if modelType == option.LLMServiceTypeUITARS {
|
||||
planner.systemPrompt = uiTarsPlanningPrompt
|
||||
} else {
|
||||
planner.systemPrompt = defaultPlanningResponseJsonFormat
|
||||
}
|
||||
|
||||
planner.model, err = openai.NewChatModel(ctx, config)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to initialize OpenAI model: %w", err)
|
||||
}
|
||||
|
||||
return planner, nil
|
||||
}
|
||||
|
||||
type ActionType string
|
||||
type Planner struct {
|
||||
ctx context.Context
|
||||
model model.ToolCallingChatModel
|
||||
systemPrompt string
|
||||
modelType option.LLMServiceType
|
||||
history ConversationHistory
|
||||
}
|
||||
|
||||
const (
|
||||
ActionTypeClick ActionType = "click"
|
||||
ActionTypeTap ActionType = "tap"
|
||||
ActionTypeDrag ActionType = "drag"
|
||||
ActionTypeSwipe ActionType = "swipe"
|
||||
ActionTypeWait ActionType = "wait"
|
||||
ActionTypeFinished ActionType = "finished"
|
||||
ActionTypeCallUser ActionType = "call_user"
|
||||
ActionTypeType ActionType = "type"
|
||||
ActionTypeScroll ActionType = "scroll"
|
||||
)
|
||||
// Call performs UI planning using Vision Language Model
|
||||
func (p *Planner) Call(opts *PlanningOptions) (*PlanningResult, error) {
|
||||
// validate input parameters
|
||||
if err := validatePlanningInput(opts); err != nil {
|
||||
return nil, errors.Wrap(err, "validate planning parameters failed")
|
||||
}
|
||||
|
||||
const (
|
||||
defaultTimeout = 30 * time.Second
|
||||
)
|
||||
// prepare prompt
|
||||
if len(p.history) == 0 {
|
||||
// add system message
|
||||
systemPrompt := uiTarsPlanningPrompt + opts.UserInstruction
|
||||
p.history = ConversationHistory{
|
||||
{
|
||||
Role: schema.System,
|
||||
Content: systemPrompt,
|
||||
},
|
||||
}
|
||||
}
|
||||
// append user image message
|
||||
p.history.Append(opts.Message)
|
||||
|
||||
// call model service, generate response
|
||||
logRequest(p.history)
|
||||
startTime := time.Now()
|
||||
resp, err := p.model.Generate(p.ctx, p.history)
|
||||
log.Info().Float64("elapsed(s)", time.Since(startTime).Seconds()).
|
||||
Str("model", string(p.modelType)).Msg("call model service")
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(code.LLMRequestServiceError, err.Error())
|
||||
}
|
||||
logResponse(resp)
|
||||
|
||||
// parse result
|
||||
result, err := p.parseResult(resp, opts.Size)
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(code.LLMParsePlanningResponseError, err.Error())
|
||||
}
|
||||
|
||||
// append assistant message
|
||||
p.history.Append(&schema.Message{
|
||||
Role: schema.Assistant,
|
||||
Content: result.ActionSummary,
|
||||
})
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (p *Planner) parseResult(msg *schema.Message, size types.Size) (*PlanningResult, error) {
|
||||
var parseActions []ParsedAction
|
||||
var err error
|
||||
if p.modelType == option.LLMServiceTypeUITARS {
|
||||
// parse Thought/Action format from UI-TARS
|
||||
parseActions, err = parseThoughtAction(msg.Content)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
// parse JSON format, from VLM like openai/gpt-4o
|
||||
parseActions, err = parseJSON(msg.Content)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
// process response
|
||||
result, err := processVLMResponse(parseActions, size)
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "process VLM response failed")
|
||||
}
|
||||
|
||||
log.Info().
|
||||
Interface("summary", result.ActionSummary).
|
||||
Interface("actions", result.NextActions).
|
||||
Msg("get VLM planning result")
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func validatePlanningInput(opts *PlanningOptions) error {
|
||||
if opts.UserInstruction == "" {
|
||||
@@ -83,79 +165,3 @@ func validatePlanningInput(opts *PlanningOptions) error {
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// SavePositionImg saves an image with position markers
|
||||
func SavePositionImg(params struct {
|
||||
InputImgBase64 string
|
||||
Rect struct {
|
||||
X float64
|
||||
Y float64
|
||||
}
|
||||
OutputPath string
|
||||
}) error {
|
||||
// 解码Base64图像
|
||||
imgData := params.InputImgBase64
|
||||
// 如果包含了数据URL前缀,去掉它
|
||||
if strings.HasPrefix(imgData, "data:image/") {
|
||||
parts := strings.Split(imgData, ",")
|
||||
if len(parts) > 1 {
|
||||
imgData = parts[1]
|
||||
}
|
||||
}
|
||||
|
||||
// 解码Base64
|
||||
unbased, err := base64.StdEncoding.DecodeString(imgData)
|
||||
if err != nil {
|
||||
return fmt.Errorf("无法解码Base64图像: %w", err)
|
||||
}
|
||||
|
||||
// 解码图像
|
||||
reader := bytes.NewReader(unbased)
|
||||
img, _, err := image.Decode(reader)
|
||||
if err != nil {
|
||||
return fmt.Errorf("无法解码图像数据: %w", err)
|
||||
}
|
||||
|
||||
// 创建一个可以在其上绘制的图像
|
||||
bounds := img.Bounds()
|
||||
rgba := image.NewRGBA(bounds)
|
||||
draw.Draw(rgba, bounds, img, bounds.Min, draw.Src)
|
||||
|
||||
// 在点击/拖动位置绘制标记
|
||||
markRadius := 10
|
||||
x, y := int(params.Rect.X), int(params.Rect.Y)
|
||||
|
||||
// 绘制红色圆圈
|
||||
for i := -markRadius; i <= markRadius; i++ {
|
||||
for j := -markRadius; j <= markRadius; j++ {
|
||||
if i*i+j*j <= markRadius*markRadius {
|
||||
if x+i >= 0 && x+i < bounds.Max.X && y+j >= 0 && y+j < bounds.Max.Y {
|
||||
rgba.Set(x+i, y+j, color.RGBA{255, 0, 0, 255})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 保存图像
|
||||
outFile, err := os.Create(params.OutputPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("无法创建输出文件: %w", err)
|
||||
}
|
||||
defer outFile.Close()
|
||||
|
||||
// 编码为PNG并保存
|
||||
if err := png.Encode(outFile, rgba); err != nil {
|
||||
return fmt.Errorf("无法编码和保存图像: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// maskAPIKey masks the API key
|
||||
func maskAPIKey(key string) string {
|
||||
if len(key) <= 8 {
|
||||
return "******"
|
||||
}
|
||||
|
||||
return key[:4] + "******" + key[len(key)-4:]
|
||||
}
|
||||
|
||||
@@ -1,172 +0,0 @@
|
||||
package ai
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
_ "image/jpeg"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/cloudwego/eino-ext/components/model/openai"
|
||||
"github.com/cloudwego/eino/components/model"
|
||||
"github.com/cloudwego/eino/schema"
|
||||
"github.com/pkg/errors"
|
||||
"github.com/rs/zerolog/log"
|
||||
|
||||
"github.com/httprunner/httprunner/v5/code"
|
||||
"github.com/httprunner/httprunner/v5/internal/json"
|
||||
"github.com/httprunner/httprunner/v5/uixt/types"
|
||||
)
|
||||
|
||||
func NewPlanner(ctx context.Context) (*Planner, error) {
|
||||
config, err := GetOpenAIModelConfig()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create OpenAI config: %w", err)
|
||||
}
|
||||
model, err := openai.NewChatModel(ctx, config)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to initialize OpenAI model: %w", err)
|
||||
}
|
||||
return &Planner{
|
||||
ctx: ctx,
|
||||
model: model,
|
||||
modelType: LLMServiceTypeGPT,
|
||||
systemPrompt: uiTarsPlanningPrompt, // TODO: change prompt with function calling
|
||||
}, nil
|
||||
}
|
||||
|
||||
type Planner struct {
|
||||
ctx context.Context
|
||||
model model.ToolCallingChatModel
|
||||
systemPrompt string
|
||||
modelType LLMServiceType
|
||||
history ConversationHistory
|
||||
}
|
||||
|
||||
// Call performs UI planning using Vision Language Model
|
||||
func (p *Planner) Call(opts *PlanningOptions) (*PlanningResult, error) {
|
||||
// validate input parameters
|
||||
if err := validatePlanningInput(opts); err != nil {
|
||||
return nil, errors.Wrap(err, "validate planning parameters failed")
|
||||
}
|
||||
|
||||
// prepare prompt
|
||||
if len(p.history) == 0 {
|
||||
// add system message
|
||||
systemPrompt := uiTarsPlanningPrompt + opts.UserInstruction
|
||||
p.history = ConversationHistory{
|
||||
{
|
||||
Role: schema.System,
|
||||
Content: systemPrompt,
|
||||
},
|
||||
}
|
||||
}
|
||||
// append user image message
|
||||
p.history.Append(opts.Message)
|
||||
|
||||
// call model service, generate response
|
||||
logRequest(p.history)
|
||||
startTime := time.Now()
|
||||
resp, err := p.model.Generate(p.ctx, p.history)
|
||||
log.Info().Float64("elapsed(s)", time.Since(startTime).Seconds()).
|
||||
Str("model", string(p.modelType)).Msg("call model service")
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(code.LLMRequestServiceError, err.Error())
|
||||
}
|
||||
logResponse(resp)
|
||||
|
||||
// parse result
|
||||
result, err := p.parseResult(resp, opts.Size)
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(code.LLMParsePlanningResponseError, err.Error())
|
||||
}
|
||||
|
||||
// append assistant message
|
||||
p.history.Append(&schema.Message{
|
||||
Role: schema.Assistant,
|
||||
Content: result.ActionSummary,
|
||||
})
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (p *Planner) parseResult(msg *schema.Message, size types.Size) (*PlanningResult, error) {
|
||||
// parse JSON format, from VLM like openai/gpt-4o
|
||||
parseActions, jsonErr := parseJSON(msg.Content)
|
||||
if jsonErr != nil {
|
||||
return nil, jsonErr
|
||||
}
|
||||
|
||||
// process response
|
||||
result, err := processVLMResponse(parseActions, size)
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "process VLM response failed")
|
||||
}
|
||||
|
||||
log.Info().
|
||||
Interface("summary", result.ActionSummary).
|
||||
Interface("actions", result.NextActions).
|
||||
Msg("get VLM planning result")
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// parseJSON tries to parse the response as JSON format
|
||||
func parseJSON(predictionText string) ([]ParsedAction, error) {
|
||||
predictionText = strings.TrimSpace(predictionText)
|
||||
if strings.HasPrefix(predictionText, "```json") && strings.HasSuffix(predictionText, "```") {
|
||||
predictionText = strings.TrimPrefix(predictionText, "```json")
|
||||
predictionText = strings.TrimSuffix(predictionText, "```")
|
||||
}
|
||||
predictionText = strings.TrimSpace(predictionText)
|
||||
|
||||
var response PlanningResult
|
||||
if err := json.Unmarshal([]byte(predictionText), &response); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse VLM response: %v", err)
|
||||
}
|
||||
|
||||
if response.Error != "" {
|
||||
return nil, errors.New(response.Error)
|
||||
}
|
||||
|
||||
if len(response.NextActions) == 0 {
|
||||
return nil, errors.New("no actions returned from VLM")
|
||||
}
|
||||
|
||||
// normalize actions
|
||||
var normalizedActions []ParsedAction
|
||||
for i := range response.NextActions {
|
||||
// create a new variable, avoid implicit memory aliasing in for loop.
|
||||
action := response.NextActions[i]
|
||||
if err := normalizeAction(&action); err != nil {
|
||||
return nil, errors.Wrap(err, "failed to normalize action")
|
||||
}
|
||||
normalizedActions = append(normalizedActions, action)
|
||||
}
|
||||
|
||||
return normalizedActions, nil
|
||||
}
|
||||
|
||||
// normalizeAction normalizes the coordinates in the action
|
||||
func normalizeAction(action *ParsedAction) error {
|
||||
switch action.ActionType {
|
||||
case "click", "drag":
|
||||
// handle click and drag action coordinates
|
||||
if startBox, ok := action.ActionInputs["startBox"].(string); ok {
|
||||
normalized, err := normalizeCoordinates(startBox)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to normalize startBox: %w", err)
|
||||
}
|
||||
action.ActionInputs["startBox"] = normalized
|
||||
}
|
||||
|
||||
if endBox, ok := action.ActionInputs["endBox"].(string); ok {
|
||||
normalized, err := normalizeCoordinates(endBox)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to normalize endBox: %w", err)
|
||||
}
|
||||
action.ActionInputs["endBox"] = normalized
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
@@ -1,116 +1,38 @@
|
||||
package ai
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/cloudwego/eino-ext/components/model/openai"
|
||||
"github.com/cloudwego/eino/components/model"
|
||||
"github.com/cloudwego/eino/schema"
|
||||
"github.com/httprunner/httprunner/v5/code"
|
||||
"github.com/httprunner/httprunner/v5/internal/json"
|
||||
"github.com/httprunner/httprunner/v5/uixt/types"
|
||||
"github.com/pkg/errors"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
func NewUITarsPlanner(ctx context.Context) (*UITarsPlanner, error) {
|
||||
config, err := GetOpenAIModelConfig()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
chatModel, err := openai.NewChatModel(ctx, config)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &UITarsPlanner{
|
||||
ctx: ctx,
|
||||
model: chatModel,
|
||||
modelType: LLMServiceTypeUITARS,
|
||||
systemPrompt: uiTarsPlanningPrompt,
|
||||
}, nil
|
||||
// ParsedAction represents a parsed action from the VLM response
|
||||
type ParsedAction struct {
|
||||
ActionType ActionType `json:"actionType"`
|
||||
ActionInputs map[string]interface{} `json:"actionInputs"`
|
||||
Thought string `json:"thought"`
|
||||
}
|
||||
|
||||
type UITarsPlanner struct {
|
||||
ctx context.Context
|
||||
model model.ToolCallingChatModel
|
||||
systemPrompt string
|
||||
modelType LLMServiceType
|
||||
history ConversationHistory
|
||||
}
|
||||
type ActionType string
|
||||
|
||||
// Call performs UI planning using Vision Language Model
|
||||
func (p *UITarsPlanner) Call(opts *PlanningOptions) (*PlanningResult, error) {
|
||||
// validate input parameters
|
||||
if err := validatePlanningInput(opts); err != nil {
|
||||
return nil, errors.Wrap(err, "validate planning parameters failed")
|
||||
}
|
||||
|
||||
// prepare prompt
|
||||
if len(p.history) == 0 {
|
||||
// add system message
|
||||
systemPrompt := uiTarsPlanningPrompt + opts.UserInstruction
|
||||
p.history = ConversationHistory{
|
||||
{
|
||||
Role: schema.System,
|
||||
Content: systemPrompt,
|
||||
},
|
||||
}
|
||||
}
|
||||
// append user image message
|
||||
p.history.Append(opts.Message)
|
||||
|
||||
// call model service, generate response
|
||||
logRequest(p.history)
|
||||
startTime := time.Now()
|
||||
resp, err := p.model.Generate(p.ctx, p.history)
|
||||
log.Info().Float64("elapsed(s)", time.Since(startTime).Seconds()).
|
||||
Str("model", string(p.modelType)).Msg("call model service")
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(code.LLMRequestServiceError, err.Error())
|
||||
}
|
||||
logResponse(resp)
|
||||
|
||||
// parse result
|
||||
result, err := p.parseResult(resp, opts.Size)
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(code.LLMParsePlanningResponseError, err.Error())
|
||||
}
|
||||
|
||||
// append assistant message
|
||||
p.history.Append(&schema.Message{
|
||||
Role: schema.Assistant,
|
||||
Content: result.ActionSummary,
|
||||
})
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (p *UITarsPlanner) parseResult(msg *schema.Message, size types.Size) (*PlanningResult, error) {
|
||||
// parse Thought/Action format from UI-TARS
|
||||
parseActions, thoughtErr := parseThoughtAction(msg.Content)
|
||||
if thoughtErr != nil {
|
||||
return nil, thoughtErr
|
||||
}
|
||||
|
||||
// process response
|
||||
result, err := processVLMResponse(parseActions, size)
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "process VLM response failed")
|
||||
}
|
||||
|
||||
log.Info().
|
||||
Interface("summary", result.ActionSummary).
|
||||
Interface("actions", result.NextActions).
|
||||
Msg("get VLM planning result")
|
||||
return result, nil
|
||||
}
|
||||
const (
|
||||
ActionTypeClick ActionType = "click"
|
||||
ActionTypeTap ActionType = "tap"
|
||||
ActionTypeDrag ActionType = "drag"
|
||||
ActionTypeSwipe ActionType = "swipe"
|
||||
ActionTypeWait ActionType = "wait"
|
||||
ActionTypeFinished ActionType = "finished"
|
||||
ActionTypeCallUser ActionType = "call_user"
|
||||
ActionTypeType ActionType = "type"
|
||||
ActionTypeScroll ActionType = "scroll"
|
||||
)
|
||||
|
||||
// parseThoughtAction parses the Thought/Action format response
|
||||
func parseThoughtAction(predictionText string) ([]ParsedAction, error) {
|
||||
@@ -396,3 +318,64 @@ func validateTypeContent(action *ParsedAction) {
|
||||
log.Warn().Msg("type action missing content parameter, set to default")
|
||||
}
|
||||
}
|
||||
|
||||
// parseJSON tries to parse the response as JSON format
|
||||
func parseJSON(predictionText string) ([]ParsedAction, error) {
|
||||
predictionText = strings.TrimSpace(predictionText)
|
||||
if strings.HasPrefix(predictionText, "```json") && strings.HasSuffix(predictionText, "```") {
|
||||
predictionText = strings.TrimPrefix(predictionText, "```json")
|
||||
predictionText = strings.TrimSuffix(predictionText, "```")
|
||||
}
|
||||
predictionText = strings.TrimSpace(predictionText)
|
||||
|
||||
var response PlanningResult
|
||||
if err := json.Unmarshal([]byte(predictionText), &response); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse VLM response: %v", err)
|
||||
}
|
||||
|
||||
if response.Error != "" {
|
||||
return nil, errors.New(response.Error)
|
||||
}
|
||||
|
||||
if len(response.NextActions) == 0 {
|
||||
return nil, errors.New("no actions returned from VLM")
|
||||
}
|
||||
|
||||
// normalize actions
|
||||
var normalizedActions []ParsedAction
|
||||
for i := range response.NextActions {
|
||||
// create a new variable, avoid implicit memory aliasing in for loop.
|
||||
action := response.NextActions[i]
|
||||
if err := normalizeAction(&action); err != nil {
|
||||
return nil, errors.Wrap(err, "failed to normalize action")
|
||||
}
|
||||
normalizedActions = append(normalizedActions, action)
|
||||
}
|
||||
|
||||
return normalizedActions, nil
|
||||
}
|
||||
|
||||
// normalizeAction normalizes the coordinates in the action
|
||||
func normalizeAction(action *ParsedAction) error {
|
||||
switch action.ActionType {
|
||||
case "click", "drag":
|
||||
// handle click and drag action coordinates
|
||||
if startBox, ok := action.ActionInputs["startBox"].(string); ok {
|
||||
normalized, err := normalizeCoordinates(startBox)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to normalize startBox: %w", err)
|
||||
}
|
||||
action.ActionInputs["startBox"] = normalized
|
||||
}
|
||||
|
||||
if endBox, ok := action.ActionInputs["endBox"].(string); ok {
|
||||
normalized, err := normalizeCoordinates(endBox)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to normalize endBox: %w", err)
|
||||
}
|
||||
action.ActionInputs["endBox"] = normalized
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
@@ -27,3 +27,5 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par
|
||||
|
||||
## User Instruction
|
||||
`
|
||||
|
||||
const defaultPlanningResponseJsonFormat = ``
|
||||
|
||||
@@ -13,6 +13,7 @@ import (
|
||||
|
||||
"github.com/cloudwego/eino/schema"
|
||||
"github.com/httprunner/httprunner/v5/code"
|
||||
"github.com/httprunner/httprunner/v5/uixt/option"
|
||||
"github.com/httprunner/httprunner/v5/uixt/types"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
@@ -35,7 +36,7 @@ func TestVLMPlanning(t *testing.T) {
|
||||
|
||||
userInstruction += "\n\n请基于以上游戏规则,给出下一步可点击的两个图标坐标"
|
||||
|
||||
planner, err := NewUITarsPlanner(context.Background())
|
||||
planner, err := NewPlanner(context.Background(), option.LLMServiceTypeUITARS)
|
||||
require.NoError(t, err)
|
||||
|
||||
opts := &PlanningOptions{
|
||||
@@ -105,7 +106,7 @@ func TestXHSPlanning(t *testing.T) {
|
||||
|
||||
userInstruction := "点击第二个帖子的作者头像"
|
||||
|
||||
planner, err := NewUITarsPlanner(context.Background())
|
||||
planner, err := NewPlanner(context.Background(), option.LLMServiceTypeUITARS)
|
||||
require.NoError(t, err)
|
||||
|
||||
opts := &PlanningOptions{
|
||||
@@ -175,7 +176,7 @@ func TestChatList(t *testing.T) {
|
||||
|
||||
userInstruction := "请结合图片的文字信息,请告诉我一共有多少个群聊,哪些群聊右下角有绿点"
|
||||
|
||||
planner, err := NewUITarsPlanner(context.Background())
|
||||
planner, err := NewPlanner(context.Background(), option.LLMServiceTypeUITARS)
|
||||
require.NoError(t, err)
|
||||
|
||||
opts := &PlanningOptions{
|
||||
@@ -206,7 +207,7 @@ func TestHandleSwitch(t *testing.T) {
|
||||
userInstruction := "发送框下方的联网搜索开关是开启状态" // 点击开启联网搜索开关
|
||||
// 检查发送框下方的联网搜索开关,蓝色为开启状态,灰色为关闭状态;若开关处于关闭状态,则点击进行开启
|
||||
|
||||
planner, err := NewUITarsPlanner(context.Background())
|
||||
planner, err := NewPlanner(context.Background(), option.LLMServiceTypeUITARS)
|
||||
require.NoError(t, err)
|
||||
|
||||
testCases := []struct {
|
||||
|
||||
@@ -1,6 +1,14 @@
|
||||
package ai
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/base64"
|
||||
"fmt"
|
||||
"image"
|
||||
"image/color"
|
||||
"image/draw"
|
||||
"image/png"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/cloudwego/eino/schema"
|
||||
@@ -101,3 +109,70 @@ func logResponse(resp *schema.Message) {
|
||||
}
|
||||
logger.Msg("log response message")
|
||||
}
|
||||
|
||||
// SavePositionImg saves an image with position markers
|
||||
func SavePositionImg(params struct {
|
||||
InputImgBase64 string
|
||||
Rect struct {
|
||||
X float64
|
||||
Y float64
|
||||
}
|
||||
OutputPath string
|
||||
}) error {
|
||||
// 解码Base64图像
|
||||
imgData := params.InputImgBase64
|
||||
// 如果包含了数据URL前缀,去掉它
|
||||
if strings.HasPrefix(imgData, "data:image/") {
|
||||
parts := strings.Split(imgData, ",")
|
||||
if len(parts) > 1 {
|
||||
imgData = parts[1]
|
||||
}
|
||||
}
|
||||
|
||||
// 解码Base64
|
||||
unbased, err := base64.StdEncoding.DecodeString(imgData)
|
||||
if err != nil {
|
||||
return fmt.Errorf("无法解码Base64图像: %w", err)
|
||||
}
|
||||
|
||||
// 解码图像
|
||||
reader := bytes.NewReader(unbased)
|
||||
img, _, err := image.Decode(reader)
|
||||
if err != nil {
|
||||
return fmt.Errorf("无法解码图像数据: %w", err)
|
||||
}
|
||||
|
||||
// 创建一个可以在其上绘制的图像
|
||||
bounds := img.Bounds()
|
||||
rgba := image.NewRGBA(bounds)
|
||||
draw.Draw(rgba, bounds, img, bounds.Min, draw.Src)
|
||||
|
||||
// 在点击/拖动位置绘制标记
|
||||
markRadius := 10
|
||||
x, y := int(params.Rect.X), int(params.Rect.Y)
|
||||
|
||||
// 绘制红色圆圈
|
||||
for i := -markRadius; i <= markRadius; i++ {
|
||||
for j := -markRadius; j <= markRadius; j++ {
|
||||
if i*i+j*j <= markRadius*markRadius {
|
||||
if x+i >= 0 && x+i < bounds.Max.X && y+j >= 0 && y+j < bounds.Max.Y {
|
||||
rgba.Set(x+i, y+j, color.RGBA{255, 0, 0, 255})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 保存图像
|
||||
outFile, err := os.Create(params.OutputPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("无法创建输出文件: %w", err)
|
||||
}
|
||||
defer outFile.Close()
|
||||
|
||||
// 编码为PNG并保存
|
||||
if err := png.Encode(outFile, rgba); err != nil {
|
||||
return fmt.Errorf("无法编码和保存图像: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user