mirror of
https://github.com/httprunner/httprunner.git
synced 2026-06-06 16:29:37 +08:00
change: update ui-tars prompt
This commit is contained in:
@@ -1 +1 @@
|
|||||||
v5.0.0-beta-2505222252
|
v5.0.0-beta-2505232205
|
||||||
|
|||||||
@@ -156,7 +156,7 @@ func (c *Chat) handleToolCalls(ctx context.Context, toolCalls []schema.ToolCall)
|
|||||||
serverName, toolName := parts[0], parts[1]
|
serverName, toolName := parts[0], parts[1]
|
||||||
|
|
||||||
// Unmarshal tool arguments from JSON string
|
// Unmarshal tool arguments from JSON string
|
||||||
var argsMap map[string]interface{}
|
var argsMap map[string]any
|
||||||
if err := sonic.UnmarshalString(toolArgs, &argsMap); err != nil {
|
if err := sonic.UnmarshalString(toolArgs, &argsMap); err != nil {
|
||||||
log.Error().Err(err).Str("args", toolArgs).Msg("failed to unmarshal tool arguments")
|
log.Error().Err(err).Str("args", toolArgs).Msg("failed to unmarshal tool arguments")
|
||||||
continue
|
continue
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ func NewLLMContentParser(modelType option.LLMServiceType) LLMContentParser {
|
|||||||
switch modelType {
|
switch modelType {
|
||||||
case option.LLMServiceTypeUITARS:
|
case option.LLMServiceTypeUITARS:
|
||||||
return &UITARSContentParser{
|
return &UITARSContentParser{
|
||||||
systemPrompt: uiTarsPlanningPrompt,
|
systemPrompt: doubao_1_5_ui_tars_planning_prompt,
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
return &JSONContentParser{
|
return &JSONContentParser{
|
||||||
|
|||||||
@@ -30,4 +30,31 @@ func TestParseActionToStructureOutput(t *testing.T) {
|
|||||||
assert.Nil(t, err)
|
assert.Nil(t, err)
|
||||||
assert.Equal(t, result.Actions[0].ActionType, "click")
|
assert.Equal(t, result.Actions[0].ActionType, "click")
|
||||||
assert.Contains(t, result.Actions[0].ActionInputs, "start_box")
|
assert.Contains(t, result.Actions[0].ActionInputs, "start_box")
|
||||||
|
|
||||||
|
// Test new bracket format
|
||||||
|
text = "Thought: 我需要点击这个按钮\nAction: click(start_box='[100, 200, 150, 250]')"
|
||||||
|
result, err = parser.Parse(text, types.Size{Height: 1000, Width: 1000})
|
||||||
|
assert.Nil(t, err)
|
||||||
|
assert.Equal(t, result.Actions[0].ActionType, "click")
|
||||||
|
assert.Contains(t, result.Actions[0].ActionInputs, "start_box")
|
||||||
|
coords := result.Actions[0].ActionInputs["start_box"].([]float64)
|
||||||
|
assert.Equal(t, 4, len(coords))
|
||||||
|
assert.Equal(t, 100.0, coords[0])
|
||||||
|
assert.Equal(t, 200.0, coords[1])
|
||||||
|
assert.Equal(t, 150.0, coords[2])
|
||||||
|
assert.Equal(t, 250.0, coords[3])
|
||||||
|
|
||||||
|
// Test drag operation with both start_box and end_box
|
||||||
|
text = "Thought: 我需要拖拽元素\nAction: drag(start_box='[100, 200, 150, 250]', end_box='[300, 400, 350, 450]')"
|
||||||
|
result, err = parser.Parse(text, types.Size{Height: 1000, Width: 1000})
|
||||||
|
assert.Nil(t, err)
|
||||||
|
assert.Equal(t, result.Actions[0].ActionType, "drag")
|
||||||
|
assert.Contains(t, result.Actions[0].ActionInputs, "start_box")
|
||||||
|
assert.Contains(t, result.Actions[0].ActionInputs, "end_box")
|
||||||
|
startCoords := result.Actions[0].ActionInputs["start_box"].([]float64)
|
||||||
|
endCoords := result.Actions[0].ActionInputs["end_box"].([]float64)
|
||||||
|
assert.Equal(t, 4, len(startCoords))
|
||||||
|
assert.Equal(t, 4, len(endCoords))
|
||||||
|
assert.Equal(t, 100.0, startCoords[0])
|
||||||
|
assert.Equal(t, 300.0, endCoords[0])
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,12 +1,15 @@
|
|||||||
package ai
|
package ai
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
"regexp"
|
"regexp"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/cloudwego/eino/schema"
|
||||||
"github.com/httprunner/httprunner/v5/uixt/types"
|
"github.com/httprunner/httprunner/v5/uixt/types"
|
||||||
"github.com/rs/zerolog/log"
|
"github.com/rs/zerolog/log"
|
||||||
)
|
)
|
||||||
@@ -30,178 +33,257 @@ func (p *UITARSContentParser) SystemPrompt() string {
|
|||||||
// ParseActionToStructureOutput parses the model output text into structured actions.
|
// ParseActionToStructureOutput parses the model output text into structured actions.
|
||||||
func (p *UITARSContentParser) Parse(content string, size types.Size) (*PlanningResult, error) {
|
func (p *UITARSContentParser) Parse(content string, size types.Size) (*PlanningResult, error) {
|
||||||
text := strings.TrimSpace(content)
|
text := strings.TrimSpace(content)
|
||||||
if strings.Contains(text, "<point>") {
|
|
||||||
text = convertPointToCoordinates(text)
|
// Extract thought/reflection
|
||||||
|
thought := p.extractThought(text)
|
||||||
|
|
||||||
|
// Normalize text first
|
||||||
|
normalizedText := p.normalizeCoordinates(text)
|
||||||
|
|
||||||
|
// Get action string from normalized text
|
||||||
|
actionStr, err := p.extractActionString(normalizedText)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Parse actions directly
|
||||||
|
actions, err := p.parseActionString(actionStr, size)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert actions to tool calls
|
||||||
|
toolCalls := p.convertActionsToToolCalls(actions)
|
||||||
|
|
||||||
|
return &PlanningResult{
|
||||||
|
ToolCalls: toolCalls,
|
||||||
|
Actions: actions,
|
||||||
|
ActionSummary: thought,
|
||||||
|
Thought: thought,
|
||||||
|
Text: normalizedText,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// extractThought extracts thought from the text
|
||||||
|
func (p *UITARSContentParser) extractThought(text string) string {
|
||||||
|
re := regexp.MustCompile(`Thought:(.*?)\nAction:`)
|
||||||
|
matches := re.FindStringSubmatch(text)
|
||||||
|
if len(matches) > 1 {
|
||||||
|
return strings.TrimSpace(matches[1])
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// extractActionString extracts the action string from the text
|
||||||
|
func (p *UITARSContentParser) extractActionString(text string) (string, error) {
|
||||||
|
// Extract Action part using regex
|
||||||
|
re := regexp.MustCompile(`Action:(.*?)(?:\n|$)`)
|
||||||
|
matches := re.FindStringSubmatch(text)
|
||||||
|
if len(matches) > 1 {
|
||||||
|
return strings.TrimSpace(matches[1]), nil
|
||||||
|
}
|
||||||
|
return "", fmt.Errorf("no Action: found")
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalizeCoordinates normalizes the text by converting points to coordinates and replacing keywords
|
||||||
|
func (p *UITARSContentParser) normalizeCoordinates(text string) string {
|
||||||
|
// Convert point tags to coordinate format
|
||||||
|
if strings.Contains(text, "<point>") {
|
||||||
|
// support <point>x1 y1 x2 y2</point> or <point>x y</point>
|
||||||
|
re := regexp.MustCompile(`<point>(\d+)\s+(\d+)(?:\s+(\d+)\s+(\d+))?</point>`)
|
||||||
|
text = re.ReplaceAllStringFunc(text, func(match string) string {
|
||||||
|
submatches := re.FindStringSubmatch(match)
|
||||||
|
if submatches[3] != "" && submatches[4] != "" {
|
||||||
|
// 4 numbers
|
||||||
|
return fmt.Sprintf("(%s,%s,%s,%s)",
|
||||||
|
submatches[1], submatches[2], submatches[3], submatches[4])
|
||||||
|
}
|
||||||
|
// 2 numbers
|
||||||
|
return fmt.Sprintf("(%s,%s)", submatches[1], submatches[2])
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert bbox tags to coordinate format
|
||||||
|
if strings.Contains(text, "<bbox>") {
|
||||||
|
// support <bbox>x1 y1 x2 y2</bbox>
|
||||||
|
re := regexp.MustCompile(`<bbox>(\d+)\s+(\d+)\s+(\d+)\s+(\d+)</bbox>`)
|
||||||
|
text = re.ReplaceAllStringFunc(text, func(match string) string {
|
||||||
|
submatches := re.FindStringSubmatch(match)
|
||||||
|
// 4 numbers for bbox
|
||||||
|
return fmt.Sprintf("(%s,%s,%s,%s)",
|
||||||
|
submatches[1], submatches[2], submatches[3], submatches[4])
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert bracket format [x1, y1, x2, y2] to coordinate format
|
||||||
|
if strings.Contains(text, "[") && strings.Contains(text, "]") {
|
||||||
|
// support [x1, y1, x2, y2] format
|
||||||
|
re := regexp.MustCompile(`\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]`)
|
||||||
|
text = re.ReplaceAllStringFunc(text, func(match string) string {
|
||||||
|
submatches := re.FindStringSubmatch(match)
|
||||||
|
// 4 numbers for bracket format
|
||||||
|
return fmt.Sprintf("(%s,%s,%s,%s)",
|
||||||
|
submatches[1], submatches[2], submatches[3], submatches[4])
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Legacy parameter name replacements (keep for backward compatibility)
|
||||||
text = strings.ReplaceAll(text, "start_point=", "start_box=")
|
text = strings.ReplaceAll(text, "start_point=", "start_box=")
|
||||||
text = strings.ReplaceAll(text, "end_point=", "end_box=")
|
text = strings.ReplaceAll(text, "end_point=", "end_box=")
|
||||||
text = strings.ReplaceAll(text, "point=", "start_box=")
|
text = strings.ReplaceAll(text, "point=", "start_box=")
|
||||||
|
|
||||||
// Extract context (thought/reflection)
|
return text
|
||||||
var thought, reflection string
|
}
|
||||||
actionIdx := strings.Index(text, "Action:")
|
|
||||||
prefix := ""
|
|
||||||
if actionIdx != -1 {
|
|
||||||
prefix = text[:actionIdx]
|
|
||||||
}
|
|
||||||
if strings.HasPrefix(prefix, "Thought:") {
|
|
||||||
thought = strings.TrimSpace(strings.TrimPrefix(prefix, "Thought:"))
|
|
||||||
} else if strings.HasPrefix(prefix, "Reflection:") {
|
|
||||||
refIdx := strings.Index(prefix, "Action_Summary:")
|
|
||||||
if refIdx != -1 {
|
|
||||||
reflection = strings.TrimSpace(strings.TrimPrefix(prefix[:refIdx], "Reflection:"))
|
|
||||||
thought = strings.TrimSpace(strings.TrimPrefix(prefix[refIdx:], "Action_Summary:"))
|
|
||||||
}
|
|
||||||
} else if strings.HasPrefix(prefix, "Action_Summary:") {
|
|
||||||
thought = strings.TrimSpace(strings.TrimPrefix(prefix, "Action_Summary:"))
|
|
||||||
}
|
|
||||||
if !strings.Contains(text, "Action:") {
|
|
||||||
return nil, fmt.Errorf("no Action: found")
|
|
||||||
}
|
|
||||||
actionStr := strings.SplitN(text, "Action: ", 2)[1]
|
|
||||||
|
|
||||||
rawActions := strings.Split(actionStr, ")\n\n")
|
// parseActionString parses the action string directly
|
||||||
normalizedActions := make([]string, 0, len(rawActions))
|
func (p *UITARSContentParser) parseActionString(actionStr string, size types.Size) ([]Action, error) {
|
||||||
for _, act := range rawActions {
|
actions := make([]Action, 0, 1)
|
||||||
actionStr := act
|
|
||||||
if strings.Contains(actionStr, "type(content") {
|
// Parse action type and parameters
|
||||||
if !strings.HasSuffix(strings.TrimSpace(actionStr), ")") {
|
actionParts := strings.SplitN(actionStr, "(", 2)
|
||||||
actionStr = strings.TrimSpace(actionStr) + ")"
|
if len(actionParts) < 2 {
|
||||||
}
|
return nil, fmt.Errorf("not a function call")
|
||||||
pattern := regexp.MustCompile(`type\(content='(.*?)'\)`)
|
|
||||||
m := pattern.FindStringSubmatch(actionStr)
|
|
||||||
if len(m) > 1 {
|
|
||||||
content := m[1]
|
|
||||||
actionStr = "type(content='" + escapeSingleQuotes(content) + "')"
|
|
||||||
} else {
|
|
||||||
return nil, fmt.Errorf("pattern not found in the input string")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !strings.HasSuffix(strings.TrimSpace(actionStr), ")") {
|
|
||||||
actionStr = strings.TrimSpace(actionStr) + ")"
|
|
||||||
}
|
|
||||||
normalizedActions = append(normalizedActions, actionStr)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
actions := make([]Action, 0, len(normalizedActions))
|
funcName := strings.TrimSpace(actionParts[0])
|
||||||
for _, action := range normalizedActions {
|
paramsText := strings.TrimSuffix(strings.TrimSpace(actionParts[1]), ")")
|
||||||
parsed, err := ParseAction(strings.ReplaceAll(action, "\n", "\\n"))
|
|
||||||
if err != nil {
|
args := make(map[string]string)
|
||||||
return nil, fmt.Errorf("Action can't parse: %s", action)
|
if paramsText != "" {
|
||||||
}
|
// Use regex to extract key=value pairs, handling quoted values properly
|
||||||
actionType := parsed.Function
|
re := regexp.MustCompile(`(\w+)\s*=\s*['"]([^'"]*?)['"]`)
|
||||||
params := parsed.Args
|
matches := re.FindAllStringSubmatch(paramsText, -1)
|
||||||
actionInputs := make(map[string]any)
|
for _, match := range matches {
|
||||||
imageWidth := size.Width
|
if len(match) >= 3 {
|
||||||
imageHeight := size.Height
|
key := strings.TrimSpace(match[1])
|
||||||
for paramName, param := range params {
|
value := strings.TrimSpace(match[2])
|
||||||
if param == "" {
|
args[key] = value
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
param = strings.TrimLeft(param, " ")
|
}
|
||||||
actionInputs[paramName] = param
|
}
|
||||||
if strings.Contains(paramName, "start_box") || strings.Contains(paramName, "end_box") {
|
|
||||||
oriBox := param
|
actionInputs, err := p.parseActionInputs(args, size)
|
||||||
parameters := strings.Split(strings.ReplaceAll(strings.ReplaceAll(oriBox, "(", ""), ")", ""), ",")
|
if err != nil {
|
||||||
floatNumbers := make([]float64, 0, len(parameters))
|
return nil, err
|
||||||
for _, numStr := range parameters {
|
}
|
||||||
num, err := strconv.ParseFloat(strings.TrimSpace(numStr), 64)
|
|
||||||
|
actions = append(actions, Action{
|
||||||
|
ActionType: funcName,
|
||||||
|
ActionInputs: actionInputs,
|
||||||
|
})
|
||||||
|
|
||||||
|
return actions, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseActionInputs parses action parameters and converts coordinates
|
||||||
|
func (p *UITARSContentParser) parseActionInputs(args map[string]string, size types.Size) (map[string]any, error) {
|
||||||
|
actionInputs := make(map[string]any)
|
||||||
|
imageWidth := size.Width
|
||||||
|
imageHeight := size.Height
|
||||||
|
|
||||||
|
for paramName, param := range args {
|
||||||
|
if param == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
param = strings.TrimSpace(param)
|
||||||
|
|
||||||
|
// Convert box coordinates
|
||||||
|
if strings.Contains(paramName, "box") || strings.Contains(paramName, "point") {
|
||||||
|
// Extract numbers from the parameter value using regex
|
||||||
|
re := regexp.MustCompile(`\d+`)
|
||||||
|
numbers := re.FindAllString(param, -1)
|
||||||
|
if len(numbers) >= 2 {
|
||||||
|
coords := make([]float64, len(numbers))
|
||||||
|
for i, numStr := range numbers {
|
||||||
|
num, err := strconv.ParseFloat(numStr, 64)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error().Interface("parameters", parameters).Msg("invalid float action parameters")
|
return nil, fmt.Errorf("invalid coordinate: %s", numStr)
|
||||||
return nil, fmt.Errorf("invalid action parameters")
|
}
|
||||||
|
// Convert relative coordinates to absolute coordinates
|
||||||
|
if i%2 == 0 { // x coordinates
|
||||||
|
coords[i] = math.Round((num/DefaultFactor*float64(imageWidth))*10) / 10
|
||||||
|
} else { // y coordinates
|
||||||
|
coords[i] = math.Round((num/DefaultFactor*float64(imageHeight))*10) / 10
|
||||||
}
|
}
|
||||||
floatNumbers = append(floatNumbers, num)
|
|
||||||
}
|
}
|
||||||
// The model generates a 2D coordinate output that represents relative positions.
|
actionInputs[paramName] = coords
|
||||||
// To convert these values to image-relative coordinates, divide each component by 1000 to obtain values in the range [0,1].
|
} else {
|
||||||
// The absolute coordinates required by the Action can be calculated by:
|
actionInputs[paramName] = param
|
||||||
// - X absolute = X relative × image width / 1000
|
|
||||||
// - Y absolute = Y relative × image height / 1000
|
|
||||||
if len(floatNumbers) == 2 {
|
|
||||||
floatNumbers[0] = math.Round((floatNumbers[0]/DefaultFactor*float64(imageWidth))*10) / 10
|
|
||||||
floatNumbers[1] = math.Round((floatNumbers[1]/DefaultFactor*float64(imageHeight))*10) / 10
|
|
||||||
} else if len(floatNumbers) == 4 {
|
|
||||||
floatNumbers[0] = math.Round((floatNumbers[0]/DefaultFactor*float64(imageWidth))*10) / 10
|
|
||||||
floatNumbers[1] = math.Round((floatNumbers[1]/DefaultFactor*float64(imageHeight))*10) / 10
|
|
||||||
floatNumbers[2] = math.Round((floatNumbers[2]/DefaultFactor*float64(imageWidth))*10) / 10
|
|
||||||
floatNumbers[3] = math.Round((floatNumbers[3]/DefaultFactor*float64(imageHeight))*10) / 10
|
|
||||||
} else {
|
|
||||||
log.Error().Interface("parameters", floatNumbers).Msg("invalid float action parameters")
|
|
||||||
return nil, fmt.Errorf("invalid action parameters")
|
|
||||||
}
|
|
||||||
actionInputs[paramName] = floatNumbers
|
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
// Handle other parameter types (content, key, direction, etc.)
|
||||||
|
if paramName == "content" {
|
||||||
|
// Handle escape characters
|
||||||
|
param = strings.ReplaceAll(param, "\\n", "\n")
|
||||||
|
param = strings.ReplaceAll(param, "\\\"", "\"")
|
||||||
|
param = strings.ReplaceAll(param, "\\'", "'")
|
||||||
|
}
|
||||||
|
actionInputs[paramName] = param
|
||||||
}
|
}
|
||||||
actions = append(actions, Action{
|
}
|
||||||
Reflection: reflection,
|
|
||||||
Thought: thought,
|
return actionInputs, nil
|
||||||
ActionType: actionType,
|
}
|
||||||
ActionInputs: actionInputs,
|
|
||||||
Text: text,
|
// convertActionsToToolCalls converts actions to tool calls
|
||||||
|
func (p *UITARSContentParser) convertActionsToToolCalls(actions []Action) []schema.ToolCall {
|
||||||
|
toolCalls := make([]schema.ToolCall, 0, len(actions))
|
||||||
|
for _, action := range actions {
|
||||||
|
jsonArgs, err := json.Marshal(action.ActionInputs)
|
||||||
|
if err != nil {
|
||||||
|
log.Error().Interface("action", action).Msg("failed to marshal action inputs")
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
toolCalls = append(toolCalls, schema.ToolCall{
|
||||||
|
ID: action.ActionType + "_" + strconv.FormatInt(time.Now().Unix(), 10),
|
||||||
|
Type: "function",
|
||||||
|
Function: schema.FunctionCall{
|
||||||
|
Name: action.ActionType,
|
||||||
|
Arguments: string(jsonArgs),
|
||||||
|
},
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
return &PlanningResult{
|
return toolCalls
|
||||||
Actions: actions,
|
|
||||||
}, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Action represents a parsed action with its context.
|
// Action represents a parsed action with its context.
|
||||||
type Action struct {
|
type Action struct {
|
||||||
Reflection string `json:"reflection"`
|
|
||||||
Thought string `json:"thought"`
|
|
||||||
ActionType string `json:"action_type"`
|
ActionType string `json:"action_type"`
|
||||||
ActionInputs map[string]any `json:"action_inputs"`
|
ActionInputs map[string]any `json:"action_inputs"`
|
||||||
Text string `json:"text"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// ParsedActionArgs represents the result of parsing an action string.
|
|
||||||
type ParsedActionArgs struct {
|
|
||||||
Function string
|
|
||||||
Args map[string]string
|
|
||||||
}
|
|
||||||
|
|
||||||
// convertPointToCoordinates replaces <point>x y</point> with (x,y)
|
|
||||||
func convertPointToCoordinates(text string) string {
|
|
||||||
// 支持 <point>x1 y1 x2 y2</point> 或 <point>x y</point>
|
|
||||||
re := regexp.MustCompile(`<point>(\d+)\s+(\d+)(?:\s+(\d+)\s+(\d+))?</point>`)
|
|
||||||
return re.ReplaceAllStringFunc(text, func(match string) string {
|
|
||||||
submatches := re.FindStringSubmatch(match)
|
|
||||||
if submatches[3] != "" && submatches[4] != "" {
|
|
||||||
// 4 个数字
|
|
||||||
return fmt.Sprintf("(%s,%s,%s,%s)", submatches[1], submatches[2], submatches[3], submatches[4])
|
|
||||||
}
|
|
||||||
// 2 个数字
|
|
||||||
return fmt.Sprintf("(%s,%s)", submatches[1], submatches[2])
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
// escapeSingleQuotes escapes unescaped single quotes in a string.
|
|
||||||
func escapeSingleQuotes(text string) string {
|
|
||||||
var b strings.Builder
|
|
||||||
n := len(text)
|
|
||||||
for i := 0; i < n; i++ {
|
|
||||||
if text[i] == '\'' && (i == 0 || text[i-1] != '\\') {
|
|
||||||
b.WriteString("\\'")
|
|
||||||
} else {
|
|
||||||
b.WriteByte(text[i])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return b.String()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ParseAction parses an action string into function name and arguments.
|
// ParseAction parses an action string into function name and arguments.
|
||||||
func ParseAction(actionStr string) (*ParsedActionArgs, error) {
|
func ParseAction(actionStr string) (*ParsedAction, error) {
|
||||||
re := regexp.MustCompile(`^(\w+)\((.*)\)$`)
|
// Parse action type and parameters
|
||||||
matches := re.FindStringSubmatch(actionStr)
|
actionParts := strings.SplitN(actionStr, "(", 2)
|
||||||
if len(matches) < 3 {
|
if len(actionParts) < 2 {
|
||||||
return nil, fmt.Errorf("not a function call")
|
return nil, fmt.Errorf("not a function call")
|
||||||
}
|
}
|
||||||
funcName := matches[1]
|
|
||||||
argsStr := matches[2]
|
funcName := strings.TrimSpace(actionParts[0])
|
||||||
|
paramsText := strings.TrimSuffix(strings.TrimSpace(actionParts[1]), ")")
|
||||||
|
|
||||||
args := make(map[string]string)
|
args := make(map[string]string)
|
||||||
argRe := regexp.MustCompile(`(\w+)\s*=\s*'([^']*)'`)
|
if paramsText != "" {
|
||||||
for _, m := range argRe.FindAllStringSubmatch(argsStr, -1) {
|
// Split parameters by comma and parse key=value pairs
|
||||||
args[m[1]] = m[2]
|
for _, param := range strings.Split(paramsText, ",") {
|
||||||
|
param = strings.TrimSpace(param)
|
||||||
|
if strings.Contains(param, "=") {
|
||||||
|
parts := strings.SplitN(param, "=", 2)
|
||||||
|
key := strings.TrimSpace(parts[0])
|
||||||
|
value := strings.TrimSpace(parts[1])
|
||||||
|
// Remove surrounding quotes
|
||||||
|
value = strings.Trim(value, "'\"")
|
||||||
|
args[key] = value
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return &ParsedActionArgs{Function: funcName, Args: args}, nil
|
|
||||||
|
return &ParsedAction{Function: funcName, Args: args}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ParsedAction represents the result of parsing an action string.
|
||||||
|
type ParsedAction struct {
|
||||||
|
Function string
|
||||||
|
Args map[string]string
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -27,9 +27,11 @@ type PlanningOptions struct {
|
|||||||
|
|
||||||
// PlanningResult represents the result of planning
|
// PlanningResult represents the result of planning
|
||||||
type PlanningResult struct {
|
type PlanningResult struct {
|
||||||
ToolCalls []schema.ToolCall `json:"tool_calls"` // TODO: merge to NextActions
|
ToolCalls []schema.ToolCall `json:"tool_calls"`
|
||||||
Actions []Action `json:"actions"`
|
Actions []Action `json:"actions"` // TODO: merge to ToolCalls
|
||||||
ActionSummary string `json:"summary"`
|
ActionSummary string `json:"summary"`
|
||||||
|
Thought string `json:"thought"`
|
||||||
|
Text string `json:"text"`
|
||||||
Error string `json:"error,omitempty"`
|
Error string `json:"error,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -53,7 +55,6 @@ type Planner struct {
|
|||||||
model model.ToolCallingChatModel
|
model model.ToolCallingChatModel
|
||||||
parser LLMContentParser
|
parser LLMContentParser
|
||||||
history ConversationHistory
|
history ConversationHistory
|
||||||
tools []*schema.ToolInfo
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *Planner) SystemPrompt() string {
|
func (p *Planner) SystemPrompt() string {
|
||||||
@@ -75,7 +76,6 @@ func (p *Planner) RegisterTools(tools []*schema.ToolInfo) error {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return errors.Wrap(err, "failed to register tools")
|
return errors.Wrap(err, "failed to register tools")
|
||||||
}
|
}
|
||||||
p.tools = tools
|
|
||||||
p.model = toolCallingModel
|
p.model = toolCallingModel
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -138,7 +138,7 @@ func (p *Planner) Call(ctx context.Context, opts *PlanningOptions) (*PlanningRes
|
|||||||
|
|
||||||
log.Info().
|
log.Info().
|
||||||
Interface("summary", result.ActionSummary).
|
Interface("summary", result.ActionSummary).
|
||||||
Interface("actions", result.Actions).
|
Interface("tool_calls", result.ToolCalls).
|
||||||
Msg("get VLM planning result")
|
Msg("get VLM planning result")
|
||||||
return result, nil
|
return result, nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,37 @@
|
|||||||
package ai
|
package ai
|
||||||
|
|
||||||
|
// system prompt for doubao-1.5-ui-tars on volcengine.com
|
||||||
// https://www.volcengine.com/docs/82379/1536429
|
// https://www.volcengine.com/docs/82379/1536429
|
||||||
|
const doubao_1_5_ui_tars_planning_prompt = `
|
||||||
|
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
|
||||||
|
|
||||||
|
## Output Format
|
||||||
|
` + "```" + `
|
||||||
|
Thought: ...
|
||||||
|
Action: ...
|
||||||
|
` + "```" + `
|
||||||
|
|
||||||
|
## Action Space
|
||||||
|
click(start_box='[x1, y1, x2, y2]')
|
||||||
|
long_press(start_box='[x1, y1, x2, y2]')
|
||||||
|
type(content='') #If you want to submit your input, use "\n" at the end of ` + "`content`" + `.
|
||||||
|
scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
|
||||||
|
open_app(app_name=\'\')
|
||||||
|
drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
|
||||||
|
press_home()
|
||||||
|
press_back()
|
||||||
|
wait() #Sleep for 5s and take a screenshot to check for any changes.
|
||||||
|
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
|
||||||
|
|
||||||
|
## Note
|
||||||
|
- Use Chinese in ` + "`Thought`" + ` part.
|
||||||
|
- Write a small plan and finally summarize your next action (with its target element) in one sentence in ` + "`Thought`" + ` part.
|
||||||
|
|
||||||
|
## User Instruction
|
||||||
|
`
|
||||||
|
|
||||||
// system prompt for UITARSContentParser
|
// system prompt for UITARSContentParser
|
||||||
|
// https://github.com/bytedance/UI-TARS/blob/main/codes/ui_tars/prompt.py
|
||||||
const uiTarsPlanningPrompt = `
|
const uiTarsPlanningPrompt = `
|
||||||
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
|
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
|
||||||
|
|
||||||
|
|||||||
@@ -57,12 +57,13 @@ func TestVLMPlanning(t *testing.T) {
|
|||||||
// 验证结果
|
// 验证结果
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
require.NotNil(t, result)
|
require.NotNil(t, result)
|
||||||
require.NotEmpty(t, result.Actions)
|
require.NotEmpty(t, result.ToolCalls)
|
||||||
|
|
||||||
// 验证动作
|
// 验证动作
|
||||||
action := result.Actions[0]
|
toolCall := result.ToolCalls[0]
|
||||||
assert.NotEmpty(t, action.ActionType)
|
assert.NotEmpty(t, toolCall.Function.Name)
|
||||||
assert.NotEmpty(t, action.Thought)
|
assert.NotEmpty(t, result.Thought)
|
||||||
|
assert.NotEmpty(t, result.Text)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestXHSPlanning(t *testing.T) {
|
func TestXHSPlanning(t *testing.T) {
|
||||||
@@ -104,7 +105,8 @@ func TestXHSPlanning(t *testing.T) {
|
|||||||
// 验证动作
|
// 验证动作
|
||||||
action := result.Actions[0]
|
action := result.Actions[0]
|
||||||
assert.NotEmpty(t, action.ActionType)
|
assert.NotEmpty(t, action.ActionType)
|
||||||
assert.NotEmpty(t, action.Thought)
|
assert.NotEmpty(t, result.Thought)
|
||||||
|
assert.NotEmpty(t, result.Text)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestChatList(t *testing.T) {
|
func TestChatList(t *testing.T) {
|
||||||
|
|||||||
Reference in New Issue
Block a user