feat: enhance VLM response parsing and DOUBAO model support

- Fix JSON extraction logic by prioritizing brace counting method
- Add support for DOUBAO string array coordinate format
- Introduce IS_UI_TARS helper function for model type checking
- Add comprehensive tests for JSON parsing and coordinate handling
- Improve error handling with retry delays for LLM service failures
This commit is contained in:
lilong.129
2025-06-10 15:55:57 +08:00
parent 4959c2e47e
commit 88ae8faee1
9 changed files with 307 additions and 20 deletions

View File

@@ -1 +1 @@
v5.0.0-beta-2506101408
v5.0.0-beta-2506101556

View File

@@ -51,7 +51,7 @@ func NewAsserter(ctx context.Context, modelConfig *ModelConfig) (*Asserter, erro
systemPrompt: defaultAssertionPrompt,
}
if modelConfig.ModelType == option.DOUBAO_1_5_UI_TARS_250428 {
if option.IS_UI_TARS(modelConfig.ModelType) {
asserter.systemPrompt += "\n" + uiTarsAssertionResponseFormat
} else {
// define output format

View File

@@ -4,8 +4,10 @@ import (
"testing"
"github.com/httprunner/httprunner/v5/internal/json"
"github.com/httprunner/httprunner/v5/uixt/option"
"github.com/httprunner/httprunner/v5/uixt/types"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestParseActionToStructureOutput(t *testing.T) {
@@ -1090,3 +1092,187 @@ func TestMapParameterName(t *testing.T) {
})
}
}
// TestJSONContentParser_Parse feeds raw model responses to
// JSONContentParser.Parse and checks, per case, whether an error is returned,
// how many tool calls are produced, and the basic shape of each tool call.
func TestJSONContentParser_Parse(t *testing.T) {
	parser := &JSONContentParser{
		modelType:     option.OPENAI_GPT_4O,
		systemPrompt:  "test prompt",
		actionMapping: map[string]option.ActionName{"click": "tap_xy"},
	}
	size := types.Size{Width: 1200, Height: 2640}

	tests := []struct {
		name        string
		content     string
		expectError bool
		expectTools int
	}{
		{
			name: "valid click action",
			content: `{
"actions": [
{
"action_type": "click",
"action_inputs": {
"start_box": [371, 235, 425, 270]
}
}
],
"thought": "点击桌面上的抖音应用图标以启动抖音",
"error": null
}`,
			expectError: false,
			expectTools: 1,
		},
		{
			name: "multiple actions",
			content: `{
"actions": [
{
"action_type": "click",
"action_inputs": {
"start_box": [100, 100, 200, 200]
}
},
{
"action_type": "type",
"action_inputs": {
"content": "hello world"
}
}
],
"thought": "执行多个操作",
"error": null
}`,
			expectError: false,
			expectTools: 2,
		},
		{
			name: "no actions but valid thought",
			content: `{
"actions": [],
"thought": "这是一个分析任务,不需要执行操作",
"error": null
}`,
			expectError: false,
			expectTools: 0,
		},
		{
			name: "error response",
			content: `{
"actions": [],
"thought": "发生了错误",
"error": "无法找到目标元素"
}`,
			expectError: true,
			expectTools: 0,
		},
		{
			name:        "invalid JSON",
			content:     `{"actions": [{"action_type": "click"`,
			expectError: true,
			expectTools: 0,
		},
		{
			name: "string array coordinates (DOUBAO format)",
			content: `{
"actions": [
{
"action_type": "click",
"action_inputs": {
"start_box": [
"229 389",
"229 439"
]
}
}
],
"thought": "点击苹果图案",
"error": null
}`,
			expectError: false,
			expectTools: 1,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result, err := parser.Parse(tt.content, size)
			if tt.expectError {
				assert.Error(t, err)
				return
			}

			require.NoError(t, err)
			// result is dereferenced below, so stop the subtest here on nil
			// instead of letting it panic with a nil-pointer dereference.
			require.NotNil(t, result)
			assert.Len(t, result.ToolCalls, tt.expectTools)
			assert.NotEmpty(t, result.Thought)
			assert.Equal(t, string(parser.modelType), result.ModelName)

			// Verify tool calls structure if any
			for _, toolCall := range result.ToolCalls {
				assert.NotEmpty(t, toolCall.ID)
				assert.Equal(t, "function", toolCall.Type)
				assert.NotEmpty(t, toolCall.Function.Name)
				assert.NotEmpty(t, toolCall.Function.Arguments)
			}
		})
	}
}
// TestNormalizeActionCoordinates_StringArray checks normalizeActionCoordinates
// on []interface{} inputs whose elements are numbers, "x y" strings, or a mix
// of both, including the inputs that must be rejected with an error.
func TestNormalizeActionCoordinates_StringArray(t *testing.T) {
	size := types.Size{Width: 1200, Height: 2640}

	tests := []struct {
		name        string
		coordData   interface{}
		expectError bool
		expectLen   int
	}{
		{
			name:        "string array coordinates",
			coordData:   []interface{}{"229 389", "229 439"},
			expectError: false,
			expectLen:   4, // Each string contains 2 coordinates, so total 4
		},
		{
			name:        "mixed number and string coordinates",
			coordData:   []interface{}{100, 200, "300 400"},
			expectError: false,
			expectLen:   4, // 2 numbers + 2 from string = 4
		},
		{
			name:        "single string coordinate",
			coordData:   []interface{}{"100 200"},
			expectError: false,
			expectLen:   2,
		},
		{
			name:        "invalid string format",
			coordData:   []interface{}{"invalid"},
			expectError: true,
			expectLen:   0,
		},
		{
			// An empty array is rejected before any element is inspected.
			name:        "empty array",
			coordData:   []interface{}{},
			expectError: true,
			expectLen:   0,
		},
		{
			// One number yields a single coordinate, below the minimum of 2.
			name:        "single number is not a full point",
			coordData:   []interface{}{100},
			expectError: true,
			expectLen:   0,
		},
		{
			// Elements must be numbers or strings.
			name:        "unsupported element type",
			coordData:   []interface{}{true, false},
			expectError: true,
			expectLen:   0,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result, err := normalizeActionCoordinates(tt.coordData, size)
			if tt.expectError {
				assert.Error(t, err)
				return
			}

			require.NoError(t, err)
			assert.Len(t, result, tt.expectLen)

			// All coordinates should be non-negative numbers
			for _, coord := range result {
				assert.True(t, coord >= 0, "coordinate should be non-negative: %f", coord)
			}
		})
	}
}

View File

@@ -329,33 +329,46 @@ func mapParameterName(paramName string) string {
func normalizeActionCoordinates(coordData interface{}, size types.Size) ([]float64, error) {
switch v := coordData.(type) {
case []interface{}:
// Handle JSON array format: [x1, y1, x2, y2] or [x1, y1]
if len(v) < 2 {
return nil, fmt.Errorf("coordinate array must have at least 2 elements, got %d", len(v))
// Handle JSON array format: [x1, y1, x2, y2] or [x1, y1] or ["229 389", "229 439"]
if len(v) == 0 {
return nil, fmt.Errorf("coordinate array cannot be empty")
}
coords := make([]float64, len(v))
coords := make([]float64, 0)
for i, val := range v {
switch num := val.(type) {
case float64:
// Convert relative coordinates to absolute coordinates using DefaultFactor
if i%2 == 0 { // x coordinates
coords[i] = convertRelativeToAbsolute(num, true, size)
coords = append(coords, convertRelativeToAbsolute(num, true, size))
} else { // y coordinates
coords[i] = convertRelativeToAbsolute(num, false, size)
coords = append(coords, convertRelativeToAbsolute(num, false, size))
}
case int:
numFloat := float64(num)
// Convert relative coordinates to absolute coordinates using DefaultFactor
if i%2 == 0 { // x coordinates
coords[i] = convertRelativeToAbsolute(numFloat, true, size)
coords = append(coords, convertRelativeToAbsolute(numFloat, true, size))
} else { // y coordinates
coords[i] = convertRelativeToAbsolute(numFloat, false, size)
coords = append(coords, convertRelativeToAbsolute(numFloat, false, size))
}
case string:
// Handle string coordinates like "229 389"
stringCoords, err := normalizeStringCoordinates(num, size)
if err != nil {
return nil, fmt.Errorf("failed to parse string coordinate '%s': %v", num, err)
}
coords = append(coords, stringCoords...)
default:
return nil, fmt.Errorf("coordinate value must be a number, got %T", val)
return nil, fmt.Errorf("coordinate value must be a number or string, got %T", val)
}
}
// Check if we have at least 2 coordinates after processing
if len(coords) < 2 {
return nil, fmt.Errorf("coordinate array must result in at least 2 coordinates, got %d", len(coords))
}
return coords, nil
case []float64:

View File

@@ -67,7 +67,7 @@ func (p *Planner) History() *ConversationHistory {
}
func (p *Planner) RegisterTools(tools []*schema.ToolInfo) error {
if p.modelConfig.ModelType == option.DOUBAO_1_5_UI_TARS_250428 {
if option.IS_UI_TARS(p.modelConfig.ModelType) {
// tools have been registered in ui-tars system prompt
return nil
}

View File

@@ -42,14 +42,7 @@ func extractJSONFromContent(content string) string {
}
}
// Case 3: Try regex approach for markdown-like formats
jsonRegex := regexp.MustCompile(`(?:json)?\s*({[\s\S]*?})\s*`)
matches := jsonRegex.FindStringSubmatch(content)
if len(matches) > 1 {
return strings.TrimSpace(matches[1])
}
// Case 4: Look for JSON object in the content using brace counting
// Case 3: Look for JSON object in the content using brace counting (most reliable method)
start := strings.Index(content, "{")
if start != -1 {
// Find the matching closing brace
@@ -67,6 +60,13 @@ func extractJSONFromContent(content string) string {
}
}
// Case 4: Try regex approach for markdown-like formats (fallback)
jsonRegex := regexp.MustCompile(`(?:json)?\s*({[\s\S]*?})\s*`)
matches := jsonRegex.FindStringSubmatch(content)
if len(matches) > 1 {
return strings.TrimSpace(matches[1])
}
// Case 5: If content itself looks like JSON
if strings.HasPrefix(content, "{") && strings.HasSuffix(content, "}") {
return content

79
uixt/ai/utils_test.go Normal file
View File

@@ -0,0 +1,79 @@
package ai
import (
"testing"
"github.com/stretchr/testify/assert"
)
// TestExtractJSONFromContent runs extractJSONFromContent over a table of raw
// inputs and compares the extracted text with the expected string, which is
// empty for the cases where nothing should be extracted.
func TestExtractJSONFromContent(t *testing.T) {
	type extractCase struct {
		name     string
		input    string
		expected string
	}

	cases := []extractCase{
		{
			name:     "valid JSON",
			input:    `{"content": "test", "thought": "test"}`,
			expected: `{"content": "test", "thought": "test"}`,
		},
		{
			name:  "JSON in markdown",
			input: "```json\n{\n \"content\": \"test\"\n}\n```",
			expected: `{
"content": "test"
}`,
		},
		{
			name:     "incomplete JSON without closing brace",
			input:    `{"content": "incomplete json"`,
			expected: "",
		},
		{
			name:     "incomplete JSON with missing closing brace",
			input:    `{"content": "incomplete json", "missing_closing_brace": true`,
			expected: "",
		},
		{
			name:     "plain text",
			input:    "This is just plain text",
			expected: "",
		},
		{
			name: "complex nested JSON with arrays",
			input: `{
"actions": [
{
"action_type": "click",
"action_inputs": {
"start_box": [371, 235, 425, 270]
}
}
],
"thought": "点击桌面上的抖音应用图标以启动抖音",
"error": null
}`,
			expected: `{
"actions": [
{
"action_type": "click",
"action_inputs": {
"start_box": [371, 235, 425, 270]
}
}
],
"thought": "点击桌面上的抖音应用图标以启动抖音",
"error": null
}`,
		},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			got := extractJSONFromContent(tc.input)
			// Log both sides so a mismatch is easy to read in the test output.
			t.Logf("Input: %s", tc.input)
			t.Logf("Output: %s", got)
			assert.Equal(t, tc.expected, got)
		})
	}
}

View File

@@ -50,6 +50,7 @@ func (dExt *XTDriver) StartToGoal(ctx context.Context, prompt string, opts ...op
if errors.Is(err, code.LLMRequestServiceError) {
log.Warn().Err(err).Int("attempt", attempt).
Msg("LLM service request failed, retrying...")
time.Sleep(5 * time.Second)
continue
}
// Create planning result with error

View File

@@ -30,9 +30,17 @@ func WithCVService(service CVServiceType) AIServiceOption {
// LLMServiceType is the string identifier of an LLM model.
type LLMServiceType string
// IS_UI_TARS reports whether modelType is one of the Doubao 1.5 UI-TARS
// model identifiers (the 250328 or 250428 release).
func IS_UI_TARS(modelType LLMServiceType) bool {
	switch modelType {
	case DOUBAO_1_5_UI_TARS_250328, DOUBAO_1_5_UI_TARS_250428:
		return true
	default:
		return false
	}
}
// Model identifiers declared as LLMServiceType values.
const (
DOUBAO_1_5_UI_TARS_250328 LLMServiceType = "doubao-1.5-ui-tars-250328"
DOUBAO_1_5_UI_TARS_250428 LLMServiceType = "doubao-1.5-ui-tars-250428" // does not support function calling or JSON responses
DOUBAO_1_5_THINKING_VISION_PRO_250428 LLMServiceType = "doubao-1.5-thinking-vision-pro-250428"
OPENAI_GPT_4O LLMServiceType = "openai/gpt-4o"
DEEPSEEK_R1_250528 LLMServiceType = "deepseek-r1-250528"
)
func WithLLMService(modelType LLMServiceType) AIServiceOption {