feat: enhance VLM response parsing and DOUBAO model support

- Fix JSON extraction logic by prioritizing the brace-counting method
- Add support for the DOUBAO string-array coordinate format
- Introduce the IS_UI_TARS helper function for model type checking
- Add comprehensive tests for JSON parsing and coordinate handling
- Improve error handling with a retry delay for LLM service failures
2026-05-07 07:22:39 +08:00 · 2025-06-10 15:55:57 +08:00
parent 4959c2e47e
commit 88ae8faee1
9 changed files with 307 additions and 20 deletions
--- a/internal/version/VERSION
+++ b/internal/version/VERSION
@@ -1 +1 @@
-v5.0.0-beta-2506101408
+v5.0.0-beta-2506101556
--- a/uixt/ai/asserter.go
+++ b/uixt/ai/asserter.go
@@ -51,7 +51,7 @@ func NewAsserter(ctx context.Context, modelConfig *ModelConfig) (*Asserter, erro
 		systemPrompt: defaultAssertionPrompt,
 	}

-	if modelConfig.ModelType == option.DOUBAO_1_5_UI_TARS_250428 {
+	if option.IS_UI_TARS(modelConfig.ModelType) {
 		asserter.systemPrompt += "\n" + uiTarsAssertionResponseFormat
 	} else {
 		// define output format
--- a/uixt/ai/parser_test.go
+++ b/uixt/ai/parser_test.go
@@ -4,8 +4,10 @@ import (
 	"testing"

 	"github.com/httprunner/httprunner/v5/internal/json"
+	"github.com/httprunner/httprunner/v5/uixt/option"
 	"github.com/httprunner/httprunner/v5/uixt/types"
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
 )

 func TestParseActionToStructureOutput(t *testing.T) {
@@ -1090,3 +1092,187 @@ func TestMapParameterName(t *testing.T) {
 		})
 	}
 }
+
+func TestJSONContentParser_Parse(t *testing.T) {
+	parser := &JSONContentParser{
+		modelType:     option.OPENAI_GPT_4O,
+		systemPrompt:  "test prompt",
+		actionMapping: map[string]option.ActionName{"click": "tap_xy"},
+	}
+
+	size := types.Size{Width: 1200, Height: 2640}
+
+	tests := []struct {
+		name        string
+		content     string
+		expectError bool
+		expectTools int
+	}{
+		{
+			name: "valid click action",
+			content: `{
+  "actions": [
+    {
+      "action_type": "click",
+      "action_inputs": {
+        "start_box": [371, 235, 425, 270]
+      }
+    }
+  ],
+  "thought": "点击桌面上的抖音应用图标以启动抖音",
+  "error": null
+}`,
+			expectError: false,
+			expectTools: 1,
+		},
+		{
+			name: "multiple actions",
+			content: `{
+  "actions": [
+    {
+      "action_type": "click",
+      "action_inputs": {
+        "start_box": [100, 100, 200, 200]
+      }
+    },
+    {
+      "action_type": "type",
+      "action_inputs": {
+        "content": "hello world"
+      }
+    }
+  ],
+  "thought": "执行多个操作",
+  "error": null
+}`,
+			expectError: false,
+			expectTools: 2,
+		},
+		{
+			name: "no actions but valid thought",
+			content: `{
+  "actions": [],
+  "thought": "这是一个分析任务，不需要执行操作",
+  "error": null
+}`,
+			expectError: false,
+			expectTools: 0,
+		},
+		{
+			name: "error response",
+			content: `{
+  "actions": [],
+  "thought": "发生了错误",
+  "error": "无法找到目标元素"
+}`,
+			expectError: true,
+			expectTools: 0,
+		},
+		{
+			name:        "invalid JSON",
+			content:     `{"actions": [{"action_type": "click"`,
+			expectError: true,
+			expectTools: 0,
+		},
+		{
+			name: "string array coordinates (DOUBAO format)",
+			content: `{
+  "actions": [
+    {
+      "action_type": "click",
+      "action_inputs": {
+        "start_box": [
+          "229 389",
+          "229 439"
+        ]
+      }
+    }
+  ],
+  "thought": "点击苹果图案",
+  "error": null
+}`,
+			expectError: false,
+			expectTools: 1,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result, err := parser.Parse(tt.content, size)
+
+			if tt.expectError {
+				assert.Error(t, err)
+				return
+			}
+
+			require.NoError(t, err)
+			assert.NotNil(t, result)
+			assert.Equal(t, tt.expectTools, len(result.ToolCalls))
+			assert.NotEmpty(t, result.Thought)
+			assert.Equal(t, string(parser.modelType), result.ModelName)
+
+			// Verify tool calls structure if any
+			for _, toolCall := range result.ToolCalls {
+				assert.NotEmpty(t, toolCall.ID)
+				assert.Equal(t, "function", toolCall.Type)
+				assert.NotEmpty(t, toolCall.Function.Name)
+				assert.NotEmpty(t, toolCall.Function.Arguments)
+			}
+		})
+	}
+}
+
+func TestNormalizeActionCoordinates_StringArray(t *testing.T) {
+	size := types.Size{Width: 1200, Height: 2640}
+
+	tests := []struct {
+		name        string
+		coordData   interface{}
+		expectError bool
+		expectLen   int
+	}{
+		{
+			name:        "string array coordinates",
+			coordData:   []interface{}{"229 389", "229 439"},
+			expectError: false,
+			expectLen:   4, // Each string contains 2 coordinates, so total 4
+		},
+		{
+			name:        "mixed number and string coordinates",
+			coordData:   []interface{}{100, 200, "300 400"},
+			expectError: false,
+			expectLen:   4, // 2 numbers + 2 from string = 4
+		},
+		{
+			name:        "single string coordinate",
+			coordData:   []interface{}{"100 200"},
+			expectError: false,
+			expectLen:   2,
+		},
+		{
+			name:        "invalid string format",
+			coordData:   []interface{}{"invalid"},
+			expectError: true,
+			expectLen:   0,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result, err := normalizeActionCoordinates(tt.coordData, size)
+
+			if tt.expectError {
+				assert.Error(t, err)
+				return
+			}
+
+			require.NoError(t, err)
+			assert.Equal(t, tt.expectLen, len(result))
+
+			// All coordinates should be positive numbers
+			for _, coord := range result {
+				assert.True(t, coord >= 0, "coordinate should be non-negative: %f", coord)
+			}
+		})
+	}
+}
--- a/uixt/ai/parser_ui_tars.go
+++ b/uixt/ai/parser_ui_tars.go
@@ -329,33 +329,46 @@ func mapParameterName(paramName string) string {
 func normalizeActionCoordinates(coordData interface{}, size types.Size) ([]float64, error) {
 	switch v := coordData.(type) {
 	case []interface{}:
-		// Handle JSON array format: [x1, y1, x2, y2] or [x1, y1]
-		if len(v) < 2 {
-			return nil, fmt.Errorf("coordinate array must have at least 2 elements, got %d", len(v))
+		// Handle JSON array format: [x1, y1, x2, y2] or [x1, y1] or ["229 389", "229 439"]
+		if len(v) == 0 {
+			return nil, fmt.Errorf("coordinate array cannot be empty")
 		}

-		coords := make([]float64, len(v))
+		coords := make([]float64, 0)
 		for i, val := range v {
 			switch num := val.(type) {
 			case float64:
 				// Convert relative coordinates to absolute coordinates using DefaultFactor
 				if i%2 == 0 { // x coordinates
-					coords[i] = convertRelativeToAbsolute(num, true, size)
+					coords = append(coords, convertRelativeToAbsolute(num, true, size))
 				} else { // y coordinates
-					coords[i] = convertRelativeToAbsolute(num, false, size)
+					coords = append(coords, convertRelativeToAbsolute(num, false, size))
 				}
 			case int:
 				numFloat := float64(num)
 				// Convert relative coordinates to absolute coordinates using DefaultFactor
 				if i%2 == 0 { // x coordinates
-					coords[i] = convertRelativeToAbsolute(numFloat, true, size)
+					coords = append(coords, convertRelativeToAbsolute(numFloat, true, size))
 				} else { // y coordinates
-					coords[i] = convertRelativeToAbsolute(numFloat, false, size)
+					coords = append(coords, convertRelativeToAbsolute(numFloat, false, size))
 				}
+			case string:
+				// Handle string coordinates like "229 389"
+				stringCoords, err := normalizeStringCoordinates(num, size)
+				if err != nil {
+					return nil, fmt.Errorf("failed to parse string coordinate '%s': %v", num, err)
+				}
+				coords = append(coords, stringCoords...)
 			default:
-				return nil, fmt.Errorf("coordinate value must be a number, got %T", val)
+				return nil, fmt.Errorf("coordinate value must be a number or string, got %T", val)
 			}
 		}
+
+		// Check if we have at least 2 coordinates after processing
+		if len(coords) < 2 {
+			return nil, fmt.Errorf("coordinate array must result in at least 2 coordinates, got %d", len(coords))
+		}
+
 		return coords, nil

 	case []float64:
--- a/uixt/ai/planner.go
+++ b/uixt/ai/planner.go
@@ -67,7 +67,7 @@ func (p *Planner) History() *ConversationHistory {
 }

 func (p *Planner) RegisterTools(tools []*schema.ToolInfo) error {
-	if p.modelConfig.ModelType == option.DOUBAO_1_5_UI_TARS_250428 {
+	if option.IS_UI_TARS(p.modelConfig.ModelType) {
 		// tools have been registered in ui-tars system prompt
 		return nil
 	}
--- a/uixt/ai/utils.go
+++ b/uixt/ai/utils.go
@@ -42,14 +42,7 @@ func extractJSONFromContent(content string) string {
 		}
 	}

-	// Case 3: Try regex approach for markdown-like formats
-	jsonRegex := regexp.MustCompile(`(?:json)?\s*({[\s\S]*?})\s*`)
-	matches := jsonRegex.FindStringSubmatch(content)
-	if len(matches) > 1 {
-		return strings.TrimSpace(matches[1])
-	}
-
-	// Case 4: Look for JSON object in the content using brace counting
+	// Case 3: Look for JSON object in the content using brace counting (most reliable method)
 	start := strings.Index(content, "{")
 	if start != -1 {
 		// Find the matching closing brace
@@ -67,6 +60,13 @@ func extractJSONFromContent(content string) string {
 		}
 	}

+	// Case 4: Try regex approach for markdown-like formats (fallback)
+	jsonRegex := regexp.MustCompile(`(?:json)?\s*({[\s\S]*?})\s*`)
+	matches := jsonRegex.FindStringSubmatch(content)
+	if len(matches) > 1 {
+		return strings.TrimSpace(matches[1])
+	}
+
 	// Case 5: If content itself looks like JSON
 	if strings.HasPrefix(content, "{") && strings.HasSuffix(content, "}") {
 		return content
--- a/uixt/ai/utils_test.go
+++ b/uixt/ai/utils_test.go
@@ -0,0 +1,79 @@
+package ai
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestExtractJSONFromContent(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected string
+	}{
+		{
+			name:     "valid JSON",
+			input:    `{"content": "test", "thought": "test"}`,
+			expected: `{"content": "test", "thought": "test"}`,
+		},
+		{
+			name:  "JSON in markdown",
+			input: "```json\n{\n  \"content\": \"test\"\n}\n```",
+			expected: `{
+  "content": "test"
+}`,
+		},
+		{
+			name:     "incomplete JSON without closing brace",
+			input:    `{"content": "incomplete json"`,
+			expected: "",
+		},
+		{
+			name:     "incomplete JSON with missing closing brace",
+			input:    `{"content": "incomplete json", "missing_closing_brace": true`,
+			expected: "",
+		},
+		{
+			name:     "plain text",
+			input:    "This is just plain text",
+			expected: "",
+		},
+		{
+			name: "complex nested JSON with arrays",
+			input: `{
+  "actions": [
+    {
+      "action_type": "click",
+      "action_inputs": {
+        "start_box": [371, 235, 425, 270]
+      }
+    }
+  ],
+  "thought": "点击桌面上的抖音应用图标以启动抖音",
+  "error": null
+}`,
+			expected: `{
+  "actions": [
+    {
+      "action_type": "click",
+      "action_inputs": {
+        "start_box": [371, 235, 425, 270]
+      }
+    }
+  ],
+  "thought": "点击桌面上的抖音应用图标以启动抖音",
+  "error": null
+}`,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := extractJSONFromContent(tt.input)
+			t.Logf("Input: %s", tt.input)
+			t.Logf("Output: %s", result)
+			assert.Equal(t, tt.expected, result)
+		})
+	}
+}
--- a/uixt/driver_ext_ai.go
+++ b/uixt/driver_ext_ai.go
@@ -50,6 +50,7 @@ func (dExt *XTDriver) StartToGoal(ctx context.Context, prompt string, opts ...op
 			if errors.Is(err, code.LLMRequestServiceError) {
 				log.Warn().Err(err).Int("attempt", attempt).
 					Msg("LLM service request failed, retrying...")
+				time.Sleep(5 * time.Second)
 				continue
 			}
 			// Create planning result with error
--- a/uixt/option/ai.go
+++ b/uixt/option/ai.go
@@ -30,9 +30,17 @@ func WithCVService(service CVServiceType) AIServiceOption {

 type LLMServiceType string

+func IS_UI_TARS(modelType LLMServiceType) bool {
+	return modelType == DOUBAO_1_5_UI_TARS_250328 ||
+		modelType == DOUBAO_1_5_UI_TARS_250428
+}
+
 const (
+	DOUBAO_1_5_UI_TARS_250328             LLMServiceType = "doubao-1.5-ui-tars-250328"
 	DOUBAO_1_5_UI_TARS_250428             LLMServiceType = "doubao-1.5-ui-tars-250428" // not support function calling and json response
 	DOUBAO_1_5_THINKING_VISION_PRO_250428 LLMServiceType = "doubao-1.5-thinking-vision-pro-250428"
+	OPENAI_GPT_4O                         LLMServiceType = "openai/gpt-4o"
+	DEEPSEEK_R1_250528                    LLMServiceType = "deepseek-r1-250528"
 )

 func WithLLMService(modelType LLMServiceType) AIServiceOption {