mirror of
https://github.com/httprunner/httprunner.git
synced 2026-05-07 07:22:39 +08:00
feat: enhance VLM response parsing and DOUBAO model support
- Fix JSON extraction logic by prioritizing the brace-counting method
- Add support for the DOUBAO string-array coordinate format
- Introduce the IS_UI_TARS helper function for model type checking
- Add comprehensive tests for JSON parsing and coordinate handling
- Improve error handling with retry delays for LLM service failures
This commit is contained in:
@@ -1 +1 @@
|
||||
v5.0.0-beta-2506101408
|
||||
v5.0.0-beta-2506101556
|
||||
|
||||
@@ -51,7 +51,7 @@ func NewAsserter(ctx context.Context, modelConfig *ModelConfig) (*Asserter, erro
|
||||
systemPrompt: defaultAssertionPrompt,
|
||||
}
|
||||
|
||||
if modelConfig.ModelType == option.DOUBAO_1_5_UI_TARS_250428 {
|
||||
if option.IS_UI_TARS(modelConfig.ModelType) {
|
||||
asserter.systemPrompt += "\n" + uiTarsAssertionResponseFormat
|
||||
} else {
|
||||
// define output format
|
||||
|
||||
@@ -4,8 +4,10 @@ import (
|
||||
"testing"
|
||||
|
||||
"github.com/httprunner/httprunner/v5/internal/json"
|
||||
"github.com/httprunner/httprunner/v5/uixt/option"
|
||||
"github.com/httprunner/httprunner/v5/uixt/types"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestParseActionToStructureOutput(t *testing.T) {
|
||||
@@ -1090,3 +1092,187 @@ func TestMapParameterName(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestJSONContentParser_Parse(t *testing.T) {
|
||||
parser := &JSONContentParser{
|
||||
modelType: option.OPENAI_GPT_4O,
|
||||
systemPrompt: "test prompt",
|
||||
actionMapping: map[string]option.ActionName{"click": "tap_xy"},
|
||||
}
|
||||
|
||||
size := types.Size{Width: 1200, Height: 2640}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
content string
|
||||
expectError bool
|
||||
expectTools int
|
||||
}{
|
||||
{
|
||||
name: "valid click action",
|
||||
content: `{
|
||||
"actions": [
|
||||
{
|
||||
"action_type": "click",
|
||||
"action_inputs": {
|
||||
"start_box": [371, 235, 425, 270]
|
||||
}
|
||||
}
|
||||
],
|
||||
"thought": "点击桌面上的抖音应用图标以启动抖音",
|
||||
"error": null
|
||||
}`,
|
||||
expectError: false,
|
||||
expectTools: 1,
|
||||
},
|
||||
{
|
||||
name: "multiple actions",
|
||||
content: `{
|
||||
"actions": [
|
||||
{
|
||||
"action_type": "click",
|
||||
"action_inputs": {
|
||||
"start_box": [100, 100, 200, 200]
|
||||
}
|
||||
},
|
||||
{
|
||||
"action_type": "type",
|
||||
"action_inputs": {
|
||||
"content": "hello world"
|
||||
}
|
||||
}
|
||||
],
|
||||
"thought": "执行多个操作",
|
||||
"error": null
|
||||
}`,
|
||||
expectError: false,
|
||||
expectTools: 2,
|
||||
},
|
||||
{
|
||||
name: "no actions but valid thought",
|
||||
content: `{
|
||||
"actions": [],
|
||||
"thought": "这是一个分析任务,不需要执行操作",
|
||||
"error": null
|
||||
}`,
|
||||
expectError: false,
|
||||
expectTools: 0,
|
||||
},
|
||||
{
|
||||
name: "error response",
|
||||
content: `{
|
||||
"actions": [],
|
||||
"thought": "发生了错误",
|
||||
"error": "无法找到目标元素"
|
||||
}`,
|
||||
expectError: true,
|
||||
expectTools: 0,
|
||||
},
|
||||
{
|
||||
name: "invalid JSON",
|
||||
content: `{"actions": [{"action_type": "click"`,
|
||||
expectError: true,
|
||||
expectTools: 0,
|
||||
},
|
||||
{
|
||||
name: "string array coordinates (DOUBAO format)",
|
||||
content: `{
|
||||
"actions": [
|
||||
{
|
||||
"action_type": "click",
|
||||
"action_inputs": {
|
||||
"start_box": [
|
||||
"229 389",
|
||||
"229 439"
|
||||
]
|
||||
}
|
||||
}
|
||||
],
|
||||
"thought": "点击苹果图案",
|
||||
"error": null
|
||||
}`,
|
||||
expectError: false,
|
||||
expectTools: 1,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result, err := parser.Parse(tt.content, size)
|
||||
|
||||
if tt.expectError {
|
||||
assert.Error(t, err)
|
||||
return
|
||||
}
|
||||
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, result)
|
||||
assert.Equal(t, tt.expectTools, len(result.ToolCalls))
|
||||
assert.NotEmpty(t, result.Thought)
|
||||
assert.Equal(t, string(parser.modelType), result.ModelName)
|
||||
|
||||
// Verify tool calls structure if any
|
||||
for _, toolCall := range result.ToolCalls {
|
||||
assert.NotEmpty(t, toolCall.ID)
|
||||
assert.Equal(t, "function", toolCall.Type)
|
||||
assert.NotEmpty(t, toolCall.Function.Name)
|
||||
assert.NotEmpty(t, toolCall.Function.Arguments)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeActionCoordinates_StringArray(t *testing.T) {
|
||||
size := types.Size{Width: 1200, Height: 2640}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
coordData interface{}
|
||||
expectError bool
|
||||
expectLen int
|
||||
}{
|
||||
{
|
||||
name: "string array coordinates",
|
||||
coordData: []interface{}{"229 389", "229 439"},
|
||||
expectError: false,
|
||||
expectLen: 4, // Each string contains 2 coordinates, so total 4
|
||||
},
|
||||
{
|
||||
name: "mixed number and string coordinates",
|
||||
coordData: []interface{}{100, 200, "300 400"},
|
||||
expectError: false,
|
||||
expectLen: 4, // 2 numbers + 2 from string = 4
|
||||
},
|
||||
{
|
||||
name: "single string coordinate",
|
||||
coordData: []interface{}{"100 200"},
|
||||
expectError: false,
|
||||
expectLen: 2,
|
||||
},
|
||||
{
|
||||
name: "invalid string format",
|
||||
coordData: []interface{}{"invalid"},
|
||||
expectError: true,
|
||||
expectLen: 0,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result, err := normalizeActionCoordinates(tt.coordData, size)
|
||||
|
||||
if tt.expectError {
|
||||
assert.Error(t, err)
|
||||
return
|
||||
}
|
||||
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, tt.expectLen, len(result))
|
||||
|
||||
// All coordinates should be positive numbers
|
||||
for _, coord := range result {
|
||||
assert.True(t, coord >= 0, "coordinate should be non-negative: %f", coord)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -329,33 +329,46 @@ func mapParameterName(paramName string) string {
|
||||
func normalizeActionCoordinates(coordData interface{}, size types.Size) ([]float64, error) {
|
||||
switch v := coordData.(type) {
|
||||
case []interface{}:
|
||||
// Handle JSON array format: [x1, y1, x2, y2] or [x1, y1]
|
||||
if len(v) < 2 {
|
||||
return nil, fmt.Errorf("coordinate array must have at least 2 elements, got %d", len(v))
|
||||
// Handle JSON array format: [x1, y1, x2, y2] or [x1, y1] or ["229 389", "229 439"]
|
||||
if len(v) == 0 {
|
||||
return nil, fmt.Errorf("coordinate array cannot be empty")
|
||||
}
|
||||
|
||||
coords := make([]float64, len(v))
|
||||
coords := make([]float64, 0)
|
||||
for i, val := range v {
|
||||
switch num := val.(type) {
|
||||
case float64:
|
||||
// Convert relative coordinates to absolute coordinates using DefaultFactor
|
||||
if i%2 == 0 { // x coordinates
|
||||
coords[i] = convertRelativeToAbsolute(num, true, size)
|
||||
coords = append(coords, convertRelativeToAbsolute(num, true, size))
|
||||
} else { // y coordinates
|
||||
coords[i] = convertRelativeToAbsolute(num, false, size)
|
||||
coords = append(coords, convertRelativeToAbsolute(num, false, size))
|
||||
}
|
||||
case int:
|
||||
numFloat := float64(num)
|
||||
// Convert relative coordinates to absolute coordinates using DefaultFactor
|
||||
if i%2 == 0 { // x coordinates
|
||||
coords[i] = convertRelativeToAbsolute(numFloat, true, size)
|
||||
coords = append(coords, convertRelativeToAbsolute(numFloat, true, size))
|
||||
} else { // y coordinates
|
||||
coords[i] = convertRelativeToAbsolute(numFloat, false, size)
|
||||
coords = append(coords, convertRelativeToAbsolute(numFloat, false, size))
|
||||
}
|
||||
case string:
|
||||
// Handle string coordinates like "229 389"
|
||||
stringCoords, err := normalizeStringCoordinates(num, size)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse string coordinate '%s': %v", num, err)
|
||||
}
|
||||
coords = append(coords, stringCoords...)
|
||||
default:
|
||||
return nil, fmt.Errorf("coordinate value must be a number, got %T", val)
|
||||
return nil, fmt.Errorf("coordinate value must be a number or string, got %T", val)
|
||||
}
|
||||
}
|
||||
|
||||
// Check if we have at least 2 coordinates after processing
|
||||
if len(coords) < 2 {
|
||||
return nil, fmt.Errorf("coordinate array must result in at least 2 coordinates, got %d", len(coords))
|
||||
}
|
||||
|
||||
return coords, nil
|
||||
|
||||
case []float64:
|
||||
|
||||
@@ -67,7 +67,7 @@ func (p *Planner) History() *ConversationHistory {
|
||||
}
|
||||
|
||||
func (p *Planner) RegisterTools(tools []*schema.ToolInfo) error {
|
||||
if p.modelConfig.ModelType == option.DOUBAO_1_5_UI_TARS_250428 {
|
||||
if option.IS_UI_TARS(p.modelConfig.ModelType) {
|
||||
// tools have been registered in ui-tars system prompt
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -42,14 +42,7 @@ func extractJSONFromContent(content string) string {
|
||||
}
|
||||
}
|
||||
|
||||
// Case 3: Try regex approach for markdown-like formats
|
||||
jsonRegex := regexp.MustCompile(`(?:json)?\s*({[\s\S]*?})\s*`)
|
||||
matches := jsonRegex.FindStringSubmatch(content)
|
||||
if len(matches) > 1 {
|
||||
return strings.TrimSpace(matches[1])
|
||||
}
|
||||
|
||||
// Case 4: Look for JSON object in the content using brace counting
|
||||
// Case 3: Look for JSON object in the content using brace counting (most reliable method)
|
||||
start := strings.Index(content, "{")
|
||||
if start != -1 {
|
||||
// Find the matching closing brace
|
||||
@@ -67,6 +60,13 @@ func extractJSONFromContent(content string) string {
|
||||
}
|
||||
}
|
||||
|
||||
// Case 4: Try regex approach for markdown-like formats (fallback)
|
||||
jsonRegex := regexp.MustCompile(`(?:json)?\s*({[\s\S]*?})\s*`)
|
||||
matches := jsonRegex.FindStringSubmatch(content)
|
||||
if len(matches) > 1 {
|
||||
return strings.TrimSpace(matches[1])
|
||||
}
|
||||
|
||||
// Case 5: If content itself looks like JSON
|
||||
if strings.HasPrefix(content, "{") && strings.HasSuffix(content, "}") {
|
||||
return content
|
||||
|
||||
79
uixt/ai/utils_test.go
Normal file
79
uixt/ai/utils_test.go
Normal file
@@ -0,0 +1,79 @@
|
||||
package ai
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestExtractJSONFromContent(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
expected string
|
||||
}{
|
||||
{
|
||||
name: "valid JSON",
|
||||
input: `{"content": "test", "thought": "test"}`,
|
||||
expected: `{"content": "test", "thought": "test"}`,
|
||||
},
|
||||
{
|
||||
name: "JSON in markdown",
|
||||
input: "```json\n{\n \"content\": \"test\"\n}\n```",
|
||||
expected: `{
|
||||
"content": "test"
|
||||
}`,
|
||||
},
|
||||
{
|
||||
name: "incomplete JSON without closing brace",
|
||||
input: `{"content": "incomplete json"`,
|
||||
expected: "",
|
||||
},
|
||||
{
|
||||
name: "incomplete JSON with missing closing brace",
|
||||
input: `{"content": "incomplete json", "missing_closing_brace": true`,
|
||||
expected: "",
|
||||
},
|
||||
{
|
||||
name: "plain text",
|
||||
input: "This is just plain text",
|
||||
expected: "",
|
||||
},
|
||||
{
|
||||
name: "complex nested JSON with arrays",
|
||||
input: `{
|
||||
"actions": [
|
||||
{
|
||||
"action_type": "click",
|
||||
"action_inputs": {
|
||||
"start_box": [371, 235, 425, 270]
|
||||
}
|
||||
}
|
||||
],
|
||||
"thought": "点击桌面上的抖音应用图标以启动抖音",
|
||||
"error": null
|
||||
}`,
|
||||
expected: `{
|
||||
"actions": [
|
||||
{
|
||||
"action_type": "click",
|
||||
"action_inputs": {
|
||||
"start_box": [371, 235, 425, 270]
|
||||
}
|
||||
}
|
||||
],
|
||||
"thought": "点击桌面上的抖音应用图标以启动抖音",
|
||||
"error": null
|
||||
}`,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := extractJSONFromContent(tt.input)
|
||||
t.Logf("Input: %s", tt.input)
|
||||
t.Logf("Output: %s", result)
|
||||
assert.Equal(t, tt.expected, result)
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -50,6 +50,7 @@ func (dExt *XTDriver) StartToGoal(ctx context.Context, prompt string, opts ...op
|
||||
if errors.Is(err, code.LLMRequestServiceError) {
|
||||
log.Warn().Err(err).Int("attempt", attempt).
|
||||
Msg("LLM service request failed, retrying...")
|
||||
time.Sleep(5 * time.Second)
|
||||
continue
|
||||
}
|
||||
// Create planning result with error
|
||||
|
||||
@@ -30,9 +30,17 @@ func WithCVService(service CVServiceType) AIServiceOption {
|
||||
|
||||
// LLMServiceType names an LLM model; its known values are the string
// identifiers declared in the const block below.
type LLMServiceType string

// IS_UI_TARS reports whether modelType is one of the Doubao UI-TARS models
// (DOUBAO_1_5_UI_TARS_250328 or DOUBAO_1_5_UI_TARS_250428). Any other value,
// including the empty string, yields false.
func IS_UI_TARS(modelType LLMServiceType) bool {
	switch modelType {
	case DOUBAO_1_5_UI_TARS_250328, DOUBAO_1_5_UI_TARS_250428:
		return true
	default:
		return false
	}
}

// Known LLMServiceType values.
const (
	DOUBAO_1_5_UI_TARS_250328             LLMServiceType = "doubao-1.5-ui-tars-250328"
	DOUBAO_1_5_UI_TARS_250428             LLMServiceType = "doubao-1.5-ui-tars-250428" // does not support function calling or JSON responses
	DOUBAO_1_5_THINKING_VISION_PRO_250428 LLMServiceType = "doubao-1.5-thinking-vision-pro-250428"
	OPENAI_GPT_4O                         LLMServiceType = "openai/gpt-4o"
	DEEPSEEK_R1_250528                    LLMServiceType = "deepseek-r1-250528"
)
|
||||
|
||||
func WithLLMService(modelType LLMServiceType) AIServiceOption {
|
||||
|
||||
Reference in New Issue
Block a user