mirror of
https://github.com/httprunner/httprunner.git
synced 2026-05-10 17:43:00 +08:00
fix: improve JSON extraction to handle UTF-8 Chinese characters properly
- Replace byte-based brace counting with UTF-8 aware rune iteration
- Add proper string state tracking to handle escaped quotes
- Add comprehensive test cases for Chinese character handling
- Fix parsing errors when JSON contains Chinese text like 2048经典
This commit is contained in:
@@ -1 +1 @@
|
||||
v5.0.0-beta-2506101556
|
||||
v5.0.0-beta-2506101609
|
||||
|
||||
@@ -19,14 +19,13 @@ type LLMContentParser interface {
|
||||
}
|
||||
|
||||
func NewLLMContentParser(modelType option.LLMServiceType) LLMContentParser {
|
||||
switch modelType {
|
||||
case option.DOUBAO_1_5_UI_TARS_250428:
|
||||
if option.IS_UI_TARS(modelType) {
|
||||
return &UITARSContentParser{
|
||||
modelType: modelType,
|
||||
systemPrompt: doubao_1_5_ui_tars_planning_prompt,
|
||||
actionMapping: doubao_1_5_ui_tars_action_mapping,
|
||||
}
|
||||
default:
|
||||
} else {
|
||||
return &JSONContentParser{
|
||||
modelType: modelType,
|
||||
systemPrompt: doubao_1_5_thinking_vision_pro_planning_prompt,
|
||||
|
||||
@@ -3,6 +3,7 @@ package ai
|
||||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// extractJSONFromContent extracts JSON content from various formats in the response
|
||||
@@ -42,21 +43,48 @@ func extractJSONFromContent(content string) string {
|
||||
}
|
||||
}
|
||||
|
||||
// Case 3: Look for JSON object in the content using brace counting (most reliable method)
|
||||
// Case 3: Look for JSON object in the content using rune-based brace counting (most reliable method)
|
||||
start := strings.Index(content, "{")
|
||||
if start != -1 {
|
||||
// Find the matching closing brace
|
||||
// Find the matching closing brace using rune-based iteration to handle UTF-8 properly
|
||||
braceCount := 0
|
||||
for i := start; i < len(content); i++ {
|
||||
if content[i] == '{' {
|
||||
braceCount++
|
||||
} else if content[i] == '}' {
|
||||
braceCount--
|
||||
if braceCount == 0 {
|
||||
jsonContent := strings.TrimSpace(content[start : i+1])
|
||||
return jsonContent
|
||||
inString := false
|
||||
escaped := false
|
||||
|
||||
// Walk the content by byte offset, decoding one rune per step, and track string/escape state so braces inside string literals are not counted
|
||||
for i := start; i < len(content); {
|
||||
r, size := utf8.DecodeRuneInString(content[i:])
|
||||
|
||||
if escaped {
|
||||
escaped = false
|
||||
i += size
|
||||
continue
|
||||
}
|
||||
|
||||
if r == '\\' && inString {
|
||||
escaped = true
|
||||
i += size
|
||||
continue
|
||||
}
|
||||
|
||||
if r == '"' {
|
||||
inString = !inString
|
||||
i += size
|
||||
continue
|
||||
}
|
||||
|
||||
if !inString {
|
||||
if r == '{' {
|
||||
braceCount++
|
||||
} else if r == '}' {
|
||||
braceCount--
|
||||
if braceCount == 0 {
|
||||
jsonContent := strings.TrimSpace(content[start : i+size])
|
||||
return jsonContent
|
||||
}
|
||||
}
|
||||
}
|
||||
i += size
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -9,39 +9,12 @@ import (
|
||||
func TestExtractJSONFromContent(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
content string
|
||||
expected string
|
||||
}{
|
||||
{
|
||||
name: "valid JSON",
|
||||
input: `{"content": "test", "thought": "test"}`,
|
||||
expected: `{"content": "test", "thought": "test"}`,
|
||||
},
|
||||
{
|
||||
name: "JSON in markdown",
|
||||
input: "```json\n{\n \"content\": \"test\"\n}\n```",
|
||||
expected: `{
|
||||
"content": "test"
|
||||
}`,
|
||||
},
|
||||
{
|
||||
name: "incomplete JSON without closing brace",
|
||||
input: `{"content": "incomplete json"`,
|
||||
expected: "",
|
||||
},
|
||||
{
|
||||
name: "incomplete JSON with missing closing brace",
|
||||
input: `{"content": "incomplete json", "missing_closing_brace": true`,
|
||||
expected: "",
|
||||
},
|
||||
{
|
||||
name: "plain text",
|
||||
input: "This is just plain text",
|
||||
expected: "",
|
||||
},
|
||||
{
|
||||
name: "complex nested JSON with arrays",
|
||||
input: `{
|
||||
name: "simple JSON",
|
||||
content: `{
|
||||
"actions": [
|
||||
{
|
||||
"action_type": "click",
|
||||
@@ -64,15 +37,161 @@ func TestExtractJSONFromContent(t *testing.T) {
|
||||
],
|
||||
"thought": "点击桌面上的抖音应用图标以启动抖音",
|
||||
"error": null
|
||||
}`,
|
||||
},
|
||||
{
|
||||
name: "JSON with Chinese characters in strings",
|
||||
content: `{
|
||||
"actions": [
|
||||
{
|
||||
"action_type": "type",
|
||||
"action_inputs": {
|
||||
"content": "2048经典"
|
||||
}
|
||||
}
|
||||
],
|
||||
"thought": "搜索框已经清空了,现在我要输入\"2048经典\"这个关键词。看到键盘已经弹出来了,正好可以直接开始输入。这样一来,就能找到我们想要玩的那个小游戏了。",
|
||||
"error": null
|
||||
}`,
|
||||
expected: `{
|
||||
"actions": [
|
||||
{
|
||||
"action_type": "type",
|
||||
"action_inputs": {
|
||||
"content": "2048经典"
|
||||
}
|
||||
}
|
||||
],
|
||||
"thought": "搜索框已经清空了,现在我要输入\"2048经典\"这个关键词。看到键盘已经弹出来了,正好可以直接开始输入。这样一来,就能找到我们想要玩的那个小游戏了。",
|
||||
"error": null
|
||||
}`,
|
||||
},
|
||||
{
|
||||
name: "JSON with markdown wrapper",
|
||||
content: "```json\n" + `{
|
||||
"actions": [
|
||||
{
|
||||
"action_type": "click",
|
||||
"action_inputs": {
|
||||
"start_box": [100, 200, 150, 250]
|
||||
}
|
||||
}
|
||||
],
|
||||
"thought": "点击按钮",
|
||||
"error": null
|
||||
}` + "\n```",
|
||||
expected: `{
|
||||
"actions": [
|
||||
{
|
||||
"action_type": "click",
|
||||
"action_inputs": {
|
||||
"start_box": [100, 200, 150, 250]
|
||||
}
|
||||
}
|
||||
],
|
||||
"thought": "点击按钮",
|
||||
"error": null
|
||||
}`,
|
||||
},
|
||||
{
|
||||
name: "JSON embedded in text with Chinese",
|
||||
content: `这是一个包含中文的响应:{
|
||||
"actions": [
|
||||
{
|
||||
"action_type": "type",
|
||||
"action_inputs": {
|
||||
"content": "测试内容"
|
||||
}
|
||||
}
|
||||
],
|
||||
"thought": "这是一个测试思路",
|
||||
"error": null
|
||||
} 后面还有一些文本`,
|
||||
expected: `{
|
||||
"actions": [
|
||||
{
|
||||
"action_type": "type",
|
||||
"action_inputs": {
|
||||
"content": "测试内容"
|
||||
}
|
||||
}
|
||||
],
|
||||
"thought": "这是一个测试思路",
|
||||
"error": null
|
||||
}`,
|
||||
},
|
||||
{
|
||||
name: "JSON with escaped quotes and Chinese",
|
||||
content: `{
|
||||
"actions": [
|
||||
{
|
||||
"action_type": "type",
|
||||
"action_inputs": {
|
||||
"content": "他说:\"你好,世界!\""
|
||||
}
|
||||
}
|
||||
],
|
||||
"thought": "输入包含引号的中文文本",
|
||||
"error": null
|
||||
}`,
|
||||
expected: `{
|
||||
"actions": [
|
||||
{
|
||||
"action_type": "type",
|
||||
"action_inputs": {
|
||||
"content": "他说:\"你好,世界!\""
|
||||
}
|
||||
}
|
||||
],
|
||||
"thought": "输入包含引号的中文文本",
|
||||
"error": null
|
||||
}`,
|
||||
},
|
||||
{
|
||||
name: "no JSON content",
|
||||
content: "这只是一些普通的文本,没有JSON内容",
|
||||
expected: "",
|
||||
},
|
||||
{
|
||||
name: "nested JSON objects with Chinese",
|
||||
content: `{
|
||||
"actions": [
|
||||
{
|
||||
"action_type": "click",
|
||||
"action_inputs": {
|
||||
"start_box": [100, 200, 150, 250],
|
||||
"metadata": {
|
||||
"description": "点击操作",
|
||||
"target": "按钮"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"thought": "执行嵌套对象的点击操作",
|
||||
"error": null
|
||||
}`,
|
||||
expected: `{
|
||||
"actions": [
|
||||
{
|
||||
"action_type": "click",
|
||||
"action_inputs": {
|
||||
"start_box": [100, 200, 150, 250],
|
||||
"metadata": {
|
||||
"description": "点击操作",
|
||||
"target": "按钮"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"thought": "执行嵌套对象的点击操作",
|
||||
"error": null
|
||||
}`,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := extractJSONFromContent(tt.input)
|
||||
t.Logf("Input: %s", tt.input)
|
||||
t.Logf("Output: %s", result)
|
||||
result := extractJSONFromContent(tt.content)
|
||||
assert.Equal(t, tt.expected, result)
|
||||
})
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user