fix: improve JSON extraction to handle UTF-8 Chinese characters properly

- Replace byte-based brace counting with UTF-8 aware rune iteration
- Add proper string state tracking to handle escaped quotes
- Add comprehensive test cases for Chinese character handling
- Fix parsing errors when JSON contains Chinese text like 2048经典
This commit is contained in:
lilong.129
2025-06-10 16:05:43 +08:00
parent 88ae8faee1
commit c322d7c36c
4 changed files with 193 additions and 47 deletions

View File

@@ -1 +1 @@
v5.0.0-beta-2506101556
v5.0.0-beta-2506101609

View File

@@ -19,14 +19,13 @@ type LLMContentParser interface {
}
func NewLLMContentParser(modelType option.LLMServiceType) LLMContentParser {
switch modelType {
case option.DOUBAO_1_5_UI_TARS_250428:
if option.IS_UI_TARS(modelType) {
return &UITARSContentParser{
modelType: modelType,
systemPrompt: doubao_1_5_ui_tars_planning_prompt,
actionMapping: doubao_1_5_ui_tars_action_mapping,
}
default:
} else {
return &JSONContentParser{
modelType: modelType,
systemPrompt: doubao_1_5_thinking_vision_pro_planning_prompt,

View File

@@ -3,6 +3,7 @@ package ai
import (
"regexp"
"strings"
"unicode/utf8"
)
// extractJSONFromContent extracts JSON content from various formats in the response
@@ -42,21 +43,48 @@ func extractJSONFromContent(content string) string {
}
}
// Case 3: Look for JSON object in the content using brace counting (most reliable method)
// Case 3: Look for JSON object in the content using rune-based brace counting (most reliable method)
start := strings.Index(content, "{")
if start != -1 {
// Find the matching closing brace
// Find the matching closing brace using rune-based iteration to handle UTF-8 properly
braceCount := 0
for i := start; i < len(content); i++ {
if content[i] == '{' {
braceCount++
} else if content[i] == '}' {
braceCount--
if braceCount == 0 {
jsonContent := strings.TrimSpace(content[start : i+1])
return jsonContent
inString := false
escaped := false
// Advance i as a byte offset, decoding one full rune per step, and track
// string/escape state so braces inside string literals are not counted
for i := start; i < len(content); {
r, size := utf8.DecodeRuneInString(content[i:])
if escaped {
escaped = false
i += size
continue
}
if r == '\\' && inString {
escaped = true
i += size
continue
}
if r == '"' {
inString = !inString
i += size
continue
}
if !inString {
if r == '{' {
braceCount++
} else if r == '}' {
braceCount--
if braceCount == 0 {
jsonContent := strings.TrimSpace(content[start : i+size])
return jsonContent
}
}
}
i += size
}
}

View File

@@ -9,39 +9,12 @@ import (
func TestExtractJSONFromContent(t *testing.T) {
tests := []struct {
name string
input string
content string
expected string
}{
{
name: "valid JSON",
input: `{"content": "test", "thought": "test"}`,
expected: `{"content": "test", "thought": "test"}`,
},
{
name: "JSON in markdown",
input: "```json\n{\n \"content\": \"test\"\n}\n```",
expected: `{
"content": "test"
}`,
},
{
name: "incomplete JSON without closing brace",
input: `{"content": "incomplete json"`,
expected: "",
},
{
name: "incomplete JSON with missing closing brace",
input: `{"content": "incomplete json", "missing_closing_brace": true`,
expected: "",
},
{
name: "plain text",
input: "This is just plain text",
expected: "",
},
{
name: "complex nested JSON with arrays",
input: `{
name: "simple JSON",
content: `{
"actions": [
{
"action_type": "click",
@@ -64,15 +37,161 @@ func TestExtractJSONFromContent(t *testing.T) {
],
"thought": "点击桌面上的抖音应用图标以启动抖音",
"error": null
}`,
},
{
name: "JSON with Chinese characters in strings",
content: `{
"actions": [
{
"action_type": "type",
"action_inputs": {
"content": "2048经典"
}
}
],
"thought": "搜索框已经清空了,现在我要输入\"2048经典\"这个关键词。看到键盘已经弹出来了,正好可以直接开始输入。这样一来,就能找到我们想要玩的那个小游戏了。",
"error": null
}`,
expected: `{
"actions": [
{
"action_type": "type",
"action_inputs": {
"content": "2048经典"
}
}
],
"thought": "搜索框已经清空了,现在我要输入\"2048经典\"这个关键词。看到键盘已经弹出来了,正好可以直接开始输入。这样一来,就能找到我们想要玩的那个小游戏了。",
"error": null
}`,
},
{
name: "JSON with markdown wrapper",
content: "```json\n" + `{
"actions": [
{
"action_type": "click",
"action_inputs": {
"start_box": [100, 200, 150, 250]
}
}
],
"thought": "点击按钮",
"error": null
}` + "\n```",
expected: `{
"actions": [
{
"action_type": "click",
"action_inputs": {
"start_box": [100, 200, 150, 250]
}
}
],
"thought": "点击按钮",
"error": null
}`,
},
{
name: "JSON embedded in text with Chinese",
content: `这是一个包含中文的响应:{
"actions": [
{
"action_type": "type",
"action_inputs": {
"content": "测试内容"
}
}
],
"thought": "这是一个测试思路",
"error": null
} 后面还有一些文本`,
expected: `{
"actions": [
{
"action_type": "type",
"action_inputs": {
"content": "测试内容"
}
}
],
"thought": "这是一个测试思路",
"error": null
}`,
},
{
name: "JSON with escaped quotes and Chinese",
content: `{
"actions": [
{
"action_type": "type",
"action_inputs": {
"content": "他说:\"你好,世界!\""
}
}
],
"thought": "输入包含引号的中文文本",
"error": null
}`,
expected: `{
"actions": [
{
"action_type": "type",
"action_inputs": {
"content": "他说:\"你好,世界!\""
}
}
],
"thought": "输入包含引号的中文文本",
"error": null
}`,
},
{
name: "no JSON content",
content: "这只是一些普通的文本没有JSON内容",
expected: "",
},
{
name: "nested JSON objects with Chinese",
content: `{
"actions": [
{
"action_type": "click",
"action_inputs": {
"start_box": [100, 200, 150, 250],
"metadata": {
"description": "点击操作",
"target": "按钮"
}
}
}
],
"thought": "执行嵌套对象的点击操作",
"error": null
}`,
expected: `{
"actions": [
{
"action_type": "click",
"action_inputs": {
"start_box": [100, 200, 150, 250],
"metadata": {
"description": "点击操作",
"target": "按钮"
}
}
}
],
"thought": "执行嵌套对象的点击操作",
"error": null
}`,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := extractJSONFromContent(tt.input)
t.Logf("Input: %s", tt.input)
t.Logf("Output: %s", result)
result := extractJSONFromContent(tt.content)
assert.Equal(t, tt.expected, result)
})
}