fix: improve JSON extraction to handle UTF-8 Chinese characters properly

- Replace byte-based brace counting with UTF-8 aware rune iteration
- Add proper string state tracking to handle escaped quotes
- Add comprehensive test cases for Chinese character handling
- Fix parsing errors when JSON contains Chinese text like 2048经典
This commit is contained in:
lilong.129
2025-06-10 16:05:43 +08:00
parent 88ae8faee1
commit c322d7c36c
4 changed files with 193 additions and 47 deletions

View File

@@ -9,39 +9,12 @@ import (
func TestExtractJSONFromContent(t *testing.T) {
tests := []struct {
name string
input string
content string
expected string
}{
{
name: "valid JSON",
input: `{"content": "test", "thought": "test"}`,
expected: `{"content": "test", "thought": "test"}`,
},
{
name: "JSON in markdown",
input: "```json\n{\n \"content\": \"test\"\n}\n```",
expected: `{
"content": "test"
}`,
},
{
name: "incomplete JSON without closing brace",
input: `{"content": "incomplete json"`,
expected: "",
},
{
name: "incomplete JSON with missing closing brace",
input: `{"content": "incomplete json", "missing_closing_brace": true`,
expected: "",
},
{
name: "plain text",
input: "This is just plain text",
expected: "",
},
{
name: "complex nested JSON with arrays",
input: `{
name: "simple JSON",
content: `{
"actions": [
{
"action_type": "click",
@@ -64,15 +37,161 @@ func TestExtractJSONFromContent(t *testing.T) {
],
"thought": "点击桌面上的抖音应用图标以启动抖音",
"error": null
}`,
},
{
name: "JSON with Chinese characters in strings",
content: `{
"actions": [
{
"action_type": "type",
"action_inputs": {
"content": "2048经典"
}
}
],
"thought": "搜索框已经清空了,现在我要输入\"2048经典\"这个关键词。看到键盘已经弹出来了,正好可以直接开始输入。这样一来,就能找到我们想要玩的那个小游戏了。",
"error": null
}`,
expected: `{
"actions": [
{
"action_type": "type",
"action_inputs": {
"content": "2048经典"
}
}
],
"thought": "搜索框已经清空了,现在我要输入\"2048经典\"这个关键词。看到键盘已经弹出来了,正好可以直接开始输入。这样一来,就能找到我们想要玩的那个小游戏了。",
"error": null
}`,
},
{
name: "JSON with markdown wrapper",
content: "```json\n" + `{
"actions": [
{
"action_type": "click",
"action_inputs": {
"start_box": [100, 200, 150, 250]
}
}
],
"thought": "点击按钮",
"error": null
}` + "\n```",
expected: `{
"actions": [
{
"action_type": "click",
"action_inputs": {
"start_box": [100, 200, 150, 250]
}
}
],
"thought": "点击按钮",
"error": null
}`,
},
{
name: "JSON embedded in text with Chinese",
content: `这是一个包含中文的响应:{
"actions": [
{
"action_type": "type",
"action_inputs": {
"content": "测试内容"
}
}
],
"thought": "这是一个测试思路",
"error": null
} 后面还有一些文本`,
expected: `{
"actions": [
{
"action_type": "type",
"action_inputs": {
"content": "测试内容"
}
}
],
"thought": "这是一个测试思路",
"error": null
}`,
},
{
name: "JSON with escaped quotes and Chinese",
content: `{
"actions": [
{
"action_type": "type",
"action_inputs": {
"content": "他说:\"你好,世界!\""
}
}
],
"thought": "输入包含引号的中文文本",
"error": null
}`,
expected: `{
"actions": [
{
"action_type": "type",
"action_inputs": {
"content": "他说:\"你好,世界!\""
}
}
],
"thought": "输入包含引号的中文文本",
"error": null
}`,
},
{
name: "no JSON content",
content: "这只是一些普通的文本没有JSON内容",
expected: "",
},
{
name: "nested JSON objects with Chinese",
content: `{
"actions": [
{
"action_type": "click",
"action_inputs": {
"start_box": [100, 200, 150, 250],
"metadata": {
"description": "点击操作",
"target": "按钮"
}
}
}
],
"thought": "执行嵌套对象的点击操作",
"error": null
}`,
expected: `{
"actions": [
{
"action_type": "click",
"action_inputs": {
"start_box": [100, 200, 150, 250],
"metadata": {
"description": "点击操作",
"target": "按钮"
}
}
}
],
"thought": "执行嵌套对象的点击操作",
"error": null
}`,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := extractJSONFromContent(tt.input)
t.Logf("Input: %s", tt.input)
t.Logf("Output: %s", result)
result := extractJSONFromContent(tt.content)
assert.Equal(t, tt.expected, result)
})
}