refactor: merge ActionSummary and Thought fields to eliminate duplication

- Remove redundant ActionSummary field from PlanningResult struct
- Update parsers to use unified Thought field instead of duplicate fields
- Modify chat interface to display Thought instead of ActionSummary
- Update planner logging to use thought instead of summary
- Adjust prompt templates to use thought field consistently
- Switch test LLM service from UI-TARS to DoubaoVL
- Add default parameter handling for sleep tool
2026-07-21 04:22:30 +08:00 · 2025-06-05 14:19:09 +08:00
parent 0864f74021
commit 0add3231ff
8 changed files with 24 additions and 26 deletions
--- a/uixt/ai/parser_default.go
+++ b/uixt/ai/parser_default.go
@@ -53,7 +53,7 @@ func (p *JSONContentParser) Parse(content string, size types.Size) (*PlanningRes
 	// Define a temporary struct to parse the expected JSON format
 	var jsonResponse struct {
 		Actions []Action `json:"actions"`
-		Summary string   `json:"summary"`
+		Thought string   `json:"thought"`
 		Error   string   `json:"error"`
 	}

@@ -95,9 +95,8 @@ func (p *JSONContentParser) Parse(content string, size types.Size) (*PlanningRes
 	toolCalls := convertActionsToToolCalls(normalizedActions, p.actionMapping)

 	return &PlanningResult{
-		ToolCalls:     toolCalls,
-		ActionSummary: jsonResponse.Summary,
-		Thought:       jsonResponse.Summary,
-		Content:       content,
+		ToolCalls: toolCalls,
+		Thought:   jsonResponse.Thought,
+		Content:   content,
 	}, nil
 }
--- a/uixt/ai/parser_ui_tars.go
+++ b/uixt/ai/parser_ui_tars.go
@@ -52,10 +52,9 @@ func (p *UITARSContentParser) Parse(content string, size types.Size) (*PlanningR
 	toolCalls := convertActionsToToolCalls(actions, p.actionMapping)

 	return &PlanningResult{
-		ToolCalls:     toolCalls,
-		ActionSummary: thought,
-		Thought:       thought,
-		Content:       content,
+		ToolCalls: toolCalls,
+		Thought:   thought,
+		Content:   content,
 	}, nil
 }

--- a/uixt/ai/planner.go
+++ b/uixt/ai/planner.go
@@ -27,11 +27,10 @@ type PlanningOptions struct {

 // PlanningResult represents the result of planning
 type PlanningResult struct {
-	ToolCalls     []schema.ToolCall `json:"tool_calls"`
-	ActionSummary string            `json:"summary"`
-	Thought       string            `json:"thought"`
-	Content       string            `json:"content"` // original content from model
-	Error         string            `json:"error,omitempty"`
+	ToolCalls []schema.ToolCall `json:"tool_calls"`
+	Thought   string            `json:"thought"`
+	Content   string            `json:"content"` // original content from model
+	Error     string            `json:"error,omitempty"`
 }

 func NewPlanner(ctx context.Context, modelConfig *ModelConfig) (*Planner, error) {
@@ -125,8 +124,8 @@ func (p *Planner) Call(ctx context.Context, opts *PlanningOptions) (*PlanningRes
 		})
 		// history will be appended with tool calls execution result
 		result := &PlanningResult{
-			ToolCalls:     message.ToolCalls,
-			ActionSummary: message.Content,
+			ToolCalls: message.ToolCalls,
+			Thought:   message.Content,
 		}
 		return result, nil
 	}
@@ -135,8 +134,8 @@ func (p *Planner) Call(ctx context.Context, opts *PlanningOptions) (*PlanningRes
 	result, err := p.parser.Parse(message.Content, opts.Size)
 	if err != nil {
 		result = &PlanningResult{
-			ActionSummary: message.Content,
-			Error:         err.Error(),
+			Thought: message.Content,
+			Error:   err.Error(),
 		}
 		log.Debug().Str("reason", err.Error()).Msg("parse content to actions failed")
 	}
@@ -147,7 +146,7 @@ func (p *Planner) Call(ctx context.Context, opts *PlanningOptions) (*PlanningRes
 	})

 	log.Info().
-		Interface("summary", result.ActionSummary).
+		Interface("thought", result.Thought).
 		Interface("tool_calls", result.ToolCalls).
 		Msg("get VLM planning result")
 	return result, nil
--- a/uixt/ai/planner_prompts.go
+++ b/uixt/ai/planner_prompts.go
@@ -98,7 +98,7 @@ Supporting actions:

 Field description:
 * The ` + "`start_box`" + ` and ` + "`end_box`" + ` fields represent the bounding box coordinates of the target element in 1000x1000 relative coordinate system.
-* Use Chinese in log and summary fields.
+* Use Chinese in log and thought fields.

 Return in JSON format:
 {
@@ -108,7 +108,7 @@ Return in JSON format:
      "action_inputs": { ... }
    }
  ],
-  "summary": "string", // Log what the next action you can do according to the screenshot and the instruction. Use Chinese.
+  "thought": "string", // Log what the next action you can do according to the screenshot and the instruction. Use Chinese.
  "error": "string" | null, // Error messages about unexpected situations, if any. Use Chinese.
 }

@@ -123,7 +123,7 @@ For example, when the instruction is "点击第二个帖子的作者头像", by
      }
    }
  ],
-  "summary": "点击第二个帖子的作者头像",
+  "thought": "点击第二个帖子的作者头像",
  "error": null
 }

--- a/uixt/android_test.go
+++ b/uixt/android_test.go
@@ -25,7 +25,7 @@ func setupADBDriverExt(t *testing.T) *XTDriver {
 	require.Nil(t, err)
 	driverExt, err := NewXTDriver(driver,
 		option.WithCVService(option.CVServiceTypeVEDEM),
-		option.WithLLMService(option.LLMServiceTypeUITARS),
+		option.WithLLMService(option.LLMServiceTypeDoubaoVL),
 	)
 	require.Nil(t, err)
 	return driverExt
--- a/uixt/mcp_tools_utility.go
+++ b/uixt/mcp_tools_utility.go
@@ -34,7 +34,8 @@ func (t *ToolSleep) Implement() server.ToolHandlerFunc {
 	return func(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) {
 		seconds, ok := request.Params.Arguments["seconds"]
 		if !ok {
-			return nil, fmt.Errorf("seconds parameter is required")
+			log.Warn().Msg("seconds parameter is required, using default value 5.0 seconds")
+			seconds = 5.0
 		}

 		// Sleep action logic