refactor: merge ActionSummary and Thought fields to eliminate duplication

- Remove redundant ActionSummary field from PlanningResult struct
- Update parsers to use unified Thought field instead of duplicate fields
- Modify chat interface to display Thought instead of ActionSummary
- Update planner logging to use thought instead of summary
- Adjust prompt templates to use thought field consistently
- Switch test LLM service from UI-TARS to DoubaoVL
- Make the sleep tool fall back to a default of 5 seconds (with a warning) when the seconds parameter is missing, instead of returning an error
This commit is contained in:
lilong.129
2025-06-05 14:19:09 +08:00
parent 0864f74021
commit 0add3231ff
8 changed files with 24 additions and 26 deletions

View File

@@ -53,7 +53,7 @@ func (p *JSONContentParser) Parse(content string, size types.Size) (*PlanningRes
// Define a temporary struct to parse the expected JSON format
var jsonResponse struct {
Actions []Action `json:"actions"`
Summary string `json:"summary"`
Thought string `json:"thought"`
Error string `json:"error"`
}
@@ -95,9 +95,8 @@ func (p *JSONContentParser) Parse(content string, size types.Size) (*PlanningRes
toolCalls := convertActionsToToolCalls(normalizedActions, p.actionMapping)
return &PlanningResult{
ToolCalls: toolCalls,
ActionSummary: jsonResponse.Summary,
Thought: jsonResponse.Summary,
Content: content,
ToolCalls: toolCalls,
Thought: jsonResponse.Thought,
Content: content,
}, nil
}

View File

@@ -52,10 +52,9 @@ func (p *UITARSContentParser) Parse(content string, size types.Size) (*PlanningR
toolCalls := convertActionsToToolCalls(actions, p.actionMapping)
return &PlanningResult{
ToolCalls: toolCalls,
ActionSummary: thought,
Thought: thought,
Content: content,
ToolCalls: toolCalls,
Thought: thought,
Content: content,
}, nil
}

View File

@@ -27,11 +27,10 @@ type PlanningOptions struct {
// PlanningResult represents the result of planning
type PlanningResult struct {
ToolCalls []schema.ToolCall `json:"tool_calls"`
ActionSummary string `json:"summary"`
Thought string `json:"thought"`
Content string `json:"content"` // original content from model
Error string `json:"error,omitempty"`
ToolCalls []schema.ToolCall `json:"tool_calls"`
Thought string `json:"thought"`
Content string `json:"content"` // original content from model
Error string `json:"error,omitempty"`
}
func NewPlanner(ctx context.Context, modelConfig *ModelConfig) (*Planner, error) {
@@ -125,8 +124,8 @@ func (p *Planner) Call(ctx context.Context, opts *PlanningOptions) (*PlanningRes
})
// history will be appended with tool calls execution result
result := &PlanningResult{
ToolCalls: message.ToolCalls,
ActionSummary: message.Content,
ToolCalls: message.ToolCalls,
Thought: message.Content,
}
return result, nil
}
@@ -135,8 +134,8 @@ func (p *Planner) Call(ctx context.Context, opts *PlanningOptions) (*PlanningRes
result, err := p.parser.Parse(message.Content, opts.Size)
if err != nil {
result = &PlanningResult{
ActionSummary: message.Content,
Error: err.Error(),
Thought: message.Content,
Error: err.Error(),
}
log.Debug().Str("reason", err.Error()).Msg("parse content to actions failed")
}
@@ -147,7 +146,7 @@ func (p *Planner) Call(ctx context.Context, opts *PlanningOptions) (*PlanningRes
})
log.Info().
Interface("summary", result.ActionSummary).
Interface("thought", result.Thought).
Interface("tool_calls", result.ToolCalls).
Msg("get VLM planning result")
return result, nil

View File

@@ -98,7 +98,7 @@ Supporting actions:
Field description:
* The ` + "`start_box`" + ` and ` + "`end_box`" + ` fields represent the bounding box coordinates of the target element in 1000x1000 relative coordinate system.
* Use Chinese in log and summary fields.
* Use Chinese in log and thought fields.
Return in JSON format:
{
@@ -108,7 +108,7 @@ Return in JSON format:
"action_inputs": { ... }
}
],
"summary": "string", // Log what the next action you can do according to the screenshot and the instruction. Use Chinese.
"thought": "string", // Log what the next action you can do according to the screenshot and the instruction. Use Chinese.
"error": "string" | null, // Error messages about unexpected situations, if any. Use Chinese.
}
@@ -123,7 +123,7 @@ For example, when the instruction is "点击第二个帖子的作者头像", by
}
}
],
"summary": "点击第二个帖子的作者头像",
"thought": "点击第二个帖子的作者头像",
"error": null
}

View File

@@ -25,7 +25,7 @@ func setupADBDriverExt(t *testing.T) *XTDriver {
require.Nil(t, err)
driverExt, err := NewXTDriver(driver,
option.WithCVService(option.CVServiceTypeVEDEM),
option.WithLLMService(option.LLMServiceTypeUITARS),
option.WithLLMService(option.LLMServiceTypeDoubaoVL),
)
require.Nil(t, err)
return driverExt

View File

@@ -34,7 +34,8 @@ func (t *ToolSleep) Implement() server.ToolHandlerFunc {
return func(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) {
seconds, ok := request.Params.Arguments["seconds"]
if !ok {
return nil, fmt.Errorf("seconds parameter is required")
log.Warn().Msg("seconds parameter is required, using default value 5.0 seconds")
seconds = 5.0
}
// Sleep action logic