diff --git a/internal/version/VERSION b/internal/version/VERSION index 3b935b64..04063e33 100644 --- a/internal/version/VERSION +++ b/internal/version/VERSION @@ -1 +1 @@ -v5.0.0-beta-2506051328 +v5.0.0-beta-2506051419 diff --git a/mcphost/chat.go b/mcphost/chat.go index 60e45c59..633b9d28 100644 --- a/mcphost/chat.go +++ b/mcphost/chat.go @@ -136,7 +136,7 @@ func (c *Chat) runPrompt(ctx context.Context, prompt string) error { return c.handleToolCalls(ctx, toolCalls) } - c.renderContent("Assistant", result.ActionSummary) + c.renderContent("Assistant", result.Thought) return nil } diff --git a/uixt/ai/parser_default.go b/uixt/ai/parser_default.go index b950b4ce..38b4c3e5 100644 --- a/uixt/ai/parser_default.go +++ b/uixt/ai/parser_default.go @@ -53,7 +53,7 @@ func (p *JSONContentParser) Parse(content string, size types.Size) (*PlanningRes // Define a temporary struct to parse the expected JSON format var jsonResponse struct { Actions []Action `json:"actions"` - Summary string `json:"summary"` + Thought string `json:"thought"` Error string `json:"error"` } @@ -95,9 +95,8 @@ func (p *JSONContentParser) Parse(content string, size types.Size) (*PlanningRes toolCalls := convertActionsToToolCalls(normalizedActions, p.actionMapping) return &PlanningResult{ - ToolCalls: toolCalls, - ActionSummary: jsonResponse.Summary, - Thought: jsonResponse.Summary, - Content: content, + ToolCalls: toolCalls, + Thought: jsonResponse.Thought, + Content: content, }, nil } diff --git a/uixt/ai/parser_ui_tars.go b/uixt/ai/parser_ui_tars.go index 5d72040b..29781e94 100644 --- a/uixt/ai/parser_ui_tars.go +++ b/uixt/ai/parser_ui_tars.go @@ -52,10 +52,9 @@ func (p *UITARSContentParser) Parse(content string, size types.Size) (*PlanningR toolCalls := convertActionsToToolCalls(actions, p.actionMapping) return &PlanningResult{ - ToolCalls: toolCalls, - ActionSummary: thought, - Thought: thought, - Content: content, + ToolCalls: toolCalls, + Thought: thought, + Content: content, }, nil } diff --git a/uixt/ai/planner.go b/uixt/ai/planner.go index ead8fbed..bd71aec9 100644 --- a/uixt/ai/planner.go +++ b/uixt/ai/planner.go @@ -27,11 +27,10 @@ type PlanningOptions struct { // PlanningResult represents the result of planning type PlanningResult struct { - ToolCalls []schema.ToolCall `json:"tool_calls"` - ActionSummary string `json:"summary"` - Thought string `json:"thought"` - Content string `json:"content"` // original content from model - Error string `json:"error,omitempty"` + ToolCalls []schema.ToolCall `json:"tool_calls"` + Thought string `json:"thought"` + Content string `json:"content"` // original content from model + Error string `json:"error,omitempty"` } func NewPlanner(ctx context.Context, modelConfig *ModelConfig) (*Planner, error) { @@ -125,8 +124,8 @@ func (p *Planner) Call(ctx context.Context, opts *PlanningOptions) (*PlanningRes }) // history will be appended with tool calls execution result result := &PlanningResult{ - ToolCalls: message.ToolCalls, - ActionSummary: message.Content, + ToolCalls: message.ToolCalls, + Thought: message.Content, } return result, nil } @@ -135,8 +134,8 @@ func (p *Planner) Call(ctx context.Context, opts *PlanningOptions) (*PlanningRes result, err := p.parser.Parse(message.Content, opts.Size) if err != nil { result = &PlanningResult{ - ActionSummary: message.Content, - Error: err.Error(), + Thought: message.Content, + Error: err.Error(), } log.Debug().Str("reason", err.Error()).Msg("parse content to actions failed") } @@ -147,7 +146,7 @@ func (p *Planner) Call(ctx context.Context, opts *PlanningOptions) (*PlanningRes }) log.Info(). - Interface("summary", result.ActionSummary). + Interface("thought", result.Thought). Interface("tool_calls", result.ToolCalls). Msg("get VLM planning result") return result, nil diff --git a/uixt/ai/planner_prompts.go b/uixt/ai/planner_prompts.go index d939522d..dfe879b2 100644 --- a/uixt/ai/planner_prompts.go +++ b/uixt/ai/planner_prompts.go @@ -98,7 +98,7 @@ Supporting actions: Field description: * The ` + "`start_box`" + ` and ` + "`end_box`" + ` fields represent the bounding box coordinates of the target element in 1000x1000 relative coordinate system. -* Use Chinese in log and summary fields. +* Use Chinese in log and thought fields. Return in JSON format: { @@ -108,7 +108,7 @@ Return in JSON format: "action_inputs": { ... } } ], - "summary": "string", // Log what the next action you can do according to the screenshot and the instruction. Use Chinese. + "thought": "string", // Log what the next action you can do according to the screenshot and the instruction. Use Chinese. "error": "string" | null, // Error messages about unexpected situations, if any. Use Chinese. } @@ -123,7 +123,7 @@ For example, when the instruction is "点击第二个帖子的作者头像", by } } ], - "summary": "点击第二个帖子的作者头像", + "thought": "点击第二个帖子的作者头像", "error": null } diff --git a/uixt/android_test.go b/uixt/android_test.go index 24eef80b..b1c7b30d 100644 --- a/uixt/android_test.go +++ b/uixt/android_test.go @@ -25,7 +25,7 @@ func setupADBDriverExt(t *testing.T) *XTDriver { require.Nil(t, err) driverExt, err := NewXTDriver(driver, option.WithCVService(option.CVServiceTypeVEDEM), - option.WithLLMService(option.LLMServiceTypeUITARS), + option.WithLLMService(option.LLMServiceTypeDoubaoVL), ) require.Nil(t, err) return driverExt diff --git a/uixt/mcp_tools_utility.go b/uixt/mcp_tools_utility.go index 69c26030..b8940e4b 100644 --- a/uixt/mcp_tools_utility.go +++ b/uixt/mcp_tools_utility.go @@ -34,7 +34,8 @@ func (t *ToolSleep) Implement() server.ToolHandlerFunc { return func(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) { seconds, ok := request.Params.Arguments["seconds"] if !ok { - return nil, fmt.Errorf("seconds parameter is required") + log.Warn().Msg("seconds parameter is required, using default value 5.0 seconds") + seconds = 5.0 } // Sleep action logic