feat: implement ToolStartToGoal and fix LLM service initialization

- Add ToolStartToGoal implementation with AI-driven goal automation
- Fix the uninitialized LLM service by applying the global AI config when the XTDriver is created
- Ensure XTDriver is created with proper AI services from the first initialization
- Add StartToGoal method to StepMobile for goal-oriented automation
- Register ToolStartToGoal in MCP server and add corresponding action type
- Add comprehensive test case for StartToGoal functionality
- Fix ReturnSchema consistency across AI tools (StartToGoal, AIAction, Finished)
- Extract AI service options in MCP argument processing

This resolves the root cause of the issue: runStepMobileUI created the XTDriver
without AI services. The XTDriver is now initialized only once, with the
complete AI service configuration.
This commit is contained in:
lilong.129
2025-06-05 16:52:11 +08:00
parent 0add3231ff
commit c4e7ab00a7
7 changed files with 199 additions and 19 deletions

View File

@@ -279,9 +279,23 @@ func setupXTDriver(_ context.Context, args map[string]any) (*XTDriver, error) {
platform, _ := args["platform"].(string)
serial, _ := args["serial"].(string)
// Extract AI service options from arguments if provided
var aiOpts []option.AIServiceOption
// Check for LLM service type
if llmService, ok := args["llm_service"].(string); ok && llmService != "" {
aiOpts = append(aiOpts, option.WithLLMService(option.LLMServiceType(llmService)))
}
// Check for CV service type
if cvService, ok := args["cv_service"].(string); ok && cvService != "" {
aiOpts = append(aiOpts, option.WithCVService(option.CVServiceType(cvService)))
}
config := DriverCacheConfig{
Platform: platform,
Serial: serial,
Platform: platform,
Serial: serial,
AIOptions: aiOpts,
}
return GetOrCreateXTDriver(config)
}

View File

@@ -121,6 +121,7 @@ func (s *MCPServer4XTDriver) registerTools() {
s.registerTool(&ToolWebCloseTab{})
// AI Tools
s.registerTool(&ToolStartToGoal{})
s.registerTool(&ToolAIAction{})
s.registerTool(&ToolFinished{})
}
@@ -214,6 +215,14 @@ func extractActionOptionsToArguments(actionOptions []option.ActionOption, argume
if tempOptions.PressDuration > 0 {
arguments["press_duration"] = tempOptions.PressDuration
}
// Add AI service options
if tempOptions.LLMService != "" {
arguments["llm_service"] = tempOptions.LLMService
}
if tempOptions.CVService != "" {
arguments["cv_service"] = tempOptions.CVService
}
}
func getFloat64ValueOrDefault(value float64, defaultValue float64) float64 {

View File

@@ -10,6 +10,65 @@ import (
"github.com/rs/zerolog/log"
)
// ToolStartToGoal implements the start_to_goal tool call.
// It carries no state; its methods supply the tool's name, description,
// MCP options, request handler, action conversion, and return schema.
type ToolStartToGoal struct{}
// Name reports the action name under which this tool is exposed.
func (t *ToolStartToGoal) Name() option.ActionName {
	const name = option.ACTION_StartToGoal
	return name
}
// Description returns the human-readable summary of the tool.
func (t *ToolStartToGoal) Description() string {
	const description = "Start AI-driven automation to achieve a specific goal using natural language description"
	return description
}
// Options returns the MCP tool options produced by ActionOptions.GetMCPOptions
// for the start_to_goal action, called on a zero-value ActionOptions.
func (t *ToolStartToGoal) Options() []mcp.ToolOption {
	return (&option.ActionOptions{}).GetMCPOptions(option.ACTION_StartToGoal)
}
// Implement returns the handler that serves start_to_goal requests.
//
// The handler obtains a driver via setupXTDriver and parses the request
// arguments via parseActionOptions; a failure in either step is returned as
// a Go error. It then calls StartToGoal with the parsed prompt. A failure
// there is reported as a tool error result (with a nil Go error), and
// success is reported as a text result.
func (t *ToolStartToGoal) Implement() server.ToolHandlerFunc {
	handler := func(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) {
		args := request.Params.Arguments

		xtDriver, setupErr := setupXTDriver(ctx, args)
		if setupErr != nil {
			return nil, fmt.Errorf("setup driver failed: %w", setupErr)
		}

		opts, parseErr := parseActionOptions(args)
		if parseErr != nil {
			return nil, parseErr
		}

		// Drive the automation toward the requested goal.
		goal := opts.Prompt
		log.Info().Str("prompt", goal).Msg("starting to goal")
		if runErr := xtDriver.StartToGoal(goal); runErr != nil {
			return mcp.NewToolResultError(fmt.Sprintf("Failed to achieve goal: %s", runErr.Error())), nil
		}

		return mcp.NewToolResultText(fmt.Sprintf("Successfully achieved goal: %s", goal)), nil
	}
	return handler
}
// ConvertActionToCallToolRequest builds the MCP call request for a
// start_to_goal action. action.Params must be a string holding the prompt;
// any other type yields an error. The action's options are merged into the
// request arguments via extractActionOptionsToArguments.
func (t *ToolStartToGoal) ConvertActionToCallToolRequest(action option.MobileAction) (mcp.CallToolRequest, error) {
	prompt, isString := action.Params.(string)
	if !isString {
		return mcp.CallToolRequest{}, fmt.Errorf("invalid start to goal params: %v", action.Params)
	}

	arguments := map[string]any{"prompt": prompt}
	extractActionOptionsToArguments(action.GetOptions(), arguments)

	return buildMCPCallToolRequest(t.Name(), arguments), nil
}
// ReturnSchema describes the fields of the tool's result: a single
// "message" entry.
func (t *ToolStartToGoal) ReturnSchema() map[string]string {
	schema := make(map[string]string, 1)
	schema["message"] = "string: Success message confirming goal was achieved, or error message if failed"
	return schema
}
// ToolAIAction implements the ai_action tool call.
type ToolAIAction struct{}
@@ -54,6 +113,10 @@ func (t *ToolAIAction) ConvertActionToCallToolRequest(action option.MobileAction
arguments := map[string]any{
"prompt": prompt,
}
// Extract options to arguments
extractActionOptionsToArguments(action.GetOptions(), arguments)
return buildMCPCallToolRequest(t.Name(), arguments), nil
}
return mcp.CallToolRequest{}, fmt.Errorf("invalid AI action params: %v", action.Params)
@@ -61,9 +124,7 @@ func (t *ToolAIAction) ConvertActionToCallToolRequest(action option.MobileAction
// ReturnSchema describes the fields of the ai_action tool's result.
//
// The schema holds a single "message" entry. A map literal may not repeat a
// constant key, so only one "message" description is kept, and it covers
// both the success and the failure case.
func (t *ToolAIAction) ReturnSchema() map[string]string {
	return map[string]string{
		"message": "string: Success message confirming AI action was performed, or error message if failed",
	}
}
@@ -107,8 +168,6 @@ func (t *ToolFinished) ConvertActionToCallToolRequest(action option.MobileAction
// ReturnSchema describes the fields of the finished tool's result.
//
// The schema holds a single "message" entry. A map literal may not repeat a
// constant key, so only one "message" description is kept, and it covers
// both the success and the failure case.
func (t *ToolFinished) ReturnSchema() map[string]string {
	return map[string]string{
		"message": "string: Success message confirming task completion, or error message if failed",
	}
}

View File

@@ -73,7 +73,6 @@ const (
ACTION_KeyCode ActionName = "keycode"
ACTION_Delete ActionName = "delete" // delete action
ACTION_Backspace ActionName = "backspace" // backspace action
ACTION_AIAction ActionName = "ai_action" // action with ai
ACTION_TapBySelector ActionName = "tap_by_selector"
ACTION_HoverBySelector ActionName = "hover_by_selector"
ACTION_Hover ActionName = "hover" // generic hover action
@@ -101,9 +100,13 @@ const (
ACTION_InstallApp ActionName = "install_app"
ACTION_UninstallApp ActionName = "uninstall_app"
ACTION_DownloadApp ActionName = "download_app"
ACTION_Finished ActionName = "finished"
ACTION_CallFunction ActionName = "call_function"
// AI actions
ACTION_StartToGoal ActionName = "start_to_goal" // start to goal action
ACTION_AIAction ActionName = "ai_action" // action with ai
ACTION_Finished ActionName = "finished" // finished action
// anti-risk actions
ACTION_SetTouchInfo ActionName = "set_touch_info"
ACTION_SetTouchInfoList ActionName = "set_touch_info_list"
@@ -178,8 +181,10 @@ type ActionOptions struct {
Params []float64 `json:"params,omitempty" yaml:"params,omitempty" desc:"Generic parameter array"`
// AI related
Prompt string `json:"prompt,omitempty" yaml:"prompt,omitempty" desc:"AI action prompt"`
Content string `json:"content,omitempty" yaml:"content,omitempty" desc:"Content for finished action"`
Prompt string `json:"prompt,omitempty" yaml:"prompt,omitempty" desc:"AI action prompt"`
Content string `json:"content,omitempty" yaml:"content,omitempty" desc:"Content for finished action"`
LLMService string `json:"llm_service,omitempty" yaml:"llm_service,omitempty" desc:"LLM service type for AI actions"`
CVService string `json:"cv_service,omitempty" yaml:"cv_service,omitempty" desc:"Computer vision service type for AI actions"`
// Time related
Seconds float64 `json:"seconds,omitempty" yaml:"seconds,omitempty" desc:"Sleep duration in seconds"`
@@ -679,6 +684,9 @@ func (o *ActionOptions) validateActionSpecificFields(actionType ActionName) erro
ACTION_AIAction: func() error {
return o.requireFields("prompt", o.Prompt != "")
},
ACTION_StartToGoal: func() error {
return o.requireFields("prompt", o.Prompt != "")
},
ACTION_Finished: func() error {
return o.requireFields("content", o.Content != "")
},
@@ -750,7 +758,8 @@ func (o *ActionOptions) GetMCPOptions(actionType ActionName) []mcp.ToolOption {
ACTION_Sleep: {"seconds"},
ACTION_SleepMS: {"platform", "serial", "milliseconds"},
ACTION_SleepRandom: {"platform", "serial", "params"},
ACTION_AIAction: {"platform", "serial", "prompt"},
ACTION_AIAction: {"platform", "serial", "prompt", "llm_service", "cv_service"},
ACTION_StartToGoal: {"platform", "serial", "prompt", "llm_service", "cv_service"},
ACTION_Finished: {"content"},
ACTION_ListAvailableDevices: {},
ACTION_SelectDevice: {"platform", "serial"},