feat: implement ToolStartToGoal and fix LLM service initialization

- Add ToolStartToGoal implementation with AI-driven goal automation
- Fix the uninitialized LLM service issue by applying the global AI config when XTDriver is created
- Ensure XTDriver is created with proper AI services from the first initialization
- Add StartToGoal method to StepMobile for goal-oriented automation
- Register ToolStartToGoal in MCP server and add corresponding action type
- Add comprehensive test case for StartToGoal functionality
- Fix ReturnSchema consistency across AI tools (StartToGoal, AIAction, Finished)
- Extract AI service options in MCP argument processing

This resolves the root cause, where XTDriver was created without AI services in runStepMobileUI, and ensures that XTDriver is initialized only once, with the complete AI service configuration.
2026-06-27 18:41:33 +08:00 · 2025-06-05 16:52:11 +08:00
parent 0add3231ff
commit c4e7ab00a7
7 changed files with 199 additions and 19 deletions
--- a/uixt/cache.go
+++ b/uixt/cache.go
@@ -279,9 +279,23 @@ func setupXTDriver(_ context.Context, args map[string]any) (*XTDriver, error) {
 	platform, _ := args["platform"].(string)
 	serial, _ := args["serial"].(string)

+	// Extract AI service options from arguments if provided
+	var aiOpts []option.AIServiceOption
+
+	// Check for LLM service type
+	if llmService, ok := args["llm_service"].(string); ok && llmService != "" {
+		aiOpts = append(aiOpts, option.WithLLMService(option.LLMServiceType(llmService)))
+	}
+
+	// Check for CV service type
+	if cvService, ok := args["cv_service"].(string); ok && cvService != "" {
+		aiOpts = append(aiOpts, option.WithCVService(option.CVServiceType(cvService)))
+	}
+
 	config := DriverCacheConfig{
-		Platform: platform,
-		Serial:   serial,
+		Platform:  platform,
+		Serial:    serial,
+		AIOptions: aiOpts,
 	}
 	return GetOrCreateXTDriver(config)
 }
--- a/uixt/mcp_server.go
+++ b/uixt/mcp_server.go
@@ -121,6 +121,7 @@ func (s *MCPServer4XTDriver) registerTools() {
 	s.registerTool(&ToolWebCloseTab{})

 	// AI Tools
+	s.registerTool(&ToolStartToGoal{})
 	s.registerTool(&ToolAIAction{})
 	s.registerTool(&ToolFinished{})
 }
@@ -214,6 +215,14 @@ func extractActionOptionsToArguments(actionOptions []option.ActionOption, argume
 	if tempOptions.PressDuration > 0 {
 		arguments["press_duration"] = tempOptions.PressDuration
 	}
+
+	// Add AI service options
+	if tempOptions.LLMService != "" {
+		arguments["llm_service"] = tempOptions.LLMService
+	}
+	if tempOptions.CVService != "" {
+		arguments["cv_service"] = tempOptions.CVService
+	}
 }

 func getFloat64ValueOrDefault(value float64, defaultValue float64) float64 {
--- a/uixt/mcp_tools_ai.go
+++ b/uixt/mcp_tools_ai.go
@@ -10,6 +10,65 @@ import (
 	"github.com/rs/zerolog/log"
 )

+// ToolStartToGoal implements the start_to_goal tool call.
+type ToolStartToGoal struct{}
+
+func (t *ToolStartToGoal) Name() option.ActionName {
+	return option.ACTION_StartToGoal
+}
+
+func (t *ToolStartToGoal) Description() string {
+	return "Start AI-driven automation to achieve a specific goal using natural language description"
+}
+
+func (t *ToolStartToGoal) Options() []mcp.ToolOption {
+	unifiedReq := &option.ActionOptions{}
+	return unifiedReq.GetMCPOptions(option.ACTION_StartToGoal)
+}
+
+func (t *ToolStartToGoal) Implement() server.ToolHandlerFunc {
+	return func(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) {
+		driverExt, err := setupXTDriver(ctx, request.Params.Arguments)
+		if err != nil {
+			return nil, fmt.Errorf("setup driver failed: %w", err)
+		}
+
+		unifiedReq, err := parseActionOptions(request.Params.Arguments)
+		if err != nil {
+			return nil, err
+		}
+
+		// Start to goal logic
+		log.Info().Str("prompt", unifiedReq.Prompt).Msg("starting to goal")
+		err = driverExt.StartToGoal(unifiedReq.Prompt)
+		if err != nil {
+			return mcp.NewToolResultError(fmt.Sprintf("Failed to achieve goal: %s", err.Error())), nil
+		}
+
+		return mcp.NewToolResultText(fmt.Sprintf("Successfully achieved goal: %s", unifiedReq.Prompt)), nil
+	}
+}
+
+func (t *ToolStartToGoal) ConvertActionToCallToolRequest(action option.MobileAction) (mcp.CallToolRequest, error) {
+	if prompt, ok := action.Params.(string); ok {
+		arguments := map[string]any{
+			"prompt": prompt,
+		}
+
+		// Extract options to arguments
+		extractActionOptionsToArguments(action.GetOptions(), arguments)
+
+		return buildMCPCallToolRequest(t.Name(), arguments), nil
+	}
+	return mcp.CallToolRequest{}, fmt.Errorf("invalid start to goal params: %v", action.Params)
+}
+
+func (t *ToolStartToGoal) ReturnSchema() map[string]string {
+	return map[string]string{
+		"message": "string: Success message confirming goal was achieved, or error message if failed",
+	}
+}
+
 // ToolAIAction implements the ai_action tool call.
 type ToolAIAction struct{}

@@ -54,6 +113,10 @@ func (t *ToolAIAction) ConvertActionToCallToolRequest(action option.MobileAction
 		arguments := map[string]any{
 			"prompt": prompt,
 		}
+
+		// Extract options to arguments
+		extractActionOptionsToArguments(action.GetOptions(), arguments)
+
 		return buildMCPCallToolRequest(t.Name(), arguments), nil
 	}
 	return mcp.CallToolRequest{}, fmt.Errorf("invalid AI action params: %v", action.Params)
@@ -61,9 +124,7 @@ func (t *ToolAIAction) ConvertActionToCallToolRequest(action option.MobileAction

 func (t *ToolAIAction) ReturnSchema() map[string]string {
 	return map[string]string{
-		"message":     "string: Success message confirming AI action was performed",
-		"prompt":      "string: Natural language prompt that was processed",
-		"actionTaken": "string: Description of the specific action that was taken by AI",
+		"message": "string: Success message confirming AI action was performed, or error message if failed",
 	}
 }

@@ -107,8 +168,6 @@ func (t *ToolFinished) ConvertActionToCallToolRequest(action option.MobileAction

 func (t *ToolFinished) ReturnSchema() map[string]string {
 	return map[string]string{
-		"message":       "string: Success message confirming task completion",
-		"content":       "string: Completion reason or result description",
-		"taskCompleted": "bool: Boolean indicating task was successfully finished",
+		"message": "string: Success message confirming task completion, or error message if failed",
 	}
 }
--- a/uixt/option/action.go
+++ b/uixt/option/action.go
@@ -73,7 +73,6 @@ const (
 	ACTION_KeyCode                  ActionName = "keycode"
 	ACTION_Delete                   ActionName = "delete"    // delete action
 	ACTION_Backspace                ActionName = "backspace" // backspace action
-	ACTION_AIAction                 ActionName = "ai_action" // action with ai
 	ACTION_TapBySelector            ActionName = "tap_by_selector"
 	ACTION_HoverBySelector          ActionName = "hover_by_selector"
 	ACTION_Hover                    ActionName = "hover"       // generic hover action
@@ -101,9 +100,13 @@ const (
 	ACTION_InstallApp      ActionName = "install_app"
 	ACTION_UninstallApp    ActionName = "uninstall_app"
 	ACTION_DownloadApp     ActionName = "download_app"
-	ACTION_Finished        ActionName = "finished"
 	ACTION_CallFunction    ActionName = "call_function"

+	// AI actions
+	ACTION_StartToGoal ActionName = "start_to_goal" // start to goal action
+	ACTION_AIAction    ActionName = "ai_action"     // action with ai
+	ACTION_Finished    ActionName = "finished"      // finished action
+
 	// anti-risk actions
 	ACTION_SetTouchInfo     ActionName = "set_touch_info"
 	ACTION_SetTouchInfoList ActionName = "set_touch_info_list"
@@ -178,8 +181,10 @@ type ActionOptions struct {
 	Params []float64 `json:"params,omitempty" yaml:"params,omitempty" desc:"Generic parameter array"`

 	// AI related
-	Prompt  string `json:"prompt,omitempty" yaml:"prompt,omitempty" desc:"AI action prompt"`
-	Content string `json:"content,omitempty" yaml:"content,omitempty" desc:"Content for finished action"`
+	Prompt     string `json:"prompt,omitempty" yaml:"prompt,omitempty" desc:"AI action prompt"`
+	Content    string `json:"content,omitempty" yaml:"content,omitempty" desc:"Content for finished action"`
+	LLMService string `json:"llm_service,omitempty" yaml:"llm_service,omitempty" desc:"LLM service type for AI actions"`
+	CVService  string `json:"cv_service,omitempty" yaml:"cv_service,omitempty" desc:"Computer vision service type for AI actions"`

 	// Time related
 	Seconds      float64 `json:"seconds,omitempty" yaml:"seconds,omitempty" desc:"Sleep duration in seconds"`
@@ -679,6 +684,9 @@ func (o *ActionOptions) validateActionSpecificFields(actionType ActionName) erro
 		ACTION_AIAction: func() error {
 			return o.requireFields("prompt", o.Prompt != "")
 		},
+		ACTION_StartToGoal: func() error {
+			return o.requireFields("prompt", o.Prompt != "")
+		},
 		ACTION_Finished: func() error {
 			return o.requireFields("content", o.Content != "")
 		},
@@ -750,7 +758,8 @@ func (o *ActionOptions) GetMCPOptions(actionType ActionName) []mcp.ToolOption {
 		ACTION_Sleep:                    {"seconds"},
 		ACTION_SleepMS:                  {"platform", "serial", "milliseconds"},
 		ACTION_SleepRandom:              {"platform", "serial", "params"},
-		ACTION_AIAction:                 {"platform", "serial", "prompt"},
+		ACTION_AIAction:                 {"platform", "serial", "prompt", "llm_service", "cv_service"},
+		ACTION_StartToGoal:              {"platform", "serial", "prompt", "llm_service", "cv_service"},
 		ACTION_Finished:                 {"content"},
 		ACTION_ListAvailableDevices:     {},
 		ACTION_SelectDevice:             {"platform", "serial"},