From ec4f1eb68a9a3ed124819390b6380d5e04202ce2 Mon Sep 17 00:00:00 2001 From: "lilong.129" Date: Sat, 7 Jun 2025 23:59:07 +0800 Subject: [PATCH] refactor: unify action execution interface and merge AI action handling --- internal/version/VERSION | 2 +- server/uixt.go | 4 +- step.go | 8 ++- step_ui.go | 4 +- uixt/driver_ext_ai.go | 130 +++++++++++++++++++++++++------------ uixt/driver_ext_ai_test.go | 5 +- uixt/mcp_tools_ai.go | 16 +++-- uixt/sdk.go | 95 ++++++++++++++++++++++++--- 8 files changed, 199 insertions(+), 65 deletions(-) diff --git a/internal/version/VERSION b/internal/version/VERSION index 23bba711..f78c8361 100644 --- a/internal/version/VERSION +++ b/internal/version/VERSION @@ -1 +1 @@ -v5.0.0-beta-2506072045 +v5.0.0-beta-2506072359 diff --git a/server/uixt.go b/server/uixt.go index 6a81b229..9cdb0e16 100644 --- a/server/uixt.go +++ b/server/uixt.go @@ -19,7 +19,7 @@ func (r *Router) uixtActionHandler(c *gin.Context) { return } - if err = dExt.ExecuteAction(c.Request.Context(), req); err != nil { + if _, err = dExt.ExecuteAction(c.Request.Context(), req); err != nil { log.Err(err).Interface("action", req). Msg("exec uixt action failed") RenderError(c, err) @@ -42,7 +42,7 @@ func (r *Router) uixtActionsHandler(c *gin.Context) { } for _, action := range actions { - if err = dExt.ExecuteAction(c.Request.Context(), action); err != nil { + if _, err = dExt.ExecuteAction(c.Request.Context(), action); err != nil { log.Err(err).Interface("action", action). Msg("exec uixt action failed") RenderError(c, err) diff --git a/step.go b/step.go index a23daf9c..609b351d 100644 --- a/step.go +++ b/step.go @@ -1,6 +1,7 @@ package hrp import ( + "github.com/httprunner/httprunner/v5/uixt" "github.com/httprunner/httprunner/v5/uixt/option" "github.com/httprunner/httprunner/v5/uixt/types" ) @@ -58,9 +59,10 @@ type TStep struct { // one step contains one or multiple actions type ActionResult struct { option.MobileAction `json:",inline"` - StartTime int64 `json:"start_time"` // action start time - Elapsed int64 `json:"elapsed_ms"` // action elapsed time(ms) - Error error `json:"error"` // action execution result + StartTime int64 `json:"start_time"` // action start time + Elapsed int64 `json:"elapsed_ms"` // action elapsed time(ms) + Error error `json:"error"` // action execution result + SubActions []*uixt.SubActionResult `json:"sub_actions,omitempty"` // store sub-actions } // one testcase contains one or multiple steps diff --git a/step_ui.go b/step_ui.go index aedcd798..f6ce4914 100644 --- a/step_ui.go +++ b/step_ui.go @@ -907,8 +907,10 @@ func runStepMobileUI(s *SessionRunner, step IStep) (stepResult *StepResult, err } }() - err = uiDriver.ExecuteAction(ctx, action) + // action execution + subActionResults, err := uiDriver.ExecuteAction(ctx, action) actionResult.Elapsed = time.Since(actionStartTime).Milliseconds() + actionResult.SubActions = subActionResults stepResult.Actions = append(stepResult.Actions, actionResult) if err != nil { if !code.IsErrorPredefined(err) { diff --git a/uixt/driver_ext_ai.go b/uixt/driver_ext_ai.go index eabc5bdc..239c604a 100644 --- a/uixt/driver_ext_ai.go +++ b/uixt/driver_ext_ai.go @@ -3,13 +3,12 @@ package uixt import ( "context" "encoding/base64" - "fmt" - "path/filepath" + "strings" + "time" "github.com/cloudwego/eino/schema" "github.com/httprunner/httprunner/v5/code" "github.com/httprunner/httprunner/v5/internal/builtin" - "github.com/httprunner/httprunner/v5/internal/config" "github.com/httprunner/httprunner/v5/internal/json" "github.com/httprunner/httprunner/v5/uixt/ai" "github.com/httprunner/httprunner/v5/uixt/option" @@ -18,10 +17,11 @@ import ( "github.com/rs/zerolog/log" ) -func (dExt *XTDriver) StartToGoal(ctx context.Context, prompt string, opts ...option.ActionOption) error { +func (dExt *XTDriver) StartToGoal(ctx context.Context, prompt string, opts ...option.ActionOption) ([]*SubActionResult, error) { options := option.NewActionOptions(opts...) log.Info().Int("max_retry_times", options.MaxRetryTimes).Msg("StartToGoal") + var allSubActions []*SubActionResult var attempt int for { attempt++ @@ -31,7 +31,7 @@ func (dExt *XTDriver) StartToGoal(ctx context.Context, prompt string, opts ...op select { case <-ctx.Done(): log.Warn().Msg("interrupted in StartToGoal") - return errors.Wrap(code.InterruptError, "StartToGoal interrupted") + return allSubActions, errors.Wrap(code.InterruptError, "StartToGoal interrupted") default: } @@ -49,37 +49,44 @@ func (dExt *XTDriver) StartToGoal(ctx context.Context, prompt string, opts ...op Msg("LLM service request failed, retrying...") continue } - return err + return allSubActions, err } // Check if task is finished BEFORE executing actions if dExt.isTaskFinished(result) { log.Info().Msg("task finished, stopping StartToGoal") - return nil + return allSubActions, nil } - // Execute actions only if task is not finished - if err := dExt.executeActions(ctx, result.ToolCalls); err != nil { - return err + // Invoke tool calls + subActions, err := dExt.invokeToolCalls(ctx, result.Thought, result.ToolCalls) + allSubActions = append(allSubActions, subActions...) + if err != nil { + return allSubActions, err } if options.MaxRetryTimes > 1 && attempt >= options.MaxRetryTimes { - return errors.New("reached max retry times") + return allSubActions, errors.New("reached max retry times") } } } -func (dExt *XTDriver) AIAction(ctx context.Context, prompt string, opts ...option.ActionOption) error { +func (dExt *XTDriver) AIAction(ctx context.Context, prompt string, opts ...option.ActionOption) ([]*SubActionResult, error) { log.Info().Str("prompt", prompt).Msg("performing AI action") // plan next action result, err := dExt.PlanNextAction(ctx, prompt, opts...) if err != nil { - return err + return nil, err } - // execute actions - return dExt.executeActions(ctx, result.ToolCalls) + // Invoke tool calls + subActionResults, err := dExt.invokeToolCalls(ctx, result.Thought, result.ToolCalls) + if err != nil { + return subActionResults, err + } + + return subActionResults, nil } func (dExt *XTDriver) PlanNextAction(ctx context.Context, prompt string, opts ...option.ActionOption) (*ai.PlanningResult, error) { @@ -87,36 +94,28 @@ func (dExt *XTDriver) PlanNextAction(ctx context.Context, prompt string, opts .. return nil, errors.New("LLM service is not initialized") } - compressedBufSource, err := getScreenShotBuffer(dExt.IDriver) + // Parse action options to get ResetHistory setting + options := option.NewActionOptions(opts...) + resetHistory := options.ResetHistory + + // Use GetScreenResult to handle screenshot capture, save, and session tracking + screenResult, err := dExt.GetScreenResult( + option.WithScreenShotFileName(builtin.GenNameWithTimestamp("%d_screenshot")), + ) if err != nil { return nil, err } - // convert buffer to base64 string + // convert buffer to base64 string for LLM screenShotBase64 := "data:image/jpeg;base64," + - base64.StdEncoding.EncodeToString(compressedBufSource.Bytes()) - - // save screenshot to file - imagePath := filepath.Join( - config.GetConfig().ScreenShotsPath, - fmt.Sprintf("%s.jpeg", builtin.GenNameWithTimestamp("%d_screenshot")), - ) - go func() { - err := saveScreenShot(compressedBufSource, imagePath) - if err != nil { - log.Error().Err(err).Msg("save screenshot file failed") - } - }() + base64.StdEncoding.EncodeToString(screenResult.bufSource.Bytes()) + // get window size size, err := dExt.IDriver.WindowSize() if err != nil { return nil, errors.Wrap(code.DeviceGetInfoError, err.Error()) } - // Parse action options to get ResetHistory setting - options := option.NewActionOptions(opts...) - resetHistory := options.ResetHistory - planningOpts := &ai.PlanningOptions{ UserInstruction: prompt, Message: &schema.Message{ @@ -160,23 +159,40 @@ func (dExt *XTDriver) isTaskFinished(result *ai.PlanningResult) bool { return false } -// executeActions executes the planned actions -func (dExt *XTDriver) executeActions(ctx context.Context, toolCalls []schema.ToolCall) error { +// invokeToolCalls invokes the tool calls and returns sub-action results +func (dExt *XTDriver) invokeToolCalls(ctx context.Context, thought string, toolCalls []schema.ToolCall) ([]*SubActionResult, error) { + var subActionResults []*SubActionResult + for _, action := range toolCalls { // Check for context cancellation before each action select { case <-ctx.Done(): - log.Warn().Msg("interrupted in executeActions") - return errors.Wrap(code.InterruptError, "executeActions interrupted") + log.Warn().Msg("interrupted in invokeToolCalls") + return subActionResults, errors.Wrap(code.InterruptError, "invokeToolCalls interrupted") default: } - // call eino tool + subActionStartTime := time.Now() + + // Extract action name (remove "uixt__" prefix) + actionName := strings.TrimPrefix(action.Function.Name, "uixt__") + + // Parse arguments arguments := make(map[string]interface{}) err := json.Unmarshal([]byte(action.Function.Arguments), &arguments) if err != nil { - return err + return subActionResults, err } + + // Create sub-action result + subActionResult := &SubActionResult{ + ActionName: actionName, + Arguments: arguments, + StartTime: subActionStartTime.Unix(), + Thought: thought, + } + + // Execute the action req := mcp.CallToolRequest{ Params: struct { Name string `json:"name"` @@ -191,12 +207,42 @@ func (dExt *XTDriver) executeActions(ctx context.Context, toolCalls []schema.Too } _, err = dExt.client.CallTool(ctx, req) + subActionResult.Elapsed = time.Since(subActionStartTime).Milliseconds() if err != nil { - return err + subActionResult.Error = err + subActionResults = append(subActionResults, subActionResult) + return subActionResults, err } + + // Collect sub-action specific attachments and reset session data + subActionData := dExt.GetData(true) // reset after getting data + + // Add requests if any + if requests, ok := subActionData["requests"].([]*DriverRequests); ok && len(requests) > 0 { + subActionResult.Requests = requests + } + + // Add screen_results if any + if screenResults, ok := subActionData["screen_results"].([]*ScreenResult); ok && len(screenResults) > 0 { + subActionResult.ScreenResults = screenResults + } + + subActionResults = append(subActionResults, subActionResult) } - return nil + return subActionResults, nil +} + +// SubActionResult represents a sub-action within a start_to_goal action +type SubActionResult struct { + ActionName string `json:"action_name"` // name of the sub-action (e.g., "tap", "input") + Arguments interface{} `json:"arguments,omitempty"` // arguments passed to the sub-action + StartTime int64 `json:"start_time"` // sub-action start time + Elapsed int64 `json:"elapsed_ms"` // sub-action elapsed time(ms) + Error error `json:"error,omitempty"` // sub-action execution result + Thought string `json:"thought,omitempty"` // sub-action thought + Requests []*DriverRequests `json:"requests,omitempty"` // store sub-action specific requests + ScreenResults []*ScreenResult `json:"screen_results,omitempty"` // store sub-action specific screen_results } func (dExt *XTDriver) AIQuery(text string, opts ...option.ActionOption) (string, error) { diff --git a/uixt/driver_ext_ai_test.go b/uixt/driver_ext_ai_test.go index 0f374a6a..e05844ad 100644 --- a/uixt/driver_ext_ai_test.go +++ b/uixt/driver_ext_ai_test.go @@ -15,8 +15,9 @@ import ( func TestDriverExt_TapByLLM(t *testing.T) { driver := setupDriverExt(t) - err := driver.AIAction(context.Background(), "点击第一个帖子的作者头像") + subActionResults, err := driver.AIAction(context.Background(), "点击第一个帖子的作者头像") assert.Nil(t, err) + t.Log(subActionResults) err = driver.AIAssert("当前在个人介绍页") assert.Nil(t, err) @@ -46,7 +47,7 @@ func TestDriverExt_StartToGoal(t *testing.T) { userInstruction += "\n\n请严格按照以上游戏规则,开始游戏;注意,请只做点击操作" - err := driver.StartToGoal(context.Background(), userInstruction) + _, err := driver.StartToGoal(context.Background(), userInstruction) assert.Nil(t, err) } diff --git a/uixt/mcp_tools_ai.go b/uixt/mcp_tools_ai.go index 3c9c3d90..f5bd7027 100644 --- a/uixt/mcp_tools_ai.go +++ b/uixt/mcp_tools_ai.go @@ -13,7 +13,8 @@ import ( // ToolStartToGoal implements the start_to_goal tool call. type ToolStartToGoal struct { // Return data fields - these define the structure of data returned by this tool - Prompt string `json:"prompt" desc:"Goal prompt that was executed"` + Prompt string `json:"prompt" desc:"Goal prompt that was executed"` + SubActions []*SubActionResult `json:"sub_actions" desc:"Sub-actions that were executed"` } func (t *ToolStartToGoal) Name() option.ActionName { @@ -42,14 +43,15 @@ func (t *ToolStartToGoal) Implement() server.ToolHandlerFunc { } // Start to goal logic - err = driverExt.StartToGoal(ctx, unifiedReq.Prompt) + subActionResults, err := driverExt.StartToGoal(ctx, unifiedReq.Prompt) if err != nil { return NewMCPErrorResponse(fmt.Sprintf("Failed to achieve goal: %s", err.Error())), nil } message := fmt.Sprintf("Successfully achieved goal: %s", unifiedReq.Prompt) returnData := ToolStartToGoal{ - Prompt: unifiedReq.Prompt, + Prompt: unifiedReq.Prompt, + SubActions: subActionResults, } return NewMCPSuccessResponse(message, &returnData), nil @@ -73,7 +75,8 @@ func (t *ToolStartToGoal) ConvertActionToCallToolRequest(action option.MobileAct // ToolAIAction implements the ai_action tool call. type ToolAIAction struct { // Return data fields - these define the structure of data returned by this tool - Prompt string `json:"prompt" desc:"AI action prompt that was executed"` + Prompt string `json:"prompt" desc:"AI action prompt that was executed"` + SubActions []*SubActionResult `json:"sub_actions" desc:"Sub-actions that were executed"` } func (t *ToolAIAction) Name() option.ActionName { @@ -102,14 +105,15 @@ func (t *ToolAIAction) Implement() server.ToolHandlerFunc { } // AI action logic - err = driverExt.AIAction(ctx, unifiedReq.Prompt) + subActionResults, err := driverExt.AIAction(ctx, unifiedReq.Prompt) if err != nil { return NewMCPErrorResponse(fmt.Sprintf("AI action failed: %s", err.Error())), nil } message := fmt.Sprintf("Successfully performed AI action with prompt: %s", unifiedReq.Prompt) returnData := ToolAIAction{ - Prompt: unifiedReq.Prompt, + Prompt: unifiedReq.Prompt, + SubActions: subActionResults, } return NewMCPSuccessResponse(message, &returnData), nil diff --git a/uixt/sdk.go b/uixt/sdk.go index 1d404d09..6d1548fc 100644 --- a/uixt/sdk.go +++ b/uixt/sdk.go @@ -4,7 +4,9 @@ import ( "context" "fmt" "strings" + "time" + "github.com/httprunner/httprunner/v5/internal/json" "github.com/httprunner/httprunner/v5/uixt/ai" "github.com/httprunner/httprunner/v5/uixt/option" "github.com/mark3labs/mcp-go/client" @@ -88,37 +90,114 @@ func (c *MCPClient4XTDriver) GetToolByAction(actionName option.ActionName) Actio return c.Server.GetToolByAction(actionName) } -func (dExt *XTDriver) ExecuteAction(ctx context.Context, action option.MobileAction) (err error) { +func (dExt *XTDriver) ExecuteAction(ctx context.Context, action option.MobileAction) ([]*SubActionResult, error) { + subActionStartTime := time.Now() + // Find the corresponding tool for this action method tool := dExt.client.Server.GetToolByAction(action.Method) if tool == nil { - return fmt.Errorf("no tool found for action method: %s", action.Method) + return nil, fmt.Errorf("no tool found for action method: %s", action.Method) } // Use the tool's own conversion method req, err := tool.ConvertActionToCallToolRequest(action) if err != nil { - return fmt.Errorf("failed to convert action to MCP tool call: %w", err) + return nil, fmt.Errorf("failed to convert action to MCP tool call: %w", err) + } + + // Create sub-action result + subActionResult := &SubActionResult{ + ActionName: string(action.Method), + Arguments: action.Params, + StartTime: subActionStartTime.Unix(), } // Execute via MCP tool result, err := dExt.client.CallTool(ctx, req) + subActionResult.Elapsed = time.Since(subActionStartTime).Milliseconds() if err != nil { - return fmt.Errorf("MCP tool call failed: %w", err) + subActionResult.Error = err + return []*SubActionResult{subActionResult}, fmt.Errorf("MCP tool call failed: %w", err) } // Check if the tool execution had business logic errors if result.IsError { + var errMsg string if len(result.Content) > 0 { - return fmt.Errorf("invoke tool %s failed: %v", - tool.Name(), result.Content) + errMsg = fmt.Sprintf("invoke tool %s failed: %v", tool.Name(), result.Content) + } else { + errMsg = fmt.Sprintf("invoke tool %s failed", tool.Name()) } - return fmt.Errorf("invoke tool %s failed", tool.Name()) + err := errors.New(errMsg) + subActionResult.Error = err + return []*SubActionResult{subActionResult}, err + } + + // Handle special AI actions (start_to_goal, ai_action) that return sub-actions + if action.Method == option.ACTION_StartToGoal || action.Method == option.ACTION_AIAction { + return dExt.parseAIActionResult(result, subActionResult) + } + + // For regular actions, collect session data and return single sub-action result + subActionData := dExt.GetData(true) // reset after getting data + + // Add requests if any + if requests, ok := subActionData["requests"].([]*DriverRequests); ok && len(requests) > 0 { + subActionResult.Requests = requests + } + + // Add screen_results if any + if screenResults, ok := subActionData["screen_results"].([]*ScreenResult); ok && len(screenResults) > 0 { + subActionResult.ScreenResults = screenResults } log.Debug().Str("tool", string(tool.Name())). Msg("execute action via MCP tool") - return nil + return []*SubActionResult{subActionResult}, nil +} + +// parseAIActionResult parses the result from AI actions (start_to_goal, ai_action) and extracts sub-actions +func (dExt *XTDriver) parseAIActionResult(result *mcp.CallToolResult, originalSubAction *SubActionResult) ([]*SubActionResult, error) { + // Parse the JSON response to extract sub_actions + var responseData map[string]interface{} + if len(result.Content) > 0 { + // Get the first text content + if textContent, ok := result.Content[0].(mcp.TextContent); ok { + if err := json.Unmarshal([]byte(textContent.Text), &responseData); err != nil { + log.Warn().Err(err).Msg("failed to parse AI action result, falling back to single action") + return []*SubActionResult{originalSubAction}, nil + } + } else { + log.Warn().Msg("AI action result is not text content, falling back to single action") + return []*SubActionResult{originalSubAction}, nil + } + } + + // Extract sub_actions from the response + if subActionsData, ok := responseData["sub_actions"]; ok { + // Convert to JSON and back to properly deserialize SubActionResult structs + subActionsJSON, err := json.Marshal(subActionsData) + if err != nil { + log.Warn().Err(err).Msg("failed to marshal sub_actions, falling back to single action") + return []*SubActionResult{originalSubAction}, nil + } + + var subActionResults []*SubActionResult + if err := json.Unmarshal(subActionsJSON, &subActionResults); err != nil { + log.Warn().Err(err).Msg("failed to unmarshal sub_actions, falling back to single action") + return []*SubActionResult{originalSubAction}, nil + } + + log.Debug().Int("sub_actions_count", len(subActionResults)). + Str("action", string(originalSubAction.ActionName)). + Msg("parsed AI action sub-actions") + return subActionResults, nil + } + + // If no sub_actions found, return the original action as a single result + log.Debug().Str("action", string(originalSubAction.ActionName)). + Msg("no sub_actions found in AI action result, using single action") + return []*SubActionResult{originalSubAction}, nil } // NewDeviceWithDefault is a helper function to create a device with default options