// Source listing: httprunner/uixt/driver_ext_ai.go (Go; 471 lines, 17 KiB as captured).

package uixt
import (
"context"
"time"
"github.com/cloudwego/eino/schema"
"github.com/mark3labs/mcp-go/mcp"
"github.com/pkg/errors"
"github.com/rs/zerolog/log"
"github.com/httprunner/httprunner/v5/code"
"github.com/httprunner/httprunner/v5/internal/builtin"
"github.com/httprunner/httprunner/v5/internal/json"
"github.com/httprunner/httprunner/v5/uixt/ai"
"github.com/httprunner/httprunner/v5/uixt/option"
"github.com/httprunner/httprunner/v5/uixt/types"
)
// StartToGoal drives the device towards the goal described by prompt by
// repeating plan-then-execute cycles.
//
// Each cycle calls PlanNextAction and then runs every returned tool call
// through invokeToolCall, recording one SubActionResult per call. The loop
// ends when:
//   - isTaskFinished reports completion (nil error);
//   - ctx is cancelled (the error wraps code.InterruptError);
//   - planning or a tool call fails (that error is returned);
//   - options.MaxRetryTimes is positive and the attempt counter exceeds it.
//
// Planning failures matching code.LLMRequestServiceError are retried after a
// short pause. The pause is abandoned as soon as ctx is cancelled, and the
// retries are subject to the same MaxRetryTimes bound as successful cycles.
//
// Every return path hands back the plannings recorded so far, including the
// error paths.
func (dExt *XTDriver) StartToGoal(ctx context.Context, prompt string, opts ...option.ActionOption) ([]*PlanningExecutionResult, error) {
	// Pause between retries after an LLM service request failure.
	const llmRetryInterval = 5 * time.Second

	options := option.NewActionOptions(opts...)
	log.Info().Int("max_retry_times", options.MaxRetryTimes).Msg("StartToGoal")

	var allPlannings []*PlanningExecutionResult
	var attempt int
	for {
		attempt++
		log.Info().Int("attempt", attempt).Msg("planning attempt")

		// The attempt counter does not change within an iteration, so the
		// bound can be evaluated once and shared by every exit check below.
		limitReached := options.MaxRetryTimes > 0 && attempt > options.MaxRetryTimes

		// Check for context cancellation (interrupt signal)
		select {
		case <-ctx.Done():
			log.Warn().
				Int("attempt", attempt).
				Int("completed_plannings", len(allPlannings)).
				Msg("interrupted in StartToGoal")
			return allPlannings, errors.Wrap(code.InterruptError, "StartToGoal interrupted")
		default:
		}

		// Plan next action with history reset on first attempt
		planningStartTime := time.Now()
		planningOpts := opts
		if attempt == 1 {
			// The full slice expression caps the capacity so append always
			// copies instead of writing into the caller's backing array.
			planningOpts = append(opts[:len(opts):len(opts)], option.WithResetHistory(true))
		}

		planningResult, err := dExt.PlanNextAction(ctx, prompt, planningOpts...)
		if err != nil {
			// LLM service request errors are retried, but only while the
			// attempt budget allows it; otherwise fall through and fail.
			if errors.Is(err, code.LLMRequestServiceError) && !limitReached {
				log.Warn().Err(err).Int("attempt", attempt).
					Msg("LLM service request failed, retrying...")

				// Wait before retrying, but stop waiting once ctx is done.
				timer := time.NewTimer(llmRetryInterval)
				select {
				case <-ctx.Done():
					timer.Stop()
					log.Warn().
						Int("attempt", attempt).
						Int("completed_plannings", len(allPlannings)).
						Msg("interrupted in StartToGoal")
					return allPlannings, errors.Wrap(code.InterruptError, "StartToGoal interrupted")
				case <-timer.C:
				}
				continue
			}

			// Create planning result with error
			errorResult := &PlanningExecutionResult{
				PlanningResult: ai.PlanningResult{
					Thought:   "Planning failed",
					ModelName: "",
					Error:     err.Error(),
				},
				StartTime: planningStartTime.UnixMilli(),
				Elapsed:   time.Since(planningStartTime).Milliseconds(),
			}
			allPlannings = append(allPlannings, errorResult)
			return allPlannings, err
		}

		// Set planning execution timing
		planningResult.StartTime = planningStartTime.UnixMilli()
		planningResult.SubActions = []*SubActionResult{}

		// Check if task is finished BEFORE executing actions
		if dExt.isTaskFinished(planningResult) {
			log.Info().Msg("task finished, stopping StartToGoal")
			planningResult.Elapsed = time.Since(planningStartTime).Milliseconds()
			allPlannings = append(allPlannings, planningResult)
			return allPlannings, nil
		}

		// Invoke tool calls
		for _, toolCall := range planningResult.ToolCalls {
			// Check for context cancellation before each action
			select {
			case <-ctx.Done():
				log.Warn().
					Int("attempt", attempt).
					Int("completed_plannings", len(allPlannings)).
					Int("completed_tool_calls", len(planningResult.SubActions)).
					Int("total_tool_calls", len(planningResult.ToolCalls)).
					Msg("interrupted in invokeToolCalls")
				planningResult.Elapsed = time.Since(planningStartTime).Milliseconds()
				allPlannings = append(allPlannings, planningResult)
				return allPlannings, errors.Wrap(code.InterruptError, "invokeToolCalls interrupted")
			default:
			}

			// Execute each tool call in its own function so the deferred
			// bookkeeping runs per call rather than at StartToGoal's return.
			err := func() error {
				subActionStartTime := time.Now()
				subActionResult := &SubActionResult{
					ActionName: toolCall.Function.Name,
					Arguments:  toolCall.Function.Arguments,
					StartTime:  subActionStartTime.UnixMilli(),
				}

				// Record the sub-action whether or not the call succeeds.
				defer func() {
					subActionResult.Elapsed = time.Since(subActionStartTime).Milliseconds()
					subActionResult.SessionData = dExt.GetSession().GetData(true) // reset after getting data
					planningResult.SubActions = append(planningResult.SubActions, subActionResult)
				}()

				// Execute the tool call
				if err := dExt.invokeToolCall(ctx, toolCall); err != nil {
					subActionResult.Error = err
					return err
				}
				return nil
			}()
			if err != nil {
				planningResult.Elapsed = time.Since(planningStartTime).Milliseconds()
				planningResult.Error = err.Error()
				allPlannings = append(allPlannings, planningResult)
				return allPlannings, err
			}
		}

		// Complete this planning cycle
		planningResult.Elapsed = time.Since(planningStartTime).Milliseconds()
		allPlannings = append(allPlannings, planningResult)

		if limitReached {
			return allPlannings, errors.New("reached max retry times")
		}
	}
}
// AIAction performs one AI-driven step: it captures a screenshot, asks
// PlanNextAction for the next tool calls, and executes them in order through
// invokeToolCall.
//
// A screenshot failure returns a nil result. Once the screenshot succeeds the
// returned AIExecutionResult is always non-nil; on a planning or tool-call
// failure its Error field holds the message and the wrapped error is returned
// with it. PlanningResult is only populated when planning succeeded.
//
// PlanNextAction captures its own screenshot as well; ImagePath and
// Resolution in the result come from the capture made here.
func (dExt *XTDriver) AIAction(ctx context.Context, prompt string, opts ...option.ActionOption) (*AIExecutionResult, error) {
	log.Info().Str("prompt", prompt).Msg("performing AI action")

	// Step 1: Take screenshot and measure time
	screenshotStartTime := time.Now()
	screenResult, err := dExt.createScreenshotWithSession(
		option.WithScreenShotFileName(builtin.GenNameWithTimestamp("%d_screenshot")),
	)
	screenshotElapsed := time.Since(screenshotStartTime).Milliseconds()
	if err != nil {
		return nil, err
	}

	// Step 2: Plan next action and measure time
	modelCallStartTime := time.Now()
	planningResult, err := dExt.PlanNextAction(ctx, prompt, opts...)
	modelCallElapsed := time.Since(modelCallStartTime).Milliseconds()

	aiExecutionResult := &AIExecutionResult{
		Type:              "action",
		ModelCallElapsed:  modelCallElapsed,
		ScreenshotElapsed: screenshotElapsed,
		ImagePath:         screenResult.ImagePath,
		Resolution:        &screenResult.Resolution,
	}
	if err != nil {
		// planningResult is nil on failure, so it must not be dereferenced
		// before this check.
		aiExecutionResult.Error = err.Error()
		return aiExecutionResult, errors.Wrap(err, "get next action failed")
	}
	aiExecutionResult.PlanningResult = &planningResult.PlanningResult

	// Step 3: Execute tool calls, stopping at the first failure
	for _, toolCall := range planningResult.ToolCalls {
		if err := dExt.invokeToolCall(ctx, toolCall); err != nil {
			aiExecutionResult.Error = err.Error()
			return aiExecutionResult, errors.Wrap(err, "invoke tool call failed")
		}
	}
	return aiExecutionResult, nil
}
// PlanNextAction captures the current screen, asks the LLM service to plan the
// next step for prompt, and returns the plan together with timing metadata.
//
// It fails when no LLM service is configured, when the screenshot cannot be
// taken or encoded, or when the planner call itself fails. StartTime, Elapsed
// and SubActions are left at their zero values for the caller to fill in.
func (dExt *XTDriver) PlanNextAction(ctx context.Context, prompt string, opts ...option.ActionOption) (*PlanningExecutionResult, error) {
	if dExt.LLMService == nil {
		return nil, errors.New("LLM service is not initialized")
	}

	// Only the ResetHistory flag of the parsed options is consulted here.
	actionOptions := option.NewActionOptions(opts...)

	// Step 1: capture the screen and time the capture.
	shotBegin := time.Now()
	shotName := builtin.GenNameWithTimestamp("%d_screenshot")
	screen, err := dExt.createScreenshotWithSession(option.WithScreenShotFileName(shotName))
	shotCostMs := time.Since(shotBegin).Milliseconds()
	if err != nil {
		return nil, err
	}

	// Drain the session data gathered so far so that the planning screenshot
	// is not attributed to the sub-actions executed afterwards; the capture
	// itself is still reachable through ScreenResult in the returned value.
	dExt.GetSession().GetData(true)

	// Fetch the base64-encoded screenshot and its size for the model request.
	encodedShot, shotSize, err := dExt.GetScreenshotBase64WithSize()
	if err != nil {
		return nil, errors.Wrap(code.DeviceGetInfoError, err.Error())
	}

	// Step 2: call the planner and time the call.
	planBegin := time.Now()
	imagePart := schema.ChatMessagePart{
		Type:     schema.ChatMessagePartTypeImageURL,
		ImageURL: &schema.ChatMessageImageURL{URL: encodedShot},
	}
	planned, err := dExt.LLMService.Plan(ctx, &ai.PlanningOptions{
		UserInstruction: prompt,
		Message: &schema.Message{
			Role:         schema.User,
			MultiContent: []schema.ChatMessagePart{imagePart},
		},
		Size:         shotSize,
		ResetHistory: actionOptions.ResetHistory,
	})
	planCostMs := time.Since(planBegin).Milliseconds()
	if err != nil {
		return nil, errors.Wrap(err, "failed to get next action from planner")
	}

	// Step 3: collect the names of the planned tool calls for reporting.
	names := make([]string, 0, len(planned.ToolCalls))
	for idx := range planned.ToolCalls {
		names = append(names, planned.ToolCalls[idx].Function.Name)
	}

	return &PlanningExecutionResult{
		PlanningResult:    *planned,
		ScreenshotElapsed: shotCostMs,
		ImagePath:         screen.ImagePath,
		Resolution:        &screen.Resolution,
		ScreenResult:      screen,
		ModelCallElapsed:  planCostMs,
		ToolCallsCount:    len(planned.ToolCalls),
		ActionNames:       names,
	}, nil
}
// isTaskFinished reports whether planningResult signals the end of the task:
// either the planner returned no tool calls at all, or one of the tool calls
// is the "uixt__finished" action.
func (dExt *XTDriver) isTaskFinished(planningResult *PlanningExecutionResult) bool {
	calls := planningResult.ToolCalls

	// Nothing left to execute is treated as completion.
	if len(calls) == 0 {
		log.Info().Msg("no tool calls returned, task may be finished")
		return true
	}

	// An explicit "finished" action anywhere in the plan also ends the task.
	for idx := range calls {
		fn := calls[idx].Function
		if fn.Name != "uixt__finished" {
			continue
		}
		log.Info().Str("reason", fn.Arguments).Msg("finished action detected")
		return true
	}
	return false
}
// invokeToolCall decodes the JSON arguments of toolCall and forwards the call,
// by function name, to the driver's client. It returns the decoding error or
// the error reported by CallTool; the tool's result value is discarded.
func (dExt *XTDriver) invokeToolCall(ctx context.Context, toolCall schema.ToolCall) error {
	fn := toolCall.Function

	// Decode the argument payload into a generic map.
	args := map[string]any{}
	if err := json.Unmarshal([]byte(fn.Arguments), &args); err != nil {
		return err
	}

	// Build the request; fields that are not set keep their zero values.
	var req mcp.CallToolRequest
	req.Params.Name = fn.Name
	req.Params.Arguments = args

	_, err := dExt.client.CallTool(ctx, req)
	return err
}
// PlanningExecutionResult is the record of one planning cycle: the planner's
// output plus screenshot/model timing and the sub-actions executed for it.
// PlanNextAction fills the planning fields; StartToGoal sets StartTime,
// Elapsed and SubActions afterwards.
type PlanningExecutionResult struct {
ai.PlanningResult // embedded planner output; its fields (e.g. ToolCalls, Thought, Error, ModelName) are promoted
// Planning process information
ScreenshotElapsed int64 `json:"screenshot_elapsed_ms"` // time spent capturing the planning screenshot (ms)
ImagePath string `json:"image_path"` // path of the planning screenshot image
Resolution *types.Size `json:"resolution"` // resolution taken from the screenshot result
ScreenResult *ScreenResult `json:"screen_result"` // complete screen result of the planning screenshot
ModelCallElapsed int64 `json:"model_call_elapsed_ms"` // time spent in the planner call (ms)
ToolCallsCount int `json:"tool_calls_count"` // number of tool calls returned by the planner
ActionNames []string `json:"action_names"` // function names of the returned tool calls, in order
// Execution information
StartTime int64 `json:"start_time"` // planning start time, Unix milliseconds
Elapsed int64 `json:"elapsed_ms"` // elapsed time of the whole cycle (ms)
SubActions []*SubActionResult `json:"sub_actions,omitempty"` // one entry per tool call executed for this planning
}
// AIExecutionResult is the common result shape returned by AIAction, AIQuery
// and AIAssert. Type says which operation produced it and therefore which of
// the operation-specific result pointers may be set.
type AIExecutionResult struct {
Type string `json:"type"` // operation type: "query", "action", "assert"
ModelCallElapsed int64 `json:"model_call_elapsed"` // model call elapsed time in milliseconds
ScreenshotElapsed int64 `json:"screenshot_elapsed"` // screenshot elapsed time in milliseconds
ImagePath string `json:"image_path"` // path to the screenshot captured for the operation
Resolution *types.Size `json:"resolution"` // resolution taken from the screenshot result
// Operation-specific results (at most one is populated, according to Type)
QueryResult *ai.QueryResult `json:"query_result,omitempty"` // set by AIQuery
PlanningResult *ai.PlanningResult `json:"planning_result,omitempty"` // set by AIAction
AssertionResult *ai.AssertionResult `json:"assertion_result,omitempty"` // set by AIAssert
// Common fields
Error string `json:"error,omitempty"` // error message if the operation failed
}
// SubActionResult records one tool call executed by StartToGoal as part of a
// planning cycle, together with the session data drained right after it ran.
type SubActionResult struct {
ActionName string `json:"action_name"` // tool call function name (e.g., "tap", "input")
Arguments interface{} `json:"arguments,omitempty"` // arguments of the tool call; StartToGoal stores the raw argument string here
StartTime int64 `json:"start_time"` // sub-action start time, Unix milliseconds
Elapsed int64 `json:"elapsed_ms"` // sub-action elapsed time(ms)
// NOTE(review): an error value stored in an interface usually has no exported
// fields, so JSON encoders tend to emit it as an empty object rather than the
// message — confirm how this field is rendered in reports.
Error error `json:"error,omitempty"` // error returned by the tool call, nil on success
SessionData
}
// SessionData groups the driver requests and screen results collected from the
// session; StartToGoal attaches one snapshot of it to every SubActionResult.
type SessionData struct {
Requests []*DriverRequests `json:"requests,omitempty"` // store sub-action specific requests
ScreenResults []*ScreenResult `json:"screen_results,omitempty"` // store sub-action specific screen_results
}
// AIQuery captures the current screen and asks the LLM service to answer text
// about it, returning the answer together with timing metadata.
//
// The OutputSchema of the parsed opts is passed on to the query. Any failure
// (missing LLM service, screenshot capture/encoding, or the query itself)
// yields a nil result. The model call runs with context.Background().
func (dExt *XTDriver) AIQuery(text string, opts ...option.ActionOption) (*AIExecutionResult, error) {
	if dExt.LLMService == nil {
		return nil, errors.New("LLM service is not initialized")
	}

	// Step 1: capture the screen and time the capture.
	shotBegin := time.Now()
	shotName := builtin.GenNameWithTimestamp("%d_screenshot")
	screen, err := dExt.createScreenshotWithSession(option.WithScreenShotFileName(shotName))
	shotCostMs := time.Since(shotBegin).Milliseconds()
	if err != nil {
		return nil, err
	}

	encodedShot, shotSize, err := dExt.GetScreenshotBase64WithSize()
	if err != nil {
		return nil, err
	}

	// The parsed options supply the optional output schema for the answer.
	parsed := option.NewActionOptions(opts...)

	// Step 2: run the query and time the model call.
	queryBegin := time.Now()
	answer, err := dExt.LLMService.Query(context.Background(), &ai.QueryOptions{
		Query:        text,
		Screenshot:   encodedShot,
		Size:         shotSize,
		OutputSchema: parsed.OutputSchema,
	})
	queryCostMs := time.Since(queryBegin).Milliseconds()
	if err != nil {
		return nil, errors.Wrap(err, "AI query failed")
	}

	// Bundle the answer with the timing and screenshot metadata.
	return &AIExecutionResult{
		Type:              "query",
		ModelCallElapsed:  queryCostMs,
		ScreenshotElapsed: shotCostMs,
		ImagePath:         screen.ImagePath,
		Resolution:        &screen.Resolution,
		QueryResult:       answer,
	}, nil
}
// AIAssert captures the current screen and asks the LLM service whether
// assertion holds for it.
//
// A missing LLM service or a failed screenshot capture returns a nil result.
// After that the result is always non-nil: on an encoding failure, a model
// failure, or a failed assertion its Error field carries the message (the
// model's Thought in the last case) and a non-nil error is returned alongside.
// opts is accepted but not consulted by this implementation, and the model
// call runs with context.Background().
func (dExt *XTDriver) AIAssert(assertion string, opts ...option.ActionOption) (*AIExecutionResult, error) {
	if dExt.LLMService == nil {
		return nil, errors.New("LLM service is not initialized")
	}

	// Step 1: capture the screen and time the capture.
	shotBegin := time.Now()
	shotName := builtin.GenNameWithTimestamp("%d_screenshot")
	screen, err := dExt.createScreenshotWithSession(option.WithScreenShotFileName(shotName))
	shotCostMs := time.Since(shotBegin).Milliseconds()
	if err != nil {
		return nil, err
	}

	outcome := &AIExecutionResult{
		Type:              "assert",
		ScreenshotElapsed: shotCostMs,
		ImagePath:         screen.ImagePath,
		Resolution:        &screen.Resolution,
	}

	encodedShot, shotSize, err := dExt.GetScreenshotBase64WithSize()
	if err != nil {
		outcome.Error = err.Error()
		return outcome, err
	}

	// Step 2: run the assertion and time the model call.
	assertBegin := time.Now()
	verdict, err := dExt.LLMService.Assert(context.Background(), &ai.AssertOptions{
		Assertion:  assertion,
		Screenshot: encodedShot,
		Size:       shotSize,
	})
	outcome.ModelCallElapsed = time.Since(assertBegin).Milliseconds()
	outcome.AssertionResult = verdict

	// Cases are tried in order, so verdict is only inspected when err is nil.
	switch {
	case err != nil:
		outcome.Error = err.Error()
		return outcome, errors.Wrap(err, "AI assertion failed")
	case !verdict.Pass:
		outcome.Error = verdict.Thought
		return outcome, errors.New(verdict.Thought)
	}
	return outcome, nil
}