// httprunner/uixt/driver_ext_ai.go

package uixt

import (
	"context"
	"encoding/json"
	"fmt"
	"time"

	"github.com/cloudwego/eino/schema"
	"github.com/pkg/errors"
	"github.com/rs/zerolog/log"

	"github.com/httprunner/httprunner/v5/code"
	"github.com/httprunner/httprunner/v5/uixt/ai"
	"github.com/httprunner/httprunner/v5/uixt/option"
	"github.com/httprunner/httprunner/v5/uixt/types"
)

// StartToGoal drives the device towards the goal described by prompt by
// repeating a "plan next action, then execute the planned tool calls" cycle.
//
// The loop ends when one of the following happens:
//   - isTaskFinished reports the latest planning as finished (nil error);
//   - planning fails (that error is returned), except for failures matching
//     code.LLMRequestServiceError, which are retried after a 5 second pause;
//   - a tool call fails (that error is returned);
//   - ctx is done. When options.TimeLimit > 0 and the cause is
//     context.DeadlineExceeded, this is treated as a graceful stop and a nil
//     error is returned; any other cancellation (including options.Timeout)
//     is returned as a wrapped error;
//   - options.MaxRetryTimes > 0 and the number of attempts exceeds it.
//
// options.TimeLimit takes precedence over options.Timeout; both are expressed
// in seconds. The returned slice always contains every planning recorded so
// far, including on error.
func (dExt *XTDriver) StartToGoal(ctx context.Context, prompt string, opts ...option.ActionOption) ([]*PlanningExecutionResult, error) {
	options := option.NewActionOptions(opts...)
	logger := log.Info().Str("prompt", prompt)
	if options.MaxRetryTimes > 0 {
		logger = logger.Int("max_retry_times", options.MaxRetryTimes)
	}

	// Handle TimeLimit and Timeout with unified context mechanism
	var isTimeLimitMode bool
	if options.TimeLimit > 0 {
		// TimeLimit takes precedence over Timeout
		logger = logger.Int("time_limit_seconds", options.TimeLimit)
		var cancel context.CancelFunc
		ctx, cancel = context.WithTimeout(ctx, time.Duration(options.TimeLimit)*time.Second)
		defer cancel()
		isTimeLimitMode = true
	} else if options.Timeout > 0 {
		// Use Timeout only if TimeLimit is not set
		logger = logger.Int("timeout_seconds", options.Timeout)
		var cancel context.CancelFunc
		ctx, cancel = context.WithTimeout(ctx, time.Duration(options.Timeout)*time.Second)
		defer cancel()
	}
	logger.Msg("StartToGoal")

	// wait pauses for d, but returns early when ctx is done so that a
	// cancellation or deadline is not delayed by the pause. The cancellation
	// itself is reported by the check at the top of the planning loop.
	wait := func(d time.Duration) {
		timer := time.NewTimer(d)
		defer timer.Stop()
		select {
		case <-ctx.Done():
		case <-timer.C:
		}
	}

	var allPlannings []*PlanningExecutionResult
	var attempt int
	for {
		attempt++
		log.Info().Int("attempt", attempt).Msg("planning attempt")

		// Check for context cancellation (timeout, time limit, or interrupt)
		select {
		case <-ctx.Done():
			cause := context.Cause(ctx)
			// Handle TimeLimit timeout - return success
			if isTimeLimitMode && errors.Is(cause, context.DeadlineExceeded) {
				log.Info().
					Int("attempt", attempt).
					Int("completed_plannings", len(allPlannings)).
					Int("time_limit_seconds", options.TimeLimit).
					Msg("StartToGoal time limit reached, stopping gracefully")
				return allPlannings, nil
			}

			// Handle other cancellations (Timeout, interrupt, external cancellation) - return error
			log.Warn().
				Int("attempt", attempt).
				Int("completed_plannings", len(allPlannings)).
				Err(cause).
				Msg("StartToGoal cancelled")
			return allPlannings, errors.Wrap(cause, "StartToGoal cancelled")
		default:
		}

		// Plan next action with history reset on first attempt
		planningStartTime := time.Now()
		planningOpts := opts
		if attempt == 1 {
			// Add ResetHistory option for the first attempt. The full slice
			// expression caps the capacity so append always allocates a new
			// backing array instead of writing into the caller's opts slice.
			planningOpts = append(opts[:len(opts):len(opts)], option.WithResetHistory(true))
		}

		planningResult, err := dExt.PlanNextAction(ctx, prompt, planningOpts...)
		if err != nil {
			// Check if this is a LLM service request error that should be retried
			if errors.Is(err, code.LLMRequestServiceError) {
				log.Warn().Err(err).Int("attempt", attempt).
					Msg("LLM service request failed, retrying...")
				wait(5 * time.Second)
				continue
			}
			// Create planning result with error
			errorResult := &PlanningExecutionResult{
				PlanningResult: ai.PlanningResult{
					Thought:   "Planning failed",
					ModelName: "",
					Error:     err.Error(),
				},
				StartTime: planningStartTime.UnixMilli(),
				Elapsed:   time.Since(planningStartTime).Milliseconds(),
			}
			allPlannings = append(allPlannings, errorResult)
			return allPlannings, err
		}

		// Set planning execution timing
		planningResult.StartTime = planningStartTime.UnixMilli()
		planningResult.SubActions = []*SubActionResult{}

		// Check if task is finished BEFORE executing actions
		if dExt.isTaskFinished(planningResult) {
			log.Info().Msg("task finished, stopping StartToGoal")
			planningResult.Elapsed = time.Since(planningStartTime).Milliseconds()
			allPlannings = append(allPlannings, planningResult)
			return allPlannings, nil
		}

		// Invoke tool calls
		for _, toolCall := range planningResult.ToolCalls {
			// Check for context cancellation (timeout, time limit, or interrupt) before each action
			select {
			case <-ctx.Done():
				cause := context.Cause(ctx)
				// Handle TimeLimit timeout - return success
				if isTimeLimitMode && errors.Is(cause, context.DeadlineExceeded) {
					log.Info().
						Int("attempt", attempt).
						Int("completed_plannings", len(allPlannings)).
						Int("completed_tool_calls", len(planningResult.SubActions)).
						Int("total_tool_calls", len(planningResult.ToolCalls)).
						Int("time_limit_seconds", options.TimeLimit).
						Msg("StartToGoal time limit reached during tool call execution, stopping gracefully")
					planningResult.Elapsed = time.Since(planningStartTime).Milliseconds()
					allPlannings = append(allPlannings, planningResult)
					return allPlannings, nil
				}

				// Handle other cancellations (Timeout, external cancellation) - return error
				log.Warn().
					Int("attempt", attempt).
					Int("completed_plannings", len(allPlannings)).
					Int("completed_tool_calls", len(planningResult.SubActions)).
					Int("total_tool_calls", len(planningResult.ToolCalls)).
					Err(cause).
					Msg("invokeToolCalls cancelled")
				planningResult.Elapsed = time.Since(planningStartTime).Milliseconds()
				allPlannings = append(allPlannings, planningResult)
				return allPlannings, errors.Wrap(cause, "invokeToolCalls cancelled")
			default:
			}

			// Execute each tool call in a separate function to ensure proper defer execution
			err := func() error {
				subActionStartTime := time.Now()
				subActionResult := &SubActionResult{
					ActionName: toolCall.Function.Name,
					Arguments:  toolCall.Function.Arguments,
					StartTime:  subActionStartTime.UnixMilli(),
				}

				// Use defer to ensure sub-action is always processed and added to results
				defer func() {
					subActionResult.Elapsed = time.Since(subActionStartTime).Milliseconds()
					subActionResult.SessionData = dExt.GetSession().GetData(true) // reset after getting data
					planningResult.SubActions = append(planningResult.SubActions, subActionResult)
				}()

				if err := dExt.invokeToolCall(ctx, toolCall, opts...); err != nil {
					log.Error().Err(err).
						Str("action", toolCall.Function.Name).
						Msg("invoke tool call failed")
					subActionResult.Error = err.Error()
					return err
				}
				return nil
			}()
			if err != nil {
				planningResult.Elapsed = time.Since(planningStartTime).Milliseconds()
				planningResult.Error = err.Error()
				allPlannings = append(allPlannings, planningResult)
				return allPlannings, err
			}
		}

		// Complete this planning cycle
		planningResult.Elapsed = time.Since(planningStartTime).Milliseconds()
		allPlannings = append(allPlannings, planningResult)

		if options.MaxRetryTimes > 0 && attempt > options.MaxRetryTimes {
			return allPlannings, errors.New("reached max retry times")
		}

		// Pause up to 3 seconds before planning again so the executed tool
		// calls can take effect; returns early when ctx is done.
		wait(3 * time.Second)
	}
}

// AIAction takes a screenshot and performs the action described by prompt
// through an AI service. WingsService is preferred when it is set; otherwise
// LLMService is used. An error is returned when the screenshot fails or when
// neither service is initialized.
func (dExt *XTDriver) AIAction(ctx context.Context, prompt string, opts ...option.ActionOption) (*AIExecutionResult, error) {
	log.Info().Str("prompt", prompt).Msg("performing AI action")

	// Capture the screen (with base64 payload) before choosing a service.
	shot, shotErr := dExt.GetScreenResult(
		option.WithScreenShotFileName("ai_action"),
		option.WithScreenShotBase64(true),
	)
	if shotErr != nil {
		return nil, shotErr
	}

	switch {
	case dExt.WingsService != nil:
		// Wings has priority whenever it is configured.
		log.Info().Msg("using Wings service for AI action")
		return dExt.executeAIAction(ctx, prompt, shot, dExt.WingsService, "wings", opts...)
	case dExt.LLMService != nil:
		log.Info().Msg("using LLM service for AI action")
		return dExt.executeAIAction(ctx, prompt, shot, dExt.LLMService, "llm", opts...)
	default:
		return nil, errors.New("neither Wings service nor LLM service is initialized")
	}
}

// executeAIAction plans an action for prompt with the given service and then
// executes every planned tool call in order.
//
// serviceType selects the planning path: "llm" goes through PlanNextAction,
// any other value calls service.Plan directly with the screenshot carried by
// screenResult; "wings" additionally enriches ctx via addDeviceContextForWings.
//
// On a planning or tool-call failure a non-nil result with its Error field set
// is returned together with the wrapped error. Execution stops at the first
// failing tool call.
func (dExt *XTDriver) executeAIAction(ctx context.Context, prompt string, screenResult *ScreenResult, service ai.ILLMService, serviceType string, opts ...option.ActionOption) (*AIExecutionResult, error) {
	if serviceType == "wings" {
		ctx = dExt.addDeviceContextForWings(ctx)
	}

	planStart := time.Now()

	// newResult builds the fields common to every returned result; the model
	// call duration is measured at the moment the result is created.
	newResult := func() *AIExecutionResult {
		return &AIExecutionResult{
			Type:              "action",
			ModelCallElapsed:  time.Since(planStart).Milliseconds(),
			ScreenshotElapsed: screenResult.Elapsed,
			ImagePath:         screenResult.ImagePath,
			Resolution:        &screenResult.Resolution,
		}
	}

	var plan *ai.PlanningResult
	switch serviceType {
	case "llm":
		// PlanNextAction performs its own screenshot and bookkeeping.
		planned, planErr := dExt.PlanNextAction(ctx, prompt, opts...)
		if planErr != nil {
			failed := newResult()
			failed.Error = planErr.Error()
			return failed, errors.Wrap(planErr, "get next action failed")
		}
		plan = &planned.PlanningResult
	default:
		// Call the service's Plan directly with the provided screenshot.
		imagePart := schema.ChatMessagePart{
			Type: schema.ChatMessagePartTypeImageURL,
			ImageURL: &schema.ChatMessageImageURL{
				URL: screenResult.Base64,
			},
		}
		request := &ai.PlanningOptions{
			UserInstruction: prompt,
			Message: &schema.Message{
				Role:         schema.User,
				MultiContent: []schema.ChatMessagePart{imagePart},
			},
			Size: screenResult.Resolution,
		}

		var planErr error
		plan, planErr = service.Plan(ctx, request)
		if planErr != nil {
			failed := newResult()
			failed.Error = planErr.Error()
			return failed, errors.Wrap(planErr, fmt.Sprintf("%s service planning failed", serviceType))
		}
	}

	outcome := newResult()
	outcome.PlanningResult = plan

	// Run the planned tool calls sequentially, aborting on the first failure.
	for _, call := range plan.ToolCalls {
		callErr := dExt.invokeToolCall(ctx, call, opts...)
		if callErr == nil {
			continue
		}
		log.Error().Err(callErr).
			Str("action", call.Function.Name).
			Msg("invoke tool call failed")
		outcome.Error = callErr.Error()
		return outcome, errors.Wrap(callErr, "invoke tool call failed")
	}

	return outcome, nil
}

// AIAssert takes a screenshot and checks assertion against it through an AI
// service. WingsService is preferred when it is set; otherwise LLMService is
// used. An error is returned when the screenshot fails or when neither
// service is initialized.
func (dExt *XTDriver) AIAssert(assertion string, opts ...option.ActionOption) (*AIExecutionResult, error) {
	log.Info().Str("assertion", assertion).Msg("performing AI assertion")

	// Capture the screen (with base64 payload) before choosing a service.
	shot, shotErr := dExt.GetScreenResult(
		option.WithScreenShotFileName("ai_assert"),
		option.WithScreenShotBase64(true),
	)
	if shotErr != nil {
		return nil, shotErr
	}

	switch {
	case dExt.WingsService != nil:
		// Wings has priority whenever it is configured.
		log.Info().Msg("using Wings service for AI assertion")
		return dExt.executeAIAssert(assertion, shot, dExt.WingsService, "wings", opts...)
	case dExt.LLMService != nil:
		log.Info().Msg("using LLM service for AI assertion")
		return dExt.executeAIAssert(assertion, shot, dExt.LLMService, "llm", opts...)
	default:
		return nil, errors.New("neither Wings service nor LLM service is initialized")
	}
}

// executeAIAssert evaluates assertion against the screenshot in screenResult
// using service.Assert on a fresh background context (enriched through
// addDeviceContextForWings when serviceType is "wings").
//
// When the service call fails, the result carries the error text and the
// wrapped error is returned. When the call succeeds but the assertion does not
// pass, the result's Error is set to the service's Thought and the returned
// error is nil.
func (dExt *XTDriver) executeAIAssert(assertion string, screenResult *ScreenResult, service ai.ILLMService, serviceType string, opts ...option.ActionOption) (*AIExecutionResult, error) {
	ctx := context.Background()
	if serviceType == "wings" {
		ctx = dExt.addDeviceContextForWings(ctx)
	}

	// Time the service call, including request construction.
	callStart := time.Now()
	verdict, callErr := service.Assert(ctx, &ai.AssertOptions{
		Assertion:  assertion,
		Screenshot: screenResult.Base64,
		Size:       screenResult.Resolution,
	})

	outcome := &AIExecutionResult{
		Type:              "assert",
		ModelCallElapsed:  time.Since(callStart).Milliseconds(),
		ScreenshotElapsed: screenResult.Elapsed,
		ImagePath:         screenResult.ImagePath,
		Resolution:        &screenResult.Resolution,
		AssertionResult:   verdict,
	}

	switch {
	case callErr != nil:
		outcome.Error = callErr.Error()
		return outcome, errors.Wrap(callErr, fmt.Sprintf("%s assertion failed", serviceType))
	case !verdict.Pass:
		// A failed assertion is reported through the result, not the error.
		outcome.Error = verdict.Thought
	}

	return outcome, nil
}

// addDeviceContextForWings returns a context derived from ctx that carries the
// device UUID under the key "device_id" and the platform name under the key
// "platform_type". The platform is "ios" for *IOSDevice, "harmony" for
// *HarmonyDevice and "android" for everything else. ctx is returned unchanged
// when the driver has no device.
func (dExt *XTDriver) addDeviceContextForWings(ctx context.Context) context.Context {
	dev := dExt.GetDevice()
	if dev == nil {
		return ctx
	}

	withID := context.WithValue(ctx, "device_id", dev.UUID())

	var platform string
	switch dev.(type) {
	case *IOSDevice:
		platform = "ios"
	case *HarmonyDevice:
		platform = "harmony"
	default:
		// *AndroidDevice and any unrecognised device type.
		platform = "android"
	}

	return context.WithValue(withID, "platform_type", platform)
}

// PlanNextAction takes a screenshot and asks LLMService to plan the next
// action for prompt. The ResetHistory action option is forwarded to the
// planner.
//
// The returned PlanningExecutionResult embeds the planner's result together
// with screenshot and model-call timing; StartTime, Elapsed and SubActions are
// left at their zero values for the caller to fill in. An error is returned
// when LLMService is not set, the screenshot fails, or the planner fails.
func (dExt *XTDriver) PlanNextAction(ctx context.Context, prompt string, opts ...option.ActionOption) (*PlanningExecutionResult, error) {
	if dExt.LLMService == nil {
		return nil, errors.New("LLM service is not initialized")
	}

	actionOpts := option.NewActionOptions(opts...)

	shot, shotErr := dExt.GetScreenResult(
		option.WithScreenShotFileName("ai_planning"),
		option.WithScreenShotBase64(true),
	)
	if shotErr != nil {
		return nil, shotErr
	}

	// Drop the session data gathered so far so the planning screenshot is not
	// attributed to the sub-actions that follow; it stays available through
	// the ScreenResult field of the returned value.
	dExt.GetSession().GetData(true)

	// Ask the model for the next action and time the call.
	callStart := time.Now()
	imagePart := schema.ChatMessagePart{
		Type: schema.ChatMessagePartTypeImageURL,
		ImageURL: &schema.ChatMessageImageURL{
			URL: shot.Base64,
		},
	}
	plan, planErr := dExt.LLMService.Plan(ctx, &ai.PlanningOptions{
		UserInstruction: prompt,
		Message: &schema.Message{
			Role:         schema.User,
			MultiContent: []schema.ChatMessagePart{imagePart},
		},
		Size:         shot.Resolution,
		ResetHistory: actionOpts.ResetHistory,
	})
	callElapsed := time.Since(callStart).Milliseconds()
	if planErr != nil {
		return nil, errors.Wrap(planErr, "failed to get next action from planner")
	}

	// Collect the names of the planned tool calls for reporting.
	names := make([]string, 0, len(plan.ToolCalls))
	for _, call := range plan.ToolCalls {
		names = append(names, call.Function.Name)
	}

	return &PlanningExecutionResult{
		PlanningResult:    *plan,
		ScreenshotElapsed: shot.Elapsed,
		ImagePath:         shot.ImagePath,
		Resolution:        &shot.Resolution,
		ScreenResult:      shot,
		ModelCallElapsed:  callElapsed,
		ToolCallsCount:    len(plan.ToolCalls),
		ActionNames:       names,
	}, nil
}

// isTaskFinished reports whether planningResult signals the end of the task:
// either it contains no tool calls at all, or one of its tool calls is named
// "uixt__finished".
func (dExt *XTDriver) isTaskFinished(planningResult *PlanningExecutionResult) bool {
	calls := planningResult.ToolCalls

	// Nothing left to execute.
	if len(calls) == 0 {
		log.Info().Msg("no tool calls returned, task may be finished")
		return true
	}

	// Look for an explicit "finished" action among the planned calls.
	for i := range calls {
		fn := calls[i].Function
		if fn.Name != "uixt__finished" {
			continue
		}
		log.Info().Str("reason", fn.Arguments).Msg("finished action detected")
		return true
	}

	return false
}

// invokeToolCall executes a single planned tool call through the driver's
// client.
//
// toolCall.Function.Arguments is decoded as a JSON object into the argument
// map. An empty string or a JSON null is treated as "no arguments" so that the
// request is always built from a non-nil map. The action options in opts are
// passed along via option.MobileAction to BuildMCPCallToolRequest.
//
// It returns an error when the arguments are not valid JSON for a map or when
// the client's CallTool fails; the tool result itself is discarded.
func (dExt *XTDriver) invokeToolCall(ctx context.Context, toolCall schema.ToolCall, opts ...option.ActionOption) error {
	// Parse arguments
	arguments := make(map[string]interface{})
	if raw := toolCall.Function.Arguments; raw != "" {
		if err := json.Unmarshal([]byte(raw), &arguments); err != nil {
			return errors.Wrapf(err, "parse arguments of tool call %s", toolCall.Function.Name)
		}
		// json.Unmarshal sets the map to nil for a JSON null; restore an empty
		// map so downstream code never receives a nil map.
		if arguments == nil {
			arguments = make(map[string]interface{})
		}
	}

	// Create a MobileAction with options to reuse BuildMCPCallToolRequest
	action := option.MobileAction{
		Options: option.NewActionOptions(opts...),
	}

	req := BuildMCPCallToolRequest(
		option.ActionName(toolCall.Function.Name),
		arguments,
		action,
	)
	if _, err := dExt.client.CallTool(ctx, req); err != nil {
		return err
	}

	return nil
}

// PlanningExecutionResult describes one planning cycle: the planner output
// (embedded ai.PlanningResult), how the planning was produced (screenshot and
// model-call timing), and what happened when its tool calls were executed.
// PlanNextAction fills the planning fields; StartToGoal fills StartTime,
// Elapsed and SubActions.
type PlanningExecutionResult struct {
	ai.PlanningResult // Inherit all fields from ai.PlanningResult (ToolCalls, Thought, Content, Error, ModelName)
	// Planning process information
	ScreenshotElapsed int64         `json:"screenshot_elapsed_ms"` // screenshot elapsed time(ms)
	ImagePath         string        `json:"image_path"`            // screenshot image path
	Resolution        *types.Size   `json:"resolution"`            // image resolution
	ScreenResult      *ScreenResult `json:"screen_result"`         // complete screen result data
	ModelCallElapsed  int64         `json:"model_call_elapsed_ms"` // model call elapsed time(ms)
	ToolCallsCount    int           `json:"tool_calls_count"`      // number of tool calls generated
	ActionNames       []string      `json:"action_names"`          // names of parsed actions
	// Execution information
	StartTime  int64              `json:"start_time"`            // planning start time (Unix milliseconds)
	Elapsed    int64              `json:"elapsed_ms"`            // planning elapsed time(ms)
	SubActions []*SubActionResult `json:"sub_actions,omitempty"` // sub-actions generated from this planning
}

// AIExecutionResult is the common result of the single-shot AI operations
// AIQuery, AIAction and AIAssert. Type tells which of the operation-specific
// result fields is populated.
type AIExecutionResult struct {
	Type              string      `json:"type"`               // operation type: "query", "action", "assert"
	ModelCallElapsed  int64       `json:"model_call_elapsed"` // model call elapsed time in milliseconds
	ScreenshotElapsed int64       `json:"screenshot_elapsed"` // screenshot elapsed time in milliseconds
	ImagePath         string      `json:"image_path"`         // path to screenshot used for operation
	Resolution        *types.Size `json:"resolution"`         // screen resolution

	// Operation-specific results (only one will be populated based on Type)
	QueryResult     *ai.QueryResult     `json:"query_result,omitempty"`     // for ai_query operations
	PlanningResult  *ai.PlanningResult  `json:"planning_result,omitempty"`  // for ai_action operations
	AssertionResult *ai.AssertionResult `json:"assertion_result,omitempty"` // for ai_assert operations

	// Common fields
	Error string `json:"error,omitempty"` // error message if operation failed
}

// SubActionResult records the execution of one tool call issued by a planning
// cycle in StartToGoal, including the session data collected while it ran.
type SubActionResult struct {
	ActionName string      `json:"action_name"`         // name of the sub-action (e.g., "tap", "input")
	Arguments  interface{} `json:"arguments,omitempty"` // arguments passed to the sub-action
	StartTime  int64       `json:"start_time"`          // sub-action start time (Unix milliseconds)
	Elapsed    int64       `json:"elapsed_ms"`          // sub-action elapsed time(ms)
	Error      string      `json:"error,omitempty"`     // error message if the sub-action failed
	SessionData            // session data taken (and reset) right after the sub-action
}

// SessionData groups the driver requests and screen results captured from the
// driver session; it is embedded in SubActionResult.
type SessionData struct {
	Requests      []*DriverRequests `json:"requests,omitempty"`       // store sub-action specific requests
	ScreenResults []*ScreenResult   `json:"screen_results,omitempty"` // store sub-action specific screen_results
}

// AIQuery takes a screenshot and asks LLMService to answer text about it. The
// OutputSchema action option, when provided, is forwarded to the query.
//
// The call runs on a background context. A nil result is returned together
// with the error when LLMService is not set, the screenshot fails, or the
// query fails.
func (dExt *XTDriver) AIQuery(text string, opts ...option.ActionOption) (*AIExecutionResult, error) {
	if dExt.LLMService == nil {
		return nil, errors.New("LLM service is not initialized")
	}

	shot, shotErr := dExt.GetScreenResult(
		option.WithScreenShotFileName("ai_query"),
		option.WithScreenShotBase64(true),
	)
	if shotErr != nil {
		return nil, shotErr
	}

	// Only OutputSchema is read from the action options here.
	actionOpts := option.NewActionOptions(opts...)

	// Run the query and time the model call.
	callStart := time.Now()
	answer, queryErr := dExt.LLMService.Query(context.Background(), &ai.QueryOptions{
		Query:        text,
		Screenshot:   shot.Base64,
		Size:         shot.Resolution,
		OutputSchema: actionOpts.OutputSchema,
	})
	callElapsed := time.Since(callStart).Milliseconds()
	if queryErr != nil {
		return nil, errors.Wrap(queryErr, "AI query failed")
	}

	return &AIExecutionResult{
		Type:              "query",
		ModelCallElapsed:  callElapsed,
		ScreenshotElapsed: shot.Elapsed,
		ImagePath:         shot.ImagePath,
		Resolution:        &shot.Resolution,
		QueryResult:       answer,
	}, nil
}