feat: unify AI action handling with detailed execution results and enhanced UI integration

2026-05-07 08:02:42 +08:00 · 2025-06-24 13:22:00 +08:00
parent fc32b5d874
commit 8fc8d06604
8 changed files with 156 additions and 146 deletions
--- a/internal/version/VERSION
+++ b/internal/version/VERSION
@@ -1 +1 @@
-v5.0.0-beta-2506241150
+v5.0.0-beta-2506241342
--- a/report.go
+++ b/report.go
@@ -491,6 +491,12 @@ func (g *HTMLReportGenerator) GenerateReport(outputFile string) error {
 		"add":   func(a, b int) int { return a + b },
 		"base":  filepath.Base,
 		"index": func(m map[string]any, key string) any { return m[key] },
+		"title": func(s string) string {
+			if s == "" {
+				return ""
+			}
+			return strings.ToUpper(string([]rune(s)[:1])) + string([]rune(s)[1:]) // slice by rune, not byte, so a multi-byte first character is not split
+		},
 		"extractThought": func(content string) string {
 			if content == "" {
 				return ""
@@ -2379,34 +2385,34 @@ const htmlTemplate = `<!DOCTYPE html>
                                    {{end}}
                                {{end}}

-                                {{/* Enhanced AI Query Display - using QueryResult data structure */}}
-                                {{if eq $action.Method "ai_query"}}
-                                {{if $action.QueryResult}}
+                                {{/* Enhanced AI Operations Display - using unified AIResult data structure */}}
+                                {{if or (eq $action.Method "ai_query") (eq $action.Method "ai_action") (eq $action.Method "ai_assert")}}
+                                {{if $action.AIResult}}
                                <div class="sub-action-item">
                                    <div class="validator-ai-content">
                                        <!-- Display AI Thought -->
-                                        {{if $action.QueryResult.Thought}}
-                                        <div class="thought">{{$action.QueryResult.Thought}}</div>
+                                        {{if $action.AIResult.Thought}}
+                                        <div class="thought">{{$action.AIResult.Thought}}</div>
                                        {{end}}

-                                        <!-- AI Query Layout: Screenshot left, Analysis right -->
+                                        <!-- AI Operation Layout: Screenshot left, Analysis right -->
                                        <div class="validator-ai-layout">
                                            <!-- Left column: Screenshot -->
-                                            {{if $action.QueryResult.ImagePath}}
+                                            {{if $action.AIResult.ImagePath}}
                                            <div class="validator-column-screenshot">
                                                <div class="validator-step-compact">
                                                    <div class="step-header-compact">
-                                                        <span class="step-name">📸 Query Screenshot</span>
-                                                        {{if $action.QueryResult.ScreenshotElapsed}}
-                                                        <span class="duration">{{formatDuration $action.QueryResult.ScreenshotElapsed}}</span>
+                                                        <span class="step-name">📸 {{title $action.AIResult.Type}} Screenshot</span>
+                                                        {{if $action.AIResult.ScreenshotElapsed}}
+                                                        <span class="duration">{{formatDuration $action.AIResult.ScreenshotElapsed}}</span>
                                                        {{end}}
                                                    </div>
                                                    <div class="screenshot-display">
-                                                        {{$base64Image := encodeImageBase64 $action.QueryResult.ImagePath}}
+                                                        {{$base64Image := encodeImageBase64 $action.AIResult.ImagePath}}
                                                        {{if $base64Image}}
                                                        <div class="screenshot-item-compact">
                                                            <div class="screenshot-image">
-                                                                <img src="data:image/jpeg;base64,{{$base64Image}}" alt="AI Query Screenshot" onclick="openImageModal(this.src)" />
+                                                                <img src="data:image/jpeg;base64,{{$base64Image}}" alt="AI {{title $action.AIResult.Type}} Screenshot" onclick="openImageModal(this.src)" />
                                                            </div>
                                                        </div>
                                                        {{end}}
@@ -2415,30 +2421,27 @@ const htmlTemplate = `<!DOCTYPE html>
                                            </div>
                                            {{end}}

-                                            <!-- Right column: AI Query Analysis -->
+                                            <!-- Right column: AI Analysis -->
                                            <div class="validator-column-analysis">
                                                <div class="validator-step-compact">
                                                    <div class="step-header-compact">
-                                                        <span class="step-name">🤖 AI Query Analysis</span>
-                                                        {{if $action.QueryResult.ModelCallElapsed}}
-                                                        <span class="duration">{{formatDuration $action.QueryResult.ModelCallElapsed}}</span>
+                                                        <span class="step-name">🤖 AI {{title $action.AIResult.Type}} Analysis</span>
+                                                        {{if $action.AIResult.ModelCallElapsed}}
+                                                        <span class="duration">{{formatDuration $action.AIResult.ModelCallElapsed}}</span>
                                                        {{end}}
                                                    </div>
                                                    <div class="validator-ai-details">
-                                                        {{if $action.QueryResult.ModelName}}
-                                                        <div class="model-info">🤖 Model: {{$action.QueryResult.ModelName}}</div>
+                                                        {{if $action.AIResult.ModelName}}
+                                                        <div class="model-info">🤖 Model: {{$action.AIResult.ModelName}}</div>
                                                        {{end}}
-                                                        {{if $action.QueryResult.Resolution}}
-                                                        <div class="model-info">📐 Resolution: {{$action.QueryResult.Resolution.Width}}x{{$action.QueryResult.Resolution.Height}}</div>
+                                                        {{if $action.AIResult.Resolution}}
+                                                        <div class="model-info">📐 Resolution: {{$action.AIResult.Resolution.Width}}x{{$action.AIResult.Resolution.Height}}</div>
                                                        {{end}}
-                                                        {{if $action.QueryResult.Usage}}
-                                                        <div class="usage-info">📊 Tokens: {{$action.QueryResult.Usage.PromptTokens}} in / {{$action.QueryResult.Usage.CompletionTokens}} out / {{$action.QueryResult.Usage.TotalTokens}} total</div>
+                                                        {{if $action.AIResult.Usage}}
+                                                        <div class="usage-info">📊 Tokens: {{$action.AIResult.Usage.PromptTokens}} in / {{$action.AIResult.Usage.CompletionTokens}} out / {{$action.AIResult.Usage.TotalTokens}} total</div>
                                                        {{end}}
-                                                        {{if $action.QueryResult.Content}}
-                                                        <div class="model-info">💬 Query Result: {{$action.QueryResult.Content}}</div>
-                                                        {{end}}
-                                                        {{if $action.QueryResult.Error}}
-                                                        <div class="model-info" style="color: #dc3545;">❌ Error: {{$action.QueryResult.Error}}</div>
+                                                        {{if $action.AIResult.Content}}
+                                                        <div class="model-info">💬 {{title $action.AIResult.Type}} Result: {{$action.AIResult.Content}}</div>
                                                        {{end}}
                                                    </div>
                                                </div>
@@ -2531,84 +2534,6 @@ const htmlTemplate = `<!DOCTYPE html>
                                {{if and $validator.msg (ne $validator.check_result "pass")}}
                                    <div class="validator-message">{{$validator.msg}}</div>
                                {{end}}
-
-                                <!-- Enhanced AI Validator Display -->
-                                {{if or (eq $validator.check "ui_ai") (eq $validator.assert "ai_assert")}}
-                                <div class="validator-ai-content">
-                                    <!-- Extract AI validation details from step logs -->
-                                    {{$stepLogs := getStepLogs $step}}
-                                    {{$validationThought := ""}}
-                                    {{$validationModel := ""}}
-                                    {{$validationUsage := ""}}
-                                    {{$validationScreenshot := ""}}
-                                    {{range $logEntry := $stepLogs}}
-                                        {{if and (eq $logEntry.Message "log response message") (index $logEntry.Fields "content")}}
-                                            {{$content := index $logEntry.Fields "content"}}
-                                            {{if $content}}
-                                                {{$validationThought = $content}}
-                                            {{end}}
-                                        {{end}}
-                                        {{if and (eq $logEntry.Message "call model service for assertion") (index $logEntry.Fields "model")}}
-                                            {{$validationModel = index $logEntry.Fields "model"}}
-                                        {{end}}
-                                        {{if and (eq $logEntry.Message "usage statistics") (index $logEntry.Fields "input_tokens")}}
-                                            {{$inputTokens := index $logEntry.Fields "input_tokens"}}
-                                            {{$outputTokens := index $logEntry.Fields "output_tokens"}}
-                                            {{$totalTokens := index $logEntry.Fields "total_tokens"}}
-                                            {{$validationUsage = printf "📊 Tokens: %v in / %v out / %v total" $inputTokens $outputTokens $totalTokens}}
-                                        {{end}}
-                                        {{if and (eq $logEntry.Message "log screenshot") (index $logEntry.Fields "imagePath")}}
-                                            {{$validationScreenshot = index $logEntry.Fields "imagePath"}}
-                                        {{end}}
-                                    {{end}}
-
-                                    <!-- Display AI Thought at the top, same as planning -->
-                                    {{if $validationThought}}
-                                    <div class="thought">{{extractThought $validationThought}}</div>
-                                    {{end}}
-
-                                    <!-- AI Validation Layout - similar to planning layout -->
-                                    <div class="validator-ai-layout">
-                                        <!-- Left column: Screenshot -->
-                                        {{if $validationScreenshot}}
-                                        <div class="validator-column-screenshot">
-                                            <div class="validator-step-compact">
-                                                <div class="step-header-compact">
-                                                    <span class="step-name">📸 Validation Screenshot</span>
-                                                </div>
-                                                <div class="screenshot-display">
-                                                    {{$base64Image := encodeImageBase64 $validationScreenshot}}
-                                                    {{if $base64Image}}
-                                                    <div class="screenshot-item-compact">
-                                                        <div class="screenshot-image">
-                                                            <img src="data:image/jpeg;base64,{{$base64Image}}" alt="Validation Screenshot" onclick="openImageModal(this.src)" />
-                                                        </div>
-                                                    </div>
-                                                    {{end}}
-                                                </div>
-                                            </div>
-                                        </div>
-                                        {{end}}
-
-                                        <!-- Right column: AI Analysis -->
-                                        <div class="validator-column-analysis">
-                                            <div class="validator-step-compact">
-                                                <div class="step-header-compact">
-                                                    <span class="step-name">🤖 AI Analysis</span>
-                                                </div>
-                                                <div class="validator-ai-details">
-                                                    {{if $validationModel}}
-                                                    <div class="model-info">🤖 Model: {{$validationModel}}</div>
-                                                    {{end}}
-                                                    {{if $validationUsage}}
-                                                    <div class="usage-info">{{$validationUsage}}</div>
-                                                    {{end}}
-                                                </div>
-                                            </div>
-                                        </div>
-                                    </div>
-                                </div>
-                                {{end}}
                            </div>
                            {{end}}
                        </div>
--- a/step.go
+++ b/step.go
@@ -60,11 +60,11 @@ type TStep struct {
 // one step contains one or multiple actions
 type ActionResult struct {
 	option.MobileAction `json:",inline"`
-	StartTime           int64                           `json:"start_time"`             // action start time in millisecond(ms)
-	Elapsed             int64                           `json:"elapsed_ms"`             // action elapsed time(ms)
-	Error               string                          `json:"error,omitempty"`        // action execution result
-	Plannings           []*uixt.PlanningExecutionResult `json:"plannings,omitempty"`    // store planning results for start_to_goal actions, which contains multiple sub-actions
-	QueryResult         *uixt.QueryExecutionResult      `json:"query_result,omitempty"` // store query result for ai_query actions
+	StartTime           int64                           `json:"start_time"`          // action start time in millisecond(ms)
+	Elapsed             int64                           `json:"elapsed_ms"`          // action elapsed time(ms)
+	Error               string                          `json:"error,omitempty"`     // action execution result
+	Plannings           []*uixt.PlanningExecutionResult `json:"plannings,omitempty"` // store planning results for start_to_goal actions, which contains multiple sub-actions
+	AIResult            *uixt.AIExecutionResult         `json:"ai_result,omitempty"` // store unified AI execution result for ai_query/ai_action/ai_assert actions
 	uixt.SessionData                                    // store session data for other actions besides start_to_goal & ai_query
 }

--- a/step_ui.go
+++ b/step_ui.go
@@ -935,7 +935,7 @@ func runStepMobileUI(s *SessionRunner, step IStep) (stepResult *StepResult, err
 				}
 			}()

-			// handle start_to_goal action
+			// handle start_to_goal AI action
 			if action.Method == option.ACTION_StartToGoal {
 				planningResults, err := uiDriver.StartToGoal(ctx,
 					action.Params.(string), action.GetOptions()...)
@@ -952,12 +952,23 @@ func runStepMobileUI(s *SessionRunner, step IStep) (stepResult *StepResult, err
 				continue
 			}

-			// handle ai_query action
-			if action.Method == option.ACTION_Query {
-				queryResult, err := uiDriver.AIQuery(
-					action.Params.(string), action.GetOptions()...)
+			// handle AI operations (ai_action, ai_query, ai_assert) with unified result storage
+			if action.Method == option.ACTION_AIAction || action.Method == option.ACTION_Query || action.Method == option.ACTION_AIAssert {
+				var aiResult *uixt.AIExecutionResult
+				var err error
+
+				prompt := action.Params.(string)
+				switch action.Method {
+				case option.ACTION_AIAction:
+					aiResult, err = uiDriver.AIAction(ctx, prompt, action.GetOptions()...)
+				case option.ACTION_Query:
+					aiResult, err = uiDriver.AIQuery(prompt, action.GetOptions()...)
+				case option.ACTION_AIAssert:
+					aiResult, err = uiDriver.AIAssert(prompt, action.GetOptions()...)
+				}
+
 				actionResult.Elapsed = time.Since(actionStartTime).Milliseconds()
-				actionResult.QueryResult = queryResult
+				actionResult.AIResult = aiResult
 				stepResult.Actions = append(stepResult.Actions, actionResult)
 				if err != nil {
 					actionResult.Error = err.Error()
@@ -969,7 +980,7 @@ func runStepMobileUI(s *SessionRunner, step IStep) (stepResult *StepResult, err
 				continue
 			}

-			// handle other actions
+			// handle other non-AI actions
 			sessionData, err := uiDriver.ExecuteAction(ctx, action)
 			actionResult.Elapsed = time.Since(actionStartTime).Milliseconds()
 			actionResult.SessionData = sessionData
--- a/uixt/driver_ext_ai.go
+++ b/uixt/driver_ext_ai.go
@@ -131,24 +131,52 @@ func (dExt *XTDriver) StartToGoal(ctx context.Context, prompt string, opts ...op
 	}
 }

-func (dExt *XTDriver) AIAction(ctx context.Context, prompt string, opts ...option.ActionOption) error {
+// AIAction takes a screenshot, plans the next action for prompt, invokes the planned tool calls and returns the collected execution result.
+func (dExt *XTDriver) AIAction(ctx context.Context, prompt string, opts ...option.ActionOption) (*AIExecutionResult, error) {
 	log.Info().Str("prompt", prompt).Msg("performing AI action")
 
-	// plan next action
-	planningResult, err := dExt.PlanNextAction(ctx, prompt, opts...)
+	// Step 1: Take screenshot and measure time
+	screenshotStartTime := time.Now()
+	screenResult, err := dExt.createScreenshotWithSession(
+		option.WithScreenShotFileName(builtin.GenNameWithTimestamp("%d_screenshot")),
+	)
+	screenshotElapsed := time.Since(screenshotStartTime).Milliseconds()
 	if err != nil {
-		return err
+		return nil, err
 	}
 
-	// Invoke tool calls
+	// Step 2: Plan next action and measure time
+	modelCallStartTime := time.Now()
+	planningResult, err := dExt.PlanNextAction(ctx, prompt, opts...)
+	aiExecutionResult := &AIExecutionResult{
+		Type:              "action",
+		ModelCallElapsed:  time.Since(modelCallStartTime).Milliseconds(),
+		ScreenshotElapsed: screenshotElapsed,
+		ImagePath:         screenResult.ImagePath,
+		Resolution:        &screenResult.Resolution,
+	}
+	// Check err before reading planningResult: the planning result may be
+	// nil when planning fails, so it must not be dereferenced until then.
+	if err != nil {
+		aiExecutionResult.Error = err.Error()
+		return aiExecutionResult, errors.Wrap(err, "get next action failed")
+	}
+	aiExecutionResult.ModelName = planningResult.ModelName
+	aiExecutionResult.Usage = planningResult.Usage
+	aiExecutionResult.PlanningResult = &planningResult.PlanningResult
+	aiExecutionResult.Thought = planningResult.Thought
+	aiExecutionResult.Content = planningResult.Content
+
+	// Step 3: Execute tool calls
 	for _, toolCall := range planningResult.ToolCalls {
 		err = dExt.invokeToolCall(ctx, toolCall)
 		if err != nil {
-			return err
+			aiExecutionResult.Error = err.Error()
+			return aiExecutionResult, errors.Wrap(err, "invoke tool call failed")
 		}
 	}
 
-	return nil
+	return aiExecutionResult, nil
 }

 // PlanNextAction performs planning and returns unified planning information
@@ -301,15 +329,25 @@ type PlanningExecutionResult struct {
 	SubActions []*SubActionResult `json:"sub_actions,omitempty"` // sub-actions generated from this planning
 }

-// QueryExecutionResult contains the result of AI query execution with timing and metadata
-type QueryExecutionResult struct {
-	ai.QueryResult                       // inherit from ai.QueryResult
+// AIExecutionResult represents a unified result structure for all AI operations
+type AIExecutionResult struct {
+	Type              string             `json:"type"`               // operation type: "query", "action", "assert"
 	ModelCallElapsed  int64              `json:"model_call_elapsed"` // model call elapsed time in milliseconds
 	ScreenshotElapsed int64              `json:"screenshot_elapsed"` // screenshot elapsed time in milliseconds
-	ImagePath         string             `json:"image_path"`         // path to screenshot used for query
+	ImagePath         string             `json:"image_path"`         // path to screenshot used for operation
 	Resolution        *types.Size        `json:"resolution"`         // screen resolution
-	ModelName         string             `json:"model_name"`         // model name used for query
+	ModelName         string             `json:"model_name"`         // model name used for operation
 	Usage             *schema.TokenUsage `json:"usage,omitempty"`    // token usage statistics
+
+	// Operation-specific results (only one will be populated based on Type)
+	QueryResult     *ai.QueryResult     `json:"query_result,omitempty"`     // for ai_query operations
+	PlanningResult  *ai.PlanningResult  `json:"planning_result,omitempty"`  // for ai_action operations
+	AssertionResult *ai.AssertionResult `json:"assertion_result,omitempty"` // for ai_assert operations
+
+	// Common fields
+	Thought string `json:"thought,omitempty"` // AI reasoning/thought process
+	Content string `json:"content,omitempty"` // operation result content
+	Error   string `json:"error,omitempty"`   // error message if operation failed
 }

 // SubActionResult represents a sub-action within a start_to_goal action
@@ -327,7 +365,7 @@ type SessionData struct {
 	ScreenResults []*ScreenResult   `json:"screen_results,omitempty"` // store sub-action specific screen_results
 }

-func (dExt *XTDriver) AIQuery(text string, opts ...option.ActionOption) (*QueryExecutionResult, error) {
+func (dExt *XTDriver) AIQuery(text string, opts ...option.ActionOption) (*AIExecutionResult, error) {
 	if dExt.LLMService == nil {
 		return nil, errors.New("LLM service is not initialized")
 	}
@@ -366,41 +404,77 @@ func (dExt *XTDriver) AIQuery(text string, opts ...option.ActionOption) (*QueryE
 		return nil, errors.Wrap(err, "AI query failed")
 	}

-	// Create QueryExecutionResult with all timing and metadata
-	queryExecResult := &QueryExecutionResult{
-		QueryResult:       *result,                  // inherit from ai.QueryResult
+	// Create AIExecutionResult with all timing and metadata
+	aiResult := &AIExecutionResult{
+		Type:              "query",
 		ModelCallElapsed:  modelCallElapsed,         // model call timing
 		ScreenshotElapsed: screenshotElapsed,        // screenshot timing
 		ImagePath:         screenResult.ImagePath,   // screenshot path
 		Resolution:        &screenResult.Resolution, // screen resolution
+		QueryResult:       result,                   // query-specific result
+		Thought:           result.Thought,           // AI reasoning
+		Content:           result.Content,           // query result content
 	}
-	return queryExecResult, nil
+	return aiResult, nil
 }

-func (dExt *XTDriver) AIAssert(assertion string, opts ...option.ActionOption) error {
+// AIAssert takes a screenshot, asks the LLM service to check assertion against it and returns the collected execution result; a failed assertion is reported as an error.
+func (dExt *XTDriver) AIAssert(assertion string, opts ...option.ActionOption) (*AIExecutionResult, error) {
 	if dExt.LLMService == nil {
-		return errors.New("LLM service is not initialized")
+		return nil, errors.New("LLM service is not initialized")
+	}
+
+	// Step 1: Take screenshot and measure time
+	screenshotStartTime := time.Now()
+	screenResult, err := dExt.createScreenshotWithSession(
+		option.WithScreenShotFileName(builtin.GenNameWithTimestamp("%d_screenshot")),
+	)
+	screenshotElapsed := time.Since(screenshotStartTime).Milliseconds()
+	if err != nil {
+		return nil, err
 	}
 
 	screenShotBase64, size, err := dExt.GetScreenshotBase64WithSize()
 	if err != nil {
-		return err
+		return &AIExecutionResult{
+			Type:              "assert",
+			ScreenshotElapsed: screenshotElapsed,
+			ImagePath:         screenResult.ImagePath,
+			Resolution:        &screenResult.Resolution,
+			Error:             err.Error(),
+		}, err
 	}
 
-	// execute assertion
+	// Step 2: Call model and measure time
+	modelCallStartTime := time.Now()
 	assertOpts := &ai.AssertOptions{
 		Assertion:  assertion,
 		Screenshot: screenShotBase64,
 		Size:       size,
 	}
 	result, err := dExt.LLMService.Assert(context.Background(), assertOpts)
+	modelCallElapsed := time.Since(modelCallStartTime).Milliseconds()
+	aiResult := &AIExecutionResult{
+		Type:              "assert",
+		ModelCallElapsed:  modelCallElapsed,
+		ScreenshotElapsed: screenshotElapsed,
+		ImagePath:         screenResult.ImagePath,
+		Resolution:        &screenResult.Resolution,
+		AssertionResult:   result,
+	}
+	if result != nil { // result may be nil when Assert returns an error; do not dereference it blindly
+		aiResult.Thought = result.Thought
+	}
 	if err != nil {
-		return errors.Wrap(err, "AI assertion failed")
+		aiResult.Error = err.Error()
+		return aiResult, errors.Wrap(err, "AI assertion failed")
 	}
 
 	if !result.Pass {
-		return errors.New(result.Thought)
+		aiResult.Error = result.Thought
+		return aiResult, errors.New(result.Thought)
 	}
 
-	return nil
+	aiResult.Content = "Assertion passed"
+	return aiResult, nil
 }
--- a/uixt/driver_ext_ai_test.go
+++ b/uixt/driver_ext_ai_test.go
@@ -15,10 +15,10 @@ import (

 func TestDriverExt_TapByLLM(t *testing.T) {
 	driver := setupDriverExt(t)
-	err := driver.AIAction(context.Background(), "点击第一个帖子的作者头像")
+	_, err := driver.AIAction(context.Background(), "点击第一个帖子的作者头像")
 	assert.Nil(t, err)

-	err = driver.AIAssert("当前在个人介绍页")
+	_, err = driver.AIAssert("当前在个人介绍页")
 	assert.Nil(t, err)
 }

--- a/uixt/driver_utils.go
+++ b/uixt/driver_utils.go
@@ -195,7 +195,7 @@ func (dExt *XTDriver) DoValidation(check, assert, expected string, message ...st
 	case option.SelectorOCR:
 		err = dExt.assertOCR(expected, assert)
 	case option.SelectorAI:
-		err = dExt.AIAssert(expected)
+		_, err = dExt.AIAssert(expected)
 	case option.SelectorForegroundApp:
 		err = dExt.assertForegroundApp(expected, assert)
 	case option.SelectorSelector:
--- a/uixt/mcp_tools_ai.go
+++ b/uixt/mcp_tools_ai.go
@@ -102,7 +102,7 @@ func (t *ToolAIAction) Implement() server.ToolHandlerFunc {
 		}

 		// AI action logic
-		err = driverExt.AIAction(ctx, unifiedReq.Prompt)
+		_, err = driverExt.AIAction(ctx, unifiedReq.Prompt)
 		if err != nil {
 			return NewMCPErrorResponse(fmt.Sprintf("AI action failed: %s", err.Error())), nil
 		}