feat: unify AI action handling with detailed execution results and enhanced UI integration

This commit is contained in:
lilong.129
2025-06-24 13:22:00 +08:00
parent fc32b5d874
commit 8fc8d06604
8 changed files with 156 additions and 146 deletions

View File

@@ -1 +1 @@
v5.0.0-beta-2506241150
v5.0.0-beta-2506241342

135
report.go
View File

@@ -491,6 +491,12 @@ func (g *HTMLReportGenerator) GenerateReport(outputFile string) error {
"add": func(a, b int) int { return a + b },
"base": filepath.Base,
"index": func(m map[string]any, key string) any { return m[key] },
// title upper-cases the first character of s and leaves the rest untouched.
// The template uses it to turn an AIResult.Type value into a heading word.
"title": func(s string) string {
	if s == "" {
		return ""
	}
	// Split at the first rune boundary rather than the first byte, so a
	// multi-byte leading character is upper-cased whole instead of being
	// cut in half.
	end := len(s)
	for i := range s {
		if i > 0 {
			end = i
			break
		}
	}
	return strings.ToUpper(s[:end]) + s[end:]
},
"extractThought": func(content string) string {
if content == "" {
return ""
@@ -2379,34 +2385,34 @@ const htmlTemplate = `<!DOCTYPE html>
{{end}}
{{end}}
{{/* Enhanced AI Query Display - using QueryResult data structure */}}
{{if eq $action.Method "ai_query"}}
{{if $action.QueryResult}}
{{/* Enhanced AI Operations Display - using unified AIResult data structure */}}
{{if or (eq $action.Method "ai_query") (eq $action.Method "ai_action") (eq $action.Method "ai_assert")}}
{{if $action.AIResult}}
<div class="sub-action-item">
<div class="validator-ai-content">
<!-- Display AI Thought -->
{{if $action.QueryResult.Thought}}
<div class="thought">{{$action.QueryResult.Thought}}</div>
{{if $action.AIResult.Thought}}
<div class="thought">{{$action.AIResult.Thought}}</div>
{{end}}
<!-- AI Query Layout: Screenshot left, Analysis right -->
<!-- AI Operation Layout: Screenshot left, Analysis right -->
<div class="validator-ai-layout">
<!-- Left column: Screenshot -->
{{if $action.QueryResult.ImagePath}}
{{if $action.AIResult.ImagePath}}
<div class="validator-column-screenshot">
<div class="validator-step-compact">
<div class="step-header-compact">
<span class="step-name">📸 Query Screenshot</span>
{{if $action.QueryResult.ScreenshotElapsed}}
<span class="duration">{{formatDuration $action.QueryResult.ScreenshotElapsed}}</span>
<span class="step-name">📸 {{title $action.AIResult.Type}} Screenshot</span>
{{if $action.AIResult.ScreenshotElapsed}}
<span class="duration">{{formatDuration $action.AIResult.ScreenshotElapsed}}</span>
{{end}}
</div>
<div class="screenshot-display">
{{$base64Image := encodeImageBase64 $action.QueryResult.ImagePath}}
{{$base64Image := encodeImageBase64 $action.AIResult.ImagePath}}
{{if $base64Image}}
<div class="screenshot-item-compact">
<div class="screenshot-image">
<img src="data:image/jpeg;base64,{{$base64Image}}" alt="AI Query Screenshot" onclick="openImageModal(this.src)" />
<img src="data:image/jpeg;base64,{{$base64Image}}" alt="AI {{title $action.AIResult.Type}} Screenshot" onclick="openImageModal(this.src)" />
</div>
</div>
{{end}}
@@ -2415,30 +2421,27 @@ const htmlTemplate = `<!DOCTYPE html>
</div>
{{end}}
<!-- Right column: AI Query Analysis -->
<!-- Right column: AI Analysis -->
<div class="validator-column-analysis">
<div class="validator-step-compact">
<div class="step-header-compact">
<span class="step-name">🤖 AI Query Analysis</span>
{{if $action.QueryResult.ModelCallElapsed}}
<span class="duration">{{formatDuration $action.QueryResult.ModelCallElapsed}}</span>
<span class="step-name">🤖 AI {{title $action.AIResult.Type}} Analysis</span>
{{if $action.AIResult.ModelCallElapsed}}
<span class="duration">{{formatDuration $action.AIResult.ModelCallElapsed}}</span>
{{end}}
</div>
<div class="validator-ai-details">
{{if $action.QueryResult.ModelName}}
<div class="model-info">🤖 Model: {{$action.QueryResult.ModelName}}</div>
{{if $action.AIResult.ModelName}}
<div class="model-info">🤖 Model: {{$action.AIResult.ModelName}}</div>
{{end}}
{{if $action.QueryResult.Resolution}}
<div class="model-info">📐 Resolution: {{$action.QueryResult.Resolution.Width}}x{{$action.QueryResult.Resolution.Height}}</div>
{{if $action.AIResult.Resolution}}
<div class="model-info">📐 Resolution: {{$action.AIResult.Resolution.Width}}x{{$action.AIResult.Resolution.Height}}</div>
{{end}}
{{if $action.QueryResult.Usage}}
<div class="usage-info">📊 Tokens: {{$action.QueryResult.Usage.PromptTokens}} in / {{$action.QueryResult.Usage.CompletionTokens}} out / {{$action.QueryResult.Usage.TotalTokens}} total</div>
{{if $action.AIResult.Usage}}
<div class="usage-info">📊 Tokens: {{$action.AIResult.Usage.PromptTokens}} in / {{$action.AIResult.Usage.CompletionTokens}} out / {{$action.AIResult.Usage.TotalTokens}} total</div>
{{end}}
{{if $action.QueryResult.Content}}
<div class="model-info">💬 Query Result: {{$action.QueryResult.Content}}</div>
{{end}}
{{if $action.QueryResult.Error}}
<div class="model-info" style="color: #dc3545;">❌ Error: {{$action.QueryResult.Error}}</div>
{{if $action.AIResult.Content}}
<div class="model-info">💬 {{title $action.AIResult.Type}} Result: {{$action.AIResult.Content}}</div>
{{end}}
</div>
</div>
@@ -2531,84 +2534,6 @@ const htmlTemplate = `<!DOCTYPE html>
{{if and $validator.msg (ne $validator.check_result "pass")}}
<div class="validator-message">{{$validator.msg}}</div>
{{end}}
<!-- Enhanced AI Validator Display -->
{{if or (eq $validator.check "ui_ai") (eq $validator.assert "ai_assert")}}
<div class="validator-ai-content">
<!-- Extract AI validation details from step logs -->
{{$stepLogs := getStepLogs $step}}
{{$validationThought := ""}}
{{$validationModel := ""}}
{{$validationUsage := ""}}
{{$validationScreenshot := ""}}
{{range $logEntry := $stepLogs}}
{{if and (eq $logEntry.Message "log response message") (index $logEntry.Fields "content")}}
{{$content := index $logEntry.Fields "content"}}
{{if $content}}
{{$validationThought = $content}}
{{end}}
{{end}}
{{if and (eq $logEntry.Message "call model service for assertion") (index $logEntry.Fields "model")}}
{{$validationModel = index $logEntry.Fields "model"}}
{{end}}
{{if and (eq $logEntry.Message "usage statistics") (index $logEntry.Fields "input_tokens")}}
{{$inputTokens := index $logEntry.Fields "input_tokens"}}
{{$outputTokens := index $logEntry.Fields "output_tokens"}}
{{$totalTokens := index $logEntry.Fields "total_tokens"}}
{{$validationUsage = printf "📊 Tokens: %v in / %v out / %v total" $inputTokens $outputTokens $totalTokens}}
{{end}}
{{if and (eq $logEntry.Message "log screenshot") (index $logEntry.Fields "imagePath")}}
{{$validationScreenshot = index $logEntry.Fields "imagePath"}}
{{end}}
{{end}}
<!-- Display AI Thought at the top, same as planning -->
{{if $validationThought}}
<div class="thought">{{extractThought $validationThought}}</div>
{{end}}
<!-- AI Validation Layout - similar to planning layout -->
<div class="validator-ai-layout">
<!-- Left column: Screenshot -->
{{if $validationScreenshot}}
<div class="validator-column-screenshot">
<div class="validator-step-compact">
<div class="step-header-compact">
<span class="step-name">📸 Validation Screenshot</span>
</div>
<div class="screenshot-display">
{{$base64Image := encodeImageBase64 $validationScreenshot}}
{{if $base64Image}}
<div class="screenshot-item-compact">
<div class="screenshot-image">
<img src="data:image/jpeg;base64,{{$base64Image}}" alt="Validation Screenshot" onclick="openImageModal(this.src)" />
</div>
</div>
{{end}}
</div>
</div>
</div>
{{end}}
<!-- Right column: AI Analysis -->
<div class="validator-column-analysis">
<div class="validator-step-compact">
<div class="step-header-compact">
<span class="step-name">🤖 AI Analysis</span>
</div>
<div class="validator-ai-details">
{{if $validationModel}}
<div class="model-info">🤖 Model: {{$validationModel}}</div>
{{end}}
{{if $validationUsage}}
<div class="usage-info">{{$validationUsage}}</div>
{{end}}
</div>
</div>
</div>
</div>
</div>
{{end}}
</div>
{{end}}
</div>

10
step.go
View File

@@ -60,11 +60,11 @@ type TStep struct {
// one step contains one or multiple actions
type ActionResult struct {
option.MobileAction `json:",inline"`
StartTime int64 `json:"start_time"` // action start time in millisecond(ms)
Elapsed int64 `json:"elapsed_ms"` // action elapsed time(ms)
Error string `json:"error,omitempty"` // action execution result
Plannings []*uixt.PlanningExecutionResult `json:"plannings,omitempty"` // store planning results for start_to_goal actions, which contains multiple sub-actions
QueryResult *uixt.QueryExecutionResult `json:"query_result,omitempty"` // store query result for ai_query actions
StartTime int64 `json:"start_time"` // action start time in millisecond(ms)
Elapsed int64 `json:"elapsed_ms"` // action elapsed time(ms)
Error string `json:"error,omitempty"` // action execution result
Plannings []*uixt.PlanningExecutionResult `json:"plannings,omitempty"` // store planning results for start_to_goal actions, which contains multiple sub-actions
AIResult *uixt.AIExecutionResult `json:"ai_result,omitempty"` // store unified AI execution result for ai_query/ai_action/ai_assert actions
uixt.SessionData // store session data for non-AI actions, i.e. everything except start_to_goal, ai_query, ai_action and ai_assert
}

View File

@@ -935,7 +935,7 @@ func runStepMobileUI(s *SessionRunner, step IStep) (stepResult *StepResult, err
}
}()
// handle start_to_goal action
// handle start_to_goal AI action
if action.Method == option.ACTION_StartToGoal {
planningResults, err := uiDriver.StartToGoal(ctx,
action.Params.(string), action.GetOptions()...)
@@ -952,12 +952,23 @@ func runStepMobileUI(s *SessionRunner, step IStep) (stepResult *StepResult, err
continue
}
// handle ai_query action
if action.Method == option.ACTION_Query {
queryResult, err := uiDriver.AIQuery(
action.Params.(string), action.GetOptions()...)
// handle AI operations (ai_action, ai_query, ai_assert) with unified result storage
if action.Method == option.ACTION_AIAction || action.Method == option.ACTION_Query || action.Method == option.ACTION_AIAssert {
var aiResult *uixt.AIExecutionResult
var err error
prompt := action.Params.(string)
switch action.Method {
case option.ACTION_AIAction:
aiResult, err = uiDriver.AIAction(ctx, prompt, action.GetOptions()...)
case option.ACTION_Query:
aiResult, err = uiDriver.AIQuery(prompt, action.GetOptions()...)
case option.ACTION_AIAssert:
aiResult, err = uiDriver.AIAssert(prompt, action.GetOptions()...)
}
actionResult.Elapsed = time.Since(actionStartTime).Milliseconds()
actionResult.QueryResult = queryResult
actionResult.AIResult = aiResult
stepResult.Actions = append(stepResult.Actions, actionResult)
if err != nil {
actionResult.Error = err.Error()
@@ -969,7 +980,7 @@ func runStepMobileUI(s *SessionRunner, step IStep) (stepResult *StepResult, err
continue
}
// handle other actions
// handle other non-AI actions
sessionData, err := uiDriver.ExecuteAction(ctx, action)
actionResult.Elapsed = time.Since(actionStartTime).Milliseconds()
actionResult.SessionData = sessionData

View File

@@ -131,24 +131,52 @@ func (dExt *XTDriver) StartToGoal(ctx context.Context, prompt string, opts ...op
}
}
func (dExt *XTDriver) AIAction(ctx context.Context, prompt string, opts ...option.ActionOption) error {
// AIAction performs AI-driven action and returns detailed execution result
func (dExt *XTDriver) AIAction(ctx context.Context, prompt string, opts ...option.ActionOption) (*AIExecutionResult, error) {
log.Info().Str("prompt", prompt).Msg("performing AI action")
// plan next action
planningResult, err := dExt.PlanNextAction(ctx, prompt, opts...)
// Step 1: Take screenshot and measure time
screenshotStartTime := time.Now()
screenResult, err := dExt.createScreenshotWithSession(
option.WithScreenShotFileName(builtin.GenNameWithTimestamp("%d_screenshot")),
)
screenshotElapsed := time.Since(screenshotStartTime).Milliseconds()
if err != nil {
return err
return nil, err
}
// Invoke tool calls
// Step 2: Plan next action and measure time
modelCallStartTime := time.Now()
planningResult, err := dExt.PlanNextAction(ctx, prompt, opts...)
modelCallElapsed := time.Since(modelCallStartTime).Milliseconds()
// Record everything that does not depend on the planning outcome first, so a
// failed planning call can still be reported with its timings and screenshot.
aiExecutionResult := &AIExecutionResult{
	Type:              "action",
	ModelCallElapsed:  modelCallElapsed,
	ScreenshotElapsed: screenshotElapsed,
	ImagePath:         screenResult.ImagePath,
	Resolution:        &screenResult.Resolution,
}
if err != nil {
	// Do not read planningResult on this path: a failed call is not
	// guaranteed to return a usable value, and dereferencing it would
	// panic instead of surfacing the error.
	aiExecutionResult.Error = err.Error()
	return aiExecutionResult, errors.Wrap(err, "get next action failed")
}
// Planning succeeded: copy the model metadata and the planning output.
aiExecutionResult.ModelName = planningResult.ModelName
aiExecutionResult.Usage = planningResult.Usage
aiExecutionResult.PlanningResult = &planningResult.PlanningResult
aiExecutionResult.Thought = planningResult.Thought
aiExecutionResult.Content = planningResult.Content
// Step 3: Execute tool calls
for _, toolCall := range planningResult.ToolCalls {
err = dExt.invokeToolCall(ctx, toolCall)
if err != nil {
return err
aiExecutionResult.Error = err.Error()
return aiExecutionResult, errors.Wrap(err, "invoke tool call failed")
}
}
return nil
return aiExecutionResult, nil
}
// PlanNextAction performs planning and returns unified planning information
@@ -301,15 +329,25 @@ type PlanningExecutionResult struct {
SubActions []*SubActionResult `json:"sub_actions,omitempty"` // sub-actions generated from this planning
}
// QueryExecutionResult contains the result of AI query execution with timing and metadata
type QueryExecutionResult struct {
ai.QueryResult // inherit from ai.QueryResult
// AIExecutionResult represents a unified result structure for all AI operations
type AIExecutionResult struct {
Type string `json:"type"` // operation type: "query", "action", "assert"
ModelCallElapsed int64 `json:"model_call_elapsed"` // model call elapsed time in milliseconds
ScreenshotElapsed int64 `json:"screenshot_elapsed"` // screenshot elapsed time in milliseconds
ImagePath string `json:"image_path"` // path to screenshot used for query
ImagePath string `json:"image_path"` // path to screenshot used for operation
Resolution *types.Size `json:"resolution"` // screen resolution
ModelName string `json:"model_name"` // model name used for query
ModelName string `json:"model_name"` // model name used for operation
Usage *schema.TokenUsage `json:"usage,omitempty"` // token usage statistics
// Operation-specific results (only one will be populated based on Type)
QueryResult *ai.QueryResult `json:"query_result,omitempty"` // for ai_query operations
PlanningResult *ai.PlanningResult `json:"planning_result,omitempty"` // for ai_action operations
AssertionResult *ai.AssertionResult `json:"assertion_result,omitempty"` // for ai_assert operations
// Common fields
Thought string `json:"thought,omitempty"` // AI reasoning/thought process
Content string `json:"content,omitempty"` // operation result content
Error string `json:"error,omitempty"` // error message if operation failed
}
// SubActionResult represents a sub-action within a start_to_goal action
@@ -327,7 +365,7 @@ type SessionData struct {
ScreenResults []*ScreenResult `json:"screen_results,omitempty"` // store sub-action specific screen_results
}
func (dExt *XTDriver) AIQuery(text string, opts ...option.ActionOption) (*QueryExecutionResult, error) {
func (dExt *XTDriver) AIQuery(text string, opts ...option.ActionOption) (*AIExecutionResult, error) {
if dExt.LLMService == nil {
return nil, errors.New("LLM service is not initialized")
}
@@ -366,41 +404,77 @@ func (dExt *XTDriver) AIQuery(text string, opts ...option.ActionOption) (*QueryE
return nil, errors.Wrap(err, "AI query failed")
}
// Create QueryExecutionResult with all timing and metadata
queryExecResult := &QueryExecutionResult{
QueryResult: *result, // inherit from ai.QueryResult
// Create AIExecutionResult with all timing and metadata
aiResult := &AIExecutionResult{
Type: "query",
ModelCallElapsed: modelCallElapsed, // model call timing
ScreenshotElapsed: screenshotElapsed, // screenshot timing
ImagePath: screenResult.ImagePath, // screenshot path
Resolution: &screenResult.Resolution, // screen resolution
QueryResult: result, // query-specific result
Thought: result.Thought, // AI reasoning
Content: result.Content, // query result content
}
return queryExecResult, nil
return aiResult, nil
}
func (dExt *XTDriver) AIAssert(assertion string, opts ...option.ActionOption) error {
// AIAssert performs AI-driven assertion and returns detailed execution result
func (dExt *XTDriver) AIAssert(assertion string, opts ...option.ActionOption) (*AIExecutionResult, error) {
if dExt.LLMService == nil {
return errors.New("LLM service is not initialized")
return nil, errors.New("LLM service is not initialized")
}
// Step 1: Take screenshot and measure time
screenshotStartTime := time.Now()
screenResult, err := dExt.createScreenshotWithSession(
option.WithScreenShotFileName(builtin.GenNameWithTimestamp("%d_screenshot")),
)
screenshotElapsed := time.Since(screenshotStartTime).Milliseconds()
if err != nil {
return nil, err
}
screenShotBase64, size, err := dExt.GetScreenshotBase64WithSize()
if err != nil {
return err
return &AIExecutionResult{
Type: "assert",
ScreenshotElapsed: screenshotElapsed,
ImagePath: screenResult.ImagePath,
Resolution: &screenResult.Resolution,
Error: err.Error(),
}, err
}
// execute assertion
// Step 2: Call model and measure time
modelCallStartTime := time.Now()
assertOpts := &ai.AssertOptions{
Assertion: assertion,
Screenshot: screenShotBase64,
Size: size,
}
result, err := dExt.LLMService.Assert(context.Background(), assertOpts)
modelCallElapsed := time.Since(modelCallStartTime).Milliseconds()
// Fill in the fields that are known regardless of the model outcome.
aiResult := &AIExecutionResult{
	Type:              "assert",
	ModelCallElapsed:  modelCallElapsed,
	ScreenshotElapsed: screenshotElapsed,
	ImagePath:         screenResult.ImagePath,
	Resolution:        &screenResult.Resolution,
}
// result may be nil when Assert fails, so copy from it only when present;
// the err check that follows still reports the failure to the caller.
if result != nil {
	aiResult.AssertionResult = result
	aiResult.Thought = result.Thought
}
if err != nil {
return errors.Wrap(err, "AI assertion failed")
aiResult.Error = err.Error()
return aiResult, errors.Wrap(err, "AI assertion failed")
}
if !result.Pass {
return errors.New(result.Thought)
aiResult.Error = result.Thought
return aiResult, errors.New(result.Thought)
}
return nil
aiResult.Content = "Assertion passed"
return aiResult, nil
}

View File

@@ -15,10 +15,10 @@ import (
func TestDriverExt_TapByLLM(t *testing.T) {
driver := setupDriverExt(t)
err := driver.AIAction(context.Background(), "点击第一个帖子的作者头像")
_, err := driver.AIAction(context.Background(), "点击第一个帖子的作者头像")
assert.Nil(t, err)
err = driver.AIAssert("当前在个人介绍页")
_, err = driver.AIAssert("当前在个人介绍页")
assert.Nil(t, err)
}

View File

@@ -195,7 +195,7 @@ func (dExt *XTDriver) DoValidation(check, assert, expected string, message ...st
case option.SelectorOCR:
err = dExt.assertOCR(expected, assert)
case option.SelectorAI:
err = dExt.AIAssert(expected)
_, err = dExt.AIAssert(expected)
case option.SelectorForegroundApp:
err = dExt.assertForegroundApp(expected, assert)
case option.SelectorSelector:

View File

@@ -102,7 +102,7 @@ func (t *ToolAIAction) Implement() server.ToolHandlerFunc {
}
// AI action logic
err = driverExt.AIAction(ctx, unifiedReq.Prompt)
_, err = driverExt.AIAction(ctx, unifiedReq.Prompt)
if err != nil {
return NewMCPErrorResponse(fmt.Sprintf("AI action failed: %s", err.Error())), nil
}