From 5baabee89c2b089dc22b1fb4a7ead0d6fe69ac46 Mon Sep 17 00:00:00 2001 From: "lilong.129" Date: Sun, 29 Jun 2025 15:59:48 +0800 Subject: [PATCH 1/7] fix: update invokeToolCall to accept options and refactor action type handling in MarkUIOperation --- internal/version/VERSION | 2 +- uixt/driver_ext_ai.go | 10 +++++++--- uixt/driver_ext_screenshot.go | 5 +++-- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/internal/version/VERSION b/internal/version/VERSION index 713e7a09..53812863 100644 --- a/internal/version/VERSION +++ b/internal/version/VERSION @@ -1 +1 @@ -v5.0.0-250628 +v5.0.0-250629 diff --git a/uixt/driver_ext_ai.go b/uixt/driver_ext_ai.go index c2d7042e..fb4e7456 100644 --- a/uixt/driver_ext_ai.go +++ b/uixt/driver_ext_ai.go @@ -115,7 +115,7 @@ func (dExt *XTDriver) StartToGoal(ctx context.Context, prompt string, opts ...op }() // Execute the tool call - if err := dExt.invokeToolCall(ctx, toolCall); err != nil { + if err := dExt.invokeToolCall(ctx, toolCall, opts...); err != nil { subActionResult.Error = err return err } @@ -173,7 +173,7 @@ func (dExt *XTDriver) AIAction(ctx context.Context, prompt string, opts ...optio // Step 3: Execute tool calls for _, toolCall := range planningResult.ToolCalls { - err = dExt.invokeToolCall(ctx, toolCall) + err = dExt.invokeToolCall(ctx, toolCall, opts...) if err != nil { aiExecutionResult.Error = err.Error() return aiExecutionResult, errors.Wrap(err, "invoke tool call failed") @@ -286,7 +286,7 @@ func (dExt *XTDriver) isTaskFinished(planningResult *PlanningExecutionResult) bo } // invokeToolCall invokes the tool call -func (dExt *XTDriver) invokeToolCall(ctx context.Context, toolCall schema.ToolCall) error { +func (dExt *XTDriver) invokeToolCall(ctx context.Context, toolCall schema.ToolCall, opts ...option.ActionOption) error { // Parse arguments arguments := make(map[string]interface{}) err := json.Unmarshal([]byte(toolCall.Function.Arguments), &arguments) @@ -294,6 +294,10 @@ func (dExt *XTDriver) invokeToolCall(ctx context.Context, toolCall schema.ToolCa return err } + // Merge StartToGoal options into tool call arguments + // This ensures options like PreMarkOperation are passed to specific tool implementations + extractActionOptionsToArguments(opts, arguments) + // Execute the action req := mcp.CallToolRequest{ Params: struct { diff --git a/uixt/driver_ext_screenshot.go b/uixt/driver_ext_screenshot.go index 002bec2a..22e966da 100644 --- a/uixt/driver_ext_screenshot.go +++ b/uixt/driver_ext_screenshot.go @@ -470,14 +470,15 @@ func MarkUIOperation(driver IDriver, actionType option.ActionName, actionCoordin fmt.Sprintf("action_%s_pre_%s.png", timestamp, actionType), ) - if actionType == option.ACTION_TapAbsXY || actionType == option.ACTION_DoubleTapXY { + switch actionType { + case option.ACTION_TapAbsXY, option.ACTION_DoubleTapXY: if len(actionCoordinates) != 2 { return fmt.Errorf("invalid tap action coordinates: %v", actionCoordinates) } x, y := actionCoordinates[0], actionCoordinates[1] point := image.Point{X: int(x), Y: int(y)} err = SaveImageWithCircleMarker(compressedBufSource, point, imagePath) - } else if actionType == option.ACTION_SwipeDirection || actionType == option.ACTION_SwipeCoordinate || actionType == option.ACTION_Drag { + case option.ACTION_SwipeDirection, option.ACTION_SwipeCoordinate, option.ACTION_Drag: if len(actionCoordinates) != 4 { return fmt.Errorf("invalid swipe action coordinates: %v", actionCoordinates) } From 0ae22930aa607710de2ab06734821a955af0b65c Mon Sep 17 00:00:00 2001 From: "lilong.129" Date: Sun, 29 Jun 2025 18:16:26 +0800 Subject: [PATCH 2/7] feat: add horizontal scrolling screenshot display and improve screenshot handling in report generation --- report.go | 140 +++++++++++++++++++++++++++++++--- uixt/driver_ext_screenshot.go | 3 +- uixt/driver_handler.go | 24 +++--- 3 files changed, 144 insertions(+), 23 deletions(-) diff --git a/report.go b/report.go index 3f304175..c0c13970 100644 --- a/report.go +++ b/report.go @@ -1232,6 +1232,113 @@ const htmlTemplate = ` transform: scale(1.02); } + /* Horizontal scrolling screenshot styles */ + .screenshot-horizontal-scroll { + display: flex; + gap: 0 !important; + overflow-x: auto; + overflow-y: hidden; + padding: 8px; + scroll-behavior: smooth; + -webkit-overflow-scrolling: touch; + background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%); + border: 1px solid #dee2e6; + border-radius: 6px; + align-items: center; + justify-content: center; + line-height: 0; + font-size: 0; + } + + .screenshot-horizontal-scroll::-webkit-scrollbar { + height: 8px; + } + + .screenshot-horizontal-scroll::-webkit-scrollbar-track { + background: #f1f1f1; + border-radius: 4px; + } + + .screenshot-horizontal-scroll::-webkit-scrollbar-thumb { + background: #888; + border-radius: 4px; + } + + .screenshot-horizontal-scroll::-webkit-scrollbar-thumb:hover { + background: #555; + } + + .screenshot-item-horizontal { + flex: 0 0 auto; + min-width: 180px; + max-width: 280px; + text-align: center; + margin: 0 !important; + padding: 0 !important; + border: none !important; + outline: none; + line-height: 0; + } + + .screenshot-item-horizontal .screenshot-image { + padding: 0; + margin: 0; + background: transparent; + border-radius: 0; + display: flex; + justify-content: center; + align-items: center; + position: relative; + overflow: hidden; + height: 250px; + border: none; + } + + .screenshot-item-horizontal .screenshot-image img { + max-width: 100%; + max-height: 100%; + border-radius: 0; + cursor: pointer; + transition: transform 0.2s; + object-fit: contain; + box-shadow: none; + display: block; + margin: 0 !important; + padding: 0 !important; + border: none !important; + vertical-align: top; + float: left; + outline: none; + } + + .screenshot-item-horizontal .screenshot-image img:hover { + transform: scale(1.05); + } + + /* Direct inline screenshot styles */ + .screenshot-inline { + max-height: 250px; + object-fit: contain; + cursor: pointer; + transition: transform 0.2s; + display: inline-block; + margin: 0 4px 0 0 !important; + padding: 0 !important; + border: none !important; + border-radius: 0 !important; + box-shadow: none !important; + vertical-align: top; + outline: none; + } + + .screenshot-inline:last-child { + margin-right: 0 !important; + } + + .screenshot-inline:hover { + transform: scale(1.05); + } + .actions-details { padding: 12px; max-height: 300px; @@ -2542,19 +2649,30 @@ const htmlTemplate = ` 📸 Take Screenshot {{formatDuration $planning.ScreenshotElapsed}} - {{if $planning.ScreenResult}} -
- {{$screenshot := $planning.ScreenResult}} - {{$base64Image := encodeImageBase64 $screenshot.ImagePath}} - {{if $base64Image}} -
-
- Planning Screenshot -
-
+
+ {{if $planning.ScreenResult}} + {{if $planning.ScreenResult.ImagePath}} + {{$base64Image := encodeImageBase64 $planning.ScreenResult.ImagePath}} + {{if $base64Image}} + Planning Screenshot + {{end}} + {{end}} + {{end}} + {{if $planning.SubActions}} + {{range $subAction := $planning.SubActions}} + {{if $subAction.ScreenResults}} + {{range $subScreenshot := $subAction.ScreenResults}} + {{if $subScreenshot.ImagePath}} + {{$base64Image := encodeImageBase64 $subScreenshot.ImagePath}} + {{if $base64Image}} + Sub-action Screenshot + {{end}} + {{end}} + {{end}} + {{end}} + {{end}} {{end}}
- {{end}}
diff --git a/uixt/driver_ext_screenshot.go b/uixt/driver_ext_screenshot.go index 22e966da..196fa576 100644 --- a/uixt/driver_ext_screenshot.go +++ b/uixt/driver_ext_screenshot.go @@ -27,6 +27,7 @@ import ( "github.com/httprunner/httprunner/v5/uixt/types" ) +// ScreenResult represents the result of taking a screenshot, including image path, recognition results, and metadata type ScreenResult struct { bufSource *bytes.Buffer // raw image buffer bytes ImagePath string `json:"image_path"` // image file path @@ -467,7 +468,7 @@ func MarkUIOperation(driver IDriver, actionType option.ActionName, actionCoordin timestamp := builtin.GenNameWithTimestamp("%d") imagePath := filepath.Join( config.GetConfig().ScreenShotsPath(), - fmt.Sprintf("action_%s_pre_%s.png", timestamp, actionType), + fmt.Sprintf("%s_pre_mark_%s.png", timestamp, actionType), ) switch actionType { diff --git a/uixt/driver_handler.go b/uixt/driver_handler.go index 0fc7fe56..5cdf6da2 100644 --- a/uixt/driver_handler.go +++ b/uixt/driver_handler.go @@ -6,16 +6,17 @@ import ( "path/filepath" "time" + "github.com/rs/zerolog/log" + "github.com/httprunner/httprunner/v5/internal/builtin" "github.com/httprunner/httprunner/v5/internal/config" "github.com/httprunner/httprunner/v5/uixt/ai" "github.com/httprunner/httprunner/v5/uixt/option" - "github.com/rs/zerolog/log" ) func preHandler_TapAbsXY(driver IDriver, options *option.ActionOptions, rawX, rawY float64) ( - x, y float64, err error) { - + x, y float64, err error, +) { // Call MCP action tool if anti-risk is enabled if options.AntiRisk { arguments := getAntiRisk_SetTouchInfoList_Arguments(driver, []ai.PointF{ @@ -40,8 +41,8 @@ func preHandler_TapAbsXY(driver IDriver, options *option.ActionOptions, rawX, ra } func preHandler_DoubleTap(driver IDriver, options *option.ActionOptions, rawX, rawY float64) ( - x, y float64, err error) { - + x, y float64, err error, +) { x, y, err = convertToAbsolutePoint(driver, rawX, rawY) if err != nil { return 0, 0, err @@ -60,8 +61,8 @@ func preHandler_DoubleTap(driver IDriver, options *option.ActionOptions, rawX, r } func preHandler_Drag(driver IDriver, options *option.ActionOptions, rawFomX, rawFromY, rawToX, rawToY float64) ( - fromX, fromY, toX, toY float64, err error) { - + fromX, fromY, toX, toY float64, err error, +) { fromX, fromY, toX, toY, err = convertToAbsoluteCoordinates(driver, rawFomX, rawFromY, rawToX, rawToY) if err != nil { return 0, 0, 0, 0, err @@ -92,8 +93,8 @@ func preHandler_Drag(driver IDriver, options *option.ActionOptions, rawFomX, raw func preHandler_Swipe(driver IDriver, actionType option.ActionName, options *option.ActionOptions, rawFomX, rawFromY, rawToX, rawToY float64) ( - fromX, fromY, toX, toY float64, err error) { - + fromX, fromY, toX, toY float64, err error, +) { fromX, fromY, toX, toY, err = convertToAbsoluteCoordinates(driver, rawFomX, rawFromY, rawToX, rawToY) if err != nil { return 0, 0, 0, 0, err @@ -142,7 +143,7 @@ func postHandler(driver IDriver, actionType option.ActionName, options *option.A timestamp := builtin.GenNameWithTimestamp("%d") imagePath := filepath.Join( config.GetConfig().ScreenShotsPath(), - fmt.Sprintf("action_%s_post_%s.png", timestamp, actionType), + fmt.Sprintf("%s_post_mark_%s.png", timestamp, actionType), ) go func() { @@ -157,7 +158,8 @@ func postHandler(driver IDriver, actionType option.ActionName, options *option.A // callMCPActionTool calls MCP tool for the given action func callMCPActionTool(driver IDriver, - serverName, actionType string, arguments map[string]any) { + serverName, actionType string, arguments map[string]any, +) { // Get XTDriver from cache dExt := getXTDriverFromCache(driver) if dExt == nil { From a040b53ea9cd1a4306cd4653dd6b284855dde74f Mon Sep 17 00:00:00 2001 From: "lilong.129" Date: Sun, 29 Jun 2025 20:38:33 +0800 Subject: [PATCH 3/7] refactor: enhance screenshot handling by introducing base64 encoding and updating related methods --- uixt/driver_ext_ai.go | 75 ++++++++++++----------------------- uixt/driver_ext_screenshot.go | 40 +++++++------------ uixt/mcp_tools_screen.go | 12 ++++-- uixt/option/screen.go | 10 +++++ 4 files changed, 58 insertions(+), 79 deletions(-) diff --git a/uixt/driver_ext_ai.go b/uixt/driver_ext_ai.go index fb4e7456..3355d675 100644 --- a/uixt/driver_ext_ai.go +++ b/uixt/driver_ext_ai.go @@ -10,7 +10,6 @@ import ( "github.com/rs/zerolog/log" "github.com/httprunner/httprunner/v5/code" - "github.com/httprunner/httprunner/v5/internal/builtin" "github.com/httprunner/httprunner/v5/internal/json" "github.com/httprunner/httprunner/v5/uixt/ai" "github.com/httprunner/httprunner/v5/uixt/option" @@ -143,12 +142,11 @@ func (dExt *XTDriver) StartToGoal(ctx context.Context, prompt string, opts ...op func (dExt *XTDriver) AIAction(ctx context.Context, prompt string, opts ...option.ActionOption) (*AIExecutionResult, error) { log.Info().Str("prompt", prompt).Msg("performing AI action") - // Step 1: Take screenshot and measure time - screenshotStartTime := time.Now() - screenResult, err := dExt.createScreenshotWithSession( - option.WithScreenShotFileName(builtin.GenNameWithTimestamp("%d_screenshot")), + // Step 1: Take screenshot and convert to base64 + screenResult, err := dExt.GetScreenResult( + option.WithScreenShotFileName("ai_action"), + option.WithScreenShotBase64(true), ) - screenshotElapsed := time.Since(screenshotStartTime).Milliseconds() if err != nil { return nil, err } @@ -160,7 +158,7 @@ func (dExt *XTDriver) AIAction(ctx context.Context, prompt string, opts ...optio aiExecutionResult := &AIExecutionResult{ Type: "action", ModelCallElapsed: modelCallElapsed, - ScreenshotElapsed: screenshotElapsed, + ScreenshotElapsed: screenResult.Elapsed, ImagePath: screenResult.ImagePath, Resolution: &screenResult.Resolution, PlanningResult: &planningResult.PlanningResult, @@ -193,13 +191,11 @@ func (dExt *XTDriver) PlanNextAction(ctx context.Context, prompt string, opts .. options := option.NewActionOptions(opts...) resetHistory := options.ResetHistory - // Step 1: Take screenshot - screenshotStartTime := time.Now() - // Use GetScreenResult to handle screenshot capture, save, and session tracking - screenResult, err := dExt.createScreenshotWithSession( - option.WithScreenShotFileName(builtin.GenNameWithTimestamp("%d_screenshot")), + // Step 1: Take screenshot and convert to base64 + screenResult, err := dExt.GetScreenResult( + option.WithScreenShotFileName("ai_planning"), + option.WithScreenShotBase64(true), ) - screenshotElapsed := time.Since(screenshotStartTime).Milliseconds() if err != nil { return nil, err } @@ -208,12 +204,6 @@ func (dExt *XTDriver) PlanNextAction(ctx context.Context, prompt string, opts .. // The planning screenshot is already stored in planningResult.ScreenResult dExt.GetSession().GetData(true) // reset session data to exclude planning screenshot from sub-actions - // get screen shot buffer base64 and size - screenShotBase64, size, err := dExt.GetScreenshotBase64WithSize() - if err != nil { - return nil, errors.Wrap(code.DeviceGetInfoError, err.Error()) - } - // Step 2: Call model modelCallStartTime := time.Now() planningOpts := &ai.PlanningOptions{ @@ -224,12 +214,12 @@ func (dExt *XTDriver) PlanNextAction(ctx context.Context, prompt string, opts .. { Type: schema.ChatMessagePartTypeImageURL, ImageURL: &schema.ChatMessageImageURL{ - URL: screenShotBase64, + URL: screenResult.Base64, }, }, }, }, - Size: size, + Size: screenResult.Resolution, ResetHistory: resetHistory, } @@ -250,7 +240,7 @@ func (dExt *XTDriver) PlanNextAction(ctx context.Context, prompt string, opts .. planningResult := &PlanningExecutionResult{ PlanningResult: *result, // Inherit all fields from ai.PlanningResult // Planning process timing and metadata - ScreenshotElapsed: screenshotElapsed, + ScreenshotElapsed: screenResult.Elapsed, ImagePath: screenResult.ImagePath, Resolution: &screenResult.Resolution, ScreenResult: screenResult, @@ -374,17 +364,11 @@ func (dExt *XTDriver) AIQuery(text string, opts ...option.ActionOption) (*AIExec return nil, errors.New("LLM service is not initialized") } - // Step 1: Take screenshot and measure time - screenshotStartTime := time.Now() - screenResult, err := dExt.createScreenshotWithSession( - option.WithScreenShotFileName(builtin.GenNameWithTimestamp("%d_screenshot")), + // Step 1: Take screenshot and convert to base64 + screenResult, err := dExt.GetScreenResult( + option.WithScreenShotFileName("ai_query"), + option.WithScreenShotBase64(true), ) - screenshotElapsed := time.Since(screenshotStartTime).Milliseconds() - if err != nil { - return nil, err - } - - screenShotBase64, size, err := dExt.GetScreenshotBase64WithSize() if err != nil { return nil, err } @@ -398,8 +382,8 @@ func (dExt *XTDriver) AIQuery(text string, opts ...option.ActionOption) (*AIExec // execute query queryOpts := &ai.QueryOptions{ Query: text, - Screenshot: screenShotBase64, - Size: size, + Screenshot: screenResult.Base64, + Size: screenResult.Resolution, OutputSchema: actionOptions.OutputSchema, } result, err := dExt.LLMService.Query(context.Background(), queryOpts) @@ -412,7 +396,7 @@ func (dExt *XTDriver) AIQuery(text string, opts ...option.ActionOption) (*AIExec aiResult := &AIExecutionResult{ Type: "query", ModelCallElapsed: modelCallElapsed, // model call timing - ScreenshotElapsed: screenshotElapsed, // screenshot timing + ScreenshotElapsed: screenResult.Elapsed, // screenshot timing ImagePath: screenResult.ImagePath, // screenshot path Resolution: &screenResult.Resolution, // screen resolution QueryResult: result, // query-specific result @@ -426,35 +410,28 @@ func (dExt *XTDriver) AIAssert(assertion string, opts ...option.ActionOption) (* return nil, errors.New("LLM service is not initialized") } - // Step 1: Take screenshot and measure time - screenshotStartTime := time.Now() - screenResult, err := dExt.createScreenshotWithSession( - option.WithScreenShotFileName(builtin.GenNameWithTimestamp("%d_screenshot")), + // Step 1: Take screenshot and convert to base64 + screenResult, err := dExt.GetScreenResult( + option.WithScreenShotFileName("ai_assert"), + option.WithScreenShotBase64(true), ) - screenshotElapsed := time.Since(screenshotStartTime).Milliseconds() if err != nil { return nil, err } assertResult := &AIExecutionResult{ Type: "assert", - ScreenshotElapsed: screenshotElapsed, + ScreenshotElapsed: screenResult.Elapsed, ImagePath: screenResult.ImagePath, Resolution: &screenResult.Resolution, } - screenShotBase64, size, err := dExt.GetScreenshotBase64WithSize() - if err != nil { - assertResult.Error = err.Error() - return assertResult, err - } - // Step 2: Call model and measure time modelCallStartTime := time.Now() assertOpts := &ai.AssertOptions{ Assertion: assertion, - Screenshot: screenShotBase64, - Size: size, + Screenshot: screenResult.Base64, + Size: screenResult.Resolution, } result, err := dExt.LLMService.Assert(context.Background(), assertOpts) assertResult.ModelCallElapsed = time.Since(modelCallStartTime).Milliseconds() diff --git a/uixt/driver_ext_screenshot.go b/uixt/driver_ext_screenshot.go index 196fa576..2d6451b7 100644 --- a/uixt/driver_ext_screenshot.go +++ b/uixt/driver_ext_screenshot.go @@ -37,6 +37,8 @@ type ScreenResult struct { Icons ai.UIResultMap `json:"icons"` // CV 识别的图标 Tags []string `json:"tags"` // tags for image, e.g. ["feed", "ad", "live"] Popup *PopupInfo `json:"popup,omitempty"` + Elapsed int64 `json:"elapsed_ms,omitempty"` // screenshot elapsed time in milliseconds + Base64 string `json:"-"` // base64 encoded screenshot } func (s *ScreenResult) FilterTextsByScope(x1, y1, x2, y2 float64) ai.OCRTexts { @@ -50,26 +52,11 @@ func (s *ScreenResult) FilterTextsByScope(x1, y1, x2, y2 float64) ai.OCRTexts { }) } -// GetScreenshotBase64WithSize takes a screenshot, returns the compressed image buffer in base64 format and screen size -// Also saves the screenshot to session for report display -func (dExt *XTDriver) GetScreenshotBase64WithSize() (compressedBufBase64 string, size types.Size, err error) { - // Create screenshot with session saving, minimal CV processing for AI operations - screenResult, err := dExt.createScreenshotWithSession( - option.WithScreenShotFileName("screenshot_base64"), - ) - if err != nil { - return "", types.Size{}, err - } +// GetScreenResult takes a screenshot and returns the ScreenResult with metadata +func (dExt *XTDriver) GetScreenResult(opts ...option.ActionOption) (screenResult *ScreenResult, err error) { + // Take screenshot and measure time + screenshotStartTime := time.Now() - // convert buffer to base64 string - screenShotBase64 := "data:image/jpeg;base64," + - base64.StdEncoding.EncodeToString(screenResult.bufSource.Bytes()) - - return screenShotBase64, screenResult.Resolution, nil -} - -// createScreenshotWithSession creates a screenshot with optional OCR processing and saves to session -func (dExt *XTDriver) createScreenshotWithSession(opts ...option.ActionOption) (screenResult *ScreenResult, err error) { // get compressed screenshot buffer compressBufSource, err := getScreenShotBuffer(dExt.IDriver) if err != nil { @@ -147,6 +134,13 @@ func (dExt *XTDriver) createScreenshotWithSession(opts ...option.ActionOption) ( session := dExt.GetSession() session.screenResults = append(session.screenResults, screenResult) + // Convert screenshot buffer to base64 string + if screenshotOptions.ScreenShotWithBase64 { + screenResult.Base64 = "data:image/jpeg;base64," + + base64.StdEncoding.EncodeToString(screenResult.bufSource.Bytes()) + } + + screenResult.Elapsed = time.Since(screenshotStartTime).Milliseconds() logger.Msg("log screenshot") return screenResult, nil } @@ -162,13 +156,7 @@ func needsCVProcessing(options *option.ActionOptions) bool { options.ScreenShotWithOCRCluster != "" } -// GetScreenResult takes a screenshot, returns the image recognition result -func (dExt *XTDriver) GetScreenResult(opts ...option.ActionOption) (screenResult *ScreenResult, err error) { - // Enable OCR processing for GetScreenResult - opts = append(opts, option.WithScreenShotOCR(true)) - return dExt.createScreenshotWithSession(opts...) -} - +// GetScreenTexts takes a screenshot, returns the OCR recognition result func (dExt *XTDriver) GetScreenTexts(opts ...option.ActionOption) (ocrTexts ai.OCRTexts, err error) { options := option.NewActionOptions(opts...) if options.ScreenShotFileName == "" { diff --git a/uixt/mcp_tools_screen.go b/uixt/mcp_tools_screen.go index 99d94565..9ded9d9a 100644 --- a/uixt/mcp_tools_screen.go +++ b/uixt/mcp_tools_screen.go @@ -4,10 +4,11 @@ import ( "context" "fmt" - "github.com/httprunner/httprunner/v5/uixt/option" "github.com/mark3labs/mcp-go/mcp" "github.com/mark3labs/mcp-go/server" "github.com/rs/zerolog/log" + + "github.com/httprunner/httprunner/v5/uixt/option" ) // ToolScreenShot implements the screenshot tool call. @@ -34,14 +35,17 @@ func (t *ToolScreenShot) Implement() server.ToolHandlerFunc { if err != nil { return nil, err } - bufferBase64, _, err := driverExt.GetScreenshotBase64WithSize() + screenResult, err := driverExt.GetScreenResult( + option.WithScreenShotFileName("tool_screenshot"), + option.WithScreenShotBase64(true), + ) if err != nil { log.Error().Err(err).Msg("ScreenShot failed") return mcp.NewToolResultError(fmt.Sprintf("Failed to take screenshot: %v", err)), nil } - log.Debug().Int("imageBytes", len(bufferBase64)).Msg("take screenshot success") + log.Debug().Int("imageBytes", len(screenResult.Base64)).Msg("take screenshot success") - return mcp.NewToolResultImage("screenshot", bufferBase64, "image/jpeg"), nil + return mcp.NewToolResultImage("screenshot", screenResult.Base64, "image/jpeg"), nil } } diff --git a/uixt/option/screen.go b/uixt/option/screen.go index 6b90e951..07d13f93 100644 --- a/uixt/option/screen.go +++ b/uixt/option/screen.go @@ -16,6 +16,7 @@ type ScreenShotOptions struct { ScreenShotWithUpload bool `json:"screenshot_with_upload,omitempty" yaml:"screenshot_with_upload,omitempty"` ScreenShotWithLiveType bool `json:"screenshot_with_live_type,omitempty" yaml:"screenshot_with_live_type,omitempty"` ScreenShotWithLivePopularity bool `json:"screenshot_with_live_popularity,omitempty" yaml:"screenshot_with_live_popularity,omitempty"` + ScreenShotWithBase64 bool `json:"screenshot_with_base64,omitempty" yaml:"screenshot_with_base64,omitempty"` ScreenShotWithUITypes []string `json:"screenshot_with_ui_types,omitempty" yaml:"screenshot_with_ui_types,omitempty"` ScreenShotWithClosePopups bool `json:"screenshot_with_close_popups,omitempty" yaml:"screenshot_with_close_popups,omitempty"` ScreenShotWithOCRCluster string `json:"screenshot_with_ocr_cluster,omitempty" yaml:"screenshot_with_ocr_cluster,omitempty"` @@ -53,6 +54,9 @@ func (o *ScreenShotOptions) GetScreenShotOptions() []ActionOption { if o.ScreenShotFileName != "" { options = append(options, WithScreenShotFileName(o.ScreenShotFileName)) } + if o.ScreenShotWithBase64 { + options = append(options, WithScreenShotBase64(true)) + } return options } @@ -129,6 +133,12 @@ func WithScreenShotFileName(fileName string) ActionOption { } } +func WithScreenShotBase64(base64 bool) ActionOption { + return func(o *ActionOptions) { + o.ScreenShotWithBase64 = base64 + } +} + type ScreenRecordOptions struct { ScreenRecordDuration float64 `json:"screenrecord_duration,omitempty" yaml:"screenrecord_duration,omitempty"` ScreenRecordWithAudio bool `json:"screenrecord_with_audio,omitempty" yaml:"screenrecord_with_audio,omitempty"` From fcc6d266d4b86e60c855a946bb75fcd30cecd947 Mon Sep 17 00:00:00 2001 From: "lilong.129" Date: Sun, 29 Jun 2025 21:26:44 +0800 Subject: [PATCH 4/7] style: update screenshot dimensions in HTML template for improved layout --- report.go | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/report.go b/report.go index c0c13970..26e78bab 100644 --- a/report.go +++ b/report.go @@ -1212,7 +1212,7 @@ const htmlTemplate = ` .screenshot-item-compact .screenshot-image img { width: 100%; height: auto; - max-height: 400px; + max-height: 500px; border-radius: 4px; cursor: pointer; transition: transform 0.2s; @@ -1290,7 +1290,7 @@ const htmlTemplate = ` align-items: center; position: relative; overflow: hidden; - height: 250px; + height: 350px; border: none; } @@ -1317,7 +1317,7 @@ const htmlTemplate = ` /* Direct inline screenshot styles */ .screenshot-inline { - max-height: 250px; + max-height: 350px; object-fit: contain; cursor: pointer; transition: transform 0.2s; @@ -1645,12 +1645,12 @@ const htmlTemplate = ` } .screenshots-horizontal .screenshot-image { - min-height: 200px; + min-height: 300px; padding: 10px 0; } .screenshots-horizontal .screenshot-image img { - max-height: 250px; + max-height: 400px; width: auto; } @@ -1699,7 +1699,7 @@ const htmlTemplate = ` display: flex; justify-content: center; align-items: center; - min-height: 300px; + min-height: 400px; padding: 20px 0; background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%); border-radius: 8px; @@ -1708,7 +1708,7 @@ const htmlTemplate = ` .screenshot-image img { max-width: 100%; - max-height: 400px; + max-height: 600px; border-radius: 6px; cursor: pointer; transition: transform 0.2s; @@ -1721,12 +1721,12 @@ const htmlTemplate = ` } .screenshot-item.small .screenshot-image { - min-height: 250px; + min-height: 300px; padding: 15px 0; } .screenshot-item.small .screenshot-image img { - max-height: 200px; + max-height: 350px; } .validator-item { @@ -2393,21 +2393,21 @@ const htmlTemplate = ` } .screenshot-image { - min-height: 250px; + min-height: 300px; padding: 15px 0; } .screenshot-image img { - max-height: 250px; + max-height: 400px; } .screenshot-item.small .screenshot-image { - min-height: 200px; + min-height: 250px; padding: 10px 0; } .screenshot-item.small .screenshot-image img { - max-height: 150px; + max-height: 300px; } .log-header { @@ -2646,7 +2646,7 @@ const htmlTemplate = `
- 📸 Take Screenshot + 📸 ScreenShots {{formatDuration $planning.ScreenshotElapsed}}
@@ -2955,13 +2955,13 @@ const htmlTemplate = `
{{end}} - + {{if $step.Attachments}} {{$attachments := $step.Attachments}} {{if eq (printf "%T" $attachments) "map[string]interface {}"}} {{if index $attachments "screen_results"}}
-

Screenshots

+

Attachment ScreenShots

{{range $screenshot := index $attachments "screen_results"}} {{$imagePath := ""}} From 7aa4ad652c910763fbab9ded17699a117033c1fe Mon Sep 17 00:00:00 2001 From: "lilong.129" Date: Sun, 29 Jun 2025 21:42:07 +0800 Subject: [PATCH 5/7] fix: wait 3 seconds for tool calls to complete --- uixt/driver_ext_ai.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/uixt/driver_ext_ai.go b/uixt/driver_ext_ai.go index 3355d675..90fc13fc 100644 --- a/uixt/driver_ext_ai.go +++ b/uixt/driver_ext_ai.go @@ -135,6 +135,9 @@ func (dExt *XTDriver) StartToGoal(ctx context.Context, prompt string, opts ...op if options.MaxRetryTimes > 0 && attempt > options.MaxRetryTimes { return allPlannings, errors.New("reached max retry times") } + + // wait 3 seconds for tool calls to complete + time.Sleep(3 * time.Second) } } From 850bd61dde15d36d10110e4b6b87fedcf5321c7f Mon Sep 17 00:00:00 2001 From: "lilong.129" Date: Sun, 29 Jun 2025 22:41:03 +0800 Subject: [PATCH 6/7] fix: improve error handling in cache.go by replacing fmt.Errorf with errors.Wrap for better context --- uixt/cache.go | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/uixt/cache.go b/uixt/cache.go index 6fee28af..7ca0c6f0 100644 --- a/uixt/cache.go +++ b/uixt/cache.go @@ -6,8 +6,11 @@ import ( "strings" "sync" - "github.com/httprunner/httprunner/v5/uixt/option" + "github.com/pkg/errors" "github.com/rs/zerolog/log" + + "github.com/httprunner/httprunner/v5/code" + "github.com/httprunner/httprunner/v5/uixt/option" ) // CacheManager provides a generic cache management interface @@ -153,7 +156,7 @@ func (cm *CacheManager[T]) GetOrCreate(key string, factory func() (T, map[string item, metadata, err := factory() if err != nil { var zero T - return zero, fmt.Errorf("failed to create item: %w", err) + return zero, err } // Store in cache @@ -271,7 +274,7 @@ func createXTDriverWithConfig(config DriverCacheConfig) (*XTDriver, error) { browserOpts := config.DeviceOpts.ToBrowserOptions().Options() device, err = NewBrowserDevice(browserOpts...) default: - return nil, fmt.Errorf("unsupported platform: %s", platform) + return nil, errors.Wrapf(code.InvalidParamError, "unsupported platform: %s", platform) } } else { // Use default options, let NewXXDevice handle serial (empty or specified) @@ -301,17 +304,17 @@ func createXTDriverWithConfig(config DriverCacheConfig) (*XTDriver, error) { device, err = NewBrowserDevice() } default: - return nil, fmt.Errorf("unsupported platform: %s", platform) + return nil, errors.Wrapf(code.InvalidParamError, "unsupported platform: %s", platform) } } if err != nil { - return nil, fmt.Errorf("failed to create device: %w", err) + return nil, err } // Create driver driver, err := device.NewDriver() if err != nil { - return nil, fmt.Errorf("failed to create driver: %w", err) + return nil, errors.Wrap(err, "failed to create driver") } // Create XTDriver with AI options @@ -326,7 +329,7 @@ func createXTDriverWithConfig(config DriverCacheConfig) (*XTDriver, error) { driverExt, err := NewXTDriver(driver, aiOpts...) if err != nil { - return nil, fmt.Errorf("failed to create XTDriver: %w", err) + return nil, errors.Wrap(err, "failed to create XTDriver") } return driverExt, nil } From ab40a8c63f3c90d6c7b324231f0d9544af311bfb Mon Sep 17 00:00:00 2001 From: "lilong.129" Date: Mon, 30 Jun 2025 00:11:22 +0800 Subject: [PATCH 7/7] refactor: ForegroundInfo --- go.mod | 2 +- internal/version/VERSION | 2 +- uixt/android_driver_adb.go | 55 +++++++++++++++++++++++++++----------- 3 files changed, 41 insertions(+), 18 deletions(-) diff --git a/go.mod b/go.mod index 78c40dbe..ea2db9fe 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ toolchain go1.23.7 require ( github.com/Masterminds/semver v1.5.0 github.com/andybalholm/brotli v1.0.4 + github.com/antchfx/xmlquery v1.4.4 github.com/bytedance/sonic v1.13.2 github.com/charmbracelet/glamour v0.8.0 github.com/charmbracelet/huh v0.3.0 @@ -47,7 +48,6 @@ require ( require ( github.com/alecthomas/chroma/v2 v2.14.0 // indirect - github.com/antchfx/xmlquery v1.4.4 // indirect github.com/antchfx/xpath v1.3.3 // indirect github.com/atotto/clipboard v0.1.4 // indirect github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect diff --git a/internal/version/VERSION b/internal/version/VERSION index 53812863..524b9a16 100644 --- a/internal/version/VERSION +++ b/internal/version/VERSION @@ -1 +1 @@ -v5.0.0-250629 +v5.0.0-250630 diff --git a/uixt/android_driver_adb.go b/uixt/android_driver_adb.go index 65ca88ed..1e539364 100644 --- a/uixt/android_driver_adb.go +++ b/uixt/android_driver_adb.go @@ -622,9 +622,13 @@ func (ad *ADBDriver) tapByTextUsingHierarchy(hierarchy *Hierarchy, text string, func (ud *ADBDriver) TapByXpath(xpath string, opts ...option.ActionOption) (err error) { source, err := ud.Source() + if err != nil { + log.Error().Err(err).Msg("failed to get source") + return err + } doc, err := xmlquery.Parse(strings.NewReader(source)) if err != nil { - log.Error().Err(err).Str("serial", ud.Device.Serial()) + log.Error().Err(err).Msg("failed to parse source") return err } targetNodes := xmlquery.Find(doc, xpath) @@ -644,10 +648,12 @@ func (ud *ADBDriver) TapByXpath(xpath string, opts ...option.ActionOption) (err centerX := float64(x1+x2) / 2 centerY := float64(y1+y2) / 2 - log.Info().Str("serial", ud.Device.Serial()).Str("xpath", xpath).Str("bounds", bounds).Msg("find node by xpath success") + log.Info().Str("xpath", xpath).Str("bounds", bounds).Msg("find node by xpath success") return ud.TapAbsXY(centerX, centerY, opts...) } - return + + log.Error().Str("xpath", xpath).Msg("failed to find node by xpath") + return errors.New("failed to find node by xpath") } func (ad *ADBDriver) searchNodes(nodes []Layout, text string, opts ...option.ActionOption) []Bounds { @@ -756,27 +762,44 @@ func (ad *ADBDriver) GetSession() *DriverSession { } func (ad *ADBDriver) ForegroundInfo() (app types.AppInfo, err error) { - packageInfo, err := ad.runShellCommand("CLASSPATH=/data/local/tmp/evalite", "app_process", "/", "com.bytedance.iesqa.eval_process.PackageService", "2>/dev/null") + // Get foreground app package info using evalite service + packageInfo, err := ad.getForegroundPackageInfo() if err != nil { - packageInfo, err = ad.runShellCommand("CLASSPATH=/data/local/tmp/evalite", "app_process", "/", "com.bytedance.iesqa.eval_process.PackageService", "2>/dev/null") - if err != nil { - log.Error().Err(err).Str("serial", ad.Device.Serial()).Msg("failed to get foreground app") - return app, err - } + log.Error().Err(err).Msg("failed to get foreground app info") + return app, err } - log.Info().Str("serial", ad.Device.Serial()).Msg("foreground app output: " + packageInfo) - if strings.TrimSpace(packageInfo) == "" { - log.Error().Str("serial", ad.Device.Serial()).Msg("foreground app output is empty") - return app, errors.New("foreground app output is empty") + + // Parse package info JSON + packageInfo = strings.TrimSpace(packageInfo) + if packageInfo == "" { + err = errors.New("foreground app output is empty") + log.Error().Err(err).Msg("get foreground app info failed") + return app, err } - err = json.Unmarshal([]byte(strings.TrimSpace(packageInfo)), &app) - if err != nil { - log.Error().Err(err).Str("serial", ad.Device.Serial()).Str("packageInfo", packageInfo).Msg("failed to parse package info") + if err = json.Unmarshal([]byte(packageInfo), &app); err != nil { + log.Error().Err(err).Str("packageInfo", packageInfo).Msg("failed to parse package info") return app, err } return app, nil } +// getForegroundPackageInfo executes the evalite service command to get foreground app info +func (ad *ADBDriver) getForegroundPackageInfo() (string, error) { + const maxRetries = 2 + var lastErr error + for i := 0; i < maxRetries; i++ { + packageInfo, err := ad.runShellCommand("CLASSPATH=/data/local/tmp/evalite", + "app_process", "/", "com.bytedance.iesqa.eval_process.PackageService", "2>/dev/null") + if err == nil { + return packageInfo, nil + } + lastErr = err + log.Warn().Err(err).Int("attempt", i+1).Msg("failed to get foreground package info, retrying") + } + + return "", lastErr +} + func (ad *ADBDriver) SetIme(imeRegx string) error { log.Info().Str("imeRegx", imeRegx).Msg("ADBDriver.SetIme") imeList := ad.ListIme()