diff --git a/uixt/driver_ext_ai.go b/uixt/driver_ext_ai.go index fb4e7456..3355d675 100644 --- a/uixt/driver_ext_ai.go +++ b/uixt/driver_ext_ai.go @@ -10,7 +10,6 @@ import ( "github.com/rs/zerolog/log" "github.com/httprunner/httprunner/v5/code" - "github.com/httprunner/httprunner/v5/internal/builtin" "github.com/httprunner/httprunner/v5/internal/json" "github.com/httprunner/httprunner/v5/uixt/ai" "github.com/httprunner/httprunner/v5/uixt/option" @@ -143,12 +142,11 @@ func (dExt *XTDriver) StartToGoal(ctx context.Context, prompt string, opts ...op func (dExt *XTDriver) AIAction(ctx context.Context, prompt string, opts ...option.ActionOption) (*AIExecutionResult, error) { log.Info().Str("prompt", prompt).Msg("performing AI action") - // Step 1: Take screenshot and measure time - screenshotStartTime := time.Now() - screenResult, err := dExt.createScreenshotWithSession( - option.WithScreenShotFileName(builtin.GenNameWithTimestamp("%d_screenshot")), + // Step 1: Take screenshot and convert to base64 + screenResult, err := dExt.GetScreenResult( + option.WithScreenShotFileName("ai_action"), + option.WithScreenShotBase64(true), ) - screenshotElapsed := time.Since(screenshotStartTime).Milliseconds() if err != nil { return nil, err } @@ -160,7 +158,7 @@ func (dExt *XTDriver) AIAction(ctx context.Context, prompt string, opts ...optio aiExecutionResult := &AIExecutionResult{ Type: "action", ModelCallElapsed: modelCallElapsed, - ScreenshotElapsed: screenshotElapsed, + ScreenshotElapsed: screenResult.Elapsed, ImagePath: screenResult.ImagePath, Resolution: &screenResult.Resolution, PlanningResult: &planningResult.PlanningResult, @@ -193,13 +191,11 @@ func (dExt *XTDriver) PlanNextAction(ctx context.Context, prompt string, opts .. options := option.NewActionOptions(opts...) resetHistory := options.ResetHistory - // Step 1: Take screenshot - screenshotStartTime := time.Now() - // Use GetScreenResult to handle screenshot capture, save, and session tracking - screenResult, err := dExt.createScreenshotWithSession( - option.WithScreenShotFileName(builtin.GenNameWithTimestamp("%d_screenshot")), + // Step 1: Take screenshot and convert to base64 + screenResult, err := dExt.GetScreenResult( + option.WithScreenShotFileName("ai_planning"), + option.WithScreenShotBase64(true), ) - screenshotElapsed := time.Since(screenshotStartTime).Milliseconds() if err != nil { return nil, err } @@ -208,12 +204,6 @@ func (dExt *XTDriver) PlanNextAction(ctx context.Context, prompt string, opts .. // The planning screenshot is already stored in planningResult.ScreenResult dExt.GetSession().GetData(true) // reset session data to exclude planning screenshot from sub-actions - // get screen shot buffer base64 and size - screenShotBase64, size, err := dExt.GetScreenshotBase64WithSize() - if err != nil { - return nil, errors.Wrap(code.DeviceGetInfoError, err.Error()) - } - // Step 2: Call model modelCallStartTime := time.Now() planningOpts := &ai.PlanningOptions{ @@ -224,12 +214,12 @@ func (dExt *XTDriver) PlanNextAction(ctx context.Context, prompt string, opts .. { Type: schema.ChatMessagePartTypeImageURL, ImageURL: &schema.ChatMessageImageURL{ - URL: screenShotBase64, + URL: screenResult.Base64, }, }, }, }, - Size: size, + Size: screenResult.Resolution, ResetHistory: resetHistory, } @@ -250,7 +240,7 @@ func (dExt *XTDriver) PlanNextAction(ctx context.Context, prompt string, opts .. planningResult := &PlanningExecutionResult{ PlanningResult: *result, // Inherit all fields from ai.PlanningResult // Planning process timing and metadata - ScreenshotElapsed: screenshotElapsed, + ScreenshotElapsed: screenResult.Elapsed, ImagePath: screenResult.ImagePath, Resolution: &screenResult.Resolution, ScreenResult: screenResult, @@ -374,17 +364,11 @@ func (dExt *XTDriver) AIQuery(text string, opts ...option.ActionOption) (*AIExec return nil, errors.New("LLM service is not initialized") } - // Step 1: Take screenshot and measure time - screenshotStartTime := time.Now() - screenResult, err := dExt.createScreenshotWithSession( - option.WithScreenShotFileName(builtin.GenNameWithTimestamp("%d_screenshot")), + // Step 1: Take screenshot and convert to base64 + screenResult, err := dExt.GetScreenResult( + option.WithScreenShotFileName("ai_query"), + option.WithScreenShotBase64(true), ) - screenshotElapsed := time.Since(screenshotStartTime).Milliseconds() - if err != nil { - return nil, err - } - - screenShotBase64, size, err := dExt.GetScreenshotBase64WithSize() if err != nil { return nil, err } @@ -398,8 +382,8 @@ func (dExt *XTDriver) AIQuery(text string, opts ...option.ActionOption) (*AIExec // execute query queryOpts := &ai.QueryOptions{ Query: text, - Screenshot: screenShotBase64, - Size: size, + Screenshot: screenResult.Base64, + Size: screenResult.Resolution, OutputSchema: actionOptions.OutputSchema, } result, err := dExt.LLMService.Query(context.Background(), queryOpts) @@ -412,7 +396,7 @@ func (dExt *XTDriver) AIQuery(text string, opts ...option.ActionOption) (*AIExec aiResult := &AIExecutionResult{ Type: "query", ModelCallElapsed: modelCallElapsed, // model call timing - ScreenshotElapsed: screenshotElapsed, // screenshot timing + ScreenshotElapsed: screenResult.Elapsed, // screenshot timing ImagePath: screenResult.ImagePath, // screenshot path Resolution: &screenResult.Resolution, // screen resolution QueryResult: result, // query-specific result @@ -426,35 +410,28 @@ func (dExt *XTDriver) AIAssert(assertion string, opts ...option.ActionOption) (* return nil, errors.New("LLM service is not initialized") } - // Step 1: Take screenshot and measure time - screenshotStartTime := time.Now() - screenResult, err := dExt.createScreenshotWithSession( - option.WithScreenShotFileName(builtin.GenNameWithTimestamp("%d_screenshot")), + // Step 1: Take screenshot and convert to base64 + screenResult, err := dExt.GetScreenResult( + option.WithScreenShotFileName("ai_assert"), + option.WithScreenShotBase64(true), ) - screenshotElapsed := time.Since(screenshotStartTime).Milliseconds() if err != nil { return nil, err } assertResult := &AIExecutionResult{ Type: "assert", - ScreenshotElapsed: screenshotElapsed, + ScreenshotElapsed: screenResult.Elapsed, ImagePath: screenResult.ImagePath, Resolution: &screenResult.Resolution, } - screenShotBase64, size, err := dExt.GetScreenshotBase64WithSize() - if err != nil { - assertResult.Error = err.Error() - return assertResult, err - } - // Step 2: Call model and measure time modelCallStartTime := time.Now() assertOpts := &ai.AssertOptions{ Assertion: assertion, - Screenshot: screenShotBase64, - Size: size, + Screenshot: screenResult.Base64, + Size: screenResult.Resolution, } result, err := dExt.LLMService.Assert(context.Background(), assertOpts) assertResult.ModelCallElapsed = time.Since(modelCallStartTime).Milliseconds() diff --git a/uixt/driver_ext_screenshot.go b/uixt/driver_ext_screenshot.go index 196fa576..2d6451b7 100644 --- a/uixt/driver_ext_screenshot.go +++ b/uixt/driver_ext_screenshot.go @@ -37,6 +37,8 @@ type ScreenResult struct { Icons ai.UIResultMap `json:"icons"` // CV 识别的图标 Tags []string `json:"tags"` // tags for image, e.g. ["feed", "ad", "live"] Popup *PopupInfo `json:"popup,omitempty"` + Elapsed int64 `json:"elapsed_ms,omitempty"` // screenshot elapsed time in milliseconds + Base64 string `json:"-"` // base64 encoded screenshot } func (s *ScreenResult) FilterTextsByScope(x1, y1, x2, y2 float64) ai.OCRTexts { @@ -50,26 +52,11 @@ func (s *ScreenResult) FilterTextsByScope(x1, y1, x2, y2 float64) ai.OCRTexts { }) } -// GetScreenshotBase64WithSize takes a screenshot, returns the compressed image buffer in base64 format and screen size -// Also saves the screenshot to session for report display -func (dExt *XTDriver) GetScreenshotBase64WithSize() (compressedBufBase64 string, size types.Size, err error) { - // Create screenshot with session saving, minimal CV processing for AI operations - screenResult, err := dExt.createScreenshotWithSession( - option.WithScreenShotFileName("screenshot_base64"), - ) - if err != nil { - return "", types.Size{}, err - } +// GetScreenResult takes a screenshot and returns the ScreenResult with metadata +func (dExt *XTDriver) GetScreenResult(opts ...option.ActionOption) (screenResult *ScreenResult, err error) { + // Take screenshot and measure time + screenshotStartTime := time.Now() - // convert buffer to base64 string - screenShotBase64 := "data:image/jpeg;base64," + - base64.StdEncoding.EncodeToString(screenResult.bufSource.Bytes()) - - return screenShotBase64, screenResult.Resolution, nil -} - -// createScreenshotWithSession creates a screenshot with optional OCR processing and saves to session -func (dExt *XTDriver) createScreenshotWithSession(opts ...option.ActionOption) (screenResult *ScreenResult, err error) { // get compressed screenshot buffer compressBufSource, err := getScreenShotBuffer(dExt.IDriver) if err != nil { @@ -147,6 +134,13 @@ func (dExt *XTDriver) createScreenshotWithSession(opts ...option.ActionOption) ( session := dExt.GetSession() session.screenResults = append(session.screenResults, screenResult) + // Convert screenshot buffer to base64 string + if screenshotOptions.ScreenShotWithBase64 { + screenResult.Base64 = "data:image/jpeg;base64," + + base64.StdEncoding.EncodeToString(screenResult.bufSource.Bytes()) + } + + screenResult.Elapsed = time.Since(screenshotStartTime).Milliseconds() logger.Msg("log screenshot") return screenResult, nil } @@ -162,13 +156,7 @@ func needsCVProcessing(options *option.ActionOptions) bool { options.ScreenShotWithOCRCluster != "" } -// GetScreenResult takes a screenshot, returns the image recognition result -func (dExt *XTDriver) GetScreenResult(opts ...option.ActionOption) (screenResult *ScreenResult, err error) { - // Enable OCR processing for GetScreenResult - opts = append(opts, option.WithScreenShotOCR(true)) - return dExt.createScreenshotWithSession(opts...) -} - +// GetScreenTexts takes a screenshot, returns the OCR recognition result func (dExt *XTDriver) GetScreenTexts(opts ...option.ActionOption) (ocrTexts ai.OCRTexts, err error) { options := option.NewActionOptions(opts...) if options.ScreenShotFileName == "" { diff --git a/uixt/mcp_tools_screen.go b/uixt/mcp_tools_screen.go index 99d94565..9ded9d9a 100644 --- a/uixt/mcp_tools_screen.go +++ b/uixt/mcp_tools_screen.go @@ -4,10 +4,11 @@ import ( "context" "fmt" - "github.com/httprunner/httprunner/v5/uixt/option" "github.com/mark3labs/mcp-go/mcp" "github.com/mark3labs/mcp-go/server" "github.com/rs/zerolog/log" + + "github.com/httprunner/httprunner/v5/uixt/option" ) // ToolScreenShot implements the screenshot tool call. @@ -34,14 +35,17 @@ func (t *ToolScreenShot) Implement() server.ToolHandlerFunc { if err != nil { return nil, err } - bufferBase64, _, err := driverExt.GetScreenshotBase64WithSize() + screenResult, err := driverExt.GetScreenResult( + option.WithScreenShotFileName("tool_screenshot"), + option.WithScreenShotBase64(true), + ) if err != nil { log.Error().Err(err).Msg("ScreenShot failed") return mcp.NewToolResultError(fmt.Sprintf("Failed to take screenshot: %v", err)), nil } - log.Debug().Int("imageBytes", len(bufferBase64)).Msg("take screenshot success") + log.Debug().Int("imageBytes", len(screenResult.Base64)).Msg("take screenshot success") - return mcp.NewToolResultImage("screenshot", bufferBase64, "image/jpeg"), nil + return mcp.NewToolResultImage("screenshot", screenResult.Base64, "image/jpeg"), nil } } diff --git a/uixt/option/screen.go b/uixt/option/screen.go index 6b90e951..07d13f93 100644 --- a/uixt/option/screen.go +++ b/uixt/option/screen.go @@ -16,6 +16,7 @@ type ScreenShotOptions struct { ScreenShotWithUpload bool `json:"screenshot_with_upload,omitempty" yaml:"screenshot_with_upload,omitempty"` ScreenShotWithLiveType bool `json:"screenshot_with_live_type,omitempty" yaml:"screenshot_with_live_type,omitempty"` ScreenShotWithLivePopularity bool `json:"screenshot_with_live_popularity,omitempty" yaml:"screenshot_with_live_popularity,omitempty"` + ScreenShotWithBase64 bool `json:"screenshot_with_base64,omitempty" yaml:"screenshot_with_base64,omitempty"` ScreenShotWithUITypes []string `json:"screenshot_with_ui_types,omitempty" yaml:"screenshot_with_ui_types,omitempty"` ScreenShotWithClosePopups bool `json:"screenshot_with_close_popups,omitempty" yaml:"screenshot_with_close_popups,omitempty"` ScreenShotWithOCRCluster string `json:"screenshot_with_ocr_cluster,omitempty" yaml:"screenshot_with_ocr_cluster,omitempty"` @@ -53,6 +54,9 @@ func (o *ScreenShotOptions) GetScreenShotOptions() []ActionOption { if o.ScreenShotFileName != "" { options = append(options, WithScreenShotFileName(o.ScreenShotFileName)) } + if o.ScreenShotWithBase64 { + options = append(options, WithScreenShotBase64(true)) + } return options } @@ -129,6 +133,12 @@ func WithScreenShotFileName(fileName string) ActionOption { } } +func WithScreenShotBase64(base64 bool) ActionOption { + return func(o *ActionOptions) { + o.ScreenShotWithBase64 = base64 + } +} + type ScreenRecordOptions struct { ScreenRecordDuration float64 `json:"screenrecord_duration,omitempty" yaml:"screenrecord_duration,omitempty"` ScreenRecordWithAudio bool `json:"screenrecord_with_audio,omitempty" yaml:"screenrecord_with_audio,omitempty"`