refactor: enhance screenshot handling by introducing base64 encoding and updating related methods

This commit is contained in:
lilong.129
2025-06-29 20:38:33 +08:00
parent 0ae22930aa
commit a040b53ea9
4 changed files with 58 additions and 79 deletions

View File

@@ -10,7 +10,6 @@ import (
"github.com/rs/zerolog/log"
"github.com/httprunner/httprunner/v5/code"
"github.com/httprunner/httprunner/v5/internal/builtin"
"github.com/httprunner/httprunner/v5/internal/json"
"github.com/httprunner/httprunner/v5/uixt/ai"
"github.com/httprunner/httprunner/v5/uixt/option"
@@ -143,12 +142,11 @@ func (dExt *XTDriver) StartToGoal(ctx context.Context, prompt string, opts ...op
func (dExt *XTDriver) AIAction(ctx context.Context, prompt string, opts ...option.ActionOption) (*AIExecutionResult, error) {
log.Info().Str("prompt", prompt).Msg("performing AI action")
// Step 1: Take screenshot and measure time
screenshotStartTime := time.Now()
screenResult, err := dExt.createScreenshotWithSession(
option.WithScreenShotFileName(builtin.GenNameWithTimestamp("%d_screenshot")),
// Step 1: Take screenshot and convert to base64
screenResult, err := dExt.GetScreenResult(
option.WithScreenShotFileName("ai_action"),
option.WithScreenShotBase64(true),
)
screenshotElapsed := time.Since(screenshotStartTime).Milliseconds()
if err != nil {
return nil, err
}
@@ -160,7 +158,7 @@ func (dExt *XTDriver) AIAction(ctx context.Context, prompt string, opts ...optio
aiExecutionResult := &AIExecutionResult{
Type: "action",
ModelCallElapsed: modelCallElapsed,
ScreenshotElapsed: screenshotElapsed,
ScreenshotElapsed: screenResult.Elapsed,
ImagePath: screenResult.ImagePath,
Resolution: &screenResult.Resolution,
PlanningResult: &planningResult.PlanningResult,
@@ -193,13 +191,11 @@ func (dExt *XTDriver) PlanNextAction(ctx context.Context, prompt string, opts ..
options := option.NewActionOptions(opts...)
resetHistory := options.ResetHistory
// Step 1: Take screenshot
screenshotStartTime := time.Now()
// Use GetScreenResult to handle screenshot capture, save, and session tracking
screenResult, err := dExt.createScreenshotWithSession(
option.WithScreenShotFileName(builtin.GenNameWithTimestamp("%d_screenshot")),
// Step 1: Take screenshot and convert to base64
screenResult, err := dExt.GetScreenResult(
option.WithScreenShotFileName("ai_planning"),
option.WithScreenShotBase64(true),
)
screenshotElapsed := time.Since(screenshotStartTime).Milliseconds()
if err != nil {
return nil, err
}
@@ -208,12 +204,6 @@ func (dExt *XTDriver) PlanNextAction(ctx context.Context, prompt string, opts ..
// The planning screenshot is already stored in planningResult.ScreenResult
dExt.GetSession().GetData(true) // reset session data to exclude planning screenshot from sub-actions
// get screen shot buffer base64 and size
screenShotBase64, size, err := dExt.GetScreenshotBase64WithSize()
if err != nil {
return nil, errors.Wrap(code.DeviceGetInfoError, err.Error())
}
// Step 2: Call model
modelCallStartTime := time.Now()
planningOpts := &ai.PlanningOptions{
@@ -224,12 +214,12 @@ func (dExt *XTDriver) PlanNextAction(ctx context.Context, prompt string, opts ..
{
Type: schema.ChatMessagePartTypeImageURL,
ImageURL: &schema.ChatMessageImageURL{
URL: screenShotBase64,
URL: screenResult.Base64,
},
},
},
},
Size: size,
Size: screenResult.Resolution,
ResetHistory: resetHistory,
}
@@ -250,7 +240,7 @@ func (dExt *XTDriver) PlanNextAction(ctx context.Context, prompt string, opts ..
planningResult := &PlanningExecutionResult{
PlanningResult: *result, // Inherit all fields from ai.PlanningResult
// Planning process timing and metadata
ScreenshotElapsed: screenshotElapsed,
ScreenshotElapsed: screenResult.Elapsed,
ImagePath: screenResult.ImagePath,
Resolution: &screenResult.Resolution,
ScreenResult: screenResult,
@@ -374,17 +364,11 @@ func (dExt *XTDriver) AIQuery(text string, opts ...option.ActionOption) (*AIExec
return nil, errors.New("LLM service is not initialized")
}
// Step 1: Take screenshot and measure time
screenshotStartTime := time.Now()
screenResult, err := dExt.createScreenshotWithSession(
option.WithScreenShotFileName(builtin.GenNameWithTimestamp("%d_screenshot")),
// Step 1: Take screenshot and convert to base64
screenResult, err := dExt.GetScreenResult(
option.WithScreenShotFileName("ai_query"),
option.WithScreenShotBase64(true),
)
screenshotElapsed := time.Since(screenshotStartTime).Milliseconds()
if err != nil {
return nil, err
}
screenShotBase64, size, err := dExt.GetScreenshotBase64WithSize()
if err != nil {
return nil, err
}
@@ -398,8 +382,8 @@ func (dExt *XTDriver) AIQuery(text string, opts ...option.ActionOption) (*AIExec
// execute query
queryOpts := &ai.QueryOptions{
Query: text,
Screenshot: screenShotBase64,
Size: size,
Screenshot: screenResult.Base64,
Size: screenResult.Resolution,
OutputSchema: actionOptions.OutputSchema,
}
result, err := dExt.LLMService.Query(context.Background(), queryOpts)
@@ -412,7 +396,7 @@ func (dExt *XTDriver) AIQuery(text string, opts ...option.ActionOption) (*AIExec
aiResult := &AIExecutionResult{
Type: "query",
ModelCallElapsed: modelCallElapsed, // model call timing
ScreenshotElapsed: screenshotElapsed, // screenshot timing
ScreenshotElapsed: screenResult.Elapsed, // screenshot timing
ImagePath: screenResult.ImagePath, // screenshot path
Resolution: &screenResult.Resolution, // screen resolution
QueryResult: result, // query-specific result
@@ -426,35 +410,28 @@ func (dExt *XTDriver) AIAssert(assertion string, opts ...option.ActionOption) (*
return nil, errors.New("LLM service is not initialized")
}
// Step 1: Take screenshot and measure time
screenshotStartTime := time.Now()
screenResult, err := dExt.createScreenshotWithSession(
option.WithScreenShotFileName(builtin.GenNameWithTimestamp("%d_screenshot")),
// Step 1: Take screenshot and convert to base64
screenResult, err := dExt.GetScreenResult(
option.WithScreenShotFileName("ai_assert"),
option.WithScreenShotBase64(true),
)
screenshotElapsed := time.Since(screenshotStartTime).Milliseconds()
if err != nil {
return nil, err
}
assertResult := &AIExecutionResult{
Type: "assert",
ScreenshotElapsed: screenshotElapsed,
ScreenshotElapsed: screenResult.Elapsed,
ImagePath: screenResult.ImagePath,
Resolution: &screenResult.Resolution,
}
screenShotBase64, size, err := dExt.GetScreenshotBase64WithSize()
if err != nil {
assertResult.Error = err.Error()
return assertResult, err
}
// Step 2: Call model and measure time
modelCallStartTime := time.Now()
assertOpts := &ai.AssertOptions{
Assertion: assertion,
Screenshot: screenShotBase64,
Size: size,
Screenshot: screenResult.Base64,
Size: screenResult.Resolution,
}
result, err := dExt.LLMService.Assert(context.Background(), assertOpts)
assertResult.ModelCallElapsed = time.Since(modelCallStartTime).Milliseconds()

View File

@@ -37,6 +37,8 @@ type ScreenResult struct {
Icons ai.UIResultMap `json:"icons"` // CV 识别的图标
Tags []string `json:"tags"` // tags for image, e.g. ["feed", "ad", "live"]
Popup *PopupInfo `json:"popup,omitempty"`
Elapsed int64 `json:"elapsed_ms,omitempty"` // screenshot elapsed time in milliseconds
Base64 string `json:"-"` // base64 encoded screenshot
}
func (s *ScreenResult) FilterTextsByScope(x1, y1, x2, y2 float64) ai.OCRTexts {
@@ -50,26 +52,11 @@ func (s *ScreenResult) FilterTextsByScope(x1, y1, x2, y2 float64) ai.OCRTexts {
})
}
// GetScreenshotBase64WithSize takes a screenshot, returns the compressed image buffer in base64 format and screen size
// Also saves the screenshot to session for report display
func (dExt *XTDriver) GetScreenshotBase64WithSize() (compressedBufBase64 string, size types.Size, err error) {
// Create screenshot with session saving, minimal CV processing for AI operations
screenResult, err := dExt.createScreenshotWithSession(
option.WithScreenShotFileName("screenshot_base64"),
)
if err != nil {
return "", types.Size{}, err
}
// GetScreenResult takes a screenshot and returns the ScreenResult with metadata
func (dExt *XTDriver) GetScreenResult(opts ...option.ActionOption) (screenResult *ScreenResult, err error) {
// Take screenshot and measure time
screenshotStartTime := time.Now()
// convert buffer to base64 string
screenShotBase64 := "data:image/jpeg;base64," +
base64.StdEncoding.EncodeToString(screenResult.bufSource.Bytes())
return screenShotBase64, screenResult.Resolution, nil
}
// createScreenshotWithSession creates a screenshot with optional OCR processing and saves to session
func (dExt *XTDriver) createScreenshotWithSession(opts ...option.ActionOption) (screenResult *ScreenResult, err error) {
// get compressed screenshot buffer
compressBufSource, err := getScreenShotBuffer(dExt.IDriver)
if err != nil {
@@ -147,6 +134,13 @@ func (dExt *XTDriver) createScreenshotWithSession(opts ...option.ActionOption) (
session := dExt.GetSession()
session.screenResults = append(session.screenResults, screenResult)
// Convert screenshot buffer to base64 string
if screenshotOptions.ScreenShotWithBase64 {
screenResult.Base64 = "data:image/jpeg;base64," +
base64.StdEncoding.EncodeToString(screenResult.bufSource.Bytes())
}
screenResult.Elapsed = time.Since(screenshotStartTime).Milliseconds()
logger.Msg("log screenshot")
return screenResult, nil
}
@@ -162,13 +156,7 @@ func needsCVProcessing(options *option.ActionOptions) bool {
options.ScreenShotWithOCRCluster != ""
}
// GetScreenResult captures a screenshot with OCR switched on and returns the
// image recognition result.
func (dExt *XTDriver) GetScreenResult(opts ...option.ActionOption) (screenResult *ScreenResult, err error) {
	// OCR is always requested here; the option is added after the caller's options.
	ocrOpts := append(opts, option.WithScreenShotOCR(true))
	return dExt.createScreenshotWithSession(ocrOpts...)
}
// GetScreenTexts takes a screenshot, returns the OCR recognition result
func (dExt *XTDriver) GetScreenTexts(opts ...option.ActionOption) (ocrTexts ai.OCRTexts, err error) {
options := option.NewActionOptions(opts...)
if options.ScreenShotFileName == "" {

View File

@@ -4,10 +4,11 @@ import (
"context"
"fmt"
"github.com/httprunner/httprunner/v5/uixt/option"
"github.com/mark3labs/mcp-go/mcp"
"github.com/mark3labs/mcp-go/server"
"github.com/rs/zerolog/log"
"github.com/httprunner/httprunner/v5/uixt/option"
)
// ToolScreenShot implements the screenshot tool call.
@@ -34,14 +35,17 @@ func (t *ToolScreenShot) Implement() server.ToolHandlerFunc {
if err != nil {
return nil, err
}
bufferBase64, _, err := driverExt.GetScreenshotBase64WithSize()
screenResult, err := driverExt.GetScreenResult(
option.WithScreenShotFileName("tool_screenshot"),
option.WithScreenShotBase64(true),
)
if err != nil {
log.Error().Err(err).Msg("ScreenShot failed")
return mcp.NewToolResultError(fmt.Sprintf("Failed to take screenshot: %v", err)), nil
}
log.Debug().Int("imageBytes", len(bufferBase64)).Msg("take screenshot success")
log.Debug().Int("imageBytes", len(screenResult.Base64)).Msg("take screenshot success")
return mcp.NewToolResultImage("screenshot", bufferBase64, "image/jpeg"), nil
return mcp.NewToolResultImage("screenshot", screenResult.Base64, "image/jpeg"), nil
}
}

View File

@@ -16,6 +16,7 @@ type ScreenShotOptions struct {
ScreenShotWithUpload bool `json:"screenshot_with_upload,omitempty" yaml:"screenshot_with_upload,omitempty"`
ScreenShotWithLiveType bool `json:"screenshot_with_live_type,omitempty" yaml:"screenshot_with_live_type,omitempty"`
ScreenShotWithLivePopularity bool `json:"screenshot_with_live_popularity,omitempty" yaml:"screenshot_with_live_popularity,omitempty"`
ScreenShotWithBase64 bool `json:"screenshot_with_base64,omitempty" yaml:"screenshot_with_base64,omitempty"`
ScreenShotWithUITypes []string `json:"screenshot_with_ui_types,omitempty" yaml:"screenshot_with_ui_types,omitempty"`
ScreenShotWithClosePopups bool `json:"screenshot_with_close_popups,omitempty" yaml:"screenshot_with_close_popups,omitempty"`
ScreenShotWithOCRCluster string `json:"screenshot_with_ocr_cluster,omitempty" yaml:"screenshot_with_ocr_cluster,omitempty"`
@@ -53,6 +54,9 @@ func (o *ScreenShotOptions) GetScreenShotOptions() []ActionOption {
if o.ScreenShotFileName != "" {
options = append(options, WithScreenShotFileName(o.ScreenShotFileName))
}
if o.ScreenShotWithBase64 {
options = append(options, WithScreenShotBase64(true))
}
return options
}
@@ -129,6 +133,12 @@ func WithScreenShotFileName(fileName string) ActionOption {
}
}
// WithScreenShotBase64 returns an ActionOption that sets the
// ScreenShotWithBase64 flag on the action options to the given value.
func WithScreenShotBase64(base64 bool) ActionOption {
	apply := func(target *ActionOptions) {
		target.ScreenShotWithBase64 = base64
	}
	return apply
}
type ScreenRecordOptions struct {
ScreenRecordDuration float64 `json:"screenrecord_duration,omitempty" yaml:"screenrecord_duration,omitempty"`
ScreenRecordWithAudio bool `json:"screenrecord_with_audio,omitempty" yaml:"screenrecord_with_audio,omitempty"`