refactor: GetScreenshotBase64WithSize

This commit is contained in:
lilong.129
2025-06-13 12:01:21 +08:00
parent f6e7e970f8
commit 409cd693f0
4 changed files with 26 additions and 37 deletions

View File

@@ -1 +1 @@
v5.0.0-beta-2506131027 v5.0.0-beta-2506131201

View File

@@ -2,7 +2,6 @@ package uixt
import ( import (
"context" "context"
"encoding/base64"
"time" "time"
"github.com/cloudwego/eino/schema" "github.com/cloudwego/eino/schema"
@@ -177,12 +176,8 @@ func (dExt *XTDriver) PlanNextAction(ctx context.Context, prompt string, opts ..
// The planning screenshot is already stored in planningResult.ScreenResult // The planning screenshot is already stored in planningResult.ScreenResult
dExt.GetSession().GetData(true) // reset session data to exclude planning screenshot from sub-actions dExt.GetSession().GetData(true) // reset session data to exclude planning screenshot from sub-actions
// convert buffer to base64 string for LLM // get screen shot buffer base64 and size
screenShotBase64 := "data:image/jpeg;base64," + screenShotBase64, size, err := dExt.GetScreenshotBase64WithSize()
base64.StdEncoding.EncodeToString(screenResult.bufSource.Bytes())
// get window size
size, err := dExt.IDriver.WindowSize()
if err != nil { if err != nil {
return nil, errors.Wrap(code.DeviceGetInfoError, err.Error()) return nil, errors.Wrap(code.DeviceGetInfoError, err.Error())
} }
@@ -326,17 +321,11 @@ func (dExt *XTDriver) AIQuery(text string, opts ...option.ActionOption) (string,
return "", errors.New("LLM service is not initialized") return "", errors.New("LLM service is not initialized")
} }
screenShotBase64, err := GetScreenShotBufferBase64(dExt.IDriver) screenShotBase64, size, err := dExt.GetScreenshotBase64WithSize()
if err != nil { if err != nil {
return "", err return "", err
} }
// get window size
size, err := dExt.IDriver.WindowSize()
if err != nil {
return "", errors.Wrap(err, "get window size for AI query failed")
}
// parse action options to extract OutputSchema // parse action options to extract OutputSchema
actionOptions := option.NewActionOptions(opts...) actionOptions := option.NewActionOptions(opts...)
@@ -360,17 +349,11 @@ func (dExt *XTDriver) AIAssert(assertion string, opts ...option.ActionOption) er
return errors.New("LLM service is not initialized") return errors.New("LLM service is not initialized")
} }
screenShotBase64, err := GetScreenShotBufferBase64(dExt.IDriver) screenShotBase64, size, err := dExt.GetScreenshotBase64WithSize()
if err != nil { if err != nil {
return err return err
} }
// get window size
size, err := dExt.IDriver.WindowSize()
if err != nil {
return errors.Wrap(err, "get window size for AI assertion failed")
}
// execute assertion // execute assertion
assertOpts := &ai.AssertOptions{ assertOpts := &ai.AssertOptions{
Assertion: assertion, Assertion: assertion,

View File

@@ -49,6 +49,26 @@ func (s *ScreenResult) FilterTextsByScope(x1, y1, x2, y2 float64) ai.OCRTexts {
}) })
} }
// GetScreenshotBase64WithSize captures a screenshot and returns it together
// with the window size reported by the underlying driver.
//
// The compressed screenshot buffer is base64-encoded with the standard
// encoding and prefixed with "data:image/jpeg;base64,". If taking the
// screenshot fails, that error is returned unchanged; if querying the window
// size fails, the error is wrapped with "get window size failed". In both
// failure cases the returned string is empty and the size is the zero value.
func (dExt *XTDriver) GetScreenshotBase64WithSize() (compressedBufBase64 string, size types.Size, err error) {
	buf, err := getScreenShotBuffer(dExt)
	if err != nil {
		return "", types.Size{}, err
	}

	// encode the buffer bytes and prepend the data-URI header
	encoded := base64.StdEncoding.EncodeToString(buf.Bytes())
	dataURI := "data:image/jpeg;base64," + encoded

	// ask the underlying driver for the window size
	windowSize, sizeErr := dExt.IDriver.WindowSize()
	if sizeErr != nil {
		return "", types.Size{}, errors.Wrap(sizeErr, "get window size failed")
	}

	return dataURI, windowSize, nil
}
// GetScreenResult takes a screenshot, returns the image recognition result // GetScreenResult takes a screenshot, returns the image recognition result
func (dExt *XTDriver) GetScreenResult(opts ...option.ActionOption) (screenResult *ScreenResult, err error) { func (dExt *XTDriver) GetScreenResult(opts ...option.ActionOption) (screenResult *ScreenResult, err error) {
// get compressed screenshot buffer // get compressed screenshot buffer
@@ -222,20 +242,6 @@ func getScreenShotBuffer(driver IDriver) (compressedBufSource *bytes.Buffer, err
return compressBufSource, nil return compressBufSource, nil
} }
// GetScreenShotBufferBase64 captures a screenshot through driver and returns
// the compressed image buffer as a base64 string (standard encoding) prefixed
// with "data:image/jpeg;base64,". When the screenshot cannot be taken, the
// error is returned unchanged along with an empty string.
func GetScreenShotBufferBase64(driver IDriver) (compressedBufBase64 string, err error) {
	buf, err := getScreenShotBuffer(driver)
	if err != nil {
		return "", err
	}

	// encode the buffer bytes and prepend the data-URI header
	encoded := base64.StdEncoding.EncodeToString(buf.Bytes())
	return "data:image/jpeg;base64," + encoded, nil
}
// saveScreenShot saves compressed image file with file name // saveScreenShot saves compressed image file with file name
func saveScreenShot(raw *bytes.Buffer, screenshotPath string) error { func saveScreenShot(raw *bytes.Buffer, screenshotPath string) error {
// notice: screenshot data is a stream, so we need to copy it to a new buffer // notice: screenshot data is a stream, so we need to copy it to a new buffer

View File

@@ -34,7 +34,7 @@ func (t *ToolScreenShot) Implement() server.ToolHandlerFunc {
if err != nil { if err != nil {
return nil, err return nil, err
} }
bufferBase64, err := GetScreenShotBufferBase64(driverExt.IDriver) bufferBase64, _, err := driverExt.GetScreenshotBase64WithSize()
if err != nil { if err != nil {
log.Error().Err(err).Msg("ScreenShot failed") log.Error().Err(err).Msg("ScreenShot failed")
return mcp.NewToolResultError(fmt.Sprintf("Failed to take screenshot: %v", err)), nil return mcp.NewToolResultError(fmt.Sprintf("Failed to take screenshot: %v", err)), nil