refactor: GetScreenshotBase64WithSize

This commit is contained in:
lilong.129
2025-06-13 12:01:21 +08:00
parent f6e7e970f8
commit 409cd693f0
4 changed files with 26 additions and 37 deletions

View File

@@ -1 +1 @@
v5.0.0-beta-2506131027
v5.0.0-beta-2506131201

View File

@@ -2,7 +2,6 @@ package uixt
import (
"context"
"encoding/base64"
"time"
"github.com/cloudwego/eino/schema"
@@ -177,12 +176,8 @@ func (dExt *XTDriver) PlanNextAction(ctx context.Context, prompt string, opts ..
// The planning screenshot is already stored in planningResult.ScreenResult
dExt.GetSession().GetData(true) // reset session data to exclude planning screenshot from sub-actions
// convert buffer to base64 string for LLM
screenShotBase64 := "data:image/jpeg;base64," +
base64.StdEncoding.EncodeToString(screenResult.bufSource.Bytes())
// get window size
size, err := dExt.IDriver.WindowSize()
// get screen shot buffer base64 and size
screenShotBase64, size, err := dExt.GetScreenshotBase64WithSize()
if err != nil {
return nil, errors.Wrap(code.DeviceGetInfoError, err.Error())
}
@@ -326,17 +321,11 @@ func (dExt *XTDriver) AIQuery(text string, opts ...option.ActionOption) (string,
return "", errors.New("LLM service is not initialized")
}
screenShotBase64, err := GetScreenShotBufferBase64(dExt.IDriver)
screenShotBase64, size, err := dExt.GetScreenshotBase64WithSize()
if err != nil {
return "", err
}
// get window size
size, err := dExt.IDriver.WindowSize()
if err != nil {
return "", errors.Wrap(err, "get window size for AI query failed")
}
// parse action options to extract OutputSchema
actionOptions := option.NewActionOptions(opts...)
@@ -360,17 +349,11 @@ func (dExt *XTDriver) AIAssert(assertion string, opts ...option.ActionOption) er
return errors.New("LLM service is not initialized")
}
screenShotBase64, err := GetScreenShotBufferBase64(dExt.IDriver)
screenShotBase64, size, err := dExt.GetScreenshotBase64WithSize()
if err != nil {
return err
}
// get window size
size, err := dExt.IDriver.WindowSize()
if err != nil {
return errors.Wrap(err, "get window size for AI assertion failed")
}
// execute assertion
assertOpts := &ai.AssertOptions{
Assertion: assertion,

View File

@@ -49,6 +49,26 @@ func (s *ScreenResult) FilterTextsByScope(x1, y1, x2, y2 float64) ai.OCRTexts {
})
}
// GetScreenshotBase64WithSize takes a screenshot and returns it together with
// the window size reported by the underlying driver.
//
// Returns:
//   - compressedBufBase64: the buffer produced by getScreenShotBuffer, base64
//     encoded and prefixed with "data:image/jpeg;base64,".
//   - size: the result of IDriver.WindowSize().
//   - err: the screenshot error as-is, or the window size error wrapped with
//     "get window size failed". On error the other results are zero values.
func (dExt *XTDriver) GetScreenshotBase64WithSize() (compressedBufBase64 string, size types.Size, err error) {
	// Capture through dExt.IDriver, the same driver that WindowSize is queried
	// on below, so the image and the size come from one source.
	compressBufSource, err := getScreenShotBuffer(dExt.IDriver)
	if err != nil {
		return "", types.Size{}, err
	}

	// get screen size
	size, err = dExt.IDriver.WindowSize()
	if err != nil {
		return "", types.Size{}, errors.Wrap(err, "get window size failed")
	}

	// Encode only after both driver calls succeeded, so no encoding work is
	// thrown away on the failure path.
	compressedBufBase64 = "data:image/jpeg;base64," +
		base64.StdEncoding.EncodeToString(compressBufSource.Bytes())
	return compressedBufBase64, size, nil
}
// GetScreenResult takes a screenshot, returns the image recognition result
func (dExt *XTDriver) GetScreenResult(opts ...option.ActionOption) (screenResult *ScreenResult, err error) {
// get compressed screenshot buffer
@@ -222,20 +242,6 @@ func getScreenShotBuffer(driver IDriver) (compressedBufSource *bytes.Buffer, err
return compressBufSource, nil
}
// GetScreenShotBufferBase64 captures a screenshot through driver and returns
// the buffer produced by getScreenShotBuffer as a base64 string prefixed with
// "data:image/jpeg;base64,". Any capture error is returned unchanged.
func GetScreenShotBufferBase64(driver IDriver) (compressedBufBase64 string, err error) {
	buf, err := getScreenShotBuffer(driver)
	if err != nil {
		return "", err
	}

	// build the data URI from the encoded buffer contents
	const dataURIPrefix = "data:image/jpeg;base64,"
	encoded := base64.StdEncoding.EncodeToString(buf.Bytes())
	return dataURIPrefix + encoded, nil
}
// saveScreenShot saves compressed image file with file name
func saveScreenShot(raw *bytes.Buffer, screenshotPath string) error {
// notice: screenshot data is a stream, so we need to copy it to a new buffer

View File

@@ -34,7 +34,7 @@ func (t *ToolScreenShot) Implement() server.ToolHandlerFunc {
if err != nil {
return nil, err
}
bufferBase64, err := GetScreenShotBufferBase64(driverExt.IDriver)
bufferBase64, _, err := driverExt.GetScreenshotBase64WithSize()
if err != nil {
log.Error().Err(err).Msg("ScreenShot failed")
return mcp.NewToolResultError(fmt.Sprintf("Failed to take screenshot: %v", err)), nil