diff --git a/internal/version/VERSION b/internal/version/VERSION index d425e058..5b19f4d5 100644 --- a/internal/version/VERSION +++ b/internal/version/VERSION @@ -1 +1 @@ -v5.0.0-beta-2506131027 +v5.0.0-beta-2506131201 diff --git a/uixt/driver_ext_ai.go b/uixt/driver_ext_ai.go index ea8af2d2..82abfa0b 100644 --- a/uixt/driver_ext_ai.go +++ b/uixt/driver_ext_ai.go @@ -2,7 +2,6 @@ package uixt import ( "context" - "encoding/base64" "time" "github.com/cloudwego/eino/schema" @@ -177,12 +176,8 @@ func (dExt *XTDriver) PlanNextAction(ctx context.Context, prompt string, opts .. // The planning screenshot is already stored in planningResult.ScreenResult dExt.GetSession().GetData(true) // reset session data to exclude planning screenshot from sub-actions - // convert buffer to base64 string for LLM - screenShotBase64 := "data:image/jpeg;base64," + - base64.StdEncoding.EncodeToString(screenResult.bufSource.Bytes()) - - // get window size - size, err := dExt.IDriver.WindowSize() + // get screen shot buffer base64 and size + screenShotBase64, size, err := dExt.GetScreenshotBase64WithSize() if err != nil { return nil, errors.Wrap(code.DeviceGetInfoError, err.Error()) } @@ -326,17 +321,11 @@ func (dExt *XTDriver) AIQuery(text string, opts ...option.ActionOption) (string, return "", errors.New("LLM service is not initialized") } - screenShotBase64, err := GetScreenShotBufferBase64(dExt.IDriver) + screenShotBase64, size, err := dExt.GetScreenshotBase64WithSize() if err != nil { return "", err } - // get window size - size, err := dExt.IDriver.WindowSize() - if err != nil { - return "", errors.Wrap(err, "get window size for AI query failed") - } - // parse action options to extract OutputSchema actionOptions := option.NewActionOptions(opts...) @@ -360,17 +349,11 @@ func (dExt *XTDriver) AIAssert(assertion string, opts ...option.ActionOption) er return errors.New("LLM service is not initialized") } - screenShotBase64, err := GetScreenShotBufferBase64(dExt.IDriver) + screenShotBase64, size, err := dExt.GetScreenshotBase64WithSize() if err != nil { return err } - // get window size - size, err := dExt.IDriver.WindowSize() - if err != nil { - return errors.Wrap(err, "get window size for AI assertion failed") - } - // execute assertion assertOpts := &ai.AssertOptions{ Assertion: assertion, diff --git a/uixt/driver_ext_screenshot.go b/uixt/driver_ext_screenshot.go index fe96f16c..e9329f78 100644 --- a/uixt/driver_ext_screenshot.go +++ b/uixt/driver_ext_screenshot.go @@ -49,6 +49,26 @@ func (s *ScreenResult) FilterTextsByScope(x1, y1, x2, y2 float64) ai.OCRTexts { }) } +// GetScreenshotBase64WithSize takes a screenshot, returns the compressed image buffer in base64 format and screen size +func (dExt *XTDriver) GetScreenshotBase64WithSize() (compressedBufBase64 string, size types.Size, err error) { + compressBufSource, err := getScreenShotBuffer(dExt) + if err != nil { + return "", types.Size{}, err + } + + // convert buffer to base64 string + screenShotBase64 := "data:image/jpeg;base64," + + base64.StdEncoding.EncodeToString(compressBufSource.Bytes()) + + // get screen size + size, err = dExt.IDriver.WindowSize() + if err != nil { + return "", types.Size{}, errors.Wrap(err, "get window size failed") + } + + return screenShotBase64, size, nil +} + // GetScreenResult takes a screenshot, returns the image recognition result func (dExt *XTDriver) GetScreenResult(opts ...option.ActionOption) (screenResult *ScreenResult, err error) { // get compressed screenshot buffer @@ -222,20 +242,6 @@ func getScreenShotBuffer(driver IDriver) (compressedBufSource *bytes.Buffer, err return compressBufSource, nil } -// GetScreenShotBufferBase64 takes a screenshot, returns the compressed image buffer in base64 format -func GetScreenShotBufferBase64(driver IDriver) (compressedBufBase64 string, err error) { - compressBufSource, err := getScreenShotBuffer(driver) - if err != nil { - return "", err - } - - // convert buffer to base64 string - screenShotBase64 := "data:image/jpeg;base64," + - base64.StdEncoding.EncodeToString(compressBufSource.Bytes()) - - return screenShotBase64, nil -} - // saveScreenShot saves compressed image file with file name func saveScreenShot(raw *bytes.Buffer, screenshotPath string) error { // notice: screenshot data is a stream, so we need to copy it to a new buffer diff --git a/uixt/mcp_tools_screen.go b/uixt/mcp_tools_screen.go index 326f001a..2d4f5393 100644 --- a/uixt/mcp_tools_screen.go +++ b/uixt/mcp_tools_screen.go @@ -34,7 +34,7 @@ func (t *ToolScreenShot) Implement() server.ToolHandlerFunc { if err != nil { return nil, err } - bufferBase64, err := GetScreenShotBufferBase64(driverExt.IDriver) + bufferBase64, _, err := driverExt.GetScreenshotBase64WithSize() if err != nil { log.Error().Err(err).Msg("ScreenShot failed") return mcp.NewToolResultError(fmt.Sprintf("Failed to take screenshot: %v", err)), nil