From e0b43eb0a1aebfa39ec9f61a221b92ebbcb17f58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=99=E6=B3=93=E9=93=AE?= Date: Fri, 15 Aug 2025 15:53:51 +0800 Subject: [PATCH 1/3] =?UTF-8?q?feat:=20=E5=85=BC=E5=AE=B9base64?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- uixt/ai/wings_service.go | 55 +++++++++++++++++++++++++++++++++++----- uixt/driver_ext_ai.go | 27 ++++++++++++-------- 2 files changed, 66 insertions(+), 16 deletions(-) diff --git a/uixt/ai/wings_service.go b/uixt/ai/wings_service.go index 4424f98a..235f2e82 100644 --- a/uixt/ai/wings_service.go +++ b/uixt/ai/wings_service.go @@ -73,7 +73,7 @@ func (w *WingsService) Plan(ctx context.Context, opts *PlanningOptions) (*Planni } // Get device info from context (if available) - deviceInfo := w.getDeviceInfoFromContext(ctx, screenshot) + deviceInfo := w.getDeviceInfoFromScreenshot(ctx, screenshot) // Prepare Wings API request apiRequest := WingsActionRequest{ @@ -391,12 +391,17 @@ func (w *WingsService) extractScreenshotFromMessage(message *schema.Message) (st return "", errors.New("no image found in message") } -// getDeviceInfoFromContext gets device info from context with fallback -func (w *WingsService) getDeviceInfoFromContext(_ context.Context, screenshot string) []WingsDeviceInfo { +// getDeviceInfoFromBase64 gets device info from base64 screenshot +func (w *WingsService) getDeviceInfoFromBase64(screenshotBase64 string) []WingsDeviceInfo { // TODO: Extract device info from context if available // Use last history's NowImage as PreImage if history exists - preImageUrl := screenshot + preImage := screenshotBase64 + if len(w.history) > 0 && w.history[len(w.history)-1].DeviceInfos != nil && len(*w.history[len(w.history)-1].DeviceInfos) > 0 { + preImage = (*w.history[len(w.history)-1].DeviceInfos)[0].NowImage + } + + preImageUrl := "" if len(w.history) > 0 && w.history[len(w.history)-1].DeviceInfos != nil && len(*w.history[len(w.history)-1].DeviceInfos) > 0 { 
preImageUrl = (*w.history[len(w.history)-1].DeviceInfos)[0].NowImageUrl } @@ -405,7 +410,38 @@ func (w *WingsService) getDeviceInfoFromContext(_ context.Context, screenshot st return []WingsDeviceInfo{ { DeviceID: "default-device", - NowImageUrl: screenshot, + NowImage: screenshotBase64, + NowImageUrl: "", + PreImage: preImage, + PreImageUrl: preImageUrl, + NowLayoutJSON: "", + OperationSystem: "android", + }, + } +} + +// getDeviceInfoFromUrl gets device info from url screenshot +func (w *WingsService) getDeviceInfoFromUrl(screenshotUrl string) []WingsDeviceInfo { + // TODO: Extract device info from context if available + + // Use last history's NowImage as PreImage if history exists + preImage := "" + if len(w.history) > 0 && w.history[len(w.history)-1].DeviceInfos != nil && len(*w.history[len(w.history)-1].DeviceInfos) > 0 { + preImage = (*w.history[len(w.history)-1].DeviceInfos)[0].NowImage + } + + preImageUrl := screenshotUrl + if len(w.history) > 0 && w.history[len(w.history)-1].DeviceInfos != nil && len(*w.history[len(w.history)-1].DeviceInfos) > 0 { + preImageUrl = (*w.history[len(w.history)-1].DeviceInfos)[0].NowImageUrl + } + + // use default device info with optimized PreImage + return []WingsDeviceInfo{ + { + DeviceID: "default-device", + NowImage: "", + NowImageUrl: screenshotUrl, + PreImage: preImage, PreImageUrl: preImageUrl, NowLayoutJSON: "", OperationSystem: "android", @@ -415,7 +451,14 @@ func (w *WingsService) getDeviceInfoFromContext(_ context.Context, screenshot st // getDeviceInfoFromScreenshot gets device info from screenshot (for Assert) func (w *WingsService) getDeviceInfoFromScreenshot(ctx context.Context, screenshot string) []WingsDeviceInfo { - return w.getDeviceInfoFromContext(ctx, screenshot) + if strings.HasPrefix(screenshot, "data:image/") { + // Remove data URL prefix like "data:image/jpeg;base64," + parts := strings.Split(screenshot, ",") + if len(parts) == 2 { + return w.getDeviceInfoFromBase64(parts[1]) + } + } + return 
w.getDeviceInfoFromUrl(screenshot) } // cleanScreenshotDataURL removes data URL prefix from screenshot string diff --git a/uixt/driver_ext_ai.go b/uixt/driver_ext_ai.go index 71d3f56c..ea0ef936 100644 --- a/uixt/driver_ext_ai.go +++ b/uixt/driver_ext_ai.go @@ -247,12 +247,15 @@ func (dExt *XTDriver) AIAssert(assertion string, opts ...option.ActionOption) (* return nil, errors.New("LLM service is not initialized") } + // Parse action options to get ResetHistory setting + options := option.NewActionOptions(opts...) + screenOptions := []option.ActionOption{option.WithScreenShotFileName("ai_action"), option.WithScreenShotBase64(true)} + if options.ScreenShotWithUpload { + screenOptions = append(screenOptions, option.WithScreenShotUpload(true)) + } + // Step 1: Take screenshot and convert to base64 - screenResult, err := dExt.GetScreenResult( - option.WithScreenShotFileName("ai_assert"), - option.WithScreenShotBase64(true), - option.WithScreenShotUpload(true), - ) + screenResult, err := dExt.GetScreenResult(screenOptions...) if err != nil { return nil, err } @@ -263,12 +266,17 @@ func (dExt *XTDriver) AIAssert(assertion string, opts ...option.ActionOption) (* ImagePath: screenResult.ImagePath, Resolution: &screenResult.Resolution, } - + var imageURL string + if screenResult.UploadedURL != "" { + imageURL = screenResult.UploadedURL + } else { + imageURL = screenResult.Base64 + } // Step 2: Call model and measure time modelCallStartTime := time.Now() assertOpts := &ai.AssertOptions{ Assertion: assertion, - Screenshot: screenResult.UploadedURL, + Screenshot: imageURL, Size: screenResult.Resolution, } result, err := dExt.LLMService.Assert(context.Background(), assertOpts) @@ -298,9 +306,8 @@ func (dExt *XTDriver) PlanNextAction(ctx context.Context, prompt string, opts .. // Parse action options to get ResetHistory setting options := option.NewActionOptions(opts...) resetHistory := options.ResetHistory - actionOptions := option.NewActionOptions(opts...) 
screenOptions := []option.ActionOption{option.WithScreenShotFileName("ai_action"), option.WithScreenShotBase64(true)} - if actionOptions.ScreenShotWithUpload { + if options.ScreenShotWithUpload { screenOptions = append(screenOptions, option.WithScreenShotUpload(true)) } @@ -320,7 +327,7 @@ func (dExt *XTDriver) PlanNextAction(ctx context.Context, prompt string, opts .. if screenResult.UploadedURL != "" { imageURL = screenResult.UploadedURL } else { - imageURL = screenResult.ImagePath + imageURL = screenResult.Base64 } planningOpts := &ai.PlanningOptions{ UserInstruction: prompt, From ad050b472366bfcc7334170d7d18cf1c7e7f243c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=99=E6=B3=93=E9=93=AE?= Date: Fri, 15 Aug 2025 15:56:51 +0800 Subject: [PATCH 2/3] =?UTF-8?q?feat:=20=E4=BF=AE=E6=94=B9=E5=90=8D?= =?UTF-8?q?=E5=AD=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- uixt/driver_ext_ai.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/uixt/driver_ext_ai.go b/uixt/driver_ext_ai.go index ea0ef936..c312f149 100644 --- a/uixt/driver_ext_ai.go +++ b/uixt/driver_ext_ai.go @@ -249,7 +249,7 @@ func (dExt *XTDriver) AIAssert(assertion string, opts ...option.ActionOption) (* // Parse action options to get ResetHistory setting options := option.NewActionOptions(opts...) - screenOptions := []option.ActionOption{option.WithScreenShotFileName("ai_action"), option.WithScreenShotBase64(true)} + screenOptions := []option.ActionOption{option.WithScreenShotFileName("ai_assert"), option.WithScreenShotBase64(true)} if options.ScreenShotWithUpload { screenOptions = append(screenOptions, option.WithScreenShotUpload(true)) } @@ -306,7 +306,7 @@ func (dExt *XTDriver) PlanNextAction(ctx context.Context, prompt string, opts .. // Parse action options to get ResetHistory setting options := option.NewActionOptions(opts...) 
resetHistory := options.ResetHistory - screenOptions := []option.ActionOption{option.WithScreenShotFileName("ai_action"), option.WithScreenShotBase64(true)} + screenOptions := []option.ActionOption{option.WithScreenShotFileName("ai_planning"), option.WithScreenShotBase64(true)} if options.ScreenShotWithUpload { screenOptions = append(screenOptions, option.WithScreenShotUpload(true)) } From 54825fb8a615101ad00b32fab508b32da0d9d402 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=99=E6=B3=93=E9=93=AE?= Date: Fri, 15 Aug 2025 16:01:46 +0800 Subject: [PATCH 3/3] =?UTF-8?q?feat:=20=E5=8F=AF=E9=80=89=E7=9A=84base64?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- uixt/driver_ext_ai.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/uixt/driver_ext_ai.go b/uixt/driver_ext_ai.go index c312f149..630eff95 100644 --- a/uixt/driver_ext_ai.go +++ b/uixt/driver_ext_ai.go @@ -252,6 +252,8 @@ func (dExt *XTDriver) AIAssert(assertion string, opts ...option.ActionOption) (* screenOptions := []option.ActionOption{option.WithScreenShotFileName("ai_assert"), option.WithScreenShotBase64(true)} if options.ScreenShotWithUpload { screenOptions = append(screenOptions, option.WithScreenShotUpload(true)) + } else { + screenOptions = append(screenOptions, option.WithScreenShotBase64(true)) } // Step 1: Take screenshot and convert to base64 @@ -306,9 +308,11 @@ func (dExt *XTDriver) PlanNextAction(ctx context.Context, prompt string, opts .. // Parse action options to get ResetHistory setting options := option.NewActionOptions(opts...) 
resetHistory := options.ResetHistory - screenOptions := []option.ActionOption{option.WithScreenShotFileName("ai_planning"), option.WithScreenShotBase64(true)} + screenOptions := []option.ActionOption{option.WithScreenShotFileName("ai_planning")} if options.ScreenShotWithUpload { screenOptions = append(screenOptions, option.WithScreenShotUpload(true)) + } else { + screenOptions = append(screenOptions, option.WithScreenShotBase64(true)) } // Step 1: Take screenshot and convert to base64