From 1a3cabcc842cf7c7bf9c1cc05268b713fe8a6ce0 Mon Sep 17 00:00:00 2001 From: "lilong.129" Date: Mon, 21 Aug 2023 22:57:08 +0800 Subject: [PATCH] refactor: video crawler --- hrp/pkg/uixt/ext.go | 6 +- hrp/pkg/uixt/video_crawler.go | 508 +++++++++++++++++----------------- 2 files changed, 251 insertions(+), 263 deletions(-) diff --git a/hrp/pkg/uixt/ext.go b/hrp/pkg/uixt/ext.go index ab1a3659..b6306a8c 100644 --- a/hrp/pkg/uixt/ext.go +++ b/hrp/pkg/uixt/ext.go @@ -74,14 +74,14 @@ type cacheStepData struct { // cache step screenshot ocr results, key is image path, value is ScreenResult screenResults map[string]*ScreenResult // cache feed/live video stat - videoStat *VideoStat + videoCrawler *VideoCrawler } func (d *cacheStepData) reset() { d.screenShots = make([]string, 0) d.screenShotsUrls = make(map[string]string) d.screenResults = make(map[string]*ScreenResult) - d.videoStat = nil + d.videoCrawler = nil } type DriverExt struct { @@ -217,7 +217,7 @@ func (dExt *DriverExt) saveScreenShot(raw *bytes.Buffer, fileName string) (strin func (dExt *DriverExt) GetStepCacheData() map[string]interface{} { cacheData := make(map[string]interface{}) - cacheData["video_stat"] = dExt.cacheStepData.videoStat + cacheData["video_stat"] = dExt.cacheStepData.videoCrawler cacheData["screenshots"] = dExt.cacheStepData.screenShots cacheData["screenshots_urls"] = dExt.cacheStepData.screenShotsUrls diff --git a/hrp/pkg/uixt/video_crawler.go b/hrp/pkg/uixt/video_crawler.go index 5be39e92..2158402c 100644 --- a/hrp/pkg/uixt/video_crawler.go +++ b/hrp/pkg/uixt/video_crawler.go @@ -3,10 +3,8 @@ package uixt import ( "fmt" "regexp" - "strings" "time" - "github.com/httprunner/funplugin" "github.com/pkg/errors" "github.com/rs/zerolog/log" @@ -14,161 +12,6 @@ import ( "github.com/httprunner/httprunner/v4/hrp/internal/json" ) -type VideoStat struct { - configs *VideoCrawlerConfigs - timer *time.Timer - - FeedCount int `json:"feed_count"` - FeedStat map[string]int `json:"feed_stat"` // 分类统计 feed 数量:视频/图文/广告/特效/模板/购物 - LiveCount int `json:"live_count"` - LiveStat map[string]int `json:"live_stat"` // 分类统计 live 数量:秀场/游戏/电商/多人 -} - -func (s *VideoStat) isFeedTargetAchieved() bool { - targetStat := make(map[string]int) - for _, targetLabel := range s.configs.Feed.TargetLabels { - targetStat[targetLabel.Text] = targetLabel.Target - } - - log.Info(). - Int("current_total", s.FeedCount). - Interface("current_stat", s.FeedStat). - Int("target_total", s.configs.Feed.TargetCount). - Interface("target_stat", targetStat). - Msg("display feed crawler progress") - - // check total feed count - if s.FeedCount < s.configs.Feed.TargetCount { - return false - } - - // check each feed type's count - for _, targetLabel := range s.configs.Feed.TargetLabels { - if s.FeedStat[targetLabel.Text] < targetLabel.Target { - return false - } - } - - return true -} - -func (s *VideoStat) isLiveTargetAchieved() bool { - targetStat := make(map[string]int) - for _, targetLabel := range s.configs.Live.TargetLabels { - targetStat[targetLabel.Text] = targetLabel.Target - } - - log.Info(). - Int("current_total", s.LiveCount). - Interface("current_stat", s.LiveStat). - Int("target_total", s.configs.Live.TargetCount). - Interface("target_stat", targetStat). - Msg("display live crawler progress") - - // check total live count - if s.LiveCount < s.configs.Live.TargetCount { - return false - } - - // check each live type's count - for _, targetLabel := range s.configs.Live.TargetLabels { - if s.LiveStat[targetLabel.Text] < targetLabel.Target { - return false - } - } - - return true -} - -func (s *VideoStat) isTargetAchieved() bool { - return s.isFeedTargetAchieved() && s.isLiveTargetAchieved() -} - -// incrFeed increases feed count and feed stat -func (s *VideoStat) incrFeed(screenResult *ScreenResult, driverExt *DriverExt) error { - screenResult.VideoType = "feed" - - var author string - if screenResult.Texts != nil { - // handle screenshot - // find feed author - actionOptions := []ActionOption{ - WithRegex(true), - driverExt.GenAbsScope(0, 0.5, 1, 1).Option(), - } - ocrText, err := screenResult.Texts.FindText("^@", actionOptions...) - if err != nil { - return errors.Wrap(err, "find feed author failed") - } - author = fmt.Sprintf("@%s", removeNonAlphanumeric(ocrText.Text)) - log.Info().Str("author", author).Msg("found feed author by OCR") - - // find target labels - for _, targetLabel := range s.configs.Feed.TargetLabels { - scope := targetLabel.Scope - actionOptions := []ActionOption{ - WithRegex(targetLabel.Regex), - driverExt.GenAbsScope(scope[0], scope[1], scope[2], scope[3]).Option(), - } - if _, err := screenResult.Texts.FindText(targetLabel.Text, actionOptions...); err == nil { - key := targetLabel.Text - if _, ok := s.FeedStat[key]; !ok { - s.FeedStat[key] = 0 - } - s.FeedStat[key]++ - screenResult.Tags = append(screenResult.Tags, key) - } - } - } - - if screenResult.Feed == nil { - // get feed trackings by author - if driverExt.plugin != nil { - feedVideo, err := getFeedVideo(driverExt.plugin, author) - if err != nil { - return errors.Wrap(err, "get feed video from plugin failed") - } - screenResult.Feed = feedVideo - } else { - screenResult.Feed = &FeedVideo{} - } - } - - // get simulation play duration - if screenResult.Feed.SimulationPlayDuration != 0 { - screenResult.Feed.PlayDuration = screenResult.Feed.SimulationPlayDuration - } else { - screenResult.Feed.RandomPlayDuration = getSimulationDuration(s.configs.Feed.SleepRandom) - screenResult.Feed.PlayDuration = screenResult.Feed.RandomPlayDuration - } - - log.Info().Strs("tags", screenResult.Tags). - Interface("feed", screenResult.Feed). - Msg("found feed success") - s.FeedCount++ - return nil -} - -// incrLive increases live count and live stat -func (s *VideoStat) incrLive(screenResult *ScreenResult, driverExt *DriverExt) error { - screenResult.VideoType = "live" - // TODO: check live type - - if screenResult.Live == nil { - screenResult.Live = &LiveRoom{} - } - - // TODO: add popularity data for live - - screenResult.Live.SimulationWatchDuration = getSimulationDuration(s.configs.Live.SleepRandom) - - log.Info().Strs("tags", screenResult.Tags). - Interface("live", screenResult.Live). - Msg("found live success") - s.LiveCount++ - return nil -} - type TargetLabel struct { Text string `json:"text"` Scope Scope `json:"scope"` @@ -195,13 +38,174 @@ type VideoCrawlerConfigs struct { Live LiveConfig `json:"live"` } -type LiveCrawler struct { - driver *DriverExt - configs *VideoCrawlerConfigs // target video count - currentStat *VideoStat // current video stat +type VideoCrawler struct { + driverExt *DriverExt + configs *VideoCrawlerConfigs + timer *time.Timer + + FeedCount int `json:"feed_count"` + FeedStat map[string]int `json:"feed_stat"` // 分类统计 feed 数量:视频/图文/广告/特效/模板/购物 + LiveCount int `json:"live_count"` + LiveStat map[string]int `json:"live_stat"` // 分类统计 live 数量:秀场/游戏/电商/多人 } -func (l *LiveCrawler) checkLiveVideo(texts OCRTexts) (enterPoint PointF, yes bool) { +func (vc *VideoCrawler) isFeedTargetAchieved() bool { + targetStat := make(map[string]int) + for _, targetLabel := range vc.configs.Feed.TargetLabels { + targetStat[targetLabel.Text] = targetLabel.Target + } + + log.Info(). + Int("current_total", vc.FeedCount). + Interface("current_stat", vc.FeedStat). + Int("target_total", vc.configs.Feed.TargetCount). + Interface("target_stat", targetStat). + Msg("display feed crawler progress") + + // check total feed count + if vc.FeedCount < vc.configs.Feed.TargetCount { + return false + } + + // check each feed type's count + for _, targetLabel := range vc.configs.Feed.TargetLabels { + if vc.FeedStat[targetLabel.Text] < targetLabel.Target { + return false + } + } + + return true +} + +func (vc *VideoCrawler) isLiveTargetAchieved() bool { + targetStat := make(map[string]int) + for _, targetLabel := range vc.configs.Live.TargetLabels { + targetStat[targetLabel.Text] = targetLabel.Target + } + + log.Info(). + Int("current_total", vc.LiveCount). + Interface("current_stat", vc.LiveStat). + Int("target_total", vc.configs.Live.TargetCount). + Interface("target_stat", targetStat). + Msg("display live crawler progress") + + // check total live count + if vc.LiveCount < vc.configs.Live.TargetCount { + return false + } + + // check each live type's count + for _, targetLabel := range vc.configs.Live.TargetLabels { + if vc.LiveStat[targetLabel.Text] < targetLabel.Target { + return false + } + } + + return true +} + +func (vc *VideoCrawler) isTargetAchieved() bool { + return vc.isFeedTargetAchieved() && vc.isLiveTargetAchieved() +} + +// incrFeed increases feed count and feed stat +func (vc *VideoCrawler) incrFeed(screenResult *ScreenResult) error { + screenResult.VideoType = "feed" + + var author string + if screenResult.Texts != nil { + // handle screenshot + // find feed author + actionOptions := []ActionOption{ + WithRegex(true), + vc.driverExt.GenAbsScope(0, 0.5, 1, 1).Option(), + } + ocrText, err := screenResult.Texts.FindText("^@", actionOptions...) + if err != nil { + return errors.Wrap(err, "find feed author failed") + } + author = fmt.Sprintf("@%s", removeNonAlphanumeric(ocrText.Text)) + log.Info().Str("author", author).Msg("found feed author by OCR") + + // find target labels + for _, targetLabel := range vc.configs.Feed.TargetLabels { + scope := targetLabel.Scope + actionOptions := []ActionOption{ + WithRegex(targetLabel.Regex), + vc.driverExt.GenAbsScope(scope[0], scope[1], scope[2], scope[3]).Option(), + } + if _, err := screenResult.Texts.FindText(targetLabel.Text, actionOptions...); err == nil { + key := targetLabel.Text + if _, ok := vc.FeedStat[key]; !ok { + vc.FeedStat[key] = 0 + } + vc.FeedStat[key]++ + screenResult.Tags = append(screenResult.Tags, key) + } + } + } + + if screenResult.Feed == nil { + // get feed trackings by author + if vc.driverExt.plugin != nil { + feedVideo, err := vc.getFeedVideo(author) + if err != nil { + return errors.Wrap(err, "get feed video from plugin failed") + } + screenResult.Feed = feedVideo + } else { + screenResult.Feed = &FeedVideo{} + } + } + + // get simulation play duration + if screenResult.Feed.SimulationPlayDuration != 0 { + screenResult.Feed.PlayDuration = screenResult.Feed.SimulationPlayDuration + } else { + screenResult.Feed.RandomPlayDuration = getSimulationDuration(vc.configs.Feed.SleepRandom) + screenResult.Feed.PlayDuration = screenResult.Feed.RandomPlayDuration + } + + log.Info().Strs("tags", screenResult.Tags). + Interface("feed", screenResult.Feed). + Msg("found feed success") + vc.FeedCount++ + return nil +} + +// incrLive increases live count and live stat +func (vc *VideoCrawler) incrLive(screenResult *ScreenResult) error { + screenResult.VideoType = "live" + // TODO: check live type + + if screenResult.Live == nil { + screenResult.Live = &LiveRoom{} + } + + // TODO: add popularity data for live + + screenResult.Live.SimulationWatchDuration = getSimulationDuration(vc.configs.Live.SleepRandom) + + log.Info().Strs("tags", screenResult.Tags). + Interface("live", screenResult.Live). + Msg("found live success") + vc.LiveCount++ + return nil +} + +func (vc *VideoCrawler) checkLiveVideo(feedVideo *FeedVideo) (enterPoint PointF, yes bool) { + // TODO: check if preview-live from feedVideo + if feedVideo.Type != "live" { + return PointF{}, false + } + + // take screenshot and get OCR texts via image service + texts, err := vc.driverExt.GetScreenTexts() + if err != nil { + return PointF{}, false + } + // 预览流入口:DY/KS // 标签文案:点击进入直播间|进入直播间领金币 points, err := texts.FindTexts([]string{".*进入直播间.*"}, WithScope(0, 0.3, 1, 0.8), WithRegex(true)) @@ -233,25 +237,25 @@ func (l *LiveCrawler) checkLiveVideo(texts OCRTexts) (enterPoint PointF, yes boo } // run live video crawler -func (l *LiveCrawler) Run(driver *DriverExt, enterPoint PointF) error { +func (vc *VideoCrawler) startLiveCrawler(enterPoint PointF) error { log.Info().Msg("enter live room") - if err := driver.TapAbsXY(enterPoint.X, enterPoint.Y); err != nil { + if err := vc.driverExt.TapAbsXY(enterPoint.X, enterPoint.Y); err != nil { log.Error().Err(err).Msg("tap live video failed") return err } time.Sleep(5 * time.Second) - for !l.currentStat.isLiveTargetAchieved() { + for !vc.isLiveTargetAchieved() { select { - case <-l.currentStat.timer.C: + case <-vc.timer.C: log.Warn().Msg("timeout in live crawler") return errors.Wrap(code.TimeoutError, "live crawler timeout") - case <-l.driver.interruptSignal: + case <-vc.driverExt.interruptSignal: log.Warn().Msg("interrupted in live crawler") return errors.Wrap(code.InterruptError, "live crawler interrupted") default: // swipe to next live video swipeStartTime := time.Now() - if err := l.driver.SwipeUp(); err != nil { + if err := vc.driverExt.SwipeUp(); err != nil { log.Error().Err(err).Msg("live swipe up failed") return err } @@ -261,7 +265,7 @@ func (l *LiveCrawler) Run(driver *DriverExt, enterPoint PointF) error { time.Sleep(5 * time.Second) // take screenshot and get screen texts by OCR - screenResult, err := l.driver.GetScreenResult() + screenResult, err := vc.driverExt.GetScreenResult() if err != nil { log.Error().Err(err).Msg("OCR GetTexts failed") time.Sleep(3 * time.Second) @@ -269,7 +273,7 @@ func (l *LiveCrawler) Run(driver *DriverExt, enterPoint PointF) error { } // check live type and incr live count - if err := l.currentStat.incrLive(screenResult, l.driver); err != nil { + if err := vc.incrLive(screenResult); err != nil { log.Error().Err(err).Msg("incr live failed") } @@ -285,22 +289,22 @@ func (l *LiveCrawler) Run(driver *DriverExt, enterPoint PointF) error { log.Info().Msg("live count achieved, exit live room") - return l.exitLiveRoom() + return vc.exitLiveRoom() } -func (l *LiveCrawler) exitLiveRoom() error { +func (vc *VideoCrawler) exitLiveRoom() error { for i := 0; i < 3; i++ { - l.driver.SwipeRelative(0.1, 0.5, 0.9, 0.5) + vc.driverExt.SwipeRelative(0.1, 0.5, 0.9, 0.5) time.Sleep(2 * time.Second) } // exit live room failed, while video count achieved - if l.currentStat.isTargetAchieved() { + if vc.isTargetAchieved() { return nil } // click X button on upper-right corner - if err := l.driver.TapXY(0.95, 0.05); err == nil { + if err := vc.driverExt.TapXY(0.95, 0.05); err == nil { log.Info().Msg("tap X button on upper-right corner to exit live room") time.Sleep(2 * time.Second) } @@ -309,6 +313,10 @@ func (l *LiveCrawler) exitLiveRoom() error { } func (dExt *DriverExt) VideoCrawler(configs *VideoCrawlerConfigs) (err error) { + if dExt.plugin == nil { + return errors.New("miss plugin for video crawler") + } + // set default sleep random strategy if not set if configs.Feed.SleepRandom == nil { configs.Feed.SleepRandom = []interface{}{1, 5} @@ -317,8 +325,9 @@ func (dExt *DriverExt) VideoCrawler(configs *VideoCrawlerConfigs) (err error) { configs.Live.SleepRandom = []interface{}{10, 15} } - currVideoStat := &VideoStat{ - configs: configs, + crawler := &VideoCrawler{ + driverExt: dExt, + configs: configs, FeedCount: 0, FeedStat: make(map[string]int), @@ -326,21 +335,15 @@ func (dExt *DriverExt) VideoCrawler(configs *VideoCrawlerConfigs) (err error) { LiveStat: make(map[string]int), } defer func() { - dExt.cacheStepData.videoStat = currVideoStat + dExt.cacheStepData.videoCrawler = crawler }() - liveCrawler := LiveCrawler{ - driver: dExt, - configs: configs, - currentStat: currVideoStat, - } - // loop until target count achieved or timeout // the main loop is feed crawler - currVideoStat.timer = time.NewTimer(time.Duration(configs.Timeout) * time.Second) + crawler.timer = time.NewTimer(time.Duration(configs.Timeout) * time.Second) for { select { - case <-currVideoStat.timer.C: + case <-crawler.timer.C: log.Warn().Msg("timeout in feed crawler") return errors.Wrap(code.TimeoutError, "feed crawler timeout") case <-dExt.interruptSignal: @@ -357,44 +360,25 @@ func (dExt *DriverExt) VideoCrawler(configs *VideoCrawlerConfigs) (err error) { swipeFinishTime := time.Now() var screenResult *ScreenResult - if dExt.plugin != nil { - // get screen info from app event trackings - if feedVideo, err := getCurrentFeedVideo(dExt.plugin); err == nil && feedVideo != nil { - screenResult = &ScreenResult{ - Feed: feedVideo, - Texts: nil, - Tags: nil, - } - dExt.cacheStepData.screenResults[time.Now().String()] = screenResult - } + // get app event trackings + feedVideo, err := crawler.getCurrentFeedVideo() + if err != nil { + return errors.Wrap(err, "get app event trackings failed") } - - if screenResult == nil { - // take screenshot and get screen texts by OCR - screenResult, err = dExt.GetScreenResult() - if err != nil { - if strings.Contains(err.Error(), "connect: connection refused") { - return err - } - log.Error().Err(err).Msg("OCR GetTexts failed") - time.Sleep(3 * time.Second) - continue - } - } - - // automatic handling of pop-up windows - if err := dExt.AutoPopupHandler(screenResult.Texts); err != nil { - log.Error().Err(err).Msg("auto handle popup failed") - return err + screenResult = &ScreenResult{ + Feed: feedVideo, + Texts: nil, + Tags: nil, } + dExt.cacheStepData.screenResults[time.Now().String()] = screenResult // check if live video && run live crawler - if enterPoint, isLive := liveCrawler.checkLiveVideo(screenResult.Texts); isLive { + if enterPoint, isLive := crawler.checkLiveVideo(feedVideo); isLive { // 直播预览流 screenResult.VideoType = "live-preview" log.Info().Msg("live video found") - if !liveCrawler.currentStat.isLiveTargetAchieved() { - if err := liveCrawler.Run(dExt, enterPoint); err != nil { + if !crawler.isLiveTargetAchieved() { + if err := crawler.startLiveCrawler(enterPoint); err != nil { if errors.Is(err, code.TimeoutError) || errors.Is(err, code.InterruptError) { return err } @@ -405,7 +389,7 @@ func (dExt *DriverExt) VideoCrawler(configs *VideoCrawlerConfigs) (err error) { } else { // 点播 // check feed type and incr feed count - err := currVideoStat.incrFeed(screenResult, dExt) + err := crawler.incrFeed(screenResult) if err != nil { log.Warn().Err(err).Msg("incr feed failed") } else { @@ -415,7 +399,7 @@ func (dExt *DriverExt) VideoCrawler(configs *VideoCrawlerConfigs) (err error) { } // check if target count achieved - if currVideoStat.isTargetAchieved() { + if crawler.isTargetAchieved() { log.Info().Msg("target count achieved, exit crawler") return nil } @@ -428,43 +412,6 @@ func (dExt *DriverExt) VideoCrawler(configs *VideoCrawlerConfigs) (err error) { } } -func getFeedVideo(plugin funplugin.IPlugin, authorName string) (feedVideo *FeedVideo, err error) { - if !plugin.Has("GetFeedVideo") { - return nil, errors.New("plugin missing GetFeedVideo method") - } - - resp, err := plugin.Call("GetFeedVideo", authorName) - if err != nil { - return nil, errors.Wrap(err, "call plugin GetFeedVideo failed") - } - - if resp == nil { - return nil, errors.New("feed not found") - } - - feedBytes, err := json.Marshal(resp) - if err != nil { - return nil, errors.New("json marshal feed video info failed") - } - - feedVideo = &FeedVideo{} - err = json.Unmarshal(feedBytes, feedVideo) - if err != nil { - return nil, errors.Wrap(err, "json unmarshal feed video info failed") - } - - log.Info().Interface("feedVideo", feedVideo).Msg("get feed video success") - return feedVideo, nil -} - -func removeNonAlphanumeric(input string) string { - // 使用正则表达式匹配中英文字符以外的内容 - re := regexp.MustCompile(`[^\p{L}\p{N}]+`) - // 删除匹配到的非中英文字符 - processed := re.ReplaceAllString(input, "") - return processed -} - type FeedVideo struct { // 视频基础数据 CacheKey string `json:"cache_key"` // 视频 CacheKey @@ -510,15 +457,45 @@ type LiveRoom struct { PreloadTimestamp int64 `json:"preload_timestamp"` // feed 预加载时间戳 } -func getCurrentFeedVideo(plugin funplugin.IPlugin) (feedVideo *FeedVideo, err error) { - if !plugin.Has("GetCurrentFeedVideo") { +func (vc *VideoCrawler) getFeedVideo(authorName string) (feedVideo *FeedVideo, err error) { + if !vc.driverExt.plugin.Has("GetFeedVideo") { + return nil, errors.New("plugin missing GetFeedVideo method") + } + + resp, err := vc.driverExt.plugin.Call("GetFeedVideo", authorName) + if err != nil { + return nil, errors.Wrap(err, "call plugin GetFeedVideo failed") + } + + if resp == nil { + return nil, errors.New("feed not found") + } + + feedBytes, err := json.Marshal(resp) + if err != nil { + return nil, errors.New("json marshal feed video info failed") + } + + feedVideo = &FeedVideo{} + err = json.Unmarshal(feedBytes, feedVideo) + if err != nil { + return nil, errors.Wrap(err, "json unmarshal feed video info failed") + } + + log.Info().Interface("feedVideo", feedVideo).Msg("get feed video success") + return feedVideo, nil +} + +func (vc *VideoCrawler) getCurrentFeedVideo() (feedVideo *FeedVideo, err error) { + if !vc.driverExt.plugin.Has("GetCurrentFeedVideo") { return nil, errors.New("plugin missing GetCurrentFeedVideo method") } // FIXME: wait for cache update time.Sleep(2000 * time.Millisecond) - resp, err := plugin.Call("GetCurrentFeedVideo") + // TODO: retry 3 times if get failed, abort if fail more than 3 times + resp, err := vc.driverExt.plugin.Call("GetCurrentFeedVideo") if err != nil { return nil, errors.Wrap(err, "call plugin GetCurrentFeedVideo failed") } @@ -538,12 +515,23 @@ func getCurrentFeedVideo(plugin funplugin.IPlugin) (feedVideo *FeedVideo, err er return nil, errors.Wrap(err, "json unmarshal feed video info failed") } + // TODO: check if app event trackings changed + // TODO: check and handle popups if event trackings not changed log.Info(). Interface("feedVideoCaption", feedVideo.Caption). Msg("get current feed video success") return feedVideo, nil } -func getCurrentLiveRoom(plugin funplugin.IPlugin) (liveVideo *LiveRoom, err error) { +func (vc *VideoCrawler) getCurrentLiveRoom() (liveVideo *LiveRoom, err error) { + // TODO return } + +func removeNonAlphanumeric(input string) string { + // 使用正则表达式匹配中英文字符以外的内容 + re := regexp.MustCompile(`[^\p{L}\p{N}]+`) + // 删除匹配到的非中英文字符 + processed := re.ReplaceAllString(input, "") + return processed +}