From 038f4171ba531ddd5abe55a605b2ed33117efe12 Mon Sep 17 00:00:00 2001 From: "lilong.129" Date: Thu, 4 May 2023 17:39:59 +0800 Subject: [PATCH] feat: add popularity data for feed --- hrp/pkg/uixt/ext.go | 12 ++++++++++-- hrp/pkg/uixt/ocr_vedem.go | 35 +++++++++++++++++++++-------------- hrp/pkg/uixt/video_crawler.go | 27 ++++++++++++++++++++++----- 3 files changed, 53 insertions(+), 21 deletions(-) diff --git a/hrp/pkg/uixt/ext.go b/hrp/pkg/uixt/ext.go index d7dd00d5..8384709d 100644 --- a/hrp/pkg/uixt/ext.go +++ b/hrp/pkg/uixt/ext.go @@ -48,9 +48,17 @@ func WithThreshold(threshold float64) CVOption { } } +type Popularity struct { + Stars string `json:"stars"` // number of likes + Comments string `json:"comments"` // number of comments + Favorites string `json:"favorites"` // number of favorites + Shares string `json:"shares"` // number of shares +} + type OcrResult struct { - Texts OCRTexts `json:"texts"` // dumped OCRTexts - Tags []string `json:"tags"` // tags for image, e.g. ["feed", "ad", "live"] + Texts OCRTexts `json:"texts"` // dumped OCRTexts + Tags []string `json:"tags"` // tags for image, e.g. ["feed", "ad", "live"] + Popularity Popularity `json:"popularity"` // video popularity data } type cacheStepData struct { diff --git a/hrp/pkg/uixt/ocr_vedem.go b/hrp/pkg/uixt/ocr_vedem.go index fda92a98..152b7f40 100644 --- a/hrp/pkg/uixt/ocr_vedem.go +++ b/hrp/pkg/uixt/ocr_vedem.go @@ -52,26 +52,33 @@ func (t OCRTexts) texts() (texts []string) { return texts } +func (t OCRTexts) FilterScope(scope AbsScope) (results OCRTexts) { + for _, ocrText := range t { + rect := ocrText.Rect + + // check if text in scope + if len(scope) == 4 { + if rect.Min.X < scope[0] || + rect.Min.Y < scope[1] || + rect.Max.X > scope[2] || + rect.Max.Y > scope[3] { + // not in scope + continue + } + } + + results = append(results, ocrText) + } + return +} + func (t OCRTexts) FindText(text string, options ...ActionOption) ( result OCRText, err error) { actionOptions := NewActionOptions(options...) 
var results []OCRText - for _, ocrText := range t { - rect := ocrText.Rect - - // check if text in scope - if len(actionOptions.AbsScope) == 4 { - if rect.Min.X < actionOptions.AbsScope[0] || - rect.Min.Y < actionOptions.AbsScope[1] || - rect.Max.X > actionOptions.AbsScope[2] || - rect.Max.Y > actionOptions.AbsScope[3] { - // not in scope - continue - } - } - + for _, ocrText := range t.FilterScope(actionOptions.AbsScope) { if actionOptions.Regex { // regex on, check if match regex if !regexp.MustCompile(text).MatchString(ocrText.Text) { diff --git a/hrp/pkg/uixt/video_crawler.go b/hrp/pkg/uixt/video_crawler.go index 32a7b450..2b566a17 100644 --- a/hrp/pkg/uixt/video_crawler.go +++ b/hrp/pkg/uixt/video_crawler.go @@ -112,6 +112,21 @@ func (s *VideoStat) incrFeed(ocrResult *OcrResult, driverExt *DriverExt) error { } } + // add popularity data for feed + popularityData := ocrResult.Texts.FilterScope(driverExt.GenAbsScope(0.8, 0.5, 1, 0.8)) + if len(popularityData) != 4 { + log.Warn().Interface("popularity", popularityData).Msg("get popularity data failed") + } else { + ocrResult.Popularity = Popularity{ + Stars: popularityData[0].Text, + Comments: popularityData[1].Text, + Favorites: popularityData[2].Text, + Shares: popularityData[3].Text, + } + log.Info().Interface("popularity", ocrResult.Popularity). 
+ Msg("found feed popularity success") + } + s.FeedCount++ return nil } @@ -308,7 +323,6 @@ func (dExt *DriverExt) VideoCrawler(configs *VideoCrawlerConfigs) (err error) { continue } ocrResult := dExt.cacheStepData.OcrResults[imagePath] - ocrResult.Tags = []string{"feed"} // automatic handling of pop-up windows if err := dExt.autoPopupHandler(ocrResult); err != nil { @@ -328,11 +342,14 @@ func (dExt *DriverExt) VideoCrawler(configs *VideoCrawlerConfigs) (err error) { continue } } - } + ocrResult.Tags = []string{"live-preview"} + } else { + ocrResult.Tags = []string{"feed"} - // check feed type and incr feed count - if err := currVideoStat.incrFeed(ocrResult, dExt); err != nil { - log.Error().Err(err).Msg("incr feed failed") + // check feed type and incr feed count + if err := currVideoStat.incrFeed(ocrResult, dExt); err != nil { + log.Error().Err(err).Msg("incr feed failed") + } } // sleep custom random time