feat: parse url with js plugins support (#96)
* feat: WIP. add parser functionality and text message handling * fix: use json to marshal js result * feat: add metadata handling and version validation for jsParser * refactor: rename parser package to parsers and restructure parser handling * refactor: core code struct and impl parse task handle * feat: impl parsed download * fix: seek cache file when processing tph picture * feat: implement parsed task handling and progress tracking * feat: enhance task processing with concurrency control and progress tracking * feat: add resource ID generation and improve resource processing handling * feat: improve message formatting in parsed text and progress completion * feat: add example js plugin * feat: implement Twitter parser * fix: twitter parse video json decode error * feat: impl stream mode for parse task
This commit is contained in:
184
parsers/js.go
Normal file
184
parsers/js.go
Normal file
@@ -0,0 +1,184 @@
|
||||
package parsers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/blang/semver"
|
||||
"github.com/charmbracelet/log"
|
||||
"github.com/dop251/goja"
|
||||
"github.com/krau/SaveAny-Bot/pkg/parser"
|
||||
)
|
||||
|
||||
var (
|
||||
LatestParserVersion = semver.MustParse("1.0.0")
|
||||
MinimumParserVersion = semver.MustParse("1.0.0")
|
||||
)
|
||||
|
||||
// PluginMeta is the metadata object a JS parser plugin passes to
// registerParser. It is populated by decoding the plugin's "metadata"
// property from JSON.
type PluginMeta struct {
	// Name is read from the "name" key.
	Name string `json:"name"`
	// Version is read from the "version" key and is checked against
	// MinimumParserVersion and LatestParserVersion at registration.
	// TODO: parse per version; only v1 exists today, so this is not
	// implemented yet.
	Version string `json:"version"`
	// Description is read from the "description" key.
	Description string `json:"description"`
	// Author is read from the "author" key.
	Author string `json:"author"`
}
|
||||
|
||||
type jsParser struct {
|
||||
meta PluginMeta
|
||||
vm *goja.Runtime
|
||||
reqCh chan jsParserReq
|
||||
}
|
||||
|
||||
type jsParserReq struct {
|
||||
method string
|
||||
url string
|
||||
respCh chan jsParserResp
|
||||
}
|
||||
|
||||
type jsParserResp struct {
|
||||
item *parser.Item
|
||||
ok bool
|
||||
err error
|
||||
}
|
||||
|
||||
func (p *jsParser) CanHandle(url string) bool {
|
||||
respCh := make(chan jsParserResp, 1)
|
||||
p.reqCh <- jsParserReq{method: "canHandle", url: url, respCh: respCh}
|
||||
resp := <-respCh
|
||||
return resp.ok && resp.err == nil
|
||||
}
|
||||
|
||||
func (p *jsParser) Parse(url string) (*parser.Item, error) {
|
||||
respCh := make(chan jsParserResp, 1)
|
||||
p.reqCh <- jsParserReq{method: "parse", url: url, respCh: respCh}
|
||||
resp := <-respCh
|
||||
return resp.item, resp.err
|
||||
}
|
||||
|
||||
func newJSParser(vm *goja.Runtime, canHandleFunc, parseFunc goja.Value, metadata PluginMeta) *jsParser {
|
||||
p := &jsParser{
|
||||
vm: vm,
|
||||
reqCh: make(chan jsParserReq, 10),
|
||||
meta: metadata,
|
||||
}
|
||||
|
||||
go func() {
|
||||
for req := range p.reqCh {
|
||||
switch req.method {
|
||||
case "canHandle":
|
||||
fn, _ := goja.AssertFunction(canHandleFunc)
|
||||
res, err := fn(goja.Undefined(), p.vm.ToValue(req.url))
|
||||
if err != nil {
|
||||
req.respCh <- jsParserResp{ok: false, err: err}
|
||||
continue
|
||||
}
|
||||
req.respCh <- jsParserResp{ok: res.ToBoolean()}
|
||||
case "parse":
|
||||
fn, _ := goja.AssertFunction(parseFunc)
|
||||
result, err := fn(goja.Undefined(), p.vm.ToValue(req.url))
|
||||
if err != nil {
|
||||
req.respCh <- jsParserResp{err: err}
|
||||
continue
|
||||
}
|
||||
|
||||
var item parser.Item
|
||||
if exported := result.Export(); exported != nil {
|
||||
data, err := json.Marshal(exported)
|
||||
if err != nil {
|
||||
req.respCh <- jsParserResp{err: fmt.Errorf("failed to marshal result to JSON: %w", err)}
|
||||
continue
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(data, &item); err != nil {
|
||||
req.respCh <- jsParserResp{err: fmt.Errorf("failed to unmarshal JSON to Item: %w", err)}
|
||||
continue
|
||||
}
|
||||
} else {
|
||||
req.respCh <- jsParserResp{err: fmt.Errorf("JS function returned null or undefined")}
|
||||
continue
|
||||
}
|
||||
req.respCh <- jsParserResp{item: &item}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
func registerParser(vm *goja.Runtime) func(call goja.FunctionCall) goja.Value {
|
||||
return func(call goja.FunctionCall) goja.Value {
|
||||
jsObj := call.Argument(0)
|
||||
if jsObj == nil || goja.IsUndefined(jsObj) || goja.IsNull(jsObj) {
|
||||
panic("registerParser expects an object { canHandle, parse }")
|
||||
}
|
||||
|
||||
obj := jsObj.ToObject(vm)
|
||||
if obj == nil {
|
||||
panic("registerParser: cannot convert argument to object")
|
||||
}
|
||||
metaValue := obj.Get("metadata")
|
||||
if metaValue == nil || goja.IsUndefined(metaValue) {
|
||||
panic("parser must provide metadata")
|
||||
}
|
||||
var metadata PluginMeta
|
||||
if exported := metaValue.Export(); exported != nil {
|
||||
data, err := json.Marshal(exported)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("failed to marshal metadata to JSON: %v", err))
|
||||
}
|
||||
if err := json.Unmarshal(data, &metadata); err != nil {
|
||||
panic(fmt.Sprintf("failed to unmarshal JSON to PluginMeta: %v", err))
|
||||
}
|
||||
} else {
|
||||
panic("metadata cannot be null or undefined")
|
||||
}
|
||||
|
||||
pluginV := semver.MustParse(metadata.Version)
|
||||
if pluginV.LT(MinimumParserVersion) || pluginV.GT(LatestParserVersion) {
|
||||
panic(fmt.Sprintf("parser version %s is not supported, must be between %s and %s", metadata.Version, MinimumParserVersion, LatestParserVersion))
|
||||
}
|
||||
|
||||
handleFn := obj.Get("canHandle")
|
||||
parseFn := obj.Get("parse")
|
||||
if parseFn == nil || goja.IsUndefined(parseFn) {
|
||||
panic("parser must provide a parse function")
|
||||
}
|
||||
|
||||
parsers = append(parsers, newJSParser(vm, handleFn, parseFn, metadata))
|
||||
return goja.Undefined()
|
||||
}
|
||||
}
|
||||
|
||||
func LoadPlugins(ctx context.Context, dir string) error {
|
||||
entries, err := os.ReadDir(dir)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, e := range entries {
|
||||
if filepath.Ext(e.Name()) != ".js" {
|
||||
continue
|
||||
}
|
||||
scriptPath := filepath.Join(dir, e.Name())
|
||||
code, err := os.ReadFile(scriptPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
vm := goja.New()
|
||||
logger := log.FromContext(ctx).WithPrefix(fmt.Sprintf("[plugin|parser]/%s", e.Name()))
|
||||
vm.Set("registerParser", registerParser(vm))
|
||||
vm.Set("console", map[string]any{
|
||||
"log": func(args ...any) {
|
||||
logger.Info(fmt.Sprint(args...))
|
||||
},
|
||||
})
|
||||
|
||||
if _, err := vm.RunString(string(code)); err != nil {
|
||||
return fmt.Errorf("error loading plugin %s: %w", e.Name(), err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
65
parsers/parser.go
Normal file
65
parsers/parser.go
Normal file
@@ -0,0 +1,65 @@
|
||||
package parsers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sync"
|
||||
|
||||
"github.com/krau/SaveAny-Bot/parsers/twitter"
|
||||
"github.com/krau/SaveAny-Bot/pkg/parser"
|
||||
)
|
||||
|
||||
var (
|
||||
parsers []parser.Parser
|
||||
parsersMu sync.Mutex
|
||||
)
|
||||
|
||||
func GetParsers() []parser.Parser {
|
||||
parsersMu.Lock()
|
||||
defer parsersMu.Unlock()
|
||||
return parsers
|
||||
}
|
||||
|
||||
func AddParser(p parser.Parser) {
|
||||
parsersMu.Lock()
|
||||
defer parsersMu.Unlock()
|
||||
parsers = append(parsers, p)
|
||||
}
|
||||
|
||||
func init() {
|
||||
AddParser(new(twitter.TwitterParser))
|
||||
}
|
||||
|
||||
// ErrNoParserFound is returned by ParseWithContext when no registered parser
// reports that it can handle the URL.
var ErrNoParserFound = fmt.Errorf("no parser found for the given URL")
|
||||
|
||||
func ParseWithContext(ctx context.Context, url string) (*parser.Item, error) {
|
||||
ch := make(chan *parser.Item, 1)
|
||||
errCh := make(chan error, 1)
|
||||
|
||||
go func() {
|
||||
for _, pser := range parsers {
|
||||
if !pser.CanHandle(url) {
|
||||
continue
|
||||
}
|
||||
item, err := pser.Parse(url)
|
||||
if err != nil {
|
||||
errCh <- err
|
||||
return
|
||||
}
|
||||
ch <- item
|
||||
return
|
||||
}
|
||||
errCh <- ErrNoParserFound
|
||||
}()
|
||||
|
||||
select {
|
||||
case item := <-ch:
|
||||
return item, nil
|
||||
case err := <-errCh:
|
||||
return nil, err
|
||||
case <-ctx.Done():
|
||||
return nil, ctx.Err()
|
||||
}
|
||||
}
|
||||
83
parsers/twitter/parser.go
Normal file
83
parsers/twitter/parser.go
Normal file
@@ -0,0 +1,83 @@
|
||||
package twitter
|
||||
|
||||
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"net/http"
	"path"
	"regexp"
	"strings"
	"time"

	"github.com/krau/SaveAny-Bot/pkg/parser"
)
|
||||
|
||||
// TwitterParser resolves tweet URLs into parser items by querying the
// FxTwitter API.
type TwitterParser struct {
	// client is the HTTP client Parse uses for the API request.
	client http.Client
}
|
||||
|
||||
// FxTwitterApi is the host name of the FxTwitter API that Parse queries.
const FxTwitterApi = "api.fxtwitter.com"
|
||||
|
||||
// Compile-time assertion that *TwitterParser implements parser.Parser.
var (
	_ parser.Parser = (*TwitterParser)(nil)
)
|
||||
|
||||
// twitterSourceURLRegexp matches a tweet URL on twitter.com or x.com. The
// first capture group is the path segment before "/status/" and the second is
// the numeric status ID.
var twitterSourceURLRegexp = regexp.MustCompile(`(?:twitter|x)\.com/([^/]+)/status/(\d+)`)

// getTweetID extracts the numeric status ID from sourceURL. It returns the
// empty string when sourceURL does not match twitterSourceURLRegexp.
func getTweetID(sourceURL string) string {
	if groups := twitterSourceURLRegexp.FindStringSubmatch(sourceURL); len(groups) >= 3 {
		return groups[2]
	}
	return ""
}
|
||||
|
||||
func (p *TwitterParser) Parse(u string) (*parser.Item, error) {
|
||||
id := getTweetID(u)
|
||||
if id == "" {
|
||||
return nil, errors.New("invalid Twitter URL")
|
||||
}
|
||||
apiUrl := fmt.Sprintf("https://%s/_/status/%s", FxTwitterApi, id)
|
||||
resp, err := p.client.Get(apiUrl)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch Twitter API: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("failed to fetch Twitter API, status code: %d", resp.StatusCode)
|
||||
}
|
||||
var fxResp FxTwitterApiResp
|
||||
if err := json.NewDecoder(resp.Body).Decode(&fxResp); err != nil {
|
||||
return nil, fmt.Errorf("failed to decode Twitter API response: %w", err)
|
||||
}
|
||||
if fxResp.Code != 200 {
|
||||
return nil, fmt.Errorf("request twitter API error: %s", fxResp.Message)
|
||||
}
|
||||
if len(fxResp.Tweet.Media.All) == 0 {
|
||||
return nil, errors.New("no media found in the tweet")
|
||||
}
|
||||
resources := make([]parser.Resource, 0, len(fxResp.Tweet.Media.All))
|
||||
for _, media := range fxResp.Tweet.Media.All {
|
||||
resources = append(resources, parser.Resource{
|
||||
URL: media.URL,
|
||||
Filename: path.Base(strings.Split(media.URL, "?")[0]),
|
||||
})
|
||||
}
|
||||
item := &parser.Item{
|
||||
Site: "Twitter",
|
||||
Title: fmt.Sprintf("Tweet/%s", id),
|
||||
URL: fxResp.Tweet.URL,
|
||||
Description: fxResp.Tweet.Text,
|
||||
Author: fxResp.Tweet.Author.Name,
|
||||
Tags: make([]string, 0),
|
||||
Extra: make(map[string]any),
|
||||
Resources: resources,
|
||||
}
|
||||
return item, nil
|
||||
}
|
||||
|
||||
func (p *TwitterParser) CanHandle(u string) bool {
|
||||
return twitterSourceURLRegexp.MatchString(u)
|
||||
}
|
||||
122
parsers/twitter/types.go
Normal file
122
parsers/twitter/types.go
Normal file
@@ -0,0 +1,122 @@
|
||||
package twitter
|
||||
|
||||
// type AutoGenerated struct {
|
||||
// Code int `json:"code"`
|
||||
// Message string `json:"message"`
|
||||
// Tweet struct {
|
||||
// URL string `json:"url"`
|
||||
// ID string `json:"id"`
|
||||
// Text string `json:"text"`
|
||||
// RawText struct {
|
||||
// Text string `json:"text"`
|
||||
// Facets []struct {
|
||||
// Type string `json:"type"`
|
||||
// Indices []int `json:"indices"`
|
||||
// Original string `json:"original"`
|
||||
// ID string `json:"id,omitempty"`
|
||||
// Display string `json:"display,omitempty"`
|
||||
// Replacement string `json:"replacement,omitempty"`
|
||||
// } `json:"facets"`
|
||||
// } `json:"raw_text"`
|
||||
// Author struct {
|
||||
// ID string `json:"id"`
|
||||
// Name string `json:"name"`
|
||||
// ScreenName string `json:"screen_name"`
|
||||
// AvatarURL string `json:"avatar_url"`
|
||||
// BannerURL interface{} `json:"banner_url"`
|
||||
// Description string `json:"description"`
|
||||
// Location string `json:"location"`
|
||||
// URL string `json:"url"`
|
||||
// Followers int `json:"followers"`
|
||||
// Following int `json:"following"`
|
||||
// Joined string `json:"joined"`
|
||||
// Likes int `json:"likes"`
|
||||
// MediaCount int `json:"media_count"`
|
||||
// Protected bool `json:"protected"`
|
||||
// Website struct {
|
||||
// URL string `json:"url"`
|
||||
// DisplayURL string `json:"display_url"`
|
||||
// } `json:"website"`
|
||||
// Tweets int `json:"tweets"`
|
||||
// AvatarColor interface{} `json:"avatar_color"`
|
||||
// } `json:"author"`
|
||||
// Replies int `json:"replies"`
|
||||
// Retweets int `json:"retweets"`
|
||||
// Likes int `json:"likes"`
|
||||
// Bookmarks int `json:"bookmarks"`
|
||||
// CreatedAt string `json:"created_at"`
|
||||
// CreatedTimestamp int `json:"created_timestamp"`
|
||||
// PossiblySensitive bool `json:"possibly_sensitive"`
|
||||
// Views int `json:"views"`
|
||||
// IsNoteTweet bool `json:"is_note_tweet"`
|
||||
// CommunityNote interface{} `json:"community_note"`
|
||||
// Lang string `json:"lang"`
|
||||
// ReplyingTo interface{} `json:"replying_to"`
|
||||
// ReplyingToStatus interface{} `json:"replying_to_status"`
|
||||
// Media struct {
|
||||
// All []struct {
|
||||
// URL string `json:"url"`
|
||||
// ThumbnailURL string `json:"thumbnail_url"`
|
||||
// Duration int `json:"duration"`
|
||||
// Width int `json:"width"`
|
||||
// Height int `json:"height"`
|
||||
// Format string `json:"format"`
|
||||
// Type string `json:"type"`
|
||||
// Variants []struct {
|
||||
// Bitrate int `json:"bitrate"`
|
||||
// ContentType string `json:"content_type"`
|
||||
// URL string `json:"url"`
|
||||
// } `json:"variants"`
|
||||
// } `json:"all"`
|
||||
// Photos []struct {
|
||||
// Type string `json:"type"`
|
||||
// URL string `json:"url"`
|
||||
// Width int `json:"width"`
|
||||
// Height int `json:"height"`
|
||||
// } `json:"photos"`
|
||||
// Videos []struct {
|
||||
// URL string `json:"url"`
|
||||
// ThumbnailURL string `json:"thumbnail_url"`
|
||||
// Duration int `json:"duration"`
|
||||
// Width int `json:"width"`
|
||||
// Height int `json:"height"`
|
||||
// Format string `json:"format"`
|
||||
// Type string `json:"type"`
|
||||
// Variants []struct {
|
||||
// Bitrate int `json:"bitrate"`
|
||||
// ContentType string `json:"content_type"`
|
||||
// URL string `json:"url"`
|
||||
// } `json:"variants"`
|
||||
// } `json:"videos"`
|
||||
// } `json:"media"`
|
||||
// Source string `json:"source"`
|
||||
// TwitterCard string `json:"twitter_card"`
|
||||
// Color interface{} `json:"color"`
|
||||
// Provider string `json:"provider"`
|
||||
// } `json:"tweet"`
|
||||
// }
|
||||
|
||||
// FxTwitterApiResp holds the fields decoded from an FxTwitter status
// response. Keys in the response that have no field here are dropped by the
// JSON decoder.
type FxTwitterApiResp struct {
	// Code is the API's own status code; Parse treats anything other than
	// 200 as a failure.
	Code int `json:"code"`
	// Message is the text Parse includes in the error for a non-200 Code.
	Message string `json:"message"`
	// Tweet is the tweet payload.
	Tweet struct {
		URL  string `json:"url"`
		ID   string `json:"id"`
		Text string `json:"text"`
		// Author describes the account that posted the tweet.
		Author struct {
			ID         string `json:"id"`
			Name       string `json:"name"`
			ScreenName string `json:"screen_name"`
			Protected  bool   `json:"protected"`
		} `json:"author"`
		PossiblySensitive bool   `json:"possibly_sensitive"`
		IsNoteTweet       bool   `json:"is_note_tweet"`
		Lang              string `json:"lang"`
		// Media lists the tweet's attachments; Parse builds one resource
		// from each entry of All.
		Media struct {
			All []struct {
				URL  string `json:"url"`
				Type string `json:"type"`
			} `json:"all"`
		} `json:"media"`
	} `json:"tweet"`
}
|
||||
Reference in New Issue
Block a user