feat: parse url with js plugins support (#96)

* feat: WIP. add parser functionality and text message handling

* fix: use json to marshal js result

* feat: add metadata handling and version validation for jsParser

* refactor: rename parser package to parsers and restructure parser handling

* refactor: restructure core code and implement parse task handling

* feat: impl parsed download

* fix: seek cache file when processing tph picture

* feat: implement parsed task handling and progress tracking

* feat: enhance task processing with concurrency control and progress tracking

* feat: add resource ID generation and improve resource processing handling

* feat: improve message formatting in parsed text and progress completion

* feat: add example js plugin

* feat: implement Twitter parser

* fix: JSON decode error when parsing Twitter videos

* feat: impl stream mode for parse task
This commit is contained in:
Krau
2025-08-21 23:48:17 +08:00
committed by GitHub
parent 79386bdd7d
commit 302db2fe75
47 changed files with 1348 additions and 47 deletions

83
parsers/twitter/parser.go Normal file
View File

@@ -0,0 +1,83 @@
package twitter
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"net/http"
	"path"
	"regexp"
	"strings"
	"time"

	"github.com/krau/SaveAny-Bot/pkg/parser"
)
// TwitterParser resolves Twitter/X status URLs into parser items by querying
// the FxTwitter API.
type TwitterParser struct {
	// client is the HTTP client used to call the FxTwitter API.
	client http.Client
}
// FxTwitterApi is the host name of the FxTwitter API that tweet lookups are
// sent to.
const FxTwitterApi = "api.fxtwitter.com"
// Compile-time assertion that *TwitterParser satisfies parser.Parser.
var _ parser.Parser = (*TwitterParser)(nil)
// twitterSourceURLRegexp matches twitter.com and x.com status URLs.
// Capture group 1 is the path segment that precedes /status/ and capture
// group 2 is the numeric status ID. The pattern is unanchored, so it matches
// anywhere inside the input string.
var twitterSourceURLRegexp = regexp.MustCompile(`(?:twitter|x)\.com/([^/]+)/status/(\d+)`)
// getTweetID extracts the numeric status ID from a Twitter/X status URL.
// It returns an empty string when sourceURL does not match
// twitterSourceURLRegexp.
func getTweetID(sourceURL string) string {
	// Submatch layout: [0] whole match, [1] segment before /status/, [2] status ID.
	if groups := twitterSourceURLRegexp.FindStringSubmatch(sourceURL); len(groups) >= 3 {
		return groups[2]
	}
	return ""
}
// fxTwitterRequestTimeout bounds a single FxTwitter API call, including the
// time spent reading the response body.
const fxTwitterRequestTimeout = 30 * time.Second

// mediaFilename derives a file name from the last path segment of rawURL,
// ignoring any query string or fragment. It returns "" when the URL yields no
// usable segment (empty input, only slashes, or a trailing "." / "..").
func mediaFilename(rawURL string) string {
	trimmed := rawURL
	if i := strings.IndexAny(trimmed, "?#"); i >= 0 {
		trimmed = trimmed[:i]
	}
	name := path.Base(trimmed)
	switch name {
	case ".", "..", "/":
		// path.Base returns "." for an empty path and "/" for all slashes;
		// none of these is a valid file name.
		return ""
	}
	return name
}

// Parse looks up the tweet referenced by u through the FxTwitter API and
// returns its media as a parser.Item.
//
// u must contain a status URL accepted by getTweetID; otherwise an error is
// returned. Errors are also returned when the request cannot be built, fails
// or exceeds fxTwitterRequestTimeout, when the API answers with a non-200
// HTTP status or a non-200 code in its JSON payload, when the payload cannot
// be decoded, and when the tweet contains no downloadable media.
func (p *TwitterParser) Parse(u string) (*parser.Item, error) {
	id := getTweetID(u)
	if id == "" {
		return nil, errors.New("invalid Twitter URL")
	}

	// Parse receives no context from its caller, so the deadline is set here:
	// an http.Client whose Timeout is zero never times out on its own, and a
	// stalled connection would otherwise block this call indefinitely.
	ctx, cancel := context.WithTimeout(context.Background(), fxTwitterRequestTimeout)
	defer cancel()

	apiURL := fmt.Sprintf("https://%s/_/status/%s", FxTwitterApi, id)
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, apiURL, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to build Twitter API request: %w", err)
	}
	resp, err := p.client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch Twitter API: %w", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("failed to fetch Twitter API, status code: %d", resp.StatusCode)
	}

	var fxResp FxTwitterApiResp
	if err := json.NewDecoder(resp.Body).Decode(&fxResp); err != nil {
		return nil, fmt.Errorf("failed to decode Twitter API response: %w", err)
	}
	if fxResp.Code != 200 {
		return nil, fmt.Errorf("request twitter API error: %s", fxResp.Message)
	}

	resources := make([]parser.Resource, 0, len(fxResp.Tweet.Media.All))
	for _, media := range fxResp.Tweet.Media.All {
		filename := mediaFilename(media.URL)
		if filename == "" {
			// An empty or degenerate URL cannot be downloaded and would
			// otherwise produce a file named "." or "..".
			continue
		}
		resources = append(resources, parser.Resource{
			URL:      media.URL,
			Filename: filename,
		})
	}
	// Checked after filtering so that a tweet whose media entries are all
	// unusable is reported the same way as a tweet without media.
	if len(resources) == 0 {
		return nil, errors.New("no media found in the tweet")
	}

	return &parser.Item{
		Site:        "Twitter",
		Title:       fmt.Sprintf("Tweet/%s", id),
		URL:         fxResp.Tweet.URL,
		Description: fxResp.Tweet.Text,
		Author:      fxResp.Tweet.Author.Name,
		Tags:        make([]string, 0),
		Extra:       make(map[string]any),
		Resources:   resources,
	}, nil
}
// CanHandle reports whether u contains a twitter.com or x.com status URL, as
// recognised by twitterSourceURLRegexp.
func (p *TwitterParser) CanHandle(u string) bool {
	return twitterSourceURLRegexp.MatchString(u)
}

122
parsers/twitter/types.go Normal file
View File

@@ -0,0 +1,122 @@
package twitter
// type AutoGenerated struct {
// Code int `json:"code"`
// Message string `json:"message"`
// Tweet struct {
// URL string `json:"url"`
// ID string `json:"id"`
// Text string `json:"text"`
// RawText struct {
// Text string `json:"text"`
// Facets []struct {
// Type string `json:"type"`
// Indices []int `json:"indices"`
// Original string `json:"original"`
// ID string `json:"id,omitempty"`
// Display string `json:"display,omitempty"`
// Replacement string `json:"replacement,omitempty"`
// } `json:"facets"`
// } `json:"raw_text"`
// Author struct {
// ID string `json:"id"`
// Name string `json:"name"`
// ScreenName string `json:"screen_name"`
// AvatarURL string `json:"avatar_url"`
// BannerURL interface{} `json:"banner_url"`
// Description string `json:"description"`
// Location string `json:"location"`
// URL string `json:"url"`
// Followers int `json:"followers"`
// Following int `json:"following"`
// Joined string `json:"joined"`
// Likes int `json:"likes"`
// MediaCount int `json:"media_count"`
// Protected bool `json:"protected"`
// Website struct {
// URL string `json:"url"`
// DisplayURL string `json:"display_url"`
// } `json:"website"`
// Tweets int `json:"tweets"`
// AvatarColor interface{} `json:"avatar_color"`
// } `json:"author"`
// Replies int `json:"replies"`
// Retweets int `json:"retweets"`
// Likes int `json:"likes"`
// Bookmarks int `json:"bookmarks"`
// CreatedAt string `json:"created_at"`
// CreatedTimestamp int `json:"created_timestamp"`
// PossiblySensitive bool `json:"possibly_sensitive"`
// Views int `json:"views"`
// IsNoteTweet bool `json:"is_note_tweet"`
// CommunityNote interface{} `json:"community_note"`
// Lang string `json:"lang"`
// ReplyingTo interface{} `json:"replying_to"`
// ReplyingToStatus interface{} `json:"replying_to_status"`
// Media struct {
// All []struct {
// URL string `json:"url"`
// ThumbnailURL string `json:"thumbnail_url"`
// Duration int `json:"duration"`
// Width int `json:"width"`
// Height int `json:"height"`
// Format string `json:"format"`
// Type string `json:"type"`
// Variants []struct {
// Bitrate int `json:"bitrate"`
// ContentType string `json:"content_type"`
// URL string `json:"url"`
// } `json:"variants"`
// } `json:"all"`
// Photos []struct {
// Type string `json:"type"`
// URL string `json:"url"`
// Width int `json:"width"`
// Height int `json:"height"`
// } `json:"photos"`
// Videos []struct {
// URL string `json:"url"`
// ThumbnailURL string `json:"thumbnail_url"`
// Duration int `json:"duration"`
// Width int `json:"width"`
// Height int `json:"height"`
// Format string `json:"format"`
// Type string `json:"type"`
// Variants []struct {
// Bitrate int `json:"bitrate"`
// ContentType string `json:"content_type"`
// URL string `json:"url"`
// } `json:"variants"`
// } `json:"videos"`
// } `json:"media"`
// Source string `json:"source"`
// TwitterCard string `json:"twitter_card"`
// Color interface{} `json:"color"`
// Provider string `json:"provider"`
// } `json:"tweet"`
// }
// FxTwitterApiResp holds the fields of an FxTwitter API status response that
// this package decodes. JSON keys not listed here are ignored when decoding.
type FxTwitterApiResp struct {
	// Code is the status code reported inside the JSON payload.
	Code int `json:"code"`
	// Message is the status text that accompanies Code.
	Message string `json:"message"`
	// Tweet carries the tweet itself.
	Tweet struct {
		URL  string `json:"url"`
		ID   string `json:"id"`
		Text string `json:"text"`
		// Author describes the account that posted the tweet.
		Author struct {
			ID         string `json:"id"`
			Name       string `json:"name"`
			ScreenName string `json:"screen_name"`
			Protected  bool   `json:"protected"`
		} `json:"author"`
		PossiblySensitive bool   `json:"possibly_sensitive"`
		IsNoteTweet       bool   `json:"is_note_tweet"`
		Lang              string `json:"lang"`
		// Media lists the tweet's attachments.
		Media struct {
			// All contains one entry per attachment, decoded from the
			// "all" key of the media object.
			All []struct {
				URL  string `json:"url"`
				Type string `json:"type"`
			} `json:"all"`
		} `json:"media"`
	} `json:"tweet"`
}