feat: parse url with js plugins support (#96)

* feat: WIP. add parser functionality and text message handling

* fix: use json to marshal js result

* feat: add metadata handling and version validation for jsParser

* refactor: rename parser package to parsers and restructure parser handling

* refactor: restructure core code and implement parse task handling

* feat: impl parsed download

* fix: seek cache file when processing tph picture

* feat: implement parsed task handling and progress tracking

* feat: enhance task processing with concurrency control and progress tracking

* feat: add resource ID generation and improve resource processing handling

* feat: improve message formatting in parsed text and progress completion

* feat: add example js plugin

* feat: implement Twitter parser

* fix: JSON decode error when parsing Twitter videos

* feat: impl stream mode for parse task
This commit is contained in:
Krau
2025-08-21 23:48:17 +08:00
committed by GitHub
parent 79386bdd7d
commit 302db2fe75
47 changed files with 1348 additions and 47 deletions

View File

@@ -1,5 +1,5 @@
package tasktype
//go:generate go-enum --values --names --flag --nocase
// ENUM(tgfiles,tphpics)
// ENUM(tgfiles,tphpics,parseditem)
type TaskType string

View File

@@ -16,6 +16,8 @@ const (
TaskTypeTgfiles TaskType = "tgfiles"
// TaskTypeTphpics is a TaskType of type tphpics.
TaskTypeTphpics TaskType = "tphpics"
// TaskTypeParseditem is a TaskType of type parseditem.
TaskTypeParseditem TaskType = "parseditem"
)
var ErrInvalidTaskType = fmt.Errorf("not a valid TaskType, try [%s]", strings.Join(_TaskTypeNames, ", "))
@@ -23,6 +25,7 @@ var ErrInvalidTaskType = fmt.Errorf("not a valid TaskType, try [%s]", strings.Jo
var _TaskTypeNames = []string{
string(TaskTypeTgfiles),
string(TaskTypeTphpics),
string(TaskTypeParseditem),
}
// TaskTypeNames returns a list of possible string values of TaskType.
@@ -37,6 +40,7 @@ func TaskTypeValues() []TaskType {
return []TaskType{
TaskTypeTgfiles,
TaskTypeTphpics,
TaskTypeParseditem,
}
}
@@ -53,8 +57,9 @@ func (x TaskType) IsValid() bool {
}
var _TaskTypeValue = map[string]TaskType{
"tgfiles": TaskTypeTgfiles,
"tphpics": TaskTypeTphpics,
"tgfiles": TaskTypeTgfiles,
"tphpics": TaskTypeTphpics,
"parseditem": TaskTypeParseditem,
}
// ParseTaskType attempts to convert a string to a TaskType.

63
pkg/parser/parser.go Normal file
View File

@@ -0,0 +1,63 @@
package parser
import (
	"crypto/md5"
	"fmt"
	"sort"
)
// Parser is implemented by anything that can turn a URL into an Item.
type Parser interface {
	// CanHandle reports whether this implementation accepts url.
	// The matching rules are left to each implementation.
	CanHandle(url string) bool

	// Parse processes url and returns the resulting Item, or an error
	// when no Item could be produced.
	Parse(url string) (*Item, error)
}
// Resource describes one downloadable resource together with its metadata.
type Resource struct {
	// URL of the resource.
	URL string `json:"url"`
	// Filename is the file name, including its extension.
	Filename string `json:"filename"`
	// MimeType of the resource.
	MimeType string `json:"mime_type"`
	// Extension of the resource.
	Extension string `json:"extension"`
	// Size of the resource; 0 when unknown.
	Size int64 `json:"size"`
	// Hash maps a digest name to its value, e.g. {"md5": "...", "sha256": "..."}.
	Hash map[string]string `json:"hash"`
	// Headers are the HTTP headers to use when downloading.
	Headers map[string]string `json:"headers"`
	// Extra carries additional free-form data.
	Extra map[string]any `json:"extra"`
}
// Item is the result a Parser returns for a URL: descriptive metadata plus
// the list of resources that belong to it.
type Item struct {
	// Site of the item.
	Site string `json:"site"`
	// URL is the original URL of the item.
	URL string `json:"url"`
	// Title of the item.
	Title string `json:"title"`
	// Author of the item.
	Author string `json:"author"`
	// Description of the item.
	Description string `json:"description"`
	// Tags attached to the item.
	Tags []string `json:"tags"`
	// Resources are the downloadable resources of the item.
	Resources []Resource `json:"resources"`
	// Extra carries additional free-form data.
	Extra map[string]any `json:"extra"`
}
// FileName returns the resource's Filename field unchanged.
func (r *Resource) FileName() string {
	name := r.Filename
	return name
}
// FileSize returns the resource's Size field unchanged (0 when unknown).
func (r *Resource) FileSize() int64 {
	size := r.Size
	return size
}
// ID returns a hex-encoded MD5 digest computed from the resource's URL,
// Filename, MimeType, Extension, Size, and the entries of Hash and Headers.
// Extra is not part of the digest.
//
// Map entries are hashed in sorted key order. Go randomizes map iteration
// order, so ranging over the maps directly could give the same resource a
// different ID on every call once a map holds more than one entry.
func (r *Resource) ID() string {
	h := md5.New()

	// hash.Hash.Write is documented to never return an error.
	for _, field := range []string{
		r.URL,
		r.Filename,
		r.MimeType,
		r.Extension,
		fmt.Sprintf("%d", r.Size),
	} {
		h.Write([]byte(field))
	}

	// writeSorted feeds the key/value pairs of m to the hasher in a
	// deterministic (lexicographic by key) order.
	writeSorted := func(m map[string]string) {
		keys := make([]string, 0, len(m))
		for k := range m {
			keys = append(keys, k)
		}
		sort.Strings(keys)
		for _, k := range keys {
			h.Write([]byte(k))
			h.Write([]byte(m[k]))
		}
	}
	writeSorted(r.Hash)
	writeSorted(r.Headers)

	return fmt.Sprintf("%x", h.Sum(nil))
}

View File

@@ -2,6 +2,7 @@ package tcbdata
import (
"github.com/krau/SaveAny-Bot/pkg/enums/tasktype"
"github.com/krau/SaveAny-Bot/pkg/parser"
"github.com/krau/SaveAny-Bot/pkg/telegraph"
"github.com/krau/SaveAny-Bot/pkg/tfile"
)
@@ -26,6 +27,7 @@ const (
// }
type Add struct {
// [TODO] maybe we should split this into different types...
TaskType tasktype.TaskType
SelectedStorName string
DirID uint
@@ -37,6 +39,8 @@ type Add struct {
TphPageNode *telegraph.Page
TphPics []string
TphDirPath string // unescaped telegraph.Page.Path
// parseditem
ParsedItem *parser.Item
}
type SetDefaultStorage struct {