mirror of
https://github.com/jxxghp/MoviePilot.git
synced 2026-06-05 07:29:56 +08:00
feat: accelerate RSS parsing with Rust
This commit is contained in:
10
rust/moviepilot_rust/Cargo.lock
generated
10
rust/moviepilot_rust/Cargo.lock
generated
@@ -360,6 +360,7 @@ dependencies = [
|
||||
"once_cell",
|
||||
"percent-encoding",
|
||||
"pyo3",
|
||||
"quick-xml",
|
||||
"regex",
|
||||
"scraper",
|
||||
"url",
|
||||
@@ -551,6 +552,15 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quick-xml"
|
||||
version = "0.38.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.45"
|
||||
|
||||
@@ -11,6 +11,7 @@ crate-type = ["cdylib"]
|
||||
once_cell = "1.20"
|
||||
percent-encoding = "2.3"
|
||||
pyo3 = { version = "0.23", features = ["abi3-py311", "extension-module"] }
|
||||
quick-xml = "0.38"
|
||||
regex = "1.11"
|
||||
scraper = "0.24"
|
||||
url = "2.5"
|
||||
|
||||
@@ -3,7 +3,7 @@ use once_cell::sync::Lazy;
|
||||
use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS};
|
||||
use pyo3::exceptions::PyValueError;
|
||||
use pyo3::prelude::*;
|
||||
use pyo3::types::{PyAny, PyDict, PyList};
|
||||
use pyo3::types::{PyDict, PyList};
|
||||
use regex::{Regex, RegexBuilder};
|
||||
use scraper::{ElementRef, Html, Selector};
|
||||
use url::form_urlencoded;
|
||||
@@ -41,17 +41,6 @@ enum RowParseResult {
|
||||
Item(PyObject),
|
||||
}
|
||||
|
||||
#[pyfunction]
|
||||
pub(crate) fn apply_indexer_text_filters_fast(
|
||||
text: &Bound<'_, PyAny>,
|
||||
filters: &Bound<'_, PyAny>,
|
||||
) -> PyResult<Option<String>> {
|
||||
if text.is_none() {
|
||||
return Ok(None);
|
||||
}
|
||||
apply_text_filters(text.str()?.to_str()?.to_string(), filters)
|
||||
}
|
||||
|
||||
/// 批量解析普通配置 indexer 页面,遇到不支持的选择器配置时返回 None 交给 Python 回退。
|
||||
#[pyfunction]
|
||||
#[pyo3(signature = (html_text, domain, list_config, fields, category=None, result_num=100))]
|
||||
@@ -171,16 +160,7 @@ fn apply_text_filters(mut current: String, filters: &Bound<'_, PyAny>) -> PyResu
|
||||
Ok(Some(current.trim().to_string()))
|
||||
}
|
||||
|
||||
/// 将站点页面中的文件大小文本转换为字节数。
|
||||
#[pyfunction]
|
||||
pub(crate) fn parse_filesize_fast(text: &Bound<'_, PyAny>) -> PyResult<i64> {
|
||||
if text.is_none() {
|
||||
return Ok(0);
|
||||
}
|
||||
Ok(parse_filesize_text(text.str()?.to_str()?))
|
||||
}
|
||||
|
||||
/// 将文件大小文本转换为字节数,供 Python 导出函数和 Rust HTML 解析共用。
|
||||
/// 将文件大小文本转换为字节数,供 Rust HTML 解析内部共用。
|
||||
fn parse_filesize_text(text: &str) -> i64 {
|
||||
let raw = text.trim().to_string();
|
||||
if raw.is_empty() {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
mod filter;
|
||||
mod indexer;
|
||||
mod meta;
|
||||
mod rss;
|
||||
mod utils;
|
||||
|
||||
use pyo3::prelude::*;
|
||||
@@ -20,12 +21,8 @@ fn moviepilot_rust(m: &Bound<'_, PyModule>) -> PyResult<()> {
|
||||
m.add_function(wrap_pyfunction!(meta::parse_video_title_fast, m)?)?;
|
||||
m.add_function(wrap_pyfunction!(filter::parse_filter_rule_fast, m)?)?;
|
||||
m.add_function(wrap_pyfunction!(filter::filter_torrents_fast, m)?)?;
|
||||
m.add_function(wrap_pyfunction!(
|
||||
indexer::apply_indexer_text_filters_fast,
|
||||
m
|
||||
)?)?;
|
||||
m.add_function(wrap_pyfunction!(indexer::parse_filesize_fast, m)?)?;
|
||||
m.add_function(wrap_pyfunction!(indexer::build_indexer_search_url_fast, m)?)?;
|
||||
m.add_function(wrap_pyfunction!(indexer::parse_indexer_torrents_fast, m)?)?;
|
||||
m.add_function(wrap_pyfunction!(rss::parse_rss_items_fast, m)?)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
204
rust/moviepilot_rust/src/rss.rs
Normal file
204
rust/moviepilot_rust/src/rss.rs
Normal file
@@ -0,0 +1,204 @@
|
||||
use pyo3::prelude::*;
|
||||
use pyo3::types::PyDict;
|
||||
use quick_xml::events::{BytesStart, Event};
|
||||
use quick_xml::name::QName;
|
||||
use quick_xml::Reader;
|
||||
|
||||
/// Fields collected for a single RSS `<item>` / Atom `<entry>` while parsing.
#[derive(Default, Debug, Clone)]
struct RssItem {
    /// Text of the entry's `title` element.
    title: String,
    /// Text of the entry's `description` / `summary` element.
    description: String,
    /// Text of the `link` element, or its `href` attribute.
    link: String,
    /// `url` attribute of the `enclosure` element.
    enclosure: String,
    /// `length` attribute of the `enclosure` element; `0` when absent or unparsable.
    size: i64,
    /// Raw, unparsed publication date text.
    pubdate: String,
    /// Text of the `creator` element.
    nickname: String,
}

impl RssItem {
    /// Returns `true` when the entry is worth emitting: it has a title and at
    /// least one of enclosure / link.
    ///
    /// Values are compared after trimming because the emitted dictionary
    /// contains trimmed values; a whitespace-only title would otherwise be
    /// exported as an empty string.
    fn has_output(&self) -> bool {
        let has_title = !self.title.trim().is_empty();
        let has_target = !self.enclosure.trim().is_empty() || !self.link.trim().is_empty();
        has_title && has_target
    }
}
|
||||
|
||||
/// Identifies which text field of the current entry receives character data.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TextField {
    /// The entry title.
    Title,
    /// The entry description / summary.
    Description,
    /// The entry link.
    Link,
    /// The raw publication date.
    Pubdate,
    /// The creator / uploader nickname.
    Nickname,
}
|
||||
|
||||
/// 批量解析 RSS/Atom 条目,返回 Python 侧后续处理需要的核心字段。
|
||||
#[pyfunction]
|
||||
pub(crate) fn parse_rss_items_fast(
|
||||
py: Python<'_>,
|
||||
xml_text: &str,
|
||||
max_items: usize,
|
||||
) -> PyResult<Option<Vec<PyObject>>> {
|
||||
let mut reader = Reader::from_str(xml_text);
|
||||
reader.config_mut().trim_text(true);
|
||||
|
||||
let mut items = Vec::new();
|
||||
let mut current_item: Option<RssItem> = None;
|
||||
let mut current_field: Option<TextField> = None;
|
||||
let mut item_depth = 0usize;
|
||||
let mut parse_failed = false;
|
||||
|
||||
loop {
|
||||
match reader.read_event() {
|
||||
Ok(Event::Start(event)) => {
|
||||
let local = local_name(event.name());
|
||||
if current_item.is_none() && (local == "item" || local == "entry") {
|
||||
current_item = Some(RssItem::default());
|
||||
current_field = None;
|
||||
item_depth = 1;
|
||||
continue;
|
||||
}
|
||||
if let Some(item) = current_item.as_mut() {
|
||||
item_depth += 1;
|
||||
match local.as_str() {
|
||||
"title" => current_field = Some(TextField::Title),
|
||||
"description" | "summary" => current_field = Some(TextField::Description),
|
||||
"pubDate" | "published" | "updated" => current_field = Some(TextField::Pubdate),
|
||||
"creator" => current_field = Some(TextField::Nickname),
|
||||
"link" => {
|
||||
current_field = Some(TextField::Link);
|
||||
if item.link.is_empty() {
|
||||
if let Some(href) = attr_value(&event, QName(b"href")) {
|
||||
item.link = href;
|
||||
}
|
||||
}
|
||||
}
|
||||
"enclosure" => {
|
||||
if let Some(url) = attr_value(&event, QName(b"url")) {
|
||||
item.enclosure = url;
|
||||
}
|
||||
if let Some(length) = attr_value(&event, QName(b"length")) {
|
||||
item.size = length.parse::<i64>().unwrap_or(0);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(Event::Empty(event)) => {
|
||||
if let Some(item) = current_item.as_mut() {
|
||||
match local_name(event.name()).as_str() {
|
||||
"link" => {
|
||||
if item.link.is_empty() {
|
||||
if let Some(href) = attr_value(&event, QName(b"href")) {
|
||||
item.link = href;
|
||||
}
|
||||
}
|
||||
}
|
||||
"enclosure" => {
|
||||
if let Some(url) = attr_value(&event, QName(b"url")) {
|
||||
item.enclosure = url;
|
||||
}
|
||||
if let Some(length) = attr_value(&event, QName(b"length")) {
|
||||
item.size = length.parse::<i64>().unwrap_or(0);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(Event::Text(event)) => {
|
||||
if let (Some(item), Some(field)) = (current_item.as_mut(), current_field) {
|
||||
if let Ok(value) = event.decode() {
|
||||
append_field(item, field, value.as_ref());
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(Event::CData(event)) => {
|
||||
if let (Some(item), Some(field)) = (current_item.as_mut(), current_field) {
|
||||
if let Ok(value) = event.decode() {
|
||||
append_field(item, field, value.as_ref());
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(Event::End(event)) => {
|
||||
if current_item.is_some() {
|
||||
let local = local_name(event.name());
|
||||
if local == "item" || local == "entry" {
|
||||
let mut item = current_item.take().unwrap_or_default();
|
||||
if item.enclosure.is_empty() && !item.link.is_empty() {
|
||||
item.enclosure = item.link.clone();
|
||||
}
|
||||
if item.has_output() {
|
||||
items.push(item_to_py(py, &item)?.into_any().unbind());
|
||||
if items.len() >= max_items {
|
||||
break;
|
||||
}
|
||||
}
|
||||
current_field = None;
|
||||
item_depth = 0;
|
||||
} else {
|
||||
item_depth = item_depth.saturating_sub(1);
|
||||
if item_depth <= 1 {
|
||||
current_field = None;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(Event::Eof) => break,
|
||||
Err(_) => {
|
||||
parse_failed = true;
|
||||
break;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
if parse_failed && items.is_empty() {
|
||||
Ok(None)
|
||||
} else {
|
||||
Ok(Some(items))
|
||||
}
|
||||
}
|
||||
|
||||
/// 将内部 RSS 条目结构转换为 Python 字典。
|
||||
fn item_to_py<'py>(py: Python<'py>, item: &RssItem) -> PyResult<Bound<'py, PyDict>> {
|
||||
let dict = PyDict::new(py);
|
||||
dict.set_item("title", item.title.trim())?;
|
||||
dict.set_item("enclosure", item.enclosure.trim())?;
|
||||
dict.set_item("size", item.size)?;
|
||||
dict.set_item("description", item.description.trim())?;
|
||||
dict.set_item("link", item.link.trim())?;
|
||||
dict.set_item("pubdate_raw", item.pubdate.trim())?;
|
||||
if !item.nickname.trim().is_empty() {
|
||||
dict.set_item("nickname", item.nickname.trim())?;
|
||||
}
|
||||
Ok(dict)
|
||||
}
|
||||
|
||||
/// 返回 XML 名称去掉命名空间前缀后的本地名称。
|
||||
fn local_name(name: QName<'_>) -> String {
|
||||
let raw = name.as_ref();
|
||||
let local = raw.rsplit(|byte| *byte == b':').next().unwrap_or(raw);
|
||||
std::str::from_utf8(local).unwrap_or("").to_string()
|
||||
}
|
||||
|
||||
/// 读取 XML 节点属性并完成实体反转义。
|
||||
fn attr_value(event: &BytesStart<'_>, name: QName<'_>) -> Option<String> {
|
||||
event
|
||||
.try_get_attribute(name)
|
||||
.ok()
|
||||
.flatten()
|
||||
.and_then(|attr| attr.decode_and_unescape_value(event.decoder()).ok())
|
||||
.map(|value| value.into_owned())
|
||||
}
|
||||
|
||||
/// 追加当前文本节点到对应 RSS 字段。
|
||||
fn append_field(item: &mut RssItem, field: TextField, value: &str) {
|
||||
let target = match field {
|
||||
TextField::Title => &mut item.title,
|
||||
TextField::Description => &mut item.description,
|
||||
TextField::Link => &mut item.link,
|
||||
TextField::Pubdate => &mut item.pubdate,
|
||||
TextField::Nickname => &mut item.nickname,
|
||||
};
|
||||
target.push_str(value);
|
||||
}
|
||||
Reference in New Issue
Block a user