feat: accelerate RSS parsing with Rust

This commit is contained in:
jxxghp
2026-05-22 21:31:18 +08:00
parent 052e1ca8e4
commit 4de4044a3e
15 changed files with 467 additions and 102 deletions

View File

@@ -360,6 +360,7 @@ dependencies = [
"once_cell",
"percent-encoding",
"pyo3",
"quick-xml",
"regex",
"scraper",
"url",
@@ -551,6 +552,15 @@ dependencies = [
"syn",
]
[[package]]
name = "quick-xml"
version = "0.38.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c"
dependencies = [
"memchr",
]
[[package]]
name = "quote"
version = "1.0.45"

View File

@@ -11,6 +11,7 @@ crate-type = ["cdylib"]
once_cell = "1.20"
percent-encoding = "2.3"
pyo3 = { version = "0.23", features = ["abi3-py311", "extension-module"] }
quick-xml = "0.38"
regex = "1.11"
scraper = "0.24"
url = "2.5"

View File

@@ -3,7 +3,7 @@ use once_cell::sync::Lazy;
use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS};
use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use pyo3::types::{PyAny, PyDict, PyList};
use pyo3::types::{PyDict, PyList};
use regex::{Regex, RegexBuilder};
use scraper::{ElementRef, Html, Selector};
use url::form_urlencoded;
@@ -41,17 +41,6 @@ enum RowParseResult {
Item(PyObject),
}
/// Python-exposed wrapper around `apply_text_filters`.
///
/// Returns `Ok(None)` when `text` is Python `None`. Any other value is converted with
/// Python `str()` and passed, together with `filters`, to `apply_text_filters`, whose
/// result is returned unchanged.
#[pyfunction]
pub(crate) fn apply_indexer_text_filters_fast(
text: &Bound<'_, PyAny>,
filters: &Bound<'_, PyAny>,
) -> PyResult<Option<String>> {
if text.is_none() {
return Ok(None);
}
apply_text_filters(text.str()?.to_str()?.to_string(), filters)
}
/// 批量解析普通配置 indexer 页面,遇到不支持的选择器配置时返回 None 交给 Python 回退。
#[pyfunction]
#[pyo3(signature = (html_text, domain, list_config, fields, category=None, result_num=100))]
@@ -171,16 +160,7 @@ fn apply_text_filters(mut current: String, filters: &Bound<'_, PyAny>) -> PyResu
Ok(Some(current.trim().to_string()))
}
/// Converts file-size text taken from a site page into a byte count.
///
/// Python `None` yields 0. Any other value is converted with Python `str()` and parsed
/// by `parse_filesize_text`.
#[pyfunction]
pub(crate) fn parse_filesize_fast(text: &Bound<'_, PyAny>) -> PyResult<i64> {
if text.is_none() {
return Ok(0);
}
Ok(parse_filesize_text(text.str()?.to_str()?))
}
/// 将文件大小文本转换为字节数,供 Python 导出函数和 Rust HTML 解析共用。
/// 将文件大小文本转换为字节数,供 Rust HTML 解析内部共用
fn parse_filesize_text(text: &str) -> i64 {
let raw = text.trim().to_string();
if raw.is_empty() {

View File

@@ -1,6 +1,7 @@
mod filter;
mod indexer;
mod meta;
mod rss;
mod utils;
use pyo3::prelude::*;
@@ -20,12 +21,8 @@ fn moviepilot_rust(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(meta::parse_video_title_fast, m)?)?;
m.add_function(wrap_pyfunction!(filter::parse_filter_rule_fast, m)?)?;
m.add_function(wrap_pyfunction!(filter::filter_torrents_fast, m)?)?;
m.add_function(wrap_pyfunction!(
indexer::apply_indexer_text_filters_fast,
m
)?)?;
m.add_function(wrap_pyfunction!(indexer::parse_filesize_fast, m)?)?;
m.add_function(wrap_pyfunction!(indexer::build_indexer_search_url_fast, m)?)?;
m.add_function(wrap_pyfunction!(indexer::parse_indexer_torrents_fast, m)?)?;
m.add_function(wrap_pyfunction!(rss::parse_rss_items_fast, m)?)?;
Ok(())
}

View File

@@ -0,0 +1,204 @@
use pyo3::prelude::*;
use pyo3::types::PyDict;
use quick_xml::events::{BytesStart, Event};
use quick_xml::name::QName;
use quick_xml::Reader;
/// One feed entry accumulated while the XML document is being scanned.
///
/// All fields start out empty (`Default`) and are filled in as the matching elements
/// and attributes are encountered.
#[derive(Default)]
struct RssItem {
    /// Text of the `title` element.
    title: String,
    /// Text of the `description` / `summary` element.
    description: String,
    /// Text of the `link` element, or its `href` attribute.
    link: String,
    /// `url` attribute of the `enclosure` element.
    enclosure: String,
    /// `length` attribute of the `enclosure` element; 0 when absent or not a number.
    size: i64,
    /// Raw, unparsed text of the `pubDate` / `published` / `updated` element.
    pubdate: String,
    /// Text of the `creator` element.
    nickname: String,
}

impl RssItem {
    /// Reports whether the entry is worth emitting: it needs a title plus at least one
    /// of `enclosure` or `link`.
    fn has_output(&self) -> bool {
        if self.title.is_empty() {
            return false;
        }
        !(self.enclosure.is_empty() && self.link.is_empty())
    }
}
/// Selects which text field of the item currently being parsed receives incoming
/// character data (text and CDATA nodes).
#[derive(Clone, Copy, PartialEq, Eq)]
enum TextField {
/// Character data goes to the item's `title`.
Title,
/// Character data goes to the item's `description`.
Description,
/// Character data goes to the item's `link`.
Link,
/// Character data goes to the item's `pubdate`.
Pubdate,
/// Character data goes to the item's `nickname`.
Nickname,
}
/// 批量解析 RSS/Atom 条目,返回 Python 侧后续处理需要的核心字段。
#[pyfunction]
pub(crate) fn parse_rss_items_fast(
py: Python<'_>,
xml_text: &str,
max_items: usize,
) -> PyResult<Option<Vec<PyObject>>> {
let mut reader = Reader::from_str(xml_text);
reader.config_mut().trim_text(true);
let mut items = Vec::new();
let mut current_item: Option<RssItem> = None;
let mut current_field: Option<TextField> = None;
let mut item_depth = 0usize;
let mut parse_failed = false;
loop {
match reader.read_event() {
Ok(Event::Start(event)) => {
let local = local_name(event.name());
if current_item.is_none() && (local == "item" || local == "entry") {
current_item = Some(RssItem::default());
current_field = None;
item_depth = 1;
continue;
}
if let Some(item) = current_item.as_mut() {
item_depth += 1;
match local.as_str() {
"title" => current_field = Some(TextField::Title),
"description" | "summary" => current_field = Some(TextField::Description),
"pubDate" | "published" | "updated" => current_field = Some(TextField::Pubdate),
"creator" => current_field = Some(TextField::Nickname),
"link" => {
current_field = Some(TextField::Link);
if item.link.is_empty() {
if let Some(href) = attr_value(&event, QName(b"href")) {
item.link = href;
}
}
}
"enclosure" => {
if let Some(url) = attr_value(&event, QName(b"url")) {
item.enclosure = url;
}
if let Some(length) = attr_value(&event, QName(b"length")) {
item.size = length.parse::<i64>().unwrap_or(0);
}
}
_ => {}
}
}
}
Ok(Event::Empty(event)) => {
if let Some(item) = current_item.as_mut() {
match local_name(event.name()).as_str() {
"link" => {
if item.link.is_empty() {
if let Some(href) = attr_value(&event, QName(b"href")) {
item.link = href;
}
}
}
"enclosure" => {
if let Some(url) = attr_value(&event, QName(b"url")) {
item.enclosure = url;
}
if let Some(length) = attr_value(&event, QName(b"length")) {
item.size = length.parse::<i64>().unwrap_or(0);
}
}
_ => {}
}
}
}
Ok(Event::Text(event)) => {
if let (Some(item), Some(field)) = (current_item.as_mut(), current_field) {
if let Ok(value) = event.decode() {
append_field(item, field, value.as_ref());
}
}
}
Ok(Event::CData(event)) => {
if let (Some(item), Some(field)) = (current_item.as_mut(), current_field) {
if let Ok(value) = event.decode() {
append_field(item, field, value.as_ref());
}
}
}
Ok(Event::End(event)) => {
if current_item.is_some() {
let local = local_name(event.name());
if local == "item" || local == "entry" {
let mut item = current_item.take().unwrap_or_default();
if item.enclosure.is_empty() && !item.link.is_empty() {
item.enclosure = item.link.clone();
}
if item.has_output() {
items.push(item_to_py(py, &item)?.into_any().unbind());
if items.len() >= max_items {
break;
}
}
current_field = None;
item_depth = 0;
} else {
item_depth = item_depth.saturating_sub(1);
if item_depth <= 1 {
current_field = None;
}
}
}
}
Ok(Event::Eof) => break,
Err(_) => {
parse_failed = true;
break;
}
_ => {}
}
}
if parse_failed && items.is_empty() {
Ok(None)
} else {
Ok(Some(items))
}
}
/// Converts an internal RSS item into a Python dict.
///
/// Keys are inserted in the order `title`, `enclosure`, `size`, `description`, `link`,
/// `pubdate_raw`; string values are trimmed. `nickname` is added only when it is
/// non-empty after trimming.
fn item_to_py<'py>(py: Python<'py>, item: &RssItem) -> PyResult<Bound<'py, PyDict>> {
    let entry = PyDict::new(py);

    entry.set_item("title", item.title.trim())?;
    entry.set_item("enclosure", item.enclosure.trim())?;
    entry.set_item("size", item.size)?;

    let trailing_fields = [
        ("description", item.description.trim()),
        ("link", item.link.trim()),
        ("pubdate_raw", item.pubdate.trim()),
    ];
    for (key, value) in trailing_fields {
        entry.set_item(key, value)?;
    }

    let nickname = item.nickname.trim();
    if !nickname.is_empty() {
        entry.set_item("nickname", nickname)?;
    }

    Ok(entry)
}
/// Returns the local part of an XML name, i.e. everything after the last `:`.
///
/// Names without a prefix are returned whole; a name that is not valid UTF-8 yields an
/// empty string.
fn local_name(name: QName<'_>) -> String {
    let bytes: &[u8] = name.as_ref();
    let start = bytes
        .iter()
        .rposition(|&byte| byte == b':')
        .map_or(0, |colon| colon + 1);
    match std::str::from_utf8(&bytes[start..]) {
        Ok(local) => local.to_owned(),
        Err(_) => String::new(),
    }
}
/// Reads an attribute of an XML element and unescapes the entities in its value.
///
/// Returns `None` when the attribute is missing, the attribute list is malformed, or
/// the value cannot be decoded/unescaped.
fn attr_value(event: &BytesStart<'_>, name: QName<'_>) -> Option<String> {
    let attribute = match event.try_get_attribute(name) {
        Ok(Some(found)) => found,
        Ok(None) | Err(_) => return None,
    };
    let decoded = attribute.decode_and_unescape_value(event.decoder()).ok()?;
    Some(decoded.into_owned())
}
/// Appends the text of the current node to the RSS field selected by `field`.
fn append_field(item: &mut RssItem, field: TextField, value: &str) {
    match field {
        TextField::Title => item.title.push_str(value),
        TextField::Description => item.description.push_str(value),
        TextField::Link => item.link.push_str(value),
        TextField::Pubdate => item.pubdate.push_str(value),
        TextField::Nickname => item.nickname.push_str(value),
    }
}