feat: accelerate site indexer parsing with rust

This commit is contained in:
jxxghp
2026-05-23 15:59:20 +08:00
parent 222f6ce7d8
commit d1e2881347
9 changed files with 2523 additions and 1 deletions

View File

@@ -12,6 +12,7 @@ from pyquery import PyQuery
from app.core.config import settings
from app.log import logger
from app.schemas.types import MediaType
from app.utils import rust_accel
from app.utils.http import RequestUtils, AsyncRequestUtils
from app.utils.string import StringUtils
from app.utils.url import UrlUtils
@@ -737,6 +738,17 @@ class SiteSpider:
self.is_error = True
return []
rust_torrents = rust_accel.parse_indexer_torrents(
html_text=html_text,
domain=self.domain,
list_config=self.list,
fields=self.fields,
category=self.category,
result_num=self.result_num
)
if rust_torrents is not None:
return rust_torrents
# 清空旧结果
self.torrents_info_array = []
html_doc = None

View File

@@ -1,4 +1,4 @@
from typing import Optional
from typing import List, Optional
from app.log import logger
@@ -39,6 +39,34 @@ def parse_filter_rule(expression: str) -> Optional[list]:
return None
def parse_indexer_torrents(
html_text: str,
domain: str,
list_config: dict,
fields: dict,
category: Optional[dict] = None,
result_num: int = 100
) -> Optional[List[dict]]:
"""
使用 Rust 批量解析普通配置站点种子列表,不可用时返回 None。
"""
if not _moviepilot_rust:
return None
try:
return _moviepilot_rust.parse_indexer_torrents_fast(
html_text,
domain,
list_config,
fields,
category,
result_num
)
except BaseException as err:
_raise_non_rust_panic(err)
logger.debug(f"Rust 站点列表解析失败,使用 Python 解析兜底:{err}")
return None
def _raise_non_rust_panic(err: BaseException) -> None:
"""
只吞掉 Rust 扩展 panic/异常,保留用户中断和进程退出语义。

File diff suppressed because it is too large Load Diff

View File

@@ -8,4 +8,10 @@ name = "moviepilot_rust"
crate-type = ["cdylib"]
[dependencies]
minijinja = "2.20"
chrono = "0.4"
once_cell = "1.20"
pyo3 = { version = "0.23", features = ["abi3-py311", "extension-module"] }
regex = "1.11"
scraper = "0.24"
url = "2.5"

View File

@@ -0,0 +1,972 @@
use crate::utils::{extract_i64, get_optional_i64, get_optional_string};
use chrono::{DateTime, Duration, Local, NaiveDate, NaiveDateTime, NaiveTime};
use minijinja::{context, Environment, UndefinedBehavior};
use once_cell::sync::Lazy;
use pyo3::prelude::*;
use pyo3::types::{PyAny, PyDict, PyList};
use regex::{Regex, RegexBuilder};
use scraper::{ElementRef, Html, Selector};
use std::collections::{BTreeMap, HashMap, HashSet};
use url::form_urlencoded;
use url::Url;
static FILESIZE_UNIT_RE: Lazy<Regex> = Lazy::new(|| {
RegexBuilder::new(r"[KMGTPI]*B?")
.case_insensitive(true)
.build()
.unwrap()
});
static NUMERIC_FACTOR_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(\d+\.?\d*)").unwrap());
static FIELD_REF_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r#"fields(?:\.([A-Za-z0-9_]+)|\[\s*['"]([^'"]+)['"]\s*\])"#).unwrap());
static HAS_QUOTED_SELECTOR_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r#":has\(\s*"([^"]+)"\s*\)|:has\(\s*'([^']+)'\s*\)"#).unwrap());
static HAS_SELECTOR_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r#":has\(\s*(?:"([^"]+)"|'([^']+)'|([^)]*))\s*\)"#).unwrap());
static TABLE_DIRECT_TR_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r#"\b(table[^>,]*?)\s*>\s*(tr(?:[^\s>,]*)?)"#).unwrap());
static EN_ELAPSED_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?i)(\d+)\s*(second|minute|hour|day|week|month|year)s?\s*ago").unwrap());
const OUTPUT_FIELDS: &[(&str, &str)] = &[
("title", "title"),
("description", "description"),
("imdbid", "imdbid"),
("size", "size"),
("leechers", "peers"),
("seeders", "seeders"),
("grabs", "grabs"),
("date_elapsed", "date_elapsed"),
("freedate", "freedate"),
("labels", "labels"),
("hr", "hit_and_run"),
("category", "category"),
];
struct FieldSpec<'py> {
name: String,
config: Bound<'py, PyDict>,
}
enum RowParseResult {
Empty,
Item(PyObject),
}
/// 批量解析普通配置 indexer 页面,优先在 Rust 内覆盖站点配置语义。
#[pyfunction]
#[pyo3(signature = (html_text, domain, list_config, fields, category=None, result_num=100))]
pub(crate) fn parse_indexer_torrents_fast(
py: Python<'_>,
html_text: &str,
domain: &str,
list_config: &Bound<'_, PyDict>,
fields: &Bound<'_, PyDict>,
category: Option<&Bound<'_, PyDict>>,
result_num: usize,
) -> PyResult<Option<PyObject>> {
let Some(list_selector_text) = get_optional_string(list_config, "selector")? else {
return Ok(None);
};
if list_selector_text.is_empty() {
return Ok(None);
}
let document = Html::parse_document(html_text);
let Some(rows) = select_site_elements(document.root_element(), &list_selector_text) else {
return Ok(None);
};
let result = PyList::empty(py);
let field_specs = build_field_specs(fields)?;
for row in rows.into_iter().take(result_num) {
match parse_indexer_row(py, row, domain, &field_specs, category)? {
RowParseResult::Empty => {}
RowParseResult::Item(item) => result.append(item)?,
}
}
Ok(Some(result.into()))
}
/// 预处理字段配置,保留 Python 字典引用以避免重复转换整份配置。
fn build_field_specs<'py>(fields: &Bound<'py, PyDict>) -> PyResult<Vec<FieldSpec<'py>>> {
let mut specs = Vec::new();
for (key, value) in fields.iter() {
if value.is_none() {
continue;
}
let Ok(config) = value.downcast_into::<PyDict>() else {
continue;
};
specs.push(FieldSpec {
name: key.extract::<String>()?,
config,
});
}
Ok(specs)
}
/// 解析单行种子信息,覆盖普通配置站点的主字段抽取流程。
fn parse_indexer_row(
py: Python<'_>,
row: ElementRef<'_>,
domain: &str,
field_specs: &[FieldSpec<'_>],
category: Option<&Bound<'_, PyDict>>,
) -> PyResult<RowParseResult> {
let output = PyDict::new(py);
let field_map = field_specs
.iter()
.map(|field| (field.name.as_str(), field))
.collect::<HashMap<&str, &FieldSpec<'_>>>();
let mut cache = BTreeMap::new();
let mut resolving = HashSet::new();
if let Some(value) = eval_field_by_name(row, &field_map, "details", &mut cache, &mut resolving)? {
if !value.is_empty() {
output.set_item("page_url", normalize_site_link(domain, &value, true))?;
}
}
if let Some(value) = eval_field_by_name(row, &field_map, "download", &mut cache, &mut resolving)? {
if !value.is_empty() {
output.set_item("enclosure", normalize_site_link(domain, &value, false))?;
}
}
if let Some(value) = eval_factor_field(row, &field_map, "downloadvolumefactor", &mut cache, &mut resolving)? {
output.set_item("downloadvolumefactor", value)?;
}
if let Some(value) = eval_factor_field(row, &field_map, "uploadvolumefactor", &mut cache, &mut resolving)? {
output.set_item("uploadvolumefactor", value)?;
}
if let Some(value) = eval_pubdate_field(row, &field_map, &mut cache, &mut resolving)? {
if !value.is_empty() {
output.set_item("pubdate", value)?;
}
}
for (source_key, target_key) in OUTPUT_FIELDS {
match *source_key {
"labels" => {
parse_labels_field(py, row, &field_map, &output)?;
}
"hr" => {
if let Some(value) = eval_hr_field(row, &field_map)? {
output.set_item(*target_key, value)?;
}
}
"category" => {
if let Some(value) = eval_field_by_name(row, &field_map, source_key, &mut cache, &mut resolving)? {
output.set_item(*target_key, map_category_value(&value, category)?)?;
}
}
"size" => {
if let Some(value) = eval_field_by_name(row, &field_map, source_key, &mut cache, &mut resolving)? {
output.set_item(*target_key, parse_filesize_text(value.replace('\n', "").trim()))?;
}
}
"leechers" | "seeders" | "grabs" => {
if let Some(value) = eval_field_by_name(row, &field_map, source_key, &mut cache, &mut resolving)? {
output.set_item(*target_key, parse_peer_count(&value))?;
}
}
_ => {
if let Some(value) = eval_field_by_name(row, &field_map, source_key, &mut cache, &mut resolving)? {
if !value.is_empty() {
output.set_item(*target_key, value.replace('\n', " ").trim().to_string())?;
}
}
}
}
}
if output.is_empty() {
return Ok(RowParseResult::Empty);
}
Ok(RowParseResult::Item(output.into()))
}
/// 按字段名求值并缓存结果,支持 Jinja 模板里的任意 fields 引用。
fn eval_field_by_name(
row: ElementRef<'_>,
field_map: &HashMap<&str, &FieldSpec<'_>>,
name: &str,
cache: &mut BTreeMap<String, String>,
resolving: &mut HashSet<String>,
) -> PyResult<Option<String>> {
if let Some(value) = cache.get(name) {
return Ok(Some(value.clone()));
}
if resolving.contains(name) {
return Ok(Some(String::new()));
}
let Some(spec) = field_map.get(name).copied() else {
return Ok(None);
};
resolving.insert(name.to_string());
let value = eval_field(row, field_map, spec, cache, resolving)?;
resolving.remove(name);
if let Some(value) = value.clone() {
cache.insert(name.to_string(), value);
}
Ok(value)
}
/// 执行单个字段配置,统一处理 selector/text/default/filter 的组合语义。
fn eval_field(
row: ElementRef<'_>,
field_map: &HashMap<&str, &FieldSpec<'_>>,
spec: &FieldSpec<'_>,
cache: &mut BTreeMap<String, String>,
resolving: &mut HashSet<String>,
) -> PyResult<Option<String>> {
let mut value = if let Some(template) = get_optional_string(&spec.config, "text")? {
Some(render_field_template(row, field_map, &template, cache, resolving)?)
} else {
safe_query(row, &spec.config)?
};
if let Some(current) = value.as_deref() {
if contains_jinja_syntax(current) {
value = Some(render_embedded_value(
row,
field_map,
&spec.name,
current,
cache,
resolving,
)?);
}
}
if let Some(filters) = spec.config.get_item("filters")? {
if !filters.is_none() {
value = apply_text_filters(value.unwrap_or_default(), &filters)?;
}
}
if value.as_deref().map(str::is_empty).unwrap_or(true) {
if let Some(default_value) = get_default_value(&spec.config)? {
value = Some(default_value);
}
}
Ok(value)
}
/// 渲染字段 text 模板,只抽取模板实际引用的依赖字段。
fn render_field_template(
row: ElementRef<'_>,
field_map: &HashMap<&str, &FieldSpec<'_>>,
template: &str,
cache: &mut BTreeMap<String, String>,
resolving: &mut HashSet<String>,
) -> PyResult<String> {
let mut values = BTreeMap::new();
for key in extract_template_field_names(template) {
let value = eval_field_by_name(row, field_map, &key, cache, resolving)?.unwrap_or_default();
values.insert(key, value);
}
Ok(render_jinja_template(template, &values).unwrap_or_default())
}
/// 渲染字段值中残留的 Jinja 模板,兼容少数站点把模板写进 title 属性的情况。
fn render_embedded_value(
row: ElementRef<'_>,
field_map: &HashMap<&str, &FieldSpec<'_>>,
current_name: &str,
template: &str,
cache: &mut BTreeMap<String, String>,
resolving: &mut HashSet<String>,
) -> PyResult<String> {
let mut values = BTreeMap::new();
for key in extract_template_field_names(template) {
if key == current_name {
values.insert(key, String::new());
continue;
}
let value = eval_field_by_name(row, field_map, &key, cache, resolving)?.unwrap_or_default();
values.insert(key, value);
}
Ok(render_jinja_template(template, &values).unwrap_or_default())
}
/// 提取 Jinja 模板中出现过的 fields 字段名。
fn extract_template_field_names(template: &str) -> Vec<String> {
let mut keys = Vec::new();
for captures in FIELD_REF_RE.captures_iter(template) {
let Some(key) = captures.get(1).or_else(|| captures.get(2)) else {
continue;
};
let key = key.as_str();
if !keys.iter().any(|item: &String| item == key) {
keys.push(key.to_string());
}
}
keys
}
/// 读取字段默认值,兼容历史配置里的 defualt_value 拼写。
fn get_default_value(selector_config: &Bound<'_, PyDict>) -> PyResult<Option<String>> {
if let Some(value) = get_optional_string(selector_config, "default_value")? {
return Ok(Some(value));
}
get_optional_string(selector_config, "defualt_value")
}
/// 解析上传/下载优惠系数字段,保留配置里 0.5/0.3 这类浮点倍率。
fn eval_factor_field(
row: ElementRef<'_>,
field_map: &HashMap<&str, &FieldSpec<'_>>,
key: &str,
cache: &mut BTreeMap<String, String>,
resolving: &mut HashSet<String>,
) -> PyResult<Option<f64>> {
let Some(spec) = field_map.get(key).copied() else {
return Ok(None);
};
if let Some(case_obj) = spec.config.get_item("case")? {
let case_dict = case_obj.downcast::<PyDict>()?;
for (case_selector_obj, value) in case_dict.iter() {
let case_selector = case_selector_obj.extract::<String>()?;
if selector_exists(row, &case_selector)? {
return Ok(Some(value.extract::<f64>().unwrap_or(1.0)));
}
}
return Ok(Some(1.0));
}
if let Some(value) = eval_field_by_name(row, field_map, key, cache, resolving)? {
if let Some(number) = NUMERIC_FACTOR_RE
.captures(&value)
.and_then(|caps| caps.get(1))
.and_then(|item| item.as_str().parse::<f64>().ok())
{
return Ok(Some(number));
}
}
Ok(Some(1.0))
}
/// 解析标签列表字段,保持 Python 侧 labels 输出为字符串数组。
fn parse_labels_field(
py: Python<'_>,
row: ElementRef<'_>,
field_map: &HashMap<&str, &FieldSpec<'_>>,
output: &Bound<'_, PyDict>,
) -> PyResult<()> {
let Some(spec) = field_map.get("labels").copied() else {
return Ok(());
};
if !spec.config.contains("selector")? {
output.set_item("labels", PyList::empty(py))?;
return Ok(());
}
let labels = PyList::empty(py);
if let Some(values) = query_all_values(row, &spec.config)? {
for value in values.into_iter().filter(|item| !item.is_empty()) {
labels.append(value)?;
}
}
output.set_item("labels", labels)?;
Ok(())
}
/// 解析 HR 标记字段,配置存在时输出布尔值。
fn eval_hr_field(
row: ElementRef<'_>,
field_map: &HashMap<&str, &FieldSpec<'_>>,
) -> PyResult<Option<bool>> {
let Some(spec) = field_map.get("hr").copied() else {
return Ok(None);
};
let Some(selector_text) = get_selector_text(&spec.config)? else {
return Ok(Some(false));
};
Ok(Some(selector_exists(row, &selector_text)?))
}
/// 将站点分类 ID 映射为 MoviePilot 的媒体类型中文值。
fn map_category_value(value: &str, category: Option<&Bound<'_, PyDict>>) -> PyResult<&'static str> {
let Some(category) = category else {
return Ok("未知");
};
let tv_cats = category_ids_for_field(category, "tv")?;
let movie_cats = category_ids_for_field(category, "movie")?;
if tv_cats.iter().any(|item| item == value) && !movie_cats.iter().any(|item| item == value) {
return Ok("电视剧");
}
if movie_cats.iter().any(|item| item == value) {
return Ok("电影");
}
Ok("未知")
}
/// 解析整数类统计字段,兼容 "12/34" 和千分位逗号。
fn parse_peer_count(value: &str) -> i64 {
value
.split('/')
.next()
.unwrap_or("")
.replace(',', "")
.trim()
.parse::<i64>()
.unwrap_or(0)
}
/// 解析发布时间字段,并在 date 模板产出相对时间时使用 date_added 保持可排序时间。
fn eval_pubdate_field(
row: ElementRef<'_>,
field_map: &HashMap<&str, &FieldSpec<'_>>,
cache: &mut BTreeMap<String, String>,
resolving: &mut HashSet<String>,
) -> PyResult<Option<String>> {
if let Some(value) = eval_field_by_name(row, field_map, "date", cache, resolving)? {
let normalized = normalize_pubdate_text(&value);
if is_standard_datetime(&normalized) || !field_map.contains_key("date_added") {
return Ok(Some(normalized));
}
if let Some(date_added) = eval_field_by_name(row, field_map, "date_added", cache, resolving)? {
let fallback = normalize_pubdate_text(&date_added);
if is_standard_datetime(&fallback) {
return Ok(Some(fallback));
}
}
return Ok(Some(normalized));
}
Ok(eval_field_by_name(row, field_map, "date_added", cache, resolving)?
.map(|value| normalize_pubdate_text(&value)))
}
/// 规范化发布时间文本为 MoviePilot 期望的字符串格式。
fn normalize_pubdate_text(value: &str) -> String {
if let Some(parsed) = format_date_value(value, "%Y-%m-%d %H:%M:%S") {
return parsed;
}
value.replace('\n', " ").trim().to_string()
}
/// 判断文本是否已经是 MoviePilot 标准日期时间格式。
fn is_standard_datetime(value: &str) -> bool {
NaiveDateTime::parse_from_str(value, "%Y-%m-%d %H:%M:%S").is_ok()
}
/// 解析标准 CSS selector并保留 table > tr 的 HTML5 tbody 兼容扩展。
fn parse_css_selector(selector_text: &str) -> Option<Selector> {
if selector_text == "*" {
return Selector::parse("*").ok();
}
let normalized = normalize_pyquery_selector(selector_text);
let expanded = expand_table_direct_tr_selector(&normalized);
if let Ok(selector) = Selector::parse(&expanded) {
return Some(selector);
}
if expanded != normalized {
if let Ok(selector) = Selector::parse(&normalized) {
return Some(selector);
}
}
Selector::parse(selector_text).ok()
}
/// 查询站点选择器,额外支持 PyQuery 的 :has("selector") 写法。
fn select_site_elements<'a>(
root: ElementRef<'a>,
selector_text: &str,
) -> Option<Vec<ElementRef<'a>>> {
let Some(captures) = HAS_SELECTOR_RE.captures(selector_text) else {
let selector = parse_css_selector(selector_text)?;
return Some(root.select(&selector).collect());
};
let matched = captures.get(0)?;
let prefix = selector_text[..matched.start()].trim();
let suffix = selector_text[matched.end()..].trim();
let inner = captures
.get(1)
.or_else(|| captures.get(2))
.or_else(|| captures.get(3))?
.as_str()
.trim();
let base_selector = parse_css_selector(prefix)?;
let has_selector = parse_css_selector(inner)?;
let bases = root
.select(&base_selector)
.filter(|element| element.select(&has_selector).next().is_some());
if suffix.is_empty() {
return Some(bases.collect());
}
let suffix_selector_text = suffix.trim_start_matches('>').trim();
let suffix_selector = parse_css_selector(suffix_selector_text)?;
let mut values = Vec::new();
for base in bases {
values.extend(base.select(&suffix_selector));
}
Some(values)
}
/// 将 PyQuery 扩展选择器转换为 scraper 可识别的 CSS selector 形式。
fn normalize_pyquery_selector(selector_text: &str) -> String {
HAS_QUOTED_SELECTOR_RE
.replace_all(selector_text, |captures: &regex::Captures<'_>| {
let inner = captures
.get(1)
.or_else(|| captures.get(2))
.map(|item| item.as_str())
.unwrap_or_default();
format!(":has({inner})")
})
.into_owned()
}
/// 为 table > tr 选择器追加 tbody 变体,适配 Rust HTML5 解析自动补 tbody 的行为。
fn expand_table_direct_tr_selector(selector_text: &str) -> String {
let expanded = TABLE_DIRECT_TR_RE.replace_all(selector_text, "$1 > tbody > $2");
if expanded == selector_text {
return selector_text.to_string();
}
format!("{selector_text}, {expanded}")
}
/// 执行 selector 查询并返回第一个符合 index/contents 规则的文本。
fn safe_query(row: ElementRef<'_>, selector_config: &Bound<'_, PyDict>) -> PyResult<Option<String>> {
let Some(values) = query_all_values(row, selector_config)? else {
return Ok(None);
};
Ok(select_indexed_value(values, selector_config))
}
/// 查询 selector 的全部文本或属性值。
fn query_all_values(
row: ElementRef<'_>,
selector_config: &Bound<'_, PyDict>,
) -> PyResult<Option<Vec<String>>> {
let Some(selector_text) = get_selector_text(selector_config)? else {
return Ok(None);
};
let Some(elements) = select_site_elements(row, &selector_text) else {
return Ok(None);
};
let attribute = get_optional_string(selector_config, "attribute")?;
let remove_selectors = parse_remove_selectors(selector_config)?;
let mut values = Vec::new();
for element in elements {
if let Some(attribute) = attribute.as_deref() {
values.push(element.value().attr(attribute).unwrap_or("").to_string());
} else {
values.push(normalize_element_text(element, &remove_selectors));
}
}
Ok(Some(values))
}
/// 解析 remove 配置,支持逗号分隔的 CSS 选择器列表。
fn parse_remove_selectors(selector_config: &Bound<'_, PyDict>) -> PyResult<Vec<Selector>> {
let Some(remove_text) = get_optional_string(selector_config, "remove")? else {
return Ok(Vec::new());
};
let mut selectors = Vec::new();
for item in remove_text.split(',') {
let item = item.trim();
if item.is_empty() {
continue;
}
let Some(selector) = parse_css_selector(item) else {
return Ok(Vec::new());
};
selectors.push(selector);
}
Ok(selectors)
}
/// 读取 selector 或 selectors 配置。
fn get_selector_text(selector_config: &Bound<'_, PyDict>) -> PyResult<Option<String>> {
if let Some(selector) = get_optional_string(selector_config, "selector")? {
if !selector.is_empty() {
return Ok(Some(selector));
}
}
if let Some(selector) = get_optional_string(selector_config, "selectors")? {
if !selector.is_empty() {
return Ok(Some(selector));
}
}
Ok(None)
}
/// 对查询结果应用 contents/index 规则。
fn select_indexed_value(
values: Vec<String>,
selector_config: &Bound<'_, PyDict>,
) -> Option<String> {
if values.is_empty() {
return None;
}
if let Ok(Some(contents)) = get_optional_i64(selector_config, "contents") {
if let Some(first) = values.first() {
let lines: Vec<&str> = first.split('\n').collect();
return pick_indexed_item(&lines, contents).map(|item| item.to_string());
}
}
if let Ok(Some(index)) = get_optional_i64(selector_config, "index") {
return pick_indexed_item(&values, index).cloned();
}
values.first().cloned()
}
/// 按 Python 列表语义读取正负索引。
fn pick_indexed_item<T>(items: &[T], index: i64) -> Option<&T> {
let len = items.len() as i64;
let resolved = if index < 0 { len + index } else { index };
if resolved < 0 {
return None;
}
items.get(resolved as usize)
}
/// 执行 indexer 文本过滤器,覆盖 Build 配置中出现的全部过滤器。
fn apply_text_filters(mut current: String, filters: &Bound<'_, PyAny>) -> PyResult<Option<String>> {
let Ok(filter_list) = filters.downcast::<PyList>() else {
return Ok(Some(current));
};
for item in filter_list.iter() {
let filter = item.downcast::<PyDict>()?;
let method_name = get_optional_string(filter, "name")?;
if current.is_empty() {
break;
}
match method_name.as_deref() {
Some("re_search") => {
let Some(args) = filter.get_item("args")? else {
continue;
};
let Ok(args_list) = args.downcast::<PyList>() else {
continue;
};
if args_list.len() < 2 {
continue;
}
let pattern = args_list.get_item(0)?.extract::<String>()?;
let group_index = extract_i64(&args_list.get_item(args_list.len() - 1)?)?.unwrap_or(0);
let Ok(regex) = Regex::new(&pattern) else {
continue;
};
if let Some(captures) = regex.captures(&current) {
if let Some(value) = captures.get(group_index as usize) {
current = value.as_str().to_string();
}
}
}
Some("split") => {
let Some(args) = filter.get_item("args")? else {
continue;
};
let Ok(args_list) = args.downcast::<PyList>() else {
continue;
};
if args_list.len() < 2 {
continue;
}
let delimiter = args_list.get_item(0)?.extract::<String>()?;
let index = extract_i64(&args_list.get_item(args_list.len() - 1)?)?.unwrap_or(0);
let parts: Vec<&str> = current.split(&delimiter).collect();
if let Some(value) = pick_indexed_item(&parts, index) {
current = value.to_string();
}
}
Some("replace") => {
let Some(args) = filter.get_item("args")? else {
continue;
};
let Ok(args_list) = args.downcast::<PyList>() else {
continue;
};
if args_list.len() < 2 {
continue;
}
let from = args_list.get_item(0)?.extract::<String>()?;
let to = args_list.get_item(args_list.len() - 1)?.extract::<String>()?;
current = current.replace(&from, &to);
}
Some("dateparse") => {
let Some(args) = filter.get_item("args")? else {
continue;
};
let format = args.str()?.to_str()?.to_string();
if let Some(value) = format_date_value(&current, &format) {
current = value;
}
}
Some("date_en_elapsed_parse") => {
if let Some(value) = parse_english_elapsed_date(&current) {
current = value;
}
}
Some("strip") => {
current = current.trim().to_string();
}
Some("lstrip") => {
let Some(args) = filter.get_item("args")? else {
current = current.trim_start().to_string();
continue;
};
current = lstrip_text(&current, &args)?;
}
Some("appendleft") => {
let Some(args) = filter.get_item("args")? else {
continue;
};
current = format!("{}{}", args.str()?.to_str()?, current);
}
Some("querystring") => {
let Some(args) = filter.get_item("args")? else {
continue;
};
current = query_param_value(&current, args.str()?.to_str()?).unwrap_or_default();
}
_ => {}
}
}
Ok(Some(current.trim().to_string()))
}
/// 按 Python str.lstrip(chars) 语义处理左侧字符集。
fn lstrip_text(current: &str, args: &Bound<'_, PyAny>) -> PyResult<String> {
let chars = if let Ok(args_list) = args.downcast::<PyList>() {
if args_list.is_empty() {
String::new()
} else {
args_list.get_item(0)?.str()?.to_str()?.to_string()
}
} else {
args.str()?.to_str()?.to_string()
};
if chars.is_empty() {
return Ok(current.trim_start().to_string());
}
Ok(current
.trim_start_matches(|ch| chars.contains(ch))
.to_string())
}
/// 将日期文本按站点格式解析为统一时间字符串。
fn format_date_value(value: &str, format: &str) -> Option<String> {
let value = value.replace('\n', " ").trim().to_string();
if value.is_empty() {
return None;
}
if value.eq_ignore_ascii_case("now") {
return Some(Local::now().format("%Y-%m-%d %H:%M:%S").to_string());
}
if let Ok(datetime) = NaiveDateTime::parse_from_str(&value, format) {
return Some(datetime.format("%Y-%m-%d %H:%M:%S").to_string());
}
if let Ok(date) = NaiveDate::parse_from_str(&value, format) {
return Some(
date.and_time(NaiveTime::from_hms_opt(0, 0, 0)?)
.format("%Y-%m-%d %H:%M:%S")
.to_string(),
);
}
parse_common_date_value(&value)
}
/// 尝试解析站点常见日期格式,补足 dateparse 失败后的兼容路径。
fn parse_common_date_value(value: &str) -> Option<String> {
if let Ok(datetime) = DateTime::parse_from_rfc3339(value) {
return Some(datetime.format("%Y-%m-%d %H:%M:%S").to_string());
}
for format in [
"%Y-%m-%d %H:%M:%S",
"%Y-%m-%d%H:%M:%S",
"%Y-%m-%d %H:%M",
"%Y-%m-%d",
"%b %d %Y, %H:%M",
"%H:%M:%S%d/%m/%Y",
] {
if let Ok(datetime) = NaiveDateTime::parse_from_str(value, format) {
return Some(datetime.format("%Y-%m-%d %H:%M:%S").to_string());
}
if let Ok(date) = NaiveDate::parse_from_str(value, format) {
let datetime = date.and_time(NaiveTime::from_hms_opt(0, 0, 0)?);
return Some(datetime.format("%Y-%m-%d %H:%M:%S").to_string());
}
}
None
}
/// 解析 IPT 等英文站点的相对发布时间。
fn parse_english_elapsed_date(value: &str) -> Option<String> {
if let Some(parsed) = parse_common_date_value(value) {
return Some(parsed);
}
let captures = EN_ELAPSED_RE.captures(value)?;
let amount = captures.get(1)?.as_str().parse::<i64>().ok()?;
let unit = captures.get(2)?.as_str().to_ascii_lowercase();
let duration = match unit.as_str() {
"second" => Duration::seconds(amount),
"minute" => Duration::minutes(amount),
"hour" => Duration::hours(amount),
"day" => Duration::days(amount),
"week" => Duration::weeks(amount),
"month" => Duration::days(amount * 30),
"year" => Duration::days(amount * 365),
_ => return None,
};
Some((Local::now() - duration).format("%Y-%m-%d %H:%M:%S").to_string())
}
/// 将文件大小文本转换为字节数,供 Rust HTML 解析内部共用。
fn parse_filesize_text(text: &str) -> i64 {
let raw = text.trim().to_string();
if raw.is_empty() {
return 0;
}
if raw.chars().all(|ch| ch.is_ascii_digit()) {
return raw.parse::<i64>().unwrap_or(0);
}
let normalized = raw.replace([',', ' '], "").to_uppercase();
let size_text = FILESIZE_UNIT_RE.replace_all(&normalized, "").to_string();
let Ok(mut size) = size_text.parse::<f64>() else {
return 0;
};
if normalized.contains("PB") || normalized.contains("PIB") {
size *= 1024_f64.powi(5);
} else if normalized.contains("TB") || normalized.contains("TIB") {
size *= 1024_f64.powi(4);
} else if normalized.contains("GB") || normalized.contains("GIB") {
size *= 1024_f64.powi(3);
} else if normalized.contains("MB") || normalized.contains("MIB") {
size *= 1024_f64.powi(2);
} else if normalized.contains("KB") || normalized.contains("KIB") {
size *= 1024_f64;
}
size.round() as i64
}
/// 规范化元素文本,尽量接近 PyQuery.text() 输出。
fn normalize_element_text(element: ElementRef<'_>, remove_selectors: &[Selector]) -> String {
let mut rendered = String::new();
for node in element.descendants() {
let Some(text_node) = node.value().as_text() else {
continue;
};
if should_skip_text_node(
node.parent().and_then(ElementRef::wrap),
element,
remove_selectors,
) {
continue;
}
rendered.push_str(text_node);
}
normalize_whitespace(&rendered)
}
/// 折叠 PyQuery.text() 中的连续空白,保留元素相邻文本节点的直接拼接效果。
fn normalize_whitespace(value: &str) -> String {
value.split_whitespace().collect::<Vec<&str>>().join(" ")
}
/// 判断文本节点是否位于需要 remove 的元素子树中。
fn should_skip_text_node(
mut parent: Option<ElementRef<'_>>,
root: ElementRef<'_>,
remove_selectors: &[Selector],
) -> bool {
while let Some(element) = parent {
if element == root {
return false;
}
if remove_selectors.iter().any(|selector| selector.matches(&element)) {
return true;
}
parent = element.parent().and_then(ElementRef::wrap);
}
false
}
/// 判断 row 内是否存在指定 selector。
fn selector_exists(row: ElementRef<'_>, selector_text: &str) -> PyResult<bool> {
if selector_text == "*" {
return Ok(true);
}
Ok(select_site_elements(row, selector_text)
.map(|elements| !elements.is_empty())
.unwrap_or(false))
}
/// 拼接详情和下载链接。
fn normalize_site_link(domain: &str, link: &str, protocol_relative: bool) -> String {
if link.starts_with("http") || link.starts_with("magnet") {
return link.to_string();
}
if protocol_relative && link.starts_with("//") {
let scheme = domain.split(':').next().unwrap_or("http");
return format!("{scheme}:{link}");
}
if !protocol_relative {
if let Ok(base) = Url::parse(&standardize_base_url(domain)) {
if let Some(host) = base.host_str() {
if link.contains(host) {
if link.starts_with('/') {
return format!("{}:{link}", base.scheme());
}
return format!("{}://{link}", base.scheme());
}
}
}
}
if let Some(stripped) = link.strip_prefix('/') {
format!("{domain}{stripped}")
} else {
format!("{domain}{link}")
}
}
/// 使用 MiniJinja 渲染站点字段模板,语义对齐 Python jinja2 的 Template.render(fields=...)。
fn render_jinja_template(template: &str, fields: &BTreeMap<String, String>) -> Option<String> {
let mut env = Environment::new();
env.set_undefined_behavior(UndefinedBehavior::Chainable);
env.render_str(template, context! { fields => fields }).ok()
}
/// 判断文本是否包含 Jinja 语法标记,作为字段内嵌模板的低成本预筛选。
fn contains_jinja_syntax(value: &str) -> bool {
value.contains("{{") || value.contains("{%") || value.contains("{#")
}
/// 读取分类配置中的 ID 列表。
fn category_ids_for_field(category: &Bound<'_, PyDict>, key: &str) -> PyResult<Vec<String>> {
let Some(list_obj) = category.get_item(key)? else {
return Ok(Vec::new());
};
let Ok(list) = list_obj.downcast::<PyList>() else {
return Ok(Vec::new());
};
let mut values = Vec::new();
for item in list.iter() {
let dict = item.downcast::<PyDict>()?;
if let Some(id) = get_optional_string(dict, "id")? {
values.push(id);
}
}
Ok(values)
}
/// 读取 URL 查询参数中的第一个值。
fn query_param_value(text: &str, key: &str) -> Option<String> {
let query = if let Ok(url) = Url::parse(text) {
url.query().unwrap_or("").to_string()
} else {
text.split_once('?')
.map(|(_, query)| query.split('#').next().unwrap_or("").to_string())
.unwrap_or_default()
};
form_urlencoded::parse(query.as_bytes())
.find(|(param_key, _)| param_key == key)
.map(|(_, value)| value.to_string())
}
/// 标准化基础 URL与 Python UrlUtils.standardize_base_url 保持一致。
fn standardize_base_url(host: &str) -> String {
let mut value = host.to_string();
if !value.ends_with('/') {
value.push('/');
}
if !value.starts_with("http://") && !value.starts_with("https://") {
value = format!("http://{value}");
}
value
}

View File

@@ -1,4 +1,6 @@
mod filter;
mod indexer;
mod utils;
use pyo3::prelude::*;
@@ -13,5 +15,6 @@ fn is_available() -> bool {
fn moviepilot_rust(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(is_available, m)?)?;
m.add_function(wrap_pyfunction!(filter::parse_filter_rule_fast, m)?)?;
m.add_function(wrap_pyfunction!(indexer::parse_indexer_torrents_fast, m)?)?;
Ok(())
}

View File

@@ -0,0 +1,46 @@
use pyo3::prelude::*;
use pyo3::types::{PyAny, PyDict};
/// 从 Python 字典读取可选字符串。
pub(crate) fn get_optional_string(dict: &Bound<'_, PyDict>, key: &str) -> PyResult<Option<String>> {
let Some(value) = dict.get_item(key)? else {
return Ok(None);
};
if value.is_none() {
return Ok(None);
}
Ok(Some(value.str()?.to_str()?.to_string()))
}
/// 从 Python 字典读取可选整数。
pub(crate) fn get_optional_i64(dict: &Bound<'_, PyDict>, key: &str) -> PyResult<Option<i64>> {
let Some(value) = dict.get_item(key)? else {
return Ok(None);
};
if value.is_none() {
return Ok(None);
}
if let Ok(parsed) = value.extract::<i64>() {
return Ok(Some(parsed));
}
let text = value.str()?.to_str()?.trim().to_string();
if text.is_empty() {
return Ok(None);
}
Ok(text.parse::<i64>().ok())
}
/// 将 Python 对象转换为 i64用于兼容配置里字符串或数字形式的下标。
pub(crate) fn extract_i64(value: &Bound<'_, PyAny>) -> PyResult<Option<i64>> {
if value.is_none() {
return Ok(None);
}
if let Ok(parsed) = value.extract::<i64>() {
return Ok(Some(parsed));
}
let text = value.str()?.to_str()?.trim().to_string();
if text.is_empty() {
return Ok(None);
}
Ok(text.parse::<i64>().ok())
}

View File

@@ -0,0 +1,217 @@
#!/usr/bin/env python3
"""
Benchmark SiteSpider indexer parsing with real MoviePilot-Build site configs.
"""
import argparse
import copy
import contextlib
import io
import statistics
import sys
import time
from pathlib import Path
from typing import Callable, Dict, List
import yaml
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(PROJECT_ROOT))
from app.modules.indexer import spider as spider_module
from app.modules.indexer.spider import SiteSpider
def _load_site_config(sites_dir: Path, site_id: str) -> dict:
"""
从 MoviePilot-Build sites 目录读取指定站点配置。
"""
candidates = list(sites_dir.rglob(f"{site_id}.yml"))
if not candidates:
raise FileNotFoundError(f"找不到站点配置:{site_id}")
with candidates[0].open("r", encoding="utf-8") as file:
return yaml.safe_load(file)
def _nexus_row(
torrent_id: int,
title: str,
category: str = "402",
date_text: str = "2025-05-01 12:13:14",
free_class: str = "pro_free",
) -> str:
"""
生成 NexusPhp 类站点配置可解析的单行 HTML。
"""
return f"""
<tr>
<td><a href="?cat={category}">cat</a></td>
<td>
<table class="torrentname">
<tr>
<td class="embedded">
<a href="details.php?id={torrent_id}" title="{title}">{title}.fallback</a>
<a href="download.php?id={torrent_id}">download</a>
<img class="{free_class}" />
<img class="hitandrun" />
<font class="subtitle">Desc {torrent_id} <span>remove</span></font>
<span class="tags">标签{torrent_id}</span>
</td>
</tr>
</table>
</td>
<td></td>
<td><span title="{date_text}">{date_text}</span></td>
<td>1.5 GB</td>
<td>{torrent_id + 10}</td>
<td>{torrent_id % 7}</td>
<td>{torrent_id % 11}</td>
</tr>
"""
def _build_nexus_html(rows: int) -> str:
"""
生成多行 NexusPhp 表格 HTML用于驱动真实 SiteSpider 解析链路。
"""
body = "\n".join(
_nexus_row(1000 + index, f"Benchmark.Title.{index}")
for index in range(rows)
)
return f"<table class=\"torrents\">{body}</table>"
def _build_ipt_html(rows: int) -> str:
"""
生成 IPT 配置使用的表格 HTML覆盖 lstrip 和英文相对时间过滤器。
"""
body = "\n".join(
f"""
<tr>
<td><a href="/t/{1000 + index}">Benchmark.IPT.{index}</a>
<a href="/download.php/{1000 + index}">download</a><span class="free">Free</span></td>
<td><div>Uploaded | {index % 5 + 1} hours ago</div></td>
<td></td><td></td><td></td>
<td>{1 + index % 9}.2 GB</td>
<td>{index % 13}</td>
<td>{index + 20}</td>
<td>{index % 7}</td>
</tr>
"""
for index in range(rows)
)
return f"<table id=\"torrents\"><tbody>{body}</tbody></table>"
def _prepare_cases(sites_dir: Path, rows: int) -> List[Dict[str, object]]:
"""
构造使用真实站点配置的 benchmark 用例。
"""
cases = []
for site_id in ["agsvpt", "pttime", "chdbits"]:
config = _load_site_config(sites_dir, site_id)
cases.append({
"site": site_id,
"indexer": config,
"html": _build_nexus_html(rows),
})
cases.append({
"site": "iptorrents",
"indexer": _load_site_config(sites_dir, "iptorrents"),
"html": _build_ipt_html(rows),
})
return cases
def _parse_with_rust(indexer: dict, html: str) -> List[dict]:
"""
使用 SiteSpider 默认入口解析Rust 扩展可用时会命中 Rust fast path。
"""
return SiteSpider(copy.deepcopy(indexer)).parse(html)
def _parse_with_python(indexer: dict, html: str) -> List[dict]:
"""
临时禁用 Rust fast path复用同一个 SiteSpider.parse 入口测 PyQuery 路径。
"""
original = spider_module.rust_accel.parse_indexer_torrents
spider_module.rust_accel.parse_indexer_torrents = lambda *args, **kwargs: None
try:
return SiteSpider(copy.deepcopy(indexer)).parse(html)
finally:
spider_module.rust_accel.parse_indexer_torrents = original
def _time_parser(parser: Callable[[dict, str], List[dict]], indexer: dict, html: str, loops: int) -> float:
"""
多次执行解析函数并返回总耗时。
"""
buffer = io.StringIO()
with contextlib.redirect_stdout(buffer), contextlib.redirect_stderr(buffer):
started = time.perf_counter()
for _ in range(loops):
parser(indexer, html)
return time.perf_counter() - started
def _median_time(parser: Callable[[dict, str], List[dict]], indexer: dict, html: str, loops: int, repeats: int) -> float:
"""
重复测量并取中位数,降低单次抖动对结论的影响。
"""
samples = [_time_parser(parser, indexer, html, loops) for _ in range(repeats)]
return statistics.median(samples)
def run_benchmark(sites_dir: Path, rows: int, loops: int, repeats: int) -> None:
"""
执行真实 SiteSpider.parse 链路 benchmark 并打印 Rust/PyQuery 对比。
"""
cases = _prepare_cases(sites_dir, rows)
print(f"sites_dir={sites_dir}")
print(f"rows={rows} loops={loops} repeats={repeats}")
print("site, rows, rust_ms, python_ms, speedup")
for case in cases:
buffer = io.StringIO()
with contextlib.redirect_stdout(buffer), contextlib.redirect_stderr(buffer):
rust_result = _parse_with_rust(case["indexer"], case["html"])
python_result = _parse_with_python(case["indexer"], case["html"])
if not rust_result or not python_result:
raise RuntimeError(f"{case['site']} benchmark fixture did not parse any rows")
rust_time = _median_time(_parse_with_rust, case["indexer"], case["html"], loops, repeats)
python_time = _median_time(_parse_with_python, case["indexer"], case["html"], loops, repeats)
speedup = python_time / rust_time if rust_time else 0
print(
f"{case['site']}, {len(rust_result)}, "
f"{rust_time * 1000:.2f}, {python_time * 1000:.2f}, {speedup:.2f}x"
)
def parse_args() -> argparse.Namespace:
"""
解析命令行参数。
"""
parser = argparse.ArgumentParser(description="Benchmark MoviePilot Rust indexer parser")
parser.add_argument(
"--sites-dir",
type=Path,
default=Path("/Users/jxxghp/PycharmProjects/MoviePilot-Build/sites"),
help="MoviePilot-Build sites directory",
)
parser.add_argument("--rows", type=int, default=80, help="Rows per fixture page")
parser.add_argument("--loops", type=int, default=50, help="Loops per timing sample")
parser.add_argument("--repeats", type=int, default=5, help="Timing samples per parser")
return parser.parse_args()
def main() -> None:
"""
入口函数。
"""
args = parse_args()
run_benchmark(args.sites_dir, args.rows, args.loops, args.repeats)
if __name__ == "__main__":
main()

View File

@@ -1,5 +1,7 @@
import pytest
from app.modules.indexer.spider import SiteSpider
from app.schemas.types import MediaType
from app.utils import rust_accel
@@ -25,3 +27,219 @@ def test_rust_filter_rule_parser_handles_parentheses_and_or():
result = rust_accel.parse_filter_rule("CNSUB & (4K | 1080P) & !BLU")
assert result == [[["CNSUB", "and", ["4K", "or", "1080P"]], "and", ["not", "BLU"]]]
def test_rust_indexer_parser_handles_jinja_pyquery_filters_and_links():
"""
Rust indexer 解析应覆盖普通站点配置的 Jinja、PyQuery selector 和过滤器。
"""
html = """
<table class="torrents">
<tr>
<td><a href="?cat=402">TV</a></td>
<td>
<table class="torrentname">
<tr>
<td class="embedded">
<a href="details.php?id=100" title="Optional.Title">Default.Title</a>
<a href="download.php?id=100">DL</a>
<a href="https://www.imdb.com/title/tt1234567/">IMDb</a>
<font class="subtitle">Main description <span>remove</span><a>link</a></font>
<span class="label">FREE</span>
<img class="hitandrun" />
</td>
</tr>
</table>
</td>
<td></td>
<td><span title="2025-05-01 12:13:14">1 hour ago</span></td>
<td>1.5 GB</td>
<td>1,234</td>
<td>5/7</td>
<td>9</td>
</tr>
</table>
"""
indexer = {
"id": "unit",
"name": "Unit",
"domain": "https://example.com/",
"search": {"paths": [{"path": "torrents.php"}]},
"category": {
"movie": [{"id": "401"}],
"tv": [{"id": "402"}],
},
"torrents": {
"list": {"selector": 'table.torrents > tr:has("table.torrentname")'},
"fields": {
"title_default": {"selector": 'a[href*="details.php?id="]'},
"title_optional": {
"selector": 'a[title][href*="details.php?id="]',
"attribute": "title",
},
"title": {
"text": "{% if fields['title_optional'] %}{{ fields['title_optional'] }}{% else %}"
"{{ fields['title_default'] }}{% endif %}"
},
"details": {"selector": 'a[href*="details.php?id="]', "attribute": "href"},
"download": {"selector": 'a[href*="download.php?id="]', "attribute": "href"},
"imdbid": {
"selector": 'a[href*="imdb.com/title/tt"]',
"attribute": "href",
"filters": [{"name": "re_search", "args": ["tt\\d+", 0]}],
},
"date_elapsed": {"selector": "td:nth-child(4) > span"},
"date_added": {"selector": "td:nth-child(4) > span", "attribute": "title"},
"date": {
"text": "{% if fields['date_elapsed'] or fields['date_added'] %}"
"{{ fields['date_added'] if fields['date_added'] else fields['date_elapsed'] }}"
"{% else %}now{% endif %}",
"filters": [{"name": "dateparse", "args": "%Y-%m-%d %H:%M:%S"}],
},
"size": {"selector": "td:nth-child(5)"},
"seeders": {"selector": "td:nth-child(6)"},
"leechers": {"selector": "td:nth-child(7)"},
"grabs": {"selector": "td:nth-child(8)"},
"downloadvolumefactor": {"case": {"img.free": 0, "*": 1}},
"uploadvolumefactor": {"case": {"*": 1}},
"description": {
"selector": "font.subtitle",
"remove": "span,a",
},
"labels": {"selector": "span.label"},
"hr": {"selector": "img.hitandrun"},
"category": {
"selector": 'a[href*="?cat="]',
"attribute": "href",
"filters": [{"name": "querystring", "args": "cat"}],
},
},
},
}
result = SiteSpider(indexer, mtype=MediaType.TV).parse(html)
assert result == [{
"page_url": "https://example.com/details.php?id=100",
"enclosure": "https://example.com/download.php?id=100",
"downloadvolumefactor": 1.0,
"uploadvolumefactor": 1.0,
"pubdate": "2025-05-01 12:13:14",
"title": "Optional.Title",
"description": "Main description",
"imdbid": "tt1234567",
"size": 1610612736,
"peers": 5,
"seeders": 1234,
"grabs": 9,
"date_elapsed": "1 hour ago",
"labels": ["FREE"],
"hit_and_run": True,
"category": "电视剧",
}]
def test_rust_indexer_parser_handles_default_values_and_template_arithmetic():
"""
Rust indexer 解析应支持 defualt_value、Jinja int filter 和模板算术表达式。
"""
html = """
<table class="torrents">
<tr>
<td><a href="details.php?id=200">Default.Title</a></td>
</tr>
</table>
"""
fields = {
"title_default": {"selector": 'a[href*="details.php?id="]'},
"missing_days": {"defualt_value": "2", "selector": "span.missing"},
"title": {"text": "{{ fields['title_default'] }} {{ (fields['missing_days']|int)*86400 }}"},
}
result = rust_accel.parse_indexer_torrents(
html_text=html,
domain="https://example.com/",
list_config={"selector": "table.torrents > tr"},
fields=fields,
category=None,
result_num=100,
)
assert result == [{"title": "Default.Title 172800"}]
def test_rust_indexer_parser_handles_lstrip_and_english_elapsed_date():
"""
Rust indexer 解析应覆盖 IPT 配置用到的 lstrip 和 date_en_elapsed_parse 过滤器。
"""
html = """
<table id="torrents">
<tr>
<td><a href="/t/123">Title</a><a href="/download.php/123">download</a></td>
<td><div>Uploaded | 2 hours ago</div></td>
</tr>
</table>
"""
fields = {
"title": {"selector": 'a[href*="/t/"]'},
"download": {
"selector": 'a[href*="/download.php/"]',
"attribute": "href",
"filters": [{"name": "lstrip", "args": ["/"]}],
},
"date": {
"selector": "td:nth-child(2) > div",
"filters": [
{"name": "split", "args": ["|", 1]},
{"name": "date_en_elapsed_parse"},
],
},
}
result = rust_accel.parse_indexer_torrents(
html_text=html,
domain="https://iptorrents.com/",
list_config={"selector": 'table[id="torrents"] tr'},
fields=fields,
category=None,
result_num=100,
)
assert len(result) == 1
assert result[0]["title"] == "Title"
assert result[0]["enclosure"] == "https://iptorrents.com/download.php/123"
assert result[0]["pubdate"]
def test_rust_indexer_parser_prefers_date_added_when_date_template_returns_elapsed_text():
"""
Rust indexer 解析 date 模板产出相对时间时,应使用 date_added 里的标准时间。
"""
html = """
<table class="torrents">
<tr>
<td><span title="2025-06-02 03:04:05">1 hour ago</span></td>
</tr>
</table>
"""
fields = {
"date_elapsed": {"selector": "span"},
"date_added": {"selector": "span", "attribute": "title"},
"date": {
"text": "{% if fields['date_elapsed'] or fields['date_added'] %}"
"{{ fields['date_elapsed'] if fields['date_elapsed'] else fields['date_added'] }}"
"{% else %}now{% endif %}",
"filters": [{"name": "dateparse", "args": "%Y-%m-%d %H:%M:%S"}],
},
}
result = rust_accel.parse_indexer_torrents(
html_text=html,
domain="https://example.com/",
list_config={"selector": "table.torrents > tr"},
fields=fields,
category=None,
result_num=100,
)
assert result[0]["pubdate"] == "2025-06-02 03:04:05"