feat: accelerate site indexer parsing with rust

2026-07-19 03:31:50 +08:00 · 2026-05-23 15:59:20 +08:00
parent 222f6ce7d8
commit d1e2881347
9 changed files with 2523 additions and 1 deletions
--- a/app/modules/indexer/spider/init.py
+++ b/app/modules/indexer/spider/init.py
@@ -12,6 +12,7 @@ from pyquery import PyQuery
 from app.core.config import settings
 from app.log import logger
 from app.schemas.types import MediaType
+from app.utils import rust_accel
 from app.utils.http import RequestUtils, AsyncRequestUtils
 from app.utils.string import StringUtils
 from app.utils.url import UrlUtils
@@ -737,6 +738,17 @@ class SiteSpider:
            self.is_error = True
            return []

+        rust_torrents = rust_accel.parse_indexer_torrents(
+            html_text=html_text,
+            domain=self.domain,
+            list_config=self.list,
+            fields=self.fields,
+            category=self.category,
+            result_num=self.result_num
+        )
+        if rust_torrents is not None:
+            return rust_torrents
+
        # 清空旧结果
        self.torrents_info_array = []
        html_doc = None
--- a/app/utils/rust_accel.py
+++ b/app/utils/rust_accel.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import List, Optional

 from app.log import logger

@@ -39,6 +39,34 @@ def parse_filter_rule(expression: str) -> Optional[list]:
        return None


+def parse_indexer_torrents(
+        html_text: str,
+        domain: str,
+        list_config: dict,
+        fields: dict,
+        category: Optional[dict] = None,
+        result_num: int = 100
+) -> Optional[List[dict]]:
+    """
+    使用 Rust 批量解析普通配置站点种子列表，不可用时返回 None。
+    """
+    if not _moviepilot_rust:
+        return None
+    try:
+        return _moviepilot_rust.parse_indexer_torrents_fast(
+            html_text,
+            domain,
+            list_config,
+            fields,
+            category,
+            result_num
+        )
+    except BaseException as err:
+        _raise_non_rust_panic(err)
+        logger.debug(f"Rust 站点列表解析失败，使用 Python 解析兜底：{err}")
+        return None
+
+
 def _raise_non_rust_panic(err: BaseException) -> None:
    """
    只吞掉 Rust 扩展 panic/异常，保留用户中断和进程退出语义。
--- a/rust/moviepilot_rust/Cargo.lock
+++ b/rust/moviepilot_rust/Cargo.lock
--- a/rust/moviepilot_rust/Cargo.toml
+++ b/rust/moviepilot_rust/Cargo.toml
@@ -8,4 +8,10 @@ name = "moviepilot_rust"
 crate-type = ["cdylib"]

 [dependencies]
+minijinja = "2.20"
+chrono = "0.4"
+once_cell = "1.20"
 pyo3 = { version = "0.23", features = ["abi3-py311", "extension-module"] }
+regex = "1.11"
+scraper = "0.24"
+url = "2.5"
--- a/rust/moviepilot_rust/src/indexer.rs
+++ b/rust/moviepilot_rust/src/indexer.rs
@@ -0,0 +1,972 @@
+use crate::utils::{extract_i64, get_optional_i64, get_optional_string};
+use chrono::{DateTime, Duration, Local, NaiveDate, NaiveDateTime, NaiveTime};
+use minijinja::{context, Environment, UndefinedBehavior};
+use once_cell::sync::Lazy;
+use pyo3::prelude::*;
+use pyo3::types::{PyAny, PyDict, PyList};
+use regex::{Regex, RegexBuilder};
+use scraper::{ElementRef, Html, Selector};
+use std::collections::{BTreeMap, HashMap, HashSet};
+use url::form_urlencoded;
+use url::Url;
+
+// Size-unit letters (K/M/G/T/P/I plus an optional trailing B), matched case-insensitively.
+// `parse_filesize_text` deletes every match to leave only the numeric part.
+static FILESIZE_UNIT_RE: Lazy<Regex> = Lazy::new(|| {
+    RegexBuilder::new(r"[KMGTPI]*B?")
+        .case_insensitive(true)
+        .build()
+        .unwrap()
+});
+// First run of digits with an optional fractional part; used to read a volume-factor number.
+static NUMERIC_FACTOR_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(\d+\.?\d*)").unwrap());
+// `fields.name` (group 1) or `fields['name']` / `fields["name"]` (group 2) inside a template.
+static FIELD_REF_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#"fields(?:\.([A-Za-z0-9_]+)|\[\s*['"]([^'"]+)['"]\s*\])"#).unwrap());
+// `:has("...")` (group 1) or `:has('...')` (group 2) with a quoted inner selector.
+static HAS_QUOTED_SELECTOR_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#":has\(\s*"([^"]+)"\s*\)|:has\(\s*'([^']+)'\s*\)"#).unwrap());
+// Any `:has(...)`: double-quoted (group 1), single-quoted (group 2), or unquoted text up to
+// the first `)` (group 3).
+static HAS_SELECTOR_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#":has\(\s*(?:"([^"]+)"|'([^']+)'|([^)]*))\s*\)"#).unwrap());
+// `table… > tr…` direct-child pattern: group 1 is the table part, group 2 the tr part.
+static TABLE_DIRECT_TR_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#"\b(table[^>,]*?)\s*>\s*(tr(?:[^\s>,]*)?)"#).unwrap());
+// English relative time such as "3 hours ago" (case-insensitive): group 1 is the amount,
+// group 2 the singular unit name.
+static EN_ELAPSED_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"(?i)(\d+)\s*(second|minute|hour|day|week|month|year)s?\s*ago").unwrap());
+// Fields emitted by the generic loop in `parse_indexer_row`, in evaluation order.
+// Each pair is (field name in the site config, key written to the output dict).
+const OUTPUT_FIELDS: &[(&str, &str)] = &[
+    ("title", "title"),
+    ("description", "description"),
+    ("imdbid", "imdbid"),
+    ("size", "size"),
+    ("leechers", "peers"),
+    ("seeders", "seeders"),
+    ("grabs", "grabs"),
+    ("date_elapsed", "date_elapsed"),
+    ("freedate", "freedate"),
+    ("labels", "labels"),
+    ("hr", "hit_and_run"),
+    ("category", "category"),
+];
+
+/// One entry of the site's `fields` configuration.
+struct FieldSpec<'py> {
+    // Field name, i.e. the key of the entry in the `fields` dict.
+    name: String,
+    // The field's configuration dict, kept as a Python reference rather than converted.
+    config: Bound<'py, PyDict>,
+}
+
+/// Outcome of parsing a single list row.
+enum RowParseResult {
+    // No output key was produced for the row; the caller skips it.
+    Empty,
+    // The populated output dict for the row.
+    Item(PyObject),
+}
+
+/// Batch-parses a regular config-driven indexer page, aiming to cover the site
+/// configuration semantics inside Rust.
+///
+/// Returns `Ok(None)` when `list_config` has no non-empty `"selector"` or when the
+/// row selector cannot be resolved; otherwise returns a Python list with one dict
+/// per non-empty row. At most `result_num` matched rows are examined.
+#[pyfunction]
+#[pyo3(signature = (html_text, domain, list_config, fields, category=None, result_num=100))]
+pub(crate) fn parse_indexer_torrents_fast(
+    py: Python<'_>,
+    html_text: &str,
+    domain: &str,
+    list_config: &Bound<'_, PyDict>,
+    fields: &Bound<'_, PyDict>,
+    category: Option<&Bound<'_, PyDict>>,
+    result_num: usize,
+) -> PyResult<Option<PyObject>> {
+    let Some(list_selector_text) = get_optional_string(list_config, "selector")? else {
+        return Ok(None);
+    };
+    if list_selector_text.is_empty() {
+        return Ok(None);
+    }
+    let document = Html::parse_document(html_text);
+    let Some(rows) = select_site_elements(document.root_element(), &list_selector_text) else {
+        return Ok(None);
+    };
+    let result = PyList::empty(py);
+    let field_specs = build_field_specs(fields)?;
+    // The limit applies to matched rows, so rows that parse to nothing still count.
+    for row in rows.into_iter().take(result_num) {
+        match parse_indexer_row(py, row, domain, &field_specs, category)? {
+            RowParseResult::Empty => {}
+            RowParseResult::Item(item) => result.append(item)?,
+        }
+    }
+    Ok(Some(result.into()))
+}
+
+/// Pre-processes the `fields` configuration, keeping references to the Python dicts
+/// so the whole configuration is not converted repeatedly.
+///
+/// Entries whose value is `None` or not a dict are skipped; a non-string key on a
+/// dict-valued entry is reported as an error.
+fn build_field_specs<'py>(fields: &Bound<'py, PyDict>) -> PyResult<Vec<FieldSpec<'py>>> {
+    let mut collected = Vec::new();
+    for (field_key, field_value) in fields.iter() {
+        if field_value.is_none() {
+            continue;
+        }
+        if let Ok(config) = field_value.downcast_into::<PyDict>() {
+            let name = field_key.extract::<String>()?;
+            collected.push(FieldSpec { name, config });
+        }
+    }
+    Ok(collected)
+}
+
+/// Parses one torrent row, covering the main field-extraction flow of regular
+/// config-driven sites.
+///
+/// Returns `RowParseResult::Empty` when no output key was produced, otherwise
+/// `RowParseResult::Item` holding the populated dict.
+fn parse_indexer_row(
+    py: Python<'_>,
+    row: ElementRef<'_>,
+    domain: &str,
+    field_specs: &[FieldSpec<'_>],
+    category: Option<&Bound<'_, PyDict>>,
+) -> PyResult<RowParseResult> {
+    let output = PyDict::new(py);
+    // NOTE(review): this name -> spec map is rebuilt for every row although it only
+    // depends on `field_specs`; it could be built once by the caller.
+    let field_map = field_specs
+        .iter()
+        .map(|field| (field.name.as_str(), field))
+        .collect::<HashMap<&str, &FieldSpec<'_>>>();
+    // Per-row memo of evaluated fields, plus the set of fields currently being
+    // evaluated (used by `eval_field_by_name` to stop reference cycles).
+    let mut cache = BTreeMap::new();
+    let mut resolving = HashSet::new();
+
+    // Links are normalized against the site domain before being written.
+    if let Some(value) = eval_field_by_name(row, &field_map, "details", &mut cache, &mut resolving)? {
+        if !value.is_empty() {
+            output.set_item("page_url", normalize_site_link(domain, &value, true))?;
+        }
+    }
+    if let Some(value) = eval_field_by_name(row, &field_map, "download", &mut cache, &mut resolving)? {
+        if !value.is_empty() {
+            output.set_item("enclosure", normalize_site_link(domain, &value, false))?;
+        }
+    }
+    if let Some(value) = eval_factor_field(row, &field_map, "downloadvolumefactor", &mut cache, &mut resolving)? {
+        output.set_item("downloadvolumefactor", value)?;
+    }
+    if let Some(value) = eval_factor_field(row, &field_map, "uploadvolumefactor", &mut cache, &mut resolving)? {
+        output.set_item("uploadvolumefactor", value)?;
+    }
+    if let Some(value) = eval_pubdate_field(row, &field_map, &mut cache, &mut resolving)? {
+        if !value.is_empty() {
+            output.set_item("pubdate", value)?;
+        }
+    }
+
+    // Remaining fields: a few need dedicated handling, the rest are plain text.
+    for (source_key, target_key) in OUTPUT_FIELDS {
+        match *source_key {
+            "labels" => {
+                // Writes the "labels" key itself (a list of strings).
+                parse_labels_field(py, row, &field_map, &output)?;
+            }
+            "hr" => {
+                if let Some(value) = eval_hr_field(row, &field_map)? {
+                    output.set_item(*target_key, value)?;
+                }
+            }
+            "category" => {
+                if let Some(value) = eval_field_by_name(row, &field_map, source_key, &mut cache, &mut resolving)? {
+                    output.set_item(*target_key, map_category_value(&value, category)?)?;
+                }
+            }
+            "size" => {
+                // Size text is converted to a byte count.
+                if let Some(value) = eval_field_by_name(row, &field_map, source_key, &mut cache, &mut resolving)? {
+                    output.set_item(*target_key, parse_filesize_text(value.replace('\n', "").trim()))?;
+                }
+            }
+            "leechers" | "seeders" | "grabs" => {
+                if let Some(value) = eval_field_by_name(row, &field_map, source_key, &mut cache, &mut resolving)? {
+                    output.set_item(*target_key, parse_peer_count(&value))?;
+                }
+            }
+            _ => {
+                // Plain text fields: newlines become spaces; empty values are not written.
+                if let Some(value) = eval_field_by_name(row, &field_map, source_key, &mut cache, &mut resolving)? {
+                    if !value.is_empty() {
+                        output.set_item(*target_key, value.replace('\n', " ").trim().to_string())?;
+                    }
+                }
+            }
+        }
+    }
+
+    if output.is_empty() {
+        return Ok(RowParseResult::Empty);
+    }
+    Ok(RowParseResult::Item(output.into()))
+}
+
+/// Evaluates a field by name and memoizes the result, so templates may reference
+/// any entry of `fields`.
+///
+/// Returns `Ok(None)` for an unknown field, and an empty string for a field that is
+/// already being evaluated further up the call chain (reference cycle). Only
+/// `Some` results are stored in `cache`.
+fn eval_field_by_name(
+    row: ElementRef<'_>,
+    field_map: &HashMap<&str, &FieldSpec<'_>>,
+    name: &str,
+    cache: &mut BTreeMap<String, String>,
+    resolving: &mut HashSet<String>,
+) -> PyResult<Option<String>> {
+    if let Some(cached) = cache.get(name) {
+        return Ok(Some(cached.clone()));
+    }
+    if resolving.contains(name) {
+        return Ok(Some(String::new()));
+    }
+    let spec = match field_map.get(name) {
+        Some(found) => *found,
+        None => return Ok(None),
+    };
+    // Mark the field as in progress for the duration of its evaluation.
+    resolving.insert(name.to_string());
+    let computed = eval_field(row, field_map, spec, cache, resolving)?;
+    resolving.remove(name);
+    if let Some(text) = &computed {
+        cache.insert(name.to_string(), text.clone());
+    }
+    Ok(computed)
+}
+
+/// Evaluates one field configuration, handling the combination of
+/// selector / text / default / filters in a single place.
+///
+/// Order of operations: take the `text` template if present, otherwise query the
+/// selector; render any Jinja syntax left in the value; apply `filters`; finally
+/// substitute the configured default when the value is missing or empty.
+fn eval_field(
+    row: ElementRef<'_>,
+    field_map: &HashMap<&str, &FieldSpec<'_>>,
+    spec: &FieldSpec<'_>,
+    cache: &mut BTreeMap<String, String>,
+    resolving: &mut HashSet<String>,
+) -> PyResult<Option<String>> {
+    let mut value = if let Some(template) = get_optional_string(&spec.config, "text")? {
+        Some(render_field_template(row, field_map, &template, cache, resolving)?)
+    } else {
+        safe_query(row, &spec.config)?
+    };
+
+    // A queried or rendered value may itself still contain template syntax.
+    if let Some(current) = value.as_deref() {
+        if contains_jinja_syntax(current) {
+            value = Some(render_embedded_value(
+                row,
+                field_map,
+                &spec.name,
+                current,
+                cache,
+                resolving,
+            )?);
+        }
+    }
+    // Filters run even when no value was found; they then start from an empty string.
+    if let Some(filters) = spec.config.get_item("filters")? {
+        if !filters.is_none() {
+            value = apply_text_filters(value.unwrap_or_default(), &filters)?;
+        }
+    }
+    // The default only replaces a missing or empty value.
+    if value.as_deref().map(str::is_empty).unwrap_or(true) {
+        if let Some(default_value) = get_default_value(&spec.config)? {
+            value = Some(default_value);
+        }
+    }
+    Ok(value)
+}
+
+/// Renders a field's `text` template, evaluating only the fields the template
+/// actually references. Unknown fields render as empty strings, and a failed
+/// render yields an empty string.
+fn render_field_template(
+    row: ElementRef<'_>,
+    field_map: &HashMap<&str, &FieldSpec<'_>>,
+    template: &str,
+    cache: &mut BTreeMap<String, String>,
+    resolving: &mut HashSet<String>,
+) -> PyResult<String> {
+    let mut context_values = BTreeMap::new();
+    for field_name in extract_template_field_names(template) {
+        let resolved = eval_field_by_name(row, field_map, &field_name, cache, resolving)?;
+        context_values.insert(field_name, resolved.unwrap_or_default());
+    }
+    let rendered = render_jinja_template(template, &context_values);
+    Ok(rendered.unwrap_or_default())
+}
+
+/// Renders Jinja syntax left inside an extracted value, for the few sites that put
+/// a template into an attribute such as `title`.
+///
+/// A reference to the field being evaluated (`current_name`) is bound to an empty
+/// string instead of being evaluated again.
+fn render_embedded_value(
+    row: ElementRef<'_>,
+    field_map: &HashMap<&str, &FieldSpec<'_>>,
+    current_name: &str,
+    template: &str,
+    cache: &mut BTreeMap<String, String>,
+    resolving: &mut HashSet<String>,
+) -> PyResult<String> {
+    let mut context_values = BTreeMap::new();
+    for field_name in extract_template_field_names(template) {
+        let resolved = if field_name == current_name {
+            String::new()
+        } else {
+            eval_field_by_name(row, field_map, &field_name, cache, resolving)?.unwrap_or_default()
+        };
+        context_values.insert(field_name, resolved);
+    }
+    let rendered = render_jinja_template(template, &context_values);
+    Ok(rendered.unwrap_or_default())
+}
+
+/// Collects the `fields` names referenced in a Jinja template, de-duplicated and in
+/// order of first appearance.
+fn extract_template_field_names(template: &str) -> Vec<String> {
+    let mut names: Vec<String> = Vec::new();
+    let referenced = FIELD_REF_RE
+        .captures_iter(template)
+        .filter_map(|caps| caps.get(1).or_else(|| caps.get(2)));
+    for matched in referenced {
+        let name = matched.as_str();
+        if names.iter().all(|seen| seen != name) {
+            names.push(name.to_string());
+        }
+    }
+    names
+}
+
+/// Reads a field's default value, also accepting the historical misspelling
+/// `defualt_value` found in older configurations.
+fn get_default_value(selector_config: &Bound<'_, PyDict>) -> PyResult<Option<String>> {
+    match get_optional_string(selector_config, "default_value")? {
+        Some(value) => Ok(Some(value)),
+        None => get_optional_string(selector_config, "defualt_value"),
+    }
+}
+
+/// Evaluates an upload/download volume-factor field, keeping fractional factors
+/// such as 0.5 or 0.3 from the configuration.
+///
+/// Returns `Ok(None)` when the field is not configured. With a `case` mapping, the
+/// value of the first selector that matches the row is used; otherwise the first
+/// number in the evaluated field text is used. The factor falls back to 1.0.
+fn eval_factor_field(
+    row: ElementRef<'_>,
+    field_map: &HashMap<&str, &FieldSpec<'_>>,
+    key: &str,
+    cache: &mut BTreeMap<String, String>,
+    resolving: &mut HashSet<String>,
+) -> PyResult<Option<f64>> {
+    let Some(spec) = field_map.get(key).copied() else {
+        return Ok(None);
+    };
+    match spec.config.get_item("case")? {
+        Some(case_obj) => {
+            let case_dict = case_obj.downcast::<PyDict>()?;
+            for (selector_obj, factor_obj) in case_dict.iter() {
+                let selector_text = selector_obj.extract::<String>()?;
+                if selector_exists(row, &selector_text)? {
+                    return Ok(Some(factor_obj.extract::<f64>().unwrap_or(1.0)));
+                }
+            }
+            Ok(Some(1.0))
+        }
+        None => {
+            let text = eval_field_by_name(row, field_map, key, cache, resolving)?;
+            let factor = text
+                .as_deref()
+                .and_then(|raw| NUMERIC_FACTOR_RE.captures(raw))
+                .and_then(|caps| caps.get(1))
+                .and_then(|matched| matched.as_str().parse::<f64>().ok())
+                .unwrap_or(1.0);
+            Ok(Some(factor))
+        }
+    }
+}
+
+/// Parses the labels field so the Python side receives `labels` as a list of
+/// strings.
+///
+/// Writes nothing when `labels` is not configured. Writes an empty list when the
+/// config has no `selector` key or the query yields nothing; empty label texts are
+/// dropped.
+fn parse_labels_field(
+    py: Python<'_>,
+    row: ElementRef<'_>,
+    field_map: &HashMap<&str, &FieldSpec<'_>>,
+    output: &Bound<'_, PyDict>,
+) -> PyResult<()> {
+    let Some(spec) = field_map.get("labels").copied() else {
+        return Ok(());
+    };
+    let labels = PyList::empty(py);
+    if spec.config.contains("selector")? {
+        if let Some(values) = query_all_values(row, &spec.config)? {
+            for label in values {
+                if !label.is_empty() {
+                    labels.append(label)?;
+                }
+            }
+        }
+    }
+    output.set_item("labels", labels)?;
+    Ok(())
+}
+
+/// Evaluates the hit-and-run marker: when `hr` is configured the result is a
+/// boolean telling whether its selector matches inside the row (`false` when the
+/// config carries no selector). Returns `Ok(None)` when `hr` is not configured.
+fn eval_hr_field(
+    row: ElementRef<'_>,
+    field_map: &HashMap<&str, &FieldSpec<'_>>,
+) -> PyResult<Option<bool>> {
+    let Some(spec) = field_map.get("hr").copied() else {
+        return Ok(None);
+    };
+    let flagged = match get_selector_text(&spec.config)? {
+        Some(selector_text) => selector_exists(row, &selector_text)?,
+        None => false,
+    };
+    Ok(Some(flagged))
+}
+
+/// Maps a site category id to MoviePilot's media-type label (returned in Chinese).
+///
+/// An id listed under `movie` maps to the movie label even if it is also listed
+/// under `tv`; an id listed only under `tv` maps to the TV label; anything else,
+/// or a missing category configuration, maps to the "unknown" label.
+fn map_category_value(value: &str, category: Option<&Bound<'_, PyDict>>) -> PyResult<&'static str> {
+    let Some(category) = category else {
+        return Ok("未知");
+    };
+    let tv_cats = category_ids_for_field(category, "tv")?;
+    let movie_cats = category_ids_for_field(category, "movie")?;
+    let is_movie = movie_cats.iter().any(|item| item == value);
+    if is_movie {
+        return Ok("电影");
+    }
+    let is_tv = tv_cats.iter().any(|item| item == value);
+    Ok(if is_tv { "电视剧" } else { "未知" })
+}
+
+/// Parses an integer statistic, accepting forms such as "12/34" (the part before
+/// the first slash is used) and thousands separators. Unparsable text yields 0.
+fn parse_peer_count(value: &str) -> i64 {
+    let head = value.split('/').next().unwrap_or("");
+    let without_separators = head.replace(',', "");
+    without_separators.trim().parse::<i64>().unwrap_or(0)
+}
+
+/// Evaluates the publish date. When the `date` field does not normalize to a
+/// standard timestamp (for example a relative time), a standard `date_added` value
+/// is preferred so the result stays sortable.
+///
+/// Without a `date` value, the normalized `date_added` value (if any) is returned.
+fn eval_pubdate_field(
+    row: ElementRef<'_>,
+    field_map: &HashMap<&str, &FieldSpec<'_>>,
+    cache: &mut BTreeMap<String, String>,
+    resolving: &mut HashSet<String>,
+) -> PyResult<Option<String>> {
+    let Some(date_text) = eval_field_by_name(row, field_map, "date", cache, resolving)? else {
+        let added = eval_field_by_name(row, field_map, "date_added", cache, resolving)?;
+        return Ok(added.map(|text| normalize_pubdate_text(&text)));
+    };
+    let normalized = normalize_pubdate_text(&date_text);
+    if is_standard_datetime(&normalized) || !field_map.contains_key("date_added") {
+        return Ok(Some(normalized));
+    }
+    // `date` is non-standard: use `date_added` only if it normalizes cleanly.
+    let fallback = eval_field_by_name(row, field_map, "date_added", cache, resolving)?
+        .map(|text| normalize_pubdate_text(&text))
+        .filter(|text| is_standard_datetime(text));
+    Ok(Some(fallback.unwrap_or(normalized)))
+}
+
+/// Normalizes publish-date text to the `%Y-%m-%d %H:%M:%S` string MoviePilot
+/// expects; text that cannot be parsed is returned with newlines replaced by
+/// spaces and surrounding whitespace trimmed.
+fn normalize_pubdate_text(value: &str) -> String {
+    format_date_value(value, "%Y-%m-%d %H:%M:%S")
+        .unwrap_or_else(|| value.replace('\n', " ").trim().to_string())
+}
+
+/// Tells whether the text already parses as the standard `%Y-%m-%d %H:%M:%S`
+/// timestamp format.
+fn is_standard_datetime(value: &str) -> bool {
+    let parsed = NaiveDateTime::parse_from_str(value, "%Y-%m-%d %H:%M:%S");
+    matches!(parsed, Ok(_))
+}
+
+/// Parses a CSS selector, keeping the `table > tr` extension that also matches rows
+/// placed under a `tbody`.
+///
+/// Candidates are tried in order: the normalized selector with the tbody variant
+/// appended, the normalized selector alone (when different), then the raw text.
+/// Returns `None` when none of them parses.
+fn parse_css_selector(selector_text: &str) -> Option<Selector> {
+    if selector_text == "*" {
+        return Selector::parse("*").ok();
+    }
+    let normalized = normalize_pyquery_selector(selector_text);
+    let expanded = expand_table_direct_tr_selector(&normalized);
+    let mut candidates = vec![expanded.as_str()];
+    if expanded != normalized {
+        candidates.push(normalized.as_str());
+    }
+    candidates.push(selector_text);
+    candidates
+        .into_iter()
+        .find_map(|candidate| Selector::parse(candidate).ok())
+}
+
+/// Runs a site selector against `root`, with extra support for the PyQuery
+/// `:has("selector")` form.
+///
+/// Without `:has(...)` the selector is parsed and applied directly. Otherwise the
+/// text is split at the first `:has(...)` into a prefix, the inner selector and a
+/// suffix: elements matching the prefix are kept when they contain a match for
+/// the inner selector, and a non-empty suffix is then applied inside each of them.
+/// Returns `None` when any of the parts fails to parse.
+fn select_site_elements<'a>(
+    root: ElementRef<'a>,
+    selector_text: &str,
+) -> Option<Vec<ElementRef<'a>>> {
+    let Some(captures) = HAS_SELECTOR_RE.captures(selector_text) else {
+        let selector = parse_css_selector(selector_text)?;
+        return Some(root.select(&selector).collect());
+    };
+    let matched = captures.get(0)?;
+    let prefix = selector_text[..matched.start()].trim();
+    let suffix = selector_text[matched.end()..].trim();
+    // Groups 1/2 are the quoted forms, group 3 the unquoted one.
+    // NOTE(review): the unquoted group stops at the first `)`, so an inner selector
+    // that itself contains parentheses is presumably cut short — confirm.
+    let inner = captures
+        .get(1)
+        .or_else(|| captures.get(2))
+        .or_else(|| captures.get(3))?
+        .as_str()
+        .trim();
+    let base_selector = parse_css_selector(prefix)?;
+    let has_selector = parse_css_selector(inner)?;
+    let bases = root
+        .select(&base_selector)
+        .filter(|element| element.select(&has_selector).next().is_some());
+    if suffix.is_empty() {
+        return Some(bases.collect());
+    }
+    // NOTE(review): a leading `>` is stripped, so a direct-child suffix is applied
+    // as a descendant selector here — verify this is intended.
+    let suffix_selector_text = suffix.trim_start_matches('>').trim();
+    let suffix_selector = parse_css_selector(suffix_selector_text)?;
+    let mut values = Vec::new();
+    for base in bases {
+        values.extend(base.select(&suffix_selector));
+    }
+    Some(values)
+}
+
+/// Converts the PyQuery-style quoted `:has("...")` / `:has('...')` form into the
+/// unquoted `:has(...)` form.
+fn normalize_pyquery_selector(selector_text: &str) -> String {
+    let rewritten =
+        HAS_QUOTED_SELECTOR_RE.replace_all(selector_text, |captures: &regex::Captures<'_>| {
+            let inner = match captures.get(1).or_else(|| captures.get(2)) {
+                Some(matched) => matched.as_str(),
+                None => "",
+            };
+            format!(":has({inner})")
+        });
+    rewritten.into_owned()
+}
+
+/// Appends a `tbody` variant to `table > tr` selectors, because the Rust HTML5
+/// parser inserts `tbody` automatically. Selectors without that pattern are
+/// returned unchanged.
+fn expand_table_direct_tr_selector(selector_text: &str) -> String {
+    let expanded = TABLE_DIRECT_TR_RE.replace_all(selector_text, "$1 > tbody > $2");
+    if expanded != selector_text {
+        format!("{selector_text}, {expanded}")
+    } else {
+        selector_text.to_string()
+    }
+}
+
+/// Runs the selector query and returns the single value chosen by the
+/// `contents` / `index` rules, or `None` when the query produced nothing.
+fn safe_query(row: ElementRef<'_>, selector_config: &Bound<'_, PyDict>) -> PyResult<Option<String>> {
+    match query_all_values(row, selector_config)? {
+        Some(values) => Ok(select_indexed_value(values, selector_config)),
+        None => Ok(None),
+    }
+}
+
+/// Queries the configured selector and returns, for every matched element, either
+/// the configured attribute (empty string when the element lacks it) or the
+/// element's normalized text with `remove` subtrees excluded.
+///
+/// Returns `Ok(None)` when no selector is configured or it cannot be resolved.
+fn query_all_values(
+    row: ElementRef<'_>,
+    selector_config: &Bound<'_, PyDict>,
+) -> PyResult<Option<Vec<String>>> {
+    let Some(selector_text) = get_selector_text(selector_config)? else {
+        return Ok(None);
+    };
+    let Some(elements) = select_site_elements(row, &selector_text) else {
+        return Ok(None);
+    };
+    let attribute = get_optional_string(selector_config, "attribute")?;
+    let remove_selectors = parse_remove_selectors(selector_config)?;
+    let values = elements
+        .into_iter()
+        .map(|element| match attribute.as_deref() {
+            Some(attribute_name) => element.value().attr(attribute_name).unwrap_or("").to_string(),
+            None => normalize_element_text(element, &remove_selectors),
+        })
+        .collect::<Vec<String>>();
+    Ok(Some(values))
+}
+
+/// Parses the `remove` configuration, a comma-separated list of CSS selectors.
+/// Blank entries are ignored; if any entry fails to parse, no selector is
+/// returned at all.
+fn parse_remove_selectors(selector_config: &Bound<'_, PyDict>) -> PyResult<Vec<Selector>> {
+    let Some(remove_text) = get_optional_string(selector_config, "remove")? else {
+        return Ok(Vec::new());
+    };
+    // Collecting into `Option` stops at the first entry that does not parse.
+    let parsed: Option<Vec<Selector>> = remove_text
+        .split(',')
+        .map(str::trim)
+        .filter(|entry| !entry.is_empty())
+        .map(parse_css_selector)
+        .collect();
+    Ok(parsed.unwrap_or_default())
+}
+
+/// Reads the selector text from `selector`, falling back to `selectors`; empty
+/// strings are treated as missing.
+fn get_selector_text(selector_config: &Bound<'_, PyDict>) -> PyResult<Option<String>> {
+    for key in ["selector", "selectors"] {
+        if let Some(text) = get_optional_string(selector_config, key)? {
+            if !text.is_empty() {
+                return Ok(Some(text));
+            }
+        }
+    }
+    Ok(None)
+}
+
+/// Applies the `contents` / `index` rules to the queried values.
+///
+/// `contents` picks a line of the first value (split on `\n`); otherwise `index`
+/// picks one of the values; otherwise the first value is returned. Both indexes
+/// may be negative. An out-of-range index or an empty input yields `None`.
+fn select_indexed_value(
+    values: Vec<String>,
+    selector_config: &Bound<'_, PyDict>,
+) -> Option<String> {
+    let first = values.first()?;
+    if let Ok(Some(contents)) = get_optional_i64(selector_config, "contents") {
+        let lines: Vec<&str> = first.split('\n').collect();
+        return pick_indexed_item(&lines, contents).map(|line| line.to_string());
+    }
+    match get_optional_i64(selector_config, "index") {
+        Ok(Some(index)) => pick_indexed_item(&values, index).cloned(),
+        _ => Some(first.clone()),
+    }
+}
+
+/// Looks up an item using Python list index semantics: negative indexes count
+/// from the end. Returns `None` when the index is out of range.
+fn pick_indexed_item<T>(items: &[T], index: i64) -> Option<&T> {
+    let position = if index >= 0 {
+        index
+    } else {
+        items.len() as i64 + index
+    };
+    if position < 0 {
+        None
+    } else {
+        items.get(position as usize)
+    }
+}
+
+/// Runs the indexer text filters over `current`, in list order.
+///
+/// `filters` must be a list of dicts with a `name` and usually `args`; any other
+/// object leaves the text untouched. Processing stops as soon as the text becomes
+/// empty. Filters with missing or malformed `args`, and unknown filter names, are
+/// skipped. The final text is trimmed.
+fn apply_text_filters(mut current: String, filters: &Bound<'_, PyAny>) -> PyResult<Option<String>> {
+    let Ok(filter_list) = filters.downcast::<PyList>() else {
+        return Ok(Some(current));
+    };
+    for item in filter_list.iter() {
+        let filter = item.downcast::<PyDict>()?;
+        let method_name = get_optional_string(filter, "name")?;
+        if current.is_empty() {
+            break;
+        }
+        match method_name.as_deref() {
+            // args: [pattern, ..., group index]; keeps the captured group when it matches.
+            Some("re_search") => {
+                let Some(args) = filter.get_item("args")? else {
+                    continue;
+                };
+                let Ok(args_list) = args.downcast::<PyList>() else {
+                    continue;
+                };
+                if args_list.len() < 2 {
+                    continue;
+                }
+                let pattern = args_list.get_item(0)?.extract::<String>()?;
+                let group_index = extract_i64(&args_list.get_item(args_list.len() - 1)?)?.unwrap_or(0);
+                // NOTE(review): the pattern is compiled on every call, and a pattern this
+                // regex engine rejects is skipped silently — confirm both are acceptable.
+                let Ok(regex) = Regex::new(&pattern) else {
+                    continue;
+                };
+                if let Some(captures) = regex.captures(&current) {
+                    // NOTE(review): a negative group index is cast with `as usize`; this
+                    // presumably just finds no such group — confirm.
+                    if let Some(value) = captures.get(group_index as usize) {
+                        current = value.as_str().to_string();
+                    }
+                }
+            }
+            // args: [delimiter, ..., index]; keeps the part at that (possibly negative) index.
+            Some("split") => {
+                let Some(args) = filter.get_item("args")? else {
+                    continue;
+                };
+                let Ok(args_list) = args.downcast::<PyList>() else {
+                    continue;
+                };
+                if args_list.len() < 2 {
+                    continue;
+                }
+                let delimiter = args_list.get_item(0)?.extract::<String>()?;
+                let index = extract_i64(&args_list.get_item(args_list.len() - 1)?)?.unwrap_or(0);
+                let parts: Vec<&str> = current.split(&delimiter).collect();
+                if let Some(value) = pick_indexed_item(&parts, index) {
+                    current = value.to_string();
+                }
+            }
+            // args: [from, ..., to]; plain substring replacement.
+            Some("replace") => {
+                let Some(args) = filter.get_item("args")? else {
+                    continue;
+                };
+                let Ok(args_list) = args.downcast::<PyList>() else {
+                    continue;
+                };
+                if args_list.len() < 2 {
+                    continue;
+                }
+                let from = args_list.get_item(0)?.extract::<String>()?;
+                let to = args_list.get_item(args_list.len() - 1)?.extract::<String>()?;
+                current = current.replace(&from, &to);
+            }
+            // args: the date format; text that cannot be parsed is left as is.
+            Some("dateparse") => {
+                let Some(args) = filter.get_item("args")? else {
+                    continue;
+                };
+                let format = args.str()?.to_str()?.to_string();
+                if let Some(value) = format_date_value(&current, &format) {
+                    current = value;
+                }
+            }
+            // English relative time ("3 hours ago") converted to a timestamp.
+            Some("date_en_elapsed_parse") => {
+                if let Some(value) = parse_english_elapsed_date(&current) {
+                    current = value;
+                }
+            }
+            Some("strip") => {
+                current = current.trim().to_string();
+            }
+            // Without args: strips leading whitespace; with args: strips the given characters.
+            Some("lstrip") => {
+                let Some(args) = filter.get_item("args")? else {
+                    current = current.trim_start().to_string();
+                    continue;
+                };
+                current = lstrip_text(&current, &args)?;
+            }
+            // args: text to put in front of the current value.
+            Some("appendleft") => {
+                let Some(args) = filter.get_item("args")? else {
+                    continue;
+                };
+                current = format!("{}{}", args.str()?.to_str()?, current);
+            }
+            // args: query parameter name; the value becomes empty when it is not found.
+            Some("querystring") => {
+                let Some(args) = filter.get_item("args")? else {
+                    continue;
+                };
+                current = query_param_value(&current, args.str()?.to_str()?).unwrap_or_default();
+            }
+            _ => {}
+        }
+    }
+    Ok(Some(current.trim().to_string()))
+}
+
+/// Strips leading characters following Python's `str.lstrip(chars)`: `args` (or the
+/// first element when it is a list) is the set of characters to remove. An empty
+/// set strips leading whitespace instead.
+fn lstrip_text(current: &str, args: &Bound<'_, PyAny>) -> PyResult<String> {
+    let chars = match args.downcast::<PyList>() {
+        Ok(args_list) if args_list.is_empty() => String::new(),
+        Ok(args_list) => args_list.get_item(0)?.str()?.to_str()?.to_string(),
+        Err(_) => args.str()?.to_str()?.to_string(),
+    };
+    let stripped = if chars.is_empty() {
+        current.trim_start()
+    } else {
+        current.trim_start_matches(|ch: char| chars.contains(ch))
+    };
+    Ok(stripped.to_string())
+}
+
+/// Parses date text with the site-specific `format` and returns it as a
+/// `%Y-%m-%d %H:%M:%S` string.
+///
+/// Newlines become spaces and the text is trimmed first. `now` (any case) yields
+/// the current local time. A date-only match is placed at midnight. When `format`
+/// does not match, the common fallback formats are tried. Empty or unparsable
+/// text yields `None`.
+fn format_date_value(value: &str, format: &str) -> Option<String> {
+    const OUTPUT_FORMAT: &str = "%Y-%m-%d %H:%M:%S";
+    let flattened = value.replace('\n', " ");
+    let cleaned = flattened.trim();
+    if cleaned.is_empty() {
+        return None;
+    }
+    if cleaned.eq_ignore_ascii_case("now") {
+        return Some(Local::now().format(OUTPUT_FORMAT).to_string());
+    }
+    if let Ok(datetime) = NaiveDateTime::parse_from_str(cleaned, format) {
+        return Some(datetime.format(OUTPUT_FORMAT).to_string());
+    }
+    if let Ok(date) = NaiveDate::parse_from_str(cleaned, format) {
+        let midnight = NaiveTime::from_hms_opt(0, 0, 0)?;
+        return Some(date.and_time(midnight).format(OUTPUT_FORMAT).to_string());
+    }
+    parse_common_date_value(cleaned)
+}
+
+/// Tries the date formats commonly seen on sites, as the compatibility path after
+/// `dateparse` failed: RFC 3339 first, then a fixed list of formats, each tried as
+/// a date-time and then as a date (placed at midnight). Returns the timestamp as
+/// `%Y-%m-%d %H:%M:%S`, or `None` when nothing matches.
+fn parse_common_date_value(value: &str) -> Option<String> {
+    const OUTPUT_FORMAT: &str = "%Y-%m-%d %H:%M:%S";
+    const CANDIDATE_FORMATS: [&str; 6] = [
+        "%Y-%m-%d %H:%M:%S",
+        "%Y-%m-%d%H:%M:%S",
+        "%Y-%m-%d %H:%M",
+        "%Y-%m-%d",
+        "%b %d %Y, %H:%M",
+        "%H:%M:%S%d/%m/%Y",
+    ];
+    if let Ok(timestamp) = DateTime::parse_from_rfc3339(value) {
+        return Some(timestamp.format(OUTPUT_FORMAT).to_string());
+    }
+    for pattern in CANDIDATE_FORMATS {
+        let parsed = match NaiveDateTime::parse_from_str(value, pattern) {
+            Ok(datetime) => datetime,
+            Err(_) => match NaiveDate::parse_from_str(value, pattern) {
+                Ok(date) => date.and_time(NaiveTime::from_hms_opt(0, 0, 0)?),
+                Err(_) => continue,
+            },
+        };
+        return Some(parsed.format(OUTPUT_FORMAT).to_string());
+    }
+    None
+}
+
+/// Parses the relative publish time used by English sites such as IPT
+/// (for example "3 hours ago") into a `%Y-%m-%d %H:%M:%S` local timestamp.
+///
+/// Text that already parses as a common absolute date is returned in normalized
+/// form. A month counts as 30 days and a year as 365 days. Returns `None` when the
+/// text is not a recognized relative time, or when the amount is so large that the
+/// duration or the resulting timestamp cannot be represented.
+fn parse_english_elapsed_date(value: &str) -> Option<String> {
+    if let Some(parsed) = parse_common_date_value(value) {
+        return Some(parsed);
+    }
+    let captures = EN_ELAPSED_RE.captures(value)?;
+    let amount = captures.get(1)?.as_str().parse::<i64>().ok()?;
+    let unit = captures.get(2)?.as_str().to_ascii_lowercase();
+    // The amount comes from page text, so every step is checked: the panicking
+    // constructors and `-` operator would abort the whole parse on absurd values.
+    let duration = match unit.as_str() {
+        "second" => Duration::try_seconds(amount),
+        "minute" => Duration::try_minutes(amount),
+        "hour" => Duration::try_hours(amount),
+        "day" => Duration::try_days(amount),
+        "week" => Duration::try_weeks(amount),
+        "month" => amount.checked_mul(30).and_then(Duration::try_days),
+        "year" => amount.checked_mul(365).and_then(Duration::try_days),
+        _ => None,
+    }?;
+    let published = Local::now().checked_sub_signed(duration)?;
+    Some(published.format("%Y-%m-%d %H:%M:%S").to_string())
+}
+
+/// Converts file-size text to a byte count, shared by the Rust HTML parsing code.
+///
+/// Pure digit strings are taken as bytes. Otherwise commas and spaces are removed,
+/// the unit letters are stripped to read the number, and the number is scaled by
+/// powers of 1024 according to the unit found (PB/PiB down to KB/KiB). Empty or
+/// unparsable text yields 0.
+fn parse_filesize_text(text: &str) -> i64 {
+    let raw = text.trim();
+    if raw.is_empty() {
+        return 0;
+    }
+    if raw.chars().all(|ch| ch.is_ascii_digit()) {
+        return raw.parse::<i64>().unwrap_or(0);
+    }
+    let normalized = raw.replace([',', ' '], "").to_uppercase();
+    let numeric_part = FILESIZE_UNIT_RE.replace_all(&normalized, "");
+    let Ok(amount) = numeric_part.parse::<f64>() else {
+        return 0;
+    };
+    let has_unit = |decimal: &str, binary: &str| {
+        normalized.contains(decimal) || normalized.contains(binary)
+    };
+    // Largest unit wins; no recognized unit leaves the number unscaled.
+    let scaled = if has_unit("PB", "PIB") {
+        amount * 1024_f64.powi(5)
+    } else if has_unit("TB", "TIB") {
+        amount * 1024_f64.powi(4)
+    } else if has_unit("GB", "GIB") {
+        amount * 1024_f64.powi(3)
+    } else if has_unit("MB", "MIB") {
+        amount * 1024_f64.powi(2)
+    } else if has_unit("KB", "KIB") {
+        amount * 1024_f64
+    } else {
+        amount
+    };
+    scaled.round() as i64
+}
+
+/// Renders an element's text with whitespace normalized, aiming to stay close to PyQuery.text() output.
+fn normalize_element_text(element: ElementRef<'_>, remove_selectors: &[Selector]) -> String {
+    let mut rendered = String::new();
+    for node in element.descendants() {
+        let Some(text_node) = node.value().as_text() else {
+            continue; // only text nodes contribute
+        };
+        if should_skip_text_node(
+            node.parent().and_then(ElementRef::wrap),
+            element,
+            remove_selectors,
+        ) {
+            continue; // text sits under an element matched by remove_selectors
+        }
+        rendered.push_str(text_node);
+    }
+    normalize_whitespace(&rendered)
+}
+
+/// Collapses whitespace runs as PyQuery.text() does, keeping adjacent text nodes directly concatenated.
+fn normalize_whitespace(value: &str) -> String {
+    value.split_whitespace().collect::<Vec<&str>>().join(" ") // also drops leading/trailing whitespace
+}
+
+/// Reports whether a text node lies inside a subtree (below `root`) matched by one of the remove selectors.
+fn should_skip_text_node(
+    mut parent: Option<ElementRef<'_>>,
+    root: ElementRef<'_>,
+    remove_selectors: &[Selector],
+) -> bool {
+    while let Some(element) = parent {
+        if element == root {
+            return false; // reached the root without meeting a removed ancestor
+        }
+        if remove_selectors.iter().any(|selector| selector.matches(&element)) {
+            return true;
+        }
+        parent = element.parent().and_then(ElementRef::wrap); // walk up one ancestor
+    }
+    false
+}
+
+/// Reports whether `selector_text` matches anything inside `row`; `*` always counts as a match.
+fn selector_exists(row: ElementRef<'_>, selector_text: &str) -> PyResult<bool> {
+    if selector_text == "*" {
+        return Ok(true);
+    }
+    Ok(select_site_elements(row, selector_text)
+        .map(|elements| !elements.is_empty())
+        .unwrap_or(false)) // a failed or absent selection is treated as "no match"
+}
+
+/// Joins the site `domain` with a details/download `link` to form the final URL.
+fn normalize_site_link(domain: &str, link: &str, protocol_relative: bool) -> String {
+    if link.starts_with("http") || link.starts_with("magnet") { // already absolute: keep unchanged
+        return link.to_string();
+    }
+    if protocol_relative && link.starts_with("//") {
+        let scheme = domain.split(':').next().unwrap_or("http"); // text before the first ':' of domain
+        return format!("{scheme}:{link}");
+    }
+    if !protocol_relative { // a link that already embeds the site host only needs a scheme
+        if let Ok(base) = Url::parse(&standardize_base_url(domain)) {
+            if let Some(host) = base.host_str() {
+                if link.contains(host) {
+                    if link.starts_with('/') {
+                        return format!("{}:{link}", base.scheme());
+                    }
+                    return format!("{}://{link}", base.scheme());
+                }
+            }
+        }
+    }
+    if let Some(stripped) = link.strip_prefix('/') { // presumably domain ends with '/' — confirm
+        format!("{domain}{stripped}")
+    } else {
+        format!("{domain}{link}")
+    }
+}
+
+/// Renders a site field template with MiniJinja, mirroring Python jinja2 `Template.render(fields=...)`.
+fn render_jinja_template(template: &str, fields: &BTreeMap<String, String>) -> Option<String> {
+    let mut env = Environment::new();
+    env.set_undefined_behavior(UndefinedBehavior::Chainable); // presumably tolerates missing fields — confirm
+    env.render_str(template, context! { fields => fields }).ok() // render errors become None
+}
+
+/// Reports whether the text contains a Jinja marker; a cheap pre-filter for templates embedded in field values.
+fn contains_jinja_syntax(value: &str) -> bool {
+    value.contains("{{") || value.contains("{%") || value.contains("{#") // expression, statement, comment
+}
+
+/// Reads the list of category IDs stored under `key` in the category config.
+fn category_ids_for_field(category: &Bound<'_, PyDict>, key: &str) -> PyResult<Vec<String>> {
+    let Some(list_obj) = category.get_item(key)? else {
+        return Ok(Vec::new()); // key absent
+    };
+    let Ok(list) = list_obj.downcast::<PyList>() else {
+        return Ok(Vec::new()); // value is not a list
+    };
+    let mut values = Vec::new();
+    for item in list.iter() {
+        let dict = item.downcast::<PyDict>()?; // a non-dict entry is returned as an error
+        if let Some(id) = get_optional_string(dict, "id")? {
+            values.push(id);
+        }
+    }
+    Ok(values)
+}
+
+/// Returns the first value of query parameter `key` found in a URL or URL-like text.
+fn query_param_value(text: &str, key: &str) -> Option<String> {
+    let query = if let Ok(url) = Url::parse(text) {
+        url.query().unwrap_or("").to_string()
+    } else { // not parseable as a URL: take the text after '?' and before any '#'
+        text.split_once('?')
+            .map(|(_, query)| query.split('#').next().unwrap_or("").to_string())
+            .unwrap_or_default()
+    };
+    form_urlencoded::parse(query.as_bytes())
+        .find(|(param_key, _)| param_key == key)
+        .map(|(_, value)| value.to_string())
+}
+
+/// Standardizes a base URL; intended to stay consistent with Python `UrlUtils.standardize_base_url`.
+fn standardize_base_url(host: &str) -> String {
+    let mut value = host.to_string();
+    if !value.ends_with('/') { // ensure a trailing slash
+        value.push('/');
+    }
+    if !value.starts_with("http://") && !value.starts_with("https://") { // default scheme is http
+        value = format!("http://{value}");
+    }
+    value
+}
--- a/rust/moviepilot_rust/src/lib.rs
+++ b/rust/moviepilot_rust/src/lib.rs
@@ -1,4 +1,6 @@
 mod filter;
+mod indexer;
+mod utils;

 use pyo3::prelude::*;

@@ -13,5 +15,6 @@ fn is_available() -> bool {
 fn moviepilot_rust(m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_function(wrap_pyfunction!(is_available, m)?)?;
    m.add_function(wrap_pyfunction!(filter::parse_filter_rule_fast, m)?)?;
+    m.add_function(wrap_pyfunction!(indexer::parse_indexer_torrents_fast, m)?)?; // indexer fast path
    Ok(())
 }
--- a/rust/moviepilot_rust/src/utils.rs
+++ b/rust/moviepilot_rust/src/utils.rs
@@ -0,0 +1,46 @@
+use pyo3::prelude::*;
+use pyo3::types::{PyAny, PyDict};
+
+/// Reads an optional string from a Python dict; a missing key or a `None` value yields `Ok(None)`.
+pub(crate) fn get_optional_string(dict: &Bound<'_, PyDict>, key: &str) -> PyResult<Option<String>> {
+    let Some(value) = dict.get_item(key)? else {
+        return Ok(None);
+    };
+    if value.is_none() {
+        return Ok(None);
+    }
+    Ok(Some(value.str()?.to_str()?.to_string())) // any other value goes through Python str()
+}
+
+/// Reads an optional integer from a Python dict, also accepting values whose str() form parses as i64.
+pub(crate) fn get_optional_i64(dict: &Bound<'_, PyDict>, key: &str) -> PyResult<Option<i64>> {
+    let Some(value) = dict.get_item(key)? else {
+        return Ok(None);
+    };
+    if value.is_none() {
+        return Ok(None);
+    }
+    if let Ok(parsed) = value.extract::<i64>() { // direct integer extraction first
+        return Ok(Some(parsed));
+    }
+    let text = value.str()?.to_str()?.trim().to_string(); // fall back to the trimmed str() form
+    if text.is_empty() {
+        return Ok(None);
+    }
+    Ok(text.parse::<i64>().ok()) // unparseable text yields None, not an error
+}
+
+/// Converts a Python object to `i64`, to accept config indices written as either strings or numbers.
+pub(crate) fn extract_i64(value: &Bound<'_, PyAny>) -> PyResult<Option<i64>> {
+    if value.is_none() {
+        return Ok(None);
+    }
+    if let Ok(parsed) = value.extract::<i64>() { // direct integer extraction first
+        return Ok(Some(parsed));
+    }
+    let text = value.str()?.to_str()?.trim().to_string(); // fall back to the trimmed str() form
+    if text.is_empty() {
+        return Ok(None);
+    }
+    Ok(text.parse::<i64>().ok()) // unparseable text yields None, not an error
+}
--- a/scripts/benchmark_indexer_rust.py
+++ b/scripts/benchmark_indexer_rust.py
@@ -0,0 +1,217 @@
+#!/usr/bin/env python3
+"""
+Benchmark SiteSpider indexer parsing with real MoviePilot-Build site configs.
+"""
+
+import argparse
+import copy
+import contextlib
+import io
+import statistics
+import sys
+import time
+from pathlib import Path
+from typing import Callable, Dict, List
+
+import yaml
+
+PROJECT_ROOT = Path(__file__).resolve().parents[1]  # parent of the directory holding this script
+if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))  # lets the `app` imports below resolve when run as a script
+
+from app.modules.indexer import spider as spider_module
+from app.modules.indexer.spider import SiteSpider
+
+
+def _load_site_config(sites_dir: Path, site_id: str) -> dict:
+    """
+    Load the given site's config from the MoviePilot-Build sites directory; raises FileNotFoundError if absent.
+    """
+    candidates = list(sites_dir.rglob(f"{site_id}.yml"))  # recursive search for <site_id>.yml
+    if not candidates:
+        raise FileNotFoundError(f"找不到站点配置：{site_id}")
+    with candidates[0].open("r", encoding="utf-8") as file:  # first match is used if several exist
+        return yaml.safe_load(file)
+
+
+def _nexus_row(
+        torrent_id: int,
+        title: str,
+        category: str = "402",
+        date_text: str = "2025-05-01 12:13:14",
+        free_class: str = "pro_free",
+) -> str:
+    """
+    Build a single HTML table row that NexusPhp-style site configs can parse.
+    """
+    return f"""
+    <tr>
+      <td><a href="?cat={category}">cat</a></td>
+      <td>
+        <table class="torrentname">
+          <tr>
+            <td class="embedded">
+              <a href="details.php?id={torrent_id}" title="{title}">{title}.fallback</a>
+              <a href="download.php?id={torrent_id}">download</a>
+              <img class="{free_class}" />
+              <img class="hitandrun" />
+              <font class="subtitle">Desc {torrent_id} <span>remove</span></font>
+              <span class="tags">标签{torrent_id}</span>
+            </td>
+          </tr>
+        </table>
+      </td>
+      <td></td>
+      <td><span title="{date_text}">{date_text}</span></td>
+      <td>1.5 GB</td>
+      <td>{torrent_id + 10}</td>
+      <td>{torrent_id % 7}</td>
+      <td>{torrent_id % 11}</td>
+    </tr>
+    """
+
+
+def _build_nexus_html(rows: int) -> str:
+    """
+    Build a multi-row NexusPhp table HTML used to drive the real SiteSpider parsing path.
+    """
+    body = "\n".join(
+        _nexus_row(1000 + index, f"Benchmark.Title.{index}")  # torrent ids start at 1000
+        for index in range(rows)
+    )
+    return f"<table class=\"torrents\">{body}</table>"
+
+
+def _build_ipt_html(rows: int) -> str:
+    """
+    Build the table HTML used with the IPT config, covering the lstrip and English relative-time filters.
+    """
+    body = "\n".join(
+        f"""
+        <tr>
+          <td><a href="/t/{1000 + index}">Benchmark.IPT.{index}</a>
+              <a href="/download.php/{1000 + index}">download</a><span class="free">Free</span></td>
+          <td><div>Uploaded | {index % 5 + 1} hours ago</div></td>
+          <td></td><td></td><td></td>
+          <td>{1 + index % 9}.2 GB</td>
+          <td>{index % 13}</td>
+          <td>{index + 20}</td>
+          <td>{index % 7}</td>
+        </tr>
+        """
+        for index in range(rows)
+    )
+    return f"<table id=\"torrents\"><tbody>{body}</tbody></table>"
+
+
+def _prepare_cases(sites_dir: Path, rows: int) -> List[Dict[str, object]]:
+    """
+    Build the benchmark cases, each pairing a real site config with a generated HTML fixture.
+    """
+    cases = []
+    for site_id in ["agsvpt", "pttime", "chdbits"]:  # these three all use the NexusPhp fixture
+        config = _load_site_config(sites_dir, site_id)
+        cases.append({
+            "site": site_id,
+            "indexer": config,
+            "html": _build_nexus_html(rows),
+        })
+    cases.append({  # iptorrents gets its own table layout
+        "site": "iptorrents",
+        "indexer": _load_site_config(sites_dir, "iptorrents"),
+        "html": _build_ipt_html(rows),
+    })
+    return cases
+
+
+def _parse_with_rust(indexer: dict, html: str) -> List[dict]:
+    """
+    Parse through SiteSpider's default entry point; the Rust fast path is hit when the Rust extension is available.
+    """
+    return SiteSpider(copy.deepcopy(indexer)).parse(html)  # deepcopy: every run gets its own config copy
+
+
+def _parse_with_python(indexer: dict, html: str) -> List[dict]:
+    """
+    Temporarily disable the Rust fast path and reuse the same SiteSpider.parse entry point to time the PyQuery path.
+    """
+    original = spider_module.rust_accel.parse_indexer_torrents
+    spider_module.rust_accel.parse_indexer_torrents = lambda *args, **kwargs: None  # None means "Rust unavailable"
+    try:
+        return SiteSpider(copy.deepcopy(indexer)).parse(html)
+    finally:
+        spider_module.rust_accel.parse_indexer_torrents = original  # always restore the real function
+
+
+def _time_parser(parser: Callable[[dict, str], List[dict]], indexer: dict, html: str, loops: int) -> float:
+    """
+    Run the parser `loops` times and return the total elapsed time in seconds.
+    """
+    buffer = io.StringIO()
+    with contextlib.redirect_stdout(buffer), contextlib.redirect_stderr(buffer):  # keep parser output off the console
+        started = time.perf_counter()
+        for _ in range(loops):
+            parser(indexer, html)
+    return time.perf_counter() - started
+
+
+def _median_time(parser: Callable[[dict, str], List[dict]], indexer: dict, html: str, loops: int, repeats: int) -> float:
+    """
+    Take `repeats` timing samples and return their median, reducing the effect of single-run jitter.
+    """
+    samples = [_time_parser(parser, indexer, html, loops) for _ in range(repeats)]
+    return statistics.median(samples)
+
+
+def run_benchmark(sites_dir: Path, rows: int, loops: int, repeats: int) -> None:
+    """
+    Run the benchmark over the real SiteSpider.parse path and print the Rust vs PyQuery comparison.
+    """
+    cases = _prepare_cases(sites_dir, rows)
+    print(f"sites_dir={sites_dir}")
+    print(f"rows={rows} loops={loops} repeats={repeats}")
+    print("site, rows, rust_ms, python_ms, speedup")  # times are per sample of `loops` parses
+    for case in cases:
+        buffer = io.StringIO()
+        with contextlib.redirect_stdout(buffer), contextlib.redirect_stderr(buffer):
+            rust_result = _parse_with_rust(case["indexer"], case["html"])  # untimed sanity run
+            python_result = _parse_with_python(case["indexer"], case["html"])
+        if not rust_result or not python_result:
+            raise RuntimeError(f"{case['site']} benchmark fixture did not parse any rows")
+
+        rust_time = _median_time(_parse_with_rust, case["indexer"], case["html"], loops, repeats)
+        python_time = _median_time(_parse_with_python, case["indexer"], case["html"], loops, repeats)
+        speedup = python_time / rust_time if rust_time else 0  # 0 guards against division by zero
+        print(
+            f"{case['site']}, {len(rust_result)}, "
+            f"{rust_time * 1000:.2f}, {python_time * 1000:.2f}, {speedup:.2f}x"
+        )
+
+
+def parse_args() -> argparse.Namespace:
+    """
+    Parse the command-line arguments.
+    """
+    parser = argparse.ArgumentParser(description="Benchmark MoviePilot Rust indexer parser")
+    parser.add_argument(
+        "--sites-dir",
+        type=Path,
+        default=Path("/Users/jxxghp/PycharmProjects/MoviePilot-Build/sites"),  # NOTE(review): developer-local path
+        help="MoviePilot-Build sites directory",
+    )
+    parser.add_argument("--rows", type=int, default=80, help="Rows per fixture page")
+    parser.add_argument("--loops", type=int, default=50, help="Loops per timing sample")
+    parser.add_argument("--repeats", type=int, default=5, help="Timing samples per parser")
+    return parser.parse_args()
+
+
+def main() -> None:
+    """
+    Entry point: parse the CLI arguments and run the benchmark.
+    """
+    args = parse_args()
+    run_benchmark(args.sites_dir, args.rows, args.loops, args.repeats)
+
+
+if __name__ == "__main__":  # run only when executed as a script
+    main()
--- a/tests/test_rust_accel.py
+++ b/tests/test_rust_accel.py
@@ -1,5 +1,7 @@
 import pytest

+from app.modules.indexer.spider import SiteSpider
+from app.schemas.types import MediaType
 from app.utils import rust_accel


@@ -25,3 +27,219 @@ def test_rust_filter_rule_parser_handles_parentheses_and_or():
    result = rust_accel.parse_filter_rule("CNSUB & (4K | 1080P) & !BLU")

    assert result == [[["CNSUB", "and", ["4K", "or", "1080P"]], "and", ["not", "BLU"]]]
+
+
+def test_rust_indexer_parser_handles_jinja_pyquery_filters_and_links():
+    """
+    The Rust indexer parser should cover the Jinja, PyQuery selectors and filters of ordinary site configs.
+    """
+    html = """
+    <table class="torrents">
+      <tr>
+        <td><a href="?cat=402">TV</a></td>
+        <td>
+          <table class="torrentname">
+            <tr>
+              <td class="embedded">
+                <a href="details.php?id=100" title="Optional.Title">Default.Title</a>
+                <a href="download.php?id=100">DL</a>
+                <a href="https://www.imdb.com/title/tt1234567/">IMDb</a>
+                <font class="subtitle">Main description <span>remove</span><a>link</a></font>
+                <span class="label">FREE</span>
+                <img class="hitandrun" />
+              </td>
+            </tr>
+          </table>
+        </td>
+        <td></td>
+        <td><span title="2025-05-01 12:13:14">1 hour ago</span></td>
+        <td>1.5 GB</td>
+        <td>1,234</td>
+        <td>5/7</td>
+        <td>9</td>
+      </tr>
+    </table>
+    """
+    indexer = {
+        "id": "unit",
+        "name": "Unit",
+        "domain": "https://example.com/",
+        "search": {"paths": [{"path": "torrents.php"}]},
+        "category": {
+            "movie": [{"id": "401"}],
+            "tv": [{"id": "402"}],
+        },
+        "torrents": {
+            "list": {"selector": 'table.torrents > tr:has("table.torrentname")'},
+            "fields": {
+                "title_default": {"selector": 'a[href*="details.php?id="]'},
+                "title_optional": {
+                    "selector": 'a[title][href*="details.php?id="]',
+                    "attribute": "title",
+                },
+                "title": {
+                    "text": "{% if fields['title_optional'] %}{{ fields['title_optional'] }}{% else %}"
+                            "{{ fields['title_default'] }}{% endif %}"
+                },
+                "details": {"selector": 'a[href*="details.php?id="]', "attribute": "href"},
+                "download": {"selector": 'a[href*="download.php?id="]', "attribute": "href"},
+                "imdbid": {
+                    "selector": 'a[href*="imdb.com/title/tt"]',
+                    "attribute": "href",
+                    "filters": [{"name": "re_search", "args": ["tt\\d+", 0]}],
+                },
+                "date_elapsed": {"selector": "td:nth-child(4) > span"},
+                "date_added": {"selector": "td:nth-child(4) > span", "attribute": "title"},
+                "date": {
+                    "text": "{% if fields['date_elapsed'] or fields['date_added'] %}"
+                            "{{ fields['date_added'] if fields['date_added'] else fields['date_elapsed'] }}"
+                            "{% else %}now{% endif %}",
+                    "filters": [{"name": "dateparse", "args": "%Y-%m-%d %H:%M:%S"}],
+                },
+                "size": {"selector": "td:nth-child(5)"},
+                "seeders": {"selector": "td:nth-child(6)"},
+                "leechers": {"selector": "td:nth-child(7)"},
+                "grabs": {"selector": "td:nth-child(8)"},
+                "downloadvolumefactor": {"case": {"img.free": 0, "*": 1}},
+                "uploadvolumefactor": {"case": {"*": 1}},
+                "description": {
+                    "selector": "font.subtitle",
+                    "remove": "span,a",
+                },
+                "labels": {"selector": "span.label"},
+                "hr": {"selector": "img.hitandrun"},
+                "category": {
+                    "selector": 'a[href*="?cat="]',
+                    "attribute": "href",
+                    "filters": [{"name": "querystring", "args": "cat"}],
+                },
+            },
+        },
+    }
+
+    result = SiteSpider(indexer, mtype=MediaType.TV).parse(html)  # uses the Rust fast path when available
+
+    assert result == [{
+        "page_url": "https://example.com/details.php?id=100",
+        "enclosure": "https://example.com/download.php?id=100",
+        "downloadvolumefactor": 1.0,  # the row has no img.free, so the "*" case applies
+        "uploadvolumefactor": 1.0,
+        "pubdate": "2025-05-01 12:13:14",
+        "title": "Optional.Title",
+        "description": "Main description",
+        "imdbid": "tt1234567",
+        "size": 1610612736,  # 1.5 GB expressed in bytes
+        "peers": 5,  # the leechers cell reads "5/7"
+        "seeders": 1234,  # the seeders cell reads "1,234"
+        "grabs": 9,
+        "date_elapsed": "1 hour ago",
+        "labels": ["FREE"],
+        "hit_and_run": True,
+        "category": "电视剧",
+    }]
+
+
+def test_rust_indexer_parser_handles_default_values_and_template_arithmetic():
+    """
+    The Rust indexer parser should support defualt_value (key as spelled), the Jinja int filter and template arithmetic.
+    """
+    html = """
+    <table class="torrents">
+      <tr>
+        <td><a href="details.php?id=200">Default.Title</a></td>
+      </tr>
+    </table>
+    """
+    fields = {
+        "title_default": {"selector": 'a[href*="details.php?id="]'},
+        "missing_days": {"defualt_value": "2", "selector": "span.missing"},  # no span.missing in the HTML
+        "title": {"text": "{{ fields['title_default'] }} {{ (fields['missing_days']|int)*86400 }}"},
+    }
+
+    result = rust_accel.parse_indexer_torrents(
+        html_text=html,
+        domain="https://example.com/",
+        list_config={"selector": "table.torrents > tr"},
+        fields=fields,
+        category=None,
+        result_num=100,
+    )
+
+    assert result == [{"title": "Default.Title 172800"}]  # 2 * 86400 == 172800
+
+
+def test_rust_indexer_parser_handles_lstrip_and_english_elapsed_date():
+    """
+    The Rust indexer parser should cover the lstrip and date_en_elapsed_parse filters used by the IPT config.
+    """
+    html = """
+    <table id="torrents">
+      <tr>
+        <td><a href="/t/123">Title</a><a href="/download.php/123">download</a></td>
+        <td><div>Uploaded | 2 hours ago</div></td>
+      </tr>
+    </table>
+    """
+    fields = {
+        "title": {"selector": 'a[href*="/t/"]'},
+        "download": {
+            "selector": 'a[href*="/download.php/"]',
+            "attribute": "href",
+            "filters": [{"name": "lstrip", "args": ["/"]}],
+        },
+        "date": {
+            "selector": "td:nth-child(2) > div",
+            "filters": [
+                {"name": "split", "args": ["|", 1]},
+                {"name": "date_en_elapsed_parse"},
+            ],
+        },
+    }
+
+    result = rust_accel.parse_indexer_torrents(
+        html_text=html,
+        domain="https://iptorrents.com/",
+        list_config={"selector": 'table[id="torrents"] tr'},
+        fields=fields,
+        category=None,
+        result_num=100,
+    )
+
+    assert len(result) == 1
+    assert result[0]["title"] == "Title"
+    assert result[0]["enclosure"] == "https://iptorrents.com/download.php/123"
+    assert result[0]["pubdate"]  # relative to "now", so only check that a value was produced
+
+
+def test_rust_indexer_parser_prefers_date_added_when_date_template_returns_elapsed_text():
+    """
+    When the date template yields relative-time text, the Rust indexer parser should use the standard time in date_added.
+    """
+    html = """
+    <table class="torrents">
+      <tr>
+        <td><span title="2025-06-02 03:04:05">1 hour ago</span></td>
+      </tr>
+    </table>
+    """
+    fields = {
+        "date_elapsed": {"selector": "span"},
+        "date_added": {"selector": "span", "attribute": "title"},
+        "date": {
+            "text": "{% if fields['date_elapsed'] or fields['date_added'] %}"
+                    "{{ fields['date_elapsed'] if fields['date_elapsed'] else fields['date_added'] }}"
+                    "{% else %}now{% endif %}",
+            "filters": [{"name": "dateparse", "args": "%Y-%m-%d %H:%M:%S"}],
+        },
+    }
+
+    result = rust_accel.parse_indexer_torrents(
+        html_text=html,
+        domain="https://example.com/",
+        list_config={"selector": "table.torrents > tr"},
+        fields=fields,
+        category=None,
+        result_num=100,
+    )
+
+    assert result[0]["pubdate"] == "2025-06-02 03:04:05"  # the span's title value, not "1 hour ago"