perf: optimize rust acceleration paths

Rust vs Python benchmark results:

- RSS: Rust 0.299 ms/loop vs Python 7.913 ms/loop, 26.47x faster

- Filter: Rust 12.740 ms/loop vs Python 57.187 ms/loop, 4.49x faster

- MetaInfo: Rust 64.680 ms/loop vs Python 316.158 ms/loop, 4.89x faster

- Indexer agsvpt: Rust 145.76 ms vs Python 3686.50 ms, 25.29x faster

- Indexer pttime: Rust 166.51 ms vs Python 4019.87 ms, 24.14x faster

- Indexer chdbits: Rust 161.17 ms vs Python 3604.28 ms, 22.36x faster

- Indexer iptorrents: Rust 77.82 ms vs Python 17615.52 ms, 226.36x faster

Validation:

- cargo fmt/check/test for rust/moviepilot_rust

- pytest Rust-related coverage: tests/test_rust_accel.py tests/test_torrent_filter.py tests/test_metainfo.py tests/test_indexer_spider_search_url.py tests/test_workflow_fetch_rss.py

- tests/run.py legacy suite

- pylint app/ --errors-only
This commit is contained in:
jxxghp
2026-05-23 19:35:55 +08:00
parent a6826e6a4e
commit 0bf228d29d
15 changed files with 1727 additions and 392 deletions

View File

@@ -1,4 +1,5 @@
from pathlib import Path
from functools import lru_cache
from typing import Tuple, List, Optional
import regex as re
@@ -123,12 +124,14 @@ def _build_meta_info(
return meta
def _rust_parse_options(custom_words: List[str] = None) -> dict:
@lru_cache(maxsize=1)
def _rust_default_parse_options() -> dict:
"""
收集 Rust Meta 解析所需的运行时配置,避免 Rust 层直接访问数据库和 settings
缓存 Rust Meta 默认解析配置,避免热路径反复读取配置并复制流媒体平台大表
"""
from app.core.meta.customization import CustomizationMatcher
from app.core.meta.releasegroup import ReleaseGroupsMatcher
from app.core.meta.streamingplatform import StreamingPlatforms
from app.db.systemconfig_oper import SystemConfigOper
from app.schemas.types import SystemConfigKey
@@ -144,17 +147,42 @@ def _rust_parse_options(custom_words: List[str] = None) -> dict:
customization = CustomizationMatcher._normalize_customization(
systemconfig.get(SystemConfigKey.Customization)
)
words = custom_words
if words is None:
words = systemconfig.get(SystemConfigKey.CustomIdentifiers) or []
return {
"custom_words": words or [],
"custom_words": systemconfig.get(SystemConfigKey.CustomIdentifiers) or [],
"media_exts": settings.RMT_MEDIAEXT + settings.RMT_SUBEXT + settings.RMT_AUDIOEXT,
"release_groups": release_groups,
"customization": customization,
"streaming_platforms": StreamingPlatforms()._lookup_cache,
}
@lru_cache(maxsize=256)
def _rust_custom_parse_options(custom_words: Tuple[str, ...]) -> dict:
    """
    Build the Rust meta-parsing options for one specific set of custom words.

    The result is a shallow copy of the cached default options with
    ``custom_words`` replaced by a list built from the given tuple. Results are
    memoized per tuple, so repeated calls with the same words return the same
    dict object; callers should treat it as read-only.
    """
    return {**_rust_default_parse_options(), "custom_words": list(custom_words)}
def _rust_parse_options(custom_words: List[str] = None) -> dict:
    """
    Return the runtime options handed to the Rust meta parser.

    When ``custom_words`` is ``None`` the cached default options are returned.
    Any other value (including an empty list) selects the cached variant keyed
    by the tuple of those words.
    """
    if custom_words is not None:
        return _rust_custom_parse_options(tuple(custom_words or []))
    return _rust_default_parse_options()
def clear_rust_parse_options_cache() -> None:
    """
    Drop every cached Rust meta-parsing options dict (the default one and all
    per-custom-word variants) so the next lookup rebuilds them.
    """
    for cached_builder in (_rust_default_parse_options, _rust_custom_parse_options):
        cached_builder.cache_clear()
def _meta_from_rust(parsed: dict) -> Optional[MetaBase]:
"""
将 Rust 解析结果灌回现有 MetaVideo/MetaAnime 对象,保留下游属性和方法兼容性。

View File

@@ -228,6 +228,37 @@ class RssHelper:
},
}
def __decode_fast_text(self, raw_data: bytes, ret) -> Optional[str]:
    """
    Quickly decode the raw response body for the Rust parsing fast path.

    Tries the encoding declared on the response object (``ret.encoding``, if
    present and non-empty) first, then UTF-8; each distinct encoding is tried
    at most once.

    :param raw_data: raw response bytes
    :param ret: response-like object; only its optional ``encoding`` attribute is read
    :return: the decoded text, or ``None`` when no candidate encoding works
    """
    tried = set()
    for encoding in (getattr(ret, "encoding", None), "utf-8"):
        if not encoding or encoding in tried:
            continue
        tried.add(encoding)
        try:
            return raw_data.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # LookupError is raised when the declared charset name is unknown
            # to Python; treat it like a failed decode and try the next
            # candidate instead of aborting the whole parse.
            continue
    return None
def __parse_with_rust(self, ret_xml: Optional[str]) -> Optional[list]:
    """
    Run the Rust RSS parser on already-decoded text.

    Returns ``None`` when the text is empty/blank, does not start with ``<``
    after stripping, or when the Rust parser itself returns ``None``.
    Otherwise returns at most ``MAX_RSS_ITEMS`` items and logs a warning when
    the parser produced more than that.
    """
    stripped = ret_xml.strip() if ret_xml else ""
    if not stripped.startswith('<'):
        return None
    limit = self.MAX_RSS_ITEMS
    # Ask for one item beyond the limit so an overflow can be detected.
    items = rust_accel.parse_rss_items(ret_xml, limit + 1)
    if items is None:
        return None
    if len(items) > limit:
        logger.warning(f"RSS条目过多: 超过{self.MAX_RSS_ITEMS},仅处理前{self.MAX_RSS_ITEMS}")
    return items[:limit]
def parse(self, url, proxy: bool = False,
timeout: Optional[int] = 15, headers: dict = None, ua: str = None) -> Union[List[dict], None, bool]:
"""
@@ -270,21 +301,26 @@ class RssHelper:
return False
if raw_data:
try:
result = chardet.detect(raw_data)
encoding = result['encoding']
# 解码为字符串
ret_xml = raw_data.decode(encoding)
except Exception as e:
logger.debug(f"chardet解码失败{str(e)}")
# 探测utf-8解码
match = re.search(r'encoding\s*=\s*["\']([^"\']+)["\']', ret.text)
if match:
encoding = match.group(1)
if encoding:
ret_xml = raw_data.decode(encoding)
else:
ret.encoding = ret.apparent_encoding
ret_xml = self.__decode_fast_text(raw_data, ret)
rust_items = self.__parse_with_rust(ret_xml)
if rust_items is not None:
return rust_items
if not ret_xml:
try:
result = chardet.detect(raw_data)
encoding = result['encoding']
# 解码为字符串
ret_xml = raw_data.decode(encoding)
except Exception as e:
logger.debug(f"chardet解码失败{str(e)}")
# 探测utf-8解码
match = re.search(r'encoding\s*=\s*["\']([^"\']+)["\']', ret.text)
if match:
encoding = match.group(1)
if encoding:
ret_xml = raw_data.decode(encoding)
else:
ret.encoding = ret.apparent_encoding
if not ret_xml:
ret_xml = ret.text
@@ -299,11 +335,9 @@ class RssHelper:
logger.error("RSS内容不是有效的XML格式")
return False
rust_items = rust_accel.parse_rss_items(ret_xml, self.MAX_RSS_ITEMS + 1)
rust_items = self.__parse_with_rust(ret_xml)
if rust_items is not None:
if len(rust_items) > self.MAX_RSS_ITEMS:
logger.warning(f"RSS条目过多: 超过{self.MAX_RSS_ITEMS},仅处理前{self.MAX_RSS_ITEMS}")
return rust_items[:self.MAX_RSS_ITEMS]
return rust_items
# 使用lxml.etree解析XML
parser = None

View File

@@ -4,13 +4,14 @@ from functools import lru_cache
from typing import List, Tuple, Union, Dict, Optional
from app.core.context import TorrentInfo, MediaInfo
from app.core.metainfo import MetaInfo
from app.core.metainfo import MetaInfo, clear_rust_parse_options_cache, _rust_parse_options
from app.helper.rule import RuleHelper
from app.log import logger
from app.modules import _ModuleBase
from app.modules.filter.RuleParser import RuleParser
from app.modules.filter.builtin_rules import BUILTIN_RULE_SET
from app.schemas.types import ModuleType, OtherModulesType, SystemConfigKey
from app.utils import rust_accel
from app.utils.string import StringUtils
@@ -60,7 +61,12 @@ def _parse_publish_time(publish_time: str) -> Tuple[float, ...]:
class FilterModule(_ModuleBase):
CONFIG_WATCH = {SystemConfigKey.CustomFilterRules.value}
CONFIG_WATCH = {
SystemConfigKey.CustomFilterRules.value,
SystemConfigKey.CustomIdentifiers.value,
SystemConfigKey.CustomReleaseGroups.value,
SystemConfigKey.Customization.value,
}
# 保留一份只读内置规则定义,方便查询工具准确区分“内置规则”和“自定义规则”。
builtin_rule_set: Dict[str, dict] = deepcopy(BUILTIN_RULE_SET)
@@ -76,6 +82,13 @@ class FilterModule(_ModuleBase):
self.rule_set = deepcopy(self.builtin_rule_set)
self.__init_custom_rules()
def on_config_changed(self):
"""
React to a change in the watched configuration: clear the cached Rust
meta-parsing options, then re-run module initialisation.
"""
clear_rust_parse_options_cache()
self.init_module()
def __init_custom_rules(self):
"""
加载用户自定义规则,如跟内置规则冲突,以用户自定义规则为准
@@ -131,25 +144,37 @@ class FilterModule(_ModuleBase):
"""
if not rule_groups:
return torrent_list
parser = RuleParser()
# 同一轮过滤里,相同的优先级层级会被多个种子反复使用;按需解析并缓存,
# 既减少 pyparsing 开销,也保留原来“命中高优先级后不解析低层级”的容错行为。
parsed_rule_cache: Dict[str, Union[list, str]] = {}
# 查询规则表详情
groups = self.rulehelper.get_rule_group_by_media(media=mediainfo, group_names=rule_groups)
if groups:
for group in groups:
# 过滤种子
torrent_list = self.__filter_torrents(
rule_string=group.rule_string,
rule_name=group.name,
torrent_list=torrent_list,
mediainfo=mediainfo,
parser=parser,
parsed_rule_cache=parsed_rule_cache,
)
group_defs = [group.model_dump() if hasattr(group, "model_dump") else vars(group) for group in groups]
matched_orders = rust_accel.filter_torrents(
groups=group_defs,
torrent_list=torrent_list,
rule_set=self.rule_set,
mediainfo=mediainfo,
metainfo_options=_rust_parse_options() if self.__needs_metainfo_options(group_defs) else None,
)
ret_torrents = []
for index, pri_order in matched_orders:
torrent = torrent_list[index]
torrent.pri_order = pri_order
ret_torrents.append(torrent)
return ret_torrents
return torrent_list
def __needs_metainfo_options(self, groups: List[dict]) -> bool:
    """
    Tell whether any rule referenced by the given groups defines a
    ``size_range``, so the MetaInfo runtime options are only read when a size
    rule can actually be evaluated.

    :param groups: rule-group dicts; only their ``rule_string`` entry is read
    :return: True when at least one referenced rule has a truthy ``size_range``
    """
    rule_strings = [group.get("rule_string") for group in groups]
    referenced = {
        rule_id
        for text in rule_strings
        if text
        for rule_id in re.findall(r"[A-Za-z][A-Za-z0-9]*|[0-9]+[A-Za-z][A-Za-z0-9]*", text)
    }
    for rule_id in referenced:
        if self.rule_set.get(rule_id, {}).get("size_range"):
            return True
    return False
def __filter_torrents(self, rule_string: str, rule_name: str,
torrent_list: List[TorrentInfo],
mediainfo: MediaInfo,

View File

@@ -39,6 +39,31 @@ def parse_filter_rule(expression: str) -> Optional[list]:
return None
def filter_torrents(
    groups: list,
    torrent_list: list,
    rule_set: dict,
    mediainfo=None,
    metainfo_options: Optional[dict] = None,
) -> list:
    """
    Run the complete torrent filter through the Rust extension.

    :return: whatever ``filter_torrents_fast`` returns (original-list index and
        priority for each matching torrent)
    :raises RuntimeError: when the Rust extension is not available
    """
    if not _moviepilot_rust:
        raise RuntimeError(f"Rust 扩展不可用,无法执行种子过滤: {_import_error}")
    try:
        matched = _moviepilot_rust.filter_torrents_fast(
            groups,
            torrent_list,
            rule_set,
            mediainfo,
            metainfo_options or {},
        )
    except BaseException as err:
        # Give the shared helper a chance to translate the error first; if it
        # returns, the original exception is re-raised unchanged.
        _raise_non_rust_panic(err)
        raise
    else:
        return matched
def parse_indexer_torrents(
html_text: str,
domain: str,

View File

@@ -32,6 +32,21 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
[[package]]
name = "bit-set"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
dependencies = [
"bit-vec",
]
[[package]]
name = "bit-vec"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
[[package]]
name = "bitflags"
version = "2.11.1"
@@ -161,6 +176,17 @@ version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8"
[[package]]
name = "fancy-regex"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298"
dependencies = [
"bit-set",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "find-msvc-tools"
version = "0.1.9"
@@ -507,6 +533,7 @@ version = "0.1.0"
dependencies = [
"anitomy-pure",
"chrono",
"fancy-regex",
"inputx-pinyin",
"minijinja",
"once_cell",

View File

@@ -9,6 +9,7 @@ crate-type = ["cdylib"]
[dependencies]
anitomy-pure = "0.1"
fancy-regex = "0.14"
inputx-pinyin = "1.0.2"
minijinja = "2.20"
chrono = "0.4"

View File

@@ -1,6 +1,21 @@
use crate::metainfo::parse_total_episode_for_filter;
use crate::utils::{
get_optional_f64, get_optional_i64, get_optional_nonempty_string, get_string_list,
object_optional_f64, object_optional_i64, object_optional_string, object_string_list,
py_any_to_string_list,
};
use chrono::{Local, NaiveDateTime};
use fancy_regex::Regex as FancyRegex;
use once_cell::sync::Lazy;
use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use pyo3::types::{PyList, PyString};
use pyo3::types::{PyAny, PyDict, PyList, PyString};
use std::collections::{HashMap, HashSet};
use std::sync::Mutex;
// Process-wide cache of compiled regexes, keyed by the `(?i)`-prefixed pattern text.
static REGEX_CACHE: Lazy<Mutex<HashMap<String, FancyRegex>>> =
Lazy::new(|| Mutex::new(HashMap::new()));
// Factor (1024 * 1024) applied to the numbers of a `size_range` rule before comparison.
const SIZE_UNIT: f64 = 1024.0 * 1024.0;
#[derive(Clone, Debug)]
enum RuleExpr {
@@ -19,6 +34,70 @@ enum Token {
LParen,
RParen,
}
/// One rule group: the level expressions obtained by splitting its
/// `rule_string` on `>`, trimmed, with empty parts dropped, in original order.
#[derive(Clone)]
struct FilterGroup {
levels: Vec<String>,
}
/// Lookup table over the Python rule set.
struct RuleMatcher {
// Rule name -> the Python object stored under that name.
rules: HashMap<String, PyObject>,
// Union of all field names listed under any dict rule's `match` key.
match_fields: HashSet<String>,
}
/// Plain-Rust copy of the torrent attributes the filter reads.
struct TorrentSnapshot {
// `title` / `description` attributes; empty when no value was read.
title: String,
description: String,
// `labels` attribute as a list of strings.
labels: Vec<String>,
// Text values for the fields referenced by rules' `match` lists.
fields: HashMap<String, Vec<String>>,
// `size` attribute, 0.0 when absent.
size: f64,
// `seeders` attribute, 0 when absent.
seeders: i64,
// `downloadvolumefactor` attribute, `None` when absent.
downloadvolumefactor: Option<f64>,
// Whole minutes elapsed since `pubdate`; 0.0 when missing or unparseable.
pub_minutes: f64,
}
/// Upper-cased string values extracted from the Python media object.
struct MediaSnapshot {
// False when no media object (or Python `None`) was supplied.
available: bool,
// Attribute name -> upper-cased string values.
values: HashMap<String, Vec<String>>,
}
/// Run the complete torrent filter in Rust.
///
/// Every torrent in `torrent_list` is evaluated against all `groups`; the
/// result is a Python list of `(index, priority)` tuples for the torrents that
/// matched, where `index` is the position in `torrent_list`. An empty list is
/// returned when no group yields any level expression.
#[pyfunction]
#[pyo3(signature = (groups, torrent_list, rule_set, mediainfo=None, metainfo_options=None))]
pub(crate) fn filter_torrents_fast(
py: Python<'_>,
groups: &Bound<'_, PyList>,
torrent_list: &Bound<'_, PyList>,
rule_set: &Bound<'_, PyDict>,
mediainfo: Option<&Bound<'_, PyAny>>,
metainfo_options: Option<&Bound<'_, PyDict>>,
) -> PyResult<PyObject> {
let groups = parse_filter_groups(groups)?;
if groups.is_empty() {
return Ok(PyList::empty(py).into());
}
let matcher = RuleMatcher::from_py(rule_set)?;
let media = MediaSnapshot::from_py(mediainfo)?;
let results = PyList::empty(py);
// Both caches live for this call only and are shared across all torrents.
let mut parsed_rule_cache: HashMap<String, RuleExpr> = HashMap::new();
let mut episode_count_cache: HashMap<String, i64> = HashMap::new();
for (index, torrent_obj) in torrent_list.iter().enumerate() {
let torrent = TorrentSnapshot::from_py(&torrent_obj, &matcher.match_fields)?;
if let Some(priority) = match_torrent(
py,
&torrent,
&groups,
&matcher,
&media,
metainfo_options,
&mut parsed_rule_cache,
&mut episode_count_cache,
)? {
results.append((index, priority))?;
}
}
Ok(results.into())
}
#[pyfunction]
pub(crate) fn parse_filter_rule_fast(py: Python<'_>, expression: &str) -> PyResult<PyObject> {
let tokens = tokenize_rule(expression)?;
@@ -222,3 +301,625 @@ fn expr_binary_to_py(
list.append(expr_to_py(py, right)?)?;
Ok(list.into())
}
/// Convert the rule-group dicts passed from Python into `FilterGroup`s.
///
/// Each dict's `rule_string` is split on `>` into trimmed, non-empty level
/// expressions. Groups without a rule string, or without any level, are skipped.
fn parse_filter_groups(groups: &Bound<'_, PyList>) -> PyResult<Vec<FilterGroup>> {
    let mut parsed = Vec::new();
    for entry in groups.iter() {
        let group = entry.downcast::<PyDict>()?;
        let Some(rule_string) = get_optional_nonempty_string(group, "rule_string")? else {
            continue;
        };
        let levels: Vec<String> = rule_string
            .split('>')
            .filter_map(|level| {
                let level = level.trim();
                (!level.is_empty()).then(|| level.to_string())
            })
            .collect();
        if levels.is_empty() {
            continue;
        }
        parsed.push(FilterGroup { levels });
    }
    Ok(parsed)
}
impl RuleMatcher {
/// Build the lookup table from the Python rule set, keeping a reference to
/// every rule object so its fields can be read on demand.
fn from_py(rule_set: &Bound<'_, PyDict>) -> PyResult<Self> {
let mut rules = HashMap::new();
let mut match_fields = HashSet::new();
for (key, value) in rule_set.iter() {
// Only dict-shaped rules contribute `match` field names; every value is stored regardless.
if let Ok(rule) = value.downcast::<PyDict>() {
for field in get_string_list(rule, "match")? {
match_fields.insert(field);
}
}
rules.insert(key.extract::<String>()?, value.into());
}
Ok(Self {
rules,
match_fields,
})
}
/// Fetch a rule dict by name; `None` when the name is unknown or the stored
/// object is not a dict.
fn get<'py>(&self, py: Python<'py>, name: &str) -> Option<Bound<'py, PyDict>> {
self.rules
.get(name)?
.bind(py)
.downcast::<PyDict>()
.ok()
.cloned()
}
}
impl TorrentSnapshot {
    /// Read the attributes the filter needs from a Python torrent object.
    ///
    /// `match_fields` names the extra attributes that rules reference through
    /// their `match` lists.
    fn from_py(torrent: &Bound<'_, PyAny>, match_fields: &HashSet<String>) -> PyResult<Self> {
        let title = object_optional_string(torrent, "title")?.unwrap_or_default();
        let description = object_optional_string(torrent, "description")?.unwrap_or_default();
        let labels = object_string_list(torrent, "labels")?;
        let fields = selected_object_fields(torrent, match_fields, &title, &description, &labels)?;
        let size = object_optional_f64(torrent, "size")?.unwrap_or(0.0);
        let seeders = object_optional_i64(torrent, "seeders")?.unwrap_or(0);
        let downloadvolumefactor = object_optional_f64(torrent, "downloadvolumefactor")?;
        let pub_minutes = pub_minutes_from_py(torrent)?;
        Ok(Self {
            title,
            description,
            labels,
            fields,
            size,
            seeders,
            downloadvolumefactor,
            pub_minutes,
        })
    }

    /// Default text to match against: title, description and space-joined
    /// labels, separated by single spaces. An empty title or description is
    /// rendered as the literal `None`.
    fn default_content(&self) -> String {
        fn or_none(text: &str) -> &str {
            if text.is_empty() {
                "None"
            } else {
                text
            }
        }
        let joined_labels = self.labels.join(" ");
        [
            or_none(&self.title),
            or_none(&self.description),
            joined_labels.as_str(),
        ]
        .join(" ")
    }

    /// Text values captured for `field`, if any were collected.
    fn field_values(&self, field: &str) -> Option<&Vec<String>> {
        self.fields.get(field)
    }
}
impl MediaSnapshot {
/// Extract the attributes TMDB rules may look at from the Python media object.
///
/// A fixed list of attributes is read first; afterwards every remaining
/// non-`None` entry of the object's `__dict__` (if it has one) is added as
/// well. All stored values are upper-cased strings. When no object, or
/// Python `None`, is given the snapshot is marked unavailable.
fn from_py(mediainfo: Option<&Bound<'_, PyAny>>) -> PyResult<Self> {
let mut values = HashMap::new();
let Some(media) = mediainfo else {
return Ok(Self {
available: false,
values,
});
};
if media.is_none() {
return Ok(Self {
available: false,
values,
});
}
for attr in [
"type",
"category",
"original_language",
"tmdb_id",
"imdb_id",
"tvdb_id",
"douban_id",
"bangumi_id",
"collection_id",
"origin_country",
"genre_ids",
"production_countries",
"spoken_languages",
"languages",
] {
let attr_values = media_attr_values(media, attr)?;
if !attr_values.is_empty() {
values.insert(attr.to_string(), attr_values);
}
}
// Pick up any other instance attributes not covered by the fixed list above.
if let Ok(dict) = media.getattr("__dict__") {
if let Ok(dict) = dict.downcast::<PyDict>() {
for (key, value) in dict.iter() {
let key = key.extract::<String>()?;
// Already-collected attributes win; `None` values are ignored.
if values.contains_key(&key) || value.is_none() {
continue;
}
let attr_values = if key == "production_countries" {
production_country_values(&value)?
} else {
py_any_to_string_list(&value)?
.into_iter()
.map(|item| item.to_uppercase())
.collect::<Vec<_>>()
};
if !attr_values.is_empty() {
values.insert(key, attr_values);
}
}
}
}
Ok(Self {
available: true,
values,
})
}
/// Check whether attribute `attr` holds any of the comma-separated
/// candidates in `value` (compared upper-cased; empty candidates are
/// dropped). Returns false when nothing is stored for `attr`.
fn matches(&self, attr: &str, value: &str) -> bool {
let Some(info_values) = self.values.get(attr) else {
return false;
};
let values = value
.split(',')
.filter(|item| !item.is_empty())
.map(|item| item.to_uppercase())
.collect::<Vec<_>>();
values
.iter()
.any(|value| info_values.iter().any(|info_value| info_value == value))
}
}
/// Evaluate one torrent against every rule group.
///
/// Within a group the levels are tried in order; the first matching level
/// gives the group's priority (100 for the first level, one less for each
/// following level). The torrent is rejected (`None`) as soon as a group has
/// no matching level; otherwise the priority of the last group is returned.
fn match_torrent(
    py: Python<'_>,
    torrent: &TorrentSnapshot,
    groups: &[FilterGroup],
    matcher: &RuleMatcher,
    media: &MediaSnapshot,
    metainfo_options: Option<&Bound<'_, PyDict>>,
    parsed_rule_cache: &mut HashMap<String, RuleExpr>,
    episode_count_cache: &mut HashMap<String, i64>,
) -> PyResult<Option<i64>> {
    let mut last_priority = None;
    for group in groups {
        let mut group_priority = None;
        for (offset, level) in group.levels.iter().enumerate() {
            let expr = parse_cached_expr(level, parsed_rule_cache)?;
            let hit = match_group(
                py,
                torrent,
                expr,
                matcher,
                media,
                metainfo_options,
                episode_count_cache,
            )?;
            if hit {
                group_priority = Some(100i64 - offset as i64);
                break;
            }
        }
        let Some(priority) = group_priority else {
            return Ok(None);
        };
        last_priority = Some(priority);
    }
    Ok(last_priority)
}
/// Parse a priority-level expression on first use and cache it by its text,
/// so a level is only parsed when evaluation actually reaches it.
///
/// Returns an error when tokenizing or parsing fails, or when tokens remain
/// after a complete expression was parsed.
fn parse_cached_expr<'a>(
level: &str,
parsed_rule_cache: &'a mut HashMap<String, RuleExpr>,
) -> PyResult<&'a RuleExpr> {
if !parsed_rule_cache.contains_key(level) {
let tokens = tokenize_rule(level)?;
let mut parser = RuleParserState::new(tokens);
let expr = parser.parse_expression()?;
if parser.has_remaining() {
return Err(PyValueError::new_err("规则表达式包含无法解析的剩余内容"));
}
parsed_rule_cache.insert(level.to_string(), expr);
}
// The entry was either present already or inserted just above.
Ok(parsed_rule_cache.get(level).expect("cached rule exists"))
}
/// Recursively evaluate a boolean rule expression for one torrent.
///
/// `And` and `Or` short-circuit: the right operand is only evaluated when the
/// left one does not already decide the result.
fn match_group(
    py: Python<'_>,
    torrent: &TorrentSnapshot,
    expr: &RuleExpr,
    matcher: &RuleMatcher,
    media: &MediaSnapshot,
    metainfo_options: Option<&Bound<'_, PyDict>>,
    episode_count_cache: &mut HashMap<String, i64>,
) -> PyResult<bool> {
    let outcome = match expr {
        RuleExpr::Name(name) => match_rule(
            py,
            torrent,
            name,
            matcher,
            media,
            metainfo_options,
            episode_count_cache,
        )?,
        RuleExpr::Not(inner) => !match_group(
            py,
            torrent,
            inner,
            matcher,
            media,
            metainfo_options,
            episode_count_cache,
        )?,
        RuleExpr::And(left, right) => {
            match_group(
                py,
                torrent,
                left,
                matcher,
                media,
                metainfo_options,
                episode_count_cache,
            )? && match_group(
                py,
                torrent,
                right,
                matcher,
                media,
                metainfo_options,
                episode_count_cache,
            )?
        }
        RuleExpr::Or(left, right) => {
            match_group(
                py,
                torrent,
                left,
                matcher,
                media,
                metainfo_options,
                episode_count_cache,
            )? || match_group(
                py,
                torrent,
                right,
                matcher,
                media,
                metainfo_options,
                episode_count_cache,
            )?
        }
    };
    Ok(outcome)
}
/// Evaluate a single named rule against a torrent.
///
/// An unknown rule never matches. A satisfied TMDB condition makes the rule
/// match immediately, skipping every other check. Otherwise the rule matches
/// only if all configured checks pass: at least one `include` pattern hits,
/// no `exclude` pattern hits, `size_range`, minimum `seeders`, exact
/// `downloadvolumefactor`, and `publish_time`.
fn match_rule(
py: Python<'_>,
torrent: &TorrentSnapshot,
rule_name: &str,
matcher: &RuleMatcher,
media: &MediaSnapshot,
metainfo_options: Option<&Bound<'_, PyDict>>,
episode_count_cache: &mut HashMap<String, i64>,
) -> PyResult<bool> {
let Some(rule) = matcher.get(py, rule_name) else {
return Ok(false);
};
// A matching TMDB condition accepts the torrent without further checks.
if match_tmdb_rule(&rule, media)? {
return Ok(true);
}
let content = rule_match_content(&rule, torrent)?;
let includes = get_string_list(&rule, "include")?;
if !includes.is_empty() {
// Any single include pattern is enough.
let mut included = false;
for pattern in includes {
if regex_search(&pattern, &content)? {
included = true;
break;
}
}
if !included {
return Ok(false);
}
}
let excludes = get_string_list(&rule, "exclude")?;
// Any single exclude pattern rejects the torrent.
for pattern in excludes {
if regex_search(&pattern, &content)? {
return Ok(false);
}
}
if let Some(size_range) = get_optional_nonempty_string(&rule, "size_range")? {
if !match_size(torrent, &size_range, metainfo_options, episode_count_cache)? {
return Ok(false);
}
}
if let Some(seeders) = get_optional_i64(&rule, "seeders")? {
if torrent.seeders < seeders {
return Ok(false);
}
}
if let Some(download_factor) = get_optional_f64(&rule, "downloadvolumefactor")? {
// Exact equality; a torrent without a factor never satisfies this check.
if torrent.downloadvolumefactor != Some(download_factor) {
return Ok(false);
}
}
if let Some(publish_time) = get_optional_nonempty_string(&rule, "publish_time")? {
if !match_publish_time(torrent.pub_minutes, &publish_time)? {
return Ok(false);
}
}
Ok(true)
}
/// Decide whether the rule's `tmdb` condition is satisfied by the media info.
///
/// Returns `false` when the rule has no usable `tmdb` condition (key missing,
/// `None`, or any other falsy value such as an empty dict or empty string) or
/// when no media info is available. Otherwise every non-empty entry of the
/// `tmdb` dict must match the corresponding media attribute.
///
/// Errors when a truthy `tmdb` value is not a dict or a key is not a string.
fn match_tmdb_rule(rule: &Bound<'_, PyDict>, media: &MediaSnapshot) -> PyResult<bool> {
    let Some(tmdb_obj) = rule.get_item("tmdb")? else {
        return Ok(false);
    };
    // A falsy value (None, {}, "") carries no condition. Without this guard an
    // empty dict would fall through the loop below and report a match, which
    // makes the caller accept every torrent and skip include/exclude checks.
    if !tmdb_obj.is_truthy()? {
        return Ok(false);
    }
    if !media.available {
        return Ok(false);
    }
    let tmdb = tmdb_obj.downcast::<PyDict>()?;
    for (key, value) in tmdb.iter() {
        if value.is_none() {
            continue;
        }
        let value = value.str()?.to_str()?.to_string();
        if value.is_empty() {
            continue;
        }
        if !media.matches(&key.extract::<String>()?, &value) {
            return Ok(false);
        }
    }
    Ok(true)
}
/// Build the text a rule's regex patterns are run against.
///
/// When the rule lists `match` fields, the non-empty values captured for those
/// fields are joined with single spaces. If the rule lists no fields, or none
/// of them produced a value, the torrent's default content is used instead.
fn rule_match_content(rule: &Bound<'_, PyDict>, torrent: &TorrentSnapshot) -> PyResult<String> {
    let field_names = get_string_list(rule, "match")?;
    let parts: Vec<&str> = field_names
        .iter()
        .filter_map(|field| torrent.field_values(field))
        .flatten()
        .filter(|item| !item.is_empty())
        .map(String::as_str)
        .collect();
    if parts.is_empty() {
        return Ok(torrent.default_content());
    }
    Ok(parts.join(" "))
}
/// Check the torrent size against a `size_range` rule.
///
/// The size is divided by the total episode count parsed from title and
/// description (at least 1), so multi-episode torrents are compared per
/// episode. Episode counts are cached per title/description pair. An
/// unrecognised range format never matches.
fn match_size(
torrent: &TorrentSnapshot,
size_range: &str,
metainfo_options: Option<&Bound<'_, PyDict>>,
episode_count_cache: &mut HashMap<String, i64>,
) -> PyResult<bool> {
let cache_key = format!("{}\n{}", torrent.title, torrent.description);
let episode_count = match episode_count_cache.get(&cache_key) {
Some(value) => *value,
None => {
let value = parse_total_episode_for_filter(
torrent.title.as_str(),
Some(torrent.description.as_str()),
metainfo_options,
)?;
episode_count_cache.insert(cache_key, value);
value
}
}
// Clamp to 1 so the division below is always well defined.
.max(1) as f64;
let torrent_size = torrent.size / episode_count;
match parse_size_range(size_range)? {
SizeRange::Between(min, max) => Ok(min <= torrent_size && torrent_size <= max),
SizeRange::Gte(min) => Ok(torrent_size >= min),
SizeRange::Lte(max) => Ok(torrent_size <= max),
SizeRange::Unknown => Ok(false),
}
}
/// Parsed form of a `size_range` rule; bounds are already scaled by `SIZE_UNIT`.
enum SizeRange {
// `min-max`, both bounds inclusive.
Between(f64, f64),
// `>min`, compared with `>=`.
Gte(f64),
// `<max`, compared with `<=`.
Lte(f64),
// Anything else; never matches.
Unknown,
}
/// Parse a `size_range` rule string.
///
/// Recognised forms are `min-max`, `>min` and `<max`; every number is
/// multiplied by `SIZE_UNIT`. Any other form yields `SizeRange::Unknown`.
/// A number that cannot be parsed produces an error.
fn parse_size_range(size_range: &str) -> PyResult<SizeRange> {
    let scaled =
        |text: &str| -> PyResult<f64> { Ok(parse_f64(text.trim(), "大小范围")? * SIZE_UNIT) };
    let spec = size_range.trim();
    let parsed = if let Some((lower, upper)) = spec.split_once('-') {
        SizeRange::Between(scaled(lower)?, scaled(upper)?)
    } else if let Some(lower) = spec.strip_prefix('>') {
        SizeRange::Gte(scaled(lower)?)
    } else if let Some(upper) = spec.strip_prefix('<') {
        SizeRange::Lte(scaled(upper)?)
    } else {
        SizeRange::Unknown
    };
    Ok(parsed)
}
/// Check the torrent's age in minutes against a `publish_time` rule.
///
/// The rule is split on `-`: a single number is a minimum age, two or more
/// numbers use the first two as an inclusive range. A number that cannot be
/// parsed produces an error.
fn match_publish_time(pub_minutes: f64, publish_time: &str) -> PyResult<bool> {
    let mut bounds = Vec::new();
    for part in publish_time.split('-') {
        bounds.push(parse_f64(part, "发布时间规则")?);
    }
    Ok(match bounds.as_slice() {
        [] => true,
        [min] => pub_minutes >= *min,
        [min, max, ..] => *min <= pub_minutes && pub_minutes <= *max,
    })
}
/// Case-insensitive regex search with compiled patterns cached by pattern text.
///
/// The pattern is prefixed with `(?i)`. Compile errors and match-time errors
/// are reported as `ValueError`.
fn regex_search(pattern: &str, content: &str) -> PyResult<bool> {
let cache_key = format!("(?i){pattern}");
// If the lock cannot be taken the cache is simply bypassed.
if let Ok(guard) = REGEX_CACHE.lock() {
if let Some(regex) = guard.get(&cache_key) {
return regex
.is_match(content)
.map_err(|err| PyValueError::new_err(err.to_string()));
}
}
// Cache miss: compile and match without holding the lock, then store the regex.
let regex =
FancyRegex::new(&cache_key).map_err(|err| PyValueError::new_err(err.to_string()))?;
let result = regex
.is_match(content)
.map_err(|err| PyValueError::new_err(err.to_string()))?;
if let Ok(mut guard) = REGEX_CACHE.lock() {
guard.insert(cache_key, regex);
}
Ok(result)
}
/// Read one media attribute as a list of upper-cased strings.
///
/// A missing or `None` attribute yields an empty list. `production_countries`
/// is handled by `production_country_values`. When the generic list
/// conversion yields nothing, the value's `str()` form is used if non-empty.
fn media_attr_values(media: &Bound<'_, PyAny>, attr: &str) -> PyResult<Vec<String>> {
let Ok(value) = media.getattr(attr) else {
return Ok(Vec::new());
};
if value.is_none() {
return Ok(Vec::new());
}
if attr == "production_countries" {
return production_country_values(&value);
}
let mut result = py_any_to_string_list(&value)?
.into_iter()
.map(|item| item.to_uppercase())
.collect::<Vec<_>>();
// Fall back to the plain string form of the value.
if result.is_empty() {
let text = value.str()?.to_str()?.to_uppercase();
if !text.is_empty() {
result.push(text);
}
}
Ok(result)
}
/// Collect the upper-cased `iso_3166_1` codes from a `production_countries`
/// value. Anything that is not a list yields an empty result; list items that
/// are not dicts, or lack a non-empty code, are skipped.
fn production_country_values(value: &Bound<'_, PyAny>) -> PyResult<Vec<String>> {
    let mut codes = Vec::new();
    if let Ok(countries) = value.downcast::<PyList>() {
        for country in countries.iter() {
            let Ok(entry) = country.downcast::<PyDict>() else {
                continue;
            };
            if let Some(code) = get_optional_nonempty_string(entry, "iso_3166_1")? {
                codes.push(code.to_uppercase());
            }
        }
    }
    Ok(codes)
}
/// Read only the torrent attributes named in `match_fields`, instead of
/// walking the whole object.
///
/// `title`, `description` and `labels` reuse the values already extracted by
/// the caller. Other attributes are read with `getattr`; missing, `None` or
/// falsy values are skipped. List values contribute the `str()` of each truthy
/// item, any other value contributes its own `str()`.
fn selected_object_fields(
torrent: &Bound<'_, PyAny>,
match_fields: &HashSet<String>,
title: &str,
description: &str,
labels: &[String],
) -> PyResult<HashMap<String, Vec<String>>> {
let mut result = HashMap::new();
for field in match_fields {
match field.as_str() {
"title" => {
if !title.is_empty() {
result.insert(field.clone(), vec![title.to_string()]);
}
continue;
}
"description" => {
if !description.is_empty() {
result.insert(field.clone(), vec![description.to_string()]);
}
continue;
}
"labels" => {
if !labels.is_empty() {
result.insert(field.clone(), labels.to_vec());
}
continue;
}
_ => {}
}
// Attributes the object does not have are silently ignored.
let Ok(value) = torrent.getattr(field) else {
continue;
};
if value.is_none() || !value.is_truthy()? {
continue;
}
let values = if let Ok(list) = value.downcast::<PyList>() {
let mut items = Vec::new();
for item in list.iter() {
if !item.is_none() && item.is_truthy()? {
items.push(item.str()?.to_str()?.to_string());
}
}
items
} else {
vec![value.str()?.to_str()?.to_string()]
};
if !values.is_empty() {
result.insert(field.clone(), values);
}
}
Ok(result)
}
/// Compute the whole minutes elapsed since the torrent's `pubdate`, in Rust,
/// so the filter loop does not have to call back into Python for it.
///
/// `pubdate` is expected as `%Y-%m-%d %H:%M:%S` and is compared with the
/// current local time. A missing or unparseable value yields `0.0`.
fn pub_minutes_from_py(torrent: &Bound<'_, PyAny>) -> PyResult<f64> {
    let published = object_optional_string(torrent, "pubdate")?
        .and_then(|text| NaiveDateTime::parse_from_str(&text, "%Y-%m-%d %H:%M:%S").ok());
    let Some(published) = published else {
        return Ok(0.0);
    };
    let elapsed = Local::now().naive_local() - published;
    // Floor division, so negative spans round towards negative infinity.
    Ok(elapsed.num_seconds().div_euclid(60) as f64)
}
/// Parse a trimmed string as `f64`; a failure becomes a `ValueError` whose
/// message is prefixed with `context`.
fn parse_f64(value: &str, context: &str) -> PyResult<f64> {
    match value.trim().parse::<f64>() {
        Ok(number) => Ok(number),
        Err(err) => Err(PyValueError::new_err(format!("{context}解析失败: {err}"))),
    }
}

View File

@@ -25,8 +25,9 @@ static HAS_SELECTOR_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r#":has\(\s*(?:"([^"]+)"|'([^']+)'|([^)]*))\s*\)"#).unwrap());
static TABLE_DIRECT_TR_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r#"\b(table[^>,]*?)\s*>\s*(tr(?:[^\s>,]*)?)"#).unwrap());
static EN_ELAPSED_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?i)(\d+)\s*(second|minute|hour|day|week|month|year)s?\s*ago").unwrap());
static EN_ELAPSED_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"(?i)(\d+)\s*(second|minute|hour|day|week|month|year)s?\s*ago").unwrap()
});
const OUTPUT_FIELDS: &[(&str, &str)] = &[
("title", "title"),
("description", "description"),
@@ -44,7 +45,11 @@ const OUTPUT_FIELDS: &[(&str, &str)] = &[
struct FieldSpec<'py> {
name: String,
config: Bound<'py, PyDict>,
text_template: Option<String>,
default_value: Option<String>,
filters: Option<Bound<'py, PyAny>>,
query: Option<QuerySpec>,
case_selectors: Vec<(SelectorPlan, f64)>,
}
enum RowParseResult {
@@ -52,6 +57,23 @@ enum RowParseResult {
Item(PyObject),
}
/// Per-field query settings prepared once from the field's selector config.
struct QuerySpec {
// Parsed form of the field's selector text.
selector: SelectorPlan,
// Value of the config's optional `attribute` entry.
attribute: Option<String>,
// Selectors produced by `parse_remove_selectors` for this config.
remove_selectors: Vec<Selector>,
// Values of the config's optional `contents` / `index` entries.
contents: Option<i64>,
index: Option<i64>,
}
/// Parsed selector, as produced by `parse_selector_plan`.
enum SelectorPlan {
// A single selector used as-is.
Direct(Selector),
// Three-part form: `base`, `inner` and an optional `suffix` selector.
// NOTE(review): presumably the decomposition of a `:has(...)` selector; confirm against `parse_selector_plan`.
Has {
base: Selector,
inner: Selector,
suffix: Option<Selector>,
},
}
/// 批量解析普通配置 indexer 页面,优先在 Rust 内覆盖站点配置语义。
#[pyfunction]
#[pyo3(signature = (html_text, domain, list_config, fields, category=None, result_num=100))]
@@ -76,8 +98,12 @@ pub(crate) fn parse_indexer_torrents_fast(
};
let result = PyList::empty(py);
let field_specs = build_field_specs(fields)?;
let field_map = field_specs
.iter()
.map(|field| (field.name.as_str(), field))
.collect::<HashMap<&str, &FieldSpec<'_>>>();
for row in rows.into_iter().take(result_num) {
match parse_indexer_row(py, row, domain, &field_specs, category)? {
match parse_indexer_row(py, row, domain, &field_map, category)? {
RowParseResult::Empty => {}
RowParseResult::Item(item) => result.append(item)?,
}
@@ -95,47 +121,97 @@ fn build_field_specs<'py>(fields: &Bound<'py, PyDict>) -> PyResult<Vec<FieldSpec
let Ok(config) = value.downcast_into::<PyDict>() else {
continue;
};
let filters = config.get_item("filters")?.filter(|value| !value.is_none());
specs.push(FieldSpec {
name: key.extract::<String>()?,
config,
text_template: get_optional_string(&config, "text")?,
default_value: get_default_value(&config)?,
filters,
query: build_query_spec(&config)?,
case_selectors: build_case_selectors(&config)?,
});
}
Ok(specs)
}
/// Prepare a field's selector and static lookup parameters once, so they are
/// not re-read from the Python config for every row.
///
/// Returns `None` when the config has no selector text or the text cannot be
/// turned into a selector plan.
fn build_query_spec(selector_config: &Bound<'_, PyDict>) -> PyResult<Option<QuerySpec>> {
    let plan = match get_selector_text(selector_config)? {
        Some(selector_text) => parse_selector_plan(&selector_text),
        None => None,
    };
    let Some(selector) = plan else {
        return Ok(None);
    };
    let attribute = get_optional_string(selector_config, "attribute")?;
    let remove_selectors = parse_remove_selectors(selector_config)?;
    let contents = get_optional_i64(selector_config, "contents")?;
    let index = get_optional_i64(selector_config, "index")?;
    Ok(Some(QuerySpec {
        selector,
        attribute,
        remove_selectors,
        contents,
        index,
    }))
}
/// Prepare the selectors of a field's `case` mapping.
///
/// Returns an empty list when `case` is missing or not a dict. Keys that
/// cannot be parsed into a selector plan are skipped; a value that is not a
/// float falls back to 1.0.
fn build_case_selectors(selector_config: &Bound<'_, PyDict>) -> PyResult<Vec<(SelectorPlan, f64)>> {
let Some(case_obj) = selector_config.get_item("case")? else {
return Ok(Vec::new());
};
let Ok(case_dict) = case_obj.downcast::<PyDict>() else {
return Ok(Vec::new());
};
let mut selectors = Vec::new();
for (case_selector_obj, value) in case_dict.iter() {
// A non-string key is an error rather than being skipped.
let case_selector = case_selector_obj.extract::<String>()?;
if let Some(selector) = parse_selector_plan(&case_selector) {
selectors.push((selector, value.extract::<f64>().unwrap_or(1.0)));
}
}
Ok(selectors)
}
/// 解析单行种子信息,覆盖普通配置站点的主字段抽取流程。
fn parse_indexer_row(
py: Python<'_>,
row: ElementRef<'_>,
domain: &str,
field_specs: &[FieldSpec<'_>],
field_map: &HashMap<&str, &FieldSpec<'_>>,
category: Option<&Bound<'_, PyDict>>,
) -> PyResult<RowParseResult> {
let output = PyDict::new(py);
let field_map = field_specs
.iter()
.map(|field| (field.name.as_str(), field))
.collect::<HashMap<&str, &FieldSpec<'_>>>();
let mut cache = BTreeMap::new();
let mut resolving = HashSet::new();
if let Some(value) = eval_field_by_name(row, &field_map, "details", &mut cache, &mut resolving)? {
if let Some(value) = eval_field_by_name(row, field_map, "details", &mut cache, &mut resolving)?
{
if !value.is_empty() {
output.set_item("page_url", normalize_site_link(domain, &value, true))?;
}
}
if let Some(value) = eval_field_by_name(row, &field_map, "download", &mut cache, &mut resolving)? {
if let Some(value) = eval_field_by_name(row, field_map, "download", &mut cache, &mut resolving)?
{
if !value.is_empty() {
output.set_item("enclosure", normalize_site_link(domain, &value, false))?;
}
}
if let Some(value) = eval_factor_field(row, &field_map, "downloadvolumefactor", &mut cache, &mut resolving)? {
if let Some(value) = eval_factor_field(
row,
field_map,
"downloadvolumefactor",
&mut cache,
&mut resolving,
)? {
output.set_item("downloadvolumefactor", value)?;
}
if let Some(value) = eval_factor_field(row, &field_map, "uploadvolumefactor", &mut cache, &mut resolving)? {
if let Some(value) = eval_factor_field(
row,
field_map,
"uploadvolumefactor",
&mut cache,
&mut resolving,
)? {
output.set_item("uploadvolumefactor", value)?;
}
if let Some(value) = eval_pubdate_field(row, &field_map, &mut cache, &mut resolving)? {
if let Some(value) = eval_pubdate_field(row, field_map, &mut cache, &mut resolving)? {
if !value.is_empty() {
output.set_item("pubdate", value)?;
}
@@ -144,32 +220,44 @@ fn parse_indexer_row(
for (source_key, target_key) in OUTPUT_FIELDS {
match *source_key {
"labels" => {
parse_labels_field(py, row, &field_map, &output)?;
parse_labels_field(py, row, field_map, &output)?;
}
"hr" => {
if let Some(value) = eval_hr_field(row, &field_map)? {
if let Some(value) = eval_hr_field(row, field_map)? {
output.set_item(*target_key, value)?;
}
}
"category" => {
if let Some(value) = eval_field_by_name(row, &field_map, source_key, &mut cache, &mut resolving)? {
if let Some(value) =
eval_field_by_name(row, field_map, source_key, &mut cache, &mut resolving)?
{
output.set_item(*target_key, map_category_value(&value, category)?)?;
}
}
"size" => {
if let Some(value) = eval_field_by_name(row, &field_map, source_key, &mut cache, &mut resolving)? {
output.set_item(*target_key, parse_filesize_text(value.replace('\n', "").trim()))?;
if let Some(value) =
eval_field_by_name(row, field_map, source_key, &mut cache, &mut resolving)?
{
output.set_item(
*target_key,
parse_filesize_text(value.replace('\n', "").trim()),
)?;
}
}
"leechers" | "seeders" | "grabs" => {
if let Some(value) = eval_field_by_name(row, &field_map, source_key, &mut cache, &mut resolving)? {
if let Some(value) =
eval_field_by_name(row, field_map, source_key, &mut cache, &mut resolving)?
{
output.set_item(*target_key, parse_peer_count(&value))?;
}
}
_ => {
if let Some(value) = eval_field_by_name(row, &field_map, source_key, &mut cache, &mut resolving)? {
if let Some(value) =
eval_field_by_name(row, field_map, source_key, &mut cache, &mut resolving)?
{
if !value.is_empty() {
output.set_item(*target_key, value.replace('\n', " ").trim().to_string())?;
output
.set_item(*target_key, value.replace('\n', " ").trim().to_string())?;
}
}
}
@@ -216,32 +304,27 @@ fn eval_field(
cache: &mut BTreeMap<String, String>,
resolving: &mut HashSet<String>,
) -> PyResult<Option<String>> {
let mut value = if let Some(template) = get_optional_string(&spec.config, "text")? {
Some(render_field_template(row, field_map, &template, cache, resolving)?)
let mut value = if let Some(template) = spec.text_template.as_deref() {
Some(render_field_template(
row, field_map, &template, cache, resolving,
)?)
} else {
safe_query(row, &spec.config)?
safe_query(row, spec.query.as_ref())
};
if let Some(current) = value.as_deref() {
if contains_jinja_syntax(current) {
value = Some(render_embedded_value(
row,
field_map,
&spec.name,
current,
cache,
resolving,
row, field_map, &spec.name, current, cache, resolving,
)?);
}
}
if let Some(filters) = spec.config.get_item("filters")? {
if !filters.is_none() {
value = apply_text_filters(value.unwrap_or_default(), &filters)?;
}
if let Some(filters) = spec.filters.as_ref() {
value = apply_text_filters(value.unwrap_or_default(), filters)?;
}
if value.as_deref().map(str::is_empty).unwrap_or(true) {
if let Some(default_value) = get_default_value(&spec.config)? {
value = Some(default_value);
if let Some(default_value) = spec.default_value.as_ref() {
value = Some(default_value.clone());
}
}
Ok(value)
@@ -318,12 +401,10 @@ fn eval_factor_field(
let Some(spec) = field_map.get(key).copied() else {
return Ok(None);
};
if let Some(case_obj) = spec.config.get_item("case")? {
let case_dict = case_obj.downcast::<PyDict>()?;
for (case_selector_obj, value) in case_dict.iter() {
let case_selector = case_selector_obj.extract::<String>()?;
if selector_exists(row, &case_selector)? {
return Ok(Some(value.extract::<f64>().unwrap_or(1.0)));
if !spec.case_selectors.is_empty() {
for (selector, value) in &spec.case_selectors {
if selector_exists_with_plan(row, selector) {
return Ok(Some(*value));
}
}
return Ok(Some(1.0));
@@ -350,15 +431,16 @@ fn parse_labels_field(
let Some(spec) = field_map.get("labels").copied() else {
return Ok(());
};
if !spec.config.contains("selector")? {
let Some(query) = spec.query.as_ref() else {
output.set_item("labels", PyList::empty(py))?;
return Ok(());
}
};
let labels = PyList::empty(py);
if let Some(values) = query_all_values(row, &spec.config)? {
for value in values.into_iter().filter(|item| !item.is_empty()) {
labels.append(value)?;
}
for value in query_all_values(row, query)
.into_iter()
.filter(|item| !item.is_empty())
{
labels.append(value)?;
}
output.set_item("labels", labels)?;
Ok(())
@@ -372,10 +454,10 @@ fn eval_hr_field(
let Some(spec) = field_map.get("hr").copied() else {
return Ok(None);
};
let Some(selector_text) = get_selector_text(&spec.config)? else {
let Some(query) = spec.query.as_ref() else {
return Ok(Some(false));
};
Ok(Some(selector_exists(row, &selector_text)?))
Ok(Some(selector_exists_with_plan(row, &query.selector)))
}
/// 将站点分类 ID 映射为 MoviePilot 的媒体类型中文值。
@@ -418,7 +500,9 @@ fn eval_pubdate_field(
if is_standard_datetime(&normalized) || !field_map.contains_key("date_added") {
return Ok(Some(normalized));
}
if let Some(date_added) = eval_field_by_name(row, field_map, "date_added", cache, resolving)? {
if let Some(date_added) =
eval_field_by_name(row, field_map, "date_added", cache, resolving)?
{
let fallback = normalize_pubdate_text(&date_added);
if is_standard_datetime(&fallback) {
return Ok(Some(fallback));
@@ -426,8 +510,10 @@ fn eval_pubdate_field(
}
return Ok(Some(normalized));
}
Ok(eval_field_by_name(row, field_map, "date_added", cache, resolving)?
.map(|value| normalize_pubdate_text(&value)))
Ok(
eval_field_by_name(row, field_map, "date_added", cache, resolving)?
.map(|value| normalize_pubdate_text(&value)),
)
}
/// 规范化发布时间文本为 MoviePilot 期望的字符串格式。
@@ -466,9 +552,15 @@ fn select_site_elements<'a>(
root: ElementRef<'a>,
selector_text: &str,
) -> Option<Vec<ElementRef<'a>>> {
let plan = parse_selector_plan(selector_text)?;
Some(select_site_elements_with_plan(root, &plan))
}
/// 将站点选择器预编译为可复用计划,覆盖 PyQuery 的 :has("selector") 写法。
fn parse_selector_plan(selector_text: &str) -> Option<SelectorPlan> {
let Some(captures) = HAS_SELECTOR_RE.captures(selector_text) else {
let selector = parse_css_selector(selector_text)?;
return Some(root.select(&selector).collect());
return Some(SelectorPlan::Direct(selector));
};
let matched = captures.get(0)?;
let prefix = selector_text[..matched.start()].trim();
@@ -481,19 +573,45 @@ fn select_site_elements<'a>(
.trim();
let base_selector = parse_css_selector(prefix)?;
let has_selector = parse_css_selector(inner)?;
let bases = root
.select(&base_selector)
.filter(|element| element.select(&has_selector).next().is_some());
if suffix.is_empty() {
return Some(bases.collect());
let suffix = if suffix.is_empty() {
None
} else {
let suffix_selector_text = suffix.trim_start_matches('>').trim();
Some(parse_css_selector(suffix_selector_text)?)
};
Some(SelectorPlan::Has {
base: base_selector,
inner: has_selector,
suffix,
})
}
/// 执行预编译 selector 计划,避免每个字段每行重复解析 CSS。
fn select_site_elements_with_plan<'a>(
root: ElementRef<'a>,
plan: &SelectorPlan,
) -> Vec<ElementRef<'a>> {
match plan {
SelectorPlan::Direct(selector) => root.select(selector).collect(),
SelectorPlan::Has {
base,
inner,
suffix,
} => {
let bases = root
.select(base)
.filter(|element| element.select(inner).next().is_some());
if let Some(suffix) = suffix {
let mut values = Vec::new();
for base in bases {
values.extend(base.select(suffix));
}
values
} else {
bases.collect()
}
}
}
let suffix_selector_text = suffix.trim_start_matches('>').trim();
let suffix_selector = parse_css_selector(suffix_selector_text)?;
let mut values = Vec::new();
for base in bases {
values.extend(base.select(&suffix_selector));
}
Some(values)
}
/// 将 PyQuery 扩展选择器转换为 scraper 可识别的 CSS selector 形式。
@@ -520,35 +638,24 @@ fn expand_table_direct_tr_selector(selector_text: &str) -> String {
}
/// 执行 selector 查询并返回第一个符合 index/contents 规则的文本。
fn safe_query(row: ElementRef<'_>, selector_config: &Bound<'_, PyDict>) -> PyResult<Option<String>> {
let Some(values) = query_all_values(row, selector_config)? else {
return Ok(None);
};
Ok(select_indexed_value(values, selector_config))
fn safe_query(row: ElementRef<'_>, query: Option<&QuerySpec>) -> Option<String> {
    // No precompiled query means there is nothing to extract for this field.
    query.and_then(|spec| select_indexed_value(query_all_values(row, spec), spec))
}
/// 查询 selector 的全部文本或属性值。
fn query_all_values(
row: ElementRef<'_>,
selector_config: &Bound<'_, PyDict>,
) -> PyResult<Option<Vec<String>>> {
let Some(selector_text) = get_selector_text(selector_config)? else {
return Ok(None);
};
let Some(elements) = select_site_elements(row, &selector_text) else {
return Ok(None);
};
let attribute = get_optional_string(selector_config, "attribute")?;
let remove_selectors = parse_remove_selectors(selector_config)?;
fn query_all_values(row: ElementRef<'_>, query: &QuerySpec) -> Vec<String> {
let elements = select_site_elements_with_plan(row, &query.selector);
let mut values = Vec::new();
for element in elements {
if let Some(attribute) = attribute.as_deref() {
if let Some(attribute) = query.attribute.as_deref() {
values.push(element.value().attr(attribute).unwrap_or("").to_string());
} else {
values.push(normalize_element_text(element, &remove_selectors));
values.push(normalize_element_text(element, &query.remove_selectors));
}
}
Ok(Some(values))
values
}
/// 解析 remove 配置,支持逗号分隔的 CSS 选择器列表。
@@ -586,20 +693,17 @@ fn get_selector_text(selector_config: &Bound<'_, PyDict>) -> PyResult<Option<Str
}
/// 对查询结果应用 contents/index 规则。
fn select_indexed_value(
values: Vec<String>,
selector_config: &Bound<'_, PyDict>,
) -> Option<String> {
fn select_indexed_value(values: Vec<String>, query: &QuerySpec) -> Option<String> {
if values.is_empty() {
return None;
}
if let Ok(Some(contents)) = get_optional_i64(selector_config, "contents") {
if let Some(contents) = query.contents {
if let Some(first) = values.first() {
let lines: Vec<&str> = first.split('\n').collect();
return pick_indexed_item(&lines, contents).map(|item| item.to_string());
}
}
if let Ok(Some(index)) = get_optional_i64(selector_config, "index") {
if let Some(index) = query.index {
return pick_indexed_item(&values, index).cloned();
}
values.first().cloned()
@@ -638,7 +742,8 @@ fn apply_text_filters(mut current: String, filters: &Bound<'_, PyAny>) -> PyResu
continue;
}
let pattern = args_list.get_item(0)?.extract::<String>()?;
let group_index = extract_i64(&args_list.get_item(args_list.len() - 1)?)?.unwrap_or(0);
let group_index =
extract_i64(&args_list.get_item(args_list.len() - 1)?)?.unwrap_or(0);
let Ok(regex) = Regex::new(&pattern) else {
continue;
};
@@ -676,7 +781,9 @@ fn apply_text_filters(mut current: String, filters: &Bound<'_, PyAny>) -> PyResu
continue;
}
let from = args_list.get_item(0)?.extract::<String>()?;
let to = args_list.get_item(args_list.len() - 1)?.extract::<String>()?;
let to = args_list
.get_item(args_list.len() - 1)?
.extract::<String>()?;
current = current.replace(&from, &to);
}
Some("dateparse") => {
@@ -804,7 +911,11 @@ fn parse_english_elapsed_date(value: &str) -> Option<String> {
"year" => Duration::days(amount * 365),
_ => return None,
};
Some((Local::now() - duration).format("%Y-%m-%d %H:%M:%S").to_string())
Some(
(Local::now() - duration)
.format("%Y-%m-%d %H:%M:%S")
.to_string(),
)
}
/// 将文件大小文本转换为字节数,供 Rust HTML 解析内部共用。
@@ -869,7 +980,10 @@ fn should_skip_text_node(
if element == root {
return false;
}
if remove_selectors.iter().any(|selector| selector.matches(&element)) {
if remove_selectors
.iter()
.any(|selector| selector.matches(&element))
{
return true;
}
parent = element.parent().and_then(ElementRef::wrap);
@@ -877,14 +991,12 @@ fn should_skip_text_node(
false
}
/// 判断 row 内是否存在指定 selector。
fn selector_exists(row: ElementRef<'_>, selector_text: &str) -> PyResult<bool> {
if selector_text == "*" {
return Ok(true);
}
Ok(select_site_elements(row, selector_text)
.map(|elements| !elements.is_empty())
.unwrap_or(false))
/// Report whether `row` contains at least one element matched by the
/// precompiled selector plan.
fn selector_exists_with_plan(row: ElementRef<'_>, selector: &SelectorPlan) -> bool {
    let matches = select_site_elements_with_plan(row, selector);
    !matches.is_empty()
}
/// 拼接详情和下载链接。

View File

@@ -17,6 +17,7 @@ fn is_available() -> bool {
fn moviepilot_rust(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(is_available, m)?)?;
m.add_function(wrap_pyfunction!(filter::parse_filter_rule_fast, m)?)?;
m.add_function(wrap_pyfunction!(filter::filter_torrents_fast, m)?)?;
m.add_function(wrap_pyfunction!(indexer::parse_indexer_torrents_fast, m)?)?;
m.add_function(wrap_pyfunction!(metainfo::parse_metainfo_fast, m)?)?;
m.add_function(wrap_pyfunction!(metainfo::parse_metainfo_path_fast, m)?)?;

View File

@@ -1,13 +1,14 @@
use crate::utils::{cached_regex, get_config_string_list};
use anitomy_pure::elements::Category;
use anitomy_pure::Parser;
use golia_pinyin::{is_valid_syllable, segment};
use once_cell::sync::Lazy;
use pyo3::prelude::*;
use pyo3::types::{PyDict, PyList};
use pyo3::types::PyDict;
use regex::{Captures, Regex, RegexBuilder};
use std::collections::{BTreeMap, HashMap, HashSet};
use std::path::{Path, PathBuf};
use std::sync::Mutex;
use std::sync::{Arc, Mutex};
static ANIME_BRACKET_RE: Lazy<Regex> = Lazy::new(|| {
RegexBuilder::new(r"【[+0-9XVPI-]+】\s*【")
@@ -311,12 +312,14 @@ static KEYWORD_META_SUFFIX_RE: Lazy<Regex> = Lazy::new(|| {
static EPISODE_VERSION_SUFFIX_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"v\d+$").unwrap());
static TOKEN_SPLIT_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"\s+|\(|\)|\[|]|-|【|】|/||;|&|\||#|_|「|」|~").unwrap());
static STREAMING_PLATFORM_LOOKUP: Lazy<HashMap<String, String>> =
Lazy::new(streaming_platform_lookup);
static RELEASE_GROUP_RE_CACHE: Lazy<Mutex<HashMap<String, Regex>>> =
Lazy::new(|| Mutex::new(HashMap::new()));
static CUSTOMIZATION_RE_CACHE: Lazy<Mutex<HashMap<String, Regex>>> =
Lazy::new(|| Mutex::new(HashMap::new()));
static STREAMING_PLATFORM_CACHE: Lazy<Mutex<HashMap<usize, Arc<HashMap<String, String>>>>> =
Lazy::new(|| Mutex::new(HashMap::new()));
static PARSE_OPTIONS_CACHE: Lazy<Mutex<HashMap<usize, Arc<ParseOptions>>>> =
Lazy::new(|| Mutex::new(HashMap::new()));
const MEDIA_TYPE_MOVIE: &str = "电影";
const MEDIA_TYPE_TV: &str = "电视剧";
@@ -370,11 +373,13 @@ struct ExplicitMetaInfo {
total_episode: Option<i64>,
}
/// Runtime configuration for the MetaInfo parser, unpacked from the options
/// dict that the Python side passes in.
#[derive(Clone)]
struct ParseOptions {
    /// Entries read from the `custom_words` option.
    custom_words: Vec<String>,
    /// Entries read from the `media_exts` option, lower-cased on load.
    media_exts: HashSet<String>,
    /// Release-group pattern text; empty when the caller supplies none.
    /// NOTE(review): presumably a regex alternation — confirm against the matcher.
    release_groups: String,
    /// Entries read from the `customization` option.
    customization_patterns: Vec<String>,
    /// Streaming-platform lookup built from the `streaming_platforms` dict,
    /// keyed by the upper-cased dict key; held in an `Arc` so clones share it.
    streaming_platforms: Arc<HashMap<String, String>>,
}
struct TokenCursor {
@@ -404,11 +409,22 @@ pub(crate) fn parse_metainfo_fast(
subtitle: Option<&str>,
options: Option<&Bound<'_, PyDict>>,
) -> PyResult<PyObject> {
let options = ParseOptions::from_py(options)?;
let meta = build_meta_info(title, subtitle, &options, true)?;
let options = ParseOptions::from_py_cached(options)?;
let meta = build_meta_info(title, subtitle, options.as_ref(), true)?;
meta_to_py(py, &meta)
}
/// Run the full Rust MetaInfo parse for a title and return only its total
/// episode count, so the filter's size_range rule can reuse the same parsing
/// path.
pub(crate) fn parse_total_episode_for_filter(
    title: &str,
    subtitle: Option<&str>,
    options: Option<&Bound<'_, PyDict>>,
) -> PyResult<i64> {
    let parsed_options = ParseOptions::from_py_cached(options)?;
    let total_episode =
        build_meta_info(title, subtitle, parsed_options.as_ref(), true)?.total_episode;
    Ok(total_episode)
}
/// 从路径入口解析 MetaInfoPath并在 Rust 内完成父目录合并。
#[pyfunction]
#[pyo3(signature = (path, options=None))]
@@ -417,8 +433,8 @@ pub(crate) fn parse_metainfo_path_fast(
path: &str,
options: Option<&Bound<'_, PyDict>>,
) -> PyResult<PyObject> {
let options = ParseOptions::from_py(options)?;
let meta = build_meta_path(path, &options)?;
let options = ParseOptions::from_py_cached(options)?;
let meta = build_meta_path(path, options.as_ref())?;
meta_to_py(py, &meta)
}
@@ -443,19 +459,26 @@ pub(crate) fn find_metainfo_fast(py: Python<'_>, title: &str) -> PyResult<PyObje
}
impl ParseOptions {
/// Python 字典读取 Rust 解析所需配置,避免 Rust 层访问数据库或 settings
fn from_py(options: Option<&Bound<'_, PyDict>>) -> PyResult<Self> {
/// Python 配置字典对象身份缓存解析配置,避免每个标题重复跨语言解包
fn from_py_cached(options: Option<&Bound<'_, PyDict>>) -> PyResult<Arc<Self>> {
let Some(options) = options else {
return Ok(Self {
return Ok(Arc::new(Self {
custom_words: Vec::new(),
media_exts: HashSet::new(),
release_groups: default_release_groups(),
release_groups: String::new(),
customization_patterns: Vec::new(),
});
streaming_platforms: Arc::new(HashMap::new()),
}));
};
Ok(Self {
custom_words: get_string_list(options, "custom_words")?,
media_exts: get_string_list(options, "media_exts")?
let cache_key = options.as_ptr() as usize;
if let Ok(guard) = PARSE_OPTIONS_CACHE.lock() {
if let Some(cached) = guard.get(&cache_key) {
return Ok(cached.clone());
}
}
let parsed = Arc::new(Self {
custom_words: get_config_string_list(options, "custom_words")?,
media_exts: get_config_string_list(options, "media_exts")?
.into_iter()
.map(|item| item.to_lowercase())
.collect(),
@@ -465,42 +488,45 @@ impl ParseOptions {
.map(|value| value.extract::<String>())
.transpose()?
.filter(|value| !value.is_empty())
.unwrap_or_else(default_release_groups),
customization_patterns: get_string_list(options, "customization")?,
})
.unwrap_or_default(),
customization_patterns: get_config_string_list(options, "customization")?,
streaming_platforms: get_streaming_platforms(options)?,
});
if let Ok(mut guard) = PARSE_OPTIONS_CACHE.lock() {
guard.insert(cache_key, parsed.clone());
}
Ok(parsed)
}
}
/// 读取 Python 字典中的字符串列表。
fn get_string_list(options: &Bound<'_, PyDict>, key: &str) -> PyResult<Vec<String>> {
let Some(value) = options.get_item(key)? else {
return Ok(Vec::new());
/// Python 传入的流媒体平台表构建查询映射,避免 Rust 里重复维护默认列表。
fn get_streaming_platforms(options: &Bound<'_, PyDict>) -> PyResult<Arc<HashMap<String, String>>> {
let mut result = HashMap::new();
let Some(value) = options.get_item("streaming_platforms")? else {
return Ok(Arc::new(result));
};
if value.is_none() {
return Ok(Vec::new());
return Ok(Arc::new(result));
}
if let Ok(list) = value.downcast::<PyList>() {
let mut result = Vec::new();
for item in list.iter() {
let text = item.extract::<String>()?;
if !text.is_empty() {
result.push(text);
}
let dict = value.downcast::<PyDict>()?;
let cache_key = dict.as_ptr() as usize;
if let Ok(guard) = STREAMING_PLATFORM_CACHE.lock() {
if let Some(cached) = guard.get(&cache_key) {
return Ok(cached.clone());
}
return Ok(result);
}
if let Ok(text) = value.extract::<String>() {
return Ok(text
.replace('\n', ";")
.replace('|', ";")
.split(';')
.filter_map(|item| {
let item = item.trim();
(!item.is_empty()).then(|| item.to_string())
})
.collect());
for (key, value) in dict.iter() {
let key = key.str()?.to_str()?.to_uppercase();
let value = value.str()?.to_str()?.to_string();
if !key.is_empty() && !value.is_empty() {
result.insert(key, value);
}
}
Ok(Vec::new())
let result = Arc::new(result);
if let Ok(mut guard) = STREAMING_PLATFORM_CACHE.lock() {
guard.insert(cache_key, result.clone());
}
Ok(result)
}
/// 构建标题入口的完整元信息。
@@ -541,6 +567,7 @@ fn build_meta_info(
media_exts: options.media_exts.clone(),
release_groups: options.release_groups.clone(),
customization_patterns: options.customization_patterns.clone(),
streaming_platforms: options.streaming_platforms.clone(),
},
false,
)?;
@@ -1024,7 +1051,13 @@ fn parse_video(
init_resource_type(&mut meta, &mut state, &current);
}
if state.continue_flag {
init_web_source(&mut meta, &mut state, &current, &mut tokens);
init_web_source(
&mut meta,
&mut state,
&current,
&mut tokens,
&options.streaming_platforms,
);
}
if state.continue_flag {
init_video_encode(&mut meta, &mut state, &current);
@@ -1844,13 +1877,12 @@ fn init_web_source(
state: &mut VideoState,
token: &str,
tokens: &mut TokenCursor,
streaming_platforms: &HashMap<String, String>,
) {
if meta_name(meta).is_none() {
return;
}
let mut platform_name = STREAMING_PLATFORM_LOOKUP
.get(&token.to_uppercase())
.cloned();
let mut platform_name = streaming_platforms.get(&token.to_uppercase()).cloned();
let mut query_range = 1usize;
let prev_token = state
.index
@@ -1869,7 +1901,7 @@ fn init_web_source(
} else {
format!("{adjacent}{separator}{token}")
};
if let Some(name) = STREAMING_PLATFORM_LOOKUP.get(&combined.to_uppercase()) {
if let Some(name) = streaming_platforms.get(&combined.to_uppercase()) {
platform_name = Some(name.clone());
query_range = 2;
if is_next {
@@ -3061,152 +3093,6 @@ fn match_customization(title: &str, patterns: &[String]) -> Option<String> {
(!unique.is_empty()).then(|| unique.into_values().collect::<Vec<_>>().join("@"))
}
/// Compile `pattern` at most once per cache, keyed by the pattern text, so
/// long configured rules are not recompiled for every title.
///
/// Returns `None` when the pattern is not a valid regex. A lock that cannot
/// be taken is treated as a cache miss (lookup) or a skipped store (insert);
/// the compiled regex is still returned.
fn cached_regex(cache: &Lazy<Mutex<HashMap<String, Regex>>>, pattern: &str) -> Option<Regex> {
    // The guard is dropped at the end of the closure, before any compilation.
    let hit = cache
        .lock()
        .ok()
        .and_then(|guard| guard.get(pattern).cloned());
    if hit.is_some() {
        return hit;
    }
    let compiled = Regex::new(pattern).ok()?;
    if let Ok(mut guard) = cache.lock() {
        guard.insert(pattern.to_string(), compiled.clone());
    }
    Some(compiled)
}
/// Default release-group pattern, mirroring ReleaseGroupsMatcher.RELEASE_GROUPS:
/// the fragments below joined into a single `|`-separated alternation.
fn default_release_groups() -> String {
    const FRAGMENTS: &[&str] = &[
        "FF(?:(?:A|WE)B|CD|E(?:DU|B)|TV)",
        "Audies",
        "AD(?:Audio|E(?:book|)|Music|Web)",
        "BeiTai",
        "Bts(?:CHOOL|HD|PAD|TV)",
        "Zone",
        "CHD(?:Bits|PAD|(?:|HK)TV|WEB|)",
        "StBOX",
        "OneHD",
        "Lee",
        "xiaopie",
        "(?:(?:iNT|(?:HALFC|Mini(?:S|H|FH)D))-|)TLF",
        "(?:DG|GBWE)B",
        "Hares(?:(?:M|T)V|Web|)",
        "HDA(?:pad|rea|TV)",
        "EPiC",
        "HDC(?:hina|TV|)",
        "k9611",
        "tudou",
        "iHD",
        "D(?:ream|BTV)",
        "(?:HD|QHstudI)o",
        "beAst(?:TV|)",
        "HDH(?:ome|Pad|TV|WEB|)",
        "HDPT(?:Web|)",
        "HDS(?:ky|TV|Pad|WEB|)",
        "AQLJ",
        "HDZ(?:one|)",
        "HHWEB",
        "HTPT",
        "FRDS",
        "Yumi",
        "cXcY",
        "L(?:eague(?:(?:C|H)D|(?:M|T)V|NF|WEB)|HD)",
        "i18n",
        "CiNT",
        "MTeam(?:TV|)",
        "MPAD",
        "MWeb",
        "Our(?:Bits|TV)",
        "FLTTH",
        "Ao",
        "PbK",
        "MGs",
        "iLove(?:HD|TV)",
        "Panda",
        "AilMWeb",
        "PiGo(?:NF|(?:H|WE)B)",
        "PTer(?:DIY|Game|(?:M|T)V|WEB|)",
        "PTH(?:Audio|eBook|music|ome|tv|WEB|)",
        "PTsbao",
        "OPS",
        "F(?:Fans(?:AIeNcE|BD|D(?:VD|IY)|TV|WEB)|HDMv)",
        "SGXT",
        "PuTao",
        "CMCT(?:V|)",
        "Shark(?:WEB|DIY|TV|MV|)",
        "TJUPT",
        "TTG",
        "WiKi",
        "NGB",
        "DoA",
        "(?:ARi|ExRE)N",
        "B(?:MDru|eyondHD|TN)",
        "C(?:fandora|trlhd|MRG)",
        "DON",
        "EVO",
        "FLUX",
        "HONE(?:yG|)",
        "N(?:oGroup|T(?:b|G))",
        "PandaMoon",
        "SMURF",
        "T(?:EPES|aengoo|rollHD )",
        "ANi",
        "HYSUB",
        "KTXP",
        "LoliHouse",
        "MCE",
        "Nekomoe kissaten",
        "SweetSub",
        "MingY",
        "(?:Lilith|NC)-Raws",
        "织梦字幕组",
        "枫叶字幕组",
        "猎户手抄部",
        "喵萌奶茶屋",
        "漫猫字幕社",
        "霜庭云花Sub",
        "北宇治字幕组",
        "氢气烤肉架",
        "云歌字幕组",
        "萌樱字幕组",
        "极影字幕社",
        "悠哈璃羽字幕社",
        "❀拨雪寻春❀",
        "沸羊羊(?:制作|字幕组)",
        "(?:桜|樱)都字幕组",
        "FROG(?:E|Web|)",
        "UB(?:its|WEB|TV)",
    ];
    FRAGMENTS.join("|")
}
/// Build the lookup table for common streaming platforms.
///
/// Every platform is reachable through two upper-cased keys: its short tag
/// (e.g. `NF`) and its full display name (e.g. `NETFLIX`). Both map to the
/// display name. Callers are expected to upper-case the token before lookup.
fn streaming_platform_lookup() -> HashMap<String, String> {
    // (short tag, display name). The original table listed ("HMAX", "Max")
    // twice; the redundant row is dropped, the resulting map is unchanged.
    const PLATFORMS: [(&str, &str); 13] = [
        ("AMZN", "Amazon"),
        ("NF", "Netflix"),
        ("ATVP", "Apple TV+"),
        ("DSNP", "Disney+"),
        ("PMTP", "Paramount+"),
        ("HMAX", "Max"),
        ("HULU", "Hulu Networks"),
        ("CR", "Crunchyroll"),
        ("Baha", "Baha"),
        ("BG", "B-Global"),
        ("HBO", "HBO"),
        ("CRAV", "Crave"),
        ("PCOK", "Peacock"),
    ];
    PLATFORMS
        .iter()
        // Register each platform under its short tag and under its own name.
        .flat_map(|&(short, full)| [(short, full), (full, full)])
        .map(|(alias, full)| (alias.to_uppercase(), full.to_string()))
        .collect()
}
/// 将 Rust 元信息转换为 Python dict。
fn meta_to_py(py: Python<'_>, meta: &MetaResult) -> PyResult<PyObject> {
let dict = PyDict::new(py);

View File

@@ -1,9 +1,11 @@
use chrono::{DateTime, Local, NaiveDate, NaiveDateTime, NaiveTime, TimeZone, Utc};
use chrono::{
DateTime, Datelike, Local, NaiveDate, NaiveDateTime, NaiveTime, Offset, TimeZone, Timelike, Utc,
};
use pyo3::prelude::*;
use pyo3::types::{PyAny, PyDict, PyList};
use quick_xml::events::{BytesRef, BytesStart, Event};
use quick_xml::name::QName;
use quick_xml::Reader;
use std::collections::HashMap;
#[derive(Default)]
struct RssItem {
@@ -35,8 +37,20 @@ pub(crate) fn parse_rss_items_fast(
) -> PyResult<Option<PyObject>> {
let parsed = parse_rss_items(xml_text, max_items)?;
let result = PyList::empty(py);
let datetime_mod = py.import("datetime")?;
let datetime_cls = datetime_mod.getattr("datetime")?;
let timezone_cls = datetime_mod.getattr("timezone")?;
let timedelta_cls = datetime_mod.getattr("timedelta")?;
let mut timezone_cache = HashMap::new();
for item in parsed {
result.append(item_to_py(py, &item)?)?;
result.append(item_to_py(
py,
&item,
&datetime_cls,
&timezone_cls,
&timedelta_cls,
&mut timezone_cache,
)?)?;
}
Ok(Some(result.into()))
}
@@ -45,7 +59,7 @@ pub(crate) fn parse_rss_items_fast(
fn parse_rss_items(xml_text: &str, max_items: usize) -> PyResult<Vec<RssItem>> {
let mut reader = Reader::from_str(xml_text);
let mut results = Vec::new();
let mut results = Vec::with_capacity(max_items.min(1024));
let mut current_item: Option<RssItem> = None;
let mut item_depth = 0usize;
let mut current_field: Option<(TextField, usize)> = None;
@@ -53,8 +67,9 @@ fn parse_rss_items(xml_text: &str, max_items: usize) -> PyResult<Vec<RssItem>> {
loop {
match reader.read_event() {
Ok(Event::Start(event)) => {
let local = local_name(event.name());
if current_item.is_none() && is_item_node(&local) {
let name = event.name();
let local = local_name(name.as_ref());
if current_item.is_none() && is_item_node(local) {
current_item = Some(RssItem::default());
item_depth = 1;
current_field = None;
@@ -63,25 +78,26 @@ fn parse_rss_items(xml_text: &str, max_items: usize) -> PyResult<Vec<RssItem>> {
if let Some(item) = current_item.as_mut() {
item_depth += 1;
handle_start_field(&event, &local, item, item_depth, &mut current_field)?;
handle_start_field(&event, local, item, item_depth, &mut current_field)?;
}
}
Ok(Event::Empty(event)) => {
let local = local_name(event.name());
let name = event.name();
let local = local_name(name.as_ref());
if let Some(item) = current_item.as_mut() {
handle_empty_field(&event, &local, item)?;
handle_empty_field(&event, local, item)?;
}
}
Ok(Event::Text(event)) => {
if let (Some(item), Some((field, _))) = (current_item.as_mut(), current_field) {
let text = event.decode().map_err(to_py_value_error)?.to_string();
append_text_field(item, field, &text);
let text = event.decode().map_err(to_py_value_error)?;
append_text_field(item, field, text.as_ref());
}
}
Ok(Event::CData(event)) => {
if let (Some(item), Some((field, _))) = (current_item.as_mut(), current_field) {
let text = event.decode().map_err(to_py_value_error)?.to_string();
append_text_field(item, field, &text);
let text = event.decode().map_err(to_py_value_error)?;
append_text_field(item, field, text.as_ref());
}
}
Ok(Event::GeneralRef(event)) => {
@@ -91,8 +107,9 @@ fn parse_rss_items(xml_text: &str, max_items: usize) -> PyResult<Vec<RssItem>> {
}
}
Ok(Event::End(event)) => {
let local = local_name(event.name());
if current_item.is_some() && item_depth == 1 && is_item_node(&local) {
let name = event.name();
let local = local_name(name.as_ref());
if current_item.is_some() && item_depth == 1 && is_item_node(local) {
if let Some(item) = current_item.take() {
if let Some(item) = finalize_item(item) {
results.push(item);
@@ -130,17 +147,17 @@ fn parse_rss_items(xml_text: &str, max_items: usize) -> PyResult<Vec<RssItem>> {
/// 处理开始标签,记录当前需要采集文本的字段和链接属性。
fn handle_start_field(
event: &BytesStart<'_>,
local: &str,
local: &[u8],
item: &mut RssItem,
depth: usize,
current_field: &mut Option<(TextField, usize)>,
) -> PyResult<()> {
if local == "enclosure" {
if local.eq_ignore_ascii_case(b"enclosure") {
fill_enclosure(event, item)?;
return Ok(());
}
if local == "link" {
if local.eq_ignore_ascii_case(b"link") {
fill_link_from_href(event, item)?;
}
@@ -153,25 +170,39 @@ fn handle_start_field(
}
/// 处理空标签,覆盖 Atom 的 link href 和 RSS 的 enclosure。
fn handle_empty_field(event: &BytesStart<'_>, local: &str, item: &mut RssItem) -> PyResult<()> {
match local {
"enclosure" => fill_enclosure(event, item)?,
"link" => fill_link_from_href(event, item)?,
_ => {}
fn handle_empty_field(event: &BytesStart<'_>, local: &[u8], item: &mut RssItem) -> PyResult<()> {
    // Self-closing tags carry their data in attributes only; tag names are
    // compared ASCII case-insensitively.
    if local.eq_ignore_ascii_case(b"enclosure") {
        fill_enclosure(event, item)?;
        return Ok(());
    }
    if local.eq_ignore_ascii_case(b"link") {
        fill_link_from_href(event, item)?;
    }
    Ok(())
}
/// 根据标签名和已采集状态选择当前文本字段。
fn pick_text_field(local: &str, item: &RssItem) -> Option<TextField> {
match local {
"title" if item.title.is_empty() => Some(TextField::Title),
"description" | "summary" if item.description.is_empty() => Some(TextField::Description),
"link" if item.link.is_empty() => Some(TextField::Link),
"pubDate" | "published" | "updated" if item.pubdate.is_empty() => Some(TextField::Pubdate),
"creator" if item.nickname.is_empty() => Some(TextField::Nickname),
_ => None,
fn pick_text_field(local: &[u8], item: &RssItem) -> Option<TextField> {
    // Tag names are matched ASCII case-insensitively.
    let tag_is = |name: &[u8]| local.eq_ignore_ascii_case(name);
    // First decide which field the tag feeds, then refuse it when that field
    // already holds text (the first occurrence wins).
    let (field, already_filled) = if tag_is(b"title") {
        (TextField::Title, !item.title.is_empty())
    } else if tag_is(b"description") || tag_is(b"summary") {
        (TextField::Description, !item.description.is_empty())
    } else if tag_is(b"link") {
        (TextField::Link, !item.link.is_empty())
    } else if tag_is(b"pubDate") || tag_is(b"published") || tag_is(b"updated") {
        (TextField::Pubdate, !item.pubdate.is_empty())
    } else if tag_is(b"creator") {
        (TextField::Nickname, !item.nickname.is_empty())
    } else {
        return None;
    };
    if already_filled {
        None
    } else {
        Some(field)
    }
}
/// 追加文本字段内容,兼容 CDATA 和带内联标签的描述。
@@ -266,7 +297,14 @@ fn finalize_item(mut item: RssItem) -> Option<RssItem> {
}
/// 将 Rust 条目转换为 Python dict字段名保持与 RssHelper.parse 原返回一致。
fn item_to_py(py: Python<'_>, item: &RssItem) -> PyResult<PyObject> {
fn item_to_py(
py: Python<'_>,
item: &RssItem,
datetime_cls: &Bound<'_, PyAny>,
timezone_cls: &Bound<'_, PyAny>,
timedelta_cls: &Bound<'_, PyAny>,
timezone_cache: &mut HashMap<i32, PyObject>,
) -> PyResult<PyObject> {
let dict = PyDict::new(py);
dict.set_item("title", &item.title)?;
dict.set_item("enclosure", &item.enclosure)?;
@@ -274,7 +312,17 @@ fn item_to_py(py: Python<'_>, item: &RssItem) -> PyResult<PyObject> {
dict.set_item("description", &item.description)?;
dict.set_item("link", &item.link)?;
if let Some(timestamp) = parse_pubdate_timestamp(&item.pubdate) {
dict.set_item("pubdate", py_datetime_from_timestamp(py, timestamp)?)?;
dict.set_item(
"pubdate",
py_datetime_from_timestamp(
py,
timestamp,
datetime_cls,
timezone_cls,
timedelta_cls,
timezone_cache,
)?,
)?;
} else {
dict.set_item("pubdate", "")?;
}
@@ -285,13 +333,41 @@ fn item_to_py(py: Python<'_>, item: &RssItem) -> PyResult<PyObject> {
}
/// 将 Unix 时间戳转换为本地时区 Python datetime匹配原 astimezone(tz=None) 语义。
fn py_datetime_from_timestamp<'py>(py: Python<'py>, timestamp: i64) -> PyResult<Bound<'py, PyAny>> {
let datetime_mod = py.import("datetime")?;
let datetime_cls = datetime_mod.getattr("datetime")?;
let timezone_cls = datetime_mod.getattr("timezone")?;
let utc = timezone_cls.getattr("utc")?;
let utc_dt = datetime_cls.call_method1("fromtimestamp", (timestamp, utc))?;
utc_dt.call_method0("astimezone")
fn py_datetime_from_timestamp<'py>(
    py: Python<'py>,
    timestamp: i64,
    datetime_cls: &Bound<'py, PyAny>,
    timezone_cls: &Bound<'py, PyAny>,
    timedelta_cls: &Bound<'py, PyAny>,
    timezone_cache: &mut HashMap<i32, PyObject>,
) -> PyResult<Bound<'py, PyAny>> {
    // Resolve the instant in the local zone on the Rust side. `earliest()`
    // returns the single result when there is one and the earlier candidate
    // when the result is ambiguous.
    let local_dt = match Local.timestamp_opt(timestamp, 0).earliest() {
        Some(resolved) => resolved,
        // chrono could not represent the timestamp: let Python convert it.
        None => return datetime_cls.call_method1("fromtimestamp", (timestamp,)),
    };
    let offset_seconds = local_dt.offset().fix().local_minus_utc();
    // One Python timezone object per distinct UTC offset, reused across items.
    if !timezone_cache.contains_key(&offset_seconds) {
        let delta = timedelta_cls.call1((0, offset_seconds))?;
        let fixed_zone = timezone_cls.call1((delta,))?.unbind();
        timezone_cache.insert(offset_seconds, fixed_zone);
    }
    let tzinfo = timezone_cache[&offset_seconds].clone_ref(py);
    // Positional datetime(...) arguments; microseconds are always 0.
    let fields = (
        local_dt.year(),
        local_dt.month(),
        local_dt.day(),
        local_dt.hour(),
        local_dt.minute(),
        local_dt.second(),
        0,
        tzinfo.bind(py),
    );
    datetime_cls.call1(fields)
}
/// 解析 RSS/Atom 常见日期格式并返回时间戳。
@@ -365,17 +441,13 @@ fn local_timestamp(naive: NaiveDateTime) -> Option<i64> {
}
/// 判断当前标签是否为 RSS item 或 Atom entry。
fn is_item_node(local: &str) -> bool {
matches!(local, "item" | "entry")
/// Returns `true` when `local` is the tag name `item` or `entry`, compared
/// ASCII case-insensitively.
fn is_item_node(local: &[u8]) -> bool {
    const ITEM_TAGS: [&[u8]; 2] = [b"item", b"entry"];
    ITEM_TAGS
        .iter()
        .any(|&tag| local.eq_ignore_ascii_case(tag))
}
/// 提取 XML 名称的本地部分,用于兼容 dc:creator 这类命名空间字段。
fn local_name(name: QName<'_>) -> String {
let raw = std::str::from_utf8(name.as_ref()).unwrap_or_default();
raw.rsplit_once(':')
.map(|(_, local)| local)
.unwrap_or(raw)
.to_string()
/// Returns the part of an XML name after its last `:`; a name without a colon
/// is returned unchanged.
fn local_name(raw: &[u8]) -> &[u8] {
    match raw.iter().rposition(|&byte| byte == b':') {
        Some(colon) => &raw[colon + 1..],
        None => raw,
    }
}
/// 将 quick-xml 错误转换为 Python ValueError 交给 Python 包装层判断是否兜底。

View File

@@ -1,5 +1,8 @@
use pyo3::prelude::*;
use pyo3::types::{PyAny, PyDict};
use pyo3::types::{PyAny, PyDict, PyList};
use regex::Regex;
use std::collections::HashMap;
use std::sync::Mutex;
/// 从 Python 字典读取可选字符串。
pub(crate) fn get_optional_string(dict: &Bound<'_, PyDict>, key: &str) -> PyResult<Option<String>> {
@@ -12,6 +15,22 @@ pub(crate) fn get_optional_string(dict: &Bound<'_, PyDict>, key: &str) -> PyResu
Ok(Some(value.str()?.to_str()?.to_string()))
}
/// Reads `key` from a Python dict as a whitespace-trimmed string.
///
/// Returns `Ok(None)` when `get_optional_string` yields no value or when the
/// trimmed text is empty.
pub(crate) fn get_optional_nonempty_string(
    dict: &Bound<'_, PyDict>,
    key: &str,
) -> PyResult<Option<String>> {
    let trimmed = get_optional_string(dict, key)?
        .map(|raw| raw.trim().to_string())
        .filter(|text| !text.is_empty());
    Ok(trimmed)
}
/// 从 Python 字典读取可选整数。
pub(crate) fn get_optional_i64(dict: &Bound<'_, PyDict>, key: &str) -> PyResult<Option<i64>> {
let Some(value) = dict.get_item(key)? else {
@@ -30,6 +49,54 @@ pub(crate) fn get_optional_i64(dict: &Bound<'_, PyDict>, key: &str) -> PyResult<
Ok(text.parse::<i64>().ok())
}
/// Reads `key` from a Python dict as an optional `f64`.
///
/// A missing key gives `Ok(None)`; a present value is converted by
/// `py_any_to_f64`.
pub(crate) fn get_optional_f64(dict: &Bound<'_, PyDict>, key: &str) -> PyResult<Option<f64>> {
    match dict.get_item(key)? {
        Some(value) => py_any_to_f64(&value),
        None => Ok(None),
    }
}
/// Reads `key` from a Python dict as a list of strings.
///
/// A missing key gives an empty vector; a present value is converted by
/// `py_any_to_string_list`, which also accepts a single non-list value.
pub(crate) fn get_string_list(dict: &Bound<'_, PyDict>, key: &str) -> PyResult<Vec<String>> {
    match dict.get_item(key)? {
        Some(value) => py_any_to_string_list(&value),
        None => Ok(Vec::new()),
    }
}
/// Reads a list of configuration strings stored under `key` in a Python dict.
///
/// * Missing key or Python `None` gives an empty vector.
/// * A Python `list`: every element must extract as `String` (otherwise the
///   extraction error is returned); empty strings are dropped.
/// * A single string is split on newline, `|` and `;`; each piece is trimmed
///   and empty pieces are dropped.
/// * Any other value gives an empty vector.
pub(crate) fn get_config_string_list(dict: &Bound<'_, PyDict>, key: &str) -> PyResult<Vec<String>> {
    let value = match dict.get_item(key)? {
        Some(found) if !found.is_none() => found,
        _ => return Ok(Vec::new()),
    };
    if let Ok(list) = value.downcast::<PyList>() {
        let mut entries = Vec::new();
        for element in list.iter() {
            let entry: String = element.extract()?;
            if !entry.is_empty() {
                entries.push(entry);
            }
        }
        return Ok(entries);
    }
    let Ok(joined) = value.extract::<String>() else {
        return Ok(Vec::new());
    };
    let entries = joined
        .split(|ch: char| matches!(ch, '\n' | '|' | ';'))
        .map(str::trim)
        .filter(|piece| !piece.is_empty())
        .map(str::to_owned)
        .collect();
    Ok(entries)
}
/// 将 Python 对象转换为 i64用于兼容配置里字符串或数字形式的下标。
pub(crate) fn extract_i64(value: &Bound<'_, PyAny>) -> PyResult<Option<i64>> {
if value.is_none() {
@@ -44,3 +111,105 @@ pub(crate) fn extract_i64(value: &Bound<'_, PyAny>) -> PyResult<Option<i64>> {
}
Ok(text.parse::<i64>().ok())
}
/// Converts a Python value to an optional `i64`.
///
/// `None` gives `Ok(None)`. A value that extracts directly as `i64` is
/// returned as is. Otherwise its `str()` form is trimmed and parsed; empty or
/// unparsable text gives `Ok(None)`.
pub(crate) fn py_any_to_i64(value: &Bound<'_, PyAny>) -> PyResult<Option<i64>> {
    if value.is_none() {
        return Ok(None);
    }
    if let Ok(number) = value.extract::<i64>() {
        return Ok(Some(number));
    }
    let py_text = value.str()?;
    let trimmed = py_text.to_str()?.trim();
    let parsed = if trimmed.is_empty() {
        None
    } else {
        trimmed.parse::<i64>().ok()
    };
    Ok(parsed)
}
/// Converts a Python value to an optional `f64`.
///
/// `None` gives `Ok(None)`. A value that extracts directly as `f64` is
/// returned as is. Otherwise its `str()` form is trimmed and parsed; empty or
/// unparsable text gives `Ok(None)`.
pub(crate) fn py_any_to_f64(value: &Bound<'_, PyAny>) -> PyResult<Option<f64>> {
    if value.is_none() {
        return Ok(None);
    }
    if let Ok(number) = value.extract::<f64>() {
        return Ok(Some(number));
    }
    let py_text = value.str()?;
    let trimmed = py_text.to_str()?.trim();
    let parsed = if trimmed.is_empty() {
        None
    } else {
        trimmed.parse::<f64>().ok()
    };
    Ok(parsed)
}
/// Converts a Python value to a list of strings.
///
/// * `None` gives an empty vector.
/// * A Python `list`: each element is rendered with `str()`; empty results are
///   dropped.
/// * Anything else is rendered with `str()` and returned as a one-element
///   vector, or an empty vector when the text is empty.
pub(crate) fn py_any_to_string_list(value: &Bound<'_, PyAny>) -> PyResult<Vec<String>> {
    if value.is_none() {
        return Ok(Vec::new());
    }
    let Ok(list) = value.downcast::<PyList>() else {
        let single = value.str()?.to_str()?.to_string();
        return Ok(if single.is_empty() {
            Vec::new()
        } else {
            vec![single]
        });
    };
    let mut texts = Vec::new();
    for element in list.iter() {
        let py_text = element.str()?;
        let text = py_text.to_str()?;
        if !text.is_empty() {
            texts.push(text.to_owned());
        }
    }
    Ok(texts)
}
/// Reads attribute `attr` from a Python object as an optional string.
///
/// A failed attribute lookup is returned as an error. A `None` attribute, or
/// one whose `str()` form is empty, gives `Ok(None)`.
pub(crate) fn object_optional_string(
    obj: &Bound<'_, PyAny>,
    attr: &str,
) -> PyResult<Option<String>> {
    let value = obj.getattr(attr)?;
    if value.is_none() {
        return Ok(None);
    }
    let py_text = value.str()?;
    let text = py_text.to_str()?;
    Ok((!text.is_empty()).then(|| text.to_owned()))
}
/// Reads attribute `attr` from a Python object and converts it with
/// `py_any_to_string_list`. A failed attribute lookup is returned as an error.
pub(crate) fn object_string_list(obj: &Bound<'_, PyAny>, attr: &str) -> PyResult<Vec<String>> {
    obj.getattr(attr)
        .and_then(|value| py_any_to_string_list(&value))
}
/// Reads attribute `attr` from a Python object and converts it with
/// `py_any_to_i64`. A failed attribute lookup is returned as an error.
pub(crate) fn object_optional_i64(obj: &Bound<'_, PyAny>, attr: &str) -> PyResult<Option<i64>> {
    obj.getattr(attr).and_then(|value| py_any_to_i64(&value))
}
/// Reads attribute `attr` from a Python object and converts it with
/// `py_any_to_f64`. A failed attribute lookup is returned as an error.
pub(crate) fn object_optional_f64(obj: &Bound<'_, PyAny>, attr: &str) -> PyResult<Option<f64>> {
    obj.getattr(attr).and_then(|value| py_any_to_f64(&value))
}
/// Returns a compiled regex for `pattern`, reusing a previously compiled
/// instance from `cache` when one exists.
///
/// Returns `None` when `pattern` does not compile; failed patterns are not
/// stored, so they are recompiled on every call.
///
/// The lock is released while compiling, so two threads may compile the same
/// pattern concurrently; the later insert simply overwrites the earlier one.
pub(crate) fn cached_regex(cache: &Mutex<HashMap<String, Regex>>, pattern: &str) -> Option<Regex> {
    // A poisoned mutex only records that some thread panicked while holding the
    // guard. Recover the guard instead of skipping the cache, otherwise one
    // panic would silently turn every later call into a full recompilation.
    let cached = cache
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner)
        .get(pattern)
        .cloned();
    if let Some(regex) = cached {
        return Some(regex);
    }
    let regex = Regex::new(pattern).ok()?;
    cache
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner)
        .insert(pattern.to_string(), regex.clone());
    Some(regex)
}

View File

@@ -0,0 +1,182 @@
import argparse
import statistics
import sys
import time
from datetime import datetime, timedelta
from pathlib import Path
from types import SimpleNamespace
# Parent of this script's directory (presumably the repository root), placed
# first on sys.path before the `app.*` imports that follow.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(PROJECT_ROOT))
from app.core.context import MediaInfo, TorrentInfo
from app.modules.filter import FilterModule
from app.modules.filter.RuleParser import RuleParser
class StaticRuleHelper:
    """
    Rule helper stand-in that serves a fixed list of rule groups, so the
    benchmark does not depend on system configuration or database access.
    """

    def __init__(self, groups):
        self._groups = groups

    def get_rule_group_by_media(self, media=None, group_names=None):  # noqa: ARG002
        """
        Return the stored groups whose ``name`` is in ``group_names``.

        When ``group_names`` is empty or ``None`` the stored list itself is
        returned. ``media`` is accepted but ignored.
        """
        if group_names:
            return list(filter(lambda candidate: candidate.name in group_names, self._groups))
        return self._groups
def build_module() -> FilterModule:
    """
    Build a FilterModule backed by two fixed rule groups and an in-memory rule set.

    The rule set covers regex include/exclude, label and site-name matching,
    download volume factor, seeders, publish time, size range and TMDB fields.
    """
    rule_groups = [
        SimpleNamespace(name="quality", rule_string="(HDR | DV) & !BLU > WEBDL & FREE"),
        SimpleNamespace(name="site", rule_string="LABEL & SITE & SIZE & SEED & PUB & TMDB"),
    ]
    rule_set = {
        "HDR": {"include": r"[\s.]+HDR[\s.]+|HDR10|HDR10\+"},
        "DV": {"include": r"DOVI|Dolby[\s.]+Vision"},
        "BLU": {
            "include": [r"\bBlu-?Ray\b"],
            "exclude": [r"(?<!WEB-|HDTV)RIP"],
        },
        "WEBDL": {"include": r"WEB-?DL|WEB-?RIP"},
        "FREE": {"downloadvolumefactor": 0},
        "LABEL": {"include": "官方", "match": ["labels"]},
        "SITE": {"include": "Alpha|Beta", "match": ["site_name"]},
        "SIZE": {"size_range": "100-500"},
        "SEED": {"seeders": "5"},
        "PUB": {"publish_time": "0-2880"},
        "TMDB": {"tmdb": {"original_language": "zh,cn"}},
    }

    module = FilterModule()
    module.rulehelper = StaticRuleHelper(rule_groups)
    module.rule_set = rule_set
    return module
def build_torrents(count: int) -> list[TorrentInfo]:
    """
    Build ``count`` torrents with a fixed, index-driven shape.

    Every third torrent (index divisible by 3) gets the "unmatched" profile
    (BluRay title, reposted label, large size, one seeder, non-free); the rest
    get the "matched" profile. All share a publish time 90 minutes in the past.
    """
    pubdate = (datetime.now() - timedelta(minutes=90)).strftime("%Y-%m-%d %H:%M:%S")
    mib = 1024 * 1024
    torrents = []
    for index in range(count):
        episode = index % 12 + 1
        if index % 3 != 0:
            title = f"Example Show S01E{episode:02d}-E{episode + 1:02d} 1080p WEB-DL HDR10"
            labels, size_mib, seeders, volume_factor = ["官方"], 700, 20, 0
        else:
            title = f"Example Show S01E{episode:02d} 1080p BluRay HDR"
            labels, size_mib, seeders, volume_factor = ["转载"], 2400, 1, 1
        torrent = TorrentInfo(
            site_name="Alpha" if index % 2 else "Beta",
            title=title,
            description="简繁中字",
            labels=labels,
            size=size_mib * mib,
            seeders=seeders,
            downloadvolumefactor=volume_factor,
            pubdate=pubdate,
        )
        torrents.append(torrent)
    return torrents
def build_mediainfo() -> MediaInfo:
    """
    Return a MediaInfo whose ``original_language`` is ``"zh"``, the value the
    benchmark's TMDB rule lists.
    """
    info = MediaInfo()
    setattr(info, "original_language", "zh")
    return info
def reset_priority(torrents: list[TorrentInfo]) -> None:
    """
    Set ``pri_order`` back to 0 on every torrent, so a priority written by one
    filter run is not carried into the next measurement.
    """
    for entry in torrents:
        setattr(entry, "pri_order", 0)
def python_filter(module: FilterModule, rule_groups: list[str], torrent_list: list[TorrentInfo], mediainfo: MediaInfo):
    """
    Baseline: run the module's private ``__filter_torrents`` once per rule
    group, feeding each group's output into the next.

    One RuleParser and one parsed-rule cache are shared across all groups.
    """
    shared_kwargs = {
        "mediainfo": mediainfo,
        "parser": RuleParser(),
        "parsed_rule_cache": {},
    }
    remaining = torrent_list
    for group in module.rulehelper.get_rule_group_by_media(media=mediainfo, group_names=rule_groups):
        # The private method is reached through its name-mangled attribute.
        remaining = module._FilterModule__filter_torrents(
            rule_string=group.rule_string,
            rule_name=group.name,
            torrent_list=remaining,
            **shared_kwargs,
        )
    return remaining
def rust_filter(module: FilterModule, rule_groups: list[str], torrent_list: list[TorrentInfo], mediainfo: MediaInfo):
    """
    Call the public ``filter_torrents`` entry point, which the benchmark treats
    as the Rust-backed filter chain.
    """
    filtered = module.filter_torrents(
        rule_groups=rule_groups,
        torrent_list=torrent_list,
        mediainfo=mediainfo,
    )
    return filtered
def measure(func, module: FilterModule, torrents: list[TorrentInfo], mediainfo: MediaInfo, loops: int, repeats: int):
    """
    Time ``func`` over ``repeats`` rounds of ``loops`` calls each.

    Returns a tuple of (median milliseconds per loop across the rounds, number
    of torrents returned by the last call). Priorities are reset before every
    call, and that reset is included in the measured time.
    """
    rule_groups = ["quality", "site"]
    per_loop_ms = []
    filtered_count = 0
    for _round in range(repeats):
        started = time.perf_counter()
        for _iteration in range(loops):
            reset_priority(torrents)
            result = func(module, rule_groups, torrents, mediainfo)
            filtered_count = len(result)
        elapsed = time.perf_counter() - started
        per_loop_ms.append(elapsed * 1000 / loops)
    return statistics.median(per_loop_ms), filtered_count
def _positive_int(raw: str) -> int:
    """Argparse ``type`` callable: parse ``raw`` as an integer and require it to be >= 1."""
    try:
        value = int(raw)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {raw!r}") from None
    if value < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {value}")
    return value


def parse_args():
    """
    Parse the benchmark's command-line options from ``sys.argv``.

    ``--loops`` and ``--repeats`` must be positive: the per-loop time is
    computed by dividing by ``loops`` and the median is taken over ``repeats``
    samples, so zero or negative values would crash mid-benchmark instead of
    producing a usage error.
    """
    parser = argparse.ArgumentParser(description="Benchmark FilterModule.filter_torrents Rust entry")
    parser.add_argument("--items", type=int, default=2000, help="Torrent count per loop")
    parser.add_argument("--loops", type=_positive_int, default=20, help="Loops per repeat")
    parser.add_argument("--repeats", type=_positive_int, default=5, help="Repeat count")
    return parser.parse_args()
def main() -> int:
    """
    Benchmark the Rust entry point against the legacy Python filter chain on
    the same inputs, print the timings and the speedup, and return exit code 0.
    """
    args = parse_args()
    module = build_module()
    torrents = build_torrents(args.items)
    mediainfo = build_mediainfo()

    bench_inputs = (module, torrents, mediainfo, args.loops, args.repeats)
    rust_ms, rust_count = measure(rust_filter, *bench_inputs)
    python_ms, python_count = measure(python_filter, *bench_inputs)

    # Guard the division in case the Rust timing is reported as zero.
    if rust_ms:
        speedup = python_ms / rust_ms
    else:
        speedup = 0

    report = (
        f"items_per_loop={len(torrents)} loops={args.loops} repeats={args.repeats}",
        f"rust_items={rust_count} python_items={python_count}",
        f"rust_chain_ms_per_loop={rust_ms:.3f}",
        f"python_chain_ms_per_loop={python_ms:.3f}",
        f"speedup={speedup:.2f}x",
    )
    for line in report:
        print(line)
    return 0
# Run the benchmark when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())

View File

@@ -180,6 +180,7 @@ def _metainfo_options(custom_words=None):
"media_exts": settings.RMT_MEDIAEXT + settings.RMT_SUBEXT + settings.RMT_AUDIOEXT,
"release_groups": release_groups,
"customization": customization,
"streaming_platforms": metainfo_module._rust_parse_options()["streaming_platforms"],
}

View File

@@ -1,7 +1,8 @@
import unittest
from datetime import datetime, timedelta
from types import SimpleNamespace
from app.core.context import TorrentInfo
from app.core.context import MediaInfo, TorrentInfo
from app.helper.torrent import TorrentHelper
from app.modules.filter import FilterModule
@@ -64,6 +65,76 @@ class TorrentFilterTest(unittest.TestCase):
self.assertEqual([torrent], filtered)
self.assertEqual(100, torrent.pri_order)
def test_filter_torrents_keeps_sequential_rule_group_semantics(self):
    """
    With rule groups "first" (HDR) and "second" (FREE), only the torrent that
    satisfies both is kept, and it ends with priority 100.
    """
    module = FilterModule()
    rule_groups = [
        SimpleNamespace(name="first", rule_string="HDR"),
        SimpleNamespace(name="second", rule_string="FREE"),
    ]
    module.rulehelper = _RuleHelper(rule_groups)
    module.rule_set = {
        "HDR": {"include": "HDR"},
        "FREE": {"downloadvolumefactor": 0},
    }
    # Same title for both; they differ only in download volume factor.
    keep = TorrentInfo(title="Movie HDR WEB-DL", description="", downloadvolumefactor=0)
    drop = TorrentInfo(title="Movie HDR WEB-DL", description="", downloadvolumefactor=1)

    filtered = module.filter_torrents(
        rule_groups=["first", "second"],
        torrent_list=[keep, drop],
    )

    self.assertEqual([keep], filtered)
    self.assertEqual(100, keep.pri_order)
def test_filter_torrents_supports_full_rule_fields_in_rust_entry(self):
    """
    A torrent satisfying TMDB, label, size-range, seeders, publish-time and
    site-name rules at once passes ``filter_torrents`` with priority 100.
    """
    rule_set = {
        "TMDB": {"tmdb": {"original_language": "zh,cn"}},
        "LABEL": {"include": "官方", "match": ["labels"]},
        "SIZE": {"size_range": "100-400"},
        "SEED": {"seeders": "5"},
        "PUB": {"publish_time": "0-120"},
        "SITE": {"include": "Alpha", "match": ["site_name"]},
    }
    module = _build_filter_module(
        rule_string="TMDB & LABEL & SIZE & SEED & PUB & SITE",
        rule_set=rule_set,
    )
    published = datetime.now() - timedelta(minutes=30)
    torrent = TorrentInfo(
        site_name="Alpha",
        title="Show S01E01-E02 1080p",
        description="",
        labels=["官方"],
        size=600 * 1024 * 1024,
        seeders=8,
        pubdate=published.strftime("%Y-%m-%d %H:%M:%S"),
    )
    mediainfo = MediaInfo()
    mediainfo.original_language = "zh"

    filtered = module.filter_torrents(
        rule_groups=["test"],
        torrent_list=[torrent],
        mediainfo=mediainfo,
    )

    self.assertEqual([torrent], filtered)
    self.assertEqual(100, torrent.pri_order)
def test_filter_torrents_uses_rust_entry_without_python_match_fallback(self):
    """
    ``filter_torrents`` must not call the private Python ``__filter_torrents``:
    that attribute is replaced with a function that raises.
    """
    module = _build_filter_module(
        rule_string="KEEP",
        rule_set={"KEEP": {"include": "Movie"}},
    )

    def forbidden_fallback(*_args, **_kwargs):
        """
        Fail the test immediately if the legacy private matcher is invoked.
        """
        raise AssertionError("Python fallback should not be called")

    # Overwrite the name-mangled private method on this instance.
    module._FilterModule__filter_torrents = forbidden_fallback
    candidates = [TorrentInfo(title="Movie", description="")]

    filtered = module.filter_torrents(rule_groups=["test"], torrent_list=candidates)

    self.assertEqual(1, len(filtered))
def test_filter_torrent_keeps_extra_filter_semantics(self):
torrent = TorrentInfo(
title="Movie 1080p HDR",