mirror of
https://github.com/jxxghp/MoviePilot.git
synced 2026-06-20 23:14:32 +08:00
perf: optimize rust acceleration paths
Rust vs Python benchmark results:
- RSS: Rust 0.299 ms/loop vs Python 7.913 ms/loop, 26.47x faster
- Filter: Rust 12.740 ms/loop vs Python 57.187 ms/loop, 4.49x faster
- MetaInfo: Rust 64.680 ms/loop vs Python 316.158 ms/loop, 4.89x faster
- Indexer agsvpt: Rust 145.76 ms vs Python 3686.50 ms, 25.29x faster
- Indexer pttime: Rust 166.51 ms vs Python 4019.87 ms, 24.14x faster
- Indexer chdbits: Rust 161.17 ms vs Python 3604.28 ms, 22.36x faster
- Indexer iptorrents: Rust 77.82 ms vs Python 17615.52 ms, 226.36x faster

Validation:
- cargo fmt/check/test for rust/moviepilot_rust
- pytest Rust-related coverage: tests/test_rust_accel.py, tests/test_torrent_filter.py, tests/test_metainfo.py, tests/test_indexer_spider_search_url.py, tests/test_workflow_fetch_rss.py
- tests/run.py legacy suite
- pylint app/ --errors-only
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
from pathlib import Path
|
||||
from functools import lru_cache
|
||||
from typing import Tuple, List, Optional
|
||||
|
||||
import regex as re
|
||||
@@ -123,12 +124,14 @@ def _build_meta_info(
|
||||
return meta
|
||||
|
||||
|
||||
def _rust_parse_options(custom_words: List[str] = None) -> dict:
|
||||
@lru_cache(maxsize=1)
|
||||
def _rust_default_parse_options() -> dict:
|
||||
"""
|
||||
收集 Rust Meta 解析所需的运行时配置,避免 Rust 层直接访问数据库和 settings。
|
||||
缓存 Rust Meta 默认解析配置,避免热路径反复读取配置并复制流媒体平台大表。
|
||||
"""
|
||||
from app.core.meta.customization import CustomizationMatcher
|
||||
from app.core.meta.releasegroup import ReleaseGroupsMatcher
|
||||
from app.core.meta.streamingplatform import StreamingPlatforms
|
||||
from app.db.systemconfig_oper import SystemConfigOper
|
||||
from app.schemas.types import SystemConfigKey
|
||||
|
||||
@@ -144,17 +147,42 @@ def _rust_parse_options(custom_words: List[str] = None) -> dict:
|
||||
customization = CustomizationMatcher._normalize_customization(
|
||||
systemconfig.get(SystemConfigKey.Customization)
|
||||
)
|
||||
words = custom_words
|
||||
if words is None:
|
||||
words = systemconfig.get(SystemConfigKey.CustomIdentifiers) or []
|
||||
return {
|
||||
"custom_words": words or [],
|
||||
"custom_words": systemconfig.get(SystemConfigKey.CustomIdentifiers) or [],
|
||||
"media_exts": settings.RMT_MEDIAEXT + settings.RMT_SUBEXT + settings.RMT_AUDIOEXT,
|
||||
"release_groups": release_groups,
|
||||
"customization": customization,
|
||||
"streaming_platforms": StreamingPlatforms()._lookup_cache,
|
||||
}
|
||||
|
||||
|
||||
@lru_cache(maxsize=256)
def _rust_custom_parse_options(custom_words: Tuple[str, ...]) -> dict:
    """
    Cache the Rust Meta parse options for one set of custom identifier words.

    The result starts from a shallow copy of the cached default options and
    only replaces ``custom_words``, so the same word set never rebuilds the
    options object.

    :param custom_words: custom identifier words as a tuple (hashable cache key)
    :return: options dict; the same dict object is returned again on a cache hit
    """
    # The unpacked copy keeps the cached default dict untouched; the key order
    # is unchanged because "custom_words" already exists in the defaults.
    return {**_rust_default_parse_options(), "custom_words": list(custom_words)}
|
||||
|
||||
|
||||
def _rust_parse_options(custom_words: List[str] = None) -> dict:
    """
    Collect the runtime configuration needed by the Rust Meta parser, so the
    Rust layer does not access the database or settings directly.

    :param custom_words: explicit custom identifier words; ``None`` selects the
                         cached default options
    :return: options dict served from the matching cache
    """
    if custom_words is not None:
        # lru_cache needs a hashable argument, hence the tuple conversion.
        return _rust_custom_parse_options(tuple(custom_words or []))
    return _rust_default_parse_options()
|
||||
|
||||
|
||||
def clear_rust_parse_options_cache() -> None:
    """
    Clear the cached Rust Meta parse options (the default options and every
    per-custom-word variant) so they are rebuilt after a system configuration
    change.
    """
    for cached_builder in (_rust_default_parse_options, _rust_custom_parse_options):
        cached_builder.cache_clear()
|
||||
|
||||
|
||||
def _meta_from_rust(parsed: dict) -> Optional[MetaBase]:
|
||||
"""
|
||||
将 Rust 解析结果灌回现有 MetaVideo/MetaAnime 对象,保留下游属性和方法兼容性。
|
||||
|
||||
@@ -228,6 +228,37 @@ class RssHelper:
|
||||
},
|
||||
}
|
||||
|
||||
def __decode_fast_text(self, raw_data: bytes, ret) -> Optional[str]:
    """
    Quickly decode the raw response body, trying the encoding declared on the
    response first and UTF-8 second, to feed the Rust parsing fast path.

    :param raw_data: raw response bytes
    :param ret: response object; only its optional ``encoding`` attribute is read
    :return: the decoded text, or ``None`` when no candidate encoding works
    """
    seen_encodings = set()
    for encoding in (getattr(ret, "encoding", None), "utf-8"):
        # A declared encoding that is missing or not a string cannot be used.
        if not encoding or not isinstance(encoding, str):
            continue
        # Codec names are case-insensitive; avoid retrying "UTF-8" as "utf-8".
        normalized = encoding.lower()
        if normalized in seen_encodings:
            continue
        seen_encodings.add(normalized)
        try:
            return raw_data.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # UnicodeDecodeError: the bytes are not valid in this encoding.
            # LookupError: the declared charset name is unknown to Python.
            # Either way, fall through to the next candidate.
            continue
    return None
|
||||
|
||||
def __parse_with_rust(self, ret_xml: Optional[str]) -> Optional[list]:
    """
    Call the Rust RSS parser, applying the basic XML sanity check and the
    maximum item limit in one place.

    :param ret_xml: decoded RSS text, may be ``None``
    :return: at most ``MAX_RSS_ITEMS`` parsed items, or ``None`` when the text
             is missing, blank, does not start with ``<``, or the Rust parser
             returned ``None``
    """
    # Missing text, or text whose first non-blank character is not '<'
    # (which also covers blank text), never reaches the Rust parser.
    if not ret_xml:
        return None
    if not ret_xml.strip().startswith('<'):
        return None
    limit = self.MAX_RSS_ITEMS
    # Request one item beyond the limit so an overflow can be detected.
    parsed_items = rust_accel.parse_rss_items(ret_xml, limit + 1)
    if parsed_items is None:
        return None
    if len(parsed_items) > limit:
        logger.warning(f"RSS条目过多: 超过{self.MAX_RSS_ITEMS},仅处理前{self.MAX_RSS_ITEMS}个")
    return parsed_items[:limit]
|
||||
|
||||
def parse(self, url, proxy: bool = False,
|
||||
timeout: Optional[int] = 15, headers: dict = None, ua: str = None) -> Union[List[dict], None, bool]:
|
||||
"""
|
||||
@@ -270,21 +301,26 @@ class RssHelper:
|
||||
return False
|
||||
|
||||
if raw_data:
|
||||
try:
|
||||
result = chardet.detect(raw_data)
|
||||
encoding = result['encoding']
|
||||
# 解码为字符串
|
||||
ret_xml = raw_data.decode(encoding)
|
||||
except Exception as e:
|
||||
logger.debug(f"chardet解码失败:{str(e)}")
|
||||
# 探测utf-8解码
|
||||
match = re.search(r'encoding\s*=\s*["\']([^"\']+)["\']', ret.text)
|
||||
if match:
|
||||
encoding = match.group(1)
|
||||
if encoding:
|
||||
ret_xml = raw_data.decode(encoding)
|
||||
else:
|
||||
ret.encoding = ret.apparent_encoding
|
||||
ret_xml = self.__decode_fast_text(raw_data, ret)
|
||||
rust_items = self.__parse_with_rust(ret_xml)
|
||||
if rust_items is not None:
|
||||
return rust_items
|
||||
if not ret_xml:
|
||||
try:
|
||||
result = chardet.detect(raw_data)
|
||||
encoding = result['encoding']
|
||||
# 解码为字符串
|
||||
ret_xml = raw_data.decode(encoding)
|
||||
except Exception as e:
|
||||
logger.debug(f"chardet解码失败:{str(e)}")
|
||||
# 探测utf-8解码
|
||||
match = re.search(r'encoding\s*=\s*["\']([^"\']+)["\']', ret.text)
|
||||
if match:
|
||||
encoding = match.group(1)
|
||||
if encoding:
|
||||
ret_xml = raw_data.decode(encoding)
|
||||
else:
|
||||
ret.encoding = ret.apparent_encoding
|
||||
if not ret_xml:
|
||||
ret_xml = ret.text
|
||||
|
||||
@@ -299,11 +335,9 @@ class RssHelper:
|
||||
logger.error("RSS内容不是有效的XML格式")
|
||||
return False
|
||||
|
||||
rust_items = rust_accel.parse_rss_items(ret_xml, self.MAX_RSS_ITEMS + 1)
|
||||
rust_items = self.__parse_with_rust(ret_xml)
|
||||
if rust_items is not None:
|
||||
if len(rust_items) > self.MAX_RSS_ITEMS:
|
||||
logger.warning(f"RSS条目过多: 超过{self.MAX_RSS_ITEMS},仅处理前{self.MAX_RSS_ITEMS}个")
|
||||
return rust_items[:self.MAX_RSS_ITEMS]
|
||||
return rust_items
|
||||
|
||||
# 使用lxml.etree解析XML
|
||||
parser = None
|
||||
|
||||
@@ -4,13 +4,14 @@ from functools import lru_cache
|
||||
from typing import List, Tuple, Union, Dict, Optional
|
||||
|
||||
from app.core.context import TorrentInfo, MediaInfo
|
||||
from app.core.metainfo import MetaInfo
|
||||
from app.core.metainfo import MetaInfo, clear_rust_parse_options_cache, _rust_parse_options
|
||||
from app.helper.rule import RuleHelper
|
||||
from app.log import logger
|
||||
from app.modules import _ModuleBase
|
||||
from app.modules.filter.RuleParser import RuleParser
|
||||
from app.modules.filter.builtin_rules import BUILTIN_RULE_SET
|
||||
from app.schemas.types import ModuleType, OtherModulesType, SystemConfigKey
|
||||
from app.utils import rust_accel
|
||||
from app.utils.string import StringUtils
|
||||
|
||||
|
||||
@@ -60,7 +61,12 @@ def _parse_publish_time(publish_time: str) -> Tuple[float, ...]:
|
||||
|
||||
|
||||
class FilterModule(_ModuleBase):
|
||||
CONFIG_WATCH = {SystemConfigKey.CustomFilterRules.value}
|
||||
CONFIG_WATCH = {
|
||||
SystemConfigKey.CustomFilterRules.value,
|
||||
SystemConfigKey.CustomIdentifiers.value,
|
||||
SystemConfigKey.CustomReleaseGroups.value,
|
||||
SystemConfigKey.Customization.value,
|
||||
}
|
||||
|
||||
# 保留一份只读内置规则定义,方便查询工具准确区分“内置规则”和“自定义规则”。
|
||||
builtin_rule_set: Dict[str, dict] = deepcopy(BUILTIN_RULE_SET)
|
||||
@@ -76,6 +82,13 @@ class FilterModule(_ModuleBase):
|
||||
self.rule_set = deepcopy(self.builtin_rule_set)
|
||||
self.__init_custom_rules()
|
||||
|
||||
def on_config_changed(self):
    """
    React to a change of the custom filter or Meta recognition configuration:
    refresh the Rust Meta options cache and re-initialise the module.
    """
    # Invalidate the cached options first, then rebuild the module state.
    clear_rust_parse_options_cache()
    self.init_module()
|
||||
|
||||
def __init_custom_rules(self):
|
||||
"""
|
||||
加载用户自定义规则,如跟内置规则冲突,以用户自定义规则为准
|
||||
@@ -131,25 +144,37 @@ class FilterModule(_ModuleBase):
|
||||
"""
|
||||
if not rule_groups:
|
||||
return torrent_list
|
||||
parser = RuleParser()
|
||||
# 同一轮过滤里,相同的优先级层级会被多个种子反复使用;按需解析并缓存,
|
||||
# 既减少 pyparsing 开销,也保留原来“命中高优先级后不解析低层级”的容错行为。
|
||||
parsed_rule_cache: Dict[str, Union[list, str]] = {}
|
||||
# 查询规则表详情
|
||||
groups = self.rulehelper.get_rule_group_by_media(media=mediainfo, group_names=rule_groups)
|
||||
if groups:
|
||||
for group in groups:
|
||||
# 过滤种子
|
||||
torrent_list = self.__filter_torrents(
|
||||
rule_string=group.rule_string,
|
||||
rule_name=group.name,
|
||||
torrent_list=torrent_list,
|
||||
mediainfo=mediainfo,
|
||||
parser=parser,
|
||||
parsed_rule_cache=parsed_rule_cache,
|
||||
)
|
||||
group_defs = [group.model_dump() if hasattr(group, "model_dump") else vars(group) for group in groups]
|
||||
matched_orders = rust_accel.filter_torrents(
|
||||
groups=group_defs,
|
||||
torrent_list=torrent_list,
|
||||
rule_set=self.rule_set,
|
||||
mediainfo=mediainfo,
|
||||
metainfo_options=_rust_parse_options() if self.__needs_metainfo_options(group_defs) else None,
|
||||
)
|
||||
ret_torrents = []
|
||||
for index, pri_order in matched_orders:
|
||||
torrent = torrent_list[index]
|
||||
torrent.pri_order = pri_order
|
||||
ret_torrents.append(torrent)
|
||||
return ret_torrents
|
||||
return torrent_list
|
||||
|
||||
def __needs_metainfo_options(self, groups: List[dict]) -> bool:
    """
    Tell whether the current rule chain can trigger a ``size_range`` check, so
    the MetaInfo runtime configuration is not read when no size rule is used.

    :param groups: rule group dicts; only their ``rule_string`` entry is read
    :return: True when any rule id found in the rule strings maps to a rule in
             ``self.rule_set`` with a truthy ``size_range``
    """
    rule_strings = (group.get("rule_string") for group in groups)
    # Collect every rule id token that appears in a non-empty rule string.
    rule_ids = {
        rule_id
        for rule_string in rule_strings
        if rule_string
        for rule_id in re.findall(r"[A-Za-z][A-Za-z0-9]*|[0-9]+[A-Za-z][A-Za-z0-9]*", rule_string)
    }
    for rule_id in rule_ids:
        if self.rule_set.get(rule_id, {}).get("size_range"):
            return True
    return False
|
||||
|
||||
def __filter_torrents(self, rule_string: str, rule_name: str,
|
||||
torrent_list: List[TorrentInfo],
|
||||
mediainfo: MediaInfo,
|
||||
|
||||
@@ -39,6 +39,31 @@ def parse_filter_rule(expression: str) -> Optional[list]:
|
||||
return None
|
||||
|
||||
|
||||
def filter_torrents(
        groups: list,
        torrent_list: list,
        rule_set: dict,
        mediainfo=None,
        metainfo_options: Optional[dict] = None,
) -> list:
    """
    Run the complete torrent filter through the Rust entry point and return
    the indexes into the original list together with their priorities.

    :param groups: rule group definitions
    :param torrent_list: torrents to filter
    :param rule_set: rule definitions keyed by rule id
    :param mediainfo: optional media information
    :param metainfo_options: optional Meta parse options; an empty dict is
                             passed to Rust when this is falsy
    :return: whatever ``filter_torrents_fast`` returns
    :raises RuntimeError: when the Rust extension is not available
    """
    if not _moviepilot_rust:
        raise RuntimeError(f"Rust 扩展不可用,无法执行种子过滤: {_import_error}")
    options = metainfo_options or {}
    try:
        matched = _moviepilot_rust.filter_torrents_fast(
            groups,
            torrent_list,
            rule_set,
            mediainfo,
            options,
        )
    except BaseException as err:
        # The helper gets the first look at the error; if it returns, the
        # original exception is re-raised unchanged.
        _raise_non_rust_panic(err)
        raise
    return matched
|
||||
|
||||
|
||||
def parse_indexer_torrents(
|
||||
html_text: str,
|
||||
domain: str,
|
||||
|
||||
27
rust/moviepilot_rust/Cargo.lock
generated
27
rust/moviepilot_rust/Cargo.lock
generated
@@ -32,6 +32,21 @@ version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
|
||||
|
||||
[[package]]
|
||||
name = "bit-set"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
|
||||
dependencies = [
|
||||
"bit-vec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bit-vec"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "2.11.1"
|
||||
@@ -161,6 +176,17 @@ version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8"
|
||||
|
||||
[[package]]
|
||||
name = "fancy-regex"
|
||||
version = "0.14.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298"
|
||||
dependencies = [
|
||||
"bit-set",
|
||||
"regex-automata",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "find-msvc-tools"
|
||||
version = "0.1.9"
|
||||
@@ -507,6 +533,7 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anitomy-pure",
|
||||
"chrono",
|
||||
"fancy-regex",
|
||||
"inputx-pinyin",
|
||||
"minijinja",
|
||||
"once_cell",
|
||||
|
||||
@@ -9,6 +9,7 @@ crate-type = ["cdylib"]
|
||||
|
||||
[dependencies]
|
||||
anitomy-pure = "0.1"
|
||||
fancy-regex = "0.14"
|
||||
inputx-pinyin = "1.0.2"
|
||||
minijinja = "2.20"
|
||||
chrono = "0.4"
|
||||
|
||||
@@ -1,6 +1,21 @@
|
||||
use crate::metainfo::parse_total_episode_for_filter;
|
||||
use crate::utils::{
|
||||
get_optional_f64, get_optional_i64, get_optional_nonempty_string, get_string_list,
|
||||
object_optional_f64, object_optional_i64, object_optional_string, object_string_list,
|
||||
py_any_to_string_list,
|
||||
};
|
||||
use chrono::{Local, NaiveDateTime};
|
||||
use fancy_regex::Regex as FancyRegex;
|
||||
use once_cell::sync::Lazy;
|
||||
use pyo3::exceptions::PyValueError;
|
||||
use pyo3::prelude::*;
|
||||
use pyo3::types::{PyList, PyString};
|
||||
use pyo3::types::{PyAny, PyDict, PyList, PyString};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::sync::Mutex;
|
||||
|
||||
static REGEX_CACHE: Lazy<Mutex<HashMap<String, FancyRegex>>> =
|
||||
Lazy::new(|| Mutex::new(HashMap::new()));
|
||||
const SIZE_UNIT: f64 = 1024.0 * 1024.0;
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
enum RuleExpr {
|
||||
@@ -19,6 +34,70 @@ enum Token {
|
||||
LParen,
|
||||
RParen,
|
||||
}
|
||||
|
||||
/// One rule group, already split into its priority levels.
#[derive(Clone)]
struct FilterGroup {
    // Non-empty, trimmed segments of the group's rule string split on '>',
    // in their original order.
    levels: Vec<String>,
}
|
||||
|
||||
/// Lookup table over the Python rule set.
struct RuleMatcher {
    // Rule name -> the Python object stored for that rule in the rule set.
    rules: HashMap<String, PyObject>,
    // Union of every "match" field name listed by the dict-typed rules.
    match_fields: HashSet<String>,
}
|
||||
|
||||
/// Filter-relevant values copied out of a Python torrent object.
struct TorrentSnapshot {
    // `title` attribute; empty string when absent.
    title: String,
    // `description` attribute; empty string when absent.
    description: String,
    // `labels` attribute as a list of strings.
    labels: Vec<String>,
    // Text values of the attributes named by the rules' "match" lists.
    fields: HashMap<String, Vec<String>>,
    // `size` attribute; 0.0 when absent.
    size: f64,
    // `seeders` attribute; 0 when absent.
    seeders: i64,
    // `downloadvolumefactor` attribute; None when absent.
    downloadvolumefactor: Option<f64>,
    // Value produced by `pub_minutes_from_py` for the torrent.
    pub_minutes: f64,
}
|
||||
|
||||
/// Media attributes that TMDB rule conditions are compared against.
struct MediaSnapshot {
    // False when no media object (or Python None) was supplied.
    available: bool,
    // Attribute name -> attribute values, stored as uppercase strings.
    values: HashMap<String, Vec<String>>,
}
|
||||
|
||||
#[pyfunction]
|
||||
#[pyo3(signature = (groups, torrent_list, rule_set, mediainfo=None, metainfo_options=None))]
|
||||
pub(crate) fn filter_torrents_fast(
|
||||
py: Python<'_>,
|
||||
groups: &Bound<'_, PyList>,
|
||||
torrent_list: &Bound<'_, PyList>,
|
||||
rule_set: &Bound<'_, PyDict>,
|
||||
mediainfo: Option<&Bound<'_, PyAny>>,
|
||||
metainfo_options: Option<&Bound<'_, PyDict>>,
|
||||
) -> PyResult<PyObject> {
|
||||
let groups = parse_filter_groups(groups)?;
|
||||
if groups.is_empty() {
|
||||
return Ok(PyList::empty(py).into());
|
||||
}
|
||||
let matcher = RuleMatcher::from_py(rule_set)?;
|
||||
let media = MediaSnapshot::from_py(mediainfo)?;
|
||||
let results = PyList::empty(py);
|
||||
let mut parsed_rule_cache: HashMap<String, RuleExpr> = HashMap::new();
|
||||
let mut episode_count_cache: HashMap<String, i64> = HashMap::new();
|
||||
for (index, torrent_obj) in torrent_list.iter().enumerate() {
|
||||
let torrent = TorrentSnapshot::from_py(&torrent_obj, &matcher.match_fields)?;
|
||||
if let Some(priority) = match_torrent(
|
||||
py,
|
||||
&torrent,
|
||||
&groups,
|
||||
&matcher,
|
||||
&media,
|
||||
metainfo_options,
|
||||
&mut parsed_rule_cache,
|
||||
&mut episode_count_cache,
|
||||
)? {
|
||||
results.append((index, priority))?;
|
||||
}
|
||||
}
|
||||
Ok(results.into())
|
||||
}
|
||||
|
||||
#[pyfunction]
|
||||
pub(crate) fn parse_filter_rule_fast(py: Python<'_>, expression: &str) -> PyResult<PyObject> {
|
||||
let tokens = tokenize_rule(expression)?;
|
||||
@@ -222,3 +301,625 @@ fn expr_binary_to_py(
|
||||
list.append(expr_to_py(py, right)?)?;
|
||||
Ok(list.into())
|
||||
}
|
||||
|
||||
/// 解析 Python 侧已经按媒体筛选后的规则组。
|
||||
fn parse_filter_groups(groups: &Bound<'_, PyList>) -> PyResult<Vec<FilterGroup>> {
|
||||
let mut result = Vec::new();
|
||||
for item in groups.iter() {
|
||||
let dict = item.downcast::<PyDict>()?;
|
||||
let rule_string = get_optional_nonempty_string(dict, "rule_string")?.unwrap_or_default();
|
||||
if rule_string.is_empty() {
|
||||
continue;
|
||||
}
|
||||
let levels = rule_string
|
||||
.split('>')
|
||||
.map(str::trim)
|
||||
.filter(|value| !value.is_empty())
|
||||
.map(str::to_string)
|
||||
.collect::<Vec<_>>();
|
||||
if !levels.is_empty() {
|
||||
result.push(FilterGroup { levels });
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
impl RuleMatcher {
|
||||
/// 构建规则查找表,保留 Python 规则对象引用以按需读取字段。
|
||||
fn from_py(rule_set: &Bound<'_, PyDict>) -> PyResult<Self> {
|
||||
let mut rules = HashMap::new();
|
||||
let mut match_fields = HashSet::new();
|
||||
for (key, value) in rule_set.iter() {
|
||||
if let Ok(rule) = value.downcast::<PyDict>() {
|
||||
for field in get_string_list(rule, "match")? {
|
||||
match_fields.insert(field);
|
||||
}
|
||||
}
|
||||
rules.insert(key.extract::<String>()?, value.into());
|
||||
}
|
||||
Ok(Self {
|
||||
rules,
|
||||
match_fields,
|
||||
})
|
||||
}
|
||||
|
||||
/// 根据规则名获取规则字典。
|
||||
fn get<'py>(&self, py: Python<'py>, name: &str) -> Option<Bound<'py, PyDict>> {
|
||||
self.rules
|
||||
.get(name)?
|
||||
.bind(py)
|
||||
.downcast::<PyDict>()
|
||||
.ok()
|
||||
.cloned()
|
||||
}
|
||||
}
|
||||
|
||||
impl TorrentSnapshot {
|
||||
/// 从 Python TorrentInfo 对象抽取过滤所需字段。
|
||||
fn from_py(torrent: &Bound<'_, PyAny>, match_fields: &HashSet<String>) -> PyResult<Self> {
|
||||
let title = object_optional_string(torrent, "title")?.unwrap_or_default();
|
||||
let description = object_optional_string(torrent, "description")?.unwrap_or_default();
|
||||
let labels = object_string_list(torrent, "labels")?;
|
||||
let fields = selected_object_fields(torrent, match_fields, &title, &description, &labels)?;
|
||||
Ok(Self {
|
||||
title,
|
||||
description,
|
||||
labels,
|
||||
fields,
|
||||
size: object_optional_f64(torrent, "size")?.unwrap_or(0.0),
|
||||
seeders: object_optional_i64(torrent, "seeders")?.unwrap_or(0),
|
||||
downloadvolumefactor: object_optional_f64(torrent, "downloadvolumefactor")?,
|
||||
pub_minutes: pub_minutes_from_py(torrent)?,
|
||||
})
|
||||
}
|
||||
|
||||
/// 拼接默认匹配内容:标题、副标题和标签。
|
||||
fn default_content(&self) -> String {
|
||||
format!(
|
||||
"{} {} {}",
|
||||
if self.title.is_empty() {
|
||||
"None"
|
||||
} else {
|
||||
&self.title
|
||||
},
|
||||
if self.description.is_empty() {
|
||||
"None"
|
||||
} else {
|
||||
&self.description
|
||||
},
|
||||
self.labels.join(" ")
|
||||
)
|
||||
}
|
||||
|
||||
/// 读取任意 TorrentInfo 字段的匹配文本列表。
|
||||
fn field_values(&self, field: &str) -> Option<&Vec<String>> {
|
||||
self.fields.get(field)
|
||||
}
|
||||
}
|
||||
|
||||
impl MediaSnapshot {
|
||||
/// 从 Python MediaInfo 对象抽取 TMDB 规则可能访问的属性。
|
||||
fn from_py(mediainfo: Option<&Bound<'_, PyAny>>) -> PyResult<Self> {
|
||||
let mut values = HashMap::new();
|
||||
let Some(media) = mediainfo else {
|
||||
return Ok(Self {
|
||||
available: false,
|
||||
values,
|
||||
});
|
||||
};
|
||||
if media.is_none() {
|
||||
return Ok(Self {
|
||||
available: false,
|
||||
values,
|
||||
});
|
||||
}
|
||||
for attr in [
|
||||
"type",
|
||||
"category",
|
||||
"original_language",
|
||||
"tmdb_id",
|
||||
"imdb_id",
|
||||
"tvdb_id",
|
||||
"douban_id",
|
||||
"bangumi_id",
|
||||
"collection_id",
|
||||
"origin_country",
|
||||
"genre_ids",
|
||||
"production_countries",
|
||||
"spoken_languages",
|
||||
"languages",
|
||||
] {
|
||||
let attr_values = media_attr_values(media, attr)?;
|
||||
if !attr_values.is_empty() {
|
||||
values.insert(attr.to_string(), attr_values);
|
||||
}
|
||||
}
|
||||
if let Ok(dict) = media.getattr("__dict__") {
|
||||
if let Ok(dict) = dict.downcast::<PyDict>() {
|
||||
for (key, value) in dict.iter() {
|
||||
let key = key.extract::<String>()?;
|
||||
if values.contains_key(&key) || value.is_none() {
|
||||
continue;
|
||||
}
|
||||
let attr_values = if key == "production_countries" {
|
||||
production_country_values(&value)?
|
||||
} else {
|
||||
py_any_to_string_list(&value)?
|
||||
.into_iter()
|
||||
.map(|item| item.to_uppercase())
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
if !attr_values.is_empty() {
|
||||
values.insert(key, attr_values);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(Self {
|
||||
available: true,
|
||||
values,
|
||||
})
|
||||
}
|
||||
|
||||
/// 判断 TMDB 字段是否包含任一目标值。
|
||||
fn matches(&self, attr: &str, value: &str) -> bool {
|
||||
let Some(info_values) = self.values.get(attr) else {
|
||||
return false;
|
||||
};
|
||||
let values = value
|
||||
.split(',')
|
||||
.filter(|item| !item.is_empty())
|
||||
.map(|item| item.to_uppercase())
|
||||
.collect::<Vec<_>>();
|
||||
values
|
||||
.iter()
|
||||
.any(|value| info_values.iter().any(|info_value| info_value == value))
|
||||
}
|
||||
}
|
||||
|
||||
/// 执行完整种子过滤并返回匹配优先级。
|
||||
fn match_torrent(
|
||||
py: Python<'_>,
|
||||
torrent: &TorrentSnapshot,
|
||||
groups: &[FilterGroup],
|
||||
matcher: &RuleMatcher,
|
||||
media: &MediaSnapshot,
|
||||
metainfo_options: Option<&Bound<'_, PyDict>>,
|
||||
parsed_rule_cache: &mut HashMap<String, RuleExpr>,
|
||||
episode_count_cache: &mut HashMap<String, i64>,
|
||||
) -> PyResult<Option<i64>> {
|
||||
let mut last_priority = None;
|
||||
for group in groups {
|
||||
let mut priority = 100i64;
|
||||
let mut matched_priority = None;
|
||||
for level in &group.levels {
|
||||
let expr = parse_cached_expr(level, parsed_rule_cache)?;
|
||||
if match_group(
|
||||
py,
|
||||
torrent,
|
||||
&expr,
|
||||
matcher,
|
||||
media,
|
||||
metainfo_options,
|
||||
episode_count_cache,
|
||||
)? {
|
||||
matched_priority = Some(priority);
|
||||
break;
|
||||
}
|
||||
priority -= 1;
|
||||
}
|
||||
match matched_priority {
|
||||
Some(priority) => last_priority = Some(priority),
|
||||
None => return Ok(None),
|
||||
}
|
||||
}
|
||||
Ok(last_priority)
|
||||
}
|
||||
|
||||
/// 延迟解析并缓存优先级层级表达式,保持命中高优先级后不解析低层级的语义。
|
||||
fn parse_cached_expr<'a>(
|
||||
level: &str,
|
||||
parsed_rule_cache: &'a mut HashMap<String, RuleExpr>,
|
||||
) -> PyResult<&'a RuleExpr> {
|
||||
if !parsed_rule_cache.contains_key(level) {
|
||||
let tokens = tokenize_rule(level)?;
|
||||
let mut parser = RuleParserState::new(tokens);
|
||||
let expr = parser.parse_expression()?;
|
||||
if parser.has_remaining() {
|
||||
return Err(PyValueError::new_err("规则表达式包含无法解析的剩余内容"));
|
||||
}
|
||||
parsed_rule_cache.insert(level.to_string(), expr);
|
||||
}
|
||||
Ok(parsed_rule_cache.get(level).expect("cached rule exists"))
|
||||
}
|
||||
|
||||
/// 递归求值规则布尔表达式。
|
||||
fn match_group(
|
||||
py: Python<'_>,
|
||||
torrent: &TorrentSnapshot,
|
||||
expr: &RuleExpr,
|
||||
matcher: &RuleMatcher,
|
||||
media: &MediaSnapshot,
|
||||
metainfo_options: Option<&Bound<'_, PyDict>>,
|
||||
episode_count_cache: &mut HashMap<String, i64>,
|
||||
) -> PyResult<bool> {
|
||||
match expr {
|
||||
RuleExpr::Name(name) => match_rule(
|
||||
py,
|
||||
torrent,
|
||||
name,
|
||||
matcher,
|
||||
media,
|
||||
metainfo_options,
|
||||
episode_count_cache,
|
||||
),
|
||||
RuleExpr::Not(inner) => Ok(!match_group(
|
||||
py,
|
||||
torrent,
|
||||
inner,
|
||||
matcher,
|
||||
media,
|
||||
metainfo_options,
|
||||
episode_count_cache,
|
||||
)?),
|
||||
RuleExpr::And(left, right) => {
|
||||
if !match_group(
|
||||
py,
|
||||
torrent,
|
||||
left,
|
||||
matcher,
|
||||
media,
|
||||
metainfo_options,
|
||||
episode_count_cache,
|
||||
)? {
|
||||
return Ok(false);
|
||||
}
|
||||
match_group(
|
||||
py,
|
||||
torrent,
|
||||
right,
|
||||
matcher,
|
||||
media,
|
||||
metainfo_options,
|
||||
episode_count_cache,
|
||||
)
|
||||
}
|
||||
RuleExpr::Or(left, right) => {
|
||||
if match_group(
|
||||
py,
|
||||
torrent,
|
||||
left,
|
||||
matcher,
|
||||
media,
|
||||
metainfo_options,
|
||||
episode_count_cache,
|
||||
)? {
|
||||
return Ok(true);
|
||||
}
|
||||
match_group(
|
||||
py,
|
||||
torrent,
|
||||
right,
|
||||
matcher,
|
||||
media,
|
||||
metainfo_options,
|
||||
episode_count_cache,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// 执行单条规则匹配。
|
||||
fn match_rule(
|
||||
py: Python<'_>,
|
||||
torrent: &TorrentSnapshot,
|
||||
rule_name: &str,
|
||||
matcher: &RuleMatcher,
|
||||
media: &MediaSnapshot,
|
||||
metainfo_options: Option<&Bound<'_, PyDict>>,
|
||||
episode_count_cache: &mut HashMap<String, i64>,
|
||||
) -> PyResult<bool> {
|
||||
let Some(rule) = matcher.get(py, rule_name) else {
|
||||
return Ok(false);
|
||||
};
|
||||
if match_tmdb_rule(&rule, media)? {
|
||||
return Ok(true);
|
||||
}
|
||||
let content = rule_match_content(&rule, torrent)?;
|
||||
let includes = get_string_list(&rule, "include")?;
|
||||
if !includes.is_empty() {
|
||||
let mut included = false;
|
||||
for pattern in includes {
|
||||
if regex_search(&pattern, &content)? {
|
||||
included = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if !included {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
let excludes = get_string_list(&rule, "exclude")?;
|
||||
for pattern in excludes {
|
||||
if regex_search(&pattern, &content)? {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
if let Some(size_range) = get_optional_nonempty_string(&rule, "size_range")? {
|
||||
if !match_size(torrent, &size_range, metainfo_options, episode_count_cache)? {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
if let Some(seeders) = get_optional_i64(&rule, "seeders")? {
|
||||
if torrent.seeders < seeders {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
if let Some(download_factor) = get_optional_f64(&rule, "downloadvolumefactor")? {
|
||||
if torrent.downloadvolumefactor != Some(download_factor) {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
if let Some(publish_time) = get_optional_nonempty_string(&rule, "publish_time")? {
|
||||
if !match_publish_time(torrent.pub_minutes, &publish_time)? {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
/// 判断规则中的 TMDB 条件是否匹配媒体信息。
|
||||
fn match_tmdb_rule(rule: &Bound<'_, PyDict>, media: &MediaSnapshot) -> PyResult<bool> {
|
||||
let Some(tmdb_obj) = rule.get_item("tmdb")? else {
|
||||
return Ok(false);
|
||||
};
|
||||
if tmdb_obj.is_none() {
|
||||
return Ok(false);
|
||||
}
|
||||
if !media.available {
|
||||
return Ok(false);
|
||||
}
|
||||
let tmdb = tmdb_obj.downcast::<PyDict>()?;
|
||||
for (key, value) in tmdb.iter() {
|
||||
if value.is_none() {
|
||||
continue;
|
||||
}
|
||||
let value = value.str()?.to_str()?.to_string();
|
||||
if value.is_empty() {
|
||||
continue;
|
||||
}
|
||||
if !media.matches(&key.extract::<String>()?, &value) {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
/// 计算规则实际用于正则匹配的内容。
|
||||
fn rule_match_content(rule: &Bound<'_, PyDict>, torrent: &TorrentSnapshot) -> PyResult<String> {
|
||||
let matches = get_string_list(rule, "match")?;
|
||||
if matches.is_empty() {
|
||||
return Ok(torrent.default_content());
|
||||
}
|
||||
let mut content = Vec::new();
|
||||
for field in matches {
|
||||
if let Some(values) = torrent.field_values(&field) {
|
||||
content.extend(values.iter().filter(|item| !item.is_empty()).cloned());
|
||||
}
|
||||
}
|
||||
if content.is_empty() {
|
||||
Ok(torrent.default_content())
|
||||
} else {
|
||||
Ok(content.join(" "))
|
||||
}
|
||||
}
|
||||
|
||||
/// 匹配大小范围,剧集按总集数折算单集大小。
|
||||
fn match_size(
|
||||
torrent: &TorrentSnapshot,
|
||||
size_range: &str,
|
||||
metainfo_options: Option<&Bound<'_, PyDict>>,
|
||||
episode_count_cache: &mut HashMap<String, i64>,
|
||||
) -> PyResult<bool> {
|
||||
let cache_key = format!("{}\n{}", torrent.title, torrent.description);
|
||||
let episode_count = match episode_count_cache.get(&cache_key) {
|
||||
Some(value) => *value,
|
||||
None => {
|
||||
let value = parse_total_episode_for_filter(
|
||||
torrent.title.as_str(),
|
||||
Some(torrent.description.as_str()),
|
||||
metainfo_options,
|
||||
)?;
|
||||
episode_count_cache.insert(cache_key, value);
|
||||
value
|
||||
}
|
||||
}
|
||||
.max(1) as f64;
|
||||
let torrent_size = torrent.size / episode_count;
|
||||
match parse_size_range(size_range)? {
|
||||
SizeRange::Between(min, max) => Ok(min <= torrent_size && torrent_size <= max),
|
||||
SizeRange::Gte(min) => Ok(torrent_size >= min),
|
||||
SizeRange::Lte(max) => Ok(torrent_size <= max),
|
||||
SizeRange::Unknown => Ok(false),
|
||||
}
|
||||
}
|
||||
|
||||
/// Parsed form of a rule's size range; bounds are already scaled by `SIZE_UNIT`.
enum SizeRange {
    // "min-max": both bounds inclusive.
    Between(f64, f64),
    // ">min": size must be at least `min`.
    Gte(f64),
    // "<max": size must be at most `max`.
    Lte(f64),
    // Any other spelling; never matches.
    Unknown,
}
|
||||
|
||||
/// 解析大小规则,单位与 Python 旧实现保持为 MB。
|
||||
fn parse_size_range(size_range: &str) -> PyResult<SizeRange> {
|
||||
let size_range = size_range.trim();
|
||||
if let Some((left, right)) = size_range.split_once('-') {
|
||||
return Ok(SizeRange::Between(
|
||||
parse_f64(left.trim(), "大小范围")? * SIZE_UNIT,
|
||||
parse_f64(right.trim(), "大小范围")? * SIZE_UNIT,
|
||||
));
|
||||
}
|
||||
if let Some(value) = size_range.strip_prefix('>') {
|
||||
return Ok(SizeRange::Gte(
|
||||
parse_f64(value.trim(), "大小范围")? * SIZE_UNIT,
|
||||
));
|
||||
}
|
||||
if let Some(value) = size_range.strip_prefix('<') {
|
||||
return Ok(SizeRange::Lte(
|
||||
parse_f64(value.trim(), "大小范围")? * SIZE_UNIT,
|
||||
));
|
||||
}
|
||||
Ok(SizeRange::Unknown)
|
||||
}
|
||||
|
||||
/// 匹配发布时间分钟数范围。
|
||||
fn match_publish_time(pub_minutes: f64, publish_time: &str) -> PyResult<bool> {
|
||||
let values = publish_time
|
||||
.split('-')
|
||||
.map(|item| parse_f64(item, "发布时间规则"))
|
||||
.collect::<PyResult<Vec<_>>>()?;
|
||||
if values.len() == 1 {
|
||||
Ok(pub_minutes >= values[0])
|
||||
} else if values.len() >= 2 {
|
||||
Ok(values[0] <= pub_minutes && pub_minutes <= values[1])
|
||||
} else {
|
||||
Ok(true)
|
||||
}
|
||||
}
|
||||
|
||||
/// 执行忽略大小写的正则搜索,按规则文本缓存编译结果。
|
||||
fn regex_search(pattern: &str, content: &str) -> PyResult<bool> {
|
||||
let cache_key = format!("(?i){pattern}");
|
||||
if let Ok(guard) = REGEX_CACHE.lock() {
|
||||
if let Some(regex) = guard.get(&cache_key) {
|
||||
return regex
|
||||
.is_match(content)
|
||||
.map_err(|err| PyValueError::new_err(err.to_string()));
|
||||
}
|
||||
}
|
||||
let regex =
|
||||
FancyRegex::new(&cache_key).map_err(|err| PyValueError::new_err(err.to_string()))?;
|
||||
let result = regex
|
||||
.is_match(content)
|
||||
.map_err(|err| PyValueError::new_err(err.to_string()))?;
|
||||
if let Ok(mut guard) = REGEX_CACHE.lock() {
|
||||
guard.insert(cache_key, regex);
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// 抽取媒体字段值并统一转为大写字符串列表。
|
||||
fn media_attr_values(media: &Bound<'_, PyAny>, attr: &str) -> PyResult<Vec<String>> {
|
||||
let Ok(value) = media.getattr(attr) else {
|
||||
return Ok(Vec::new());
|
||||
};
|
||||
if value.is_none() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
if attr == "production_countries" {
|
||||
return production_country_values(&value);
|
||||
}
|
||||
let mut result = py_any_to_string_list(&value)?
|
||||
.into_iter()
|
||||
.map(|item| item.to_uppercase())
|
||||
.collect::<Vec<_>>();
|
||||
if result.is_empty() {
|
||||
let text = value.str()?.to_str()?.to_uppercase();
|
||||
if !text.is_empty() {
|
||||
result.push(text);
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// 从 TMDB production_countries 字段提取 iso_3166_1。
|
||||
fn production_country_values(value: &Bound<'_, PyAny>) -> PyResult<Vec<String>> {
|
||||
let Ok(list) = value.downcast::<PyList>() else {
|
||||
return Ok(Vec::new());
|
||||
};
|
||||
let mut result = Vec::new();
|
||||
for item in list.iter() {
|
||||
if let Ok(dict) = item.downcast::<PyDict>() {
|
||||
if let Some(code) = get_optional_nonempty_string(dict, "iso_3166_1")? {
|
||||
result.push(code.to_uppercase());
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// 按规则 match 字段读取 TorrentInfo 属性,避免热路径遍历整个 __dict__。
|
||||
fn selected_object_fields(
|
||||
torrent: &Bound<'_, PyAny>,
|
||||
match_fields: &HashSet<String>,
|
||||
title: &str,
|
||||
description: &str,
|
||||
labels: &[String],
|
||||
) -> PyResult<HashMap<String, Vec<String>>> {
|
||||
let mut result = HashMap::new();
|
||||
for field in match_fields {
|
||||
match field.as_str() {
|
||||
"title" => {
|
||||
if !title.is_empty() {
|
||||
result.insert(field.clone(), vec![title.to_string()]);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
"description" => {
|
||||
if !description.is_empty() {
|
||||
result.insert(field.clone(), vec![description.to_string()]);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
"labels" => {
|
||||
if !labels.is_empty() {
|
||||
result.insert(field.clone(), labels.to_vec());
|
||||
}
|
||||
continue;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
let Ok(value) = torrent.getattr(field) else {
|
||||
continue;
|
||||
};
|
||||
if value.is_none() || !value.is_truthy()? {
|
||||
continue;
|
||||
}
|
||||
let values = if let Ok(list) = value.downcast::<PyList>() {
|
||||
let mut items = Vec::new();
|
||||
for item in list.iter() {
|
||||
if !item.is_none() && item.is_truthy()? {
|
||||
items.push(item.str()?.to_str()?.to_string());
|
||||
}
|
||||
}
|
||||
items
|
||||
} else {
|
||||
vec![value.str()?.to_str()?.to_string()]
|
||||
};
|
||||
if !values.is_empty() {
|
||||
result.insert(field.clone(), values);
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// 用 Rust 复刻 TorrentInfo.pub_minutes,避免过滤热路径回调 Python 方法。
|
||||
fn pub_minutes_from_py(torrent: &Bound<'_, PyAny>) -> PyResult<f64> {
|
||||
let Some(pubdate) = object_optional_string(torrent, "pubdate")? else {
|
||||
return Ok(0.0);
|
||||
};
|
||||
let Ok(pubdate) = NaiveDateTime::parse_from_str(&pubdate, "%Y-%m-%d %H:%M:%S") else {
|
||||
return Ok(0.0);
|
||||
};
|
||||
let now = Local::now().naive_local();
|
||||
Ok((now - pubdate).num_seconds().div_euclid(60) as f64)
|
||||
}
|
||||
|
||||
/// 解析浮点数字符串,保持 Python float 转换失败时抛异常的语义。
|
||||
fn parse_f64(value: &str, context: &str) -> PyResult<f64> {
|
||||
value
|
||||
.trim()
|
||||
.parse::<f64>()
|
||||
.map_err(|err| PyValueError::new_err(format!("{context}解析失败: {err}")))
|
||||
}
|
||||
|
||||
@@ -25,8 +25,9 @@ static HAS_SELECTOR_RE: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r#":has\(\s*(?:"([^"]+)"|'([^']+)'|([^)]*))\s*\)"#).unwrap());
|
||||
static TABLE_DIRECT_TR_RE: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r#"\b(table[^>,]*?)\s*>\s*(tr(?:[^\s>,]*)?)"#).unwrap());
|
||||
static EN_ELAPSED_RE: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r"(?i)(\d+)\s*(second|minute|hour|day|week|month|year)s?\s*ago").unwrap());
|
||||
static EN_ELAPSED_RE: Lazy<Regex> = Lazy::new(|| {
|
||||
Regex::new(r"(?i)(\d+)\s*(second|minute|hour|day|week|month|year)s?\s*ago").unwrap()
|
||||
});
|
||||
const OUTPUT_FIELDS: &[(&str, &str)] = &[
|
||||
("title", "title"),
|
||||
("description", "description"),
|
||||
@@ -44,7 +45,11 @@ const OUTPUT_FIELDS: &[(&str, &str)] = &[
|
||||
|
||||
struct FieldSpec<'py> {
|
||||
name: String,
|
||||
config: Bound<'py, PyDict>,
|
||||
text_template: Option<String>,
|
||||
default_value: Option<String>,
|
||||
filters: Option<Bound<'py, PyAny>>,
|
||||
query: Option<QuerySpec>,
|
||||
case_selectors: Vec<(SelectorPlan, f64)>,
|
||||
}
|
||||
|
||||
enum RowParseResult {
|
||||
@@ -52,6 +57,23 @@ enum RowParseResult {
|
||||
Item(PyObject),
|
||||
}
|
||||
|
||||
struct QuerySpec {
|
||||
selector: SelectorPlan,
|
||||
attribute: Option<String>,
|
||||
remove_selectors: Vec<Selector>,
|
||||
contents: Option<i64>,
|
||||
index: Option<i64>,
|
||||
}
|
||||
|
||||
enum SelectorPlan {
|
||||
Direct(Selector),
|
||||
Has {
|
||||
base: Selector,
|
||||
inner: Selector,
|
||||
suffix: Option<Selector>,
|
||||
},
|
||||
}
|
||||
|
||||
/// 批量解析普通配置 indexer 页面,优先在 Rust 内覆盖站点配置语义。
|
||||
#[pyfunction]
|
||||
#[pyo3(signature = (html_text, domain, list_config, fields, category=None, result_num=100))]
|
||||
@@ -76,8 +98,12 @@ pub(crate) fn parse_indexer_torrents_fast(
|
||||
};
|
||||
let result = PyList::empty(py);
|
||||
let field_specs = build_field_specs(fields)?;
|
||||
let field_map = field_specs
|
||||
.iter()
|
||||
.map(|field| (field.name.as_str(), field))
|
||||
.collect::<HashMap<&str, &FieldSpec<'_>>>();
|
||||
for row in rows.into_iter().take(result_num) {
|
||||
match parse_indexer_row(py, row, domain, &field_specs, category)? {
|
||||
match parse_indexer_row(py, row, domain, &field_map, category)? {
|
||||
RowParseResult::Empty => {}
|
||||
RowParseResult::Item(item) => result.append(item)?,
|
||||
}
|
||||
@@ -95,47 +121,97 @@ fn build_field_specs<'py>(fields: &Bound<'py, PyDict>) -> PyResult<Vec<FieldSpec
|
||||
let Ok(config) = value.downcast_into::<PyDict>() else {
|
||||
continue;
|
||||
};
|
||||
let filters = config.get_item("filters")?.filter(|value| !value.is_none());
|
||||
specs.push(FieldSpec {
|
||||
name: key.extract::<String>()?,
|
||||
config,
|
||||
text_template: get_optional_string(&config, "text")?,
|
||||
default_value: get_default_value(&config)?,
|
||||
filters,
|
||||
query: build_query_spec(&config)?,
|
||||
case_selectors: build_case_selectors(&config)?,
|
||||
});
|
||||
}
|
||||
Ok(specs)
|
||||
}
|
||||
|
||||
/// 预编译字段选择器和静态取值参数,避免每行重复读取 Python 配置。
|
||||
fn build_query_spec(selector_config: &Bound<'_, PyDict>) -> PyResult<Option<QuerySpec>> {
|
||||
let Some(selector_text) = get_selector_text(selector_config)? else {
|
||||
return Ok(None);
|
||||
};
|
||||
let Some(selector) = parse_selector_plan(&selector_text) else {
|
||||
return Ok(None);
|
||||
};
|
||||
Ok(Some(QuerySpec {
|
||||
selector,
|
||||
attribute: get_optional_string(selector_config, "attribute")?,
|
||||
remove_selectors: parse_remove_selectors(selector_config)?,
|
||||
contents: get_optional_i64(selector_config, "contents")?,
|
||||
index: get_optional_i64(selector_config, "index")?,
|
||||
}))
|
||||
}
|
||||
|
||||
/// 预编译优惠字段的 case selector,匹配失败时仍按原语义回落到 1.0。
|
||||
fn build_case_selectors(selector_config: &Bound<'_, PyDict>) -> PyResult<Vec<(SelectorPlan, f64)>> {
|
||||
let Some(case_obj) = selector_config.get_item("case")? else {
|
||||
return Ok(Vec::new());
|
||||
};
|
||||
let Ok(case_dict) = case_obj.downcast::<PyDict>() else {
|
||||
return Ok(Vec::new());
|
||||
};
|
||||
let mut selectors = Vec::new();
|
||||
for (case_selector_obj, value) in case_dict.iter() {
|
||||
let case_selector = case_selector_obj.extract::<String>()?;
|
||||
if let Some(selector) = parse_selector_plan(&case_selector) {
|
||||
selectors.push((selector, value.extract::<f64>().unwrap_or(1.0)));
|
||||
}
|
||||
}
|
||||
Ok(selectors)
|
||||
}
|
||||
|
||||
/// 解析单行种子信息,覆盖普通配置站点的主字段抽取流程。
|
||||
fn parse_indexer_row(
|
||||
py: Python<'_>,
|
||||
row: ElementRef<'_>,
|
||||
domain: &str,
|
||||
field_specs: &[FieldSpec<'_>],
|
||||
field_map: &HashMap<&str, &FieldSpec<'_>>,
|
||||
category: Option<&Bound<'_, PyDict>>,
|
||||
) -> PyResult<RowParseResult> {
|
||||
let output = PyDict::new(py);
|
||||
let field_map = field_specs
|
||||
.iter()
|
||||
.map(|field| (field.name.as_str(), field))
|
||||
.collect::<HashMap<&str, &FieldSpec<'_>>>();
|
||||
let mut cache = BTreeMap::new();
|
||||
let mut resolving = HashSet::new();
|
||||
|
||||
if let Some(value) = eval_field_by_name(row, &field_map, "details", &mut cache, &mut resolving)? {
|
||||
if let Some(value) = eval_field_by_name(row, field_map, "details", &mut cache, &mut resolving)?
|
||||
{
|
||||
if !value.is_empty() {
|
||||
output.set_item("page_url", normalize_site_link(domain, &value, true))?;
|
||||
}
|
||||
}
|
||||
if let Some(value) = eval_field_by_name(row, &field_map, "download", &mut cache, &mut resolving)? {
|
||||
if let Some(value) = eval_field_by_name(row, field_map, "download", &mut cache, &mut resolving)?
|
||||
{
|
||||
if !value.is_empty() {
|
||||
output.set_item("enclosure", normalize_site_link(domain, &value, false))?;
|
||||
}
|
||||
}
|
||||
if let Some(value) = eval_factor_field(row, &field_map, "downloadvolumefactor", &mut cache, &mut resolving)? {
|
||||
if let Some(value) = eval_factor_field(
|
||||
row,
|
||||
field_map,
|
||||
"downloadvolumefactor",
|
||||
&mut cache,
|
||||
&mut resolving,
|
||||
)? {
|
||||
output.set_item("downloadvolumefactor", value)?;
|
||||
}
|
||||
if let Some(value) = eval_factor_field(row, &field_map, "uploadvolumefactor", &mut cache, &mut resolving)? {
|
||||
if let Some(value) = eval_factor_field(
|
||||
row,
|
||||
field_map,
|
||||
"uploadvolumefactor",
|
||||
&mut cache,
|
||||
&mut resolving,
|
||||
)? {
|
||||
output.set_item("uploadvolumefactor", value)?;
|
||||
}
|
||||
if let Some(value) = eval_pubdate_field(row, &field_map, &mut cache, &mut resolving)? {
|
||||
if let Some(value) = eval_pubdate_field(row, field_map, &mut cache, &mut resolving)? {
|
||||
if !value.is_empty() {
|
||||
output.set_item("pubdate", value)?;
|
||||
}
|
||||
@@ -144,32 +220,44 @@ fn parse_indexer_row(
|
||||
for (source_key, target_key) in OUTPUT_FIELDS {
|
||||
match *source_key {
|
||||
"labels" => {
|
||||
parse_labels_field(py, row, &field_map, &output)?;
|
||||
parse_labels_field(py, row, field_map, &output)?;
|
||||
}
|
||||
"hr" => {
|
||||
if let Some(value) = eval_hr_field(row, &field_map)? {
|
||||
if let Some(value) = eval_hr_field(row, field_map)? {
|
||||
output.set_item(*target_key, value)?;
|
||||
}
|
||||
}
|
||||
"category" => {
|
||||
if let Some(value) = eval_field_by_name(row, &field_map, source_key, &mut cache, &mut resolving)? {
|
||||
if let Some(value) =
|
||||
eval_field_by_name(row, field_map, source_key, &mut cache, &mut resolving)?
|
||||
{
|
||||
output.set_item(*target_key, map_category_value(&value, category)?)?;
|
||||
}
|
||||
}
|
||||
"size" => {
|
||||
if let Some(value) = eval_field_by_name(row, &field_map, source_key, &mut cache, &mut resolving)? {
|
||||
output.set_item(*target_key, parse_filesize_text(value.replace('\n', "").trim()))?;
|
||||
if let Some(value) =
|
||||
eval_field_by_name(row, field_map, source_key, &mut cache, &mut resolving)?
|
||||
{
|
||||
output.set_item(
|
||||
*target_key,
|
||||
parse_filesize_text(value.replace('\n', "").trim()),
|
||||
)?;
|
||||
}
|
||||
}
|
||||
"leechers" | "seeders" | "grabs" => {
|
||||
if let Some(value) = eval_field_by_name(row, &field_map, source_key, &mut cache, &mut resolving)? {
|
||||
if let Some(value) =
|
||||
eval_field_by_name(row, field_map, source_key, &mut cache, &mut resolving)?
|
||||
{
|
||||
output.set_item(*target_key, parse_peer_count(&value))?;
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
if let Some(value) = eval_field_by_name(row, &field_map, source_key, &mut cache, &mut resolving)? {
|
||||
if let Some(value) =
|
||||
eval_field_by_name(row, field_map, source_key, &mut cache, &mut resolving)?
|
||||
{
|
||||
if !value.is_empty() {
|
||||
output.set_item(*target_key, value.replace('\n', " ").trim().to_string())?;
|
||||
output
|
||||
.set_item(*target_key, value.replace('\n', " ").trim().to_string())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -216,32 +304,27 @@ fn eval_field(
|
||||
cache: &mut BTreeMap<String, String>,
|
||||
resolving: &mut HashSet<String>,
|
||||
) -> PyResult<Option<String>> {
|
||||
let mut value = if let Some(template) = get_optional_string(&spec.config, "text")? {
|
||||
Some(render_field_template(row, field_map, &template, cache, resolving)?)
|
||||
let mut value = if let Some(template) = spec.text_template.as_deref() {
|
||||
Some(render_field_template(
|
||||
row, field_map, &template, cache, resolving,
|
||||
)?)
|
||||
} else {
|
||||
safe_query(row, &spec.config)?
|
||||
safe_query(row, spec.query.as_ref())
|
||||
};
|
||||
|
||||
if let Some(current) = value.as_deref() {
|
||||
if contains_jinja_syntax(current) {
|
||||
value = Some(render_embedded_value(
|
||||
row,
|
||||
field_map,
|
||||
&spec.name,
|
||||
current,
|
||||
cache,
|
||||
resolving,
|
||||
row, field_map, &spec.name, current, cache, resolving,
|
||||
)?);
|
||||
}
|
||||
}
|
||||
if let Some(filters) = spec.config.get_item("filters")? {
|
||||
if !filters.is_none() {
|
||||
value = apply_text_filters(value.unwrap_or_default(), &filters)?;
|
||||
}
|
||||
if let Some(filters) = spec.filters.as_ref() {
|
||||
value = apply_text_filters(value.unwrap_or_default(), filters)?;
|
||||
}
|
||||
if value.as_deref().map(str::is_empty).unwrap_or(true) {
|
||||
if let Some(default_value) = get_default_value(&spec.config)? {
|
||||
value = Some(default_value);
|
||||
if let Some(default_value) = spec.default_value.as_ref() {
|
||||
value = Some(default_value.clone());
|
||||
}
|
||||
}
|
||||
Ok(value)
|
||||
@@ -318,12 +401,10 @@ fn eval_factor_field(
|
||||
let Some(spec) = field_map.get(key).copied() else {
|
||||
return Ok(None);
|
||||
};
|
||||
if let Some(case_obj) = spec.config.get_item("case")? {
|
||||
let case_dict = case_obj.downcast::<PyDict>()?;
|
||||
for (case_selector_obj, value) in case_dict.iter() {
|
||||
let case_selector = case_selector_obj.extract::<String>()?;
|
||||
if selector_exists(row, &case_selector)? {
|
||||
return Ok(Some(value.extract::<f64>().unwrap_or(1.0)));
|
||||
if !spec.case_selectors.is_empty() {
|
||||
for (selector, value) in &spec.case_selectors {
|
||||
if selector_exists_with_plan(row, selector) {
|
||||
return Ok(Some(*value));
|
||||
}
|
||||
}
|
||||
return Ok(Some(1.0));
|
||||
@@ -350,15 +431,16 @@ fn parse_labels_field(
|
||||
let Some(spec) = field_map.get("labels").copied() else {
|
||||
return Ok(());
|
||||
};
|
||||
if !spec.config.contains("selector")? {
|
||||
let Some(query) = spec.query.as_ref() else {
|
||||
output.set_item("labels", PyList::empty(py))?;
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
let labels = PyList::empty(py);
|
||||
if let Some(values) = query_all_values(row, &spec.config)? {
|
||||
for value in values.into_iter().filter(|item| !item.is_empty()) {
|
||||
labels.append(value)?;
|
||||
}
|
||||
for value in query_all_values(row, query)
|
||||
.into_iter()
|
||||
.filter(|item| !item.is_empty())
|
||||
{
|
||||
labels.append(value)?;
|
||||
}
|
||||
output.set_item("labels", labels)?;
|
||||
Ok(())
|
||||
@@ -372,10 +454,10 @@ fn eval_hr_field(
|
||||
let Some(spec) = field_map.get("hr").copied() else {
|
||||
return Ok(None);
|
||||
};
|
||||
let Some(selector_text) = get_selector_text(&spec.config)? else {
|
||||
let Some(query) = spec.query.as_ref() else {
|
||||
return Ok(Some(false));
|
||||
};
|
||||
Ok(Some(selector_exists(row, &selector_text)?))
|
||||
Ok(Some(selector_exists_with_plan(row, &query.selector)))
|
||||
}
|
||||
|
||||
/// 将站点分类 ID 映射为 MoviePilot 的媒体类型中文值。
|
||||
@@ -418,7 +500,9 @@ fn eval_pubdate_field(
|
||||
if is_standard_datetime(&normalized) || !field_map.contains_key("date_added") {
|
||||
return Ok(Some(normalized));
|
||||
}
|
||||
if let Some(date_added) = eval_field_by_name(row, field_map, "date_added", cache, resolving)? {
|
||||
if let Some(date_added) =
|
||||
eval_field_by_name(row, field_map, "date_added", cache, resolving)?
|
||||
{
|
||||
let fallback = normalize_pubdate_text(&date_added);
|
||||
if is_standard_datetime(&fallback) {
|
||||
return Ok(Some(fallback));
|
||||
@@ -426,8 +510,10 @@ fn eval_pubdate_field(
|
||||
}
|
||||
return Ok(Some(normalized));
|
||||
}
|
||||
Ok(eval_field_by_name(row, field_map, "date_added", cache, resolving)?
|
||||
.map(|value| normalize_pubdate_text(&value)))
|
||||
Ok(
|
||||
eval_field_by_name(row, field_map, "date_added", cache, resolving)?
|
||||
.map(|value| normalize_pubdate_text(&value)),
|
||||
)
|
||||
}
|
||||
|
||||
/// 规范化发布时间文本为 MoviePilot 期望的字符串格式。
|
||||
@@ -466,9 +552,15 @@ fn select_site_elements<'a>(
|
||||
root: ElementRef<'a>,
|
||||
selector_text: &str,
|
||||
) -> Option<Vec<ElementRef<'a>>> {
|
||||
let plan = parse_selector_plan(selector_text)?;
|
||||
Some(select_site_elements_with_plan(root, &plan))
|
||||
}
|
||||
|
||||
/// 将站点选择器预编译为可复用计划,覆盖 PyQuery 的 :has("selector") 写法。
|
||||
fn parse_selector_plan(selector_text: &str) -> Option<SelectorPlan> {
|
||||
let Some(captures) = HAS_SELECTOR_RE.captures(selector_text) else {
|
||||
let selector = parse_css_selector(selector_text)?;
|
||||
return Some(root.select(&selector).collect());
|
||||
return Some(SelectorPlan::Direct(selector));
|
||||
};
|
||||
let matched = captures.get(0)?;
|
||||
let prefix = selector_text[..matched.start()].trim();
|
||||
@@ -481,19 +573,45 @@ fn select_site_elements<'a>(
|
||||
.trim();
|
||||
let base_selector = parse_css_selector(prefix)?;
|
||||
let has_selector = parse_css_selector(inner)?;
|
||||
let bases = root
|
||||
.select(&base_selector)
|
||||
.filter(|element| element.select(&has_selector).next().is_some());
|
||||
if suffix.is_empty() {
|
||||
return Some(bases.collect());
|
||||
let suffix = if suffix.is_empty() {
|
||||
None
|
||||
} else {
|
||||
let suffix_selector_text = suffix.trim_start_matches('>').trim();
|
||||
Some(parse_css_selector(suffix_selector_text)?)
|
||||
};
|
||||
Some(SelectorPlan::Has {
|
||||
base: base_selector,
|
||||
inner: has_selector,
|
||||
suffix,
|
||||
})
|
||||
}
|
||||
|
||||
/// 执行预编译 selector 计划,避免每个字段每行重复解析 CSS。
|
||||
fn select_site_elements_with_plan<'a>(
|
||||
root: ElementRef<'a>,
|
||||
plan: &SelectorPlan,
|
||||
) -> Vec<ElementRef<'a>> {
|
||||
match plan {
|
||||
SelectorPlan::Direct(selector) => root.select(selector).collect(),
|
||||
SelectorPlan::Has {
|
||||
base,
|
||||
inner,
|
||||
suffix,
|
||||
} => {
|
||||
let bases = root
|
||||
.select(base)
|
||||
.filter(|element| element.select(inner).next().is_some());
|
||||
if let Some(suffix) = suffix {
|
||||
let mut values = Vec::new();
|
||||
for base in bases {
|
||||
values.extend(base.select(suffix));
|
||||
}
|
||||
values
|
||||
} else {
|
||||
bases.collect()
|
||||
}
|
||||
}
|
||||
}
|
||||
let suffix_selector_text = suffix.trim_start_matches('>').trim();
|
||||
let suffix_selector = parse_css_selector(suffix_selector_text)?;
|
||||
let mut values = Vec::new();
|
||||
for base in bases {
|
||||
values.extend(base.select(&suffix_selector));
|
||||
}
|
||||
Some(values)
|
||||
}
|
||||
|
||||
/// 将 PyQuery 扩展选择器转换为 scraper 可识别的 CSS selector 形式。
|
||||
@@ -520,35 +638,24 @@ fn expand_table_direct_tr_selector(selector_text: &str) -> String {
|
||||
}
|
||||
|
||||
/// 执行 selector 查询并返回第一个符合 index/contents 规则的文本。
|
||||
fn safe_query(row: ElementRef<'_>, selector_config: &Bound<'_, PyDict>) -> PyResult<Option<String>> {
|
||||
let Some(values) = query_all_values(row, selector_config)? else {
|
||||
return Ok(None);
|
||||
};
|
||||
Ok(select_indexed_value(values, selector_config))
|
||||
fn safe_query(row: ElementRef<'_>, query: Option<&QuerySpec>) -> Option<String> {
|
||||
let query = query?;
|
||||
let values = query_all_values(row, query);
|
||||
select_indexed_value(values, query)
|
||||
}
|
||||
|
||||
/// 查询 selector 的全部文本或属性值。
|
||||
fn query_all_values(
|
||||
row: ElementRef<'_>,
|
||||
selector_config: &Bound<'_, PyDict>,
|
||||
) -> PyResult<Option<Vec<String>>> {
|
||||
let Some(selector_text) = get_selector_text(selector_config)? else {
|
||||
return Ok(None);
|
||||
};
|
||||
let Some(elements) = select_site_elements(row, &selector_text) else {
|
||||
return Ok(None);
|
||||
};
|
||||
let attribute = get_optional_string(selector_config, "attribute")?;
|
||||
let remove_selectors = parse_remove_selectors(selector_config)?;
|
||||
fn query_all_values(row: ElementRef<'_>, query: &QuerySpec) -> Vec<String> {
|
||||
let elements = select_site_elements_with_plan(row, &query.selector);
|
||||
let mut values = Vec::new();
|
||||
for element in elements {
|
||||
if let Some(attribute) = attribute.as_deref() {
|
||||
if let Some(attribute) = query.attribute.as_deref() {
|
||||
values.push(element.value().attr(attribute).unwrap_or("").to_string());
|
||||
} else {
|
||||
values.push(normalize_element_text(element, &remove_selectors));
|
||||
values.push(normalize_element_text(element, &query.remove_selectors));
|
||||
}
|
||||
}
|
||||
Ok(Some(values))
|
||||
values
|
||||
}
|
||||
|
||||
/// 解析 remove 配置,支持逗号分隔的 CSS 选择器列表。
|
||||
@@ -586,20 +693,17 @@ fn get_selector_text(selector_config: &Bound<'_, PyDict>) -> PyResult<Option<Str
|
||||
}
|
||||
|
||||
/// 对查询结果应用 contents/index 规则。
|
||||
fn select_indexed_value(
|
||||
values: Vec<String>,
|
||||
selector_config: &Bound<'_, PyDict>,
|
||||
) -> Option<String> {
|
||||
fn select_indexed_value(values: Vec<String>, query: &QuerySpec) -> Option<String> {
|
||||
if values.is_empty() {
|
||||
return None;
|
||||
}
|
||||
if let Ok(Some(contents)) = get_optional_i64(selector_config, "contents") {
|
||||
if let Some(contents) = query.contents {
|
||||
if let Some(first) = values.first() {
|
||||
let lines: Vec<&str> = first.split('\n').collect();
|
||||
return pick_indexed_item(&lines, contents).map(|item| item.to_string());
|
||||
}
|
||||
}
|
||||
if let Ok(Some(index)) = get_optional_i64(selector_config, "index") {
|
||||
if let Some(index) = query.index {
|
||||
return pick_indexed_item(&values, index).cloned();
|
||||
}
|
||||
values.first().cloned()
|
||||
@@ -638,7 +742,8 @@ fn apply_text_filters(mut current: String, filters: &Bound<'_, PyAny>) -> PyResu
|
||||
continue;
|
||||
}
|
||||
let pattern = args_list.get_item(0)?.extract::<String>()?;
|
||||
let group_index = extract_i64(&args_list.get_item(args_list.len() - 1)?)?.unwrap_or(0);
|
||||
let group_index =
|
||||
extract_i64(&args_list.get_item(args_list.len() - 1)?)?.unwrap_or(0);
|
||||
let Ok(regex) = Regex::new(&pattern) else {
|
||||
continue;
|
||||
};
|
||||
@@ -676,7 +781,9 @@ fn apply_text_filters(mut current: String, filters: &Bound<'_, PyAny>) -> PyResu
|
||||
continue;
|
||||
}
|
||||
let from = args_list.get_item(0)?.extract::<String>()?;
|
||||
let to = args_list.get_item(args_list.len() - 1)?.extract::<String>()?;
|
||||
let to = args_list
|
||||
.get_item(args_list.len() - 1)?
|
||||
.extract::<String>()?;
|
||||
current = current.replace(&from, &to);
|
||||
}
|
||||
Some("dateparse") => {
|
||||
@@ -804,7 +911,11 @@ fn parse_english_elapsed_date(value: &str) -> Option<String> {
|
||||
"year" => Duration::days(amount * 365),
|
||||
_ => return None,
|
||||
};
|
||||
Some((Local::now() - duration).format("%Y-%m-%d %H:%M:%S").to_string())
|
||||
Some(
|
||||
(Local::now() - duration)
|
||||
.format("%Y-%m-%d %H:%M:%S")
|
||||
.to_string(),
|
||||
)
|
||||
}
|
||||
|
||||
/// 将文件大小文本转换为字节数,供 Rust HTML 解析内部共用。
|
||||
@@ -869,7 +980,10 @@ fn should_skip_text_node(
|
||||
if element == root {
|
||||
return false;
|
||||
}
|
||||
if remove_selectors.iter().any(|selector| selector.matches(&element)) {
|
||||
if remove_selectors
|
||||
.iter()
|
||||
.any(|selector| selector.matches(&element))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
parent = element.parent().and_then(ElementRef::wrap);
|
||||
@@ -877,14 +991,12 @@ fn should_skip_text_node(
|
||||
false
|
||||
}
|
||||
|
||||
/// 判断 row 内是否存在指定 selector。
|
||||
fn selector_exists(row: ElementRef<'_>, selector_text: &str) -> PyResult<bool> {
|
||||
if selector_text == "*" {
|
||||
return Ok(true);
|
||||
}
|
||||
Ok(select_site_elements(row, selector_text)
|
||||
.map(|elements| !elements.is_empty())
|
||||
.unwrap_or(false))
|
||||
/// 判断 row 内是否存在预编译 selector 计划匹配的元素。
|
||||
fn selector_exists_with_plan(row: ElementRef<'_>, selector: &SelectorPlan) -> bool {
|
||||
select_site_elements_with_plan(row, selector)
|
||||
.into_iter()
|
||||
.next()
|
||||
.is_some()
|
||||
}
|
||||
|
||||
/// 拼接详情和下载链接。
|
||||
|
||||
@@ -17,6 +17,7 @@ fn is_available() -> bool {
|
||||
fn moviepilot_rust(m: &Bound<'_, PyModule>) -> PyResult<()> {
|
||||
m.add_function(wrap_pyfunction!(is_available, m)?)?;
|
||||
m.add_function(wrap_pyfunction!(filter::parse_filter_rule_fast, m)?)?;
|
||||
m.add_function(wrap_pyfunction!(filter::filter_torrents_fast, m)?)?;
|
||||
m.add_function(wrap_pyfunction!(indexer::parse_indexer_torrents_fast, m)?)?;
|
||||
m.add_function(wrap_pyfunction!(metainfo::parse_metainfo_fast, m)?)?;
|
||||
m.add_function(wrap_pyfunction!(metainfo::parse_metainfo_path_fast, m)?)?;
|
||||
|
||||
@@ -1,13 +1,14 @@
|
||||
use crate::utils::{cached_regex, get_config_string_list};
|
||||
use anitomy_pure::elements::Category;
|
||||
use anitomy_pure::Parser;
|
||||
use golia_pinyin::{is_valid_syllable, segment};
|
||||
use once_cell::sync::Lazy;
|
||||
use pyo3::prelude::*;
|
||||
use pyo3::types::{PyDict, PyList};
|
||||
use pyo3::types::PyDict;
|
||||
use regex::{Captures, Regex, RegexBuilder};
|
||||
use std::collections::{BTreeMap, HashMap, HashSet};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Mutex;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
static ANIME_BRACKET_RE: Lazy<Regex> = Lazy::new(|| {
|
||||
RegexBuilder::new(r"【[+0-9XVPI-]+】\s*【")
|
||||
@@ -311,12 +312,14 @@ static KEYWORD_META_SUFFIX_RE: Lazy<Regex> = Lazy::new(|| {
|
||||
static EPISODE_VERSION_SUFFIX_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"v\d+$").unwrap());
|
||||
static TOKEN_SPLIT_RE: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r"\s+|\(|\)|\[|]|-|【|】|/|~|;|&|\||#|_|「|」|~").unwrap());
|
||||
static STREAMING_PLATFORM_LOOKUP: Lazy<HashMap<String, String>> =
|
||||
Lazy::new(streaming_platform_lookup);
|
||||
static RELEASE_GROUP_RE_CACHE: Lazy<Mutex<HashMap<String, Regex>>> =
|
||||
Lazy::new(|| Mutex::new(HashMap::new()));
|
||||
static CUSTOMIZATION_RE_CACHE: Lazy<Mutex<HashMap<String, Regex>>> =
|
||||
Lazy::new(|| Mutex::new(HashMap::new()));
|
||||
static STREAMING_PLATFORM_CACHE: Lazy<Mutex<HashMap<usize, Arc<HashMap<String, String>>>>> =
|
||||
Lazy::new(|| Mutex::new(HashMap::new()));
|
||||
static PARSE_OPTIONS_CACHE: Lazy<Mutex<HashMap<usize, Arc<ParseOptions>>>> =
|
||||
Lazy::new(|| Mutex::new(HashMap::new()));
|
||||
|
||||
const MEDIA_TYPE_MOVIE: &str = "电影";
|
||||
const MEDIA_TYPE_TV: &str = "电视剧";
|
||||
@@ -370,11 +373,13 @@ struct ExplicitMetaInfo {
|
||||
total_episode: Option<i64>,
|
||||
}
|
||||
|
||||
/// Runtime configuration for the Rust meta-info parser, supplied by Python.
#[derive(Clone)]
struct ParseOptions {
    /// Entries of the `custom_words` option.
    custom_words: Vec<String>,
    /// Entries of the `media_exts` option, lower-cased.
    media_exts: HashSet<String>,
    /// Release-group configuration string.
    release_groups: String,
    /// Entries of the `customization` option.
    customization_patterns: Vec<String>,
    /// Shared lookup from upper-cased token to streaming platform name.
    streaming_platforms: Arc<HashMap<String, String>>,
}
|
||||
|
||||
struct TokenCursor {
|
||||
@@ -404,11 +409,22 @@ pub(crate) fn parse_metainfo_fast(
|
||||
subtitle: Option<&str>,
|
||||
options: Option<&Bound<'_, PyDict>>,
|
||||
) -> PyResult<PyObject> {
|
||||
let options = ParseOptions::from_py(options)?;
|
||||
let meta = build_meta_info(title, subtitle, &options, true)?;
|
||||
let options = ParseOptions::from_py_cached(options)?;
|
||||
let meta = build_meta_info(title, subtitle, options.as_ref(), true)?;
|
||||
meta_to_py(py, &meta)
|
||||
}
|
||||
|
||||
/// 给过滤器 size_range 规则复用完整 Rust MetaInfo 解析链路。
|
||||
pub(crate) fn parse_total_episode_for_filter(
|
||||
title: &str,
|
||||
subtitle: Option<&str>,
|
||||
options: Option<&Bound<'_, PyDict>>,
|
||||
) -> PyResult<i64> {
|
||||
let options = ParseOptions::from_py_cached(options)?;
|
||||
let meta = build_meta_info(title, subtitle, options.as_ref(), true)?;
|
||||
Ok(meta.total_episode)
|
||||
}
|
||||
|
||||
/// 从路径入口解析 MetaInfoPath,并在 Rust 内完成父目录合并。
|
||||
#[pyfunction]
|
||||
#[pyo3(signature = (path, options=None))]
|
||||
@@ -417,8 +433,8 @@ pub(crate) fn parse_metainfo_path_fast(
|
||||
path: &str,
|
||||
options: Option<&Bound<'_, PyDict>>,
|
||||
) -> PyResult<PyObject> {
|
||||
let options = ParseOptions::from_py(options)?;
|
||||
let meta = build_meta_path(path, &options)?;
|
||||
let options = ParseOptions::from_py_cached(options)?;
|
||||
let meta = build_meta_path(path, options.as_ref())?;
|
||||
meta_to_py(py, &meta)
|
||||
}
|
||||
|
||||
@@ -443,19 +459,26 @@ pub(crate) fn find_metainfo_fast(py: Python<'_>, title: &str) -> PyResult<PyObje
|
||||
}
|
||||
|
||||
impl ParseOptions {
|
||||
/// 从 Python 字典读取 Rust 解析所需配置,避免 Rust 层访问数据库或 settings。
|
||||
fn from_py(options: Option<&Bound<'_, PyDict>>) -> PyResult<Self> {
|
||||
/// 按 Python 配置字典对象身份缓存解析配置,避免每个标题重复跨语言解包。
|
||||
fn from_py_cached(options: Option<&Bound<'_, PyDict>>) -> PyResult<Arc<Self>> {
|
||||
let Some(options) = options else {
|
||||
return Ok(Self {
|
||||
return Ok(Arc::new(Self {
|
||||
custom_words: Vec::new(),
|
||||
media_exts: HashSet::new(),
|
||||
release_groups: default_release_groups(),
|
||||
release_groups: String::new(),
|
||||
customization_patterns: Vec::new(),
|
||||
});
|
||||
streaming_platforms: Arc::new(HashMap::new()),
|
||||
}));
|
||||
};
|
||||
Ok(Self {
|
||||
custom_words: get_string_list(options, "custom_words")?,
|
||||
media_exts: get_string_list(options, "media_exts")?
|
||||
let cache_key = options.as_ptr() as usize;
|
||||
if let Ok(guard) = PARSE_OPTIONS_CACHE.lock() {
|
||||
if let Some(cached) = guard.get(&cache_key) {
|
||||
return Ok(cached.clone());
|
||||
}
|
||||
}
|
||||
let parsed = Arc::new(Self {
|
||||
custom_words: get_config_string_list(options, "custom_words")?,
|
||||
media_exts: get_config_string_list(options, "media_exts")?
|
||||
.into_iter()
|
||||
.map(|item| item.to_lowercase())
|
||||
.collect(),
|
||||
@@ -465,42 +488,45 @@ impl ParseOptions {
|
||||
.map(|value| value.extract::<String>())
|
||||
.transpose()?
|
||||
.filter(|value| !value.is_empty())
|
||||
.unwrap_or_else(default_release_groups),
|
||||
customization_patterns: get_string_list(options, "customization")?,
|
||||
})
|
||||
.unwrap_or_default(),
|
||||
customization_patterns: get_config_string_list(options, "customization")?,
|
||||
streaming_platforms: get_streaming_platforms(options)?,
|
||||
});
|
||||
if let Ok(mut guard) = PARSE_OPTIONS_CACHE.lock() {
|
||||
guard.insert(cache_key, parsed.clone());
|
||||
}
|
||||
Ok(parsed)
|
||||
}
|
||||
}
|
||||
|
||||
/// 读取 Python 字典中的字符串列表。
|
||||
fn get_string_list(options: &Bound<'_, PyDict>, key: &str) -> PyResult<Vec<String>> {
|
||||
let Some(value) = options.get_item(key)? else {
|
||||
return Ok(Vec::new());
|
||||
/// 从 Python 传入的流媒体平台表构建查询映射,避免 Rust 里重复维护默认列表。
|
||||
fn get_streaming_platforms(options: &Bound<'_, PyDict>) -> PyResult<Arc<HashMap<String, String>>> {
|
||||
let mut result = HashMap::new();
|
||||
let Some(value) = options.get_item("streaming_platforms")? else {
|
||||
return Ok(Arc::new(result));
|
||||
};
|
||||
if value.is_none() {
|
||||
return Ok(Vec::new());
|
||||
return Ok(Arc::new(result));
|
||||
}
|
||||
if let Ok(list) = value.downcast::<PyList>() {
|
||||
let mut result = Vec::new();
|
||||
for item in list.iter() {
|
||||
let text = item.extract::<String>()?;
|
||||
if !text.is_empty() {
|
||||
result.push(text);
|
||||
}
|
||||
let dict = value.downcast::<PyDict>()?;
|
||||
let cache_key = dict.as_ptr() as usize;
|
||||
if let Ok(guard) = STREAMING_PLATFORM_CACHE.lock() {
|
||||
if let Some(cached) = guard.get(&cache_key) {
|
||||
return Ok(cached.clone());
|
||||
}
|
||||
return Ok(result);
|
||||
}
|
||||
if let Ok(text) = value.extract::<String>() {
|
||||
return Ok(text
|
||||
.replace('\n', ";")
|
||||
.replace('|', ";")
|
||||
.split(';')
|
||||
.filter_map(|item| {
|
||||
let item = item.trim();
|
||||
(!item.is_empty()).then(|| item.to_string())
|
||||
})
|
||||
.collect());
|
||||
for (key, value) in dict.iter() {
|
||||
let key = key.str()?.to_str()?.to_uppercase();
|
||||
let value = value.str()?.to_str()?.to_string();
|
||||
if !key.is_empty() && !value.is_empty() {
|
||||
result.insert(key, value);
|
||||
}
|
||||
}
|
||||
Ok(Vec::new())
|
||||
let result = Arc::new(result);
|
||||
if let Ok(mut guard) = STREAMING_PLATFORM_CACHE.lock() {
|
||||
guard.insert(cache_key, result.clone());
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// 构建标题入口的完整元信息。
|
||||
@@ -541,6 +567,7 @@ fn build_meta_info(
|
||||
media_exts: options.media_exts.clone(),
|
||||
release_groups: options.release_groups.clone(),
|
||||
customization_patterns: options.customization_patterns.clone(),
|
||||
streaming_platforms: options.streaming_platforms.clone(),
|
||||
},
|
||||
false,
|
||||
)?;
|
||||
@@ -1024,7 +1051,13 @@ fn parse_video(
|
||||
init_resource_type(&mut meta, &mut state, ¤t);
|
||||
}
|
||||
if state.continue_flag {
|
||||
init_web_source(&mut meta, &mut state, ¤t, &mut tokens);
|
||||
init_web_source(
|
||||
&mut meta,
|
||||
&mut state,
|
||||
¤t,
|
||||
&mut tokens,
|
||||
&options.streaming_platforms,
|
||||
);
|
||||
}
|
||||
if state.continue_flag {
|
||||
init_video_encode(&mut meta, &mut state, ¤t);
|
||||
@@ -1844,13 +1877,12 @@ fn init_web_source(
|
||||
state: &mut VideoState,
|
||||
token: &str,
|
||||
tokens: &mut TokenCursor,
|
||||
streaming_platforms: &HashMap<String, String>,
|
||||
) {
|
||||
if meta_name(meta).is_none() {
|
||||
return;
|
||||
}
|
||||
let mut platform_name = STREAMING_PLATFORM_LOOKUP
|
||||
.get(&token.to_uppercase())
|
||||
.cloned();
|
||||
let mut platform_name = streaming_platforms.get(&token.to_uppercase()).cloned();
|
||||
let mut query_range = 1usize;
|
||||
let prev_token = state
|
||||
.index
|
||||
@@ -1869,7 +1901,7 @@ fn init_web_source(
|
||||
} else {
|
||||
format!("{adjacent}{separator}{token}")
|
||||
};
|
||||
if let Some(name) = STREAMING_PLATFORM_LOOKUP.get(&combined.to_uppercase()) {
|
||||
if let Some(name) = streaming_platforms.get(&combined.to_uppercase()) {
|
||||
platform_name = Some(name.clone());
|
||||
query_range = 2;
|
||||
if is_next {
|
||||
@@ -3061,152 +3093,6 @@ fn match_customization(title: &str, patterns: &[String]) -> Option<String> {
|
||||
(!unique.is_empty()).then(|| unique.into_values().collect::<Vec<_>>().join("@"))
|
||||
}
|
||||
|
||||
/// 按正则文本缓存动态配置正则,避免每个标题重复编译长规则。
|
||||
fn cached_regex(cache: &Lazy<Mutex<HashMap<String, Regex>>>, pattern: &str) -> Option<Regex> {
|
||||
if let Ok(guard) = cache.lock() {
|
||||
if let Some(regex) = guard.get(pattern) {
|
||||
return Some(regex.clone());
|
||||
}
|
||||
}
|
||||
let regex = Regex::new(pattern).ok()?;
|
||||
if let Ok(mut guard) = cache.lock() {
|
||||
guard.insert(pattern.to_string(), regex.clone());
|
||||
}
|
||||
Some(regex)
|
||||
}
|
||||
|
||||
/// Default release-group regex, mirroring `ReleaseGroupsMatcher.RELEASE_GROUPS`.
///
/// Returns all built-in alternatives joined with `|` into one alternation string.
fn default_release_groups() -> String {
    // One regex fragment per release group (or family of groups); order is preserved.
    const PATTERNS: &[&str] = &[
        "FF(?:(?:A|WE)B|CD|E(?:DU|B)|TV)",
        "Audies",
        "AD(?:Audio|E(?:book|)|Music|Web)",
        "BeiTai",
        "Bts(?:CHOOL|HD|PAD|TV)",
        "Zone",
        "CHD(?:Bits|PAD|(?:|HK)TV|WEB|)",
        "StBOX",
        "OneHD",
        "Lee",
        "xiaopie",
        "(?:(?:iNT|(?:HALFC|Mini(?:S|H|FH)D))-|)TLF",
        "(?:DG|GBWE)B",
        "Hares(?:(?:M|T)V|Web|)",
        "HDA(?:pad|rea|TV)",
        "EPiC",
        "HDC(?:hina|TV|)",
        "k9611",
        "tudou",
        "iHD",
        "D(?:ream|BTV)",
        "(?:HD|QHstudI)o",
        "beAst(?:TV|)",
        "HDH(?:ome|Pad|TV|WEB|)",
        "HDPT(?:Web|)",
        "HDS(?:ky|TV|Pad|WEB|)",
        "AQLJ",
        "HDZ(?:one|)",
        "HHWEB",
        "HTPT",
        "FRDS",
        "Yumi",
        "cXcY",
        "L(?:eague(?:(?:C|H)D|(?:M|T)V|NF|WEB)|HD)",
        "i18n",
        "CiNT",
        "MTeam(?:TV|)",
        "MPAD",
        "MWeb",
        "Our(?:Bits|TV)",
        "FLTTH",
        "Ao",
        "PbK",
        "MGs",
        "iLove(?:HD|TV)",
        "Panda",
        "AilMWeb",
        "PiGo(?:NF|(?:H|WE)B)",
        "PTer(?:DIY|Game|(?:M|T)V|WEB|)",
        "PTH(?:Audio|eBook|music|ome|tv|WEB|)",
        "PTsbao",
        "OPS",
        "F(?:Fans(?:AIeNcE|BD|D(?:VD|IY)|TV|WEB)|HDMv)",
        "SGXT",
        "PuTao",
        "CMCT(?:V|)",
        "Shark(?:WEB|DIY|TV|MV|)",
        "TJUPT",
        "TTG",
        "WiKi",
        "NGB",
        "DoA",
        "(?:ARi|ExRE)N",
        "B(?:MDru|eyondHD|TN)",
        "C(?:fandora|trlhd|MRG)",
        "DON",
        "EVO",
        "FLUX",
        "HONE(?:yG|)",
        "N(?:oGroup|T(?:b|G))",
        "PandaMoon",
        "SMURF",
        "T(?:EPES|aengoo|rollHD )",
        "ANi",
        "HYSUB",
        "KTXP",
        "LoliHouse",
        "MCE",
        "Nekomoe kissaten",
        "SweetSub",
        "MingY",
        "(?:Lilith|NC)-Raws",
        "织梦字幕组",
        "枫叶字幕组",
        "猎户手抄部",
        "喵萌奶茶屋",
        "漫猫字幕社",
        "霜庭云花Sub",
        "北宇治字幕组",
        "氢气烤肉架",
        "云歌字幕组",
        "萌樱字幕组",
        "极影字幕社",
        "悠哈璃羽字幕社",
        "❀拨雪寻春❀",
        "沸羊羊(?:制作|字幕组)",
        "(?:桜|樱)都字幕组",
        "FROG(?:E|Web|)",
        "UB(?:its|WEB|TV)",
    ];
    PATTERNS.join("|")
}
|
||||
|
||||
/// Builds the lookup table for common streaming platforms.
///
/// Each platform is registered under both its short tag and its full name. Keys are
/// upper-cased, so callers look up an upper-cased token; values are the display names.
fn streaming_platform_lookup() -> HashMap<String, String> {
    // (short tag, display name). Every platform appears exactly once; the previously
    // duplicated ("HMAX", "Max") row only re-inserted identical keys and was dropped.
    const PLATFORMS: &[(&str, &str)] = &[
        ("AMZN", "Amazon"),
        ("NF", "Netflix"),
        ("ATVP", "Apple TV+"),
        ("DSNP", "Disney+"),
        ("PMTP", "Paramount+"),
        ("HMAX", "Max"),
        ("HULU", "Hulu Networks"),
        ("CR", "Crunchyroll"),
        ("Baha", "Baha"),
        ("BG", "B-Global"),
        ("HBO", "HBO"),
        ("CRAV", "Crave"),
        ("PCOK", "Peacock"),
    ];

    // Two keys per platform at most (short tag and full name may coincide).
    let mut map = HashMap::with_capacity(PLATFORMS.len() * 2);
    for &(short, full) in PLATFORMS {
        map.insert(short.to_uppercase(), full.to_string());
        map.insert(full.to_uppercase(), full.to_string());
    }
    map
}
|
||||
|
||||
/// 将 Rust 元信息转换为 Python dict。
|
||||
fn meta_to_py(py: Python<'_>, meta: &MetaResult) -> PyResult<PyObject> {
|
||||
let dict = PyDict::new(py);
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
use chrono::{DateTime, Local, NaiveDate, NaiveDateTime, NaiveTime, TimeZone, Utc};
|
||||
use chrono::{
|
||||
DateTime, Datelike, Local, NaiveDate, NaiveDateTime, NaiveTime, Offset, TimeZone, Timelike, Utc,
|
||||
};
|
||||
use pyo3::prelude::*;
|
||||
use pyo3::types::{PyAny, PyDict, PyList};
|
||||
use quick_xml::events::{BytesRef, BytesStart, Event};
|
||||
use quick_xml::name::QName;
|
||||
use quick_xml::Reader;
|
||||
use std::collections::HashMap;
|
||||
|
||||
#[derive(Default)]
|
||||
struct RssItem {
|
||||
@@ -35,8 +37,20 @@ pub(crate) fn parse_rss_items_fast(
|
||||
) -> PyResult<Option<PyObject>> {
|
||||
let parsed = parse_rss_items(xml_text, max_items)?;
|
||||
let result = PyList::empty(py);
|
||||
let datetime_mod = py.import("datetime")?;
|
||||
let datetime_cls = datetime_mod.getattr("datetime")?;
|
||||
let timezone_cls = datetime_mod.getattr("timezone")?;
|
||||
let timedelta_cls = datetime_mod.getattr("timedelta")?;
|
||||
let mut timezone_cache = HashMap::new();
|
||||
for item in parsed {
|
||||
result.append(item_to_py(py, &item)?)?;
|
||||
result.append(item_to_py(
|
||||
py,
|
||||
&item,
|
||||
&datetime_cls,
|
||||
&timezone_cls,
|
||||
&timedelta_cls,
|
||||
&mut timezone_cache,
|
||||
)?)?;
|
||||
}
|
||||
Ok(Some(result.into()))
|
||||
}
|
||||
@@ -45,7 +59,7 @@ pub(crate) fn parse_rss_items_fast(
|
||||
fn parse_rss_items(xml_text: &str, max_items: usize) -> PyResult<Vec<RssItem>> {
|
||||
let mut reader = Reader::from_str(xml_text);
|
||||
|
||||
let mut results = Vec::new();
|
||||
let mut results = Vec::with_capacity(max_items.min(1024));
|
||||
let mut current_item: Option<RssItem> = None;
|
||||
let mut item_depth = 0usize;
|
||||
let mut current_field: Option<(TextField, usize)> = None;
|
||||
@@ -53,8 +67,9 @@ fn parse_rss_items(xml_text: &str, max_items: usize) -> PyResult<Vec<RssItem>> {
|
||||
loop {
|
||||
match reader.read_event() {
|
||||
Ok(Event::Start(event)) => {
|
||||
let local = local_name(event.name());
|
||||
if current_item.is_none() && is_item_node(&local) {
|
||||
let name = event.name();
|
||||
let local = local_name(name.as_ref());
|
||||
if current_item.is_none() && is_item_node(local) {
|
||||
current_item = Some(RssItem::default());
|
||||
item_depth = 1;
|
||||
current_field = None;
|
||||
@@ -63,25 +78,26 @@ fn parse_rss_items(xml_text: &str, max_items: usize) -> PyResult<Vec<RssItem>> {
|
||||
|
||||
if let Some(item) = current_item.as_mut() {
|
||||
item_depth += 1;
|
||||
handle_start_field(&event, &local, item, item_depth, &mut current_field)?;
|
||||
handle_start_field(&event, local, item, item_depth, &mut current_field)?;
|
||||
}
|
||||
}
|
||||
Ok(Event::Empty(event)) => {
|
||||
let local = local_name(event.name());
|
||||
let name = event.name();
|
||||
let local = local_name(name.as_ref());
|
||||
if let Some(item) = current_item.as_mut() {
|
||||
handle_empty_field(&event, &local, item)?;
|
||||
handle_empty_field(&event, local, item)?;
|
||||
}
|
||||
}
|
||||
Ok(Event::Text(event)) => {
|
||||
if let (Some(item), Some((field, _))) = (current_item.as_mut(), current_field) {
|
||||
let text = event.decode().map_err(to_py_value_error)?.to_string();
|
||||
append_text_field(item, field, &text);
|
||||
let text = event.decode().map_err(to_py_value_error)?;
|
||||
append_text_field(item, field, text.as_ref());
|
||||
}
|
||||
}
|
||||
Ok(Event::CData(event)) => {
|
||||
if let (Some(item), Some((field, _))) = (current_item.as_mut(), current_field) {
|
||||
let text = event.decode().map_err(to_py_value_error)?.to_string();
|
||||
append_text_field(item, field, &text);
|
||||
let text = event.decode().map_err(to_py_value_error)?;
|
||||
append_text_field(item, field, text.as_ref());
|
||||
}
|
||||
}
|
||||
Ok(Event::GeneralRef(event)) => {
|
||||
@@ -91,8 +107,9 @@ fn parse_rss_items(xml_text: &str, max_items: usize) -> PyResult<Vec<RssItem>> {
|
||||
}
|
||||
}
|
||||
Ok(Event::End(event)) => {
|
||||
let local = local_name(event.name());
|
||||
if current_item.is_some() && item_depth == 1 && is_item_node(&local) {
|
||||
let name = event.name();
|
||||
let local = local_name(name.as_ref());
|
||||
if current_item.is_some() && item_depth == 1 && is_item_node(local) {
|
||||
if let Some(item) = current_item.take() {
|
||||
if let Some(item) = finalize_item(item) {
|
||||
results.push(item);
|
||||
@@ -130,17 +147,17 @@ fn parse_rss_items(xml_text: &str, max_items: usize) -> PyResult<Vec<RssItem>> {
|
||||
/// 处理开始标签,记录当前需要采集文本的字段和链接属性。
|
||||
fn handle_start_field(
|
||||
event: &BytesStart<'_>,
|
||||
local: &str,
|
||||
local: &[u8],
|
||||
item: &mut RssItem,
|
||||
depth: usize,
|
||||
current_field: &mut Option<(TextField, usize)>,
|
||||
) -> PyResult<()> {
|
||||
if local == "enclosure" {
|
||||
if local.eq_ignore_ascii_case(b"enclosure") {
|
||||
fill_enclosure(event, item)?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if local == "link" {
|
||||
if local.eq_ignore_ascii_case(b"link") {
|
||||
fill_link_from_href(event, item)?;
|
||||
}
|
||||
|
||||
@@ -153,25 +170,39 @@ fn handle_start_field(
|
||||
}
|
||||
|
||||
/// 处理空标签,覆盖 Atom 的 link href 和 RSS 的 enclosure。
|
||||
fn handle_empty_field(event: &BytesStart<'_>, local: &str, item: &mut RssItem) -> PyResult<()> {
|
||||
match local {
|
||||
"enclosure" => fill_enclosure(event, item)?,
|
||||
"link" => fill_link_from_href(event, item)?,
|
||||
_ => {}
|
||||
fn handle_empty_field(event: &BytesStart<'_>, local: &[u8], item: &mut RssItem) -> PyResult<()> {
|
||||
if local.eq_ignore_ascii_case(b"enclosure") {
|
||||
fill_enclosure(event, item)?;
|
||||
} else if local.eq_ignore_ascii_case(b"link") {
|
||||
fill_link_from_href(event, item)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// 根据标签名和已采集状态选择当前文本字段。
|
||||
fn pick_text_field(local: &str, item: &RssItem) -> Option<TextField> {
|
||||
match local {
|
||||
"title" if item.title.is_empty() => Some(TextField::Title),
|
||||
"description" | "summary" if item.description.is_empty() => Some(TextField::Description),
|
||||
"link" if item.link.is_empty() => Some(TextField::Link),
|
||||
"pubDate" | "published" | "updated" if item.pubdate.is_empty() => Some(TextField::Pubdate),
|
||||
"creator" if item.nickname.is_empty() => Some(TextField::Nickname),
|
||||
_ => None,
|
||||
fn pick_text_field(local: &[u8], item: &RssItem) -> Option<TextField> {
|
||||
if local.eq_ignore_ascii_case(b"title") && item.title.is_empty() {
|
||||
return Some(TextField::Title);
|
||||
}
|
||||
if (local.eq_ignore_ascii_case(b"description") || local.eq_ignore_ascii_case(b"summary"))
|
||||
&& item.description.is_empty()
|
||||
{
|
||||
return Some(TextField::Description);
|
||||
}
|
||||
if local.eq_ignore_ascii_case(b"link") && item.link.is_empty() {
|
||||
return Some(TextField::Link);
|
||||
}
|
||||
if (local.eq_ignore_ascii_case(b"pubDate")
|
||||
|| local.eq_ignore_ascii_case(b"published")
|
||||
|| local.eq_ignore_ascii_case(b"updated"))
|
||||
&& item.pubdate.is_empty()
|
||||
{
|
||||
return Some(TextField::Pubdate);
|
||||
}
|
||||
if local.eq_ignore_ascii_case(b"creator") && item.nickname.is_empty() {
|
||||
return Some(TextField::Nickname);
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// 追加文本字段内容,兼容 CDATA 和带内联标签的描述。
|
||||
@@ -266,7 +297,14 @@ fn finalize_item(mut item: RssItem) -> Option<RssItem> {
|
||||
}
|
||||
|
||||
/// 将 Rust 条目转换为 Python dict,字段名保持与 RssHelper.parse 原返回一致。
|
||||
fn item_to_py(py: Python<'_>, item: &RssItem) -> PyResult<PyObject> {
|
||||
fn item_to_py(
|
||||
py: Python<'_>,
|
||||
item: &RssItem,
|
||||
datetime_cls: &Bound<'_, PyAny>,
|
||||
timezone_cls: &Bound<'_, PyAny>,
|
||||
timedelta_cls: &Bound<'_, PyAny>,
|
||||
timezone_cache: &mut HashMap<i32, PyObject>,
|
||||
) -> PyResult<PyObject> {
|
||||
let dict = PyDict::new(py);
|
||||
dict.set_item("title", &item.title)?;
|
||||
dict.set_item("enclosure", &item.enclosure)?;
|
||||
@@ -274,7 +312,17 @@ fn item_to_py(py: Python<'_>, item: &RssItem) -> PyResult<PyObject> {
|
||||
dict.set_item("description", &item.description)?;
|
||||
dict.set_item("link", &item.link)?;
|
||||
if let Some(timestamp) = parse_pubdate_timestamp(&item.pubdate) {
|
||||
dict.set_item("pubdate", py_datetime_from_timestamp(py, timestamp)?)?;
|
||||
dict.set_item(
|
||||
"pubdate",
|
||||
py_datetime_from_timestamp(
|
||||
py,
|
||||
timestamp,
|
||||
datetime_cls,
|
||||
timezone_cls,
|
||||
timedelta_cls,
|
||||
timezone_cache,
|
||||
)?,
|
||||
)?;
|
||||
} else {
|
||||
dict.set_item("pubdate", "")?;
|
||||
}
|
||||
@@ -285,13 +333,41 @@ fn item_to_py(py: Python<'_>, item: &RssItem) -> PyResult<PyObject> {
|
||||
}
|
||||
|
||||
/// 将 Unix 时间戳转换为本地时区 Python datetime,匹配原 astimezone(tz=None) 语义。
|
||||
fn py_datetime_from_timestamp<'py>(py: Python<'py>, timestamp: i64) -> PyResult<Bound<'py, PyAny>> {
|
||||
let datetime_mod = py.import("datetime")?;
|
||||
let datetime_cls = datetime_mod.getattr("datetime")?;
|
||||
let timezone_cls = datetime_mod.getattr("timezone")?;
|
||||
let utc = timezone_cls.getattr("utc")?;
|
||||
let utc_dt = datetime_cls.call_method1("fromtimestamp", (timestamp, utc))?;
|
||||
utc_dt.call_method0("astimezone")
|
||||
fn py_datetime_from_timestamp<'py>(
|
||||
py: Python<'py>,
|
||||
timestamp: i64,
|
||||
datetime_cls: &Bound<'py, PyAny>,
|
||||
timezone_cls: &Bound<'py, PyAny>,
|
||||
timedelta_cls: &Bound<'py, PyAny>,
|
||||
timezone_cache: &mut HashMap<i32, PyObject>,
|
||||
) -> PyResult<Bound<'py, PyAny>> {
|
||||
let Some(local_dt) = Local
|
||||
.timestamp_opt(timestamp, 0)
|
||||
.single()
|
||||
.or_else(|| Local.timestamp_opt(timestamp, 0).earliest())
|
||||
else {
|
||||
return datetime_cls.call_method1("fromtimestamp", (timestamp,));
|
||||
};
|
||||
let offset_seconds = local_dt.offset().fix().local_minus_utc();
|
||||
let tzinfo = match timezone_cache.get(&offset_seconds) {
|
||||
Some(cached) => cached.clone_ref(py),
|
||||
None => {
|
||||
let delta = timedelta_cls.call1((0, offset_seconds))?;
|
||||
let timezone = timezone_cls.call1((delta,))?.unbind();
|
||||
timezone_cache.insert(offset_seconds, timezone.clone_ref(py));
|
||||
timezone
|
||||
}
|
||||
};
|
||||
datetime_cls.call1((
|
||||
local_dt.year(),
|
||||
local_dt.month(),
|
||||
local_dt.day(),
|
||||
local_dt.hour(),
|
||||
local_dt.minute(),
|
||||
local_dt.second(),
|
||||
0,
|
||||
tzinfo.bind(py),
|
||||
))
|
||||
}
|
||||
|
||||
/// 解析 RSS/Atom 常见日期格式并返回时间戳。
|
||||
@@ -365,17 +441,13 @@ fn local_timestamp(naive: NaiveDateTime) -> Option<i64> {
|
||||
}
|
||||
|
||||
/// 判断当前标签是否为 RSS item 或 Atom entry。
|
||||
fn is_item_node(local: &str) -> bool {
|
||||
matches!(local, "item" | "entry")
|
||||
fn is_item_node(local: &[u8]) -> bool {
|
||||
local.eq_ignore_ascii_case(b"item") || local.eq_ignore_ascii_case(b"entry")
|
||||
}
|
||||
|
||||
/// 提取 XML 名称的本地部分,用于兼容 dc:creator 这类命名空间字段。
|
||||
fn local_name(name: QName<'_>) -> String {
|
||||
let raw = std::str::from_utf8(name.as_ref()).unwrap_or_default();
|
||||
raw.rsplit_once(':')
|
||||
.map(|(_, local)| local)
|
||||
.unwrap_or(raw)
|
||||
.to_string()
|
||||
fn local_name(raw: &[u8]) -> &[u8] {
|
||||
raw.rsplit(|byte| *byte == b':').next().unwrap_or(raw)
|
||||
}
|
||||
|
||||
/// 将 quick-xml 错误转换为 Python ValueError 交给 Python 包装层判断是否兜底。
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
use pyo3::prelude::*;
|
||||
use pyo3::types::{PyAny, PyDict};
|
||||
use pyo3::types::{PyAny, PyDict, PyList};
|
||||
use regex::Regex;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Mutex;
|
||||
|
||||
/// 从 Python 字典读取可选字符串。
|
||||
pub(crate) fn get_optional_string(dict: &Bound<'_, PyDict>, key: &str) -> PyResult<Option<String>> {
|
||||
@@ -12,6 +15,22 @@ pub(crate) fn get_optional_string(dict: &Bound<'_, PyDict>, key: &str) -> PyResu
|
||||
Ok(Some(value.str()?.to_str()?.to_string()))
|
||||
}
|
||||
|
||||
/// 从 Python 字典读取非空字符串。
|
||||
pub(crate) fn get_optional_nonempty_string(
|
||||
dict: &Bound<'_, PyDict>,
|
||||
key: &str,
|
||||
) -> PyResult<Option<String>> {
|
||||
let Some(value) = get_optional_string(dict, key)? else {
|
||||
return Ok(None);
|
||||
};
|
||||
let value = value.trim().to_string();
|
||||
if value.is_empty() {
|
||||
Ok(None)
|
||||
} else {
|
||||
Ok(Some(value))
|
||||
}
|
||||
}
|
||||
|
||||
/// 从 Python 字典读取可选整数。
|
||||
pub(crate) fn get_optional_i64(dict: &Bound<'_, PyDict>, key: &str) -> PyResult<Option<i64>> {
|
||||
let Some(value) = dict.get_item(key)? else {
|
||||
@@ -30,6 +49,54 @@ pub(crate) fn get_optional_i64(dict: &Bound<'_, PyDict>, key: &str) -> PyResult<
|
||||
Ok(text.parse::<i64>().ok())
|
||||
}
|
||||
|
||||
/// 从 Python 字典读取可选浮点数。
|
||||
pub(crate) fn get_optional_f64(dict: &Bound<'_, PyDict>, key: &str) -> PyResult<Option<f64>> {
|
||||
let Some(value) = dict.get_item(key)? else {
|
||||
return Ok(None);
|
||||
};
|
||||
py_any_to_f64(&value)
|
||||
}
|
||||
|
||||
/// 从 Python 字典读取字符串列表,兼容单值字符串。
|
||||
pub(crate) fn get_string_list(dict: &Bound<'_, PyDict>, key: &str) -> PyResult<Vec<String>> {
|
||||
let Some(value) = dict.get_item(key)? else {
|
||||
return Ok(Vec::new());
|
||||
};
|
||||
py_any_to_string_list(&value)
|
||||
}
|
||||
|
||||
/// 从 Python 字典读取配置字符串列表,兼容列表和以换行、竖线、分号分隔的字符串。
|
||||
pub(crate) fn get_config_string_list(dict: &Bound<'_, PyDict>, key: &str) -> PyResult<Vec<String>> {
|
||||
let Some(value) = dict.get_item(key)? else {
|
||||
return Ok(Vec::new());
|
||||
};
|
||||
if value.is_none() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
if let Ok(list) = value.downcast::<PyList>() {
|
||||
let mut result = Vec::new();
|
||||
for item in list.iter() {
|
||||
let text = item.extract::<String>()?;
|
||||
if !text.is_empty() {
|
||||
result.push(text);
|
||||
}
|
||||
}
|
||||
return Ok(result);
|
||||
}
|
||||
if let Ok(text) = value.extract::<String>() {
|
||||
return Ok(text
|
||||
.replace('\n', ";")
|
||||
.replace('|', ";")
|
||||
.split(';')
|
||||
.filter_map(|item| {
|
||||
let item = item.trim();
|
||||
(!item.is_empty()).then(|| item.to_string())
|
||||
})
|
||||
.collect());
|
||||
}
|
||||
Ok(Vec::new())
|
||||
}
|
||||
|
||||
/// 将 Python 对象转换为 i64,用于兼容配置里字符串或数字形式的下标。
|
||||
pub(crate) fn extract_i64(value: &Bound<'_, PyAny>) -> PyResult<Option<i64>> {
|
||||
if value.is_none() {
|
||||
@@ -44,3 +111,105 @@ pub(crate) fn extract_i64(value: &Bound<'_, PyAny>) -> PyResult<Option<i64>> {
|
||||
}
|
||||
Ok(text.parse::<i64>().ok())
|
||||
}
|
||||
|
||||
/// 将 Python 值转换为可选 i64。
|
||||
pub(crate) fn py_any_to_i64(value: &Bound<'_, PyAny>) -> PyResult<Option<i64>> {
|
||||
if value.is_none() {
|
||||
return Ok(None);
|
||||
}
|
||||
if let Ok(parsed) = value.extract::<i64>() {
|
||||
return Ok(Some(parsed));
|
||||
}
|
||||
let text = value.str()?.to_str()?.trim().to_string();
|
||||
if text.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
Ok(text.parse::<i64>().ok())
|
||||
}
|
||||
|
||||
/// 将 Python 值转换为可选 f64。
|
||||
pub(crate) fn py_any_to_f64(value: &Bound<'_, PyAny>) -> PyResult<Option<f64>> {
|
||||
if value.is_none() {
|
||||
return Ok(None);
|
||||
}
|
||||
if let Ok(parsed) = value.extract::<f64>() {
|
||||
return Ok(Some(parsed));
|
||||
}
|
||||
let text = value.str()?.to_str()?.trim().to_string();
|
||||
if text.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
Ok(text.parse::<f64>().ok())
|
||||
}
|
||||
|
||||
/// 将 Python 值转换为字符串列表。
|
||||
pub(crate) fn py_any_to_string_list(value: &Bound<'_, PyAny>) -> PyResult<Vec<String>> {
|
||||
if value.is_none() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
if let Ok(list) = value.downcast::<PyList>() {
|
||||
let mut result = Vec::new();
|
||||
for item in list.iter() {
|
||||
let text = item.str()?.to_str()?.to_string();
|
||||
if !text.is_empty() {
|
||||
result.push(text);
|
||||
}
|
||||
}
|
||||
return Ok(result);
|
||||
}
|
||||
let text = value.str()?.to_str()?.to_string();
|
||||
if text.is_empty() {
|
||||
Ok(Vec::new())
|
||||
} else {
|
||||
Ok(vec![text])
|
||||
}
|
||||
}
|
||||
|
||||
/// 从对象读取可选字符串属性。
|
||||
pub(crate) fn object_optional_string(
|
||||
obj: &Bound<'_, PyAny>,
|
||||
attr: &str,
|
||||
) -> PyResult<Option<String>> {
|
||||
let value = obj.getattr(attr)?;
|
||||
if value.is_none() {
|
||||
return Ok(None);
|
||||
}
|
||||
let text = value.str()?.to_str()?.to_string();
|
||||
if text.is_empty() {
|
||||
Ok(None)
|
||||
} else {
|
||||
Ok(Some(text))
|
||||
}
|
||||
}
|
||||
|
||||
/// 从对象读取可选字符串列表属性。
|
||||
pub(crate) fn object_string_list(obj: &Bound<'_, PyAny>, attr: &str) -> PyResult<Vec<String>> {
|
||||
let value = obj.getattr(attr)?;
|
||||
py_any_to_string_list(&value)
|
||||
}
|
||||
|
||||
/// 从对象读取可选整数属性。
|
||||
pub(crate) fn object_optional_i64(obj: &Bound<'_, PyAny>, attr: &str) -> PyResult<Option<i64>> {
|
||||
let value = obj.getattr(attr)?;
|
||||
py_any_to_i64(&value)
|
||||
}
|
||||
|
||||
/// 从对象读取可选浮点属性。
|
||||
pub(crate) fn object_optional_f64(obj: &Bound<'_, PyAny>, attr: &str) -> PyResult<Option<f64>> {
|
||||
let value = obj.getattr(attr)?;
|
||||
py_any_to_f64(&value)
|
||||
}
|
||||
|
||||
/// 按正则文本缓存动态正则,避免热路径重复编译。
|
||||
pub(crate) fn cached_regex(cache: &Mutex<HashMap<String, Regex>>, pattern: &str) -> Option<Regex> {
|
||||
if let Ok(guard) = cache.lock() {
|
||||
if let Some(regex) = guard.get(pattern) {
|
||||
return Some(regex.clone());
|
||||
}
|
||||
}
|
||||
let regex = Regex::new(pattern).ok()?;
|
||||
if let Ok(mut guard) = cache.lock() {
|
||||
guard.insert(pattern.to_string(), regex.clone());
|
||||
}
|
||||
Some(regex)
|
||||
}
|
||||
|
||||
182
scripts/benchmark_filter_torrents_rust.py
Normal file
182
scripts/benchmark_filter_torrents_rust.py
Normal file
@@ -0,0 +1,182 @@
|
||||
import argparse
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
||||
sys.path.insert(0, str(PROJECT_ROOT))
|
||||
|
||||
from app.core.context import MediaInfo, TorrentInfo
|
||||
from app.modules.filter import FilterModule
|
||||
from app.modules.filter.RuleParser import RuleParser
|
||||
|
||||
|
||||
class StaticRuleHelper:
    """
    Rule-helper stand-in that serves a fixed list of rule groups.

    Lets the benchmark measure the filter chain without system configuration
    or database access influencing the result.
    """

    def __init__(self, groups):
        # Stored as given; each group is expected to expose a ``name`` attribute.
        self._groups = groups

    def get_rule_group_by_media(self, media=None, group_names=None):  # noqa: ARG002
        """
        Return the fixed groups, optionally restricted to ``group_names``.

        ``media`` is accepted but ignored. When ``group_names`` is falsy the
        stored collection itself is returned (not a copy).
        """
        if group_names:
            return [candidate for candidate in self._groups if candidate.name in group_names]
        return self._groups
|
||||
|
||||
|
||||
def build_module() -> FilterModule:
    """
    Build the FilterModule used by the benchmark.

    The module gets a static rule helper with two rule groups ("quality" and
    "site") and a rule set whose entries use regex include/exclude patterns,
    label and site-name matching, download volume factor, size range, seeder
    count, publish time and a TMDB attribute.
    """
    rule_groups = [
        SimpleNamespace(name="quality", rule_string="(HDR | DV) & !BLU > WEBDL & FREE"),
        SimpleNamespace(name="site", rule_string="LABEL & SITE & SIZE & SEED & PUB & TMDB"),
    ]
    rule_set = {
        "HDR": {"include": r"[\s.]+HDR[\s.]+|HDR10|HDR10\+"},
        "DV": {"include": r"DOVI|Dolby[\s.]+Vision"},
        "BLU": {
            "include": [r"\bBlu-?Ray\b"],
            "exclude": [r"(?<!WEB-|HDTV)RIP"],
        },
        "WEBDL": {"include": r"WEB-?DL|WEB-?RIP"},
        "FREE": {"downloadvolumefactor": 0},
        "LABEL": {"include": "官方", "match": ["labels"]},
        "SITE": {"include": "Alpha|Beta", "match": ["site_name"]},
        "SIZE": {"size_range": "100-500"},
        "SEED": {"seeders": "5"},
        "PUB": {"publish_time": "0-2880"},
        "TMDB": {"tmdb": {"original_language": "zh,cn"}},
    }

    module = FilterModule()
    module.rulehelper = StaticRuleHelper(rule_groups)
    module.rule_set = rule_set
    return module
|
||||
|
||||
|
||||
def build_torrents(count: int) -> list[TorrentInfo]:
    """
    Build a list of ``count`` torrents for the benchmark.

    Every index divisible by 3 produces the "non-matching" variant (BluRay
    title, "转载" label, larger size, one seeder, non-free); all other indexes
    produce the "matching" variant. All torrents share one publish time,
    90 minutes before the call.
    """
    published_at = datetime.now() - timedelta(minutes=90)
    pubdate = published_at.strftime("%Y-%m-%d %H:%M:%S")

    def make(index: int) -> TorrentInfo:
        """Build the torrent for one index."""
        matched = index % 3 != 0
        episode = index % 12 + 1
        if matched:
            title = f"Example Show S01E{episode:02d}-E{episode + 1:02d} 1080p WEB-DL HDR10"
        else:
            title = f"Example Show S01E{episode:02d} 1080p BluRay HDR"
        return TorrentInfo(
            site_name="Alpha" if index % 2 else "Beta",
            title=title,
            description="简繁中字",
            labels=["官方"] if matched else ["转载"],
            size=(700 if matched else 2400) * 1024 * 1024,
            seeders=20 if matched else 1,
            downloadvolumefactor=0 if matched else 1,
            pubdate=pubdate,
        )

    return [make(index) for index in range(count)]
|
||||
|
||||
|
||||
def build_mediainfo() -> MediaInfo:
    """
    Build a MediaInfo whose ``original_language`` is ``"zh"``.

    The benchmark's TMDB rule lists ``"zh,cn"`` for ``original_language``.
    """
    media = MediaInfo()
    media.original_language = "zh"
    return media
|
||||
|
||||
|
||||
def reset_priority(torrents: list[TorrentInfo]) -> None:
    """
    Set ``pri_order`` back to 0 on every torrent, in place.

    Clears the priority left by a previous filter run so repeated
    measurements do not influence each other.
    """
    for entry in torrents:
        entry.pri_order = 0
|
||||
|
||||
|
||||
def python_filter(module: FilterModule, rule_groups: list[str], torrent_list: list[TorrentInfo], mediainfo: MediaInfo):
    """
    Run the legacy Python private filter as the baseline for the benchmark.

    The rule groups named in ``rule_groups`` are applied one after another,
    each receiving the previous group's output. One ``RuleParser`` and one
    parsed-rule cache are shared across all groups. Returns the list left
    after the last group, or the input list when no group is selected.
    """
    rule_parser = RuleParser()
    rule_cache = {}
    remaining = torrent_list
    selected = module.rulehelper.get_rule_group_by_media(media=mediainfo, group_names=rule_groups)
    for rule_group in selected:
        remaining = module._FilterModule__filter_torrents(
            rule_string=rule_group.rule_string,
            rule_name=rule_group.name,
            torrent_list=remaining,
            mediainfo=mediainfo,
            parser=rule_parser,
            parsed_rule_cache=rule_cache,
        )
    return remaining
|
||||
|
||||
|
||||
def rust_filter(module: FilterModule, rule_groups: list[str], torrent_list: list[TorrentInfo], mediainfo: MediaInfo):
    """
    Call the public ``filter_torrents`` entry point and return its result.

    The benchmark treats this entry point as the Rust-backed filter chain.
    """
    call_kwargs = {
        "rule_groups": rule_groups,
        "torrent_list": torrent_list,
        "mediainfo": mediainfo,
    }
    return module.filter_torrents(**call_kwargs)
|
||||
|
||||
|
||||
def measure(func, module: FilterModule, torrents: list[TorrentInfo], mediainfo: MediaInfo, loops: int, repeats: int):
    """
    Time ``func`` over several repeats and return the median per-loop cost.

    Each repeat runs ``loops`` iterations; every iteration resets the torrent
    priorities and calls ``func(module, ["quality", "site"], torrents, mediainfo)``.

    Returns a tuple ``(median_ms_per_loop, filtered_count)``, where
    ``filtered_count`` is the length of the last call's result.
    """
    rule_groups = ["quality", "site"]
    filtered_count = 0
    per_loop_ms = []
    for _repeat in range(repeats):
        started = time.perf_counter()
        for _loop in range(loops):
            reset_priority(torrents)
            filtered_count = len(func(module, rule_groups, torrents, mediainfo))
        elapsed = time.perf_counter() - started
        # Seconds for the whole repeat -> milliseconds per loop.
        per_loop_ms.append(elapsed * 1000 / loops)
    return statistics.median(per_loop_ms), filtered_count
|
||||
|
||||
|
||||
def _positive_int(raw: str) -> int:
    """Argparse ``type`` callable: parse ``raw`` as an integer and require it to be >= 1."""
    try:
        value = int(raw)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {raw!r}") from None
    if value < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {value}")
    return value


def parse_args(argv=None):
    """
    Parse the benchmark's command-line arguments.

    ``--loops`` and ``--repeats`` must be at least 1: the measurement divides
    by the loop count and takes the median over the repeats, so zero would
    fail with ZeroDivisionError / StatisticsError instead of a usage error.

    :param argv: argument list to parse; ``None`` (the default) reads ``sys.argv[1:]``.
    :return: namespace with ``items``, ``loops`` and ``repeats``.
    """
    parser = argparse.ArgumentParser(description="Benchmark FilterModule.filter_torrents Rust entry")
    parser.add_argument("--items", type=int, default=2000, help="Torrent count per loop")
    parser.add_argument("--loops", type=_positive_int, default=20, help="Loops per repeat")
    parser.add_argument("--repeats", type=_positive_int, default=5, help="Repeat count")
    return parser.parse_args(argv)
|
||||
|
||||
|
||||
def main() -> int:
    """
    Run the benchmark for both filter chains and print the comparison.

    The ``rust_filter`` chain is measured first, then ``python_filter``, on
    the same module, torrents and media info. Prints the configuration, both
    result counts, both per-loop medians and the speedup (0 when the first
    median is 0). Always returns 0.
    """
    args = parse_args()
    module = build_module()
    torrents = build_torrents(args.items)
    mediainfo = build_mediainfo()

    rust_ms, rust_count = measure(rust_filter, module, torrents, mediainfo, args.loops, args.repeats)
    python_ms, python_count = measure(python_filter, module, torrents, mediainfo, args.loops, args.repeats)
    if rust_ms:
        speedup = python_ms / rust_ms
    else:
        speedup = 0

    report = (
        f"items_per_loop={len(torrents)} loops={args.loops} repeats={args.repeats}",
        f"rust_items={rust_count} python_items={python_count}",
        f"rust_chain_ms_per_loop={rust_ms:.3f}",
        f"python_chain_ms_per_loop={python_ms:.3f}",
        f"speedup={speedup:.2f}x",
    )
    for line in report:
        print(line)
    return 0
|
||||
|
||||
|
||||
# Script entry point: exit the process with main()'s return value as the status code.
if __name__ == "__main__":
    sys.exit(main())
|
||||
@@ -180,6 +180,7 @@ def _metainfo_options(custom_words=None):
|
||||
"media_exts": settings.RMT_MEDIAEXT + settings.RMT_SUBEXT + settings.RMT_AUDIOEXT,
|
||||
"release_groups": release_groups,
|
||||
"customization": customization,
|
||||
"streaming_platforms": metainfo_module._rust_parse_options()["streaming_platforms"],
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
import unittest
|
||||
from datetime import datetime, timedelta
|
||||
from types import SimpleNamespace
|
||||
|
||||
from app.core.context import TorrentInfo
|
||||
from app.core.context import MediaInfo, TorrentInfo
|
||||
from app.helper.torrent import TorrentHelper
|
||||
from app.modules.filter import FilterModule
|
||||
|
||||
@@ -64,6 +65,76 @@ class TorrentFilterTest(unittest.TestCase):
|
||||
self.assertEqual([torrent], filtered)
|
||||
self.assertEqual(100, torrent.pri_order)
|
||||
|
||||
def test_filter_torrents_keeps_sequential_rule_group_semantics(self):
    """
    With rule groups "first" (HDR) and "second" (FREE), only the torrent with
    ``downloadvolumefactor=0`` is returned, and it ends with ``pri_order`` 100.
    """
    rule_groups = [
        SimpleNamespace(name="first", rule_string="HDR"),
        SimpleNamespace(name="second", rule_string="FREE"),
    ]
    module = FilterModule()
    module.rulehelper = _RuleHelper(rule_groups)
    module.rule_set = {
        "HDR": {"include": "HDR"},
        "FREE": {"downloadvolumefactor": 0},
    }
    # Same title on both torrents; they differ only in the download volume factor.
    free_torrent = TorrentInfo(title="Movie HDR WEB-DL", description="", downloadvolumefactor=0)
    paid_torrent = TorrentInfo(title="Movie HDR WEB-DL", description="", downloadvolumefactor=1)

    result = module.filter_torrents(
        rule_groups=["first", "second"],
        torrent_list=[free_torrent, paid_torrent],
    )

    self.assertEqual([free_torrent], result)
    self.assertEqual(100, free_torrent.pri_order)
|
||||
|
||||
def test_filter_torrents_supports_full_rule_fields_in_rust_entry(self):
    """
    A torrent satisfying the TMDB, label, size-range, seeder, publish-time and
    site-name rules together is kept and gets ``pri_order`` 100.
    """
    rule_set = {
        "TMDB": {"tmdb": {"original_language": "zh,cn"}},
        "LABEL": {"include": "官方", "match": ["labels"]},
        "SIZE": {"size_range": "100-400"},
        "SEED": {"seeders": "5"},
        "PUB": {"publish_time": "0-120"},
        "SITE": {"include": "Alpha", "match": ["site_name"]},
    }
    module = _build_filter_module(
        rule_string="TMDB & LABEL & SIZE & SEED & PUB & SITE",
        rule_set=rule_set,
    )

    half_hour_ago = datetime.now() - timedelta(minutes=30)
    torrent = TorrentInfo(
        site_name="Alpha",
        title="Show S01E01-E02 1080p",
        description="",
        labels=["官方"],
        size=600 * 1024 * 1024,
        seeders=8,
        pubdate=half_hour_ago.strftime("%Y-%m-%d %H:%M:%S"),
    )
    mediainfo = MediaInfo()
    mediainfo.original_language = "zh"

    result = module.filter_torrents(rule_groups=["test"], torrent_list=[torrent], mediainfo=mediainfo)

    self.assertEqual([torrent], result)
    self.assertEqual(100, torrent.pri_order)
|
||||
|
||||
def test_filter_torrents_uses_rust_entry_without_python_match_fallback(self):
    """
    ``filter_torrents`` must return the matching torrent without calling the
    legacy private ``__filter_torrents`` implementation.
    """

    def reject_legacy_path(*_args, **_kwargs):
        """
        Fail at once if the entry point still calls the old Python private matcher.
        """
        raise AssertionError("Python fallback should not be called")

    module = _build_filter_module(
        rule_string="KEEP",
        rule_set={"KEEP": {"include": "Movie"}},
    )
    # Replace the name-mangled private method so any call to it raises.
    module._FilterModule__filter_torrents = reject_legacy_path

    candidates = [TorrentInfo(title="Movie", description="")]
    result = module.filter_torrents(rule_groups=["test"], torrent_list=candidates)

    self.assertEqual(1, len(result))
|
||||
|
||||
def test_filter_torrent_keeps_extra_filter_semantics(self):
|
||||
torrent = TorrentInfo(
|
||||
title="Movie 1080p HDR",
|
||||
|
||||
Reference in New Issue
Block a user