diff --git a/app/modules/indexer/spider/__init__.py b/app/modules/indexer/spider/__init__.py index c3059c30..7a0e04e9 100644 --- a/app/modules/indexer/spider/__init__.py +++ b/app/modules/indexer/spider/__init__.py @@ -12,6 +12,7 @@ from pyquery import PyQuery from app.core.config import settings from app.log import logger from app.schemas.types import MediaType +from app.utils import rust_accel from app.utils.http import RequestUtils, AsyncRequestUtils from app.utils.string import StringUtils from app.utils.url import UrlUtils @@ -737,6 +738,17 @@ class SiteSpider: self.is_error = True return [] + rust_torrents = rust_accel.parse_indexer_torrents( + html_text=html_text, + domain=self.domain, + list_config=self.list, + fields=self.fields, + category=self.category, + result_num=self.result_num + ) + if rust_torrents is not None: + return rust_torrents + # 清空旧结果 self.torrents_info_array = [] html_doc = None diff --git a/app/utils/rust_accel.py b/app/utils/rust_accel.py index 4297ca2f..fb73a0ad 100644 --- a/app/utils/rust_accel.py +++ b/app/utils/rust_accel.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import List, Optional from app.log import logger @@ -39,6 +39,34 @@ def parse_filter_rule(expression: str) -> Optional[list]: return None +def parse_indexer_torrents( + html_text: str, + domain: str, + list_config: dict, + fields: dict, + category: Optional[dict] = None, + result_num: int = 100 +) -> Optional[List[dict]]: + """ + 使用 Rust 批量解析普通配置站点种子列表,不可用时返回 None。 + """ + if not _moviepilot_rust: + return None + try: + return _moviepilot_rust.parse_indexer_torrents_fast( + html_text, + domain, + list_config, + fields, + category, + result_num + ) + except BaseException as err: + _raise_non_rust_panic(err) + logger.debug(f"Rust 站点列表解析失败,使用 Python 解析兜底:{err}") + return None + + def _raise_non_rust_panic(err: BaseException) -> None: """ 只吞掉 Rust 扩展 panic/异常,保留用户中断和进程退出语义。 diff --git a/rust/moviepilot_rust/Cargo.lock b/rust/moviepilot_rust/Cargo.lock index 8d32992b..f2eaad29 100644 --- a/rust/moviepilot_rust/Cargo.lock +++ b/rust/moviepilot_rust/Cargo.lock @@ -2,24 +2,370 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "autocfg" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "bitflags" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" + +[[package]] +name = "bumpalo" +version = "3.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "cc" +version = "1.2.62" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1dce859f0832a7d088c4f1119888ab94ef4b5d6795d1ce05afb7fe159d79f98" +dependencies = [ + "find-msvc-tools", + "shlex", +] + [[package]] name = "cfg-if" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "chrono" +version = "0.4.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" +dependencies = [ + "iana-time-zone", + "js-sys", + "num-traits", + "wasm-bindgen", + "windows-link", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "cssparser" +version = "0.35.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e901edd733a1472f944a45116df3f846f54d37e67e68640ac8bb69689aca2aa" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa", + "phf", + "smallvec", +] + +[[package]] +name = "cssparser-macros" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" +dependencies = [ + "quote", + "syn", +] + +[[package]] +name = "derive_more" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d751e9e49156b02b44f9c1815bcb94b984cdcc4396ecc32521c739452808b134" +dependencies = [ + "derive_more-impl", +] + +[[package]] +name = "derive_more-impl" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "799a97264921d8623a957f6c3b9011f3b5492f557bbb7a5a19b7fa6d06ba8dcb" +dependencies = [ + "proc-macro2", + "quote", + "rustc_version", + "syn", +] + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "dtoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590" + +[[package]] +name = "dtoa-short" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" +dependencies = [ + "dtoa", +] + +[[package]] +name = "ego-tree" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8" + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "form_urlencoded" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + +[[package]] +name = "futures-core" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" + +[[package]] +name = "futures-task" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" + +[[package]] +name = "futures-util" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" +dependencies = [ + "futures-core", + "futures-task", + "pin-project-lite", + "slab", +] + +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "getopts" +version = "0.2.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df" +dependencies = [ + "unicode-width", +] + [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "html5ever" +version = "0.35.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55d958c2f74b664487a2035fe1dadb032c48718a03b63f3ab0b8537db8549ed4" +dependencies = [ + "log", + "markup5ever", + "match_token", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "icu_collections" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c" +dependencies = [ + "displaydoc", + "potential_utf", + "utf8_iter", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4" +dependencies = [ + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38" + +[[package]] +name = "icu_properties" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de" +dependencies = [ + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14" + +[[package]] +name = "icu_provider" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421" +dependencies = [ + "displaydoc", + "icu_locale_core", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "idna" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb68373c0d6620ef8105e855e7745e18b0d00d3bdb07fb532e434244cdb9a714" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + [[package]] name = "indoc" version = "2.0.7" @@ -29,12 +375,91 @@ dependencies = [ "rustversion", ] +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "js-sys" +version = "0.3.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "142bc4740e452c1e57ade0cbc129f139c9093e354346f0872ef985f4f5cf5f11" +dependencies = [ + "cfg-if", + "futures-util", + "once_cell", + "wasm-bindgen", +] + [[package]] name = "libc" version = "0.2.186" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" +[[package]] +name = "litemap" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" + +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "markup5ever" +version = "0.35.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "311fe69c934650f8f19652b3946075f0fc41ad8757dbb68f1ca14e7900ecc1c3" +dependencies = [ + "log", + "tendril", + "web_atoms", +] + +[[package]] +name = "match_token" +version = "0.35.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac84fd3f360fcc43dc5f5d186f02a94192761a080e8bc58621ad4d12296a58cf" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "memo-map" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38d1115007560874e373613744c6fba374c17688327a71c1476d1a5954cc857b" + [[package]] name = "memoffset" version = "0.9.1" @@ -44,11 +469,42 @@ dependencies = [ "autocfg", ] +[[package]] +name = "minijinja" +version = "2.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2929e494b2280e1e18959bb2e121da03347ae896896fdfaceaab43c88a02803f" +dependencies = [ + "memo-map", + "serde", +] + [[package]] name = "moviepilot-rust" version = "0.1.0" dependencies = [ + "chrono", + "minijinja", + "once_cell", "pyo3", + "regex", + "scraper", + "url", +] + +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", ] [[package]] @@ -57,12 +513,114 @@ version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "phf" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +dependencies = [ + "phf_macros", + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_macros" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "phf_shared" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + [[package]] name = "portable-atomic" version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" +[[package]] +name = "potential_utf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564" +dependencies = [ + "zerovec", +] + +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + [[package]] name = "proc-macro2" version = "1.0.106" @@ -144,12 +702,213 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a" +dependencies = [ + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" + +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + [[package]] name = "rustversion" version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "scraper" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5f3a24d916e78954af99281a455168d4a9515d65eca99a18da1b813689c4ad9" +dependencies = [ + "cssparser", + "ego-tree", + "getopts", + "html5ever", + "precomputed-hash", + "selectors", + "tendril", +] + +[[package]] +name = "selectors" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5685b6ae43bfcf7d2e7dfcfb5d8e8f61b46442c902531e41a32a9a8bf0ee0fb6" +dependencies = [ + "bitflags", + "cssparser", + "derive_more", + "fxhash", + "log", + "new_debug_unreachable", + "phf", + "phf_codegen", + "precomputed-hash", + "servo_arc", + "smallvec", +] + +[[package]] +name = "semver" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "servo_arc" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "170fb83ab34de17dc69aa7c67482b22218ddb85da56546f9bd6b929e32a05930" +dependencies = [ + "stable_deref_trait", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "siphasher" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ee5873ec9cce0195efcb7a4e9507a04cd49aec9c83d0389df45b1ef7ba2e649" + +[[package]] +name = "slab" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + +[[package]] +name = "string_cache" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f" +dependencies = [ + "new_debug_unreachable", + "parking_lot", + "phf_shared", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", +] + [[package]] name = "syn" version = "2.0.117" @@ -161,20 +920,281 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "target-lexicon" version = "0.12.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + +[[package]] +name = "tinystr" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d" +dependencies = [ + "displaydoc", + "zerovec", +] + [[package]] name = "unicode-ident" version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + [[package]] name = "unindent" version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" + +[[package]] +name = "url" +version = "2.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", + "serde", +] + +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "wasm-bindgen" +version = "0.2.122" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ed04576f974d2b2fba0f38c51dbc5518011e38c36bf1143164be765528fd409" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.122" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "916151b09da36bd82f6615cbf3a419e2f0ba23a03c6160e8e92eb6bd4aa1dec6" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.122" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "299047362ccbfce148b67ab7e73349f77748e00c8296f9542adfad2ad82c5c5e" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.122" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a929b2c61f11ba3e9bc35b50c1f25cb38e0e892c0c231ae2b8cf78d5dad4437" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web_atoms" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57ffde1dc01240bdf9992e3205668b235e59421fd085e8a317ed98da0178d414" +dependencies = [ + "phf", + "phf_codegen", + "string_cache", + "string_cache_codegen", +] + +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + +[[package]] +name = "writeable" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" + +[[package]] +name = "yoke" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" +dependencies = [ + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zerofrom" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ec05a11813ea801ff6d75110ad09cd0824ddba17dfe17128ea0d5f68e6c5272" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zerotrie" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/rust/moviepilot_rust/Cargo.toml b/rust/moviepilot_rust/Cargo.toml index cbb57cc8..713ab289 100644 --- a/rust/moviepilot_rust/Cargo.toml +++ b/rust/moviepilot_rust/Cargo.toml @@ -8,4 +8,10 @@ name = "moviepilot_rust" crate-type = ["cdylib"] [dependencies] +minijinja = "2.20" +chrono = "0.4" +once_cell = "1.20" pyo3 = { version = "0.23", features = ["abi3-py311", "extension-module"] } +regex = "1.11" +scraper = "0.24" +url = "2.5" diff --git a/rust/moviepilot_rust/src/indexer.rs b/rust/moviepilot_rust/src/indexer.rs new file mode 100644 index 00000000..c84e4775 --- /dev/null +++ b/rust/moviepilot_rust/src/indexer.rs @@ -0,0 +1,972 @@ +use crate::utils::{extract_i64, get_optional_i64, get_optional_string}; +use chrono::{DateTime, Duration, Local, NaiveDate, NaiveDateTime, NaiveTime}; +use minijinja::{context, Environment, UndefinedBehavior}; +use once_cell::sync::Lazy; +use pyo3::prelude::*; +use pyo3::types::{PyAny, PyDict, PyList}; +use regex::{Regex, RegexBuilder}; +use scraper::{ElementRef, Html, Selector}; +use std::collections::{BTreeMap, HashMap, HashSet}; +use url::form_urlencoded; +use url::Url; + +static FILESIZE_UNIT_RE: Lazy = Lazy::new(|| { + RegexBuilder::new(r"[KMGTPI]*B?") + .case_insensitive(true) + .build() + .unwrap() +}); +static NUMERIC_FACTOR_RE: Lazy = Lazy::new(|| Regex::new(r"(\d+\.?\d*)").unwrap()); +static FIELD_REF_RE: Lazy = + Lazy::new(|| Regex::new(r#"fields(?:\.([A-Za-z0-9_]+)|\[\s*['"]([^'"]+)['"]\s*\])"#).unwrap()); +static HAS_QUOTED_SELECTOR_RE: Lazy = + Lazy::new(|| Regex::new(r#":has\(\s*"([^"]+)"\s*\)|:has\(\s*'([^']+)'\s*\)"#).unwrap()); +static HAS_SELECTOR_RE: Lazy = + Lazy::new(|| Regex::new(r#":has\(\s*(?:"([^"]+)"|'([^']+)'|([^)]*))\s*\)"#).unwrap()); +static TABLE_DIRECT_TR_RE: Lazy = + Lazy::new(|| Regex::new(r#"\b(table[^>,]*?)\s*>\s*(tr(?:[^\s>,]*)?)"#).unwrap()); +static EN_ELAPSED_RE: Lazy = + Lazy::new(|| Regex::new(r"(?i)(\d+)\s*(second|minute|hour|day|week|month|year)s?\s*ago").unwrap()); +const OUTPUT_FIELDS: &[(&str, &str)] = &[ + ("title", "title"), + ("description", "description"), + ("imdbid", "imdbid"), + ("size", "size"), + ("leechers", "peers"), + ("seeders", "seeders"), + ("grabs", "grabs"), + ("date_elapsed", "date_elapsed"), + ("freedate", "freedate"), + ("labels", "labels"), + ("hr", "hit_and_run"), + ("category", "category"), +]; + +struct FieldSpec<'py> { + name: String, + config: Bound<'py, PyDict>, +} + +enum RowParseResult { + Empty, + Item(PyObject), +} + +/// 批量解析普通配置 indexer 页面,优先在 Rust 内覆盖站点配置语义。 +#[pyfunction] +#[pyo3(signature = (html_text, domain, list_config, fields, category=None, result_num=100))] +pub(crate) fn parse_indexer_torrents_fast( + py: Python<'_>, + html_text: &str, + domain: &str, + list_config: &Bound<'_, PyDict>, + fields: &Bound<'_, PyDict>, + category: Option<&Bound<'_, PyDict>>, + result_num: usize, +) -> PyResult> { + let Some(list_selector_text) = get_optional_string(list_config, "selector")? else { + return Ok(None); + }; + if list_selector_text.is_empty() { + return Ok(None); + } + let document = Html::parse_document(html_text); + let Some(rows) = select_site_elements(document.root_element(), &list_selector_text) else { + return Ok(None); + }; + let result = PyList::empty(py); + let field_specs = build_field_specs(fields)?; + for row in rows.into_iter().take(result_num) { + match parse_indexer_row(py, row, domain, &field_specs, category)? { + RowParseResult::Empty => {} + RowParseResult::Item(item) => result.append(item)?, + } + } + Ok(Some(result.into())) +} + +/// 预处理字段配置,保留 Python 字典引用以避免重复转换整份配置。 +fn build_field_specs<'py>(fields: &Bound<'py, PyDict>) -> PyResult>> { + let mut specs = Vec::new(); + for (key, value) in fields.iter() { + if value.is_none() { + continue; + } + let Ok(config) = value.downcast_into::() else { + continue; + }; + specs.push(FieldSpec { + name: key.extract::()?, + config, + }); + } + Ok(specs) +} + +/// 解析单行种子信息,覆盖普通配置站点的主字段抽取流程。 +fn parse_indexer_row( + py: Python<'_>, + row: ElementRef<'_>, + domain: &str, + field_specs: &[FieldSpec<'_>], + category: Option<&Bound<'_, PyDict>>, +) -> PyResult { + let output = PyDict::new(py); + let field_map = field_specs + .iter() + .map(|field| (field.name.as_str(), field)) + .collect::>>(); + let mut cache = BTreeMap::new(); + let mut resolving = HashSet::new(); + + if let Some(value) = eval_field_by_name(row, &field_map, "details", &mut cache, &mut resolving)? { + if !value.is_empty() { + output.set_item("page_url", normalize_site_link(domain, &value, true))?; + } + } + if let Some(value) = eval_field_by_name(row, &field_map, "download", &mut cache, &mut resolving)? { + if !value.is_empty() { + output.set_item("enclosure", normalize_site_link(domain, &value, false))?; + } + } + if let Some(value) = eval_factor_field(row, &field_map, "downloadvolumefactor", &mut cache, &mut resolving)? { + output.set_item("downloadvolumefactor", value)?; + } + if let Some(value) = eval_factor_field(row, &field_map, "uploadvolumefactor", &mut cache, &mut resolving)? { + output.set_item("uploadvolumefactor", value)?; + } + if let Some(value) = eval_pubdate_field(row, &field_map, &mut cache, &mut resolving)? { + if !value.is_empty() { + output.set_item("pubdate", value)?; + } + } + + for (source_key, target_key) in OUTPUT_FIELDS { + match *source_key { + "labels" => { + parse_labels_field(py, row, &field_map, &output)?; + } + "hr" => { + if let Some(value) = eval_hr_field(row, &field_map)? { + output.set_item(*target_key, value)?; + } + } + "category" => { + if let Some(value) = eval_field_by_name(row, &field_map, source_key, &mut cache, &mut resolving)? { + output.set_item(*target_key, map_category_value(&value, category)?)?; + } + } + "size" => { + if let Some(value) = eval_field_by_name(row, &field_map, source_key, &mut cache, &mut resolving)? { + output.set_item(*target_key, parse_filesize_text(value.replace('\n', "").trim()))?; + } + } + "leechers" | "seeders" | "grabs" => { + if let Some(value) = eval_field_by_name(row, &field_map, source_key, &mut cache, &mut resolving)? { + output.set_item(*target_key, parse_peer_count(&value))?; + } + } + _ => { + if let Some(value) = eval_field_by_name(row, &field_map, source_key, &mut cache, &mut resolving)? { + if !value.is_empty() { + output.set_item(*target_key, value.replace('\n', " ").trim().to_string())?; + } + } + } + } + } + + if output.is_empty() { + return Ok(RowParseResult::Empty); + } + Ok(RowParseResult::Item(output.into())) +} + +/// 按字段名求值并缓存结果,支持 Jinja 模板里的任意 fields 引用。 +fn eval_field_by_name( + row: ElementRef<'_>, + field_map: &HashMap<&str, &FieldSpec<'_>>, + name: &str, + cache: &mut BTreeMap, + resolving: &mut HashSet, +) -> PyResult> { + if let Some(value) = cache.get(name) { + return Ok(Some(value.clone())); + } + if resolving.contains(name) { + return Ok(Some(String::new())); + } + let Some(spec) = field_map.get(name).copied() else { + return Ok(None); + }; + resolving.insert(name.to_string()); + let value = eval_field(row, field_map, spec, cache, resolving)?; + resolving.remove(name); + if let Some(value) = value.clone() { + cache.insert(name.to_string(), value); + } + Ok(value) +} + +/// 执行单个字段配置,统一处理 selector/text/default/filter 的组合语义。 +fn eval_field( + row: ElementRef<'_>, + field_map: &HashMap<&str, &FieldSpec<'_>>, + spec: &FieldSpec<'_>, + cache: &mut BTreeMap, + resolving: &mut HashSet, +) -> PyResult> { + let mut value = if let Some(template) = get_optional_string(&spec.config, "text")? { + Some(render_field_template(row, field_map, &template, cache, resolving)?) + } else { + safe_query(row, &spec.config)? + }; + + if let Some(current) = value.as_deref() { + if contains_jinja_syntax(current) { + value = Some(render_embedded_value( + row, + field_map, + &spec.name, + current, + cache, + resolving, + )?); + } + } + if let Some(filters) = spec.config.get_item("filters")? { + if !filters.is_none() { + value = apply_text_filters(value.unwrap_or_default(), &filters)?; + } + } + if value.as_deref().map(str::is_empty).unwrap_or(true) { + if let Some(default_value) = get_default_value(&spec.config)? { + value = Some(default_value); + } + } + Ok(value) +} + +/// 渲染字段 text 模板,只抽取模板实际引用的依赖字段。 +fn render_field_template( + row: ElementRef<'_>, + field_map: &HashMap<&str, &FieldSpec<'_>>, + template: &str, + cache: &mut BTreeMap, + resolving: &mut HashSet, +) -> PyResult { + let mut values = BTreeMap::new(); + for key in extract_template_field_names(template) { + let value = eval_field_by_name(row, field_map, &key, cache, resolving)?.unwrap_or_default(); + values.insert(key, value); + } + Ok(render_jinja_template(template, &values).unwrap_or_default()) +} + +/// 渲染字段值中残留的 Jinja 模板,兼容少数站点把模板写进 title 属性的情况。 +fn render_embedded_value( + row: ElementRef<'_>, + field_map: &HashMap<&str, &FieldSpec<'_>>, + current_name: &str, + template: &str, + cache: &mut BTreeMap, + resolving: &mut HashSet, +) -> PyResult { + let mut values = BTreeMap::new(); + for key in extract_template_field_names(template) { + if key == current_name { + values.insert(key, String::new()); + continue; + } + let value = eval_field_by_name(row, field_map, &key, cache, resolving)?.unwrap_or_default(); + values.insert(key, value); + } + Ok(render_jinja_template(template, &values).unwrap_or_default()) +} + +/// 提取 Jinja 模板中出现过的 fields 字段名。 +fn extract_template_field_names(template: &str) -> Vec { + let mut keys = Vec::new(); + for captures in FIELD_REF_RE.captures_iter(template) { + let Some(key) = captures.get(1).or_else(|| captures.get(2)) else { + continue; + }; + let key = key.as_str(); + if !keys.iter().any(|item: &String| item == key) { + keys.push(key.to_string()); + } + } + keys +} + +/// 读取字段默认值,兼容历史配置里的 defualt_value 拼写。 +fn get_default_value(selector_config: &Bound<'_, PyDict>) -> PyResult> { + if let Some(value) = get_optional_string(selector_config, "default_value")? { + return Ok(Some(value)); + } + get_optional_string(selector_config, "defualt_value") +} + +/// 解析上传/下载优惠系数字段,保留配置里 0.5/0.3 这类浮点倍率。 +fn eval_factor_field( + row: ElementRef<'_>, + field_map: &HashMap<&str, &FieldSpec<'_>>, + key: &str, + cache: &mut BTreeMap, + resolving: &mut HashSet, +) -> PyResult> { + let Some(spec) = field_map.get(key).copied() else { + return Ok(None); + }; + if let Some(case_obj) = spec.config.get_item("case")? { + let case_dict = case_obj.downcast::()?; + for (case_selector_obj, value) in case_dict.iter() { + let case_selector = case_selector_obj.extract::()?; + if selector_exists(row, &case_selector)? { + return Ok(Some(value.extract::().unwrap_or(1.0))); + } + } + return Ok(Some(1.0)); + } + if let Some(value) = eval_field_by_name(row, field_map, key, cache, resolving)? { + if let Some(number) = NUMERIC_FACTOR_RE + .captures(&value) + .and_then(|caps| caps.get(1)) + .and_then(|item| item.as_str().parse::().ok()) + { + return Ok(Some(number)); + } + } + Ok(Some(1.0)) +} + +/// 解析标签列表字段,保持 Python 侧 labels 输出为字符串数组。 +fn parse_labels_field( + py: Python<'_>, + row: ElementRef<'_>, + field_map: &HashMap<&str, &FieldSpec<'_>>, + output: &Bound<'_, PyDict>, +) -> PyResult<()> { + let Some(spec) = field_map.get("labels").copied() else { + return Ok(()); + }; + if !spec.config.contains("selector")? { + output.set_item("labels", PyList::empty(py))?; + return Ok(()); + } + let labels = PyList::empty(py); + if let Some(values) = query_all_values(row, &spec.config)? { + for value in values.into_iter().filter(|item| !item.is_empty()) { + labels.append(value)?; + } + } + output.set_item("labels", labels)?; + Ok(()) +} + +/// 解析 HR 标记字段,配置存在时输出布尔值。 +fn eval_hr_field( + row: ElementRef<'_>, + field_map: &HashMap<&str, &FieldSpec<'_>>, +) -> PyResult> { + let Some(spec) = field_map.get("hr").copied() else { + return Ok(None); + }; + let Some(selector_text) = get_selector_text(&spec.config)? else { + return Ok(Some(false)); + }; + Ok(Some(selector_exists(row, &selector_text)?)) +} + +/// 将站点分类 ID 映射为 MoviePilot 的媒体类型中文值。 +fn map_category_value(value: &str, category: Option<&Bound<'_, PyDict>>) -> PyResult<&'static str> { + let Some(category) = category else { + return Ok("未知"); + }; + let tv_cats = category_ids_for_field(category, "tv")?; + let movie_cats = category_ids_for_field(category, "movie")?; + if tv_cats.iter().any(|item| item == value) && !movie_cats.iter().any(|item| item == value) { + return Ok("电视剧"); + } + if movie_cats.iter().any(|item| item == value) { + return Ok("电影"); + } + Ok("未知") +} + +/// 解析整数类统计字段,兼容 "12/34" 和千分位逗号。 +fn parse_peer_count(value: &str) -> i64 { + value + .split('/') + .next() + .unwrap_or("") + .replace(',', "") + .trim() + .parse::() + .unwrap_or(0) +} + +/// 解析发布时间字段,并在 date 模板产出相对时间时使用 date_added 保持可排序时间。 +fn eval_pubdate_field( + row: ElementRef<'_>, + field_map: &HashMap<&str, &FieldSpec<'_>>, + cache: &mut BTreeMap, + resolving: &mut HashSet, +) -> PyResult> { + if let Some(value) = eval_field_by_name(row, field_map, "date", cache, resolving)? { + let normalized = normalize_pubdate_text(&value); + if is_standard_datetime(&normalized) || !field_map.contains_key("date_added") { + return Ok(Some(normalized)); + } + if let Some(date_added) = eval_field_by_name(row, field_map, "date_added", cache, resolving)? { + let fallback = normalize_pubdate_text(&date_added); + if is_standard_datetime(&fallback) { + return Ok(Some(fallback)); + } + } + return Ok(Some(normalized)); + } + Ok(eval_field_by_name(row, field_map, "date_added", cache, resolving)? + .map(|value| normalize_pubdate_text(&value))) +} + +/// 规范化发布时间文本为 MoviePilot 期望的字符串格式。 +fn normalize_pubdate_text(value: &str) -> String { + if let Some(parsed) = format_date_value(value, "%Y-%m-%d %H:%M:%S") { + return parsed; + } + value.replace('\n', " ").trim().to_string() +} + +/// 判断文本是否已经是 MoviePilot 标准日期时间格式。 +fn is_standard_datetime(value: &str) -> bool { + NaiveDateTime::parse_from_str(value, "%Y-%m-%d %H:%M:%S").is_ok() +} + +/// 解析标准 CSS selector,并保留 table > tr 的 HTML5 tbody 兼容扩展。 +fn parse_css_selector(selector_text: &str) -> Option { + if selector_text == "*" { + return Selector::parse("*").ok(); + } + let normalized = normalize_pyquery_selector(selector_text); + let expanded = expand_table_direct_tr_selector(&normalized); + if let Ok(selector) = Selector::parse(&expanded) { + return Some(selector); + } + if expanded != normalized { + if let Ok(selector) = Selector::parse(&normalized) { + return Some(selector); + } + } + Selector::parse(selector_text).ok() +} + +/// 查询站点选择器,额外支持 PyQuery 的 :has("selector") 写法。 +fn select_site_elements<'a>( + root: ElementRef<'a>, + selector_text: &str, +) -> Option>> { + let Some(captures) = HAS_SELECTOR_RE.captures(selector_text) else { + let selector = parse_css_selector(selector_text)?; + return Some(root.select(&selector).collect()); + }; + let matched = captures.get(0)?; + let prefix = selector_text[..matched.start()].trim(); + let suffix = selector_text[matched.end()..].trim(); + let inner = captures + .get(1) + .or_else(|| captures.get(2)) + .or_else(|| captures.get(3))? + .as_str() + .trim(); + let base_selector = parse_css_selector(prefix)?; + let has_selector = parse_css_selector(inner)?; + let bases = root + .select(&base_selector) + .filter(|element| element.select(&has_selector).next().is_some()); + if suffix.is_empty() { + return Some(bases.collect()); + } + let suffix_selector_text = suffix.trim_start_matches('>').trim(); + let suffix_selector = parse_css_selector(suffix_selector_text)?; + let mut values = Vec::new(); + for base in bases { + values.extend(base.select(&suffix_selector)); + } + Some(values) +} + +/// 将 PyQuery 扩展选择器转换为 scraper 可识别的 CSS selector 形式。 +fn normalize_pyquery_selector(selector_text: &str) -> String { + HAS_QUOTED_SELECTOR_RE + .replace_all(selector_text, |captures: ®ex::Captures<'_>| { + let inner = captures + .get(1) + .or_else(|| captures.get(2)) + .map(|item| item.as_str()) + .unwrap_or_default(); + format!(":has({inner})") + }) + .into_owned() +} + +/// 为 table > tr 选择器追加 tbody 变体,适配 Rust HTML5 解析自动补 tbody 的行为。 +fn expand_table_direct_tr_selector(selector_text: &str) -> String { + let expanded = TABLE_DIRECT_TR_RE.replace_all(selector_text, "$1 > tbody > $2"); + if expanded == selector_text { + return selector_text.to_string(); + } + format!("{selector_text}, {expanded}") +} + +/// 执行 selector 查询并返回第一个符合 index/contents 规则的文本。 +fn safe_query(row: ElementRef<'_>, selector_config: &Bound<'_, PyDict>) -> PyResult> { + let Some(values) = query_all_values(row, selector_config)? else { + return Ok(None); + }; + Ok(select_indexed_value(values, selector_config)) +} + +/// 查询 selector 的全部文本或属性值。 +fn query_all_values( + row: ElementRef<'_>, + selector_config: &Bound<'_, PyDict>, +) -> PyResult>> { + let Some(selector_text) = get_selector_text(selector_config)? else { + return Ok(None); + }; + let Some(elements) = select_site_elements(row, &selector_text) else { + return Ok(None); + }; + let attribute = get_optional_string(selector_config, "attribute")?; + let remove_selectors = parse_remove_selectors(selector_config)?; + let mut values = Vec::new(); + for element in elements { + if let Some(attribute) = attribute.as_deref() { + values.push(element.value().attr(attribute).unwrap_or("").to_string()); + } else { + values.push(normalize_element_text(element, &remove_selectors)); + } + } + Ok(Some(values)) +} + +/// 解析 remove 配置,支持逗号分隔的 CSS 选择器列表。 +fn parse_remove_selectors(selector_config: &Bound<'_, PyDict>) -> PyResult> { + let Some(remove_text) = get_optional_string(selector_config, "remove")? else { + return Ok(Vec::new()); + }; + let mut selectors = Vec::new(); + for item in remove_text.split(',') { + let item = item.trim(); + if item.is_empty() { + continue; + } + let Some(selector) = parse_css_selector(item) else { + return Ok(Vec::new()); + }; + selectors.push(selector); + } + Ok(selectors) +} + +/// 读取 selector 或 selectors 配置。 +fn get_selector_text(selector_config: &Bound<'_, PyDict>) -> PyResult> { + if let Some(selector) = get_optional_string(selector_config, "selector")? { + if !selector.is_empty() { + return Ok(Some(selector)); + } + } + if let Some(selector) = get_optional_string(selector_config, "selectors")? { + if !selector.is_empty() { + return Ok(Some(selector)); + } + } + Ok(None) +} + +/// 对查询结果应用 contents/index 规则。 +fn select_indexed_value( + values: Vec, + selector_config: &Bound<'_, PyDict>, +) -> Option { + if values.is_empty() { + return None; + } + if let Ok(Some(contents)) = get_optional_i64(selector_config, "contents") { + if let Some(first) = values.first() { + let lines: Vec<&str> = first.split('\n').collect(); + return pick_indexed_item(&lines, contents).map(|item| item.to_string()); + } + } + if let Ok(Some(index)) = get_optional_i64(selector_config, "index") { + return pick_indexed_item(&values, index).cloned(); + } + values.first().cloned() +} + +/// 按 Python 列表语义读取正负索引。 +fn pick_indexed_item(items: &[T], index: i64) -> Option<&T> { + let len = items.len() as i64; + let resolved = if index < 0 { len + index } else { index }; + if resolved < 0 { + return None; + } + items.get(resolved as usize) +} + +/// 执行 indexer 文本过滤器,覆盖 Build 配置中出现的全部过滤器。 +fn apply_text_filters(mut current: String, filters: &Bound<'_, PyAny>) -> PyResult> { + let Ok(filter_list) = filters.downcast::() else { + return Ok(Some(current)); + }; + for item in filter_list.iter() { + let filter = item.downcast::()?; + let method_name = get_optional_string(filter, "name")?; + if current.is_empty() { + break; + } + match method_name.as_deref() { + Some("re_search") => { + let Some(args) = filter.get_item("args")? else { + continue; + }; + let Ok(args_list) = args.downcast::() else { + continue; + }; + if args_list.len() < 2 { + continue; + } + let pattern = args_list.get_item(0)?.extract::()?; + let group_index = extract_i64(&args_list.get_item(args_list.len() - 1)?)?.unwrap_or(0); + let Ok(regex) = Regex::new(&pattern) else { + continue; + }; + if let Some(captures) = regex.captures(¤t) { + if let Some(value) = captures.get(group_index as usize) { + current = value.as_str().to_string(); + } + } + } + Some("split") => { + let Some(args) = filter.get_item("args")? else { + continue; + }; + let Ok(args_list) = args.downcast::() else { + continue; + }; + if args_list.len() < 2 { + continue; + } + let delimiter = args_list.get_item(0)?.extract::()?; + let index = extract_i64(&args_list.get_item(args_list.len() - 1)?)?.unwrap_or(0); + let parts: Vec<&str> = current.split(&delimiter).collect(); + if let Some(value) = pick_indexed_item(&parts, index) { + current = value.to_string(); + } + } + Some("replace") => { + let Some(args) = filter.get_item("args")? else { + continue; + }; + let Ok(args_list) = args.downcast::() else { + continue; + }; + if args_list.len() < 2 { + continue; + } + let from = args_list.get_item(0)?.extract::()?; + let to = args_list.get_item(args_list.len() - 1)?.extract::()?; + current = current.replace(&from, &to); + } + Some("dateparse") => { + let Some(args) = filter.get_item("args")? else { + continue; + }; + let format = args.str()?.to_str()?.to_string(); + if let Some(value) = format_date_value(¤t, &format) { + current = value; + } + } + Some("date_en_elapsed_parse") => { + if let Some(value) = parse_english_elapsed_date(¤t) { + current = value; + } + } + Some("strip") => { + current = current.trim().to_string(); + } + Some("lstrip") => { + let Some(args) = filter.get_item("args")? else { + current = current.trim_start().to_string(); + continue; + }; + current = lstrip_text(¤t, &args)?; + } + Some("appendleft") => { + let Some(args) = filter.get_item("args")? else { + continue; + }; + current = format!("{}{}", args.str()?.to_str()?, current); + } + Some("querystring") => { + let Some(args) = filter.get_item("args")? else { + continue; + }; + current = query_param_value(¤t, args.str()?.to_str()?).unwrap_or_default(); + } + _ => {} + } + } + Ok(Some(current.trim().to_string())) +} + +/// 按 Python str.lstrip(chars) 语义处理左侧字符集。 +fn lstrip_text(current: &str, args: &Bound<'_, PyAny>) -> PyResult { + let chars = if let Ok(args_list) = args.downcast::() { + if args_list.is_empty() { + String::new() + } else { + args_list.get_item(0)?.str()?.to_str()?.to_string() + } + } else { + args.str()?.to_str()?.to_string() + }; + if chars.is_empty() { + return Ok(current.trim_start().to_string()); + } + Ok(current + .trim_start_matches(|ch| chars.contains(ch)) + .to_string()) +} + +/// 将日期文本按站点格式解析为统一时间字符串。 +fn format_date_value(value: &str, format: &str) -> Option { + let value = value.replace('\n', " ").trim().to_string(); + if value.is_empty() { + return None; + } + if value.eq_ignore_ascii_case("now") { + return Some(Local::now().format("%Y-%m-%d %H:%M:%S").to_string()); + } + if let Ok(datetime) = NaiveDateTime::parse_from_str(&value, format) { + return Some(datetime.format("%Y-%m-%d %H:%M:%S").to_string()); + } + if let Ok(date) = NaiveDate::parse_from_str(&value, format) { + return Some( + date.and_time(NaiveTime::from_hms_opt(0, 0, 0)?) + .format("%Y-%m-%d %H:%M:%S") + .to_string(), + ); + } + parse_common_date_value(&value) +} + +/// 尝试解析站点常见日期格式,补足 dateparse 失败后的兼容路径。 +fn parse_common_date_value(value: &str) -> Option { + if let Ok(datetime) = DateTime::parse_from_rfc3339(value) { + return Some(datetime.format("%Y-%m-%d %H:%M:%S").to_string()); + } + for format in [ + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%d%H:%M:%S", + "%Y-%m-%d %H:%M", + "%Y-%m-%d", + "%b %d %Y, %H:%M", + "%H:%M:%S%d/%m/%Y", + ] { + if let Ok(datetime) = NaiveDateTime::parse_from_str(value, format) { + return Some(datetime.format("%Y-%m-%d %H:%M:%S").to_string()); + } + if let Ok(date) = NaiveDate::parse_from_str(value, format) { + let datetime = date.and_time(NaiveTime::from_hms_opt(0, 0, 0)?); + return Some(datetime.format("%Y-%m-%d %H:%M:%S").to_string()); + } + } + None +} + +/// 解析 IPT 等英文站点的相对发布时间。 +fn parse_english_elapsed_date(value: &str) -> Option { + if let Some(parsed) = parse_common_date_value(value) { + return Some(parsed); + } + let captures = EN_ELAPSED_RE.captures(value)?; + let amount = captures.get(1)?.as_str().parse::().ok()?; + let unit = captures.get(2)?.as_str().to_ascii_lowercase(); + let duration = match unit.as_str() { + "second" => Duration::seconds(amount), + "minute" => Duration::minutes(amount), + "hour" => Duration::hours(amount), + "day" => Duration::days(amount), + "week" => Duration::weeks(amount), + "month" => Duration::days(amount * 30), + "year" => Duration::days(amount * 365), + _ => return None, + }; + Some((Local::now() - duration).format("%Y-%m-%d %H:%M:%S").to_string()) +} + +/// 将文件大小文本转换为字节数,供 Rust HTML 解析内部共用。 +fn parse_filesize_text(text: &str) -> i64 { + let raw = text.trim().to_string(); + if raw.is_empty() { + return 0; + } + if raw.chars().all(|ch| ch.is_ascii_digit()) { + return raw.parse::().unwrap_or(0); + } + let normalized = raw.replace([',', ' '], "").to_uppercase(); + let size_text = FILESIZE_UNIT_RE.replace_all(&normalized, "").to_string(); + let Ok(mut size) = size_text.parse::() else { + return 0; + }; + if normalized.contains("PB") || normalized.contains("PIB") { + size *= 1024_f64.powi(5); + } else if normalized.contains("TB") || normalized.contains("TIB") { + size *= 1024_f64.powi(4); + } else if normalized.contains("GB") || normalized.contains("GIB") { + size *= 1024_f64.powi(3); + } else if normalized.contains("MB") || normalized.contains("MIB") { + size *= 1024_f64.powi(2); + } else if normalized.contains("KB") || normalized.contains("KIB") { + size *= 1024_f64; + } + size.round() as i64 +} + +/// 规范化元素文本,尽量接近 PyQuery.text() 输出。 +fn normalize_element_text(element: ElementRef<'_>, remove_selectors: &[Selector]) -> String { + let mut rendered = String::new(); + for node in element.descendants() { + let Some(text_node) = node.value().as_text() else { + continue; + }; + if should_skip_text_node( + node.parent().and_then(ElementRef::wrap), + element, + remove_selectors, + ) { + continue; + } + rendered.push_str(text_node); + } + normalize_whitespace(&rendered) +} + +/// 折叠 PyQuery.text() 中的连续空白,保留元素相邻文本节点的直接拼接效果。 +fn normalize_whitespace(value: &str) -> String { + value.split_whitespace().collect::>().join(" ") +} + +/// 判断文本节点是否位于需要 remove 的元素子树中。 +fn should_skip_text_node( + mut parent: Option>, + root: ElementRef<'_>, + remove_selectors: &[Selector], +) -> bool { + while let Some(element) = parent { + if element == root { + return false; + } + if remove_selectors.iter().any(|selector| selector.matches(&element)) { + return true; + } + parent = element.parent().and_then(ElementRef::wrap); + } + false +} + +/// 判断 row 内是否存在指定 selector。 +fn selector_exists(row: ElementRef<'_>, selector_text: &str) -> PyResult { + if selector_text == "*" { + return Ok(true); + } + Ok(select_site_elements(row, selector_text) + .map(|elements| !elements.is_empty()) + .unwrap_or(false)) +} + +/// 拼接详情和下载链接。 +fn normalize_site_link(domain: &str, link: &str, protocol_relative: bool) -> String { + if link.starts_with("http") || link.starts_with("magnet") { + return link.to_string(); + } + if protocol_relative && link.starts_with("//") { + let scheme = domain.split(':').next().unwrap_or("http"); + return format!("{scheme}:{link}"); + } + if !protocol_relative { + if let Ok(base) = Url::parse(&standardize_base_url(domain)) { + if let Some(host) = base.host_str() { + if link.contains(host) { + if link.starts_with('/') { + return format!("{}:{link}", base.scheme()); + } + return format!("{}://{link}", base.scheme()); + } + } + } + } + if let Some(stripped) = link.strip_prefix('/') { + format!("{domain}{stripped}") + } else { + format!("{domain}{link}") + } +} + +/// 使用 MiniJinja 渲染站点字段模板,语义对齐 Python jinja2 的 Template.render(fields=...)。 +fn render_jinja_template(template: &str, fields: &BTreeMap) -> Option { + let mut env = Environment::new(); + env.set_undefined_behavior(UndefinedBehavior::Chainable); + env.render_str(template, context! { fields => fields }).ok() +} + +/// 判断文本是否包含 Jinja 语法标记,作为字段内嵌模板的低成本预筛选。 +fn contains_jinja_syntax(value: &str) -> bool { + value.contains("{{") || value.contains("{%") || value.contains("{#") +} + +/// 读取分类配置中的 ID 列表。 +fn category_ids_for_field(category: &Bound<'_, PyDict>, key: &str) -> PyResult> { + let Some(list_obj) = category.get_item(key)? else { + return Ok(Vec::new()); + }; + let Ok(list) = list_obj.downcast::() else { + return Ok(Vec::new()); + }; + let mut values = Vec::new(); + for item in list.iter() { + let dict = item.downcast::()?; + if let Some(id) = get_optional_string(dict, "id")? { + values.push(id); + } + } + Ok(values) +} + +/// 读取 URL 查询参数中的第一个值。 +fn query_param_value(text: &str, key: &str) -> Option { + let query = if let Ok(url) = Url::parse(text) { + url.query().unwrap_or("").to_string() + } else { + text.split_once('?') + .map(|(_, query)| query.split('#').next().unwrap_or("").to_string()) + .unwrap_or_default() + }; + form_urlencoded::parse(query.as_bytes()) + .find(|(param_key, _)| param_key == key) + .map(|(_, value)| value.to_string()) +} + +/// 标准化基础 URL,与 Python UrlUtils.standardize_base_url 保持一致。 +fn standardize_base_url(host: &str) -> String { + let mut value = host.to_string(); + if !value.ends_with('/') { + value.push('/'); + } + if !value.starts_with("http://") && !value.starts_with("https://") { + value = format!("http://{value}"); + } + value +} diff --git a/rust/moviepilot_rust/src/lib.rs b/rust/moviepilot_rust/src/lib.rs index 683f67ab..df536fdc 100644 --- a/rust/moviepilot_rust/src/lib.rs +++ b/rust/moviepilot_rust/src/lib.rs @@ -1,4 +1,6 @@ mod filter; +mod indexer; +mod utils; use pyo3::prelude::*; @@ -13,5 +15,6 @@ fn is_available() -> bool { fn moviepilot_rust(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(is_available, m)?)?; m.add_function(wrap_pyfunction!(filter::parse_filter_rule_fast, m)?)?; + m.add_function(wrap_pyfunction!(indexer::parse_indexer_torrents_fast, m)?)?; Ok(()) } diff --git a/rust/moviepilot_rust/src/utils.rs b/rust/moviepilot_rust/src/utils.rs new file mode 100644 index 00000000..c08bffdf --- /dev/null +++ b/rust/moviepilot_rust/src/utils.rs @@ -0,0 +1,46 @@ +use pyo3::prelude::*; +use pyo3::types::{PyAny, PyDict}; + +/// 从 Python 字典读取可选字符串。 +pub(crate) fn get_optional_string(dict: &Bound<'_, PyDict>, key: &str) -> PyResult> { + let Some(value) = dict.get_item(key)? else { + return Ok(None); + }; + if value.is_none() { + return Ok(None); + } + Ok(Some(value.str()?.to_str()?.to_string())) +} + +/// 从 Python 字典读取可选整数。 +pub(crate) fn get_optional_i64(dict: &Bound<'_, PyDict>, key: &str) -> PyResult> { + let Some(value) = dict.get_item(key)? else { + return Ok(None); + }; + if value.is_none() { + return Ok(None); + } + if let Ok(parsed) = value.extract::() { + return Ok(Some(parsed)); + } + let text = value.str()?.to_str()?.trim().to_string(); + if text.is_empty() { + return Ok(None); + } + Ok(text.parse::().ok()) +} + +/// 将 Python 对象转换为 i64,用于兼容配置里字符串或数字形式的下标。 +pub(crate) fn extract_i64(value: &Bound<'_, PyAny>) -> PyResult> { + if value.is_none() { + return Ok(None); + } + if let Ok(parsed) = value.extract::() { + return Ok(Some(parsed)); + } + let text = value.str()?.to_str()?.trim().to_string(); + if text.is_empty() { + return Ok(None); + } + Ok(text.parse::().ok()) +} diff --git a/scripts/benchmark_indexer_rust.py b/scripts/benchmark_indexer_rust.py new file mode 100644 index 00000000..5db36e3a --- /dev/null +++ b/scripts/benchmark_indexer_rust.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python3 +""" +Benchmark SiteSpider indexer parsing with real MoviePilot-Build site configs. +""" + +import argparse +import copy +import contextlib +import io +import statistics +import sys +import time +from pathlib import Path +from typing import Callable, Dict, List + +import yaml + +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from app.modules.indexer import spider as spider_module +from app.modules.indexer.spider import SiteSpider + + +def _load_site_config(sites_dir: Path, site_id: str) -> dict: + """ + 从 MoviePilot-Build sites 目录读取指定站点配置。 + """ + candidates = list(sites_dir.rglob(f"{site_id}.yml")) + if not candidates: + raise FileNotFoundError(f"找不到站点配置:{site_id}") + with candidates[0].open("r", encoding="utf-8") as file: + return yaml.safe_load(file) + + +def _nexus_row( + torrent_id: int, + title: str, + category: str = "402", + date_text: str = "2025-05-01 12:13:14", + free_class: str = "pro_free", +) -> str: + """ + 生成 NexusPhp 类站点配置可解析的单行 HTML。 + """ + return f""" + + cat + + + + + +
+ {title}.fallback + download + + + Desc {torrent_id} remove + 标签{torrent_id} +
+ + + {date_text} + 1.5 GB + {torrent_id + 10} + {torrent_id % 7} + {torrent_id % 11} + + """ + + +def _build_nexus_html(rows: int) -> str: + """ + 生成多行 NexusPhp 表格 HTML,用于驱动真实 SiteSpider 解析链路。 + """ + body = "\n".join( + _nexus_row(1000 + index, f"Benchmark.Title.{index}") + for index in range(rows) + ) + return f"{body}
" + + +def _build_ipt_html(rows: int) -> str: + """ + 生成 IPT 配置使用的表格 HTML,覆盖 lstrip 和英文相对时间过滤器。 + """ + body = "\n".join( + f""" + + Benchmark.IPT.{index} + downloadFree +
Uploaded | {index % 5 + 1} hours ago
+ + {1 + index % 9}.2 GB + {index % 13} + {index + 20} + {index % 7} + + """ + for index in range(rows) + ) + return f"{body}
" + + +def _prepare_cases(sites_dir: Path, rows: int) -> List[Dict[str, object]]: + """ + 构造使用真实站点配置的 benchmark 用例。 + """ + cases = [] + for site_id in ["agsvpt", "pttime", "chdbits"]: + config = _load_site_config(sites_dir, site_id) + cases.append({ + "site": site_id, + "indexer": config, + "html": _build_nexus_html(rows), + }) + cases.append({ + "site": "iptorrents", + "indexer": _load_site_config(sites_dir, "iptorrents"), + "html": _build_ipt_html(rows), + }) + return cases + + +def _parse_with_rust(indexer: dict, html: str) -> List[dict]: + """ + 使用 SiteSpider 默认入口解析,Rust 扩展可用时会命中 Rust fast path。 + """ + return SiteSpider(copy.deepcopy(indexer)).parse(html) + + +def _parse_with_python(indexer: dict, html: str) -> List[dict]: + """ + 临时禁用 Rust fast path,复用同一个 SiteSpider.parse 入口测 PyQuery 路径。 + """ + original = spider_module.rust_accel.parse_indexer_torrents + spider_module.rust_accel.parse_indexer_torrents = lambda *args, **kwargs: None + try: + return SiteSpider(copy.deepcopy(indexer)).parse(html) + finally: + spider_module.rust_accel.parse_indexer_torrents = original + + +def _time_parser(parser: Callable[[dict, str], List[dict]], indexer: dict, html: str, loops: int) -> float: + """ + 多次执行解析函数并返回总耗时。 + """ + buffer = io.StringIO() + with contextlib.redirect_stdout(buffer), contextlib.redirect_stderr(buffer): + started = time.perf_counter() + for _ in range(loops): + parser(indexer, html) + return time.perf_counter() - started + + +def _median_time(parser: Callable[[dict, str], List[dict]], indexer: dict, html: str, loops: int, repeats: int) -> float: + """ + 重复测量并取中位数,降低单次抖动对结论的影响。 + """ + samples = [_time_parser(parser, indexer, html, loops) for _ in range(repeats)] + return statistics.median(samples) + + +def run_benchmark(sites_dir: Path, rows: int, loops: int, repeats: int) -> None: + """ + 执行真实 SiteSpider.parse 链路 benchmark 并打印 Rust/PyQuery 对比。 + """ + cases = _prepare_cases(sites_dir, rows) + print(f"sites_dir={sites_dir}") + print(f"rows={rows} loops={loops} repeats={repeats}") + print("site, rows, rust_ms, python_ms, speedup") + for case in cases: + buffer = io.StringIO() + with contextlib.redirect_stdout(buffer), contextlib.redirect_stderr(buffer): + rust_result = _parse_with_rust(case["indexer"], case["html"]) + python_result = _parse_with_python(case["indexer"], case["html"]) + if not rust_result or not python_result: + raise RuntimeError(f"{case['site']} benchmark fixture did not parse any rows") + + rust_time = _median_time(_parse_with_rust, case["indexer"], case["html"], loops, repeats) + python_time = _median_time(_parse_with_python, case["indexer"], case["html"], loops, repeats) + speedup = python_time / rust_time if rust_time else 0 + print( + f"{case['site']}, {len(rust_result)}, " + f"{rust_time * 1000:.2f}, {python_time * 1000:.2f}, {speedup:.2f}x" + ) + + +def parse_args() -> argparse.Namespace: + """ + 解析命令行参数。 + """ + parser = argparse.ArgumentParser(description="Benchmark MoviePilot Rust indexer parser") + parser.add_argument( + "--sites-dir", + type=Path, + default=Path("/Users/jxxghp/PycharmProjects/MoviePilot-Build/sites"), + help="MoviePilot-Build sites directory", + ) + parser.add_argument("--rows", type=int, default=80, help="Rows per fixture page") + parser.add_argument("--loops", type=int, default=50, help="Loops per timing sample") + parser.add_argument("--repeats", type=int, default=5, help="Timing samples per parser") + return parser.parse_args() + + +def main() -> None: + """ + 入口函数。 + """ + args = parse_args() + run_benchmark(args.sites_dir, args.rows, args.loops, args.repeats) + + +if __name__ == "__main__": + main() diff --git a/tests/test_rust_accel.py b/tests/test_rust_accel.py index dc45f0ec..cc9b2b14 100644 --- a/tests/test_rust_accel.py +++ b/tests/test_rust_accel.py @@ -1,5 +1,7 @@ import pytest +from app.modules.indexer.spider import SiteSpider +from app.schemas.types import MediaType from app.utils import rust_accel @@ -25,3 +27,219 @@ def test_rust_filter_rule_parser_handles_parentheses_and_or(): result = rust_accel.parse_filter_rule("CNSUB & (4K | 1080P) & !BLU") assert result == [[["CNSUB", "and", ["4K", "or", "1080P"]], "and", ["not", "BLU"]]] + + +def test_rust_indexer_parser_handles_jinja_pyquery_filters_and_links(): + """ + Rust indexer 解析应覆盖普通站点配置的 Jinja、PyQuery selector 和过滤器。 + """ + html = """ + + + + + + + + + + + +
TV + + + + +
+ Default.Title + DL + IMDb + Main description removelink + FREE + +
+
1 hour ago1.5 GB1,2345/79
+ """ + indexer = { + "id": "unit", + "name": "Unit", + "domain": "https://example.com/", + "search": {"paths": [{"path": "torrents.php"}]}, + "category": { + "movie": [{"id": "401"}], + "tv": [{"id": "402"}], + }, + "torrents": { + "list": {"selector": 'table.torrents > tr:has("table.torrentname")'}, + "fields": { + "title_default": {"selector": 'a[href*="details.php?id="]'}, + "title_optional": { + "selector": 'a[title][href*="details.php?id="]', + "attribute": "title", + }, + "title": { + "text": "{% if fields['title_optional'] %}{{ fields['title_optional'] }}{% else %}" + "{{ fields['title_default'] }}{% endif %}" + }, + "details": {"selector": 'a[href*="details.php?id="]', "attribute": "href"}, + "download": {"selector": 'a[href*="download.php?id="]', "attribute": "href"}, + "imdbid": { + "selector": 'a[href*="imdb.com/title/tt"]', + "attribute": "href", + "filters": [{"name": "re_search", "args": ["tt\\d+", 0]}], + }, + "date_elapsed": {"selector": "td:nth-child(4) > span"}, + "date_added": {"selector": "td:nth-child(4) > span", "attribute": "title"}, + "date": { + "text": "{% if fields['date_elapsed'] or fields['date_added'] %}" + "{{ fields['date_added'] if fields['date_added'] else fields['date_elapsed'] }}" + "{% else %}now{% endif %}", + "filters": [{"name": "dateparse", "args": "%Y-%m-%d %H:%M:%S"}], + }, + "size": {"selector": "td:nth-child(5)"}, + "seeders": {"selector": "td:nth-child(6)"}, + "leechers": {"selector": "td:nth-child(7)"}, + "grabs": {"selector": "td:nth-child(8)"}, + "downloadvolumefactor": {"case": {"img.free": 0, "*": 1}}, + "uploadvolumefactor": {"case": {"*": 1}}, + "description": { + "selector": "font.subtitle", + "remove": "span,a", + }, + "labels": {"selector": "span.label"}, + "hr": {"selector": "img.hitandrun"}, + "category": { + "selector": 'a[href*="?cat="]', + "attribute": "href", + "filters": [{"name": "querystring", "args": "cat"}], + }, + }, + }, + } + + result = SiteSpider(indexer, mtype=MediaType.TV).parse(html) + + assert result == [{ + "page_url": "https://example.com/details.php?id=100", + "enclosure": "https://example.com/download.php?id=100", + "downloadvolumefactor": 1.0, + "uploadvolumefactor": 1.0, + "pubdate": "2025-05-01 12:13:14", + "title": "Optional.Title", + "description": "Main description", + "imdbid": "tt1234567", + "size": 1610612736, + "peers": 5, + "seeders": 1234, + "grabs": 9, + "date_elapsed": "1 hour ago", + "labels": ["FREE"], + "hit_and_run": True, + "category": "电视剧", + }] + + +def test_rust_indexer_parser_handles_default_values_and_template_arithmetic(): + """ + Rust indexer 解析应支持 defualt_value、Jinja int filter 和模板算术表达式。 + """ + html = """ + + + + +
Default.Title
+ """ + fields = { + "title_default": {"selector": 'a[href*="details.php?id="]'}, + "missing_days": {"defualt_value": "2", "selector": "span.missing"}, + "title": {"text": "{{ fields['title_default'] }} {{ (fields['missing_days']|int)*86400 }}"}, + } + + result = rust_accel.parse_indexer_torrents( + html_text=html, + domain="https://example.com/", + list_config={"selector": "table.torrents > tr"}, + fields=fields, + category=None, + result_num=100, + ) + + assert result == [{"title": "Default.Title 172800"}] + + +def test_rust_indexer_parser_handles_lstrip_and_english_elapsed_date(): + """ + Rust indexer 解析应覆盖 IPT 配置用到的 lstrip 和 date_en_elapsed_parse 过滤器。 + """ + html = """ + + + + + +
Titledownload
Uploaded | 2 hours ago
+ """ + fields = { + "title": {"selector": 'a[href*="/t/"]'}, + "download": { + "selector": 'a[href*="/download.php/"]', + "attribute": "href", + "filters": [{"name": "lstrip", "args": ["/"]}], + }, + "date": { + "selector": "td:nth-child(2) > div", + "filters": [ + {"name": "split", "args": ["|", 1]}, + {"name": "date_en_elapsed_parse"}, + ], + }, + } + + result = rust_accel.parse_indexer_torrents( + html_text=html, + domain="https://iptorrents.com/", + list_config={"selector": 'table[id="torrents"] tr'}, + fields=fields, + category=None, + result_num=100, + ) + + assert len(result) == 1 + assert result[0]["title"] == "Title" + assert result[0]["enclosure"] == "https://iptorrents.com/download.php/123" + assert result[0]["pubdate"] + + +def test_rust_indexer_parser_prefers_date_added_when_date_template_returns_elapsed_text(): + """ + Rust indexer 解析 date 模板产出相对时间时,应使用 date_added 里的标准时间。 + """ + html = """ + + + + +
1 hour ago
+ """ + fields = { + "date_elapsed": {"selector": "span"}, + "date_added": {"selector": "span", "attribute": "title"}, + "date": { + "text": "{% if fields['date_elapsed'] or fields['date_added'] %}" + "{{ fields['date_elapsed'] if fields['date_elapsed'] else fields['date_added'] }}" + "{% else %}now{% endif %}", + "filters": [{"name": "dateparse", "args": "%Y-%m-%d %H:%M:%S"}], + }, + } + + result = rust_accel.parse_indexer_torrents( + html_text=html, + domain="https://example.com/", + list_config={"selector": "table.torrents > tr"}, + fields=fields, + category=None, + result_num=100, + ) + + assert result[0]["pubdate"] == "2025-06-02 03:04:05"