From 24dc53b62da403791dcc03fca13067a488aea4db Mon Sep 17 00:00:00 2001 From: jxxghp Date: Fri, 12 Jun 2026 10:11:52 +0800 Subject: [PATCH] fix: handle NexusPHP occurrence pubdate parsing --- app/modules/indexer/spider/__init__.py | 96 ++++++++++++++++- tests/test_indexer_spider_search_url.py | 109 +++++++++++++++++++ tests/test_rust_accel.py | 132 ++++++++++++++++++++++++ 3 files changed, 333 insertions(+), 4 deletions(-) diff --git a/app/modules/indexer/spider/__init__.py b/app/modules/indexer/spider/__init__.py index 3aa16384..06eafae4 100644 --- a/app/modules/indexer/spider/__init__.py +++ b/app/modules/indexer/spider/__init__.py @@ -104,7 +104,7 @@ class SiteSpider: 预编译字段模板,避免按每条种子重复构造 Jinja Template。 """ templates = {} - for name in ("title", "description"): + for name in ("title", "description", "date"): selector = (self.fields or {}).get(name, {}) template_text = selector.get("text") if isinstance(selector, dict) else None if not template_text: @@ -502,19 +502,107 @@ class SiteSpider: def __get_pubdate(self, torrent: Any): # torrent pubdate yyyy-mm-dd hh:mm:ss - if 'date_added' not in self.fields: + if 'date_added' not in self.fields and 'date' not in self.fields: return selector = self.fields.get('date_added', {}) pubdate_str = self._safe_query(torrent, selector) + if not pubdate_str: + selector = self.fields.get('date', {}) + pubdate_str = self.__get_date(torrent, selector) if pubdate_str: pubdate_str = pubdate_str.replace('\n', ' ').strip() self.torrents_info['pubdate'] = self.__filter_text(pubdate_str, selector.get('filters')) if self.torrents_info.get('pubdate'): try: - if not isinstance(self.torrents_info['pubdate'], datetime.datetime): + if isinstance(self.torrents_info['pubdate'], datetime.datetime): + self.torrents_info['pubdate'] = self.torrents_info['pubdate'].strftime('%Y-%m-%d %H:%M:%S') + else: datetime.datetime.strptime(str(self.torrents_info['pubdate']), '%Y-%m-%d %H:%M:%S') except (ValueError, TypeError): self.torrents_info['pubdate'] = StringUtils.unify_datetime_str(str(self.torrents_info['pubdate'])) + if self.__is_invalid_pubdate_text(self.torrents_info.get('pubdate')): + self.torrents_info.pop('pubdate', None) + + def __get_date(self, torrent: Any, selector: dict) -> Optional[str]: + """ + 从 date 模板解析发布时间。 + """ + if not selector: + return None + if "selector" in selector: + return self._safe_query(torrent, selector) + template_text = selector.get("text") + if not template_text: + return None + + render_dict = {} + for field_name in ("date_elapsed", "date_added"): + field_selector = self.fields.get(field_name, {}) + field_value = self._safe_query(torrent, field_selector) + if not field_value: + field_value = self.__get_date_from_cell(torrent, field_selector) + render_dict[field_name] = field_value + if not any(render_dict.values()): + return None + + template = self._field_templates.get("date") or Template(template_text) + pubdate_str = template.render(fields=render_dict) + if pubdate_str == "now" or self.__is_relative_pubdate_text(pubdate_str): + return None + return pubdate_str + + def __get_date_from_cell(self, torrent: Any, selector: dict) -> Optional[str]: + """ + 兼容 NexusPHP 发生时间模式下不再渲染 span 的时间单元格。 + """ + cell_selector = self.__date_cell_selector(selector.get("selector")) + if not cell_selector: + return None + return self._safe_query(torrent, {"selector": cell_selector}) + + @staticmethod + def __date_cell_selector(selector: Optional[str]) -> Optional[str]: + """ + 从时间字段选择器推导父级 td 选择器。 + """ + if not selector: + return None + selector = selector.strip() + if not selector or "> span" not in selector: + return None + return selector.split("> span", 1)[0].strip() + + @staticmethod + def __is_relative_pubdate_text(pubdate: Optional[str]) -> bool: + """ + 判断是否为相对时间,避免写入不可排序的发布时间。 + """ + if not pubdate: + return False + text = str(pubdate).strip().lower() + if re.search(r"\d{4}[-/年]\d{1,2}", text): + return False + if "ago" in text: + return True + return bool(re.search(r"\d+\s*(秒|分钟|分|小时|天|周|月|年)", text)) + + @classmethod + def __is_invalid_pubdate_text(cls, pubdate: Optional[str]) -> bool: + """ + 判断是否为不可用发布时间,避免列错位文本污染 pubdate。 + """ + if not pubdate: + return True + text = str(pubdate).strip() + if text.lower() == "now" or text == "0": + return True + if cls.__is_relative_pubdate_text(text): + return True + try: + datetime.datetime.strptime(text, '%Y-%m-%d %H:%M:%S') + return False + except (ValueError, TypeError): + return True def __get_date_elapsed(self, torrent: Any): # torrent date elapsed text @@ -822,7 +910,7 @@ class SiteSpider: text = param_value[0] if param_value else '' except Exception as err: logger.debug(f'过滤器 {method_name} 处理失败:{str(err)} - {traceback.format_exc()}') - return text.strip() + return text.strip() if isinstance(text, str) else text @staticmethod def __remove(item: Any, selector: Optional[dict]): diff --git a/tests/test_indexer_spider_search_url.py b/tests/test_indexer_spider_search_url.py index 1907b59b..d5837707 100644 --- a/tests/test_indexer_spider_search_url.py +++ b/tests/test_indexer_spider_search_url.py @@ -213,6 +213,115 @@ def test_python_spider_remove_does_not_pollute_other_fields(): }] +def test_python_spider_parses_nexus_php_occurrence_time_cell(): + """ + Python 兜底解析应兼容 NexusPHP 发生时间模式下没有 span 的时间单元格。 + """ + indexer = _build_indexer( + torrents={ + "list": {"selector": 'table.torrents > tr:has("table.torrentname")'}, + "fields": { + "title": {"selector": 'a[href*="details.php?id="]'}, + "date_elapsed": {"selector": "td:nth-child(4) > span", "optional": True}, + "date_added": { + "selector": "td:nth-child(4) > span", + "attribute": "title", + "optional": True, + }, + "date": { + "text": "{% if fields['date_elapsed'] or fields['date_added'] %}" + "{{ fields['date_elapsed'] if fields['date_elapsed'] else fields['date_added'] }}" + "{% else %}now{% endif %}", + "filters": [{"name": "dateparse", "args": "%Y-%m-%d %H:%M:%S"}], + }, + }, + }, + ) + html = """ + + + + + + + +
Movie.Title
2025-05-01
12:13:14
+ """ + + with patch("app.modules.indexer.spider.rust_accel.parse_indexer_torrents", return_value=None): + result = SiteSpider(indexer).parse(html) + + assert result[0]["pubdate"] == "2025-05-01 12:13:14" + + +def test_python_spider_does_not_use_relative_date_as_pubdate(): + """ + Python 兜底解析不能把相对时间写入 pubdate。 + """ + indexer = _build_indexer( + torrents={ + "list": {"selector": "table.torrents > tr"}, + "fields": { + "title": {"selector": "a.title"}, + "date_elapsed": {"selector": "span.elapsed"}, + "date": { + "text": "{% if fields['date_elapsed'] or fields['date_added'] %}" + "{{ fields['date_elapsed'] if fields['date_elapsed'] else fields['date_added'] }}" + "{% else %}now{% endif %}", + "filters": [{"name": "dateparse", "args": "%Y-%m-%d %H:%M:%S"}], + }, + }, + }, + ) + html = """ + + +
Movie.Title1小时
+ """ + + with patch("app.modules.indexer.spider.rust_accel.parse_indexer_torrents", return_value=None): + result = SiteSpider(indexer).parse(html) + + assert "pubdate" not in result[0] or result[0]["pubdate"] is None + + +def test_python_spider_does_not_use_invalid_date_as_pubdate(): + """ + Python 兜底解析不能把列错位的无效日期写入 pubdate。 + """ + indexer = _build_indexer( + torrents={ + "list": {"selector": "table.torrents > tr"}, + "fields": { + "title": {"selector": "a.title"}, + "date_added": {"selector": "td:nth-child(4) > span", "attribute": "title"}, + "date_elapsed": {"selector": "td:nth-child(4) > span"}, + "date": { + "text": "{% if fields['date_elapsed'] or fields['date_added'] %}" + "{{ fields['date_elapsed'] if fields['date_elapsed'] else fields['date_added'] }}" + "{% else %}now{% endif %}", + "filters": [{"name": "dateparse", "args": "%Y-%m-%d %H:%M:%S"}], + }, + }, + }, + ) + html = """ + + + + + + + +
Movie.Title0
+ """ + + with patch("app.modules.indexer.spider.rust_accel.parse_indexer_torrents", return_value=None): + result = SiteSpider(indexer).parse(html) + + assert "pubdate" not in result[0] or result[0]["pubdate"] is None + + def test_nexus_php_subtitle_table_parse_extracts_common_fields(): """ NexusPHP 字幕表格应解析出下载链接、语言、标题、时间、大小、点击、上传者等字段。 diff --git a/tests/test_rust_accel.py b/tests/test_rust_accel.py index 654b53ef..d1e4c804 100644 --- a/tests/test_rust_accel.py +++ b/tests/test_rust_accel.py @@ -1,6 +1,7 @@ import os import time from datetime import datetime, timezone +from importlib.metadata import PackageNotFoundError, version from types import SimpleNamespace import pytest @@ -24,6 +25,22 @@ pytestmark = pytest.mark.skipif( ) +def _require_rust_package_version(min_version: str) -> None: + """ + 跳过依赖尚未发布到 PyPI 的 Rust 扩展新行为用例。 + """ + try: + installed_version = version("moviepilot-rust") + except PackageNotFoundError: + pytest.skip("moviepilot-rust 包版本不可用") + installed_parts = tuple( + int(part) for part in installed_version.split("+", 1)[0].split(".")[:3] + ) + required_parts = tuple(int(part) for part in min_version.split(".")[:3]) + if installed_parts < required_parts: + pytest.skip(f"需要 moviepilot-rust>={min_version},当前为 {installed_version}") + + def test_rust_filter_rule_parser_matches_boolean_semantics(): """ Rust 过滤规则解析应保持 pyparsing 的布尔表达式结构。 @@ -640,3 +657,118 @@ def test_rust_indexer_parser_prefers_date_added_when_date_template_returns_elaps ) assert result[0]["pubdate"] == "2025-06-02 03:04:05" + + +def test_rust_indexer_parser_reads_nexus_php_occurrence_time_cell(): + """ + Rust indexer 解析应兼容 NexusPHP 发生时间模式下没有 span 的时间单元格。 + """ + _require_rust_package_version("0.1.9") + html = """ + + + + + + + +
Movie.Title
2025-05-01
12:13:14
+ """ + fields = { + "title": {"selector": 'a[href*="details.php?id="]'}, + "date_elapsed": {"selector": "td:nth-child(4) > span", "optional": True}, + "date_added": { + "selector": "td:nth-child(4) > span", + "attribute": "title", + "optional": True, + }, + "date": { + "text": "{% if fields['date_elapsed'] or fields['date_added'] %}" + "{{ fields['date_elapsed'] if fields['date_elapsed'] else fields['date_added'] }}" + "{% else %}now{% endif %}", + "filters": [{"name": "dateparse", "args": "%Y-%m-%d %H:%M:%S"}], + }, + } + + result = rust_accel.parse_indexer_torrents( + html_text=html, + domain="https://example.com/", + list_config={"selector": 'table.torrents > tr:has("table.torrentname")'}, + fields=fields, + category=None, + result_num=100, + ) + + assert result[0]["pubdate"] == "2025-05-01 12:13:14" + + +def test_rust_indexer_parser_does_not_use_relative_date_as_pubdate(): + """ + Rust indexer 解析不能把相对时间写入 pubdate。 + """ + _require_rust_package_version("0.1.9") + html = """ + + +
Movie.Title1小时
+ """ + fields = { + "title": {"selector": "a"}, + "date_elapsed": {"selector": "span.elapsed"}, + "date": { + "text": "{% if fields['date_elapsed'] or fields['date_added'] %}" + "{{ fields['date_elapsed'] if fields['date_elapsed'] else fields['date_added'] }}" + "{% else %}now{% endif %}", + "filters": [{"name": "dateparse", "args": "%Y-%m-%d %H:%M:%S"}], + }, + } + + result = rust_accel.parse_indexer_torrents( + html_text=html, + domain="https://example.com/", + list_config={"selector": "table.torrents > tr"}, + fields=fields, + category=None, + result_num=100, + ) + + assert "pubdate" not in result[0] + + +def test_rust_indexer_parser_does_not_use_invalid_date_as_pubdate(): + """ + Rust indexer 解析不能把列错位的无效日期写入 pubdate。 + """ + _require_rust_package_version("0.1.9") + html = """ + + + + + + + +
Movie.Title0
+ """ + fields = { + "title": {"selector": "a"}, + "date_added": {"selector": "td:nth-child(4) > span", "attribute": "title"}, + "date_elapsed": {"selector": "td:nth-child(4) > span"}, + "date": { + "text": "{% if fields['date_elapsed'] or fields['date_added'] %}" + "{{ fields['date_elapsed'] if fields['date_elapsed'] else fields['date_added'] }}" + "{% else %}now{% endif %}", + "filters": [{"name": "dateparse", "args": "%Y-%m-%d %H:%M:%S"}], + }, + } + + result = rust_accel.parse_indexer_torrents( + html_text=html, + domain="https://example.com/", + list_config={"selector": "table.torrents > tr"}, + fields=fields, + category=None, + result_num=100, + ) + + assert "pubdate" not in result[0]