From ccbcce057379978397867d1b5ae988a5633512dc Mon Sep 17 00:00:00 2001 From: jxxghp Date: Tue, 26 May 2026 12:14:36 +0800 Subject: [PATCH] chore: update 3 workspace files --- app/modules/indexer/__init__.py | 24 ++++++++-------- app/modules/indexer/spider/__init__.py | 28 ++++++++++++++++--- tests/test_indexer_spider_search_url.py | 37 +++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 15 deletions(-) diff --git a/app/modules/indexer/__init__.py b/app/modules/indexer/__init__.py index c181f70c..dcead0fb 100644 --- a/app/modules/indexer/__init__.py +++ b/app/modules/indexer/__init__.py @@ -21,6 +21,17 @@ from app.schemas.types import MediaType, ModuleType, OtherModulesType from app.utils.string import StringUtils +SPIDER_PARSER_CLASSES = { + "TNodeSpider": TNodeSpider, + "TorrentLeech": TorrentLeech, + "mTorrent": MTorrentSpider, + "Yema": YemaSpider, + "Haidan": HaiDanSpider, + "HDDolby": HddolbySpider, + "RousiPro": RousiSpider, +} + + class IndexerModule(_ModuleBase): """ 索引模块 @@ -156,17 +167,8 @@ class IndexerModule(_ModuleBase): """ site = site or {} parser = site.get("parser") - parser_classes = { - "TNodeSpider": TNodeSpider, - "TorrentLeech": TorrentLeech, - "mTorrent": MTorrentSpider, - "Yema": YemaSpider, - "Haidan": HaiDanSpider, - "HDDolby": HddolbySpider, - "RousiPro": RousiSpider, - } - if parser in parser_classes: - return parser_classes[parser].get_search_page_size(keyword=keyword) + if parser in SPIDER_PARSER_CLASSES: + return SPIDER_PARSER_CLASSES[parser].get_search_page_size(keyword=keyword) try: page_size = int(site.get("result_num") or SiteSpider.default_result_num()) except (TypeError, ValueError): diff --git a/app/modules/indexer/spider/__init__.py b/app/modules/indexer/spider/__init__.py index 7a0e04e9..6de3053e 100644 --- a/app/modules/indexer/spider/__init__.py +++ b/app/modules/indexer/spider/__init__.py @@ -69,6 +69,7 @@ class SiteSpider: if not keyword and self.browse: self.list = self.browse.get('list') or self.list 
self.fields = self.browse.get('fields') or self.fields + self._field_templates = self.__build_field_templates() self.domain = indexer.get('domain') self.result_num = int(indexer.get('result_num') or self.default_result_num()) self._timeout = int(indexer.get('timeout') or 15) @@ -85,6 +86,19 @@ class SiteSpider: self.torrents_info = {} self.torrents_info_array = [] + def __build_field_templates(self) -> dict: + """ + 预编译字段模板,避免按每条种子重复构造 Jinja Template。 + """ + templates = {} + for name in ("title", "description"): + selector = (self.fields or {}).get(name, {}) + template_text = selector.get("text") if isinstance(selector, dict) else None + if not template_text: + continue + templates[name] = Template(template_text) + return templates + @classmethod def default_result_num(cls) -> int: """ @@ -304,7 +318,8 @@ class SiteSpider: title_optional_selector = self.fields.get('title_optional', {}) title_optional = self._safe_query(torrent, title_optional_selector) render_dict.update({'title_optional': title_optional}) - self.torrents_info['title'] = Template(selector.get('text')).render(fields=render_dict) + template = self._field_templates.get("title") or Template(selector.get("text")) + self.torrents_info['title'] = template.render(fields=render_dict) self.torrents_info['title'] = self.__filter_text(self.torrents_info.get('title'), selector.get('filters')) @@ -337,7 +352,8 @@ class SiteSpider: description_normal_selector = self.fields.get("description_normal", {}) description_normal = self._safe_query(torrent, description_normal_selector) render_dict.update({"description_normal": description_normal}) - self.torrents_info['description'] = Template(selector.get('text')).render(fields=render_dict) + template = self._field_templates.get("description") or Template(selector.get("text")) + self.torrents_info['description'] = template.render(fields=render_dict) self.torrents_info['description'] = self.__filter_text(self.torrents_info.get('description'), selector.get('filters')) @@ 
-594,13 +610,17 @@ class SiteSpider: if not selector_config or not selector_config.get('selector'): return None - query_obj = torrent(selector_config.get('selector', '')).clone() + should_clone = bool(selector_config.get("remove")) + query_obj = torrent(selector_config.get('selector', '')) + if should_clone: + query_obj = query_obj.clone() try: self.__remove(query_obj, selector_config) items = self.__attribute_or_text(query_obj, selector_config) return self.__index(items, selector_config) finally: - query_obj.clear() + if should_clone: + query_obj.clear() del query_obj def get_info(self, torrent: Any) -> dict: diff --git a/tests/test_indexer_spider_search_url.py b/tests/test_indexer_spider_search_url.py index c1edcadb..a67daebe 100644 --- a/tests/test_indexer_spider_search_url.py +++ b/tests/test_indexer_spider_search_url.py @@ -1,4 +1,5 @@ from urllib.parse import parse_qs, urlparse +from unittest.mock import patch from app.modules.indexer.spider import SiteSpider from app.modules.indexer.spider.haidan import HaiDanSpider @@ -174,3 +175,39 @@ def test_haidan_empty_keyword_uses_blank_search_value(): assert params["search"] == "" assert params["search_area"] == "0" + + +def test_python_spider_remove_does_not_pollute_other_fields(): + """ + Python fallback 解析带 remove 的字段时不能影响同一行后续字段选择。 + """ + indexer = _build_indexer( + torrents={ + "list": {"selector": "table.torrents > tr"}, + "fields": { + "title": {"selector": "a.title"}, + "description": { + "selector": "td.desc", + "remove": "span.noise", + }, + "imdbid": {"selector": "span.noise"}, + }, + }, + ) + html = """ + + + + + +
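<table class="torrents"><tr><td><a class="title">Movie.Title</a></td><td class="desc">Main description <span class="noise">tt1234567</span></td></tr></table>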
+ """ + + with patch("app.modules.indexer.spider.rust_accel.parse_indexer_torrents", return_value=None): + result = SiteSpider(indexer).parse(html) + + assert result == [{ + "title": "Movie.Title", + "description": "Main description", + "imdbid": "tt1234567", + }]