perf: optimize torrent filtering

2026-06-01 13:40:54 +08:00 · 2026-05-15 16:55:42 +08:00
parent 2831eecbeb
commit 51229204c9
3 changed files with 243 additions and 46 deletions
--- a/app/helper/torrent.py
+++ b/app/helper/torrent.py
@@ -1,5 +1,6 @@
 import datetime
 import re
+from functools import lru_cache
 from pathlib import Path
 from typing import Tuple, Optional, List, Union, Dict, Any
 from urllib.parse import unquote
@@ -19,6 +20,40 @@ from app.utils.http import RequestUtils
 from app.utils.string import StringUtils


+_SIZE_UNIT = 1024 * 1024  # scale factor applied to size-rule numbers (rules are documented as MB)
+
+
+@lru_cache(maxsize=512)
+def _compile_filter_pattern(pattern: str) -> re.Pattern:
+    """
+    Compile an extra filter regex for subscriptions/workflows, case-insensitively (re.I).
+    User input keeps its original regex semantics; the cache only saves recompiling one rule for many torrents.
+    """
+    return re.compile(r"%s" % pattern, re.I)
+
+
+def _filter_pattern_search(pattern: Union[str, int, float], content: str) -> bool:
+    """
+    Search content with the filter regex, stringifying pattern as the original "%s" interpolation did.
+    """
+    return bool(_compile_filter_pattern(str(pattern)).search(content))
+
+
+@lru_cache(maxsize=256)
+def _parse_filter_size_range(size_range: str) -> Tuple[str, float, Optional[float]]:
+    """
+    Parse an extra-filter size range (unit: MB) into (rule, min, max), with bounds scaled by _SIZE_UNIT.
+    """
+    if size_range.find("-") != -1:
+        size_min, size_max = size_range.split("-")  # ValueError unless exactly one "-"
+        return "between", float(size_min.strip()) * _SIZE_UNIT, float(size_max.strip()) * _SIZE_UNIT
+    if size_range.startswith(">"):
+        return "gte", float(size_range[1:].strip()) * _SIZE_UNIT, None
+    if size_range.startswith("<"):
+        return "lte", 0, float(size_range[1:].strip()) * _SIZE_UNIT
+    return "unknown", 0, None  # unrecognised form: none of the three rule names applies
+
+
 class TorrentHelper:
    """
    种子帮助类
@@ -460,52 +495,48 @@ class TorrentHelper:
        # 包含
        include = filter_params.get("include")
        if include:
-            if not re.search(r"%s" % include, content, re.I):
+            if not _filter_pattern_search(include, content):
                logger.info(f"{content} 不匹配包含规则 {include}")
                return False
        # 排除
        exclude = filter_params.get("exclude")
        if exclude:
-            if re.search(r"%s" % exclude, content, re.I):
+            if _filter_pattern_search(exclude, content):
                logger.info(f"{content} 匹配排除规则 {exclude}")
                return False
        # 质量
        quality = filter_params.get("quality")
        if quality:
-            if not re.search(r"%s" % quality, torrent_info.title, re.I):
+            if not _filter_pattern_search(quality, torrent_info.title):
                logger.info(f"{torrent_info.title} 不匹配质量规则 {quality}")
                return False
        # 分辨率
        resolution = filter_params.get("resolution")
        if resolution:
-            if not re.search(r"%s" % resolution, torrent_info.title, re.I):
+            if not _filter_pattern_search(resolution, torrent_info.title):
                logger.info(f"{torrent_info.title} 不匹配分辨率规则 {resolution}")
                return False
        # 特效
        effect = filter_params.get("effect")
        if effect:
-            if not re.search(r"%s" % effect, torrent_info.title, re.I):
+            if not _filter_pattern_search(effect, torrent_info.title):
                logger.info(f"{torrent_info.title} 不匹配特效规则 {effect}")
                return False

        # 大小
        size_range = filter_params.get("size")
        if size_range:
-            if size_range.find("-") != -1:
+            size_rule, size_min, size_max = _parse_filter_size_range(size_range)
+            if size_rule == "between":
                # 区间
-                size_min, size_max = size_range.split("-")
-                size_min = float(size_min.strip()) * 1024 * 1024
-                size_max = float(size_max.strip()) * 1024 * 1024
                if torrent_info.size < size_min or torrent_info.size > size_max:
                    return False
-            elif size_range.startswith(">"):
+            elif size_rule == "gte":
                # 大于
-                size_min = float(size_range[1:].strip()) * 1024 * 1024
                if torrent_info.size < size_min:
                    return False
-            elif size_range.startswith("<"):
+            elif size_rule == "lte":
                # 小于
-                size_max = float(size_range[1:].strip()) * 1024 * 1024
                if torrent_info.size > size_max:
                    return False

@@ -521,6 +552,7 @@ class TorrentHelper:
        """
        # 匹配季
        seasons = season_episodes.keys()
+        seasons_set = set(seasons)
        # 种子季
        torrent_seasons = meta.season_list
        if not torrent_seasons:
@@ -528,7 +560,7 @@ class TorrentHelper:
            torrent_seasons = [1]
        # 种子集
        torrent_episodes = meta.episode_list
-        if not set(torrent_seasons).issubset(set(seasons)):
+        if not set(torrent_seasons).issubset(seasons_set):
            # 种子季不在过滤季中
            logger.debug(
                f"种子 {torrent.site_name} - {torrent.title} 包含季 {torrent_seasons} 不是需要的季 {list(seasons)}")
@@ -539,7 +571,7 @@ class TorrentHelper:
        if len(torrent_seasons) == 1:
            need_episodes = season_episodes.get(torrent_seasons[0])
            if need_episodes \
-                    and not set(torrent_episodes).intersection(set(need_episodes)):
+                    and not set(torrent_episodes).intersection(need_episodes):
                # 单季集没有交集的不要
                logger.debug(f"种子 {torrent.site_name} - {torrent.title} "
                             f"集 {torrent_episodes} 没有需要的集：{need_episodes}")
--- a/app/modules/filter/init.py
+++ b/app/modules/filter/init.py
@@ -1,11 +1,12 @@
+import re
 from copy import deepcopy
+from functools import lru_cache
 from typing import List, Tuple, Union, Dict, Optional

 from app.core.context import TorrentInfo, MediaInfo
+from app.core.metainfo import MetaInfo
 from app.helper.rule import RuleHelper
 from app.log import logger
-import re
-from app.core.metainfo import MetaInfo
 from app.modules import _ModuleBase
 from app.modules.filter.RuleParser import RuleParser
 from app.modules.filter.builtin_rules import BUILTIN_RULE_SET
@@ -13,6 +14,51 @@ from app.schemas.types import ModuleType, OtherModulesType, SystemConfigKey
 from app.utils.string import StringUtils


+_SIZE_UNIT = 1024 * 1024  # scale factor applied to size-rule numbers (rules are documented as MB)
+
+
+@lru_cache(maxsize=1024)
+def _compile_ignorecase(pattern: str) -> re.Pattern:
+    """
+    Compile a filter-rule regex.
+    Filter rules are matched against many torrents during search/subscription, so caching the compiled result trims hot-path cost;
+    the original IGNORECASE semantics are kept, and an invalid regex still raises at match time as before.
+    """
+    return re.compile(r"%s" % pattern, re.IGNORECASE)
+
+
+def _regex_search(pattern: Union[str, int, float], content: str) -> bool:
+    """
+    Run the regex search with the original string-interpolation semantics while reusing the compiled pattern.
+    """
+    return bool(_compile_ignorecase(str(pattern)).search(content))
+
+
+@lru_cache(maxsize=256)
+def _parse_size_range(size_range: str) -> Tuple[str, float, Optional[float]]:
+    """
+    Parse a size range (unit: MB) into (operator, min, max), with bounds scaled by _SIZE_UNIT.
+    The returned operator is for use inside this module only; caching avoids re-splitting the same rule for every torrent.
+    """
+    size_range = size_range.strip()
+    if size_range.find("-") != -1:
+        size_min, size_max = size_range.split("-")  # ValueError unless exactly one "-"
+        return "between", float(size_min.strip()) * _SIZE_UNIT, float(size_max.strip()) * _SIZE_UNIT
+    if size_range.startswith(">"):
+        return "gte", float(size_range[1:].strip()) * _SIZE_UNIT, None
+    if size_range.startswith("<"):
+        return "lte", 0, float(size_range[1:].strip()) * _SIZE_UNIT
+    return "unknown", 0, None  # unrecognised form: none of the three operators applies
+
+
+@lru_cache(maxsize=256)
+def _parse_publish_time(publish_time: str) -> Tuple[float, ...]:
+    """
+    Parse a publish-time rule ("-"-separated numbers) so one rule is not re-converted to float for every torrent.
+    """
+    return tuple(float(t) for t in publish_time.split("-"))
+
+
 class FilterModule(_ModuleBase):
    CONFIG_WATCH = {SystemConfigKey.CustomFilterRules.value}

@@ -86,6 +132,9 @@ class FilterModule(_ModuleBase):
        if not rule_groups:
            return torrent_list
        parser = RuleParser()
+        # 同一轮过滤里，相同的优先级层级会被多个种子反复使用；按需解析并缓存，
+        # 既减少 pyparsing 开销，也保留原来“命中高优先级后不解析低层级”的容错行为。
+        parsed_rule_cache: Dict[str, Union[list, str]] = {}
        # 查询规则表详情
        groups = self.rulehelper.get_rule_group_by_media(media=mediainfo, group_names=rule_groups)
        if groups:
@@ -97,21 +146,27 @@ class FilterModule(_ModuleBase):
                    torrent_list=torrent_list,
                    mediainfo=mediainfo,
                    parser=parser,
+                    parsed_rule_cache=parsed_rule_cache,
                )
        return torrent_list

    def __filter_torrents(self, rule_string: str, rule_name: str,
                          torrent_list: List[TorrentInfo],
                          mediainfo: MediaInfo,
-                          parser: RuleParser) -> List[TorrentInfo]:
+                          parser: RuleParser,
+                          parsed_rule_cache: Dict[str, Union[list, str]]) -> List[TorrentInfo]:
        """
        过滤种子
        """
+        if not torrent_list:
+            return []
+        # 只拆分一次规则层级；具体层级仍延迟到真正需要匹配时解析。
+        rule_groups = [rule_group.strip() for rule_group in rule_string.split('>')]
        # 返回种子列表
        ret_torrents = []
        for torrent in torrent_list:
            # 能命中优先级的才返回
-            if not self.__get_order(torrent, rule_string, mediainfo, parser):
+            if not self.__get_order(torrent, rule_groups, mediainfo, parser, parsed_rule_cache):
                logger.debug(f"种子 {torrent.site_name} - {torrent.title} {torrent.description or ''} "
                             f"不匹配 {rule_name} 过滤规则")
                continue
@@ -119,13 +174,12 @@ class FilterModule(_ModuleBase):

        return ret_torrents

-    def __get_order(self, torrent: TorrentInfo, rule_str: str,
-                    mediainfo: MediaInfo, parser: RuleParser) -> Optional[TorrentInfo]:
+    def __get_order(self, torrent: TorrentInfo, rule_groups: List[str],
+                    mediainfo: MediaInfo, parser: RuleParser,
+                    parsed_rule_cache: Dict[str, Union[list, str]]) -> Optional[TorrentInfo]:
        """
        获取种子匹配的规则优先级，值越大越优先，未匹配时返回None
        """
-        # 多级规则
-        rule_groups = rule_str.split('>')
        # 优先级
        res_order = 100
        # 是否匹配
@@ -133,8 +187,8 @@ class FilterModule(_ModuleBase):

        for rule_group in rule_groups:
            # 解析规则组
-            parsed_group = parser.parse(rule_group.strip())
-            if self.__match_group(torrent, parsed_group.as_list()[0], mediainfo):
+            parsed_group = self.__parse_rule_group(rule_group, parser, parsed_rule_cache)
+            if self.__match_group(torrent, parsed_group, mediainfo):
                # 出现匹配时中断
                matched = True
                logger.debug(f"种子 {torrent.site_name} - {torrent.title} 优先级为 {100 - res_order + 1}")
@@ -145,6 +199,17 @@ class FilterModule(_ModuleBase):

        return None if not matched else torrent

+    @staticmethod
+    def __parse_rule_group(rule_group: str, parser: RuleParser,
+                           parsed_rule_cache: Dict[str, Union[list, str]]) -> Union[list, str]:
+        """
+        Parse a single priority level.
+        The cache is keyed by the level expression, so rule groups that share an expression reuse one parse.
+        """
+        if rule_group not in parsed_rule_cache:  # parse lazily, only the first time this level is needed
+            parsed_rule_cache[rule_group] = parser.parse(rule_group).as_list()[0]
+        return parsed_rule_cache[rule_group]
+
    def __match_group(self, torrent: TorrentInfo, rule_group: Union[list, str],
                      mediainfo: MediaInfo) -> Optional[bool]:
        """
@@ -173,12 +238,13 @@ class FilterModule(_ModuleBase):
        """
        判断种子是否匹配规则项
        """
-        if not self.rule_set.get(rule_name):
+        rule = self.rule_set.get(rule_name)
+        if not rule:
            # 规则不存在
            logger.debug(f"规则 {rule_name} 不存在")
            return False
        # TMDB规则
-        tmdb = self.rule_set[rule_name].get("tmdb")
+        tmdb = rule.get("tmdb")
        # 符合TMDB规则的直接返回True，即不过滤
        if tmdb and self.__match_tmdb(tmdb, mediainfo):
            logger.debug(f"种子 {torrent.site_name} - {torrent.title} 符合 {rule_name} 的TMDB规则，匹配成功")
@@ -187,7 +253,7 @@ class FilterModule(_ModuleBase):
        content = f"{torrent.title} {torrent.description} {' '.join(torrent.labels or [])}"
        # 只匹配指定关键字
        match_content = []
-        matchs = self.rule_set[rule_name].get("match") or []
+        matchs = rule.get("match") or []
        if matchs:
            for match in matchs:
                if not hasattr(torrent, match):
@@ -202,27 +268,27 @@ class FilterModule(_ModuleBase):
        if match_content:
            content = " ".join(match_content)
        # 包含规则项
-        includes = self.rule_set[rule_name].get("include") or []
+        includes = rule.get("include") or []
        if not isinstance(includes, list):
            includes = [includes]
        # 排除规则项
-        excludes = self.rule_set[rule_name].get("exclude") or []
+        excludes = rule.get("exclude") or []
        if not isinstance(excludes, list):
            excludes = [excludes]
        # 大小范围规则项
-        size_range = self.rule_set[rule_name].get("size_range")
+        size_range = rule.get("size_range")
        # 做种人数规则项
-        seeders = self.rule_set[rule_name].get("seeders")
+        seeders = rule.get("seeders")
        # FREE规则
-        downloadvolumefactor = self.rule_set[rule_name].get("downloadvolumefactor")
+        downloadvolumefactor = rule.get("downloadvolumefactor")
        # 发布时间规则
-        pubdate: str = self.rule_set[rule_name].get("publish_time")
-        if includes and not any(re.search(r"%s" % include, content, re.IGNORECASE) for include in includes):
+        pubdate: str = rule.get("publish_time")
+        if includes and not any(_regex_search(include, content) for include in includes):
            # 未发现任何包含项
            logger.debug(f"种子 {torrent.site_name} - {torrent.title} 不包含任何项 {includes}")
            return False
        for exclude in excludes:
-            if re.search(r"%s" % exclude, content, re.IGNORECASE):
+            if _regex_search(exclude, content):
                # 发现排除项
                logger.debug(f"种子 {torrent.site_name} - {torrent.title} 包含 {exclude}")
                return False
@@ -247,7 +313,7 @@ class FilterModule(_ModuleBase):
            # 种子发布时间
            pub_minutes = torrent.pub_minutes()
            # 发布时间规则
-            pub_times = [float(t) for t in pubdate.split("-")]
+            pub_times = _parse_publish_time(pubdate)
            if len(pub_times) == 1:
                # 发布时间小于规则
                if pub_minutes < pub_times[0]:
@@ -319,22 +385,17 @@ class FilterModule(_ModuleBase):
        # 每集大小
        torrent_size = torrent.size / episode_count
        # 大小范围
-        size_range = size_range.strip()
-        if size_range.find("-") != -1:
+        size_rule, size_min, size_max = _parse_size_range(size_range)
+        if size_rule == "between":
            # 区间
-            size_min, size_max = size_range.split("-")
-            size_min = float(size_min.strip()) * 1024 * 1024
-            size_max = float(size_max.strip()) * 1024 * 1024
            if size_min <= torrent_size <= size_max:
                return True
-        elif size_range.startswith(">"):
+        elif size_rule == "gte":
            # 大于
-            size_min = float(size_range[1:].strip()) * 1024 * 1024
            if torrent_size >= size_min:
                return True
-        elif size_range.startswith("<"):
+        elif size_rule == "lte":
            # 小于
-            size_max = float(size_range[1:].strip()) * 1024 * 1024
            if torrent_size <= size_max:
                return True
        return False
--- a/tests/test_torrent_filter.py
+++ b/tests/test_torrent_filter.py
@@ -0,0 +1,104 @@
+import unittest
+from types import SimpleNamespace
+
+from app.core.context import TorrentInfo
+from app.helper.torrent import TorrentHelper
+from app.modules.filter import FilterModule
+
+
+class _RuleHelper:
+    """
+    Lightweight rule repository for the filter-module tests, avoiding any dependency on real system config.
+    """
+
+    def __init__(self, groups):
+        self._groups = groups
+
+    def get_rule_group_by_media(self, media=None, group_names=None):  # noqa: ARG002
+        if not group_names:  # no names requested: hand back every group
+            return self._groups
+        return [group for group in self._groups if group.name in group_names]
+
+
+def _build_filter_module(rule_string: str, rule_set: dict) -> FilterModule:
+    module = FilterModule()  # test fixture: the rule source and rule set are replaced below
+    module.rulehelper = _RuleHelper(
+        [SimpleNamespace(name="test", rule_string=rule_string)]
+    )
+    module.rule_set = rule_set  # rule definitions looked up by name while matching
+    return module
+
+
+class TorrentFilterTest(unittest.TestCase):
+
+    def test_filter_torrents_keeps_priority_and_boolean_rule_semantics(self):
+        module = _build_filter_module(
+            rule_string="HDR & !BLU > DV",  # two priority levels split on ">": "HDR & !BLU", then "DV"
+            rule_set={
+                "HDR": {"include": "HDR"},
+                "DV": {"include": "DOVI"},
+                "BLU": {"include": "BluRay"},
+            },
+        )
+        torrents = [
+            TorrentInfo(title="Movie HDR WEB-DL", description=""),
+            TorrentInfo(title="Movie DOVI", description=""),
+            TorrentInfo(title="Movie HDR BluRay", description=""),  # HDR but BluRay, and no DOVI: expected to be dropped
+        ]
+
+        filtered = module.filter_torrents(rule_groups=["test"], torrent_list=torrents)
+
+        self.assertEqual(torrents[:2], filtered)
+        self.assertEqual(100, filtered[0].pri_order)  # matched the first priority level
+        self.assertEqual(99, filtered[1].pri_order)  # matched the second priority level
+
+    def test_filter_torrents_keeps_lazy_priority_level_parsing(self):
+        module = _build_filter_module(
+            rule_string="KEEP > (",  # second level "(" is presumably unparsable (TODO confirm); it must never be reached
+            rule_set={"KEEP": {"include": "Movie"}},
+        )
+        torrent = TorrentInfo(title="Movie", description="")
+
+        filtered = module.filter_torrents(rule_groups=["test"], torrent_list=[torrent])
+
+        self.assertEqual([torrent], filtered)
+        self.assertEqual(100, torrent.pri_order)
+
+    def test_filter_torrent_keeps_extra_filter_semantics(self):
+        torrent = TorrentInfo(
+            title="Movie 1080p HDR",
+            description="中字",
+            labels=["free"],
+            size=3 * 1024 * 1024 * 1024,  # 3 * 1024 MB = 3072 MB in the size rules' unit
+            uploadvolumefactor=1,
+            downloadvolumefactor=0,
+        )
+
+        self.assertTrue(
+            TorrentHelper.filter_torrent(
+                torrent_info=torrent,
+                filter_params={
+                    "include": "中字|free",
+                    "exclude": "BluRay",
+                    "resolution": "1080p",
+                    "effect": "HDR",
+                    "size": "1000-4000",  # 3072 MB falls inside this range
+                },
+            )
+        )
+        self.assertFalse(
+            TorrentHelper.filter_torrent(
+                torrent_info=torrent,
+                filter_params={"exclude": "HDR"},  # the title contains HDR, so the exclude rule rejects it
+            )
+        )
+        self.assertFalse(
+            TorrentHelper.filter_torrent(
+                torrent_info=torrent,
+                filter_params={"size": "<1000"},  # 3072 MB exceeds the 1000 MB cap
+            )
+        )
+
+
+if __name__ == "__main__":  # allow running this test file directly
+    unittest.main()