perf: optimize torrent filtering

2026-06-05 07:29:56 +08:00 · 2026-05-15 16:55:42 +08:00
parent 2831eecbeb
commit 51229204c9
3 changed files with 243 additions and 46 deletions
--- a/app/modules/filter/init.py
+++ b/app/modules/filter/init.py
@@ -1,11 +1,12 @@
+import re
 from copy import deepcopy
+from functools import lru_cache
 from typing import List, Tuple, Union, Dict, Optional

 from app.core.context import TorrentInfo, MediaInfo
+from app.core.metainfo import MetaInfo
 from app.helper.rule import RuleHelper
 from app.log import logger
-import re
-from app.core.metainfo import MetaInfo
 from app.modules import _ModuleBase
 from app.modules.filter.RuleParser import RuleParser
 from app.modules.filter.builtin_rules import BUILTIN_RULE_SET
@@ -13,6 +14,51 @@ from app.schemas.types import ModuleType, OtherModulesType, SystemConfigKey
 from app.utils.string import StringUtils


+_SIZE_UNIT = 1024 * 1024
+
+
+@lru_cache(maxsize=1024)
+def _compile_ignorecase(pattern: str) -> re.Pattern:
+    """
+    编译过滤规则正则。
+    过滤规则在搜索/订阅中会被大量种子重复匹配，缓存编译结果能减少热路径开销；
+    这里仍保留原有的 IGNORECASE 语义，非法正则也会像原来一样在匹配时抛出异常。
+    """
+    return re.compile(r"%s" % pattern, re.IGNORECASE)
+
+
+def _regex_search(pattern: Union[str, int, float], content: str) -> bool:
+    """
+    按原有字符串插值语义执行正则匹配，同时复用已编译表达式。
+    """
+    return bool(_compile_ignorecase(str(pattern)).search(content))
+
+
+@lru_cache(maxsize=256)
+def _parse_size_range(size_range: str) -> Tuple[str, float, Optional[float]]:
+    """
+    解析大小范围，单位为 MB。
+    返回值中的操作符只供本模块内部使用，避免每个种子重复拆分同一个规则。
+    """
+    size_range = size_range.strip()
+    if size_range.find("-") != -1:
+        size_min, size_max = size_range.split("-")
+        return "between", float(size_min.strip()) * _SIZE_UNIT, float(size_max.strip()) * _SIZE_UNIT
+    if size_range.startswith(">"):
+        return "gte", float(size_range[1:].strip()) * _SIZE_UNIT, None
+    if size_range.startswith("<"):
+        return "lte", 0, float(size_range[1:].strip()) * _SIZE_UNIT
+    return "unknown", 0, None
+
+
+@lru_cache(maxsize=256)
+def _parse_publish_time(publish_time: str) -> Tuple[float, ...]:
+    """
+    解析发布时间规则，避免同一规则对大量种子反复转换 float。
+    """
+    return tuple(float(t) for t in publish_time.split("-"))
+
+
 class FilterModule(_ModuleBase):
    CONFIG_WATCH = {SystemConfigKey.CustomFilterRules.value}

@@ -86,6 +132,9 @@ class FilterModule(_ModuleBase):
        if not rule_groups:
            return torrent_list
        parser = RuleParser()
+        # 同一轮过滤里，相同的优先级层级会被多个种子反复使用；按需解析并缓存，
+        # 既减少 pyparsing 开销，也保留原来“命中高优先级后不解析低层级”的容错行为。
+        parsed_rule_cache: Dict[str, Union[list, str]] = {}
        # 查询规则表详情
        groups = self.rulehelper.get_rule_group_by_media(media=mediainfo, group_names=rule_groups)
        if groups:
@@ -97,21 +146,27 @@ class FilterModule(_ModuleBase):
                    torrent_list=torrent_list,
                    mediainfo=mediainfo,
                    parser=parser,
+                    parsed_rule_cache=parsed_rule_cache,
                )
        return torrent_list

    def __filter_torrents(self, rule_string: str, rule_name: str,
                          torrent_list: List[TorrentInfo],
                          mediainfo: MediaInfo,
-                          parser: RuleParser) -> List[TorrentInfo]:
+                          parser: RuleParser,
+                          parsed_rule_cache: Dict[str, Union[list, str]]) -> List[TorrentInfo]:
        """
        过滤种子
        """
+        if not torrent_list:
+            return []
+        # 只拆分一次规则层级；具体层级仍延迟到真正需要匹配时解析。
+        rule_groups = [rule_group.strip() for rule_group in rule_string.split('>')]
        # 返回种子列表
        ret_torrents = []
        for torrent in torrent_list:
            # 能命中优先级的才返回
-            if not self.__get_order(torrent, rule_string, mediainfo, parser):
+            if not self.__get_order(torrent, rule_groups, mediainfo, parser, parsed_rule_cache):
                logger.debug(f"种子 {torrent.site_name} - {torrent.title} {torrent.description or ''} "
                             f"不匹配 {rule_name} 过滤规则")
                continue
@@ -119,13 +174,12 @@ class FilterModule(_ModuleBase):

        return ret_torrents

-    def __get_order(self, torrent: TorrentInfo, rule_str: str,
-                    mediainfo: MediaInfo, parser: RuleParser) -> Optional[TorrentInfo]:
+    def __get_order(self, torrent: TorrentInfo, rule_groups: List[str],
+                    mediainfo: MediaInfo, parser: RuleParser,
+                    parsed_rule_cache: Dict[str, Union[list, str]]) -> Optional[TorrentInfo]:
        """
        获取种子匹配的规则优先级，值越大越优先，未匹配时返回None
        """
-        # 多级规则
-        rule_groups = rule_str.split('>')
        # 优先级
        res_order = 100
        # 是否匹配
@@ -133,8 +187,8 @@ class FilterModule(_ModuleBase):

        for rule_group in rule_groups:
            # 解析规则组
-            parsed_group = parser.parse(rule_group.strip())
-            if self.__match_group(torrent, parsed_group.as_list()[0], mediainfo):
+            parsed_group = self.__parse_rule_group(rule_group, parser, parsed_rule_cache)
+            if self.__match_group(torrent, parsed_group, mediainfo):
                # 出现匹配时中断
                matched = True
                logger.debug(f"种子 {torrent.site_name} - {torrent.title} 优先级为 {100 - res_order + 1}")
@@ -145,6 +199,17 @@ class FilterModule(_ModuleBase):

        return None if not matched else torrent

+    @staticmethod
+    def __parse_rule_group(rule_group: str, parser: RuleParser,
+                           parsed_rule_cache: Dict[str, Union[list, str]]) -> Union[list, str]:
+        """
+        解析单个优先级层级。
+        缓存粒度放在层级表达式上，兼容多个规则组复用相同表达式的情况。
+        """
+        if rule_group not in parsed_rule_cache:
+            parsed_rule_cache[rule_group] = parser.parse(rule_group).as_list()[0]
+        return parsed_rule_cache[rule_group]
+
    def __match_group(self, torrent: TorrentInfo, rule_group: Union[list, str],
                      mediainfo: MediaInfo) -> Optional[bool]:
        """
@@ -173,12 +238,13 @@ class FilterModule(_ModuleBase):
        """
        判断种子是否匹配规则项
        """
-        if not self.rule_set.get(rule_name):
+        rule = self.rule_set.get(rule_name)
+        if not rule:
            # 规则不存在
            logger.debug(f"规则 {rule_name} 不存在")
            return False
        # TMDB规则
-        tmdb = self.rule_set[rule_name].get("tmdb")
+        tmdb = rule.get("tmdb")
        # 符合TMDB规则的直接返回True，即不过滤
        if tmdb and self.__match_tmdb(tmdb, mediainfo):
            logger.debug(f"种子 {torrent.site_name} - {torrent.title} 符合 {rule_name} 的TMDB规则，匹配成功")
@@ -187,7 +253,7 @@ class FilterModule(_ModuleBase):
        content = f"{torrent.title} {torrent.description} {' '.join(torrent.labels or [])}"
        # 只匹配指定关键字
        match_content = []
-        matchs = self.rule_set[rule_name].get("match") or []
+        matchs = rule.get("match") or []
        if matchs:
            for match in matchs:
                if not hasattr(torrent, match):
@@ -202,27 +268,27 @@ class FilterModule(_ModuleBase):
        if match_content:
            content = " ".join(match_content)
        # 包含规则项
-        includes = self.rule_set[rule_name].get("include") or []
+        includes = rule.get("include") or []
        if not isinstance(includes, list):
            includes = [includes]
        # 排除规则项
-        excludes = self.rule_set[rule_name].get("exclude") or []
+        excludes = rule.get("exclude") or []
        if not isinstance(excludes, list):
            excludes = [excludes]
        # 大小范围规则项
-        size_range = self.rule_set[rule_name].get("size_range")
+        size_range = rule.get("size_range")
        # 做种人数规则项
-        seeders = self.rule_set[rule_name].get("seeders")
+        seeders = rule.get("seeders")
        # FREE规则
-        downloadvolumefactor = self.rule_set[rule_name].get("downloadvolumefactor")
+        downloadvolumefactor = rule.get("downloadvolumefactor")
        # 发布时间规则
-        pubdate: str = self.rule_set[rule_name].get("publish_time")
-        if includes and not any(re.search(r"%s" % include, content, re.IGNORECASE) for include in includes):
+        pubdate: str = rule.get("publish_time")
+        if includes and not any(_regex_search(include, content) for include in includes):
            # 未发现任何包含项
            logger.debug(f"种子 {torrent.site_name} - {torrent.title} 不包含任何项 {includes}")
            return False
        for exclude in excludes:
-            if re.search(r"%s" % exclude, content, re.IGNORECASE):
+            if _regex_search(exclude, content):
                # 发现排除项
                logger.debug(f"种子 {torrent.site_name} - {torrent.title} 包含 {exclude}")
                return False
@@ -247,7 +313,7 @@ class FilterModule(_ModuleBase):
            # 种子发布时间
            pub_minutes = torrent.pub_minutes()
            # 发布时间规则
-            pub_times = [float(t) for t in pubdate.split("-")]
+            pub_times = _parse_publish_time(pubdate)
            if len(pub_times) == 1:
                # 发布时间小于规则
                if pub_minutes < pub_times[0]:
@@ -319,22 +385,17 @@ class FilterModule(_ModuleBase):
        # 每集大小
        torrent_size = torrent.size / episode_count
        # 大小范围
-        size_range = size_range.strip()
-        if size_range.find("-") != -1:
+        size_rule, size_min, size_max = _parse_size_range(size_range)
+        if size_rule == "between":
            # 区间
-            size_min, size_max = size_range.split("-")
-            size_min = float(size_min.strip()) * 1024 * 1024
-            size_max = float(size_max.strip()) * 1024 * 1024
            if size_min <= torrent_size <= size_max:
                return True
-        elif size_range.startswith(">"):
+        elif size_rule == "gte":
            # 大于
-            size_min = float(size_range[1:].strip()) * 1024 * 1024
            if torrent_size >= size_min:
                return True
-        elif size_range.startswith("<"):
+        elif size_rule == "lte":
            # 小于
-            size_max = float(size_range[1:].strip()) * 1024 * 1024
            if torrent_size <= size_max:
                return True
        return False