perf: optimize torrent filtering

This commit is contained in:
jxxghp
2026-05-15 16:55:42 +08:00
parent 2831eecbeb
commit 51229204c9
3 changed files with 243 additions and 46 deletions

View File

@@ -1,5 +1,6 @@
import datetime
import re
from functools import lru_cache
from pathlib import Path
from typing import Tuple, Optional, List, Union, Dict, Any
from urllib.parse import unquote
@@ -19,6 +20,40 @@ from app.utils.http import RequestUtils
from app.utils.string import StringUtils
# Multiplier applied to the numbers of a size rule, which are written in MB (1024 * 1024).
_SIZE_UNIT = 1024 ** 2
@lru_cache(maxsize=512)
def _compile_filter_pattern(pattern: str) -> re.Pattern:
    """
    Compile an extra subscription/workflow filter regex, ignoring case.

    The user-supplied text is interpolated exactly as before; the cache only
    saves compiling the same rule again while it is matched against many torrents.

    :param pattern: regular expression text entered by the user
    :return: the compiled, case-insensitive pattern
    """
    expression = r"%s" % pattern
    return re.compile(expression, flags=re.IGNORECASE)
def _filter_pattern_search(pattern: Union[str, int, float], content: str) -> bool:
    """
    Tell whether the filter regex matches anywhere inside ``content``.

    The rule value is converted with ``str()`` first, so a numeric rule is
    searched as its text form, matching the former string-interpolation behaviour.

    :param pattern: rule value holding the regular expression
    :param content: text to search
    :return: True when the pattern is found, False otherwise
    """
    compiled = _compile_filter_pattern(str(pattern))
    return compiled.search(content) is not None
@lru_cache(maxsize=256)
def _parse_filter_size_range(size_range: str) -> Tuple[str, float, Optional[float]]:
    """
    Parse the size range of an extra filter; the numbers in the rule are in MB.

    Supported forms are ``"min-max"``, ``">min"`` and ``"<max"``. Whitespace
    around the whole rule and around each number is ignored, so ``" >100"`` is
    read as ``">100"`` instead of being reported as an unknown rule.

    :param size_range: rule text
    :return: ``(kind, lower, upper)`` where ``kind`` is ``"between"``, ``"gte"``,
             ``"lte"`` or ``"unknown"``; both bounds are multiplied by ``_SIZE_UNIT``,
             and ``upper`` is None when the rule has no upper bound
    :raises ValueError: if a bound is not a number or the rule holds more than one "-"
    """
    # Normalise once up front so the prefix checks below see the operator first.
    size_range = size_range.strip()
    if "-" in size_range:
        lower, upper = size_range.split("-")
        return "between", float(lower.strip()) * _SIZE_UNIT, float(upper.strip()) * _SIZE_UNIT
    if size_range.startswith(">"):
        return "gte", float(size_range[1:].strip()) * _SIZE_UNIT, None
    if size_range.startswith("<"):
        return "lte", 0.0, float(size_range[1:].strip()) * _SIZE_UNIT
    return "unknown", 0.0, None
class TorrentHelper:
"""
种子帮助类
@@ -460,52 +495,48 @@ class TorrentHelper:
# 包含
include = filter_params.get("include")
if include:
if not re.search(r"%s" % include, content, re.I):
if not _filter_pattern_search(include, content):
logger.info(f"{content} 不匹配包含规则 {include}")
return False
# 排除
exclude = filter_params.get("exclude")
if exclude:
if re.search(r"%s" % exclude, content, re.I):
if _filter_pattern_search(exclude, content):
logger.info(f"{content} 匹配排除规则 {exclude}")
return False
# 质量
quality = filter_params.get("quality")
if quality:
if not re.search(r"%s" % quality, torrent_info.title, re.I):
if not _filter_pattern_search(quality, torrent_info.title):
logger.info(f"{torrent_info.title} 不匹配质量规则 {quality}")
return False
# 分辨率
resolution = filter_params.get("resolution")
if resolution:
if not re.search(r"%s" % resolution, torrent_info.title, re.I):
if not _filter_pattern_search(resolution, torrent_info.title):
logger.info(f"{torrent_info.title} 不匹配分辨率规则 {resolution}")
return False
# 特效
effect = filter_params.get("effect")
if effect:
if not re.search(r"%s" % effect, torrent_info.title, re.I):
if not _filter_pattern_search(effect, torrent_info.title):
logger.info(f"{torrent_info.title} 不匹配特效规则 {effect}")
return False
# 大小
size_range = filter_params.get("size")
if size_range:
if size_range.find("-") != -1:
size_rule, size_min, size_max = _parse_filter_size_range(size_range)
if size_rule == "between":
# 区间
size_min, size_max = size_range.split("-")
size_min = float(size_min.strip()) * 1024 * 1024
size_max = float(size_max.strip()) * 1024 * 1024
if torrent_info.size < size_min or torrent_info.size > size_max:
return False
elif size_range.startswith(">"):
elif size_rule == "gte":
# 大于
size_min = float(size_range[1:].strip()) * 1024 * 1024
if torrent_info.size < size_min:
return False
elif size_range.startswith("<"):
elif size_rule == "lte":
# 小于
size_max = float(size_range[1:].strip()) * 1024 * 1024
if torrent_info.size > size_max:
return False
@@ -521,6 +552,7 @@ class TorrentHelper:
"""
# 匹配季
seasons = season_episodes.keys()
seasons_set = set(seasons)
# 种子季
torrent_seasons = meta.season_list
if not torrent_seasons:
@@ -528,7 +560,7 @@ class TorrentHelper:
torrent_seasons = [1]
# 种子集
torrent_episodes = meta.episode_list
if not set(torrent_seasons).issubset(set(seasons)):
if not set(torrent_seasons).issubset(seasons_set):
# 种子季不在过滤季中
logger.debug(
f"种子 {torrent.site_name} - {torrent.title} 包含季 {torrent_seasons} 不是需要的季 {list(seasons)}")
@@ -539,7 +571,7 @@ class TorrentHelper:
if len(torrent_seasons) == 1:
need_episodes = season_episodes.get(torrent_seasons[0])
if need_episodes \
and not set(torrent_episodes).intersection(set(need_episodes)):
and not set(torrent_episodes).intersection(need_episodes):
# 单季集没有交集的不要
logger.debug(f"种子 {torrent.site_name} - {torrent.title} "
f"{torrent_episodes} 没有需要的集:{need_episodes}")

View File

@@ -1,11 +1,12 @@
import re
from copy import deepcopy
from functools import lru_cache
from typing import List, Tuple, Union, Dict, Optional
from app.core.context import TorrentInfo, MediaInfo
from app.core.metainfo import MetaInfo
from app.helper.rule import RuleHelper
from app.log import logger
import re
from app.core.metainfo import MetaInfo
from app.modules import _ModuleBase
from app.modules.filter.RuleParser import RuleParser
from app.modules.filter.builtin_rules import BUILTIN_RULE_SET
@@ -13,6 +14,51 @@ from app.schemas.types import ModuleType, OtherModulesType, SystemConfigKey
from app.utils.string import StringUtils
# Multiplier applied to the numbers of a size rule, which are written in MB (1024 * 1024).
_SIZE_UNIT = 1024 ** 2
@lru_cache(maxsize=1024)
def _compile_ignorecase(pattern: str) -> re.Pattern:
    """
    Compile a filter-rule regex with IGNORECASE semantics.

    Filter rules are matched against many torrents during search and
    subscription, so caching the compiled pattern trims hot-path work. An
    invalid pattern still raises when it is used for matching, as it did before.

    :param pattern: regular expression text of the rule
    :return: the compiled, case-insensitive pattern
    """
    expression = r"%s" % pattern
    return re.compile(expression, flags=re.IGNORECASE)
def _regex_search(pattern: Union[str, int, float], content: str) -> bool:
    """
    Tell whether the rule regex matches anywhere inside ``content``.

    The rule value is converted with ``str()`` first (the former string
    interpolation did the same) and the compiled expression is reused.

    :param pattern: rule value holding the regular expression
    :param content: text to search
    :return: True when the pattern is found, False otherwise
    """
    compiled = _compile_ignorecase(str(pattern))
    return compiled.search(content) is not None
@lru_cache(maxsize=256)
def _parse_size_range(size_range: str) -> Tuple[str, float, Optional[float]]:
    """
    Parse a size-range rule; the numbers in the rule are in MB.

    The operator in the result is meant for use inside this module only; the
    parsed rule is cached so it is not split again for every torrent.

    :param size_range: rule text in the form ``"min-max"``, ``">min"`` or ``"<max"``
    :return: ``(kind, lower, upper)`` where ``kind`` is ``"between"``, ``"gte"``,
             ``"lte"`` or ``"unknown"`` and the bounds are multiplied by ``_SIZE_UNIT``
    """
    text = size_range.strip()
    if "-" in text:
        lower_text, upper_text = text.split("-")
        lower = float(lower_text.strip()) * _SIZE_UNIT
        upper = float(upper_text.strip()) * _SIZE_UNIT
        return "between", lower, upper
    # No dash: the first character decides between a lower and an upper bound.
    operator, bound_text = text[:1], text[1:]
    if operator == ">":
        return "gte", float(bound_text.strip()) * _SIZE_UNIT, None
    if operator == "<":
        return "lte", 0, float(bound_text.strip()) * _SIZE_UNIT
    return "unknown", 0, None
@lru_cache(maxsize=256)
def _parse_publish_time(publish_time: str) -> Tuple[float, ...]:
    """
    Parse a publish-time rule into its numeric parts.

    The rule is split on "-" and every piece is converted to ``float``; the
    result is cached so the same rule is not converted again for each torrent.

    :param publish_time: rule text, e.g. a single number or ``"a-b"``
    :return: tuple holding one float per "-"-separated piece
    """
    return tuple(map(float, publish_time.split("-")))
class FilterModule(_ModuleBase):
CONFIG_WATCH = {SystemConfigKey.CustomFilterRules.value}
@@ -86,6 +132,9 @@ class FilterModule(_ModuleBase):
if not rule_groups:
return torrent_list
parser = RuleParser()
# 同一轮过滤里,相同的优先级层级会被多个种子反复使用;按需解析并缓存,
# 既减少 pyparsing 开销,也保留原来“命中高优先级后不解析低层级”的容错行为。
parsed_rule_cache: Dict[str, Union[list, str]] = {}
# 查询规则表详情
groups = self.rulehelper.get_rule_group_by_media(media=mediainfo, group_names=rule_groups)
if groups:
@@ -97,21 +146,27 @@ class FilterModule(_ModuleBase):
torrent_list=torrent_list,
mediainfo=mediainfo,
parser=parser,
parsed_rule_cache=parsed_rule_cache,
)
return torrent_list
def __filter_torrents(self, rule_string: str, rule_name: str,
torrent_list: List[TorrentInfo],
mediainfo: MediaInfo,
parser: RuleParser) -> List[TorrentInfo]:
parser: RuleParser,
parsed_rule_cache: Dict[str, Union[list, str]]) -> List[TorrentInfo]:
"""
过滤种子
"""
if not torrent_list:
return []
# 只拆分一次规则层级;具体层级仍延迟到真正需要匹配时解析。
rule_groups = [rule_group.strip() for rule_group in rule_string.split('>')]
# 返回种子列表
ret_torrents = []
for torrent in torrent_list:
# 能命中优先级的才返回
if not self.__get_order(torrent, rule_string, mediainfo, parser):
if not self.__get_order(torrent, rule_groups, mediainfo, parser, parsed_rule_cache):
logger.debug(f"种子 {torrent.site_name} - {torrent.title} {torrent.description or ''} "
f"不匹配 {rule_name} 过滤规则")
continue
@@ -119,13 +174,12 @@ class FilterModule(_ModuleBase):
return ret_torrents
def __get_order(self, torrent: TorrentInfo, rule_str: str,
mediainfo: MediaInfo, parser: RuleParser) -> Optional[TorrentInfo]:
def __get_order(self, torrent: TorrentInfo, rule_groups: List[str],
mediainfo: MediaInfo, parser: RuleParser,
parsed_rule_cache: Dict[str, Union[list, str]]) -> Optional[TorrentInfo]:
"""
获取种子匹配的规则优先级值越大越优先未匹配时返回None
"""
# 多级规则
rule_groups = rule_str.split('>')
# 优先级
res_order = 100
# 是否匹配
@@ -133,8 +187,8 @@ class FilterModule(_ModuleBase):
for rule_group in rule_groups:
# 解析规则组
parsed_group = parser.parse(rule_group.strip())
if self.__match_group(torrent, parsed_group.as_list()[0], mediainfo):
parsed_group = self.__parse_rule_group(rule_group, parser, parsed_rule_cache)
if self.__match_group(torrent, parsed_group, mediainfo):
# 出现匹配时中断
matched = True
logger.debug(f"种子 {torrent.site_name} - {torrent.title} 优先级为 {100 - res_order + 1}")
@@ -145,6 +199,17 @@ class FilterModule(_ModuleBase):
return None if not matched else torrent
@staticmethod
def __parse_rule_group(rule_group: str, parser: RuleParser,
                       parsed_rule_cache: Dict[str, Union[list, str]]) -> Union[list, str]:
    """
    Return the parsed form of a single priority level.

    Results are cached per level expression, so several rule groups that reuse
    the same expression share one parse.

    :param rule_group: text of one priority level
    :param parser: parser used when the level is not cached yet
    :param parsed_rule_cache: cache keyed by level text; filled in place
    :return: first element of the parse result converted to a list
    """
    if rule_group in parsed_rule_cache:
        return parsed_rule_cache[rule_group]
    parsed_level = parser.parse(rule_group).as_list()[0]
    parsed_rule_cache[rule_group] = parsed_level
    return parsed_level
def __match_group(self, torrent: TorrentInfo, rule_group: Union[list, str],
mediainfo: MediaInfo) -> Optional[bool]:
"""
@@ -173,12 +238,13 @@ class FilterModule(_ModuleBase):
"""
判断种子是否匹配规则项
"""
if not self.rule_set.get(rule_name):
rule = self.rule_set.get(rule_name)
if not rule:
# 规则不存在
logger.debug(f"规则 {rule_name} 不存在")
return False
# TMDB规则
tmdb = self.rule_set[rule_name].get("tmdb")
tmdb = rule.get("tmdb")
# 符合TMDB规则的直接返回True即不过滤
if tmdb and self.__match_tmdb(tmdb, mediainfo):
logger.debug(f"种子 {torrent.site_name} - {torrent.title} 符合 {rule_name} 的TMDB规则匹配成功")
@@ -187,7 +253,7 @@ class FilterModule(_ModuleBase):
content = f"{torrent.title} {torrent.description} {' '.join(torrent.labels or [])}"
# 只匹配指定关键字
match_content = []
matchs = self.rule_set[rule_name].get("match") or []
matchs = rule.get("match") or []
if matchs:
for match in matchs:
if not hasattr(torrent, match):
@@ -202,27 +268,27 @@ class FilterModule(_ModuleBase):
if match_content:
content = " ".join(match_content)
# 包含规则项
includes = self.rule_set[rule_name].get("include") or []
includes = rule.get("include") or []
if not isinstance(includes, list):
includes = [includes]
# 排除规则项
excludes = self.rule_set[rule_name].get("exclude") or []
excludes = rule.get("exclude") or []
if not isinstance(excludes, list):
excludes = [excludes]
# 大小范围规则项
size_range = self.rule_set[rule_name].get("size_range")
size_range = rule.get("size_range")
# 做种人数规则项
seeders = self.rule_set[rule_name].get("seeders")
seeders = rule.get("seeders")
# FREE规则
downloadvolumefactor = self.rule_set[rule_name].get("downloadvolumefactor")
downloadvolumefactor = rule.get("downloadvolumefactor")
# 发布时间规则
pubdate: str = self.rule_set[rule_name].get("publish_time")
if includes and not any(re.search(r"%s" % include, content, re.IGNORECASE) for include in includes):
pubdate: str = rule.get("publish_time")
if includes and not any(_regex_search(include, content) for include in includes):
# 未发现任何包含项
logger.debug(f"种子 {torrent.site_name} - {torrent.title} 不包含任何项 {includes}")
return False
for exclude in excludes:
if re.search(r"%s" % exclude, content, re.IGNORECASE):
if _regex_search(exclude, content):
# 发现排除项
logger.debug(f"种子 {torrent.site_name} - {torrent.title} 包含 {exclude}")
return False
@@ -247,7 +313,7 @@ class FilterModule(_ModuleBase):
# 种子发布时间
pub_minutes = torrent.pub_minutes()
# 发布时间规则
pub_times = [float(t) for t in pubdate.split("-")]
pub_times = _parse_publish_time(pubdate)
if len(pub_times) == 1:
# 发布时间小于规则
if pub_minutes < pub_times[0]:
@@ -319,22 +385,17 @@ class FilterModule(_ModuleBase):
# 每集大小
torrent_size = torrent.size / episode_count
# 大小范围
size_range = size_range.strip()
if size_range.find("-") != -1:
size_rule, size_min, size_max = _parse_size_range(size_range)
if size_rule == "between":
# 区间
size_min, size_max = size_range.split("-")
size_min = float(size_min.strip()) * 1024 * 1024
size_max = float(size_max.strip()) * 1024 * 1024
if size_min <= torrent_size <= size_max:
return True
elif size_range.startswith(">"):
elif size_rule == "gte":
# 大于
size_min = float(size_range[1:].strip()) * 1024 * 1024
if torrent_size >= size_min:
return True
elif size_range.startswith("<"):
elif size_rule == "lte":
# 小于
size_max = float(size_range[1:].strip()) * 1024 * 1024
if torrent_size <= size_max:
return True
return False

View File

@@ -0,0 +1,104 @@
import unittest
from types import SimpleNamespace
from app.core.context import TorrentInfo
from app.helper.torrent import TorrentHelper
from app.modules.filter import FilterModule
class _RuleHelper:
    """
    Lightweight rule store for filter-module tests, so they do not depend on
    the real system configuration.
    """

    def __init__(self, groups):
        # Rule groups served by get_rule_group_by_media().
        self._groups = groups

    def get_rule_group_by_media(self, media=None, group_names=None):  # noqa: ARG002
        # With names given, keep only the groups whose name was requested;
        # otherwise hand back every stored group. ``media`` is ignored.
        if group_names:
            return list(filter(lambda group: group.name in group_names, self._groups))
        return self._groups
def _build_filter_module(rule_string: str, rule_set: dict) -> FilterModule:
    """
    Build a FilterModule wired to an in-memory rule group named "test".

    :param rule_string: priority expression stored on the "test" rule group
    :param rule_set: rule definitions assigned to the module
    :return: the configured module
    """
    filter_module = FilterModule()
    test_group = SimpleNamespace(name="test", rule_string=rule_string)
    filter_module.rulehelper = _RuleHelper([test_group])
    filter_module.rule_set = rule_set
    return filter_module
class TorrentFilterTest(unittest.TestCase):
    """
    Tests pinning the torrent-filter behaviour exercised through
    FilterModule.filter_torrents and TorrentHelper.filter_torrent.
    """

    def test_filter_torrents_keeps_priority_and_boolean_rule_semantics(self):
        # Two priority levels: "HDR & !BLU" first, then "DV".
        rule_set = {
            "HDR": {"include": "HDR"},
            "DV": {"include": "DOVI"},
            "BLU": {"include": "BluRay"},
        }
        module = _build_filter_module(rule_string="HDR & !BLU > DV", rule_set=rule_set)
        hdr_web = TorrentInfo(title="Movie HDR WEB-DL", description="")
        dovi = TorrentInfo(title="Movie DOVI", description="")
        hdr_bluray = TorrentInfo(title="Movie HDR BluRay", description="")

        filtered = module.filter_torrents(
            rule_groups=["test"],
            torrent_list=[hdr_web, dovi, hdr_bluray],
        )

        # The BluRay torrent is dropped; the others keep their input order.
        self.assertEqual([hdr_web, dovi], filtered)
        self.assertEqual(100, filtered[0].pri_order)
        self.assertEqual(99, filtered[1].pri_order)

    def test_filter_torrents_keeps_lazy_priority_level_parsing(self):
        # The second level "(" is presumably unparsable; the torrent already
        # matches the first level, so filtering is expected to succeed anyway.
        torrent = TorrentInfo(title="Movie", description="")
        module = _build_filter_module(
            rule_string="KEEP > (",
            rule_set={"KEEP": {"include": "Movie"}},
        )

        filtered = module.filter_torrents(rule_groups=["test"], torrent_list=[torrent])

        self.assertEqual([torrent], filtered)
        self.assertEqual(100, torrent.pri_order)

    def test_filter_torrent_keeps_extra_filter_semantics(self):
        torrent = TorrentInfo(
            title="Movie 1080p HDR",
            description="中字",
            labels=["free"],
            size=3 * 1024 ** 3,
            uploadvolumefactor=1,
            downloadvolumefactor=0,
        )
        accepting_params = {
            "include": "中字|free",
            "exclude": "BluRay",
            "resolution": "1080p",
            "effect": "HDR",
            "size": "1000-4000",
        }
        self.assertTrue(
            TorrentHelper.filter_torrent(torrent_info=torrent, filter_params=accepting_params)
        )
        # Each of these parameter sets alone must reject the torrent.
        for rejecting_params in ({"exclude": "HDR"}, {"size": "<1000"}):
            self.assertFalse(
                TorrentHelper.filter_torrent(torrent_info=torrent, filter_params=rejecting_params)
            )
if __name__ == "__main__":
    # Run the tests when this module is executed directly as a script.
    unittest.main()