mirror of
https://github.com/jxxghp/MoviePilot.git
synced 2026-06-01 13:40:54 +08:00
perf: optimize torrent filtering
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
import datetime
|
||||
import re
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import Tuple, Optional, List, Union, Dict, Any
|
||||
from urllib.parse import unquote
|
||||
@@ -19,6 +20,40 @@ from app.utils.http import RequestUtils
|
||||
from app.utils.string import StringUtils
|
||||
|
||||
|
||||
_SIZE_UNIT = 1024 * 1024
|
||||
|
||||
|
||||
@lru_cache(maxsize=512)
def _compile_filter_pattern(pattern: str) -> re.Pattern:
    """
    Compile an additional-filter regex used by subscriptions and workflows.

    The user-supplied expression keeps plain regex semantics and is always
    compiled case-insensitively. The LRU cache only saves the compilation
    cost when one rule is matched against a large number of torrents.

    :param pattern: regular expression source text
    :return: the compiled, case-insensitive pattern
    :raises re.error: if the expression is not a valid regular expression
    """
    # The value is rendered through "%s" before compiling, so the compiled
    # text is exactly what string interpolation produces for the input.
    expression = "%s" % pattern
    return re.compile(expression, flags=re.IGNORECASE)
|
||||
|
||||
|
||||
def _filter_pattern_search(pattern: Union[str, int, float], content: str) -> bool:
    """
    Report whether an additional-filter rule matches anywhere in ``content``.

    The rule is converted to text with ``str()`` before being compiled, so
    numeric rule values are accepted as well. Compilation goes through the
    cached ``_compile_filter_pattern`` helper.

    :param pattern: filter rule, treated as a regular expression
    :param content: text to search
    :return: True if the rule matches at any position, otherwise False
    """
    compiled = _compile_filter_pattern(str(pattern))
    return compiled.search(content) is not None
|
||||
|
||||
|
||||
@lru_cache(maxsize=256)
def _parse_filter_size_range(size_range: str) -> Tuple[str, float, Optional[float]]:
    """
    Parse the size range of an additional filter; the rule is written in MB.

    Recognised forms (surrounding whitespace is ignored):

    - ``"min-max"`` -> ``("between", min, max)``
    - ``">min"``    -> ``("gte", min, None)``
    - ``"<max"``    -> ``("lte", 0.0, max)``
    - anything else -> ``("unknown", 0.0, None)``

    Every bound is multiplied by ``_SIZE_UNIT``.

    :param size_range: raw size rule text
    :return: tuple of (rule kind, lower bound, upper bound)
    :raises ValueError: if a bound is not a number, or a ``-`` rule does not
        split into exactly two parts
    """
    # Strip first so a rule such as " >500" is still recognised by the prefix
    # checks below instead of silently degrading to "unknown" (which would
    # disable the size filter altogether).
    size_range = size_range.strip()
    if "-" in size_range:
        # Exactly two bounds are expected; anything else fails the unpacking.
        size_min, size_max = size_range.split("-")
        # float() tolerates whitespace around each bound on its own.
        return "between", float(size_min) * _SIZE_UNIT, float(size_max) * _SIZE_UNIT
    if size_range.startswith(">"):
        return "gte", float(size_range[1:]) * _SIZE_UNIT, None
    if size_range.startswith("<"):
        return "lte", 0.0, float(size_range[1:]) * _SIZE_UNIT
    return "unknown", 0.0, None
|
||||
|
||||
|
||||
class TorrentHelper:
|
||||
"""
|
||||
种子帮助类
|
||||
@@ -460,52 +495,48 @@ class TorrentHelper:
|
||||
# 包含
|
||||
include = filter_params.get("include")
|
||||
if include:
|
||||
if not re.search(r"%s" % include, content, re.I):
|
||||
if not _filter_pattern_search(include, content):
|
||||
logger.info(f"{content} 不匹配包含规则 {include}")
|
||||
return False
|
||||
# 排除
|
||||
exclude = filter_params.get("exclude")
|
||||
if exclude:
|
||||
if re.search(r"%s" % exclude, content, re.I):
|
||||
if _filter_pattern_search(exclude, content):
|
||||
logger.info(f"{content} 匹配排除规则 {exclude}")
|
||||
return False
|
||||
# 质量
|
||||
quality = filter_params.get("quality")
|
||||
if quality:
|
||||
if not re.search(r"%s" % quality, torrent_info.title, re.I):
|
||||
if not _filter_pattern_search(quality, torrent_info.title):
|
||||
logger.info(f"{torrent_info.title} 不匹配质量规则 {quality}")
|
||||
return False
|
||||
# 分辨率
|
||||
resolution = filter_params.get("resolution")
|
||||
if resolution:
|
||||
if not re.search(r"%s" % resolution, torrent_info.title, re.I):
|
||||
if not _filter_pattern_search(resolution, torrent_info.title):
|
||||
logger.info(f"{torrent_info.title} 不匹配分辨率规则 {resolution}")
|
||||
return False
|
||||
# 特效
|
||||
effect = filter_params.get("effect")
|
||||
if effect:
|
||||
if not re.search(r"%s" % effect, torrent_info.title, re.I):
|
||||
if not _filter_pattern_search(effect, torrent_info.title):
|
||||
logger.info(f"{torrent_info.title} 不匹配特效规则 {effect}")
|
||||
return False
|
||||
|
||||
# 大小
|
||||
size_range = filter_params.get("size")
|
||||
if size_range:
|
||||
if size_range.find("-") != -1:
|
||||
size_rule, size_min, size_max = _parse_filter_size_range(size_range)
|
||||
if size_rule == "between":
|
||||
# 区间
|
||||
size_min, size_max = size_range.split("-")
|
||||
size_min = float(size_min.strip()) * 1024 * 1024
|
||||
size_max = float(size_max.strip()) * 1024 * 1024
|
||||
if torrent_info.size < size_min or torrent_info.size > size_max:
|
||||
return False
|
||||
elif size_range.startswith(">"):
|
||||
elif size_rule == "gte":
|
||||
# 大于
|
||||
size_min = float(size_range[1:].strip()) * 1024 * 1024
|
||||
if torrent_info.size < size_min:
|
||||
return False
|
||||
elif size_range.startswith("<"):
|
||||
elif size_rule == "lte":
|
||||
# 小于
|
||||
size_max = float(size_range[1:].strip()) * 1024 * 1024
|
||||
if torrent_info.size > size_max:
|
||||
return False
|
||||
|
||||
@@ -521,6 +552,7 @@ class TorrentHelper:
|
||||
"""
|
||||
# 匹配季
|
||||
seasons = season_episodes.keys()
|
||||
seasons_set = set(seasons)
|
||||
# 种子季
|
||||
torrent_seasons = meta.season_list
|
||||
if not torrent_seasons:
|
||||
@@ -528,7 +560,7 @@ class TorrentHelper:
|
||||
torrent_seasons = [1]
|
||||
# 种子集
|
||||
torrent_episodes = meta.episode_list
|
||||
if not set(torrent_seasons).issubset(set(seasons)):
|
||||
if not set(torrent_seasons).issubset(seasons_set):
|
||||
# 种子季不在过滤季中
|
||||
logger.debug(
|
||||
f"种子 {torrent.site_name} - {torrent.title} 包含季 {torrent_seasons} 不是需要的季 {list(seasons)}")
|
||||
@@ -539,7 +571,7 @@ class TorrentHelper:
|
||||
if len(torrent_seasons) == 1:
|
||||
need_episodes = season_episodes.get(torrent_seasons[0])
|
||||
if need_episodes \
|
||||
and not set(torrent_episodes).intersection(set(need_episodes)):
|
||||
and not set(torrent_episodes).intersection(need_episodes):
|
||||
# 单季集没有交集的不要
|
||||
logger.debug(f"种子 {torrent.site_name} - {torrent.title} "
|
||||
f"集 {torrent_episodes} 没有需要的集:{need_episodes}")
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from functools import lru_cache
|
||||
from typing import List, Tuple, Union, Dict, Optional
|
||||
|
||||
from app.core.context import TorrentInfo, MediaInfo
|
||||
from app.core.metainfo import MetaInfo
|
||||
from app.helper.rule import RuleHelper
|
||||
from app.log import logger
|
||||
import re
|
||||
from app.core.metainfo import MetaInfo
|
||||
from app.modules import _ModuleBase
|
||||
from app.modules.filter.RuleParser import RuleParser
|
||||
from app.modules.filter.builtin_rules import BUILTIN_RULE_SET
|
||||
@@ -13,6 +14,51 @@ from app.schemas.types import ModuleType, OtherModulesType, SystemConfigKey
|
||||
from app.utils.string import StringUtils
|
||||
|
||||
|
||||
_SIZE_UNIT = 1024 * 1024
|
||||
|
||||
|
||||
@lru_cache(maxsize=1024)
def _compile_ignorecase(pattern: str) -> re.Pattern:
    """
    Compile a filter-rule regular expression with IGNORECASE semantics.

    Filter rules are matched against many torrents during search and
    subscription runs, so the compiled result is cached to cut the cost on
    that hot path. An invalid expression is not swallowed here: it raises
    when the rule is first used for matching.

    :param pattern: regular expression source text
    :return: the compiled, case-insensitive pattern
    :raises re.error: if the expression is not a valid regular expression
    """
    # Render through "%s" first so the compiled text is exactly what string
    # interpolation yields for the given value.
    source = "%s" % pattern
    return re.compile(source, flags=re.IGNORECASE)
|
||||
|
||||
|
||||
def _regex_search(pattern: Union[str, int, float], content: str) -> bool:
    """
    Report whether a filter rule matches anywhere in ``content``.

    The rule is converted to text with ``str()`` and compiled through the
    cached ``_compile_ignorecase`` helper, so repeated use of one rule reuses
    the same compiled expression.

    :param pattern: filter rule, treated as a regular expression
    :param content: text to search
    :return: True if the rule matches at any position, otherwise False
    """
    compiled = _compile_ignorecase(str(pattern))
    return compiled.search(content) is not None
|
||||
|
||||
|
||||
@lru_cache(maxsize=256)
def _parse_size_range(size_range: str) -> Tuple[str, float, Optional[float]]:
    """
    Parse a size-range rule; the rule is written in MB.

    The operator in the result is meant for use inside this module only.
    Parsing is cached so the same rule is not split again for every torrent.

    Recognised forms (after stripping surrounding whitespace):

    - ``"min-max"`` -> ``("between", min, max)``
    - ``">min"``    -> ``("gte", min, None)``
    - ``"<max"``    -> ``("lte", 0, max)``
    - anything else -> ``("unknown", 0, None)``

    Every parsed bound is multiplied by ``_SIZE_UNIT``.

    :param size_range: raw size rule text
    :return: tuple of (rule kind, lower bound, upper bound)
    :raises ValueError: if a bound is not a number, or a ``-`` rule does not
        split into exactly two parts
    """
    text = size_range.strip()
    if "-" in text:
        # Exactly two bounds are expected; anything else fails the unpacking.
        low_text, high_text = text.split("-")
        lower = float(low_text.strip()) * _SIZE_UNIT
        upper = float(high_text.strip()) * _SIZE_UNIT
        return "between", lower, upper

    # Slicing (rather than indexing) keeps an empty rule on the "unknown" path.
    marker, bound = text[:1], text[1:]
    if marker == ">":
        return "gte", float(bound.strip()) * _SIZE_UNIT, None
    if marker == "<":
        return "lte", 0, float(bound.strip()) * _SIZE_UNIT
    return "unknown", 0, None
|
||||
|
||||
|
||||
@lru_cache(maxsize=256)
def _parse_publish_time(publish_time: str) -> Tuple[float, ...]:
    """
    Parse a publish-time rule into its numeric parts.

    The rule is split on ``-`` and each part is converted to ``float``. The
    result is cached so the same rule is not converted again for every
    torrent.

    :param publish_time: rule text, a single number or numbers joined by ``-``
    :return: tuple with one float per ``-``-separated part
    :raises ValueError: if any part is not a valid number
    """
    return tuple(map(float, publish_time.split("-")))
|
||||
|
||||
|
||||
class FilterModule(_ModuleBase):
|
||||
CONFIG_WATCH = {SystemConfigKey.CustomFilterRules.value}
|
||||
|
||||
@@ -86,6 +132,9 @@ class FilterModule(_ModuleBase):
|
||||
if not rule_groups:
|
||||
return torrent_list
|
||||
parser = RuleParser()
|
||||
# 同一轮过滤里,相同的优先级层级会被多个种子反复使用;按需解析并缓存,
|
||||
# 既减少 pyparsing 开销,也保留原来“命中高优先级后不解析低层级”的容错行为。
|
||||
parsed_rule_cache: Dict[str, Union[list, str]] = {}
|
||||
# 查询规则表详情
|
||||
groups = self.rulehelper.get_rule_group_by_media(media=mediainfo, group_names=rule_groups)
|
||||
if groups:
|
||||
@@ -97,21 +146,27 @@ class FilterModule(_ModuleBase):
|
||||
torrent_list=torrent_list,
|
||||
mediainfo=mediainfo,
|
||||
parser=parser,
|
||||
parsed_rule_cache=parsed_rule_cache,
|
||||
)
|
||||
return torrent_list
|
||||
|
||||
def __filter_torrents(self, rule_string: str, rule_name: str,
|
||||
torrent_list: List[TorrentInfo],
|
||||
mediainfo: MediaInfo,
|
||||
parser: RuleParser) -> List[TorrentInfo]:
|
||||
parser: RuleParser,
|
||||
parsed_rule_cache: Dict[str, Union[list, str]]) -> List[TorrentInfo]:
|
||||
"""
|
||||
过滤种子
|
||||
"""
|
||||
if not torrent_list:
|
||||
return []
|
||||
# 只拆分一次规则层级;具体层级仍延迟到真正需要匹配时解析。
|
||||
rule_groups = [rule_group.strip() for rule_group in rule_string.split('>')]
|
||||
# 返回种子列表
|
||||
ret_torrents = []
|
||||
for torrent in torrent_list:
|
||||
# 能命中优先级的才返回
|
||||
if not self.__get_order(torrent, rule_string, mediainfo, parser):
|
||||
if not self.__get_order(torrent, rule_groups, mediainfo, parser, parsed_rule_cache):
|
||||
logger.debug(f"种子 {torrent.site_name} - {torrent.title} {torrent.description or ''} "
|
||||
f"不匹配 {rule_name} 过滤规则")
|
||||
continue
|
||||
@@ -119,13 +174,12 @@ class FilterModule(_ModuleBase):
|
||||
|
||||
return ret_torrents
|
||||
|
||||
def __get_order(self, torrent: TorrentInfo, rule_str: str,
|
||||
mediainfo: MediaInfo, parser: RuleParser) -> Optional[TorrentInfo]:
|
||||
def __get_order(self, torrent: TorrentInfo, rule_groups: List[str],
|
||||
mediainfo: MediaInfo, parser: RuleParser,
|
||||
parsed_rule_cache: Dict[str, Union[list, str]]) -> Optional[TorrentInfo]:
|
||||
"""
|
||||
获取种子匹配的规则优先级,值越大越优先,未匹配时返回None
|
||||
"""
|
||||
# 多级规则
|
||||
rule_groups = rule_str.split('>')
|
||||
# 优先级
|
||||
res_order = 100
|
||||
# 是否匹配
|
||||
@@ -133,8 +187,8 @@ class FilterModule(_ModuleBase):
|
||||
|
||||
for rule_group in rule_groups:
|
||||
# 解析规则组
|
||||
parsed_group = parser.parse(rule_group.strip())
|
||||
if self.__match_group(torrent, parsed_group.as_list()[0], mediainfo):
|
||||
parsed_group = self.__parse_rule_group(rule_group, parser, parsed_rule_cache)
|
||||
if self.__match_group(torrent, parsed_group, mediainfo):
|
||||
# 出现匹配时中断
|
||||
matched = True
|
||||
logger.debug(f"种子 {torrent.site_name} - {torrent.title} 优先级为 {100 - res_order + 1}")
|
||||
@@ -145,6 +199,17 @@ class FilterModule(_ModuleBase):
|
||||
|
||||
return None if not matched else torrent
|
||||
|
||||
@staticmethod
def __parse_rule_group(rule_group: str, parser: RuleParser,
                       parsed_rule_cache: Dict[str, Union[list, str]]) -> Union[list, str]:
    """
    Parse a single priority level of a rule string, reusing cached results.

    The cache is keyed by the level expression itself, so several rule groups
    that share the same expression also share one parse result.

    :param rule_group: expression of one priority level
    :param parser: rule parser used when the expression is not cached yet
    :param parsed_rule_cache: mapping from expression to its parsed form;
        updated in place on a cache miss
    :return: first element of the parser result, taken from the cache if present
    """
    try:
        return parsed_rule_cache[rule_group]
    except KeyError:
        # Not parsed yet; parse below, outside the handler, so a parser error
        # is raised without a chained KeyError context.
        pass
    parsed = parser.parse(rule_group).as_list()[0]
    parsed_rule_cache[rule_group] = parsed
    return parsed
|
||||
|
||||
def __match_group(self, torrent: TorrentInfo, rule_group: Union[list, str],
|
||||
mediainfo: MediaInfo) -> Optional[bool]:
|
||||
"""
|
||||
@@ -173,12 +238,13 @@ class FilterModule(_ModuleBase):
|
||||
"""
|
||||
判断种子是否匹配规则项
|
||||
"""
|
||||
if not self.rule_set.get(rule_name):
|
||||
rule = self.rule_set.get(rule_name)
|
||||
if not rule:
|
||||
# 规则不存在
|
||||
logger.debug(f"规则 {rule_name} 不存在")
|
||||
return False
|
||||
# TMDB规则
|
||||
tmdb = self.rule_set[rule_name].get("tmdb")
|
||||
tmdb = rule.get("tmdb")
|
||||
# 符合TMDB规则的直接返回True,即不过滤
|
||||
if tmdb and self.__match_tmdb(tmdb, mediainfo):
|
||||
logger.debug(f"种子 {torrent.site_name} - {torrent.title} 符合 {rule_name} 的TMDB规则,匹配成功")
|
||||
@@ -187,7 +253,7 @@ class FilterModule(_ModuleBase):
|
||||
content = f"{torrent.title} {torrent.description} {' '.join(torrent.labels or [])}"
|
||||
# 只匹配指定关键字
|
||||
match_content = []
|
||||
matchs = self.rule_set[rule_name].get("match") or []
|
||||
matchs = rule.get("match") or []
|
||||
if matchs:
|
||||
for match in matchs:
|
||||
if not hasattr(torrent, match):
|
||||
@@ -202,27 +268,27 @@ class FilterModule(_ModuleBase):
|
||||
if match_content:
|
||||
content = " ".join(match_content)
|
||||
# 包含规则项
|
||||
includes = self.rule_set[rule_name].get("include") or []
|
||||
includes = rule.get("include") or []
|
||||
if not isinstance(includes, list):
|
||||
includes = [includes]
|
||||
# 排除规则项
|
||||
excludes = self.rule_set[rule_name].get("exclude") or []
|
||||
excludes = rule.get("exclude") or []
|
||||
if not isinstance(excludes, list):
|
||||
excludes = [excludes]
|
||||
# 大小范围规则项
|
||||
size_range = self.rule_set[rule_name].get("size_range")
|
||||
size_range = rule.get("size_range")
|
||||
# 做种人数规则项
|
||||
seeders = self.rule_set[rule_name].get("seeders")
|
||||
seeders = rule.get("seeders")
|
||||
# FREE规则
|
||||
downloadvolumefactor = self.rule_set[rule_name].get("downloadvolumefactor")
|
||||
downloadvolumefactor = rule.get("downloadvolumefactor")
|
||||
# 发布时间规则
|
||||
pubdate: str = self.rule_set[rule_name].get("publish_time")
|
||||
if includes and not any(re.search(r"%s" % include, content, re.IGNORECASE) for include in includes):
|
||||
pubdate: str = rule.get("publish_time")
|
||||
if includes and not any(_regex_search(include, content) for include in includes):
|
||||
# 未发现任何包含项
|
||||
logger.debug(f"种子 {torrent.site_name} - {torrent.title} 不包含任何项 {includes}")
|
||||
return False
|
||||
for exclude in excludes:
|
||||
if re.search(r"%s" % exclude, content, re.IGNORECASE):
|
||||
if _regex_search(exclude, content):
|
||||
# 发现排除项
|
||||
logger.debug(f"种子 {torrent.site_name} - {torrent.title} 包含 {exclude}")
|
||||
return False
|
||||
@@ -247,7 +313,7 @@ class FilterModule(_ModuleBase):
|
||||
# 种子发布时间
|
||||
pub_minutes = torrent.pub_minutes()
|
||||
# 发布时间规则
|
||||
pub_times = [float(t) for t in pubdate.split("-")]
|
||||
pub_times = _parse_publish_time(pubdate)
|
||||
if len(pub_times) == 1:
|
||||
# 发布时间小于规则
|
||||
if pub_minutes < pub_times[0]:
|
||||
@@ -319,22 +385,17 @@ class FilterModule(_ModuleBase):
|
||||
# 每集大小
|
||||
torrent_size = torrent.size / episode_count
|
||||
# 大小范围
|
||||
size_range = size_range.strip()
|
||||
if size_range.find("-") != -1:
|
||||
size_rule, size_min, size_max = _parse_size_range(size_range)
|
||||
if size_rule == "between":
|
||||
# 区间
|
||||
size_min, size_max = size_range.split("-")
|
||||
size_min = float(size_min.strip()) * 1024 * 1024
|
||||
size_max = float(size_max.strip()) * 1024 * 1024
|
||||
if size_min <= torrent_size <= size_max:
|
||||
return True
|
||||
elif size_range.startswith(">"):
|
||||
elif size_rule == "gte":
|
||||
# 大于
|
||||
size_min = float(size_range[1:].strip()) * 1024 * 1024
|
||||
if torrent_size >= size_min:
|
||||
return True
|
||||
elif size_range.startswith("<"):
|
||||
elif size_rule == "lte":
|
||||
# 小于
|
||||
size_max = float(size_range[1:].strip()) * 1024 * 1024
|
||||
if torrent_size <= size_max:
|
||||
return True
|
||||
return False
|
||||
|
||||
104
tests/test_torrent_filter.py
Normal file
104
tests/test_torrent_filter.py
Normal file
@@ -0,0 +1,104 @@
|
||||
import unittest
|
||||
from types import SimpleNamespace
|
||||
|
||||
from app.core.context import TorrentInfo
|
||||
from app.helper.torrent import TorrentHelper
|
||||
from app.modules.filter import FilterModule
|
||||
|
||||
|
||||
class _RuleHelper:
    """
    Lightweight in-memory rule store for the filter-module tests, so they do
    not depend on the real system configuration.
    """

    def __init__(self, groups):
        # Rule-group objects; only their ``name`` attribute is read here.
        self._groups = groups

    def get_rule_group_by_media(self, media=None, group_names=None):  # noqa: ARG002
        """
        Return the stored groups, optionally restricted by name.

        :param media: accepted but not used by this stand-in
        :param group_names: names to keep; when empty or None, the stored
            collection itself is returned
        :return: the stored groups, or a new list of those whose ``name`` is
            in ``group_names``
        """
        if group_names:
            selected = []
            for group in self._groups:
                if group.name in group_names:
                    selected.append(group)
            return selected
        return self._groups
|
||||
|
||||
|
||||
def _build_filter_module(rule_string: str, rule_set: dict) -> FilterModule:
    """
    Build a FilterModule wired to in-memory test data.

    The module's ``rulehelper`` is replaced by a ``_RuleHelper`` holding one
    group named "test" that carries ``rule_string``, and its ``rule_set`` is
    replaced by the given mapping.

    :param rule_string: priority rule expression of the "test" group
    :param rule_set: rule definitions referenced from ``rule_string``
    :return: the configured FilterModule instance
    """
    filter_module = FilterModule()
    test_group = SimpleNamespace(name="test", rule_string=rule_string)
    filter_module.rulehelper = _RuleHelper([test_group])
    filter_module.rule_set = rule_set
    return filter_module
|
||||
|
||||
|
||||
class TorrentFilterTest(unittest.TestCase):
    """
    Regression tests pinning the results of torrent filtering: priority rule
    groups in FilterModule and the additional filter in TorrentHelper.
    """

    def test_filter_torrents_keeps_priority_and_boolean_rule_semantics(self):
        """
        "HDR & !BLU > DV": the HDR non-BluRay torrent gets order 100, the DOVI
        torrent gets order 99, and the HDR BluRay torrent is dropped.
        """
        rule_set = {
            "HDR": {"include": "HDR"},
            "DV": {"include": "DOVI"},
            "BLU": {"include": "BluRay"},
        }
        module = _build_filter_module(rule_string="HDR & !BLU > DV", rule_set=rule_set)
        hdr_web = TorrentInfo(title="Movie HDR WEB-DL", description="")
        dovi = TorrentInfo(title="Movie DOVI", description="")
        hdr_bluray = TorrentInfo(title="Movie HDR BluRay", description="")

        filtered = module.filter_torrents(
            rule_groups=["test"],
            torrent_list=[hdr_web, dovi, hdr_bluray],
        )

        self.assertEqual([hdr_web, dovi], filtered)
        first, second = filtered[0], filtered[1]
        self.assertEqual(100, first.pri_order)
        self.assertEqual(99, second.pri_order)

    def test_filter_torrents_keeps_lazy_priority_level_parsing(self):
        """
        With "KEEP > (" a torrent matching the first level is still returned
        with order 100; the second level "(" must not get in the way.
        """
        module = _build_filter_module(
            rule_string="KEEP > (",
            rule_set={"KEEP": {"include": "Movie"}},
        )
        torrent = TorrentInfo(title="Movie", description="")

        filtered = module.filter_torrents(rule_groups=["test"], torrent_list=[torrent])

        self.assertEqual([torrent], filtered)
        self.assertEqual(100, torrent.pri_order)

    def test_filter_torrent_keeps_extra_filter_semantics(self):
        """
        The additional filter accepts a torrent satisfying every rule and
        rejects it on an exclude hit or on a size above the "<" limit.
        """
        torrent = TorrentInfo(
            title="Movie 1080p HDR",
            description="中字",
            labels=["free"],
            size=3 * 1024 * 1024 * 1024,
            uploadvolumefactor=1,
            downloadvolumefactor=0,
        )

        def passes(filter_params):
            # Run the additional filter for the shared torrent.
            return TorrentHelper.filter_torrent(
                torrent_info=torrent,
                filter_params=filter_params,
            )

        matching_params = {
            "include": "中字|free",
            "exclude": "BluRay",
            "resolution": "1080p",
            "effect": "HDR",
            "size": "1000-4000",
        }
        self.assertTrue(passes(matching_params))
        self.assertFalse(passes({"exclude": "HDR"}))
        self.assertFalse(passes({"size": "<1000"}))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
Reference in New Issue
Block a user