perf: optimize media recognition internals

2026-06-07 08:41:16 +08:00 · 2026-05-15 13:37:36 +08:00
parent b2a18f9ae4
commit 2831eecbeb
9 changed files with 396 additions and 254 deletions
--- a/app/core/meta/customization.py
+++ b/app/core/meta/customization.py
@@ -14,6 +14,7 @@ class CustomizationMatcher(metaclass=Singleton):
        self.systemconfig = SystemConfigOper()
        self.customization = None
        self.custom_separator = None
+        self._customization_re_cache = {}

    @staticmethod
    def _normalize_customization(customization):
@@ -42,10 +43,14 @@ class CustomizationMatcher(metaclass=Singleton):
            return ""
        self.customization = "|".join([f"({item})" for item in customization])

-        customization_re = re.compile(r"%s" % self.customization)
+        customization_re = self._customization_re_cache.get(self.customization)
+        if not customization_re:
+            # 配置每次读取、编译结果按规则缓存，兼顾实时生效和高频识别性能。
+            customization_re = re.compile(r"%s" % self.customization)
+            self._customization_re_cache[self.customization] = customization_re
        # 处理重复多次的情况，保留先后顺序（按添加自定义占位符的顺序）
        unique_customization = {}
-        for item in re.findall(customization_re, title):
+        for item in customization_re.findall(title):
            if not isinstance(item, tuple):
                item = (item,)
            for i in range(len(item)):
--- a/app/core/meta/metavideo.py
+++ b/app/core/meta/metavideo.py
@@ -105,6 +105,7 @@ class MetaVideo(MetaBase):
        tokens = Tokens(title)
        # 实例化StreamingPlatforms对象
        streaming_platforms = StreamingPlatforms()
+        media_exts = settings.RMT_MEDIAEXT + settings.RMT_SUBEXT + settings.RMT_AUDIOEXT
        # 解析名称、年份、季、集、资源类型、分辨率等
        token = tokens.get_next()
        while token:
@@ -113,7 +114,7 @@ class MetaVideo(MetaBase):
            self.__init_part(token, tokens)
            # 标题
            if self._continue_flag:
-                self.__init_name(token)
+                self.__init_name(token, media_exts)
            # 年份
            if self._continue_flag:
                self.__init_year(token)
@@ -226,7 +227,7 @@ class MetaVideo(MetaBase):
                name = None
        return name

-    def __init_name(self, token: Optional[str]):
+    def __init_name(self, token: Optional[str], media_exts: list):
        """
        识别名称
        """
@@ -313,7 +314,6 @@ class MetaVideo(MetaBase):
                return
            else:
                # 后缀名不要
-                media_exts = settings.RMT_MEDIAEXT + settings.RMT_SUBEXT + settings.RMT_AUDIOEXT
                if ".%s".lower() % token in media_exts:
                    return
                # 英文或者英文+数字，拼装起来
--- a/app/core/meta/releasegroup.py
+++ b/app/core/meta/releasegroup.py
@@ -86,6 +86,18 @@ class ReleaseGroupsMatcher(metaclass=Singleton):
            for release_group in site_groups:
                release_groups.append(release_group)
        self.__release_groups = '|'.join(release_groups)
+        self.systemconfig = SystemConfigOper()
+        self.__groups_re_cache = {}
+
+    def __get_groups_re(self, groups: str):
+        """
+        Return the compiled release-group regex for ``groups``; rules are usually long, so the result is cached by rule text instead of being recompiled for every title.
+        """
+        groups_re = self.__groups_re_cache.get(groups)
+        if not groups_re:  # cache miss: compile once and remember it under the rule text
+            groups_re = re.compile(r"(?<=[-@\[￡【&])(?:(?:%s))(?=$|[@.\s\]\[】&])" % groups, re.I)
+            self.__groups_re_cache[groups] = groups_re
+        return groups_re

    def match(self, title: str = None, groups: str = None):
        """
@@ -97,7 +109,7 @@ class ReleaseGroupsMatcher(metaclass=Singleton):
            return ""
        if not groups:
            # 自定义组
-            custom_release_groups = SystemConfigOper().get(SystemConfigKey.CustomReleaseGroups)
+            custom_release_groups = self.systemconfig.get(SystemConfigKey.CustomReleaseGroups)
            if isinstance(custom_release_groups, list):
                custom_release_groups = list(filter(None, custom_release_groups))
            if custom_release_groups:
@@ -106,9 +118,9 @@ class ReleaseGroupsMatcher(metaclass=Singleton):
            else:
                groups = self.__release_groups
        title = f"{title} "
-        groups_re = re.compile(r"(?<=[-@\[￡【&])(?:(?:%s))(?=$|[@.\s\]\[】&])" % groups, re.I)
+        groups_re = self.__get_groups_re(groups)
        unique_groups = []
-        for item in re.findall(groups_re, title):
+        for item in groups_re.findall(title):
            item_str = item[0] if isinstance(item, tuple) else item
            if item_str not in unique_groups:
                unique_groups.append(item_str)
--- a/app/core/meta/words.py
+++ b/app/core/meta/words.py
@@ -1,4 +1,4 @@
-from typing import List, Tuple
+from typing import List, Optional, Tuple

 import cn2an
 import regex as re
@@ -9,6 +9,10 @@ from app.schemas.types import SystemConfigKey
 from app.utils.singleton import Singleton


+_COMBINED_WORD_RE = re.compile(r'^\s*(.*?)\s*=>\s*(.*?)\s*&&\s*(.*?)\s*<>\s*(.*?)\s*>>\s*(.*?)\s*$')
+_LEADING_ZERO_RE = re.compile(r"^0+")
+
+
 class WordsMatcher(metaclass=Singleton):

    def __init__(self):
@@ -28,37 +32,23 @@ class WordsMatcher(metaclass=Singleton):
            if not word or word.startswith("#"):
                continue
            try:
-                if word.count(" => ") and word.count(" && ") and word.count(" >> ") and word.count(" <> "):
-                    # 替换词
-                    thc = str(re.findall(r'(.*?)\s*=>', word)[0]).strip()
-                    # 被替换词
-                    bthc = str(re.findall(r'=>\s*(.*?)\s*&&', word)[0]).strip()
-                    # 集偏移前字段
-                    pyq = str(re.findall(r'&&\s*(.*?)\s*<>', word)[0]).strip()
-                    # 集偏移后字段
-                    pyh = str(re.findall(r'<>(.*?)\s*>>', word)[0]).strip()
-                    # 集偏移
-                    offsets = str(re.findall(r'>>\s*(.*?)$', word)[0]).strip()
+                word_info = self.__parse_word(word)
+                if not word_info:
+                    continue
+                word_type, params = word_info
+                if word_type == "replace_and_offset":
+                    thc, bthc, pyq, pyh, offsets = params
                    # 替换词
                    title, message, state = self.__replace_regex(title, thc, bthc)
                    if state:
                        # 替换词成功再进行集偏移
                        title, message, state = self.__episode_offset(title, pyq, pyh, offsets)
-                elif word.count(" => "):
-                    # 替换词
-                    strings = word.split(" => ")
-                    title, message, state = self.__replace_regex(title, strings[0], strings[1])
-                elif word.count(" >> ") and word.count(" <> "):
-                    # 集偏移
-                    strings = word.split(" <> ")
-                    offsets = strings[1].split(" >> ")
-                    strings[1] = offsets[0]
-                    title, message, state = self.__episode_offset(title, strings[0], strings[1], offsets[1])
-                else:
-                    # 屏蔽词
-                    if not word.strip():
-                        continue
-                    title, message, state = self.__replace_regex(title, word, "")
+                elif word_type == "replace":
+                    title, message, state = self.__replace_regex(title, params[0], params[1])
+                elif word_type == "offset":
+                    title, message, state = self.__episode_offset(title, params[0], params[1], params[2])
+                else:  # block
+                    title, message, state = self.__replace_regex(title, params[0], "")

                if state:
                    appley_words.append(word)
@@ -68,16 +58,37 @@ class WordsMatcher(metaclass=Singleton):

        return title, appley_words

+    @staticmethod
+    def __parse_word(word: str) -> Optional[Tuple[str, Tuple[str, ...]]]:
+        """
+        Classify a recognition word and split it into fields: returns (kind, params) with kind in "replace_and_offset" / "replace" / "offset" / "block", or None for a whitespace-only word.
+        The combined form keeps its original field meanings; its fields are extracted with one regex match, and ValueError is raised if that match fails.
+        """
+        if word.count(" => ") and word.count(" && ") and word.count(" >> ") and word.count(" <> "):
+            word_match = _COMBINED_WORD_RE.match(word)
+            if not word_match:
+                raise ValueError("复杂识别词格式不正确")  # all four separators present but not in the expected layout
+            return "replace_and_offset", tuple(item.strip() for item in word_match.groups())
+        if word.count(" => "):  # plain replacement: text before and after the first " => "
+            strings = word.split(" => ")
+            return "replace", (strings[0], strings[1])
+        if word.count(" >> ") and word.count(" <> "):  # episode offset: "<front> <> <back> >> <offset>"
+            strings = word.split(" <> ")
+            offsets = strings[1].split(" >> ")
+            strings[1] = offsets[0]
+            return "offset", (strings[0], strings[1], offsets[1])
+        if not word.strip():  # nothing but whitespace: no rule to apply
+            return None
+        return "block", (word,)  # no separators: the whole word is the pattern
+
    @staticmethod
    def __replace_regex(title: str, replaced: str, replace: str) -> Tuple[str, str, bool]:
        """
        正则替换
        """
        try:
-            if not re.findall(r'%s' % replaced, title):
-                return title, "", False
-            else:
-                return re.sub(r'%s' % replaced, r'%s' % replace, title), "", True
+            replaced_re = re.compile(r'%s' % replaced)
+            title, count = replaced_re.subn(r'%s' % replace, title)
+            return title, "", count > 0
        except Exception as err:
            logger.warn(f"自定义识别词正则替换失败：{str(err)} - 标题：{title}，被替换词：{replaced}，替换词：{replace}")
            return title, str(err), False
@@ -112,9 +123,9 @@ class WordsMatcher(metaclass=Singleton):
                if not episode_num_str.isdigit():
                    episode_num_offset_str = cn2an.an2cn(episode_num_offset_int, "low")
                else:
-                    count_0 = re.findall(r"^0+", episode_num_str)
+                    count_0 = _LEADING_ZERO_RE.search(episode_num_str)
                    if count_0:
-                        episode_num_offset_str = f"{count_0[0]}{episode_num_offset_int}"
+                        episode_num_offset_str = f"{count_0.group(0)}{episode_num_offset_int}"
                    else:
                        episode_num_offset_str = str(episode_num_offset_int)
                episode_nums_offset_str.append(episode_num_offset_str)
--- a/app/core/metainfo.py
+++ b/app/core/metainfo.py
@@ -14,6 +14,60 @@ from app.log import logger
 from app.schemas.types import MediaType


+_ANIME_BRACKET_RE = re.compile(r'【[+0-9XVPI-]+】\s*【', re.IGNORECASE)
+_ANIME_DASH_EPISODE_RE = re.compile(r'\s+-\s+[\dv]{1,4}\s+', re.IGNORECASE)
+_VIDEO_SEASON_EPISODE_RE = re.compile(
+    r"S\d{2}\s*-\s*S\d{2}|S\d{2}|\s+S\d{1,2}|"
+    r"EP?\d{2,4}\s*-\s*EP?\d{2,4}|EP?\d{2,4}|\s+EP?\d{1,4}",
+    re.IGNORECASE,
+)
+_ANIME_SQUARE_BRACKET_RE = re.compile(r'\[[+0-9XVPI-]+]\s*\[', re.IGNORECASE)
+
+_BRACED_METAINFO_RE = re.compile(r'(?<={\[)[\W\w]+(?=]})')
+_BRACED_TMDBID_RE = re.compile(r'(?<=tmdbid=)\d+')
+_BRACED_DOUBANID_RE = re.compile(r'(?<=doubanid=)\d+')
+_BRACED_TYPE_RE = re.compile(r'(?<=type=)\w+')
+_BRACED_BEGIN_SEASON_RE = re.compile(r'(?<=s=)\d+')
+_BRACED_END_SEASON_RE = re.compile(r'(?<=s=\d+-)\d+')
+_BRACED_BEGIN_EPISODE_RE = re.compile(r'(?<=e=)\d+')
+_BRACED_END_EPISODE_RE = re.compile(r'(?<=e=\d+-)\d+')
+_EMBY_TMDB_RE_LIST = (
+    re.compile(r'\[tmdbid[=\-](\d+)\]'),
+    re.compile(r'\[tmdb[=\-](\d+)\]'),
+    re.compile(r'\{tmdbid[=\-](\d+)\}'),
+    re.compile(r'\{tmdb[=\-](\d+)\}'),
+)
+
+
+def _empty_metainfo() -> dict:
+    """
+    Return a new dict holding the default media-tag structure (every field None), so separate recognition requests never share mutable state.
+    """
+    return {
+        'tmdbid': None,
+        'doubanid': None,
+        'type': None,
+        'begin_season': None,
+        'end_season': None,
+        'total_season': None,
+        'begin_episode': None,
+        'end_episode': None,
+        'total_episode': None,
+    }
+
+
+def _apply_range_total(metainfo: dict, begin_key: str, end_key: str, total_key: str) -> None:
+    """
+    Fill metainfo[total_key] in place from a season/episode range; a reversed range is swapped first, preserving the earlier compatibility behaviour.
+    """
+    if metainfo.get(begin_key) and metainfo.get(end_key):  # falsy values (None or 0) count as missing
+        if metainfo[begin_key] > metainfo[end_key]:
+            metainfo[begin_key], metainfo[end_key] = metainfo[end_key], metainfo[begin_key]
+        metainfo[total_key] = metainfo[end_key] - metainfo[begin_key] + 1
+    elif metainfo.get(begin_key) and not metainfo.get(end_key):  # begin without end: a single season/episode
+        metainfo[total_key] = 1
+
+
 def _build_meta_info(
        title: str,
        subtitle: Optional[str] = None,
@@ -30,10 +84,11 @@ def _build_meta_info(
    title, metainfo = find_metainfo(title)
    # 判断是否处理文件
    media_exts = settings.RMT_MEDIAEXT + settings.RMT_SUBEXT + settings.RMT_AUDIOEXT
-    if title and Path(title).suffix.lower() in media_exts:
+    title_path = Path(title) if title else None
+    if title_path and title_path.suffix.lower() in media_exts:
        isfile = True
        # 去掉后缀
-        title = Path(title).stem
+        title = title_path.stem
    else:
        isfile = False
    # 识别
@@ -115,15 +170,13 @@ def is_anime(name: str) -> bool:
    """
    if not name:
        return False
-    if re.search(r'【[+0-9XVPI-]+】\s*【', name, re.IGNORECASE):
+    if _ANIME_BRACKET_RE.search(name):
        return True
-    if re.search(r'\s+-\s+[\dv]{1,4}\s+', name, re.IGNORECASE):
+    if _ANIME_DASH_EPISODE_RE.search(name):
        return True
-    if re.search(r"S\d{2}\s*-\s*S\d{2}|S\d{2}|\s+S\d{1,2}|EP?\d{2,4}\s*-\s*EP?\d{2,4}|EP?\d{2,4}|\s+EP?\d{1,4}",
-                 name,
-                 re.IGNORECASE):
+    if _VIDEO_SEASON_EPISODE_RE.search(name):
        return False
-    if re.search(r'\[[+0-9XVPI-]+]\s*\[', name, re.IGNORECASE):
+    if _ANIME_SQUARE_BRACKET_RE.search(name):
        return True
    return False

@@ -132,95 +185,62 @@ def find_metainfo(title: str) -> Tuple[str, dict]:
    """
    从标题中提取媒体信息
    """
-    metainfo = {
-        'tmdbid': None,
-        'doubanid': None,
-        'type': None,
-        'begin_season': None,
-        'end_season': None,
-        'total_season': None,
-        'begin_episode': None,
-        'end_episode': None,
-        'total_episode': None,
-    }
+    metainfo = _empty_metainfo()
    if not title:
        return title, metainfo
    # 从标题中提取媒体信息 格式为{[tmdbid=xxx;type=xxx;s=xxx;e=xxx]}
-    results = re.findall(r'(?<={\[)[\W\w]+(?=]})', title)
+    results = _BRACED_METAINFO_RE.findall(title)
    if results:
        for result in results:
            # 查找tmdbid信息
-            tmdbid = re.findall(r'(?<=tmdbid=)\d+', result)
-            if tmdbid and tmdbid[0].isdigit():
-                metainfo['tmdbid'] = tmdbid[0]
+            tmdbid = _BRACED_TMDBID_RE.search(result)
+            if tmdbid and tmdbid.group(0).isdigit():
+                metainfo['tmdbid'] = tmdbid.group(0)
            # 查找豆瓣id信息
-            doubanid = re.findall(r'(?<=doubanid=)\d+', result)
-            if doubanid and doubanid[0].isdigit():
-                metainfo['doubanid'] = doubanid[0]
+            doubanid = _BRACED_DOUBANID_RE.search(result)
+            if doubanid and doubanid.group(0).isdigit():
+                metainfo['doubanid'] = doubanid.group(0)
            # 查找媒体类型
-            mtype = re.findall(r'(?<=type=)\w+', result)
+            mtype = _BRACED_TYPE_RE.search(result)
            if mtype:
-                if mtype[0] == "movies":
+                media_type = mtype.group(0)
+                if media_type == "movies":
                    metainfo['type'] = MediaType.MOVIE
-                elif mtype[0] == "tv":
+                elif media_type == "tv":
                    metainfo['type'] = MediaType.TV
            # 查找季信息
-            begin_season = re.findall(r'(?<=s=)\d+', result)
-            if begin_season and begin_season[0].isdigit():
-                metainfo['begin_season'] = int(begin_season[0])
-            end_season = re.findall(r'(?<=s=\d+-)\d+', result)
-            if end_season and end_season[0].isdigit():
-                metainfo['end_season'] = int(end_season[0])
+            begin_season = _BRACED_BEGIN_SEASON_RE.search(result)
+            if begin_season and begin_season.group(0).isdigit():
+                metainfo['begin_season'] = int(begin_season.group(0))
+            end_season = _BRACED_END_SEASON_RE.search(result)
+            if end_season and end_season.group(0).isdigit():
+                metainfo['end_season'] = int(end_season.group(0))
            # 查找集信息
-            begin_episode = re.findall(r'(?<=e=)\d+', result)
-            if begin_episode and begin_episode[0].isdigit():
-                metainfo['begin_episode'] = int(begin_episode[0])
-            end_episode = re.findall(r'(?<=e=\d+-)\d+', result)
-            if end_episode and end_episode[0].isdigit():
-                metainfo['end_episode'] = int(end_episode[0])
+            begin_episode = _BRACED_BEGIN_EPISODE_RE.search(result)
+            if begin_episode and begin_episode.group(0).isdigit():
+                metainfo['begin_episode'] = int(begin_episode.group(0))
+            end_episode = _BRACED_END_EPISODE_RE.search(result)
+            if end_episode and end_episode.group(0).isdigit():
+                metainfo['end_episode'] = int(end_episode.group(0))
            # 去除title中该部分
            if tmdbid or mtype or begin_season or end_season or begin_episode or end_episode:
                title = title.replace(f"{{[{result}]}}", '')

-    # 支持Emby格式的ID标签
-    # 1. [tmdbid=xxxx] 或 [tmdbid-xxxx] 格式
-    tmdb_match = re.search(r'\[tmdbid[=\-](\d+)\]', title)
+    # 支持Emby格式的ID标签；第一个 [tmdbid] 历史上始终优先处理，用于覆盖前面 {[...]} 中的旧标签。
+    tmdb_match = _EMBY_TMDB_RE_LIST[0].search(title)
    if tmdb_match:
        metainfo['tmdbid'] = tmdb_match.group(1)
-        title = re.sub(r'\[tmdbid[=\-](\d+)\]', '', title).strip()
-
-    # 2. [tmdb=xxxx] 或 [tmdb-xxxx] 格式
-    if not metainfo['tmdbid']:
-        tmdb_match = re.search(r'\[tmdb[=\-](\d+)\]', title)
-        if tmdb_match:
-            metainfo['tmdbid'] = tmdb_match.group(1)
-            title = re.sub(r'\[tmdb[=\-](\d+)\]', '', title).strip()
-
-    # 3. {tmdbid=xxxx} 或 {tmdbid-xxxx} 格式
-    if not metainfo['tmdbid']:
-        tmdb_match = re.search(r'\{tmdbid[=\-](\d+)\}', title)
-        if tmdb_match:
-            metainfo['tmdbid'] = tmdb_match.group(1)
-            title = re.sub(r'\{tmdbid[=\-](\d+)\}', '', title).strip()
-
-    # 4. {tmdb=xxxx} 或 {tmdb-xxxx} 格式
-    if not metainfo['tmdbid']:
-        tmdb_match = re.search(r'\{tmdb[=\-](\d+)\}', title)
-        if tmdb_match:
-            metainfo['tmdbid'] = tmdb_match.group(1)
-            title = re.sub(r'\{tmdb[=\-](\d+)\}', '', title).strip()
+        title = _EMBY_TMDB_RE_LIST[0].sub('', title).strip()
+    elif not metainfo['tmdbid']:
+        # 保持原有优先级：[tmdbid] > [tmdb] > {tmdbid} > {tmdb}
+        for tmdb_re in _EMBY_TMDB_RE_LIST[1:]:
+            tmdb_match = tmdb_re.search(title)
+            if tmdb_match:
+                metainfo['tmdbid'] = tmdb_match.group(1)
+                title = tmdb_re.sub('', title).strip()
+                break

    # 计算季集总数
-    if metainfo.get('begin_season') and metainfo.get('end_season'):
-        if metainfo['begin_season'] > metainfo['end_season']:
-            metainfo['begin_season'], metainfo['end_season'] = metainfo['end_season'], metainfo['begin_season']
-        metainfo['total_season'] = metainfo['end_season'] - metainfo['begin_season'] + 1
-    elif metainfo.get('begin_season') and not metainfo.get('end_season'):
-        metainfo['total_season'] = 1
-    if metainfo.get('begin_episode') and metainfo.get('end_episode'):
-        if metainfo['begin_episode'] > metainfo['end_episode']:
-            metainfo['begin_episode'], metainfo['end_episode'] = metainfo['end_episode'], metainfo['begin_episode']
-        metainfo['total_episode'] = metainfo['end_episode'] - metainfo['begin_episode'] + 1
-    elif metainfo.get('begin_episode') and not metainfo.get('end_episode'):
-        metainfo['total_episode'] = 1
+    _apply_range_total(metainfo, 'begin_season', 'end_season', 'total_season')
+    _apply_range_total(metainfo, 'begin_episode', 'end_episode', 'total_episode')
    return title, metainfo