feat(agent): add tools for querying and updating custom identifiers

2026-05-06 20:42:43 +08:00 · 2026-04-07 09:00:15 +08:00
parent 02cb5dfc31
commit ac9c9598f4
5 changed files with 394 additions and 3 deletions
--- a/app/agent/init.py
+++ b/app/agent/init.py
@@ -251,7 +251,7 @@ class MoviePilotAgent:
                                        if start_idx > 0:
                                            on_token(buffer[:start_idx])
                                        in_think_tag = True
-                                        buffer = buffer[start_idx + 7 :]
+                                        buffer = buffer[start_idx + 7:]
                                    else:
                                        # 检查是否以 <think> 的前缀结尾
                                        partial_match = False
@@ -269,7 +269,7 @@ class MoviePilotAgent:
                                    end_idx = buffer.find("</think>")
                                    if end_idx != -1:
                                        in_think_tag = False
-                                        buffer = buffer[end_idx + 8 :]
+                                        buffer = buffer[end_idx + 8:]
                                    else:
                                        # 检查是否以 </think> 的前缀结尾
                                        partial_match = False
@@ -619,7 +619,7 @@ class AgentManager:
                await self._session_workers[session_id]
            except asyncio.CancelledError:
                pass
-            self._session_workers.pop(session_id, None)
+            self._session_workers.pop(session_id, None)  # noqa
            stopped = True

        # 清空队列中待处理的消息
--- a/app/agent/tools/factory.py
+++ b/app/agent/tools/factory.py
@@ -52,6 +52,8 @@ from app.agent.tools.impl.query_installed_plugins import QueryInstalledPluginsTo
 from app.agent.tools.impl.query_plugin_capabilities import QueryPluginCapabilitiesTool
 from app.agent.tools.impl.run_slash_command import RunSlashCommandTool
 from app.agent.tools.impl.list_slash_commands import ListSlashCommandsTool
+from app.agent.tools.impl.query_custom_identifiers import QueryCustomIdentifiersTool
+from app.agent.tools.impl.update_custom_identifiers import UpdateCustomIdentifiersTool
 from app.core.plugin import PluginManager
 from app.log import logger
 from .base import MoviePilotTool
@@ -128,6 +130,8 @@ class MoviePilotToolFactory:
            QueryPluginCapabilitiesTool,
            RunSlashCommandTool,
            ListSlashCommandsTool,
+            QueryCustomIdentifiersTool,
+            UpdateCustomIdentifiersTool,
        ]
        # 创建内置工具
        for ToolClass in tool_definitions:
--- a/app/agent/tools/impl/query_custom_identifiers.py
+++ b/app/agent/tools/impl/query_custom_identifiers.py
@@ -0,0 +1,66 @@
+"""查询自定义识别词工具"""
+
+import json
+from typing import Optional, Type
+
+from pydantic import BaseModel, Field
+
+from app.agent.tools.base import MoviePilotTool
+from app.db.systemconfig_oper import SystemConfigOper
+from app.log import logger
+from app.schemas.types import SystemConfigKey
+
+
+class QueryCustomIdentifiersInput(BaseModel):
+    """查询自定义识别词工具的输入参数模型"""
+
+    explanation: str = Field(
+        ...,
+        description="Clear explanation of why this tool is being used in the current context",
+    )
+
+
+class QueryCustomIdentifiersTool(MoviePilotTool):
+    name: str = "query_custom_identifiers"
+    description: str = (
+        "Query all currently configured custom identifiers (自定义识别词). "
+        "Returns the list of identifier rules used for preprocessing torrent/file names before media recognition. "
+        "Use this tool to check existing rules before adding new ones to avoid duplicates."
+    )
+    args_schema: Type[BaseModel] = QueryCustomIdentifiersInput
+
+    def get_tool_message(self, **kwargs) -> Optional[str]:
+        """生成友好的提示消息"""
+        return "正在查询自定义识别词"
+
+    async def run(self, **kwargs) -> str:
+        logger.info(f"执行工具: {self.name}")
+        try:
+            system_config_oper = SystemConfigOper()
+            identifiers = system_config_oper.get(SystemConfigKey.CustomIdentifiers)
+            if identifiers:
+                return json.dumps(
+                    {
+                        "success": True,
+                        "count": len(identifiers),
+                        "identifiers": identifiers,
+                    },
+                    ensure_ascii=False,
+                    indent=2,
+                )
+            return json.dumps(
+                {
+                    "success": True,
+                    "count": 0,
+                    "identifiers": [],
+                    "message": "当前没有配置自定义识别词",
+                },
+                ensure_ascii=False,
+                indent=2,
+            )
+        except Exception as e:
+            logger.error(f"查询自定义识别词失败: {e}")
+            return json.dumps(
+                {"success": False, "message": f"查询自定义识别词时发生错误: {str(e)}"},
+                ensure_ascii=False,
+            )
--- a/app/agent/tools/impl/update_custom_identifiers.py
+++ b/app/agent/tools/impl/update_custom_identifiers.py
@@ -0,0 +1,95 @@
+"""更新自定义识别词工具"""
+
+import json
+from typing import List, Optional, Type
+
+from pydantic import BaseModel, Field
+
+from app.agent.tools.base import MoviePilotTool
+from app.db.systemconfig_oper import SystemConfigOper
+from app.log import logger
+from app.schemas.types import SystemConfigKey
+
+
+class UpdateCustomIdentifiersInput(BaseModel):
+    """更新自定义识别词工具的输入参数模型"""
+
+    explanation: str = Field(
+        ...,
+        description="Clear explanation of why this tool is being used in the current context",
+    )
+    identifiers: List[str] = Field(
+        ...,
+        description=(
+            "The complete list of custom identifier rules to save. "
+            "This REPLACES the entire existing list. "
+            "Always query existing identifiers first, merge new rules, then pass the full list."
+        ),
+    )
+
+
+class UpdateCustomIdentifiersTool(MoviePilotTool):
+    name: str = "update_custom_identifiers"
+    description: str = (
+        "Update the full list of custom identifiers (自定义识别词) used for preprocessing torrent/file names. "
+        "This tool REPLACES all existing identifier rules with the provided list. "
+        "IMPORTANT: Always use 'query_custom_identifiers' first to get existing rules, "
+        "then merge new rules into the list before calling this tool to avoid accidentally deleting existing rules. "
+        "Supported rule formats (spaces around operators are required): "
+        "1) Block word: just the word/regex to remove; "
+        "2) Replacement: '被替换词 => 替换词'; "
+        "3) Episode offset: '前定位词 <> 后定位词 >> EP±N'; "
+        "4) Combined: '被替换词 => 替换词 && 前定位词 <> 后定位词 >> EP±N'; "
+        "Lines starting with '#' are comments. "
+        "The replacement target supports: {[tmdbid=xxx;type=movie/tv;s=xxx;e=xxx]} for direct TMDB ID matching."
+    )
+    args_schema: Type[BaseModel] = UpdateCustomIdentifiersInput
+
+    def get_tool_message(self, **kwargs) -> Optional[str]:
+        """生成友好的提示消息"""
+        identifiers = kwargs.get("identifiers", [])
+        return f"正在更新自定义识别词（共 {len(identifiers)} 条规则）"
+
+    async def run(self, identifiers: List[str] = None, **kwargs) -> str:
+        logger.info(
+            f"执行工具: {self.name}, 规则数量: {len(identifiers) if identifiers else 0}"
+        )
+        try:
+            if identifiers is None:
+                return json.dumps(
+                    {"success": False, "message": "必须提供 identifiers 参数"},
+                    ensure_ascii=False,
+                )
+
+            # 过滤空字符串
+            identifiers = [i for i in identifiers if i is not None]
+
+            system_config_oper = SystemConfigOper()
+
+            # 保存
+            value = identifiers if identifiers else None
+            success = await system_config_oper.async_set(
+                SystemConfigKey.CustomIdentifiers, value
+            )
+            if success:
+                return json.dumps(
+                    {
+                        "success": True,
+                        "message": f"自定义识别词已更新，共 {len(identifiers)} 条规则",
+                        "count": len(identifiers),
+                        "identifiers": identifiers,
+                    },
+                    ensure_ascii=False,
+                    indent=2,
+                )
+            else:
+                return json.dumps(
+                    {"success": False, "message": "保存自定义识别词失败"},
+                    ensure_ascii=False,
+                )
+        except Exception as e:
+            logger.error(f"更新自定义识别词失败: {e}")
+            return json.dumps(
+                {"success": False, "message": f"更新自定义识别词时发生错误: {str(e)}"},
+                ensure_ascii=False,
+            )
--- a/skills/generate-identifiers/SKILL.md
+++ b/skills/generate-identifiers/SKILL.md
@@ -0,0 +1,226 @@
+---
+name: generate-identifiers
+description: >-
+  Use this skill when a user provides a torrent name or file name and wants to fix recognition issues,
+  or asks to add/manage custom identifiers (自定义识别词).
+  This skill generates identifier rules based on the WordsMatcher preprocessing logic,
+  checks for duplicates against existing rules, and saves them via MCP tools.
+  Applicable scenarios include:
+  1) A torrent or file name is incorrectly recognized (wrong title, season, episode, etc.);
+  2) The user wants to block unwanted keywords from torrent names;
+  3) The user needs episode offset rules for series with non-standard numbering;
+  4) The user wants to force recognition of a specific media item by its TMDB or Douban ID.
+allowed-tools: query_custom_identifiers update_custom_identifiers recognize_media
+---
+
+# Generate Custom Identifiers (生成自定义识别词)
+
+This skill helps generate custom identifier rules for MoviePilot's media recognition system. Custom identifiers preprocess torrent/file names before the recognition engine runs, correcting naming issues that cause misidentification.
+
+## Prerequisites
+
+You need the following tools:
+- `query_custom_identifiers` - Query all existing custom identifier rules
+- `update_custom_identifiers` - Save the updated identifier list (replaces the full list)
+- `recognize_media` - Test recognition of a torrent title or file path (optional, for verification)
+
+## Supported Rule Formats
+
+There are **four formats**. Operators must have spaces on both sides.
+
+### 1. Block Word (屏蔽词)
+
+Removes matched text from the title. Supports regex.
+
+```
+REPACK
+```
+
+### 2. Replacement (被替换词 => 替换词)
+
+Regex substitution. The left side is a regex pattern, the right side is the replacement (supports backreferences).
+
+```
+被替换词 => 替换词
+```
+
+**Special replacement for direct ID specification:**
+```
+被替换词 => {[tmdbid=xxx;type=movie/tv;s=xxx;e=xxx]}
+被替换词 => {[doubanid=xxx;type=movie/tv;s=xxx;e=xxx]}
+```
+Where `s` (season) and `e` (episode) are optional.
+
+### 3. Episode Offset (集偏移)
+
+Shifts episode numbers found between the front and back delimiter words. `EP` is the placeholder for the original episode number.
+
+```
+前定位词 <> 后定位词 >> EP-12
+```
+
+### 4. Combined Replacement + Episode Offset
+
+First performs replacement; episode offset only runs if replacement succeeded.
+
+```
+被替换词 => 替换词 && 前定位词 <> 后定位词 >> EP-12
+```
+
+### Comments
+
+Lines starting with `#` are comments and will be skipped during processing.
+
+## Important Rules for Writing Identifiers
+
+1. **Regex support**: All patterns support regular expressions. Special characters (`. * + ? ^ $ { } [ ] ( ) | \`) must be escaped with `\` when matching literally.
+2. **Spaces matter**: The operators ` => `, ` <> `, ` >> `, ` && ` must have spaces on both sides.
+3. **One rule per string**: Each element in the identifiers list is one rule.
+4. **EP placeholder**: In episode offset expressions, `EP` represents the original episode number. Common patterns:
+   - `EP-12` means subtract 12
+   - `EP+5` means add 5
+   - `EP*2` means multiply by 2
+5. **Chinese number support**: Episode offset handles Chinese numbers (一二三四五六七八九十).
+6. **Empty replacement**: Leaving the replacement side of `=>` empty removes the matched text, which is equivalent to a block word.
+
+## Workflow
+
+### Step 1: Analyze the Problem
+
+Parse the torrent/file name provided by the user. Identify:
+- What is being incorrectly recognized (title, season, episode, year, quality, etc.)
+- What the correct recognition result should be
+- Which identifier format(s) will solve the problem
+
+### Step 2: Generate the Identifier Rule(s)
+
+Write the rule using the appropriate format. Ensure:
+- Regex special characters are properly escaped
+- Add a comment line (starting with `#`) above the rule to describe what it does
+- Test the regex mentally against the provided name to verify correctness
+
+### Step 3: Query Existing Identifiers
+
+Use the `query_custom_identifiers` tool to get all current rules:
+
+```
+query_custom_identifiers(explanation="Checking existing identifiers before adding new rules to avoid duplicates")
+```
+
+### Step 4: Check for Duplicates
+
+Compare each new rule against the existing identifiers:
+- **Exact duplicate**: The rule string is identical to an existing rule — skip it
+- **Functional duplicate**: A different rule that produces the same effect on the same input (e.g., same regex pattern with trivial whitespace differences) — warn the user
+- **Conflict**: An existing rule modifies the same text in a different way — warn the user and ask which to keep
+
+### Step 5: Save the Updated Identifiers
+
+Merge new non-duplicate rules into the existing list, then use `update_custom_identifiers` to save the **complete** list:
+
+```
+update_custom_identifiers(
+    explanation="Adding new identifier rules for [description]",
+    identifiers=["existing rule 1", "existing rule 2", "# new comment", "new rule"]
+)
+```
+
+**CRITICAL**: Always include ALL existing rules in the list. This tool replaces the entire list.
+
+### Step 6: Verify (Optional)
+
+If the user wants to verify the rule works, use `recognize_media` to test:
+
+```
+recognize_media(explanation="Testing recognition after adding identifier", title="the torrent title to test")
+```
+
+### Step 7: Report
+
+Tell the user:
+- What rule(s) were added
+- What effect they will have on the title
+- Whether any duplicates or conflicts were found
+
+## Common Scenarios and Examples
+
+### Wrong Season/Episode Parsing
+
+**User**: "种子名 `[SubGroup] My Show - 13 [1080P]`，这是第二季第1集，但被识别成第13集"
+
+**Solution**: Episode offset that subtracts 12 from the episode number (the offset rewrites only the episode number):
+```
+# My Show 第二季集数偏移（13->1）
+\[SubGroup\] <> \[1080P\] >> EP-12
+```
+
+### Unwanted Text Causing Wrong Identification
+
+**User**: "种子名 `My.Show.2024.REPACK.1080p.mkv`，REPACK导致识别异常"
+
+**Solution**: Block word:
+```
+# 屏蔽REPACK标记
+REPACK
+```
+
+### Non-Standard Naming
+
+**User**: "文件名 `[OldName] EP01.mkv`，应该识别为 NewName"
+
+**Solution**: Replacement:
+```
+# OldName替换为NewName
+OldName => NewName
+```
+
+### Force TMDB ID Recognition
+
+**User**: "种子名 `Some.Weird.Name.S01E01.1080p.mkv`，识别不到，TMDB ID是12345，是电视剧"
+
+**Solution**: Direct ID specification:
+```
+# 强制识别Some.Weird.Name为TMDB ID 12345
+Some\.Weird\.Name => {[tmdbid=12345;type=tv;s=1]}
+```
+
+### Combined Fix
+
+**User**: "种子名 `[Baha][OldTitle][13][1080P]`，标题应该是NewTitle，而且13应该是第二季第1集"
+
+**Solution**: Combined replacement + episode offset:
+```
+# OldTitle替换为NewTitle并偏移集数
+OldTitle => NewTitle && \[Baha\] <> \[1080P\] >> EP-12
+```
+
+### Multiple Episode Numbers in One Title
+
+**User**: "种子名 `[Group] Title - 13-14 [1080P]`，应该是第1-2集"
+
+**Solution**: Episode offset (handles multiple numbers between delimiters):
+```
+# Title 集数偏移
+\[Group\] <> \[1080P\] >> EP-12
+```
+
+## WordsMatcher Processing Logic Reference
+
+The `WordsMatcher.prepare()` method (in `app/core/meta/words.py`) processes each rule in order:
+
+1. Skip empty lines and lines starting with `#`
+2. Detect format by checking operator presence:
+   - Contains ` => ` AND ` && ` AND ` >> ` AND ` <> ` → Combined format (4)
+   - Contains ` => ` → Replacement format (2)
+   - Contains ` >> ` AND ` <> ` → Episode offset format (3)
+   - Otherwise → Block word format (1)
+3. For combined format, replacement runs first; episode offset only runs if replacement succeeded
+4. Returns the modified title and a list of rules that were actually applied
+5. Priority: the per-subscription `custom_words` parameter takes precedence over the global `CustomIdentifiers` setting
+
+## Safety Notes
+
+- Always query existing rules first before updating
+- Never remove existing rules unless the user explicitly asks
+- Add comment lines before new rules for maintainability
+- When uncertain about the correct approach, present multiple options and let the user choose