feat: add captcha recognition agent tool

This commit is contained in:
jxxghp
2026-06-14 16:24:04 +08:00
parent 93713ba662
commit 4e3eddec10
6 changed files with 388 additions and 13 deletions

View File

@@ -37,6 +37,7 @@ from app.agent.tools.impl.query_media_detail import QueryMediaDetailTool
from app.agent.tools.impl.search_torrents import SearchTorrentsTool
from app.agent.tools.impl.get_search_results import GetSearchResultsTool
from app.agent.tools.impl.search_web import SearchWebTool
from app.agent.tools.impl.recognize_captcha import RecognizeCaptchaTool
from app.agent.tools.impl.send_message import SendMessageTool
from app.agent.tools.impl.ask_user_choice import AskUserChoiceTool
from app.agent.tools.impl.send_local_file import SendLocalFileTool
@@ -165,6 +166,7 @@ class MoviePilotToolFactory:
SearchTorrentsTool,
GetSearchResultsTool,
SearchWebTool,
RecognizeCaptchaTool,
AddDownloadTool,
QuerySubscribesTool,
QuerySubscribeSharesTool,

View File

@@ -0,0 +1,167 @@
"""识别图形验证码工具。"""
import json
from typing import Optional, Type
from pydantic import BaseModel, Field
from app.agent.tools.base import MoviePilotTool
from app.agent.tools.tags import ToolTag
from app.helper.browser import BrowserSessionHelper
from app.helper.ocr import OcrHelper
from app.log import logger
class RecognizeCaptchaInput(BaseModel):
    """Input parameter model for the graphic captcha recognition tool."""

    # Free-text justification supplied by the agent; optional.
    explanation: Optional[str] = Field(
        default=None,
        description="Clear explanation of why this captcha image needs to be recognized",
    )
    # The only required argument: where the captcha image comes from.
    image_url: str = Field(
        default=...,
        description="Captcha image URL obtained from the browser page, usually an img.src value. "
        "Supports http/https URLs and data:image/...;base64,... URLs.",
    )
    # Optional request headers used when the image has to be downloaded.
    cookie: Optional[str] = Field(
        default=None,
        description="Optional Cookie header used to download the captcha image when the image URL "
        "requires the same authenticated browser session.",
    )
    user_agent: Optional[str] = Field(
        default=None,
        description="Optional User-Agent used when downloading the captcha image.",
    )
    # Opt-in switch for image URLs on local or private addresses; off by default.
    allow_private_network: bool = Field(
        default=False,
        description="Allow captcha image URLs on localhost, loopback, private, or link-local addresses.",
    )
class RecognizeCaptchaTool(MoviePilotTool):
    """
    Graphic captcha recognition tool.

    Lets the Agent read the text of a captcha image while automating a browser
    login. The image is referenced by URL (http/https, or an inline
    ``data:image/...`` URL) and handed to ``OcrHelper`` for recognition.
    """

    name: str = "recognize_captcha"
    tags: list[str] = [
        ToolTag.Read,
        ToolTag.Web,
    ]
    description: str = (
        "Recognize a graphic captcha image and return the captcha text. "
        "Use this after browser automation extracts a captcha img.src from the page. "
        "Pass cookie and user_agent when the image URL requires the current browser session. "
        "Supports http/https image URLs and data:image/...;base64,... URLs. "
        "For safety, localhost and private network URLs are blocked by default unless "
        "allow_private_network is true."
    )
    args_schema: Type[BaseModel] = RecognizeCaptchaInput

    @staticmethod
    def _is_data_image_url(image_url: Optional[str]) -> bool:
        """Return True when ``image_url`` is an inline ``data:image/...`` URL."""
        prefix = "data:image/"
        # Only the prefix is inspected: a data URL can carry a large base64
        # payload, so the whole string is never lower-cased.
        return str(image_url or "").lstrip()[:len(prefix)].lower() == prefix

    @staticmethod
    def _describe_image_url(image_url: Optional[str]) -> str:
        """
        Return a short, display-safe form of ``image_url``.

        Inline data URLs are replaced by a fixed label so their base64 payload
        never ends up in user messages or log files; any other value is
        returned unchanged (as text).
        """
        if RecognizeCaptchaTool._is_data_image_url(image_url):
            return "data image"
        return str(image_url)

    @staticmethod
    def _build_result(success: bool, captcha_text: str, message: str) -> str:
        """Serialize a tool result as the JSON string returned to the Agent."""
        return json.dumps(
            {
                "success": success,
                "captcha_text": captcha_text,
                "message": message,
            },
            ensure_ascii=False,
        )

    def get_tool_message(self, **kwargs) -> Optional[str]:
        """Build the user-facing progress message from the tool arguments."""
        image_url = str(kwargs.get("image_url") or "")
        return f"识别图形验证码: {self._describe_image_url(image_url)}"

    @staticmethod
    def _recognize_captcha_sync(
        image_url: str,
        cookie: Optional[str] = None,
        user_agent: Optional[str] = None,
        allow_private_network: bool = False,
    ) -> str:
        """
        Blocking implementation: validate the URL, then run OCR on the image.

        :param image_url: captcha image address
        :param cookie: Cookie used when downloading the image
        :param user_agent: User-Agent used when downloading the image
        :param allow_private_network: whether local/private addresses are allowed
        :return: captcha text, or an empty string when nothing was recognized
        """
        clean_url = (image_url or "").strip()
        if not clean_url:
            return ""
        # Inline data URLs involve no network access, so only real URLs are
        # passed through the address validation.
        if not RecognizeCaptchaTool._is_data_image_url(clean_url):
            BrowserSessionHelper.validate_url(
                clean_url,
                allow_private_network=allow_private_network,
            )
        return OcrHelper().get_captcha_text(
            image_url=clean_url,
            cookie=cookie,
            ua=user_agent,
        )

    async def run(
        self,
        image_url: str,
        cookie: Optional[str] = None,
        user_agent: Optional[str] = None,
        allow_private_network: bool = False,
        **kwargs,
    ) -> str:
        """
        Recognize the captcha text of the image at ``image_url``.

        :param image_url: captcha image address
        :param cookie: Cookie used when downloading the image
        :param user_agent: User-Agent used when downloading the image
        :param allow_private_network: whether local/private addresses are allowed
        :return: JSON string with ``success``, ``captcha_text`` and ``message``
        """
        # Log the abbreviated form: a data URL would otherwise write its whole
        # base64 payload into the log. The cookie is deliberately not logged.
        logger.info(
            f"执行工具: {self.name}, 参数: image_url={self._describe_image_url(image_url)}"
        )
        try:
            captcha_text = await self.run_blocking(
                "web",
                self._recognize_captcha_sync,
                image_url,
                cookie,
                user_agent,
                allow_private_network,
            )
            if captcha_text:
                return self._build_result(True, captcha_text, "验证码识别成功")
            return self._build_result(False, "", "验证码识别失败或未返回内容")
        except ValueError as err:
            # Treated as a rejected image URL (presumably raised by
            # BrowserSessionHelper.validate_url); reported without a traceback.
            logger.warning(f"验证码图片地址校验失败: {str(err)}")
            return self._build_result(False, "", str(err))
        except Exception as err:
            # Tool boundary: any other failure is logged and reported as JSON.
            logger.error(f"识别图形验证码失败: {str(err)}", exc_info=True)
            return self._build_result(
                False,
                "",
                f"识别图形验证码时发生错误: {str(err)}",
            )

View File

@@ -6,31 +6,63 @@ from app.utils.http import RequestUtils
class OcrHelper:
    """
    OCR helper that obtains a captcha image and asks the OCR service to read it.
    """

    _ocr_b64_url = f"{settings.OCR_HOST}/captcha/base64"

    def get_captcha_text(
        self,
        image_url: Optional[str] = None,
        image_b64: Optional[str] = None,
        cookie: Optional[str] = None,
        ua: Optional[str] = None,
    ) -> str:
        """
        Recognize the text of a captcha image.

        The image is taken from ``image_url`` when given: an inline ``data:``
        URL is decoded locally, any other URL is downloaded. ``image_b64`` is
        used when no URL is given or the URL yields no image.

        :param image_url: image address (http/https URL or data:image/...;base64,... URL)
        :param image_b64: base64 image content, raw or as a data URL
        :param cookie: cookie used when downloading the image
        :param ua: User-Agent used when downloading the image
        :return: recognized captcha text, or an empty string on failure
        """
        image_b64 = self._normalize_image_base64(image_b64)
        if image_url:
            if image_url.lstrip()[:5].lower() == "data:":
                # Inline image: decode locally. A data: URL is never handed to
                # the HTTP client, even when it is not a usable base64 image.
                image_b64 = self._extract_data_url_base64(image_url) or image_b64
            else:
                ret = RequestUtils(ua=ua,
                                   cookies=cookie).get_res(image_url)
                if ret is not None:
                    image_bin = ret.content
                    if not image_bin:
                        return ""
                    image_b64 = base64.b64encode(image_bin).decode()
        if not image_b64:
            return ""
        ret = RequestUtils(content_type="application/json").post_res(
            url=self._ocr_b64_url,
            json={"base64_img": image_b64})
        if not ret:
            return ""
        try:
            payload = ret.json()
        except ValueError:
            # The OCR service answered with a non-JSON body; per the contract
            # above this is a recognition failure, not an error for callers.
            return ""
        if not isinstance(payload, dict):
            return ""
        return payload.get("result") or ""

    @staticmethod
    def _normalize_image_base64(image_b64: Optional[str]) -> str:
        """Normalize caller-supplied base64 image content (raw or data URL)."""
        if not image_b64:
            return ""
        return OcrHelper._extract_data_url_base64(image_b64) or image_b64.strip()

    @staticmethod
    def _extract_data_url_base64(image_url: Optional[str]) -> str:
        """
        Extract the bare base64 payload from a data:image/...;base64,... URL.

        :param image_url: candidate data URL
        :return: base64 payload, or an empty string when ``image_url`` is not a
                 base64-encoded ``data:image/`` URL
        """
        from urllib.parse import unquote

        image_url = (image_url or "").strip()
        prefix = "data:image/"
        if image_url[:len(prefix)].lower() != prefix:
            return ""
        metadata, separator, data = image_url.partition(",")
        if not separator or ";base64" not in metadata.lower():
            return ""
        # The payload of a data URL may be percent-encoded (e.g. %2B, %3D) and
        # may contain whitespace; neither belongs to the base64 content.
        # "%" is not a base64 character, so decoding cannot corrupt valid data.
        return "".join(unquote(data).split())

View File

@@ -222,6 +222,20 @@ MoviePilot 也提供普通 REST API 给前端和自动化客户端使用。所
`browse_webpage` 使用持久浏览器会话，默认以当前 Agent 会话作为 `session_key`。`goto`、`snapshot`、`click`、`click_ref`、`fill`、`fill_ref`、`select`、`select_ref`、`wait` 等动作会返回页面快照，快照中的 `interactive_elements[].ref` 可用于后续 `*_ref` 操作。支持 `list_tabs`、`open_tab`、`focus_tab`、`close_tab` 管理标签页，支持 `close_session` 释放会话。出于安全考虑，默认拒绝访问 localhost、环回地址、私网地址和链路本地地址；确需访问可信内网或本机页面时，可显式传入 `allow_private_network: true`。
**`recognize_captcha` 图形验证码识别示例**:
```json
{
"tool_name": "recognize_captcha",
"arguments": {
"image_url": "https://example.com/captcha.png",
"cookie": "sid=...",
"user_agent": "Mozilla/5.0 ..."
}
}
```
`recognize_captcha` 用于浏览器自动化登录时识别普通图形验证码。智能体可先通过 `browse_webpage` 的 `evaluate` 动作从页面元素中提取 `img.src`，再把图片地址传给该工具；支持 `http/https` 图片地址和 `data:image/...;base64,...` 地址。当验证码图片依赖当前浏览器会话时，可传入 Cookie 与 User-Agent。出于安全考虑，默认拒绝访问 localhost、环回地址、私网地址和链路本地地址；确需访问可信内网或本机验证码图片时，可显式传入 `allow_private_network: true`。
### 3. 获取工具详情
**GET** `/api/v1/mcp/tools/{tool_name}`

View File

@@ -8,7 +8,7 @@ description: >-
interaction, such as checking a site page, confirming a JavaScript-rendered
result, testing login state, capturing visible errors, or updating and
validating tracker site cookies.
allowed-tools: browse_webpage search_web query_sites update_site_cookie test_site update_site
allowed-tools: browse_webpage recognize_captcha search_web query_sites update_site_cookie test_site update_site
---
# Browser Use
@@ -41,6 +41,9 @@ dedicated tool can complete the task more directly and safely.
`get_content`, `screenshot`, `click`, `click_ref`, `fill`, `fill_ref`,
`select`, `select_ref`, `evaluate`, `wait`, `list_tabs`, `open_tab`,
`focus_tab`, `close_tab`, `close_session`.
- `recognize_captcha` - Recognize graphic captcha text from an image URL or
`data:image/...;base64,...` value extracted from the page. Pass Cookie and
User-Agent when the image requires the current browser session.
- `search_web` - Find current pages or official references before opening a
target URL. It supports DDGS-backed `search_engine` (`auto`, `duckduckgo`,
`google`, `brave`, etc.) and `site_url` for limiting results to a specified
@@ -174,6 +177,28 @@ update_site_cookie site_identifier=<id> username="..." password="..." two_step_c
Ask for missing username, password, or two-step code only when required for the
operation. Do not expose secrets in the final answer.
### Login Page With A Graphic Captcha
When a user explicitly asks to complete a login flow that contains a normal
graphic captcha:
1. Open the login page and inspect the form with `snapshot`.
2. Extract the captcha image URL with `evaluate`, for example:
```text
browse_webpage action="evaluate" script="() => document.querySelector('img[src*=\"captcha\"], img[alt*=\"验证码\"], img[title*=\"验证码\"]')?.src || ''"
```
3. If the captcha image needs session cookies, extract `document.cookie` and the
current `navigator.userAgent` with `evaluate`.
4. Call `recognize_captcha image_url="<img.src>"` and pass `cookie` /
`user_agent` when needed.
5. Fill the returned `captcha_text`, submit the form, and verify the login
result.
If recognition fails, refresh the captcha once and retry. Stop after a second
failure and tell the user manual input is needed.
### Inspect A Tracker Page
When the user asks what is visible on a site page:
@@ -187,8 +212,9 @@ When the user asks what is visible on a site page:
- Ask before submitting forms that create, delete, purchase, publish, or change
account/security settings.
- Never solve captchas, bypass access controls, or scrape private content beyond
the user's explicit task.
- Solve graphic captchas only for a user-requested login flow. Do not use this
to bypass access controls, defeat anti-bot challenges, or scrape private
content beyond the user's explicit task.
- Do not print passwords, tokens, cookies, two-step secrets, or full session
headers in the response.
- Localhost, loopback, private, and link-local URLs are blocked by default. Set

View File

@@ -0,0 +1,134 @@
import asyncio
import base64
import json
from unittest.mock import patch
from app.agent.tools.factory import MoviePilotToolFactory
from app.agent.tools.impl.recognize_captcha import RecognizeCaptchaTool
from app.agent.tools.manager import MoviePilotToolsManager
from app.helper.ocr import OcrHelper
class _FakeResponse:
    """Test double that mimics the minimal behaviour of requests.Response."""

    def __init__(self, content: bytes = b"", payload: dict = None):
        """Store the raw body and the JSON payload returned by ``json()``."""
        self.payload = payload if payload else {}
        self.content = content

    def json(self) -> dict:
        """Return the JSON payload preset by the test."""
        return self.payload

    def __bool__(self) -> bool:
        """Always truthy, like a requests.Response with a success status."""
        return True
def test_factory_registers_recognize_captcha_tool():
    """The tool factory should register the graphic captcha recognition tool."""
    # Plugin-provided tools are stubbed out so only built-in tools are created.
    plugin_tools_patch = patch(
        "app.agent.tools.factory.PluginManager.get_plugin_agent_tools",
        return_value=[],
    )
    with plugin_tools_patch:
        created_tools = MoviePilotToolFactory.create_tools(
            session_id="captcha-session",
            user_id="10001",
        )
    registered_names = set()
    for created in created_tools:
        registered_names.add(created.name)
    assert "recognize_captcha" in registered_names
def test_mcp_tool_manager_exposes_recognize_captcha_schema():
    """The MCP tool manager should expose the captcha tool and its parameters."""
    captcha_tool = RecognizeCaptchaTool(session_id="captcha-session", user_id="10001")
    # The factory is patched so the manager sees exactly this one tool.
    with patch(
        "app.agent.tools.manager.MoviePilotToolFactory.create_tools",
        return_value=[captcha_tool],
    ):
        definitions = MoviePilotToolsManager(is_admin=True).list_tools()
    schema = definitions[0].input_schema
    exposed_names = [definition.name for definition in definitions]
    assert exposed_names == ["recognize_captcha"]
    assert "image_url" in schema["required"]
    for optional_field in ("cookie", "user_agent", "allow_private_network"):
        assert optional_field in schema["properties"]
def test_ocr_helper_extracts_data_url_base64_without_downloading_image():
    """A data:image URL should be decoded locally and its base64 sent to the OCR service."""
    encoded_image = base64.b64encode(b"captcha-image").decode()
    data_url = f"data:image/png;base64,{encoded_image}"
    with patch("app.helper.ocr.RequestUtils") as request_utils:
        http_client = request_utils.return_value
        http_client.post_res.return_value = _FakeResponse(
            payload={"result": "a8k2"}
        )
        recognized = OcrHelper().get_captcha_text(image_url=data_url)
    assert recognized == "a8k2"
    # No download may happen for an inline image; only the OCR call is made.
    http_client.get_res.assert_not_called()
    http_client.post_res.assert_called_once()
    posted_json = http_client.post_res.call_args.kwargs["json"]
    assert posted_json == {
        "base64_img": encoded_image
    }
def test_recognize_captcha_tool_returns_captcha_text_from_ocr_helper():
    """The captcha tool should return a structured result the Agent can use to fill the form."""
    captcha_tool = RecognizeCaptchaTool(session_id="captcha-session", user_id="10001")
    ocr_target = "app.agent.tools.impl.recognize_captcha.OcrHelper.get_captcha_text"

    async def _run_tool():
        """Invoke the tool once with the OCR helper mocked out."""
        with patch(ocr_target, return_value="x7p9") as ocr_mock:
            output = await captcha_tool.run(
                image_url="https://example.com/captcha.png",
                cookie="sid=abc",
                user_agent="MoviePilotTest/1.0",
            )
        return output, ocr_mock

    raw_result, recognize_mock = asyncio.run(_run_tool())
    expected_payload = {
        "success": True,
        "captcha_text": "x7p9",
        "message": "验证码识别成功",
    }
    assert json.loads(raw_result) == expected_payload
    # The tool must forward the URL and session headers to the OCR helper.
    recognize_mock.assert_called_once_with(
        image_url="https://example.com/captcha.png",
        cookie="sid=abc",
        ua="MoviePilotTest/1.0",
    )
def test_recognize_captcha_tool_blocks_private_network_by_default():
    """By default the captcha tool should reject local and private-network image URLs."""
    captcha_tool = RecognizeCaptchaTool(session_id="captcha-session", user_id="10001")
    ocr_target = "app.agent.tools.impl.recognize_captcha.OcrHelper.get_captcha_text"
    with patch(ocr_target, return_value="x7p9") as recognize_mock:
        raw_result = asyncio.run(
            captcha_tool.run(image_url="http://127.0.0.1/captcha.png")
        )
    payload = json.loads(raw_result)
    assert payload["success"] is False
    assert payload["captcha_text"] == ""
    assert "默认不允许访问本机或私网地址" in payload["message"]
    # The request must be rejected before the OCR helper is ever reached.
    recognize_mock.assert_not_called()