From 4e3eddec1013f18e68179c5d5982c7fa2823b8f9 Mon Sep 17 00:00:00 2001 From: jxxghp Date: Sun, 14 Jun 2026 16:24:04 +0800 Subject: [PATCH] feat: add captcha recognition agent tool --- app/agent/tools/factory.py | 2 + app/agent/tools/impl/recognize_captcha.py | 167 +++++++++++++++++++++ app/helper/ocr.py | 52 +++++-- docs/mcp-api.md | 14 ++ skills/browser-use/SKILL.md | 32 +++- tests/test_agent_recognize_captcha_tool.py | 134 +++++++++++++++++ 6 files changed, 388 insertions(+), 13 deletions(-) create mode 100644 app/agent/tools/impl/recognize_captcha.py create mode 100644 tests/test_agent_recognize_captcha_tool.py diff --git a/app/agent/tools/factory.py b/app/agent/tools/factory.py index 5e35ea8f..b57c7e7a 100644 --- a/app/agent/tools/factory.py +++ b/app/agent/tools/factory.py @@ -37,6 +37,7 @@ from app.agent.tools.impl.query_media_detail import QueryMediaDetailTool from app.agent.tools.impl.search_torrents import SearchTorrentsTool from app.agent.tools.impl.get_search_results import GetSearchResultsTool from app.agent.tools.impl.search_web import SearchWebTool +from app.agent.tools.impl.recognize_captcha import RecognizeCaptchaTool from app.agent.tools.impl.send_message import SendMessageTool from app.agent.tools.impl.ask_user_choice import AskUserChoiceTool from app.agent.tools.impl.send_local_file import SendLocalFileTool @@ -165,6 +166,7 @@ class MoviePilotToolFactory: SearchTorrentsTool, GetSearchResultsTool, SearchWebTool, + RecognizeCaptchaTool, AddDownloadTool, QuerySubscribesTool, QuerySubscribeSharesTool, diff --git a/app/agent/tools/impl/recognize_captcha.py b/app/agent/tools/impl/recognize_captcha.py new file mode 100644 index 00000000..5f7feaf1 --- /dev/null +++ b/app/agent/tools/impl/recognize_captcha.py @@ -0,0 +1,167 @@ +"""识别图形验证码工具。""" + +import json +from typing import Optional, Type + +from pydantic import BaseModel, Field + +from app.agent.tools.base import MoviePilotTool +from app.agent.tools.tags import ToolTag +from app.helper.browser 
import BrowserSessionHelper +from app.helper.ocr import OcrHelper +from app.log import logger + + +class RecognizeCaptchaInput(BaseModel): + """识别图形验证码工具的输入参数模型。""" + + explanation: Optional[str] = Field( + None, + description="Clear explanation of why this captcha image needs to be recognized", + ) + image_url: str = Field( + ..., + description=( + "Captcha image URL obtained from the browser page, usually an img.src value. " + "Supports http/https URLs and data:image/...;base64,... URLs." + ), + ) + cookie: Optional[str] = Field( + None, + description=( + "Optional Cookie header used to download the captcha image when the image URL " + "requires the same authenticated browser session." + ), + ) + user_agent: Optional[str] = Field( + None, + description="Optional User-Agent used when downloading the captcha image.", + ) + allow_private_network: bool = Field( + False, + description="Allow captcha image URLs on localhost, loopback, private, or link-local addresses.", + ) + + +class RecognizeCaptchaTool(MoviePilotTool): + """ + 图形验证码识别工具,供 Agent 在浏览器自动化登录时读取验证码文本。 + """ + + name: str = "recognize_captcha" + tags: list[str] = [ + ToolTag.Read, + ToolTag.Web, + ] + description: str = ( + "Recognize a graphic captcha image and return the captcha text. " + "Use this after browser automation extracts a captcha img.src from the page. " + "Pass cookie and user_agent when the image URL requires the current browser session. " + "Supports http/https image URLs and data:image/...;base64,... URLs. " + "For safety, localhost and private network URLs are blocked by default unless " + "allow_private_network is true." 
+ ) + args_schema: Type[BaseModel] = RecognizeCaptchaInput + + def get_tool_message(self, **kwargs) -> Optional[str]: + """根据验证码图片参数生成友好的提示消息。""" + image_url = str(kwargs.get("image_url") or "") + if image_url.lower().startswith("data:image/"): + return "识别图形验证码: data image" + return f"识别图形验证码: {image_url}" + + @staticmethod + def _recognize_captcha_sync( + image_url: str, + cookie: Optional[str] = None, + user_agent: Optional[str] = None, + allow_private_network: bool = False, + ) -> str: + """ + 在线程池中下载并识别验证码图片。 + + :param image_url: 验证码图片地址 + :param cookie: 下载图片时使用的 Cookie + :param user_agent: 下载图片时使用的 User-Agent + :param allow_private_network: 是否允许访问本机或私网地址 + :return: 验证码文本,失败时返回空字符串 + """ + clean_url = (image_url or "").strip() + if not clean_url: + return "" + if not clean_url.lower().startswith("data:image/"): + BrowserSessionHelper.validate_url( + clean_url, + allow_private_network=allow_private_network, + ) + return OcrHelper().get_captcha_text( + image_url=clean_url, + cookie=cookie, + ua=user_agent, + ) + + async def run( + self, + image_url: str, + cookie: Optional[str] = None, + user_agent: Optional[str] = None, + allow_private_network: bool = False, + **kwargs, + ) -> str: + """ + 识别指定图片地址中的图形验证码文本。 + + :param image_url: 验证码图片地址 + :param cookie: 下载图片时使用的 Cookie + :param user_agent: 下载图片时使用的 User-Agent + :param allow_private_network: 是否允许访问本机或私网地址 + :return: JSON 格式的识别结果 + """ + logger.info(f"执行工具: {self.name}, 参数: image_url={image_url}") + + try: + captcha_text = await self.run_blocking( + "web", + self._recognize_captcha_sync, + image_url, + cookie, + user_agent, + allow_private_network, + ) + if captcha_text: + return json.dumps( + { + "success": True, + "captcha_text": captcha_text, + "message": "验证码识别成功", + }, + ensure_ascii=False, + ) + return json.dumps( + { + "success": False, + "captcha_text": "", + "message": "验证码识别失败或未返回内容", + }, + ensure_ascii=False, + ) + except ValueError as err: + logger.warning(f"验证码图片地址校验失败: {str(err)}") + return 
json.dumps( + { + "success": False, + "captcha_text": "", + "message": str(err), + }, + ensure_ascii=False, + ) + except Exception as err: + logger.error(f"识别图形验证码失败: {str(err)}", exc_info=True) + return json.dumps( + { + "success": False, + "captcha_text": "", + "message": f"识别图形验证码时发生错误: {str(err)}", + }, + ensure_ascii=False, + ) diff --git a/app/helper/ocr.py b/app/helper/ocr.py index 9ec76a11..93630ed8 100644 --- a/app/helper/ocr.py +++ b/app/helper/ocr.py @@ -6,31 +6,63 @@ from app.utils.http import RequestUtils class OcrHelper: + """ + OCR 辅助类,负责获取验证码图片并调用 OCR 服务识别文本。 + """ _ocr_b64_url = f"{settings.OCR_HOST}/captcha/base64" - def get_captcha_text(self, image_url: Optional[str] = None, image_b64: Optional[str] = None, - cookie: Optional[str] = None, ua: Optional[str] = None): + def get_captcha_text( + self, + image_url: Optional[str] = None, + image_b64: Optional[str] = None, + cookie: Optional[str] = None, + ua: Optional[str] = None, + ) -> str: """ 根据图片地址,获取验证码图片,并识别内容 :param image_url: 图片地址 :param image_b64: 图片base64,跳过图片地址下载 :param cookie: 下载图片使用的cookie :param ua: 下载图片使用的ua + :return: 验证码识别结果,失败时返回空字符串 """ + image_b64 = self._normalize_image_base64(image_b64) if image_url: - ret = RequestUtils(ua=ua, - cookies=cookie).get_res(image_url) - if ret is not None: - image_bin = ret.content - if not image_bin: - return "" - image_b64 = base64.b64encode(image_bin).decode() + data_url_b64 = self._extract_data_url_base64(image_url) + if data_url_b64: + image_b64 = data_url_b64 + else: + ret = RequestUtils(ua=ua, + cookies=cookie).get_res(image_url) + if ret is not None: + image_bin = ret.content + if not image_bin: + return "" + image_b64 = base64.b64encode(image_bin).decode() if not image_b64: return "" ret = RequestUtils(content_type="application/json").post_res( url=self._ocr_b64_url, json={"base64_img": image_b64}) if ret: - return ret.json().get("result") + return ret.json().get("result") or "" return "" + + @staticmethod + def 
_normalize_image_base64(image_b64: Optional[str]) -> str: + """规范化外部传入的图片 base64 内容。""" + if not image_b64: + return "" + return OcrHelper._extract_data_url_base64(image_b64) or image_b64.strip() + + @staticmethod + def _extract_data_url_base64(image_url: Optional[str]) -> str: + """从 data:image/...;base64,... 地址中提取纯 base64 内容。""" + image_url = (image_url or "").strip() + if not image_url.lower().startswith("data:image/"): + return "" + metadata, separator, data = image_url.partition(",") + if not separator or ";base64" not in metadata.lower(): + return "" + return data.strip() diff --git a/docs/mcp-api.md b/docs/mcp-api.md index 3b295df6..0c0654ec 100644 --- a/docs/mcp-api.md +++ b/docs/mcp-api.md @@ -222,6 +222,20 @@ MoviePilot 也提供普通 REST API 给前端和自动化客户端使用。所 `browse_webpage` 使用持久浏览器会话,默认以当前 Agent 会话作为 `session_key`。`goto`、`snapshot`、`click`、`click_ref`、`fill`、`fill_ref`、`select`、`select_ref`、`wait` 等动作会返回页面快照,快照中的 `interactive_elements[].ref` 可用于后续 `*_ref` 操作。支持 `list_tabs`、`open_tab`、`focus_tab`、`close_tab` 管理标签页,支持 `close_session` 释放会话。出于安全考虑,默认拒绝访问 localhost、环回地址、私网地址和链路本地地址;确需访问可信内网或本机页面时,可显式传入 `allow_private_network: true`。 +**`recognize_captcha` 图形验证码识别示例**: +```json +{ + "tool_name": "recognize_captcha", + "arguments": { + "image_url": "https://example.com/captcha.png", + "cookie": "sid=...", + "user_agent": "Mozilla/5.0 ..." + } +} +``` + +`recognize_captcha` 用于浏览器自动化登录时识别普通图形验证码。智能体可先通过 `browse_webpage` 的 `evaluate` 动作从页面元素中提取 `img.src`,再把图片地址传给该工具;支持 `http/https` 图片地址和 `data:image/...;base64,...`。当验证码图片依赖当前浏览器会话时,可传入 Cookie 与 User-Agent。出于安全考虑,默认拒绝访问 localhost、环回地址、私网地址和链路本地地址;确需访问可信内网或本机验证码图片时,可显式传入 `allow_private_network: true`。 + ### 3. 
获取工具详情 **GET** `/api/v1/mcp/tools/{tool_name}` diff --git a/skills/browser-use/SKILL.md b/skills/browser-use/SKILL.md index af4baf89..e197c266 100644 --- a/skills/browser-use/SKILL.md +++ b/skills/browser-use/SKILL.md @@ -8,7 +8,7 @@ description: >- interaction, such as checking a site page, confirming a JavaScript-rendered result, testing login state, capturing visible errors, or updating and validating tracker site cookies. -allowed-tools: browse_webpage search_web query_sites update_site_cookie test_site update_site +allowed-tools: browse_webpage recognize_captcha search_web query_sites update_site_cookie test_site update_site --- # Browser Use @@ -41,6 +41,9 @@ dedicated tool can complete the task more directly and safely. `get_content`, `screenshot`, `click`, `click_ref`, `fill`, `fill_ref`, `select`, `select_ref`, `evaluate`, `wait`, `list_tabs`, `open_tab`, `focus_tab`, `close_tab`, `close_session`. +- `recognize_captcha` - Recognize graphic captcha text from an image URL or + `data:image/...;base64,...` value extracted from the page. Pass Cookie and + User-Agent when the image requires the current browser session. - `search_web` - Find current pages or official references before opening a target URL. It supports DDGS-backed `search_engine` (`auto`, `duckduckgo`, `google`, `brave`, etc.) and `site_url` for limiting results to a specified @@ -174,6 +177,28 @@ update_site_cookie site_identifier= username="..." password="..." two_step_c Ask for missing username, password, or two-step code only when required for the operation. Do not expose secrets in the final answer. +### Login Page With A Graphic Captcha + +When a user explicitly asks to complete a login flow that contains a normal +graphic captcha: + +1. Open the login page and inspect the form with `snapshot`. +2. 
Extract the captcha image URL with `evaluate`, for example: + +```text +browse_webpage action="evaluate" script="() => document.querySelector('img[src*=\"captcha\"], img[alt*=\"验证码\"], img[title*=\"验证码\"]')?.src || ''" +``` + +3. If the captcha image needs session cookies, extract `document.cookie` and the + current `navigator.userAgent` with `evaluate`. +4. Call `recognize_captcha image_url="<captcha-image-url>"` and pass `cookie` / + `user_agent` when needed. +5. Fill the returned `captcha_text`, submit the form, and verify the login + result. + +If recognition fails, refresh the captcha once and retry. Stop after a second +failure and tell the user manual input is needed. + ### Inspect A Tracker Page When the user asks what is visible on a site page: @@ -187,8 +212,9 @@ When the user asks what is visible on a site page: - Ask before submitting forms that create, delete, purchase, publish, or change account/security settings. -- Never solve captchas, bypass access controls, or scrape private content beyond - the user's explicit task. +- Solve graphic captchas only for a user-requested login flow. Do not use this + to bypass access controls, defeat anti-bot challenges, or scrape private + content beyond the user's explicit task. - Do not print passwords, tokens, cookies, two-step secrets, or full session headers in the response. - Localhost, loopback, private, and link-local URLs are blocked by default.
Set diff --git a/tests/test_agent_recognize_captcha_tool.py b/tests/test_agent_recognize_captcha_tool.py new file mode 100644 index 00000000..96c9c741 --- /dev/null +++ b/tests/test_agent_recognize_captcha_tool.py @@ -0,0 +1,134 @@ +import asyncio +import base64 +import json +from unittest.mock import patch + +from app.agent.tools.factory import MoviePilotToolFactory +from app.agent.tools.impl.recognize_captcha import RecognizeCaptchaTool +from app.agent.tools.manager import MoviePilotToolsManager +from app.helper.ocr import OcrHelper + + +class _FakeResponse: + """测试用响应对象,模拟 requests.Response 的最小行为。""" + + def __init__(self, content: bytes = b"", payload: dict = None): + """初始化响应内容与 JSON 载荷。""" + self.content = content + self.payload = payload or {} + + def json(self) -> dict: + """返回测试预设 JSON 内容。""" + return self.payload + + def __bool__(self) -> bool: + """模拟 requests.Response 在成功状态下为真。""" + return True + + +def test_factory_registers_recognize_captcha_tool(): + """工具工厂应注册图形验证码识别工具。""" + with patch( + "app.agent.tools.factory.PluginManager.get_plugin_agent_tools", + return_value=[], + ): + tools = MoviePilotToolFactory.create_tools( + session_id="captcha-session", + user_id="10001", + ) + + tool_names = {tool.name for tool in tools} + + assert "recognize_captcha" in tool_names + + +def test_mcp_tool_manager_exposes_recognize_captcha_schema(): + """MCP 工具管理器应暴露验证码识别工具参数。""" + tool = RecognizeCaptchaTool(session_id="captcha-session", user_id="10001") + + with patch( + "app.agent.tools.manager.MoviePilotToolFactory.create_tools", + return_value=[tool], + ): + manager = MoviePilotToolsManager(is_admin=True) + + tool_definitions = manager.list_tools() + schema = tool_definitions[0].input_schema + + assert [item.name for item in tool_definitions] == ["recognize_captcha"] + assert "image_url" in schema["required"] + assert "cookie" in schema["properties"] + assert "user_agent" in schema["properties"] + assert "allow_private_network" in schema["properties"] + + +def 
test_ocr_helper_extracts_data_url_base64_without_downloading_image(): + """data:image 地址应直接提取 base64 内容并提交给 OCR 服务。""" + image_b64 = base64.b64encode(b"captcha-image").decode() + image_url = f"data:image/png;base64,{image_b64}" + + with patch("app.helper.ocr.RequestUtils") as request_utils: + request_utils.return_value.post_res.return_value = _FakeResponse( + payload={"result": "a8k2"} + ) + + result = OcrHelper().get_captcha_text(image_url=image_url) + + assert result == "a8k2" + request_utils.return_value.get_res.assert_not_called() + request_utils.return_value.post_res.assert_called_once() + assert request_utils.return_value.post_res.call_args.kwargs["json"] == { + "base64_img": image_b64 + } + + +def test_recognize_captcha_tool_returns_captcha_text_from_ocr_helper(): + """验证码工具应返回结构化识别结果,便于 Agent 继续填写表单。""" + tool = RecognizeCaptchaTool(session_id="captcha-session", user_id="10001") + + async def _run_tool(): + """执行一次带 mock OCR 的工具调用。""" + with patch( + "app.agent.tools.impl.recognize_captcha.OcrHelper.get_captcha_text", + return_value="x7p9", + ) as recognize_mock: + result = await tool.run( + image_url="https://example.com/captcha.png", + cookie="sid=abc", + user_agent="MoviePilotTest/1.0", + ) + return result, recognize_mock + + result, recognize_mock = asyncio.run(_run_tool()) + payload = json.loads(result) + + assert payload == { + "success": True, + "captcha_text": "x7p9", + "message": "验证码识别成功", + } + recognize_mock.assert_called_once_with( + image_url="https://example.com/captcha.png", + cookie="sid=abc", + ua="MoviePilotTest/1.0", + ) + + +def test_recognize_captcha_tool_blocks_private_network_by_default(): + """验证码工具默认应拒绝本机和私网图片地址。""" + tool = RecognizeCaptchaTool(session_id="captcha-session", user_id="10001") + + with patch( + "app.agent.tools.impl.recognize_captcha.OcrHelper.get_captcha_text", + return_value="x7p9", + ) as recognize_mock: + result = asyncio.run( + tool.run(image_url="http://127.0.0.1/captcha.png") + ) + + payload = 
json.loads(result) + + assert payload["success"] is False + assert payload["captcha_text"] == "" + assert "默认不允许访问本机或私网地址" in payload["message"] + recognize_mock.assert_not_called()