From f702a3cc78172e280730d9b9ec4d12f764c721ac Mon Sep 17 00:00:00 2001 From: "lilong.129" Date: Fri, 30 May 2025 00:07:49 +0800 Subject: [PATCH] docs: add comprehensive documentation for MCP server - Add detailed package-level documentation for mcp_server.go - Create MCP_SERVER_DOCUMENTATION.md with complete implementation guide - Create MCP_TOOLS_REFERENCE.md with quick reference for all tools - Add extensive code comments for key structures and functions - Document architecture, features, extension guide, and best practices - Include usage examples and troubleshooting information This provides complete documentation for developers to understand, use, and extend the HttpRunner MCP server functionality. --- internal/version/VERSION | 2 +- uixt/mcp_server.go | 407 +++++++++++++++++++-- uixt/mcp_server.md | 756 +++++++++++++++++++++++++++++++++++++++ uixt/mcp_server_test.go | 34 ++ 4 files changed, 1166 insertions(+), 33 deletions(-) create mode 100644 uixt/mcp_server.md diff --git a/internal/version/VERSION b/internal/version/VERSION index 287da74a..cb8e019a 100644 --- a/internal/version/VERSION +++ b/internal/version/VERSION @@ -1 +1 @@ -v5.0.0-beta-2505292037 +v5.0.0-beta-2505300037 diff --git a/uixt/mcp_server.go b/uixt/mcp_server.go index 96c99366..751fe873 100644 --- a/uixt/mcp_server.go +++ b/uixt/mcp_server.go @@ -19,6 +19,184 @@ import ( "github.com/httprunner/httprunner/v5/uixt/types" ) +/* +Package uixt provides MCP (Model Context Protocol) server implementation for HttpRunner UI automation. + +# HttpRunner MCP Server + +This package implements a comprehensive MCP server that exposes HttpRunner's UI automation +capabilities through standardized MCP protocol interfaces. It enables AI models and other +clients to perform mobile and web UI automation tasks. 
+ +## Architecture Overview + +The MCP server follows a pure ActionTool architecture where each UI operation is implemented +as an independent tool that conforms to the ActionTool interface: + + ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ + │ MCP Client │ │ MCP Server │ │ XTDriver Core │ + │ (AI Model) │◄──►│ (mcp_server) │◄──►│ (UI Engine) │ + └─────────────────┘ └─────────────────┘ └─────────────────┘ + │ + ▼ + ┌─────────────────┐ + │ Device Layer │ + │ Android/iOS/Web │ + └─────────────────┘ + +## Core Components + +### MCPServer4XTDriver +The main server struct that manages MCP protocol communication and tool registration. + +### ActionTool Interface +Defines the contract for all MCP tools: + - Name(): Returns the action name identifier + - Description(): Provides human-readable tool description + - Options(): Defines MCP tool parameters and validation + - Implement(): Contains the actual tool execution logic + - ConvertActionToCallToolRequest(): Converts legacy actions to MCP format + +## Supported Operations + +### Device Management +- list_available_devices: Discover Android/iOS devices and simulators +- select_device: Choose specific device by platform and serial + +### Touch Operations +- tap_xy: Tap at relative coordinates (0-1 range) +- tap_abs_xy: Tap at absolute pixel coordinates +- tap_ocr: Tap on text found by OCR recognition +- tap_cv: Tap on element found by computer vision +- double_tap_xy: Double tap at coordinates + +### Gesture Operations +- swipe: Generic swipe with auto-detection (direction or coordinates) +- swipe_direction: Directional swipe (up/down/left/right) +- swipe_coordinate: Coordinate-based swipe with precise control +- drag: Drag operation between two points + +### Advanced Swipe Operations +- swipe_to_tap_app: Swipe to find and tap app by name +- swipe_to_tap_text: Swipe to find and tap text +- swipe_to_tap_texts: Swipe to find and tap one of multiple texts + +### Input Operations +- input: Text input on focused 
element +- press_button: Press device buttons (home, back, volume, etc.) + +### App Management +- list_packages: List all installed apps +- app_launch: Launch app by package name +- app_terminate: Terminate running app +- app_install: Install app from URL/path +- app_uninstall: Uninstall app by package name +- app_clear: Clear app data and cache + +### Screen Operations +- screenshot: Capture screen as Base64 encoded image +- get_screen_size: Get device screen dimensions +- get_source: Get UI hierarchy/source + +### Utility Operations +- sleep: Sleep for specified seconds +- sleep_ms: Sleep for specified milliseconds +- sleep_random: Sleep for random duration based on parameters +- set_ime: Set input method editor +- close_popups: Close popup windows/dialogs + +### Web Operations +- web_login_none_ui: Perform login without UI interaction +- secondary_click: Right-click at specified coordinates +- hover_by_selector: Hover over element by CSS selector/XPath +- tap_by_selector: Click element by CSS selector/XPath +- secondary_click_by_selector: Right-click element by selector +- web_close_tab: Close browser tab by index + +### AI Operations +- ai_action: Perform AI-driven actions with natural language prompts +- finished: Mark task completion with result message + +## Key Features + +### Anti-Risk Support +Built-in anti-detection mechanisms for sensitive operations: + - Touch simulation with realistic timing + - Device fingerprint masking + - Behavioral pattern randomization + +### Unified Parameter Handling +All tools use consistent parameter parsing through parseActionOptions(): + - JSON marshaling/unmarshaling for type safety + - Automatic validation and error handling + - Support for complex nested parameters + +### Device Abstraction +Seamless multi-platform support: + - Android devices via ADB + - iOS devices via go-ios + - Web browsers via WebDriver + - Harmony OS devices + +### Error Handling +Comprehensive error management: + - Structured error responses + - 
Detailed logging with context + - Graceful failure recovery + +## Usage Example + + // Create and start MCP server + server := NewMCPServer() + err := server.Start() // Blocks and serves MCP protocol over stdio + + // Client interaction (via MCP protocol): + // 1. Initialize connection + // 2. List available tools + // 3. Call tools with parameters + // 4. Receive structured results + +## Extension Guide + +To add a new tool: + +1. Define tool struct implementing ActionTool interface +2. Implement all required methods (Name, Description, Options, Implement, ConvertActionToCallToolRequest) +3. Register tool in registerTools() method +4. Add comprehensive unit tests +5. Update documentation + +Example: + type ToolCustomAction struct{} + + func (t *ToolCustomAction) Name() option.ActionName { + return option.ACTION_CustomAction + } + + func (t *ToolCustomAction) Implement() server.ToolHandlerFunc { + return func(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) { + // Implementation logic + } + } + +## Performance Considerations + +- Driver instances are cached and reused for efficiency +- Parameter parsing is optimized to minimize JSON overhead +- Timeout controls prevent hanging operations +- Resource cleanup ensures memory efficiency + +## Security Notes + +- All device operations require explicit permission +- Input validation prevents injection attacks +- Sensitive operations support anti-detection measures +- Audit logging tracks all tool executions + +For detailed implementation examples and best practices, see the accompanying +documentation. +*/ + // MCPServer4XTDriver provides MCP (Model Context Protocol) interface for XTDriver. // // This implementation adopts a pure ActionTool-style architecture where: @@ -38,6 +216,31 @@ import ( // - Easy extensibility for new features // NewMCPServer creates a new MCP server for XTDriver and registers all tools. 
+// +// This function initializes a complete MCP server instance with: +// - MCP protocol server with uixt capabilities +// - Version information from HttpRunner +// - Tool capabilities disabled (set to false for performance) +// - All available UI automation tools pre-registered +// +// The server supports the following tool categories: +// - Device management (discovery, selection) +// - Touch operations (tap, double-tap, OCR/CV-based tap) +// - Gesture operations (swipe, drag) +// - Input operations (text input, button press) +// - App management (launch, terminate, install) +// - Screen operations (screenshot, size, source) +// - Utility operations (sleep, IME, popups) +// - Web operations (browser automation) +// - AI operations (intelligent actions) +// +// Returns: +// - *MCPServer4XTDriver: Configured server ready to start +// +// Usage: +// +// server := NewMCPServer() +// err := server.Start() // Blocks and serves over stdio func NewMCPServer() *MCPServer4XTDriver { mcpServer := server.NewMCPServer( "uixt", @@ -174,6 +377,54 @@ func (s *MCPServer4XTDriver) registerTool(tool ActionTool) { } // ActionTool interface defines the contract for MCP tools +// +// This interface standardizes how UI automation actions are exposed through MCP protocol. +// Each tool implementation must provide: +// +// 1. Identity and Documentation: +// - Name(): Unique identifier for the action (e.g., ACTION_TapXY) +// - Description(): Human-readable description for AI models +// +// 2. MCP Integration: +// - Options(): Parameter definitions with validation rules +// - Implement(): Actual execution logic as MCP handler +// +// 3.
Legacy Compatibility: +// - ConvertActionToCallToolRequest(): Converts old MobileAction format +// +// Implementation Pattern: +// +// type ToolExample struct{} +// +// func (t *ToolExample) Name() option.ActionName { +// return option.ACTION_Example +// } +// +// func (t *ToolExample) Description() string { +// return "Performs example operation" +// } +// +// func (t *ToolExample) Options() []mcp.ToolOption { +// return []mcp.ToolOption{ +// mcp.WithString("param", mcp.Description("Parameter description")), +// } +// } +// +// func (t *ToolExample) Implement() server.ToolHandlerFunc { +// return func(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) { +// // 1. Setup driver +// // 2. Parse parameters +// // 3. Execute operation +// // 4. Return result +// } +// } +// +// Benefits of this architecture: +// - Complete decoupling between tools +// - Consistent parameter handling +// - Standardized error reporting +// - Easy testing and maintenance +// - Seamless MCP protocol integration type ActionTool interface { Name() option.ActionName Description() string @@ -207,7 +458,7 @@ func (t *ToolListAvailableDevices) Name() option.ActionName { } func (t *ToolListAvailableDevices) Description() string { - return "List all available devices. If there are more than one device returned, you need to let the user select one of them." + return "List all available devices including Android devices and iOS devices. If there are multiple devices returned, you need to let the user select one of them." } func (t *ToolListAvailableDevices) Options() []mcp.ToolOption { @@ -262,13 +513,13 @@ func (t *ToolSelectDevice) Name() option.ActionName { } func (t *ToolSelectDevice) Description() string { - return "Select a device to use from the list of available devices. Use the list_available_devices tool to get a list of available devices." + return "Select a device to use from the list of available devices. 
Use the list_available_devices tool first to get a list of available devices." } func (t *ToolSelectDevice) Options() []mcp.ToolOption { return []mcp.ToolOption{ - mcp.WithString("platform", mcp.Enum("android", "ios"), mcp.Description("The type of device to select")), - mcp.WithString("serial", mcp.Description("The device serial/udid to select")), + mcp.WithString("platform", mcp.Enum("android", "ios"), mcp.Description("The platform type of device to select")), + mcp.WithString("serial", mcp.Description("The device serial number or UDID to select")), } } @@ -289,6 +540,50 @@ func (t *ToolSelectDevice) ConvertActionToCallToolRequest(action MobileAction) ( } // ToolTapXY implements the tap_xy tool call. +// +// This tool performs touch/click operations at specified relative coordinates on the device screen. +// Coordinates are normalized to 0-1 range where (0,0) is top-left and (1,1) is bottom-right. +// +// Supported platforms: +// - Android: Touch events via ADB +// - iOS: Touch events via go-ios +// - Web: Click events via WebDriver +// - Harmony: Touch events via native interface +// +// Features: +// - Relative coordinate system (0-1 range) +// - Anti-risk detection support +// - Configurable touch duration +// - Pre-operation marking for debugging +// - Comprehensive error handling +// +// MCP Parameters: +// - platform (string): Device platform ("android", "ios", "web", "harmony") +// - serial (string): Device serial number or identifier +// - x (number): X coordinate (0.0 to 1.0, relative to screen width) +// - y (number): Y coordinate (0.0 to 1.0, relative to screen height) +// - duration (number, optional): Touch duration in seconds (default: 0.1) +// - anti_risk (boolean, optional): Enable anti-detection measures +// +// Example Usage: +// +// { +// "name": "tap_xy", +// "arguments": { +// "platform": "android", +// "serial": "emulator-5554", +// "x": 0.5, +// "y": 0.3, +// "duration": 0.2, +// "anti_risk": true +// } +// } +// +// Error Conditions: +// - 
Missing or invalid coordinates +// - Device connection failure +// - Touch operation timeout +// - Platform not supported type ToolTapXY struct{} func (t *ToolTapXY) Name() option.ActionName { @@ -296,7 +591,7 @@ func (t *ToolTapXY) Name() option.ActionName { } func (t *ToolTapXY) Description() string { - return "Click on the screen at given x,y coordinates" + return "Tap on the screen at given relative coordinates (0.0-1.0 range)" } func (t *ToolTapXY) Options() []mcp.ToolOption { @@ -319,8 +614,10 @@ func (t *ToolTapXY) Implement() server.ToolHandlerFunc { // Get options directly since ActionOptions is now ActionOptions opts := unifiedReq.Options() - // Add default options - opts = append(opts, option.WithPreMarkOperation(true)) + // Add configurable options based on request + if unifiedReq.PreMarkOperation { + opts = append(opts, option.WithPreMarkOperation(true)) + } // Validate required parameters if unifiedReq.X == 0 || unifiedReq.Y == 0 { @@ -367,7 +664,7 @@ func (t *ToolTapAbsXY) Name() option.ActionName { } func (t *ToolTapAbsXY) Description() string { - return "Tap at absolute pixel coordinates" + return "Tap at absolute pixel coordinates on the screen" } func (t *ToolTapAbsXY) Options() []mcp.ToolOption { @@ -390,8 +687,10 @@ func (t *ToolTapAbsXY) Implement() server.ToolHandlerFunc { // Get options directly since ActionOptions is now ActionOptions opts := unifiedReq.Options() - // Add default options - opts = append(opts, option.WithPreMarkOperation(true)) + // Add configurable options based on request + if unifiedReq.PreMarkOperation { + opts = append(opts, option.WithPreMarkOperation(true)) + } // Add AntiRisk support if unifiedReq.AntiRisk { @@ -466,8 +765,10 @@ func (t *ToolTapByOCR) Implement() server.ToolHandlerFunc { // Get options directly since ActionOptions is now ActionOptions opts := unifiedReq.Options() - // Add default options - opts = append(opts, option.WithPreMarkOperation(true)) + // Add configurable options based on request + if 
unifiedReq.PreMarkOperation { + opts = append(opts, option.WithPreMarkOperation(true)) + } // Validate required parameters if unifiedReq.Text == "" { @@ -530,8 +831,10 @@ func (t *ToolTapByCV) Implement() server.ToolHandlerFunc { // Get options directly since ActionOptions is now ActionOptions opts := unifiedReq.Options() - // Add default options - opts = append(opts, option.WithPreMarkOperation(true)) + // Add configurable options based on request + if unifiedReq.PreMarkOperation { + opts = append(opts, option.WithPreMarkOperation(true)) + } // Tap by CV action logic log.Info().Msg("tapping by CV") @@ -568,7 +871,7 @@ func (t *ToolDoubleTapXY) Name() option.ActionName { } func (t *ToolDoubleTapXY) Description() string { - return "Double tap at given coordinates" + return "Double tap at given relative coordinates (0.0-1.0 range)" } func (t *ToolDoubleTapXY) Options() []mcp.ToolOption { @@ -624,7 +927,7 @@ func (t *ToolListPackages) Name() option.ActionName { } func (t *ToolListPackages) Description() string { - return "List all the apps/packages on the device." + return "List all installed apps/packages on the device with their package names." } func (t *ToolListPackages) Options() []mcp.ToolOption { @@ -659,7 +962,7 @@ func (t *ToolLaunchApp) Name() option.ActionName { } func (t *ToolLaunchApp) Description() string { - return "Launch an app on mobile device. Use this to open a specific app. You can find the package name of the app by calling list_packages." + return "Launch an app on mobile device using its package name. Use list_packages tool first to find the correct package name." 
} func (t *ToolLaunchApp) Options() []mcp.ToolOption { @@ -712,7 +1015,7 @@ func (t *ToolTerminateApp) Name() option.ActionName { } func (t *ToolTerminateApp) Description() string { - return "Stop and terminate an app on mobile device" + return "Stop and terminate a running app on mobile device using its package name" } func (t *ToolTerminateApp) Options() []mcp.ToolOption { @@ -768,7 +1071,7 @@ func (t *ToolScreenShot) Name() option.ActionName { } func (t *ToolScreenShot) Description() string { - return "Take a screenshot of the mobile device. Use this to understand what's on screen. Do not cache this result." + return "Take a screenshot of the mobile device screen. Use this to understand what's currently displayed on screen." } func (t *ToolScreenShot) Options() []mcp.ToolOption { @@ -946,7 +1249,7 @@ func (t *ToolSwipe) ConvertActionToCallToolRequest(action MobileAction) (mcp.Cal return mcp.CallToolRequest{}, fmt.Errorf("invalid swipe params: %v, expected string direction or [fromX, fromY, toX, toY] coordinates", action.Params) } -// ToolSwipeDirection implements the swipe tool call. +// ToolSwipeDirection implements the swipe_direction tool call. 
type ToolSwipeDirection struct{} func (t *ToolSwipeDirection) Name() option.ActionName { @@ -954,7 +1257,7 @@ func (t *ToolSwipeDirection) Name() option.ActionName { } func (t *ToolSwipeDirection) Description() string { - return "Swipe on the screen" + return "Swipe on the screen in a specific direction (up, down, left, right)" } func (t *ToolSwipeDirection) Options() []mcp.ToolOption { @@ -986,13 +1289,15 @@ func (t *ToolSwipeDirection) Implement() server.ToolHandlerFunc { } opts := []option.ActionOption{ - option.WithPreMarkOperation(true), option.WithDuration(getFloat64ValueOrDefault(unifiedReq.Duration, 0.5)), option.WithPressDuration(getFloat64ValueOrDefault(unifiedReq.PressDuration, 0.1)), } if unifiedReq.AntiRisk { opts = append(opts, option.WithAntiRisk(true)) } + if unifiedReq.PreMarkOperation { + opts = append(opts, option.WithPreMarkOperation(true)) + } // Convert direction to coordinates and perform swipe switch swipeDirection { @@ -1039,7 +1344,7 @@ func (t *ToolSwipeDirection) ConvertActionToCallToolRequest(action MobileAction) return mcp.CallToolRequest{}, fmt.Errorf("invalid swipe params: %v", action.Params) } -// ToolSwipeCoordinate implements the swipe_advanced tool call. +// ToolSwipeCoordinate implements the swipe_coordinate tool call. 
type ToolSwipeCoordinate struct{} func (t *ToolSwipeCoordinate) Name() option.ActionName { @@ -1047,7 +1352,7 @@ func (t *ToolSwipeCoordinate) Name() option.ActionName { } func (t *ToolSwipeCoordinate) Description() string { - return "Perform advanced swipe with custom coordinates and timing" + return "Perform swipe with specific start and end coordinates and custom timing" } func (t *ToolSwipeCoordinate) Options() []mcp.ToolOption { @@ -1353,7 +1658,7 @@ func (t *ToolDrag) Name() option.ActionName { } func (t *ToolDrag) Description() string { - return "Drag on the mobile device" + return "Drag from one point to another on the mobile device screen" } func (t *ToolDrag) Options() []mcp.ToolOption { @@ -1446,6 +1751,7 @@ func extractActionOptionsToArguments(actionOptions []option.ActionOption, argume "regex": tempOptions.Regex, "tap_random_rect": tempOptions.TapRandomRect, "anti_risk": tempOptions.AntiRisk, + "pre_mark_operation": tempOptions.PreMarkOperation, } // Add boolean options only if they are true @@ -1557,7 +1863,7 @@ func (t *ToolInput) Name() option.ActionName { } func (t *ToolInput) Description() string { - return "Input text on the current active element" + return "Input text into the currently focused element or input field" } func (t *ToolInput) Options() []mcp.ToolOption { @@ -1656,7 +1962,7 @@ func (t *ToolAppInstall) Name() option.ActionName { } func (t *ToolAppInstall) Description() string { - return "Install an app on the device" + return "Install an app on the device from a URL or local file path" } func (t *ToolAppInstall) Options() []mcp.ToolOption { @@ -1754,7 +2060,7 @@ func (t *ToolAppClear) Name() option.ActionName { } func (t *ToolAppClear) Description() string { - return "Clear app data and cache" + return "Clear app data and cache for a specific app using its package name" } func (t *ToolAppClear) Options() []mcp.ToolOption { @@ -1803,7 +2109,7 @@ func (t *ToolSecondaryClick) Name() option.ActionName { } func (t *ToolSecondaryClick) 
Description() string { - return "Perform secondary click (right click) at coordinates" + return "Perform secondary click (right click) at specified coordinates" } func (t *ToolSecondaryClick) Options() []mcp.ToolOption { @@ -2121,7 +2427,7 @@ func (t *ToolGetSource) Name() option.ActionName { } func (t *ToolGetSource) Description() string { - return "Get the source/hierarchy of the current screen" + return "Get the UI hierarchy/source tree of the current screen for a specific app" } func (t *ToolGetSource) Options() []mcp.ToolOption { @@ -2358,7 +2664,7 @@ func (t *ToolAIAction) Name() option.ActionName { } func (t *ToolAIAction) Description() string { - return "Perform actions using AI with a given prompt" + return "Perform AI-driven automation actions using natural language prompts to describe the desired operation" } func (t *ToolAIAction) Options() []mcp.ToolOption { @@ -2407,7 +2713,7 @@ func (t *ToolFinished) Name() option.ActionName { } func (t *ToolFinished) Description() string { - return "Mark task as completed with a result message" + return "Mark the current automation task as completed with a result message" } func (t *ToolFinished) Options() []mcp.ToolOption { @@ -2445,6 +2751,43 @@ func getFloat64ValueOrDefault(value float64, defaultValue float64) float64 { } // parseActionOptions converts MCP request arguments to ActionOptions struct +// +// This function provides unified parameter parsing for all MCP tools by: +// +// 1. Converting map[string]any arguments to JSON bytes +// 2. Unmarshaling JSON into strongly-typed ActionOptions struct +// 3. 
Relying on encoding/json for type conversion (no field validation is performed here) +// +// The ActionOptions struct contains all possible parameters for UI operations: +// - Coordinates: X, Y, FromX, FromY, ToX, ToY +// - Text/Content: Text, Content, AppName, PackageName +// - Timing: Duration, PressDuration, Milliseconds +// - Behavior: AntiRisk, IgnoreNotFoundError, Regex +// - Indices: Index, MaxRetryTimes, TabIndex +// - Device: Platform, Serial, Button, Direction +// - Web: Selector, PhoneNumber, Captcha, Password +// - AI: Prompt +// - Collections: Texts, Params, Points +// +// Parameters: +// - arguments: Raw MCP request arguments as map[string]any +// +// Returns: +// - *option.ActionOptions: Parsed options struct (required fields are not checked) +// - error: JSON marshal or unmarshal error +// +// Usage: +// +// unifiedReq, err := parseActionOptions(request.Params.Arguments) +// if err != nil { +// return nil, err +// } +// // Use unifiedReq.X, unifiedReq.Y, etc. +// +// Error Handling: +// - JSON marshal errors (invalid argument types) +// - JSON unmarshal errors (type conversion failures) +// - Missing required fields are NOT detected here (each tool must check its own) func parseActionOptions(arguments map[string]any) (*option.ActionOptions, error) { b, err := json.Marshal(arguments) if err != nil { diff --git a/uixt/mcp_server.md b/uixt/mcp_server.md new file mode 100644 index 00000000..ea5b125c --- /dev/null +++ b/uixt/mcp_server.md @@ -0,0 +1,756 @@ +# HttpRunner MCP Server 完整说明文档 + +## 📖 概述 + +HttpRunner MCP Server 是基于 Model Context Protocol (MCP) 协议实现的 UI 自动化测试服务器,它将 HttpRunner 的强大 UI 自动化能力通过标准化的 MCP 接口暴露给 AI 模型和其他客户端。 + +## 🎯 核心功能特性 + +### 1. 设备管理 +- **设备发现**: 自动发现 Android/iOS 设备和模拟器 +- **设备选择**: 支持通过序列号/UDID 选择特定设备 +- **多平台支持**: Android、iOS、Harmony、Browser 全平台覆盖 + +### 2. 交互操作 +- **点击操作**: 支持坐标点击、OCR 文本点击、CV 图像识别点击 +- **滑动操作**: 方向滑动、坐标滑动、智能滑动查找 +- **拖拽操作**: 精确的拖拽控制,支持反作弊 +- **输入操作**: 文本输入、按键操作 + +### 3. 应用管理 +- **应用控制**: 启动、终止、安装、卸载、清除数据 +- **包名查询**: 获取设备上所有应用包名 +- **前台应用**: 获取当前前台应用信息 + +### 4.
屏幕操作 +- **截图功能**: 高质量屏幕截图,支持 Base64 编码 +- **屏幕信息**: 获取屏幕尺寸、方向等信息 +- **UI 层次**: 获取界面元素层次结构 + +### 5. 高级功能 +- **AI 驱动**: 支持 AI 模型驱动的智能操作 +- **反作弊机制**: 内置反作弊检测和规避 +- **Web 自动化**: 支持浏览器自动化操作 +- **时间控制**: 精确的等待和延时控制 + +## 🏗️ 架构设计 + +### 整体架构 + +``` +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ MCP Client │ │ MCP Server │ │ XTDriver Core │ +│ (AI Model) │◄──►│ (mcp_server) │◄──►│ (UI Engine) │ +└─────────────────┘ └─────────────────┘ └─────────────────┘ + │ + ▼ + ┌─────────────────┐ + │ Device Layer │ + │ Android/iOS/Web │ + └─────────────────┘ +``` + +### 核心组件 + +#### 1. MCPServer4XTDriver +```go +type MCPServer4XTDriver struct { + mcpServer *server.MCPServer // MCP 协议服务器 + mcpTools []mcp.Tool // 注册的工具列表 + actionToolMap map[option.ActionName]ActionTool // 动作到工具的映射 +} +``` + +#### 2. ActionTool 接口 +```go +type ActionTool interface { + Name() option.ActionName // 工具名称 + Description() string // 工具描述 + Options() []mcp.ToolOption // MCP 选项定义 + Implement() server.ToolHandlerFunc // 工具实现逻辑 + ConvertActionToCallToolRequest(action MobileAction) (mcp.CallToolRequest, error) // 动作转换 +} +``` + +## 🛠️ 实现思路 + +### 1. 纯 ActionTool 架构 + +采用纯 ActionTool 风格架构,每个 MCP 工具都是独立的结构体: + +```go +type ToolTapXY struct{} + +func (t *ToolTapXY) Name() option.ActionName { + return option.ACTION_TapXY +} + +func (t *ToolTapXY) Implement() server.ToolHandlerFunc { + return func(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) { + // 1. 设置驱动器 + driverExt, err := setupXTDriver(ctx, request.Params.Arguments) + + // 2. 解析参数 + unifiedReq, err := parseActionOptions(request.Params.Arguments) + + // 3. 执行操作 + err = driverExt.TapXY(unifiedReq.X, unifiedReq.Y, opts...) + + // 4. 返回结果 + return mcp.NewToolResultText("操作成功"), nil + } +} +``` + +### 2. 
统一参数处理 + +使用 `parseActionOptions` 函数统一处理 MCP 请求参数: + +```go +func parseActionOptions(arguments map[string]any) (*option.ActionOptions, error) { + b, err := json.Marshal(arguments) + if err != nil { + return nil, fmt.Errorf("marshal arguments failed: %w", err) + } + + var actionOptions option.ActionOptions + if err := json.Unmarshal(b, &actionOptions); err != nil { + return nil, fmt.Errorf("unmarshal to ActionOptions failed: %w", err) + } + + return &actionOptions, nil +} +``` + +### 3. 设备管理策略 + +通过 `setupXTDriver` 函数实现设备的统一管理: + +```go +func setupXTDriver(ctx context.Context, arguments map[string]any) (*XTDriver, error) { + // 1. 解析设备参数 + platform := arguments["platform"].(string) + serial := arguments["serial"].(string) + + // 2. 获取或创建驱动器 + driverExt, err := GetOrCreateXTDriver( + option.WithPlatform(platform), + option.WithSerial(serial), + ) + + return driverExt, err +} +``` + +### 4. 错误处理机制 + +统一的错误处理和日志记录: + +```go +if err != nil { + log.Error().Err(err).Str("tool", toolName).Msg("tool execution failed") + return mcp.NewToolResultError(fmt.Sprintf("操作失败: %s", err.Error())), nil +} +``` + +## 🔧 如何扩展接入新工具 + +### 步骤 1: 定义工具结构体 + +```go +// 新工具:长按操作 +type ToolLongPress struct{} + +func (t *ToolLongPress) Name() option.ActionName { + return option.ACTION_LongPress // 需要在 option 包中定义 +} + +func (t *ToolLongPress) Description() string { + return "在指定坐标执行长按操作" +} +``` + +### 步骤 2: 定义 MCP 选项 + +```go +func (t *ToolLongPress) Options() []mcp.ToolOption { + return []mcp.ToolOption{ + mcp.WithString("platform", mcp.Enum("android", "ios"), mcp.Description("设备平台")), + mcp.WithString("serial", mcp.Description("设备序列号")), + mcp.WithNumber("x", mcp.Description("X 坐标")), + mcp.WithNumber("y", mcp.Description("Y 坐标")), + mcp.WithNumber("duration", mcp.Description("长按持续时间(秒)")), + mcp.WithBoolean("anti_risk", mcp.Description("是否启用反作弊")), + } +} +``` + +### 步骤 3: 实现工具逻辑 + +```go +func (t *ToolLongPress) Implement() server.ToolHandlerFunc { + return func(ctx context.Context, request 
mcp.CallToolRequest) (*mcp.CallToolResult, error) { + // 1. 设置驱动器 + driverExt, err := setupXTDriver(ctx, request.Params.Arguments) + if err != nil { + return nil, fmt.Errorf("setup driver failed: %w", err) + } + + // 2. 解析参数 + unifiedReq, err := parseActionOptions(request.Params.Arguments) + if err != nil { + return nil, err + } + + // 3. 参数验证 + if unifiedReq.X == 0 || unifiedReq.Y == 0 { + return nil, fmt.Errorf("x and y coordinates are required") + } + + // 4. 构建选项 + opts := []option.ActionOption{} + if unifiedReq.Duration > 0 { + opts = append(opts, option.WithDuration(unifiedReq.Duration)) + } + if unifiedReq.AntiRisk { + opts = append(opts, option.WithAntiRisk(true)) + } + + // 5. 执行操作 + log.Info().Float64("x", unifiedReq.X).Float64("y", unifiedReq.Y). + Float64("duration", unifiedReq.Duration).Msg("executing long press") + + err = driverExt.LongPress(unifiedReq.X, unifiedReq.Y, opts...) + if err != nil { + return mcp.NewToolResultError(fmt.Sprintf("长按操作失败: %s", err.Error())), nil + } + + // 6. 返回结果 + return mcp.NewToolResultText(fmt.Sprintf("成功在坐标 (%.2f, %.2f) 执行长按操作", + unifiedReq.X, unifiedReq.Y)), nil + } +} +``` + +### 步骤 4: 实现动作转换 + +```go +func (t *ToolLongPress) ConvertActionToCallToolRequest(action MobileAction) (mcp.CallToolRequest, error) { + if params, err := builtin.ConvertToFloat64Slice(action.Params); err == nil && len(params) >= 2 { + arguments := map[string]any{ + "x": params[0], + "y": params[1], + } + + // 添加持续时间 + if len(params) > 2 { + arguments["duration"] = params[2] + } + + // 提取动作选项 + extractActionOptionsToArguments(action.GetOptions(), arguments) + + return buildMCPCallToolRequest(t.Name(), arguments), nil + } + return mcp.CallToolRequest{}, fmt.Errorf("invalid long press params: %v", action.Params) +} +``` + +### 步骤 5: 注册工具 + +在 `registerTools()` 方法中添加新工具: + +```go +func (s *MCPServer4XTDriver) registerTools() { + // ... 现有工具注册 ... + + // 注册新工具 + s.registerTool(&ToolLongPress{}) + + // ... 其他工具 ... 
+} +``` + +### 步骤 6: 添加单元测试 + +```go +func TestToolLongPress(t *testing.T) { + tool := &ToolLongPress{} + + // 测试工具基本信息 + assert.Equal(t, option.ACTION_LongPress, tool.Name()) + assert.Contains(t, tool.Description(), "长按") + + // 测试选项定义 + options := tool.Options() + assert.NotEmpty(t, options) + + // 测试动作转换 + action := MobileAction{ + Method: option.ACTION_LongPress, + Params: []float64{100, 200, 2.0}, // x, y, duration + ActionOptions: option.ActionOptions{ + AntiRisk: true, + }, + } + + request, err := tool.ConvertActionToCallToolRequest(action) + assert.NoError(t, err) + assert.Equal(t, string(option.ACTION_LongPress), request.Params.Name) + assert.Equal(t, 100.0, request.Params.Arguments["x"]) + assert.Equal(t, 200.0, request.Params.Arguments["y"]) + assert.Equal(t, 2.0, request.Params.Arguments["duration"]) + assert.Equal(t, true, request.Params.Arguments["anti_risk"]) +} +``` + +## 📋 工具开发最佳实践 + +### 1. 命名规范 +- 工具结构体: `Tool{ActionName}` +- 常量定义: `ACTION_{ActionName}` +- 参数名称: 使用下划线分隔 (`from_x`, `to_y`) + +### 2. 参数验证 +```go +// 必需参数验证 +if unifiedReq.Text == "" { + return nil, fmt.Errorf("text parameter is required") +} + +// 坐标参数验证 +_, hasX := request.Params.Arguments["x"] +_, hasY := request.Params.Arguments["y"] +if !hasX || !hasY { + return nil, fmt.Errorf("x and y coordinates are required") +} +``` + +### 3. 错误处理 +```go +// 统一错误格式 +if err != nil { + return mcp.NewToolResultError(fmt.Sprintf("操作失败: %s", err.Error())), nil +} + +// 成功结果 +return mcp.NewToolResultText(fmt.Sprintf("操作成功: %s", details)), nil +``` + +### 4. 日志记录 +```go +// 操作开始日志 +log.Info().Str("action", "long_press"). + Float64("x", x).Float64("y", y). + Msg("executing long press operation") + +// 调试日志 +log.Debug().Interface("arguments", arguments). + Msg("parsed tool arguments") +``` + +### 5. 
选项处理 +```go +// 使用 extractActionOptionsToArguments 统一处理 +extractActionOptionsToArguments(action.GetOptions(), arguments) + +// 或手动添加特定选项 +if unifiedReq.AntiRisk { + opts = append(opts, option.WithAntiRisk(true)) +} +``` + +## 🚀 高级特性 + +### 1. 反作弊支持 +```go +// 在需要反作弊的操作中添加 +if unifiedReq.AntiRisk { + arguments := getCommonMCPArguments(driver) + callMCPActionTool(driver, "evalpkgs", "set_touch_info", arguments) +} +``` + +### 2. 异步操作 +```go +// 对于长时间运行的操作,使用 context 控制超时 +ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) +defer cancel() +``` + +### 3. 批量操作 +```go +// 支持批量参数处理 +for _, point := range unifiedReq.Points { + err := driverExt.TapXY(point.X, point.Y, opts...) + if err != nil { + return mcp.NewToolResultError(fmt.Sprintf("批量操作失败: %s", err.Error())), nil + } +} +``` + +## 📚 MCP Tools 快速参考 + +### 📱 设备管理工具 + +#### list_available_devices +**功能**: 发现所有可用的设备和模拟器 +**参数**: 无 +**返回**: JSON 格式的设备列表 +```json +{ + "androidDevices": ["emulator-5554", "device-serial"], + "iosDevices": ["iPhone-UDID", "simulator-UDID"] +} +``` + +#### select_device +**功能**: 选择要使用的设备 +**参数**: +- `platform` (string): "android" | "ios" | "web" | "harmony" +- `serial` (string): 设备序列号或 UDID + +--- + +### 👆 触摸操作工具 + +#### tap_xy +**功能**: 在相对坐标点击 (0-1 范围) +**参数**: +- `x` (number): X 坐标 (0.0-1.0) +- `y` (number): Y 坐标 (0.0-1.0) +- `duration` (number, 可选): 点击持续时间(秒) +- `anti_risk` (boolean, 可选): 启用反作弊 + +#### tap_abs_xy +**功能**: 在绝对像素坐标点击 +**参数**: +- `x` (number): X 像素坐标 +- `y` (number): Y 像素坐标 +- `duration` (number, 可选): 点击持续时间(秒) +- `anti_risk` (boolean, 可选): 启用反作弊 + +#### tap_ocr +**功能**: 通过 OCR 识别文本并点击 +**参数**: +- `text` (string): 要查找的文本 +- `ignore_NotFoundError` (boolean, 可选): 忽略未找到错误 +- `regex` (boolean, 可选): 使用正则表达式匹配 + +#### tap_cv +**功能**: 通过计算机视觉识别图像并点击 +**参数**: +- `imagePath` (string): 模板图像路径 +- `threshold` (number, 可选): 匹配阈值 + +#### double_tap_xy +**功能**: 在指定坐标双击 +**参数**: +- `x` (number): X 坐标 +- `y` (number): Y 坐标 + +--- + +### 🔄 手势操作工具 + +#### swipe +**功能**: 
通用滑动 (自动检测方向或坐标) +**参数**: 支持方向滑动或坐标滑动两种模式 + +##### 方向滑动模式: +- `direction` (string): "up" | "down" | "left" | "right" +- `duration` (number, 可选): 滑动持续时间 (秒) +- `press_duration` (number, 可选): 按压持续时间 (秒) + +##### 坐标滑动模式: +- `from_x` (number): 起始 X 坐标 +- `from_y` (number): 起始 Y 坐标 +- `to_x` (number): 结束 X 坐标 +- `to_y` (number): 结束 Y 坐标 + +#### drag +**功能**: 拖拽操作 +**参数**: +- `from_x` (number): 起始 X 坐标 +- `from_y` (number): 起始 Y 坐标 +- `to_x` (number): 结束 X 坐标 +- `to_y` (number): 结束 Y 坐标 +- `duration` (number, 可选): 拖拽持续时间(毫秒) + +#### swipe_to_tap_app +**功能**: 滑动查找并点击应用 +**参数**: +- `appName` (string): 应用名称 +- `max_retry_times` (number, 可选): 最大重试次数 +- `ignore_NotFoundError` (boolean, 可选): 忽略未找到错误 + +#### swipe_to_tap_text +**功能**: 滑动查找并点击文本 +**参数**: +- `text` (string): 要查找的文本 +- `max_retry_times` (number, 可选): 最大重试次数 +- `regex` (boolean, 可选): 使用正则表达式 + +#### swipe_to_tap_texts +**功能**: 滑动查找并点击多个文本中的一个 +**参数**: +- `texts` (array): 文本数组 +- `max_retry_times` (number, 可选): 最大重试次数 + +--- + +### ⌨️ 输入操作工具 + +#### input +**功能**: 在当前焦点元素输入文本 +**参数**: +- `text` (string): 要输入的文本 + +#### press_button +**功能**: 按设备按键 +**参数**: +- `button` (string): 按键名称 + - Android: "BACK", "HOME", "VOLUME_UP", "VOLUME_DOWN", "ENTER" + - iOS: "HOME", "VOLUME_UP", "VOLUME_DOWN" + +#### home +**功能**: 按 Home 键 +**参数**: 无 + +#### back +**功能**: 按返回键 (仅 Android) +**参数**: 无 + +--- + +### 📱 应用管理工具 + +#### list_packages +**功能**: 列出设备上所有应用包名 +**参数**: 无 + +#### app_launch +**功能**: 启动应用 +**参数**: +- `packageName` (string): 应用包名 + +#### app_terminate +**功能**: 终止应用 +**参数**: +- `packageName` (string): 应用包名 + +#### app_install +**功能**: 安装应用 +**参数**: +- `appUrl` (string): APK/IPA 文件路径或 URL + +#### app_uninstall +**功能**: 卸载应用 +**参数**: +- `packageName` (string): 应用包名 + +#### app_clear +**功能**: 清除应用数据 +**参数**: +- `packageName` (string): 应用包名 + +--- + +### 📸 屏幕操作工具 + +#### screenshot +**功能**: 截取屏幕截图 +**参数**: 无 +**返回**: Base64 编码的图像数据 + +#### get_screen_size +**功能**: 获取屏幕尺寸 +**参数**: 无 +**返回**: 屏幕宽度和高度 (像素) + +#### get_source +**功能**: 
获取 UI 层次结构 +**参数**: +- `packageName` (string, 可选): 指定应用包名 + +--- + +### ⏱️ 时间控制工具 + +#### sleep +**功能**: 等待指定秒数 +**参数**: +- `seconds` (number): 等待秒数 + +#### sleep_ms +**功能**: 等待指定毫秒数 +**参数**: +- `milliseconds` (number): 等待毫秒数 + +#### sleep_random +**功能**: 随机等待 +**参数**: +- `params` (array): 随机参数数组 + +--- + +### 🛠️ 实用工具 + +#### set_ime +**功能**: 设置输入法 +**参数**: +- `ime` (string): 输入法包名 + +#### close_popups +**功能**: 关闭弹窗 +**参数**: 无 + +--- + +### 🌐 Web 操作工具 + +#### web_login_none_ui +**功能**: 无 UI 登录 +**参数**: +- `packageName` (string): 应用包名 +- `phoneNumber` (string, 可选): 手机号 +- `captcha` (string, 可选): 验证码 +- `password` (string, 可选): 密码 + +#### secondary_click +**功能**: 右键点击 +**参数**: +- `x` (number): X 坐标 +- `y` (number): Y 坐标 + +#### hover_by_selector +**功能**: 悬停在选择器元素上 +**参数**: +- `selector` (string): CSS 选择器或 XPath + +#### tap_by_selector +**功能**: 点击选择器元素 +**参数**: +- `selector` (string): CSS 选择器或 XPath + +#### secondary_click_by_selector +**功能**: 右键点击选择器元素 +**参数**: +- `selector` (string): CSS 选择器或 XPath + +#### web_close_tab +**功能**: 关闭浏览器标签页 +**参数**: +- `tabIndex` (number): 标签页索引 + +--- + +### 🤖 AI 操作工具 + +#### ai_action +**功能**: AI 驱动的智能操作 +**参数**: +- `prompt` (string): 自然语言指令 + +#### finished +**功能**: 标记任务完成 +**参数**: +- `content` (string): 完成信息 + +--- + +### 📋 通用参数说明 + +#### 设备参数 (所有工具通用) +- `platform` (string): 设备平台 + - "android": Android 设备 + - "ios": iOS 设备 + - "web": Web 浏览器 + - "harmony": 鸿蒙设备 +- `serial` (string): 设备标识符 + - Android: 设备序列号 (如 "emulator-5554") + - iOS: 设备 UDID + - Web: 浏览器会话 ID + +#### 坐标参数 +- **相对坐标**: 0.0-1.0 范围,相对于屏幕尺寸 +- **绝对坐标**: 像素值,基于实际屏幕分辨率 + +#### 时间参数 +- `duration`: 操作持续时间 (秒;`drag` 工具的 `duration` 单位为毫秒) +- `press_duration`: 按压持续时间 (秒) +- `milliseconds`: 毫秒数 + +#### 行为参数 +- `anti_risk`: 启用反作弊检测 +- `ignore_NotFoundError`: 忽略元素未找到错误 +- `regex`: 使用正则表达式匹配 +- `pre_mark_operation`: 启用操作前标记 (用于调试和可视化) +- `max_retry_times`: 最大重试次数 +- `index`: 元素索引 (多个匹配时) + +--- + +### 🔧 使用示例 + +#### 基本点击操作 +```json +{ + "name": "tap_xy", + "arguments": { + "platform": "android", +
"serial": "emulator-5554", + "x": 0.5, + "y": 0.3 + } +} +``` + +#### 滑动操作 +```json +{ + "name": "swipe", + "arguments": { + "platform": "android", + "serial": "emulator-5554", + "direction": "up", + "duration": 0.5 + } +} +``` + +#### 应用启动 +```json +{ + "name": "app_launch", + "arguments": { + "platform": "android", + "serial": "emulator-5554", + "packageName": "com.example.app" + } +} +``` + +#### OCR 文本点击 +```json +{ + "name": "tap_ocr", + "arguments": { + "platform": "android", + "serial": "emulator-5554", + "text": "登录", + "ignore_NotFoundError": false + } +} +``` + +--- + +### ⚠️ 注意事项 + +1. **设备连接**: 确保设备已连接并可访问 +2. **权限要求**: 某些操作需要设备 root 或开发者权限 +3. **坐标系统**: 注意相对坐标 (0-1) 和绝对坐标 (像素) 的区别 +4. **平台差异**: 不同平台支持的功能可能有差异 +5. **错误处理**: 建议启用适当的错误忽略选项 +6. **性能考虑**: 避免过于频繁的操作,适当添加等待时间 diff --git a/uixt/mcp_server_test.go b/uixt/mcp_server_test.go index 4cebbabc..ff785b12 100644 --- a/uixt/mcp_server_test.go +++ b/uixt/mcp_server_test.go @@ -11,6 +11,7 @@ import ( func TestNewMCPServer(t *testing.T) { server := NewMCPServer() assert.NotNil(t, server) + assert.NotEmpty(t, server.ListTools()) // Check that tools are registered tools := server.ListTools() @@ -1528,3 +1529,36 @@ func TestToolWebCloseTab(t *testing.T) { _, err = tool.ConvertActionToCallToolRequest(invalidAction) assert.Error(t, err) } + +func TestPreMarkOperationConfiguration(t *testing.T) { + // Test that pre_mark_operation is configurable and not hardcoded + server := NewMCPServer() + + // Get the tap_xy tool + tapTool := server.GetToolByAction(option.ACTION_TapXY) + assert.NotNil(t, tapTool) + + // Test conversion with pre_mark_operation enabled + actionWithPreMark := MobileAction{ + Method: option.ACTION_TapXY, + Params: []float64{0.5, 0.5}, + ActionOptions: *option.NewActionOptions(option.WithPreMarkOperation(true)), + } + + request, err := tapTool.ConvertActionToCallToolRequest(actionWithPreMark) + assert.NoError(t, err) + assert.Equal(t, true, request.Params.Arguments["pre_mark_operation"]) + + // 
Test conversion without pre_mark_operation + actionWithoutPreMark := MobileAction{ + Method: option.ACTION_TapXY, + Params: []float64{0.5, 0.5}, + ActionOptions: *option.NewActionOptions(option.WithPreMarkOperation(false)), + } + + request2, err := tapTool.ConvertActionToCallToolRequest(actionWithoutPreMark) + assert.NoError(t, err) + // Should not have pre_mark_operation in arguments when false + _, exists := request2.Params.Arguments["pre_mark_operation"] + assert.False(t, exists) +}