feat: add AI Querier module with custom output schema support and refactor common model calling logic

- Add new AI Querier module for structured information extraction from screenshots - Support custom output schema for structured data response - Implement automatic type conversion and data validation - Add comprehensive test suite with various data structure examples - Refactor callModelWithLogging to utils.go as shared function for planner, asserter, and querier - Eliminate code duplication across AI modules (30+ lines of repeated code) - Improve maintainability with unified logging and timing logic - Add environment variable checks in test setup to handle missing API keys gracefully Key features: - Custom output schema support with JSON Schema generation - Automatic data type conversion with reflection - Fallback mechanisms for robust parsing - Comprehensive documentation and usage examples - Backward compatibility with existing functionality
2026-06-03 06:49:38 +08:00 · 2025-06-10 20:41:35 +08:00
parent fa9a53d2ae
commit 7c45acd061
10 changed files with 1495 additions and 22 deletions
--- a/uixt/ai/asserter.go
+++ b/uixt/ai/asserter.go
@@ -3,7 +3,6 @@ package ai
 import (
 	"context"
 	"fmt"
-	"time"

 	"github.com/cloudwego/eino-ext/components/model/openai"
 	openai2 "github.com/cloudwego/eino-ext/libs/acl/openai"
@@ -15,7 +14,6 @@ import (
 	"github.com/httprunner/httprunner/v5/uixt/option"
 	"github.com/httprunner/httprunner/v5/uixt/types"
 	"github.com/pkg/errors"
-	"github.com/rs/zerolog/log"
 )

 // IAsserter interface defines the contract for assertion operations
@@ -128,15 +126,11 @@ Here is the assertion. Please tell whether it is truthy according to the screens
 	a.history.Append(userMsg)

 	// Call model service, generate response
-	logRequest(a.history)
-	startTime := time.Now()
-	message, err := a.model.Generate(ctx, a.history)
-	log.Debug().Float64("elapsed(s)", time.Since(startTime).Seconds()).
-		Str("model", string(a.modelConfig.ModelType)).Msg("call model service for assertion")
+	message, err := callModelWithLogging(ctx, a.model, a.history,
+		a.modelConfig.ModelType, "assertion")
 	if err != nil {
 		return nil, errors.Wrap(code.LLMRequestServiceError, err.Error())
 	}
-	logResponse(message)

 	// Parse result
 	result, err := parseAssertionResult(message.Content)
--- a/uixt/ai/asserter_test.go
+++ b/uixt/ai/asserter_test.go
@@ -12,7 +12,7 @@ import (
 )

 func createAsserter(t *testing.T) *Asserter {
-	modelConfig, err := GetModelConfig(option.DOUBAO_1_5_UI_TARS_250428)
+	modelConfig, err := GetModelConfig(option.DOUBAO_1_5_UI_TARS_250328)
 	require.NoError(t, err)
 	asserter, err := NewAsserter(context.Background(), modelConfig)
 	require.NoError(t, err)
--- a/uixt/ai/planner.go
+++ b/uixt/ai/planner.go
@@ -2,7 +2,6 @@ package ai

 import (
 	"context"
-	"time"

 	"github.com/cloudwego/eino-ext/components/model/openai"
 	"github.com/cloudwego/eino/components/model"
@@ -116,15 +115,11 @@ func (p *Planner) Call(ctx context.Context, opts *PlanningOptions) (result *Plan
 	p.history.Append(opts.Message)

 	// call model service, generate response
-	logRequest(p.history)
-	startTime := time.Now()
-	message, err := p.model.Generate(ctx, p.history)
-	log.Debug().Float64("elapsed(s)", time.Since(startTime).Seconds()).
-		Str("model", string(p.modelConfig.ModelType)).Msg("call model service for planning")
+	message, err := callModelWithLogging(ctx, p.model, p.history,
+		p.modelConfig.ModelType, "planning")
 	if err != nil {
 		return nil, errors.Wrap(code.LLMRequestServiceError, err.Error())
 	}
-	logResponse(message)

 	defer func() {
 		// Extract usage information if available
@@ -174,7 +169,6 @@ func (p *Planner) Call(ctx context.Context, opts *PlanningOptions) (result *Plan
 	log.Info().
 		Interface("thought", result.Thought).
 		Interface("tool_calls", result.ToolCalls).
-		Float64("elapsed(s)", time.Since(startTime).Seconds()).
 		Msg("get VLM planning result")
 	return result, nil
 }
--- a/uixt/ai/planner_test.go
+++ b/uixt/ai/planner_test.go
@@ -29,7 +29,7 @@ func TestVLMPlanning(t *testing.T) {

 	userInstruction += "\n\n请基于以上游戏规则，给出下一步可点击的两个图标坐标"

-	modelConfig, err := GetModelConfig(option.DOUBAO_1_5_UI_TARS_250428)
+	modelConfig, err := GetModelConfig(option.DOUBAO_1_5_UI_TARS_250328)
 	require.NoError(t, err)

 	planner, err := NewPlanner(context.Background(), modelConfig)
@@ -72,7 +72,7 @@ func TestXHSPlanning(t *testing.T) {

 	userInstruction := "点击第二个帖子的作者头像"

-	modelConfig, err := GetModelConfig(option.DOUBAO_1_5_UI_TARS_250428)
+	modelConfig, err := GetModelConfig(option.DOUBAO_1_5_UI_TARS_250328)
 	require.NoError(t, err)

 	planner, err := NewPlanner(context.Background(), modelConfig)
@@ -115,7 +115,7 @@ func TestChatList(t *testing.T) {

 	userInstruction := "请结合图片的文字信息，请告诉我一共有多少个群聊，哪些群聊右下角有绿点"

-	modelConfig, err := GetModelConfig(option.DOUBAO_1_5_UI_TARS_250428)
+	modelConfig, err := GetModelConfig(option.DOUBAO_1_5_UI_TARS_250328)
 	require.NoError(t, err)

 	planner, err := NewPlanner(context.Background(), modelConfig)
@@ -147,7 +147,7 @@ func TestChatList(t *testing.T) {

 func TestHandleSwitch(t *testing.T) {
 	userInstruction := "检查发送框下方的联网搜索开关，蓝色为开启状态，灰色为关闭状态；若开关处于关闭状态，则点击进行开启"
-	modelConfig, err := GetModelConfig(option.DOUBAO_1_5_UI_TARS_250428)
+	modelConfig, err := GetModelConfig(option.DOUBAO_1_5_UI_TARS_250328)
 	require.NoError(t, err)

 	planner, err := NewPlanner(context.Background(), modelConfig)
--- a/uixt/ai/querier.go
+++ b/uixt/ai/querier.go
@@ -0,0 +1,515 @@
+package ai
+
+import (
+	"context"
+	"fmt"
+	"reflect"
+
+	"github.com/cloudwego/eino-ext/components/model/openai"
+	openai2 "github.com/cloudwego/eino-ext/libs/acl/openai"
+	"github.com/cloudwego/eino/components/model"
+	"github.com/cloudwego/eino/schema"
+	"github.com/getkin/kin-openapi/openapi3gen"
+	"github.com/httprunner/httprunner/v5/code"
+	"github.com/httprunner/httprunner/v5/internal/json"
+	"github.com/httprunner/httprunner/v5/uixt/option"
+	"github.com/httprunner/httprunner/v5/uixt/types"
+	"github.com/pkg/errors"
+)
+
+// IQuerier interface defines the contract for query operations.
+// Querier in this file provides a Query method with this signature.
+type IQuerier interface {
+	// Query runs the query described by opts and returns the parsed result.
+	Query(ctx context.Context, opts *QueryOptions) (*QueryResult, error)
+}
+
+// QueryOptions represents the input options for query.
+//
+// Query and Screenshot are mandatory: validateQueryInput rejects empty values.
+// When OutputSchema is non-nil, Querier.Query takes the custom-schema path,
+// which parses the model reply into a new instance of the schema's type.
+// Size is not read by the querier code in this file.
+type QueryOptions struct {
+	Query        string      `json:"query"`                  // The query text to extract information
+	Screenshot   string      `json:"screenshot"`             // Base64 encoded screenshot
+	Size         types.Size  `json:"size"`                   // Screen dimensions
+	OutputSchema interface{} `json:"outputSchema,omitempty"` // Custom output schema for structured response
+}
+
+// QueryResult represents the response from an AI query.
+//
+// Content and Thought are always populated by the parsers in this file
+// (falling back to the raw model reply or to default texts). Data is only set
+// on the custom-schema path; after a successful typed parse it holds a pointer
+// to a new value of the OutputSchema type.
+type QueryResult struct {
+	Content string      `json:"content"`        // The extracted content/information
+	Thought string      `json:"thought"`        // The reasoning process
+	Data    interface{} `json:"data,omitempty"` // Structured data when OutputSchema is provided
+}
+
+// Querier handles query operations using different AI models.
+// Instances are created with NewQuerier.
+type Querier struct {
+	modelConfig  *ModelConfig               // configuration the chat model was created from
+	model        model.ToolCallingChatModel // chat model used for standard (schema-less) queries
+	systemPrompt string                     // system prompt; includes the UI-TARS reply format for UI-TARS models
+	history      ConversationHistory        // conversation of the latest standard query; rebuilt on each such call
+}
+
+// NewQuerier builds a Querier backed by an OpenAI-compatible chat model.
+//
+// For UI-TARS model types the expected JSON reply layout is appended to the
+// system prompt. For every other model type a JSON-schema response format
+// (content / thought / optional error) is installed on
+// modelConfig.ChatModelConfig before the chat model is created.
+//
+// It returns an error wrapping code.LLMPrepareRequestError when the schema
+// cannot be generated or the chat model cannot be created.
+func NewQuerier(ctx context.Context, modelConfig *ModelConfig) (*Querier, error) {
+	prompt := defaultQueryPrompt
+
+	if !option.IS_UI_TARS(modelConfig.ModelType) {
+		// default reply layout requested from models with structured output
+		type OutputFormat struct {
+			Content string `json:"content"`
+			Thought string `json:"thought"`
+			Error   string `json:"error,omitempty"`
+		}
+		schemaRef, err := openapi3gen.NewSchemaRefForValue(&OutputFormat{}, nil)
+		if err != nil {
+			return nil, errors.Wrap(code.LLMPrepareRequestError, err.Error())
+		}
+
+		// ask the model service for a structured (JSON schema) response
+		jsonSchema := &openai2.ChatCompletionResponseFormatJSONSchema{
+			Name:        "query_result",
+			Description: "data that describes query result",
+			Schema:      schemaRef.Value,
+			Strict:      false,
+		}
+		modelConfig.ChatModelConfig.ResponseFormat = &openai2.ChatCompletionResponseFormat{
+			Type:       openai2.ChatCompletionResponseFormatTypeJSONSchema,
+			JSONSchema: jsonSchema,
+		}
+	} else {
+		prompt += "\n" + uiTarsQueryResponseFormat
+	}
+
+	chatModel, err := openai.NewChatModel(ctx, modelConfig.ChatModelConfig)
+	if err != nil {
+		return nil, errors.Wrap(code.LLMPrepareRequestError, err.Error())
+	}
+
+	return &Querier{
+		modelConfig:  modelConfig,
+		model:        chatModel,
+		systemPrompt: prompt,
+	}, nil
+}
+
+// NOTE: callModelWithLogging (model call with automatic logging and timing) is
+// defined in utils.go and shared by the planner, asserter, and querier.
+
+// Query extracts the information requested in opts.Query from opts.Screenshot.
+//
+// Requests carrying an OutputSchema are delegated to queryWithCustomSchema.
+// Otherwise the conversation history is rebuilt from the system prompt, the
+// screenshot and query are sent as one user message, and the model reply is
+// parsed by parseQueryResult and recorded in the history.
+//
+// Errors: invalid options are wrapped with "validate query parameters failed",
+// a failed model call with code.LLMRequestServiceError, and a failed parse
+// with code.LLMParseQueryResponseError.
+func (q *Querier) Query(ctx context.Context, opts *QueryOptions) (*QueryResult, error) {
+	if err := validateQueryInput(opts); err != nil {
+		return nil, errors.Wrap(err, "validate query parameters failed")
+	}
+
+	// a custom output schema is served by a dedicated code path
+	if opts.OutputSchema != nil {
+		return q.queryWithCustomSchema(ctx, opts)
+	}
+
+	// user message parts: the screenshot first, then the query text
+	screenshotPart := schema.ChatMessagePart{
+		Type: schema.ChatMessagePartTypeImageURL,
+		ImageURL: &schema.ChatMessageImageURL{
+			URL:    opts.Screenshot,
+			Detail: schema.ImageURLDetailAuto,
+		},
+	}
+	queryPart := schema.ChatMessagePart{
+		Type: schema.ChatMessagePartTypeText,
+		Text: fmt.Sprintf(`
+Here is the query. Please extract the requested information from the screenshot.
+=====================================
+%s
+=====================================
+  `, opts.Query),
+	}
+
+	// every standard query starts from a fresh conversation
+	q.history = ConversationHistory{
+		{
+			Role:    schema.System,
+			Content: q.systemPrompt,
+		},
+	}
+	q.history.Append(&schema.Message{
+		Role:         schema.User,
+		MultiContent: []schema.ChatMessagePart{screenshotPart, queryPart},
+	})
+
+	reply, err := callModelWithLogging(ctx, q.model, q.history,
+		q.modelConfig.ModelType, "query")
+	if err != nil {
+		return nil, errors.Wrap(code.LLMRequestServiceError, err.Error())
+	}
+
+	parsed, err := parseQueryResult(reply.Content)
+	if err != nil {
+		return nil, errors.Wrap(code.LLMParseQueryResponseError, err.Error())
+	}
+
+	// keep the assistant reply so the history reflects the full exchange
+	q.history.Append(&schema.Message{
+		Role:    schema.Assistant,
+		Content: reply.Content,
+	})
+
+	return parsed, nil
+}
+
+// validateQueryInput checks that opts carries the minimum data a query needs.
+//
+// It returns an error wrapping code.LLMPrepareRequestError when opts is nil,
+// when the query text is empty, or when the screenshot is empty; otherwise it
+// returns nil.
+func validateQueryInput(opts *QueryOptions) error {
+	// A nil options pointer would otherwise panic on the field accesses below.
+	if opts == nil {
+		return errors.Wrap(code.LLMPrepareRequestError, "query options is required")
+	}
+	if opts.Query == "" {
+		return errors.Wrap(code.LLMPrepareRequestError, "query text is required")
+	}
+	if opts.Screenshot == "" {
+		return errors.Wrap(code.LLMPrepareRequestError, "screenshot is required")
+	}
+	return nil
+}
+
+// parseQueryResult turns the raw model reply into a QueryResult.
+//
+// The JSON portion of the reply (as located by extractJSONFromContent) is
+// decoded into a QueryResult. When no JSON is found, or decoding fails, the
+// whole reply is returned as Content with an explanatory Thought, so this
+// function never reports an error for malformed replies.
+func parseQueryResult(content string) (*QueryResult, error) {
+	// rawReply wraps the untouched model reply together with a reason.
+	rawReply := func(thought string) (*QueryResult, error) {
+		return &QueryResult{
+			Content: content,
+			Thought: thought,
+		}, nil
+	}
+
+	jsonContent := extractJSONFromContent(content)
+	if jsonContent == "" {
+		// the model answered with plain text instead of JSON
+		return rawReply("Direct response from model")
+	}
+
+	parsed := new(QueryResult)
+	if err := json.Unmarshal([]byte(jsonContent), parsed); err != nil {
+		// malformed JSON: hand back the reply as plain text
+		return rawReply("Failed to parse as JSON, returning raw content")
+	}
+
+	return parsed, nil
+}
+
+// queryWithCustomSchema performs a query whose reply must follow the caller's
+// OutputSchema.
+//
+// For non UI-TARS models a JSON schema is generated from opts.OutputSchema and
+// installed as the response format on a private copy of the chat model
+// configuration, so the configuration held by the Querier is left untouched.
+// A dedicated chat model is created for the call. The conversation (system
+// prompt, user message, assistant reply) replaces q.history, mirroring Query.
+//
+// Errors: schema generation or model creation failures are wrapped with
+// code.LLMPrepareRequestError, a failed model call with
+// code.LLMRequestServiceError, and a failed parse with
+// code.LLMParseQueryResponseError.
+func (q *Querier) queryWithCustomSchema(ctx context.Context, opts *QueryOptions) (*QueryResult, error) {
+	// Shallow copy of the model config; ChatModelConfig is copied separately
+	// below before it is modified.
+	modelConfig := *q.modelConfig
+
+	// q.systemPrompt already contains the UI-TARS reply format when the model
+	// is UI-TARS (see NewQuerier), so it must not be appended a second time.
+	systemPrompt := q.systemPrompt
+
+	if !option.IS_UI_TARS(modelConfig.ModelType) {
+		// Generate schema from the provided output schema
+		outputFormatSchema, err := openapi3gen.NewSchemaRefForValue(opts.OutputSchema, nil)
+		if err != nil {
+			return nil, errors.Wrap(code.LLMPrepareRequestError, err.Error())
+		}
+
+		// Copy the chat model config so the custom response format does not
+		// leak into the configuration shared with q.modelConfig.
+		chatModelConfig := *q.modelConfig.ChatModelConfig
+		chatModelConfig.ResponseFormat = &openai2.ChatCompletionResponseFormat{
+			Type: openai2.ChatCompletionResponseFormatTypeJSONSchema,
+			JSONSchema: &openai2.ChatCompletionResponseFormatJSONSchema{
+				Name:        "custom_query_result",
+				Description: "custom structured data response",
+				Schema:      outputFormatSchema.Value,
+				Strict:      false,
+			},
+		}
+		modelConfig.ChatModelConfig = &chatModelConfig
+
+		// Add instruction for structured output
+		systemPrompt += "\n\nPlease respond with structured data according to the specified schema. Include both the structured data and your reasoning process."
+	}
+
+	// Create a new model instance with custom schema
+	chatModel, err := openai.NewChatModel(ctx, modelConfig.ChatModelConfig)
+	if err != nil {
+		return nil, errors.Wrap(code.LLMPrepareRequestError, err.Error())
+	}
+
+	// Reset history for each new query
+	q.history = ConversationHistory{
+		{
+			Role:    schema.System,
+			Content: systemPrompt,
+		},
+	}
+
+	// Create user message with screenshot and query
+	userMsg := &schema.Message{
+		Role: schema.User,
+		MultiContent: []schema.ChatMessagePart{
+			{
+				Type: schema.ChatMessagePartTypeImageURL,
+				ImageURL: &schema.ChatMessageImageURL{
+					URL:    opts.Screenshot,
+					Detail: schema.ImageURLDetailAuto,
+				},
+			},
+			{
+				Type: schema.ChatMessagePartTypeText,
+				Text: fmt.Sprintf(`
+Here is the query. Please extract the requested information from the screenshot and return it in the specified structured format.
+=====================================
+%s
+=====================================
+  `, opts.Query),
+			},
+		},
+	}
+
+	// Append user message to history
+	q.history.Append(userMsg)
+
+	// Call model service with logging
+	message, err := callModelWithLogging(ctx, chatModel, q.history, modelConfig.ModelType, "custom schema query")
+	if err != nil {
+		return nil, errors.Wrap(code.LLMRequestServiceError, err.Error())
+	}
+
+	// Parse result with custom schema
+	result, err := parseCustomSchemaResult(message.Content, opts.OutputSchema)
+	if err != nil {
+		return nil, errors.Wrap(code.LLMParseQueryResponseError, err.Error())
+	}
+
+	// Append assistant message to the history of this very conversation
+	q.history.Append(&schema.Message{
+		Role:    schema.Assistant,
+		Content: message.Content,
+	})
+
+	return result, nil
+}
+
+// setDefaultFieldValue writes defaultValue into the field called fieldName of
+// structValue. Nothing happens unless that field exists, is settable and has
+// string kind. structValue must be a struct value (reflect's FieldByName
+// requires it).
+func setDefaultFieldValue(structValue reflect.Value, fieldName, defaultValue string) {
+	target := structValue.FieldByName(fieldName)
+	if !target.IsValid() || !target.CanSet() {
+		return
+	}
+	if target.Kind() != reflect.String {
+		return
+	}
+	target.SetString(defaultValue)
+}
+
+// ensureDefaultValues fills empty Content / Thought on result with fixed
+// default texts. When structuredData is a pointer to a struct, the same
+// default is mirrored into its string fields named Content / Thought for
+// whichever result field now equals the default text.
+func ensureDefaultValues(result *QueryResult, structuredData interface{}) {
+	const (
+		defaultContent = "Structured data extracted successfully"
+		defaultThought = "Parsed structured response according to custom schema"
+	)
+
+	// defaults on the QueryResult itself
+	if result.Content == "" {
+		result.Content = defaultContent
+	}
+	if result.Thought == "" {
+		result.Thought = defaultThought
+	}
+
+	// only a pointer to a struct can be updated through reflection
+	if structuredData == nil {
+		return
+	}
+	pointer := reflect.ValueOf(structuredData)
+	if pointer.Kind() != reflect.Ptr {
+		return
+	}
+	target := pointer.Elem()
+	if !target.IsValid() || target.Kind() != reflect.Struct {
+		return
+	}
+
+	if result.Content == defaultContent {
+		setDefaultFieldValue(target, "Content", defaultContent)
+	}
+	if result.Thought == defaultThought {
+		setDefaultFieldValue(target, "Thought", defaultThought)
+	}
+}
+
+// parseCustomSchemaResult parses the model response with custom schema.
+//
+// Parsing is attempted in this order:
+//  1. decode the JSON part of content into a new value of outputSchema's type;
+//  2. decode into generic data and convert it with convertToSchemaType;
+//  3. keep a generic JSON object as Data;
+//  4. return the raw content as plain text.
+//
+// Content and Thought are taken from the decoded data when present and
+// otherwise filled with defaults by ensureDefaultValues. outputSchema may be
+// any JSON-decodable type (struct, map, slice, ...) or nil; with nil only the
+// generic steps apply. The returned error is always nil.
+func parseCustomSchemaResult(content string, outputSchema interface{}) (*QueryResult, error) {
+	// Extract JSON content from response
+	jsonContent := extractJSONFromContent(content)
+	if jsonContent == "" {
+		// If no JSON found, treat the entire content as the result
+		return &QueryResult{
+			Content: content,
+			Thought: "Direct response from model",
+		}, nil
+	}
+	rawJSON := []byte(jsonContent)
+
+	// A nil schema carries no type information (reflect.TypeOf would return
+	// nil), so the typed attempts are skipped for it.
+	if outputSchema != nil {
+		if result, ok := parseIntoSchemaType(rawJSON, outputSchema); ok {
+			return result, nil
+		}
+	}
+
+	// Fallback: try to parse as generic data and then convert
+	var structuredData interface{}
+	if err := json.Unmarshal(rawJSON, &structuredData); err == nil {
+		dataMap, isMap := structuredData.(map[string]interface{})
+
+		if outputSchema != nil {
+			if convertedData, err := convertToSchemaType(structuredData, outputSchema); err == nil {
+				result := &QueryResult{
+					Data: convertedData, // Store the converted typed data
+				}
+				fillContentAndThought(result, dataMap)
+				ensureDefaultValues(result, convertedData)
+				return result, nil
+			}
+		}
+
+		// If conversion failed, fall back to storing the generic object
+		if isMap {
+			result := &QueryResult{
+				Data: structuredData,
+			}
+			fillContentAndThought(result, dataMap)
+			ensureDefaultValues(result, nil)
+			return result, nil
+		}
+	}
+
+	// Fallback to treating as plain text
+	return &QueryResult{
+		Content: content,
+		Thought: "Failed to parse as structured data, returning raw content",
+	}, nil
+}
+
+// parseIntoSchemaType decodes rawJSON into a new value of outputSchema's type
+// (one pointer level is dereferenced) and wraps it in a QueryResult whose Data
+// is a pointer to that value. It reports false when decoding fails.
+// outputSchema must be non-nil.
+func parseIntoSchemaType(rawJSON []byte, outputSchema interface{}) (*QueryResult, bool) {
+	schemaType := reflect.TypeOf(outputSchema)
+	if schemaType.Kind() == reflect.Ptr {
+		schemaType = schemaType.Elem()
+	}
+
+	newInstance := reflect.New(schemaType).Interface()
+	if err := json.Unmarshal(rawJSON, newInstance); err != nil {
+		return nil, false
+	}
+
+	result := &QueryResult{
+		Data: newInstance, // Store the typed pointer directly
+	}
+
+	// FieldByName panics on non-struct values, so only structs are inspected
+	// for the standard Content / Thought fields.
+	if schemaValue := reflect.ValueOf(newInstance).Elem(); schemaValue.Kind() == reflect.Struct {
+		if contentField := schemaValue.FieldByName("Content"); contentField.IsValid() && contentField.Kind() == reflect.String {
+			result.Content = contentField.String()
+		}
+		if thoughtField := schemaValue.FieldByName("Thought"); thoughtField.IsValid() && thoughtField.Kind() == reflect.String {
+			result.Thought = thoughtField.String()
+		}
+	}
+
+	// If no standard fields found, try to extract from map representation
+	if result.Content == "" && result.Thought == "" {
+		var dataMap map[string]interface{}
+		if err := json.Unmarshal(rawJSON, &dataMap); err == nil {
+			fillContentAndThought(result, dataMap)
+		}
+	}
+
+	ensureDefaultValues(result, newInstance)
+	return result, true
+}
+
+// fillContentAndThought copies string values stored under the "content" and
+// "thought" keys of dataMap into result; missing or non-string entries are
+// ignored. A nil dataMap is allowed.
+func fillContentAndThought(result *QueryResult, dataMap map[string]interface{}) {
+	if contentStr, ok := dataMap["content"].(string); ok {
+		result.Content = contentStr
+	}
+	if thoughtStr, ok := dataMap["thought"].(string); ok {
+		result.Thought = thoughtStr
+	}
+}
+
+// convertToSchemaType re-encodes data as JSON and decodes it into a new value
+// of outputSchema's type (one pointer level is dereferenced). On success the
+// returned value is a pointer to that new value. outputSchema must be non-nil.
+func convertToSchemaType(data interface{}, outputSchema interface{}) (interface{}, error) {
+	targetType := reflect.TypeOf(outputSchema)
+	if targetType.Kind() == reflect.Ptr {
+		targetType = targetType.Elem()
+	}
+
+	// reflect.New yields a *T whose pointee is the zero value of the schema type
+	target := reflect.New(targetType).Interface()
+
+	// round-trip through JSON to perform the conversion
+	encoded, err := json.Marshal(data)
+	if err != nil {
+		return nil, errors.Wrap(err, "failed to marshal data to JSON")
+	}
+	if err = json.Unmarshal(encoded, target); err != nil {
+		return nil, errors.Wrap(err, "failed to unmarshal data to target type")
+	}
+
+	return target, nil
+}
+
+// ConvertQueryResultData converts QueryResult.Data to the specified type T.
+// This is a helper function for type-safe conversion of the structured data.
+//
+// Note: When using QueryOptions.OutputSchema, the Data field is automatically
+// converted to the correct type, so this function is typically not needed.
+// This function is mainly useful for:
+// 1. Converting data when OutputSchema was not used
+// 2. Converting to a different type than the original OutputSchema
+// 3. Handling legacy code or edge cases
+//
+// When Data already is a non-nil *T it is returned as is (no copy). Any other
+// value is converted through a JSON marshal/unmarshal round trip.
+// An error is returned when result is nil, when it carries no data (nil Data
+// or a nil *T), or when the JSON round trip fails.
+func ConvertQueryResultData[T any](result *QueryResult) (*T, error) {
+	if result == nil || result.Data == nil {
+		return nil, errors.New("no structured data available")
+	}
+
+	// If Data is already of the correct type, return it directly
+	if typedData, ok := result.Data.(*T); ok {
+		if typedData == nil {
+			// a typed nil pointer holds no data; do not hand out (nil, nil)
+			return nil, errors.New("no structured data available")
+		}
+		return typedData, nil
+	}
+
+	// Fallback: try to convert via JSON marshaling/unmarshaling
+	jsonData, err := json.Marshal(result.Data)
+	if err != nil {
+		return nil, errors.Wrap(err, "failed to marshal data to JSON")
+	}
+
+	var converted T
+	if err := json.Unmarshal(jsonData, &converted); err != nil {
+		return nil, errors.Wrap(err, "failed to unmarshal data to target type")
+	}
+
+	return &converted, nil
+}
--- a/uixt/ai/querier.md
+++ b/uixt/ai/querier.md
@@ -0,0 +1,299 @@
+# HttpRunner AI Querier - 自定义输出格式功能
+
+## 功能概述
+
+HttpRunner 的 AI Querier 模块支持自定义输出格式功能，允许用户指定特定的数据结构，让 AI 模型返回结构化的数据响应。适用于：
+
+- **UI 元素分析**：自动化测试中的界面元素提取
+- **游戏界面分析**：网格类游戏（连连看、消消乐、2048等）数据提取
+- **表单数据提取**：从表单截图中提取结构化信息
+- **图像内容分析**：任何需要从截图中提取结构化信息的场景
+
+## 核心数据结构
+
+```go
+// QueryOptions - 查询选项
+type QueryOptions struct {
+    Query        string      `json:"query"`                  // 查询文本
+    Screenshot   string      `json:"screenshot"`             // Base64编码的截图
+    Size         types.Size  `json:"size"`                   // 屏幕尺寸
+    OutputSchema interface{} `json:"outputSchema,omitempty"` // 自定义输出格式（可选）
+}
+
+// QueryResult - 查询结果
+type QueryResult struct {
+    Content string      `json:"content"`        // 人类可读的分析结果
+    Thought string      `json:"thought"`        // AI 推理过程
+    Data    interface{} `json:"data,omitempty"` // 结构化数据（使用OutputSchema时自动转换为指定类型）
+}
+```
+
+## 基本用法
+
+### 标准查询
+
+```go
+// 创建查询器
+modelConfig, err := ai.GetModelConfig(option.OPENAI_GPT_4O)
+querier, err := ai.NewQuerier(ctx, modelConfig)
+
+// 执行查询
+result, err := querier.Query(ctx, &ai.QueryOptions{
+    Query:      "请分析这张截图中的内容",
+    Screenshot: screenshot,
+    Size:       size,
+    // 不指定 OutputSchema
+})
+
+fmt.Printf("分析结果: %s\n", result.Content)
+fmt.Printf("推理过程: %s\n", result.Thought)
+// result.Data 为 nil
+```
+
+### 自定义格式查询
+
+```go
+// 定义输出结构
+type GameAnalysis struct {
+    Content string   `json:"content"` // 分析描述
+    Thought string   `json:"thought"` // 思考过程
+    Rows    int      `json:"rows"`    // 行数
+    Cols    int      `json:"cols"`    // 列数
+    Icons   []string `json:"icons"`   // 图标类型
+}
+
+// 执行查询
+result, err := querier.Query(ctx, &ai.QueryOptions{
+    Query:        "分析这个游戏界面的网格结构和图标类型",
+    Screenshot:   screenshot,
+    Size:         size,
+    OutputSchema: GameAnalysis{}, // 指定输出格式
+})
+
+// 直接类型断言获取结构化数据
+if gameData, ok := result.Data.(*GameAnalysis); ok {
+    fmt.Printf("行数: %d, 列数: %d\n", gameData.Rows, gameData.Cols)
+    fmt.Printf("图标类型: %v\n", gameData.Icons)
+}
+```
+
+## 应用场景示例
+
+### UI 元素分析
+
+```go
+type UIAnalysis struct {
+    Content  string      `json:"content"`
+    Thought  string      `json:"thought"`
+    Elements []UIElement `json:"elements"`
+}
+
+type UIElement struct {
+    Type     string      `json:"type"`        // button, text, input等
+    Text     string      `json:"text"`        // 文本内容
+    BoundBox BoundingBox `json:"boundBox"`    // 位置坐标
+    Clickable bool       `json:"clickable"`   // 是否可点击
+}
+
+type BoundingBox struct {
+    X      int `json:"x"`
+    Y      int `json:"y"`
+    Width  int `json:"width"`
+    Height int `json:"height"`
+}
+```
+
+### 网格游戏分析
+
+```go
+type GridGame struct {
+    Content string     `json:"content"`
+    Thought string     `json:"thought"`
+    Grid    [][]Cell   `json:"grid"`       // 网格数据
+    Stats   Statistics `json:"statistics"` // 统计信息
+}
+
+type Cell struct {
+    Type  string `json:"type"`  // 单元格类型
+    Value string `json:"value"` // 单元格值
+    Row   int    `json:"row"`   // 行索引
+    Col   int    `json:"col"`   // 列索引
+}
+
+type Statistics struct {
+    TotalCells  int `json:"totalCells"`
+    UniqueTypes int `json:"uniqueTypes"`
+}
+```
+
+### 表单数据提取
+
+```go
+type FormAnalysis struct {
+    Content string      `json:"content"`
+    Thought string      `json:"thought"`
+    Fields  []FormField `json:"fields"`
+    Actions []Action    `json:"actions"`
+}
+
+type FormField struct {
+    Label    string      `json:"label"`    // 字段标签
+    Type     string      `json:"type"`     // 字段类型
+    Value    string      `json:"value"`    // 当前值
+    Required bool        `json:"required"` // 是否必填
+    BoundBox BoundingBox `json:"boundBox"` // 位置
+}
+```
+
+## 核心特性
+
+### 自动类型转换
+- 指定 `OutputSchema` 时，`QueryResult.Data` 自动转换为指定类型
+- 支持直接类型断言：`result.Data.(*YourType)`
+- 无需手动调用转换函数
+
+### 多级回退机制
+1. 优先解析为指定的结构化类型
+2. 失败时尝试通用JSON解析
+3. 最终回退到纯文本响应
+
+### 向后兼容
+- 不指定 `OutputSchema` 时行为不变
+- 现有代码无需修改
+
+## 最佳实践
+
+### 1. 结构体设计
+
+```go
+// 推荐：包含标准字段
+type YourSchema struct {
+    Content string `json:"content"` // 必须：人类可读描述
+    Thought string `json:"thought"` // 必须：AI推理过程
+    // 自定义字段...
+    Data    CustomData `json:"data"`
+}
+
+// 使用描述性的JSON标签
+type Element struct {
+    Type     string `json:"elementType"`   // 清晰的字段名
+    Position Point  `json:"gridPosition"`  // 描述性标签
+    Visible  bool   `json:"isVisible"`     // 布尔值清晰性
+}
+```
+
+### 2. 查询指令
+
+```go
+// 推荐：详细的查询指令
+opts := &ai.QueryOptions{
+    Query: `分析这张截图并提供结构化信息：
+1. 识别界面类型和主要元素
+2. 提取所有可交互元素的位置和属性
+3. 统计各类元素的数量`,
+    Screenshot:   screenshot,
+    Size:         size,
+    OutputSchema: YourSchema{},
+}
+```
+
+### 3. 错误处理
+
+```go
+result, err := querier.Query(ctx, opts)
+if err != nil {
+    return err
+}
+
+// 类型断言
+if data, ok := result.Data.(*YourSchema); ok {
+    // 使用结构化数据
+    processData(data)
+} else {
+    // 回退到文本结果
+    log.Printf("结构化解析失败，使用文本结果: %s", result.Content)
+}
+```
+
+## 完整示例
+
+```go
+package main
+
+import (
+    "context"
+    "fmt"
+    "log"
+
+    "github.com/httprunner/httprunner/v5/internal/builtin"
+    "github.com/httprunner/httprunner/v5/uixt/ai"
+    "github.com/httprunner/httprunner/v5/uixt/option"
+)
+
+type ScreenAnalysis struct {
+    Content    string   `json:"content"`
+    Thought    string   `json:"thought"`
+    Elements   []string `json:"elements"`
+    Categories []string `json:"categories"`
+    Count      int      `json:"count"`
+}
+
+func main() {
+    ctx := context.Background()
+
+    // 创建查询器
+    modelConfig, err := ai.GetModelConfig(option.OPENAI_GPT_4O)
+    if err != nil {
+        log.Fatal(err)
+    }
+
+    querier, err := ai.NewQuerier(ctx, modelConfig)
+    if err != nil {
+        log.Fatal(err)
+    }
+
+    // 加载截图
+    screenshot, size, err := builtin.LoadImage("screenshot.png")
+    if err != nil {
+        log.Fatal(err)
+    }
+
+    // 执行结构化查询
+    result, err := querier.Query(ctx, &ai.QueryOptions{
+        Query:        "分析截图中的UI元素，提取元素类型和分类信息",
+        Screenshot:   screenshot,
+        Size:         size,
+        OutputSchema: ScreenAnalysis{},
+    })
+    if err != nil {
+        log.Fatal(err)
+    }
+
+    // 使用结构化数据
+    if analysis, ok := result.Data.(*ScreenAnalysis); ok {
+        fmt.Printf("发现 %d 个元素\n", analysis.Count)
+        fmt.Printf("元素类型: %v\n", analysis.Elements)
+        fmt.Printf("分类: %v\n", analysis.Categories)
+    } else {
+        fmt.Printf("文本结果: %s\n", result.Content)
+    }
+}
+```
+
+## 辅助函数
+
+对于特殊情况，提供了类型转换辅助函数：
+
+```go
+// 手动类型转换（通常不需要）
+converted, err := ai.ConvertQueryResultData[YourType](result)
+if err != nil {
+    return err
+}
+```
+
+**注意**：使用 `OutputSchema` 时，`Data` 字段已自动转换为正确类型，通常不需要手动调用此函数。
+
+## 技术限制
+
+- 需要支持结构化输出的AI模型（如 OpenAI GPT-4）
+- 复杂嵌套结构需要清晰的查询指令
+- AI模型可能不总是严格遵循指定格式
+- UI-TARS 模型使用不同的响应格式处理
--- a/uixt/ai/querier_prompts.go
+++ b/uixt/ai/querier_prompts.go
@@ -0,0 +1,20 @@
+package ai
+
+// defaultQueryPrompt is the default query system prompt. NewQuerier uses it as
+// the base of every Querier's system prompt.
+const defaultQueryPrompt = `You are an AI assistant specialized in analyzing images and extracting information. User will provide a screenshot and a query asking for specific information to be extracted from the image. Please analyze the image carefully and provide the requested information.`
+
+// uiTarsQueryResponseFormat is the UI-TARS query response format. It is
+// appended to the system prompt for UI-TARS model types and instructs the
+// model to answer with a JSON object holding "content" and "thought" only.
+const uiTarsQueryResponseFormat = `
+## Output Json String Format
+` + "```" + `
+"{
+  "content": "<<is a string containing the extracted information or analysis result>>",
+  "thought": "<<is a string explaining your analysis process and reasoning. Use Chinese.>>"
+}"
+` + "```" + `
+
+## Rules **MUST** follow
+- Make sure to return **only** the JSON, with **no additional** text or explanations.
+- Use Chinese in ` + "`Thought`" + ` part.
+- You **MUST** strictly follow up the **Output Json String Format**.
+- Provide detailed and accurate information extraction based on the image content.`
--- a/uixt/ai/querier_test.go
+++ b/uixt/ai/querier_test.go
@@ -0,0 +1,617 @@
+package ai
+
+import (
+	"context"
+	"fmt"
+	"testing"
+
+	"github.com/httprunner/httprunner/v5/internal/builtin"
+	"github.com/httprunner/httprunner/v5/uixt/option"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// Data structures for testing custom output schemas
+
+// GameIcon represents a single icon in the game grid.
+// It is the element type of GameGrid.Grid.
+type GameIcon struct {
+	Name string `json:"name"` // Icon name (e.g., "beach_ball", "glove")
+	Row  int    `json:"row"`  // Row position (0-based)
+	Col  int    `json:"col"`  // Column position (0-based)
+}
+
+// GameGrid represents the complete game grid: the icons laid out as a 2D
+// array, the grid dimensions, and the list of distinct icon names.
+// It is embedded as the "data" field of LianliankanResponse.
+type GameGrid struct {
+	Grid  [][]GameIcon `json:"grid"`  // 2D array of game icons
+	Rows  int          `json:"rows"`  // Number of rows
+	Cols  int          `json:"cols"`  // Number of columns
+	Icons []string     `json:"icons"` // List of unique icon names
+}
+
+// LianliankanResponse represents the structured response for lianliankan game analysis.
+// It is passed as the OutputSchema in TestQueryWithCustomSchema.
+type LianliankanResponse struct {
+	Content string   `json:"content"` // Description of the analysis
+	Thought string   `json:"thought"` // Reasoning process
+	Data    GameGrid `json:"data"`    // Structured game grid data
+}
+
+// SimpleGameInfo represents basic game information as a flat structure
+// (no nested types). It is passed as the OutputSchema in
+// TestQueryWithSimpleSchema.
+type SimpleGameInfo struct {
+	Content    string   `json:"content"`    // Description
+	Thought    string   `json:"thought"`    // Reasoning
+	Rows       int      `json:"rows"`       // Number of rows
+	Cols       int      `json:"cols"`       // Number of columns
+	IconTypes  []string `json:"iconTypes"`  // List of icon types
+	TotalIcons int      `json:"totalIcons"` // Total number of icons
+}
+
+// Additional data structures for comprehensive testing
+
+// GameAnalysisResult represents structured analysis of a game interface.
+// It nests Dimensions, Element and Statistics, and is passed as the
+// OutputSchema in TestQueryWithGameAnalysisSchema.
+type GameAnalysisResult struct {
+	Content    string     `json:"content"`    // Human-readable description
+	Thought    string     `json:"thought"`    // AI reasoning process
+	GameType   string     `json:"gameType"`   // Type of game detected
+	Dimensions Dimensions `json:"dimensions"` // Grid dimensions
+	Elements   []Element  `json:"elements"`   // Game elements detected
+	Statistics Statistics `json:"statistics"` // Game statistics
+}
+
+// Dimensions holds the row and column counts of a grid.
+// It is used by GameAnalysisResult.
+type Dimensions struct {
+	Rows int `json:"rows"` // Number of rows
+	Cols int `json:"cols"` // Number of columns
+}
+
+// Element describes one detected game element: its type, its position in the
+// grid and its pixel bounding box. It is used by GameAnalysisResult.
+type Element struct {
+	Type     string      `json:"type"`     // Element type/name
+	Position Position    `json:"position"` // Position in grid
+	BoundBox BoundingBox `json:"boundBox"` // Pixel coordinates
+}
+
+// Position is a 0-based (row, column) location inside a grid.
+type Position struct {
+	Row int `json:"row"` // Row index (0-based)
+	Col int `json:"col"` // Column index (0-based)
+}
+
+// BoundingBox is a rectangle in pixels, given by its top-left corner and its
+// width and height. It is shared by Element and UIElement.
+type BoundingBox struct {
+	X      int `json:"x"`      // Left coordinate
+	Y      int `json:"y"`      // Top coordinate
+	Width  int `json:"width"`  // Width in pixels
+	Height int `json:"height"` // Height in pixels
+}
+
+// Statistics summarizes the detected elements: the total count, the number of
+// distinct types, and a per-type breakdown. It is used by GameAnalysisResult.
+type Statistics struct {
+	TotalElements int         `json:"totalElements"` // Total number of elements
+	UniqueTypes   int         `json:"uniqueTypes"`   // Number of unique element types
+	TypeCounts    []TypeCount `json:"typeCounts"`    // Count of each type
+}
+
+// TypeCount pairs an element type with its number of occurrences.
+// It is the element type of Statistics.TypeCounts.
+type TypeCount struct {
+	Type  string `json:"type"`  // Element type
+	Count int    `json:"count"` // Number of occurrences
+}
+
+// UIElementsResult represents structured analysis of UI elements: the list of
+// elements found plus the categories they fall into. It is passed as the
+// OutputSchema in TestQueryWithUIElementsSchema.
+type UIElementsResult struct {
+	Content    string      `json:"content"`    // Description
+	Thought    string      `json:"thought"`    // Reasoning
+	Elements   []UIElement `json:"elements"`   // UI elements found
+	Categories []string    `json:"categories"` // Categories of elements
+}
+
+// UIElement describes a single UI element: its type, text, description, pixel
+// bounding box, and clickable/visible flags. It is the element type of
+// UIElementsResult.Elements.
+type UIElement struct {
+	Type        string      `json:"type"`        // Element type (button, text, image, etc.)
+	Text        string      `json:"text"`        // Text content if any
+	Description string      `json:"description"` // Element description
+	BoundBox    BoundingBox `json:"boundBox"`    // Pixel coordinates
+	Clickable   bool        `json:"clickable"`   // Whether element is clickable
+	Visible     bool        `json:"visible"`     // Whether element is visible
+}
+
+// Test functions
+
+// TestParseQueryResult runs parseQueryResult over well-formed JSON, JSON
+// wrapped in a markdown fence, plain text, and two kinds of broken JSON.
+// For every case only the Content and Thought fields are compared.
+func TestParseQueryResult(t *testing.T) {
+	tests := []struct {
+		name     string
+		content  string
+		expected *QueryResult
+	}{
+		{
+			name: "valid JSON response",
+			content: `{
+				"content": "这是一个14行8列的连连看游戏界面，包含25种不同的图案",
+				"thought": "通过分析图片，我识别出了游戏界面的结构和图案类型"
+			}`,
+			expected: &QueryResult{
+				Content: "这是一个14行8列的连连看游戏界面，包含25种不同的图案",
+				Thought: "通过分析图片，我识别出了游戏界面的结构和图案类型",
+			},
+		},
+		{
+			name:    "JSON in markdown",
+			content: "```json\n{\n  \"content\": \"游戏界面分析结果\",\n  \"thought\": \"分析过程\"\n}\n```",
+			expected: &QueryResult{
+				Content: "游戏界面分析结果",
+				Thought: "分析过程",
+			},
+		},
+		{
+			name:    "plain text response",
+			content: "这是一个连连看游戏界面，包含多种图案。",
+			expected: &QueryResult{
+				Content: "这是一个连连看游戏界面，包含多种图案。",
+				Thought: "Direct response from model",
+			},
+		},
+		{
+			name:    "invalid JSON",
+			content: `{"content": "incomplete json", "missing_closing_brace": true`,
+			expected: &QueryResult{
+				Content: `{"content": "incomplete json", "missing_closing_brace": true`,
+				Thought: "Direct response from model",
+			},
+		},
+		{
+			name:    "malformed JSON that can be extracted but not parsed",
+			content: `{"content": "test", "invalid": }`,
+			expected: &QueryResult{
+				Content: `{"content": "test", "invalid": }`,
+				Thought: "Failed to parse as JSON, returning raw content",
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result, err := parseQueryResult(tt.content)
+			// result is dereferenced below, so abort this subtest instead of
+			// panicking when parsing failed or returned nothing.
+			require.NoError(t, err)
+			require.NotNil(t, result)
+			assert.Equal(t, tt.expected.Content, result.Content)
+			assert.Equal(t, tt.expected.Thought, result.Thought)
+		})
+	}
+}
+
+// setupTestQuerier builds a Querier from the OPENAI_GPT_4O model config.
+// It stops the calling test immediately if the model config cannot be loaded
+// or the querier cannot be created.
+func setupTestQuerier(t *testing.T) *Querier {
+	// Attribute setup failures to the calling test's line, not this helper.
+	t.Helper()
+
+	ctx := context.Background()
+	modelConfig, err := GetModelConfig(option.OPENAI_GPT_4O)
+	require.NoError(t, err)
+	querier, err := NewQuerier(ctx, modelConfig)
+	require.NoError(t, err)
+	return querier
+}
+
+// TestQueryBasicUsage demonstrates basic query functionality without custom schema.
+// It expects non-empty Content and Thought and a nil Data field.
+func TestQueryBasicUsage(t *testing.T) {
+	querier := setupTestQuerier(t)
+
+	// Load screenshot
+	screenshot, size, err := builtin.LoadImage("testdata/llk_1.png")
+	require.NoError(t, err)
+
+	// Prepare query options
+	opts := &QueryOptions{
+		Query:      "这是一张连连看小游戏的界面，请将其转换为一个二维数组，数组中的每个元素包含图案名称及其坐标",
+		Screenshot: screenshot,
+		Size:       size,
+	}
+
+	// Perform query; result is dereferenced below, so stop on failure
+	// instead of panicking on a nil result.
+	result, err := querier.Query(context.Background(), opts)
+	require.NoError(t, err)
+	require.NotNil(t, result)
+	assert.NotEmpty(t, result.Content)
+	assert.NotEmpty(t, result.Thought)
+	assert.Nil(t, result.Data) // Should be nil for standard query
+
+	t.Logf("Query Result:")
+	t.Logf("Content: %s", result.Content)
+	t.Logf("Thought: %s", result.Thought)
+}
+
+// TestQueryWithCustomSchema tests the query functionality with custom output schema
+func TestQueryWithCustomSchema(t *testing.T) {
+	querier := setupTestQuerier(t)
+
+	// Load test image
+	screenshot, size, err := builtin.LoadImage("testdata/llk_1.png")
+	require.NoError(t, err)
+
+	// Define custom output schema for lianliankan game
+	outputSchema := LianliankanResponse{}
+
+	// Prepare query options with custom schema
+	opts := &QueryOptions{
+		Query: `这是一张连连看小游戏的界面，请分析游戏界面并返回结构化数据：
+1. 游戏网格的行数和列数
+2. 每个位置的图案名称和坐标
+3. 所有不同类型的图案列表
+请将结果组织成二维数组格式，每个元素包含图案名称及其坐标位置。`,
+		Screenshot:   screenshot,
+		Size:         size,
+		OutputSchema: outputSchema,
+	}
+
+	// Perform query
+	result, err := querier.Query(context.Background(), opts)
+	assert.NoError(t, err)
+	assert.NotNil(t, result)
+	assert.NotEmpty(t, result.Content)
+	assert.NotEmpty(t, result.Thought)
+	assert.NotNil(t, result.Data)
+
+	t.Logf("Query result content: %s", result.Content)
+	t.Logf("Query result thought: %s", result.Thought)
+	t.Logf("Structured data: %+v", result.Data)
+
+	// Try to parse the structured data
+	if dataMap, ok := result.Data.(map[string]interface{}); ok {
+		if gridData, exists := dataMap["data"]; exists {
+			t.Logf("Game grid data: %+v", gridData)
+		}
+		if rows, exists := dataMap["rows"]; exists {
+			t.Logf("Rows: %v", rows)
+		}
+		if cols, exists := dataMap["cols"]; exists {
+			t.Logf("Cols: %v", cols)
+		}
+		if icons, exists := dataMap["icons"]; exists {
+			t.Logf("Icon Types: %v", icons)
+		}
+	}
+}
+
+// TestQueryWithSimpleSchema tests with a simpler custom schema (SimpleGameInfo)
+// and expects Content, Thought and Data to all be populated.
+func TestQueryWithSimpleSchema(t *testing.T) {
+	querier := setupTestQuerier(t)
+
+	// Load test image
+	screenshot, size, err := builtin.LoadImage("testdata/llk_1.png")
+	require.NoError(t, err)
+
+	outputSchema := SimpleGameInfo{}
+
+	// Prepare query options
+	opts := &QueryOptions{
+		Query:        "请分析这个连连看游戏界面，告诉我有多少行多少列，有哪些不同类型的图案，总共有多少个图标",
+		Screenshot:   screenshot,
+		Size:         size,
+		OutputSchema: outputSchema,
+	}
+
+	// Perform query; result is dereferenced below, so stop on failure
+	// instead of panicking on a nil result.
+	result, err := querier.Query(context.Background(), opts)
+	require.NoError(t, err)
+	require.NotNil(t, result)
+	assert.NotEmpty(t, result.Content)
+	assert.NotEmpty(t, result.Thought)
+	assert.NotNil(t, result.Data)
+
+	t.Logf("Simple schema result: %+v", result)
+}
+
+// TestQueryWithGameAnalysisSchema tests with comprehensive game analysis schema
+// (GameAnalysisResult) and expects Content, Thought and Data to be populated.
+func TestQueryWithGameAnalysisSchema(t *testing.T) {
+	querier := setupTestQuerier(t)
+
+	// Load test image
+	screenshot, size, err := builtin.LoadImage("testdata/llk_1.png")
+	require.NoError(t, err)
+
+	outputSchema := GameAnalysisResult{}
+
+	// Prepare query options
+	opts := &QueryOptions{
+		Query: `Analyze this game interface and provide structured information about:
+1. The type of game
+2. Grid dimensions (rows and columns)
+3. All game elements with their positions and types
+4. Statistics about element distribution`,
+		Screenshot:   screenshot,
+		Size:         size,
+		OutputSchema: outputSchema,
+	}
+
+	// Perform query; result is dereferenced below, so stop on failure
+	// instead of panicking on a nil result.
+	result, err := querier.Query(context.Background(), opts)
+	require.NoError(t, err)
+	require.NotNil(t, result)
+	assert.NotEmpty(t, result.Content)
+	assert.NotEmpty(t, result.Thought)
+	assert.NotNil(t, result.Data)
+
+	t.Logf("Game analysis result: %+v", result)
+}
+
+// TestQueryWithUIElementsSchema tests UI elements analysis using the
+// UIElementsResult schema and expects Content, Thought and Data to be populated.
+func TestQueryWithUIElementsSchema(t *testing.T) {
+	querier := setupTestQuerier(t)
+
+	// Load test image
+	screenshot, size, err := builtin.LoadImage("testdata/llk_1.png")
+	require.NoError(t, err)
+
+	outputSchema := UIElementsResult{}
+
+	// Prepare query options
+	opts := &QueryOptions{
+		Query: `Analyze this interface and identify all UI elements including:
+1. Buttons and their text
+2. Text labels and content
+3. Images and icons
+4. Interactive elements
+5. Their positions and properties`,
+		Screenshot:   screenshot,
+		Size:         size,
+		OutputSchema: outputSchema,
+	}
+
+	// Perform query; result is dereferenced below, so stop on failure
+	// instead of panicking on a nil result.
+	result, err := querier.Query(context.Background(), opts)
+	require.NoError(t, err)
+	require.NotNil(t, result)
+	assert.NotEmpty(t, result.Content)
+	assert.NotEmpty(t, result.Thought)
+	assert.NotNil(t, result.Data)
+
+	t.Logf("UI elements analysis result: %+v", result)
+}
+
+// TestQuerySchemaComparison compares standard vs custom schema queries.
+// Both subtests send the same query and screenshot; the standard one expects a
+// nil Data field, the custom-schema one expects Data to be set.
+func TestQuerySchemaComparison(t *testing.T) {
+	querier := setupTestQuerier(t)
+
+	screenshot, size, err := builtin.LoadImage("testdata/llk_1.png")
+	require.NoError(t, err)
+
+	query := "请分析这个连连看游戏界面的基本信息"
+
+	// Standard query (without custom schema)
+	t.Run("StandardQuery", func(t *testing.T) {
+		standardOpts := &QueryOptions{
+			Query:      query,
+			Screenshot: screenshot,
+			Size:       size,
+			// No OutputSchema specified
+		}
+
+		// The result is dereferenced below, so stop on failure instead of
+		// panicking on a nil result.
+		standardResult, err := querier.Query(context.Background(), standardOpts)
+		require.NoError(t, err)
+		require.NotNil(t, standardResult)
+		assert.NotEmpty(t, standardResult.Content)
+		assert.NotEmpty(t, standardResult.Thought)
+		assert.Nil(t, standardResult.Data) // Should be nil for standard query
+
+		t.Logf("Standard Query Result:")
+		t.Logf("Content: %s", standardResult.Content)
+		t.Logf("Thought: %s", standardResult.Thought)
+		t.Logf("Data: %+v", standardResult.Data)
+	})
+
+	// Custom schema query
+	t.Run("CustomSchemaQuery", func(t *testing.T) {
+		type GameInfo struct {
+			Content string   `json:"content"`
+			Thought string   `json:"thought"`
+			Rows    int      `json:"rows"`
+			Cols    int      `json:"cols"`
+			Icons   []string `json:"icons"`
+		}
+
+		customOpts := &QueryOptions{
+			Query:        query,
+			Screenshot:   screenshot,
+			Size:         size,
+			OutputSchema: GameInfo{},
+		}
+
+		// The result is dereferenced below, so stop on failure instead of
+		// panicking on a nil result.
+		customResult, err := querier.Query(context.Background(), customOpts)
+		require.NoError(t, err)
+		require.NotNil(t, customResult)
+		assert.NotEmpty(t, customResult.Content)
+		assert.NotEmpty(t, customResult.Thought)
+		assert.NotNil(t, customResult.Data) // Should contain structured data
+
+		t.Logf("Custom Schema Query Result:")
+		t.Logf("Content: %s", customResult.Content)
+		t.Logf("Thought: %s", customResult.Thought)
+		t.Logf("Structured Data: %+v", customResult.Data)
+	})
+}
+
+// TestQueryWithDifferentPrompts tests various types of queries on the same screenshot.
+// Each query runs as its own subtest (Query_1, Query_2, ...) and must yield
+// non-empty Content and Thought.
+func TestQueryWithDifferentPrompts(t *testing.T) {
+	querier := setupTestQuerier(t)
+
+	// Load screenshot
+	screenshot, size, err := builtin.LoadImage("testdata/llk_1.png")
+	require.NoError(t, err)
+
+	// Example queries
+	queries := []string{
+		"请描述这张图片中的内容",
+		"这个游戏界面有多少行多少列？",
+		"请识别图片中所有不同类型的图案",
+		"请找出可以消除的图案对",
+	}
+
+	for i, query := range queries {
+		t.Run(fmt.Sprintf("Query_%d", i+1), func(t *testing.T) {
+			opts := &QueryOptions{
+				Query:      query,
+				Screenshot: screenshot,
+				Size:       size,
+			}
+
+			// The result is dereferenced below, so stop this subtest on
+			// failure instead of panicking on a nil result.
+			result, err := querier.Query(context.Background(), opts)
+			require.NoError(t, err)
+			require.NotNil(t, result)
+			assert.NotEmpty(t, result.Content)
+			assert.NotEmpty(t, result.Thought)
+
+			t.Logf("Query %d: %s", i+1, query)
+			t.Logf("Answer: %s", result.Content)
+			t.Logf("Reasoning: %s", result.Thought)
+		})
+	}
+}
+
+// TestConvertQueryResultData tests the type conversion functionality.
+// It stores a *TestSchema in QueryResult.Data and checks that
+// ConvertQueryResultData[TestSchema] returns the same field values.
+func TestConvertQueryResultData(t *testing.T) {
+	// Test data structure
+	type TestSchema struct {
+		Content string   `json:"content"`
+		Thought string   `json:"thought"`
+		Count   int      `json:"count"`
+		Items   []string `json:"items"`
+	}
+
+	// Create a QueryResult with structured data
+	testData := &TestSchema{
+		Content: "Test content",
+		Thought: "Test thought",
+		Count:   5,
+		Items:   []string{"item1", "item2", "item3"},
+	}
+
+	result := &QueryResult{
+		Content: "Test content",
+		Thought: "Test thought",
+		Data:    testData,
+	}
+
+	// Test type conversion; converted is dereferenced below, so stop on
+	// failure instead of panicking on a nil pointer.
+	converted, err := ConvertQueryResultData[TestSchema](result)
+	require.NoError(t, err)
+	require.NotNil(t, converted)
+	assert.Equal(t, "Test content", converted.Content)
+	assert.Equal(t, "Test thought", converted.Thought)
+	assert.Equal(t, 5, converted.Count)
+	assert.Equal(t, []string{"item1", "item2", "item3"}, converted.Items)
+
+	t.Logf("Successfully converted data: %+v", converted)
+}
+
+// TestQueryResultDataConsistency tests that QueryResult.Data matches OutputSchema:
+// after a query with a TestGameInfo schema, Data must be a *TestGameInfo with
+// every field populated.
+func TestQueryResultDataConsistency(t *testing.T) {
+	querier := setupTestQuerier(t)
+
+	// Load test image
+	screenshot, size, err := builtin.LoadImage("testdata/llk_1.png")
+	require.NoError(t, err)
+
+	// Define a simple test schema
+	type TestGameInfo struct {
+		Content string   `json:"content"`
+		Thought string   `json:"thought"`
+		Rows    int      `json:"rows"`
+		Cols    int      `json:"cols"`
+		Icons   []string `json:"icons"`
+	}
+
+	outputSchema := TestGameInfo{}
+
+	// Prepare query options
+	opts := &QueryOptions{
+		Query:        "请分析这个连连看游戏界面，告诉我有多少行多少列，有哪些不同类型的图案",
+		Screenshot:   screenshot,
+		Size:         size,
+		OutputSchema: outputSchema,
+	}
+
+	// Perform query; every step below dereferences the previous value, so
+	// stop at the first failure instead of panicking on a nil pointer.
+	result, err := querier.Query(context.Background(), opts)
+	require.NoError(t, err)
+	require.NotNil(t, result)
+	require.NotNil(t, result.Data)
+	gameInfo, ok := result.Data.(*TestGameInfo)
+	require.Truef(t, ok, "Data should be *TestGameInfo, got %T", result.Data)
+	require.NotNil(t, gameInfo)
+
+	// Verify that the converted data has the expected structure
+	assert.NotEmpty(t, gameInfo.Content)
+	assert.NotEmpty(t, gameInfo.Thought)
+	assert.NotZero(t, gameInfo.Rows)
+	assert.NotZero(t, gameInfo.Cols)
+	assert.NotEmpty(t, gameInfo.Icons)
+}
+
+// TestAutoTypeConversion tests that QueryResult.Data is automatically converted to the correct type.
+// It feeds a fixed JSON document to parseCustomSchemaResult and checks both
+// the typed Data value and the top-level Content/Thought fields.
+func TestAutoTypeConversion(t *testing.T) {
+	// Test data structure
+	type TestSchema struct {
+		Content string   `json:"content"`
+		Thought string   `json:"thought"`
+		Count   int      `json:"count"`
+		Items   []string `json:"items"`
+	}
+
+	// Simulate a JSON response from the model
+	jsonResponse := `{
+		"content": "Test content from model",
+		"thought": "Test reasoning process",
+		"count": 42,
+		"items": ["apple", "banana", "cherry"]
+	}`
+
+	// Test the parseCustomSchemaResult function directly; result is
+	// dereferenced below, so stop on failure instead of panicking.
+	result, err := parseCustomSchemaResult(jsonResponse, TestSchema{})
+	require.NoError(t, err)
+	require.NotNil(t, result)
+	require.NotNil(t, result.Data)
+
+	// Verify that Data is automatically converted to the correct type.
+	// A failed assertion leaves typedData nil, so abort before using it.
+	typedData, ok := result.Data.(*TestSchema)
+	require.True(t, ok, "Data should be automatically converted to *TestSchema")
+	require.NotNil(t, typedData)
+
+	// Verify the content
+	assert.Equal(t, "Test content from model", typedData.Content)
+	assert.Equal(t, "Test reasoning process", typedData.Thought)
+	assert.Equal(t, 42, typedData.Count)
+	assert.Equal(t, []string{"apple", "banana", "cherry"}, typedData.Items)
+
+	// Verify that QueryResult fields are also populated
+	assert.Equal(t, "Test content from model", result.Content)
+	assert.Equal(t, "Test reasoning process", result.Thought)
+
+	t.Logf("Auto-converted data: %+v", typedData)
+}
+
+// TestDirectTypeAssertion tests that users can directly use type assertion on QueryResult.Data.
+// It parses a fixed JSON document with a GameInfo schema and asserts Data to
+// *GameInfo without any helper.
+func TestDirectTypeAssertion(t *testing.T) {
+	// Test data structure
+	type GameInfo struct {
+		Content string   `json:"content"`
+		Thought string   `json:"thought"`
+		Rows    int      `json:"rows"`
+		Cols    int      `json:"cols"`
+		Icons   []string `json:"icons"`
+	}
+
+	// Simulate a JSON response
+	jsonResponse := `{
+		"content": "Game analysis complete",
+		"thought": "Analyzed the game grid structure",
+		"rows": 8,
+		"cols": 10,
+		"icons": ["apple", "banana", "cherry", "grape"]
+	}`
+
+	// Test the parseCustomSchemaResult function; result is dereferenced
+	// below, so stop on failure instead of panicking.
+	result, err := parseCustomSchemaResult(jsonResponse, GameInfo{})
+	require.NoError(t, err)
+	require.NotNil(t, result)
+	require.NotNil(t, result.Data)
+
+	// Users can now directly use type assertion
+	gameInfo, ok := result.Data.(*GameInfo)
+	require.Truef(t, ok, "Type assertion failed, Data type: %T", result.Data)
+	require.NotNil(t, gameInfo)
+
+	assert.Equal(t, "Game analysis complete", gameInfo.Content)
+	assert.Equal(t, "Analyzed the game grid structure", gameInfo.Thought)
+	assert.Equal(t, 8, gameInfo.Rows)
+	assert.Equal(t, 10, gameInfo.Cols)
+	assert.Equal(t, []string{"apple", "banana", "cherry", "grape"}, gameInfo.Icons)
+	t.Logf("Direct type assertion successful: %+v", gameInfo)
+}
--- a/uixt/ai/utils.go
+++ b/uixt/ai/utils.go
@@ -1,9 +1,17 @@
 package ai

 import (
+	"context"
 	"regexp"
 	"strings"
+	"time"
 	"unicode/utf8"
+
+	"github.com/cloudwego/eino/components/model"
+	"github.com/cloudwego/eino/schema"
+	"github.com/rs/zerolog/log"
+
+	"github.com/httprunner/httprunner/v5/uixt/option"
 )

 // extractJSONFromContent extracts JSON content from various formats in the response
@@ -102,3 +110,29 @@ func extractJSONFromContent(content string) string {

 	return ""
 }
+
+// callModelWithLogging is a common function to call model with logging and timing.
+// It performs, in this order:
+//  1. Log the request (logRequest)
+//  2. Start timing
+//  3. Call Generate on the chat model with the conversation history
+//  4. Log the elapsed time, model type and operation name at debug level
+//  5. Log the response (logResponse), on success only
+//
+// ctx is passed through to Generate. modelType and operation are used only for
+// the debug log line. On failure the error from Generate is returned unchanged
+// together with a nil message.
+func callModelWithLogging(ctx context.Context, chatModel model.ToolCallingChatModel, history ConversationHistory, modelType option.LLMServiceType, operation string) (*schema.Message, error) {
+	logRequest(history)
+
+	startTime := time.Now()
+	message, err := chatModel.Generate(ctx, history)
+	// Log right after Generate returns (success or failure) so the elapsed
+	// time covers the model call only, not the response logging below.
+	log.Debug().Float64("elapsed(s)", time.Since(startTime).Seconds()).
+		Str("model", string(modelType)).
+		Msgf("call model service for %s", operation)
+	if err != nil {
+		return nil, err
+	}
+
+	logResponse(message)
+	return message, nil
+}