Files
httprunner/uixt/ai/querier.go
2025-07-03 22:13:23 +08:00

541 lines
17 KiB
Go

package ai
import (
"context"
"fmt"
"reflect"
"github.com/cloudwego/eino-ext/components/model/ark"
"github.com/cloudwego/eino/components/model"
"github.com/cloudwego/eino/schema"
"github.com/getkin/kin-openapi/openapi3gen"
"github.com/pkg/errors"
arkModel "github.com/volcengine/volcengine-go-sdk/service/arkruntime/model"
"github.com/httprunner/httprunner/v5/code"
"github.com/httprunner/httprunner/v5/internal/json"
"github.com/httprunner/httprunner/v5/uixt/option"
"github.com/httprunner/httprunner/v5/uixt/types"
)
// IQuerier is the contract implemented by components that can answer a
// textual query about a screenshot.
type IQuerier interface {
	// Query runs the query described by opts and returns the extracted result.
	Query(ctx context.Context, opts *QueryOptions) (*QueryResult, error)
}
// QueryOptions carries the inputs of a single query request.
type QueryOptions struct {
	// Query is the text describing which information to extract.
	Query string `json:"query"`
	// Screenshot is the base64 encoded screenshot.
	Screenshot string `json:"screenshot"`
	// Size holds the screen dimensions.
	Size types.Size `json:"size"`
	// OutputSchema optionally supplies a custom output schema; when set, the
	// response is parsed as structured data of that shape.
	OutputSchema interface{} `json:"outputSchema,omitempty"`
}
// QueryResult is the outcome of an AI query.
type QueryResult struct {
	// Content is the extracted content/information.
	Content string `json:"content"`
	// Thought is the reasoning process reported alongside the content.
	Thought string `json:"thought"`
	// Data holds the structured data when an OutputSchema was provided.
	Data interface{} `json:"data,omitempty"`
	// ModelName is the name of the model used for the query.
	ModelName string `json:"model_name"`
	// Usage contains token usage statistics, when available.
	Usage *schema.TokenUsage `json:"usage,omitempty"`
}
// Querier answers screenshot queries by talking to a chat model.
type Querier struct {
	// modelConfig is the model configuration used to build chat models.
	modelConfig *ModelConfig
	// model is the chat model used for queries without a custom output schema.
	model model.ToolCallingChatModel
	// systemPrompt is sent as the system message of every conversation.
	systemPrompt string
	// history is the conversation of the most recent query; Query resets it
	// at the start of each call.
	history ConversationHistory
}
// NewQuerier builds a Querier backed by an ark chat model.
//
// For UI-TARS model types the UI-TARS response format instructions are
// appended to the system prompt. For every other model type a JSON-schema
// response format (content / thought / error) is configured on the chat model.
//
// The caller's modelConfig is never modified: the response format is set on a
// private copy of the chat model configuration, and the Querier keeps that
// copy. An error wrapping code.LLMPrepareRequestError is returned when
// modelConfig (or, for non UI-TARS models, its chat model configuration) is
// missing, when the schema cannot be generated, or when the chat model cannot
// be created.
func NewQuerier(ctx context.Context, modelConfig *ModelConfig) (*Querier, error) {
	if modelConfig == nil {
		return nil, errors.Wrap(code.LLMPrepareRequestError, "model config is required")
	}

	// Work on a copy so that configuring the response format below does not
	// leak into a configuration the caller may share with other components.
	config := *modelConfig
	querier := &Querier{
		modelConfig:  &config,
		systemPrompt: defaultQueryPrompt,
	}

	if option.IS_UI_TARS(config.ModelType) {
		querier.systemPrompt += "\n" + uiTarsQueryResponseFormat
	} else {
		if config.ChatModelConfig == nil {
			return nil, errors.Wrap(code.LLMPrepareRequestError, "chat model config is required")
		}

		// define default output format
		type OutputFormat struct {
			Content string `json:"content"`
			Thought string `json:"thought"`
			Error   string `json:"error,omitempty"`
		}
		outputFormatSchema, err := openapi3gen.NewSchemaRefForValue(&OutputFormat{}, nil)
		if err != nil {
			return nil, errors.Wrap(code.LLMPrepareRequestError, err.Error())
		}

		// set structured response format on a copy of the chat model config
		chatConfig := *config.ChatModelConfig
		chatConfig.ResponseFormat = &ark.ResponseFormat{
			Type: arkModel.ResponseFormatJSONSchema,
			JSONSchema: &arkModel.ResponseFormatJSONSchemaJSONSchemaParam{
				Name:        "query_result",
				Description: "data that describes query result",
				Schema:      outputFormatSchema.Value,
				Strict:      false,
			},
		}
		config.ChatModelConfig = &chatConfig
	}

	chatModel, err := ark.NewChatModel(ctx, config.ChatModelConfig)
	if err != nil {
		return nil, errors.Wrap(code.LLMPrepareRequestError, err.Error())
	}
	querier.model = chatModel
	return querier, nil
}
// Query extracts the information requested by opts.Query from the screenshot.
//
// When opts.OutputSchema is set the request is delegated to
// queryWithCustomSchema. Otherwise the conversation history is reset to the
// system prompt, the screenshot and query are sent as a single user message
// through callModelWithLogging, and the reply is parsed into a QueryResult.
// Usage is filled in from the response metadata when the model reports it.
//
// Errors are wrapped with code.LLMRequestServiceError when the model call
// fails and code.LLMParseQueryResponseError when the reply cannot be parsed.
func (q *Querier) Query(ctx context.Context, opts *QueryOptions) (*QueryResult, error) {
	// Validate input parameters
	if err := validateQueryInput(opts); err != nil {
		return nil, errors.Wrap(err, "validate query parameters failed")
	}

	// Handle custom output schema if provided
	if opts.OutputSchema != nil {
		return q.queryWithCustomSchema(ctx, opts)
	}

	// Reset history for each new query
	q.history = ConversationHistory{
		{
			Role:    schema.System,
			Content: q.systemPrompt,
		},
	}

	// Create user message with screenshot and query
	userMsg := &schema.Message{
		Role: schema.User,
		MultiContent: []schema.ChatMessagePart{
			{
				Type: schema.ChatMessagePartTypeImageURL,
				ImageURL: &schema.ChatMessageImageURL{
					URL:    opts.Screenshot,
					Detail: schema.ImageURLDetailAuto,
				},
			},
			{
				Type: schema.ChatMessagePartTypeText,
				Text: fmt.Sprintf(`
Here is the query. Please extract the requested information from the screenshot.
=====================================
%s
=====================================
`, opts.Query),
			},
		},
	}
	q.history.Append(userMsg)

	// Call model service with logging
	message, err := callModelWithLogging(ctx, q.model, q.history,
		q.modelConfig.ModelType, "query")
	if err != nil {
		return nil, errors.Wrap(code.LLMRequestServiceError, err.Error())
	}

	// Parse result
	result, err := parseQueryResult(message.Content, q.modelConfig.ModelType)
	if err != nil {
		return nil, errors.Wrap(code.LLMParseQueryResponseError, err.Error())
	}

	// Attach usage only once a result exists; doing this in a deferred
	// closure would dereference a nil result on the error return above.
	if message.ResponseMeta != nil && message.ResponseMeta.Usage != nil {
		result.Usage = message.ResponseMeta.Usage
	}

	// Append assistant message to history
	q.history.Append(&schema.Message{
		Role:    schema.Assistant,
		Content: message.Content,
	})
	return result, nil
}
// validateQueryInput checks that opts is usable for a query.
//
// It returns an error wrapping code.LLMPrepareRequestError when opts is nil,
// when the query text is empty, or when the screenshot is empty; otherwise it
// returns nil.
func validateQueryInput(opts *QueryOptions) error {
	switch {
	case opts == nil:
		// Guard first: the field checks below would dereference a nil pointer.
		return errors.Wrap(code.LLMPrepareRequestError, "query options are required")
	case opts.Query == "":
		return errors.Wrap(code.LLMPrepareRequestError, "query text is required")
	case opts.Screenshot == "":
		return errors.Wrap(code.LLMPrepareRequestError, "screenshot is required")
	}
	return nil
}
// parseQueryResult turns the raw model reply into a QueryResult tagged with
// the model type as ModelName.
//
// The reply is first handed to parseStructuredResponse. If that fails, the
// raw content is returned as-is with a fixed Thought explaining the fallback.
// The returned error is always nil.
func parseQueryResult(content string, modelType option.LLMServiceType) (*QueryResult, error) {
	modelName := string(modelType)

	parsed := new(QueryResult)
	if parseErr := parseStructuredResponse(content, parsed); parseErr != nil {
		// Structured parsing failed: fall back to the plain text reply.
		fallback := &QueryResult{
			Content:   content,
			Thought:   "Failed to parse response, returning raw content",
			ModelName: modelName,
		}
		return fallback, nil
	}

	parsed.ModelName = modelName
	return parsed, nil
}
// queryWithCustomSchema runs a query whose reply must follow opts.OutputSchema.
//
// A dedicated chat model is created for the call. For non UI-TARS model types
// a JSON-schema response format generated from opts.OutputSchema is set on a
// private copy of the chat model configuration, so the configuration shared
// with the Querier (and its default model) is left untouched, and an
// instruction asking for structured output is added to the system prompt.
// For UI-TARS model types q.systemPrompt is used unchanged, because NewQuerier
// has already appended uiTarsQueryResponseFormat to it.
//
// The conversation (system prompt, user message, assistant reply) replaces
// q.history, mirroring what Query does. The returned result carries ModelName
// and, when the model reports it, Usage.
//
// Errors are wrapped with code.LLMPrepareRequestError (schema generation or
// model creation), code.LLMRequestServiceError (model call) and
// code.LLMParseQueryResponseError (reply parsing).
func (q *Querier) queryWithCustomSchema(ctx context.Context, opts *QueryOptions) (*QueryResult, error) {
	modelType := q.modelConfig.ModelType
	chatConfig := q.modelConfig.ChatModelConfig
	systemPrompt := q.systemPrompt

	if !option.IS_UI_TARS(modelType) {
		if chatConfig == nil {
			return nil, errors.Wrap(code.LLMPrepareRequestError, "chat model config is required")
		}

		// Generate schema from the provided output schema
		outputFormatSchema, err := openapi3gen.NewSchemaRefForValue(opts.OutputSchema, nil)
		if err != nil {
			return nil, errors.Wrap(code.LLMPrepareRequestError, err.Error())
		}

		// Copy the chat model config before changing it: q.modelConfig only
		// holds a pointer to it, so writing through that pointer would alter
		// the configuration shared by every other query.
		customConfig := *chatConfig
		customConfig.ResponseFormat = &ark.ResponseFormat{
			Type: arkModel.ResponseFormatJSONSchema,
			JSONSchema: &arkModel.ResponseFormatJSONSchemaJSONSchemaParam{
				Name:        "custom_query_result",
				Description: "custom structured data response",
				Schema:      outputFormatSchema.Value,
				Strict:      false,
			},
		}
		chatConfig = &customConfig

		// Add instruction for structured output
		systemPrompt += "\n\nPlease respond with structured data according to the specified schema. Include both the structured data and your reasoning process."
	}

	// Create a new model instance with custom schema
	chatModel, err := ark.NewChatModel(ctx, chatConfig)
	if err != nil {
		return nil, errors.Wrap(code.LLMPrepareRequestError, err.Error())
	}

	// Start a fresh conversation for this query
	history := ConversationHistory{
		{
			Role:    schema.System,
			Content: systemPrompt,
		},
	}

	// Create user message with screenshot and query
	userMsg := &schema.Message{
		Role: schema.User,
		MultiContent: []schema.ChatMessagePart{
			{
				Type: schema.ChatMessagePartTypeImageURL,
				ImageURL: &schema.ChatMessageImageURL{
					URL:    opts.Screenshot,
					Detail: schema.ImageURLDetailAuto,
				},
			},
			{
				Type: schema.ChatMessagePartTypeText,
				Text: fmt.Sprintf(`
Here is the query. Please extract the requested information from the screenshot and return it in the specified structured format.
=====================================
%s
=====================================
`, opts.Query),
			},
		},
	}
	history.Append(userMsg)

	// Call model service with logging
	message, err := callModelWithLogging(ctx, chatModel, history, modelType, "custom schema query")
	if err != nil {
		return nil, errors.Wrap(code.LLMRequestServiceError, err.Error())
	}

	// Parse result with custom schema
	result, err := parseCustomSchemaResult(message.Content, opts.OutputSchema)
	if err != nil {
		return nil, errors.Wrap(code.LLMParseQueryResponseError, err.Error())
	}

	// Report the same metadata as the default query path.
	result.ModelName = string(modelType)
	if message.ResponseMeta != nil && message.ResponseMeta.Usage != nil {
		result.Usage = message.ResponseMeta.Usage
	}

	// Record the assistant reply in the conversation it belongs to, then
	// publish that conversation as the querier's current history.
	history.Append(&schema.Message{
		Role:    schema.Assistant,
		Content: message.Content,
	})
	q.history = history
	return result, nil
}
// setDefaultFieldValue writes defaultValue into the field called fieldName of
// structValue. Nothing happens when the field does not exist, cannot be set
// (for example because it is unexported), or is not of string kind.
// structValue is expected to be a struct value.
func setDefaultFieldValue(structValue reflect.Value, fieldName, defaultValue string) {
	target := structValue.FieldByName(fieldName)
	if !target.IsValid() || !target.CanSet() {
		return
	}
	if target.Kind() != reflect.String {
		return
	}
	target.SetString(defaultValue)
}
// ensureDefaultValues fills in placeholder texts for empty Content and Thought
// on result. When structuredData is a pointer to a struct, a placeholder that
// ended up in result is also written to the struct's string field of the same
// name (Content / Thought) via setDefaultFieldValue.
func ensureDefaultValues(result *QueryResult, structuredData interface{}) {
	const (
		defaultContent = "Structured data extracted successfully"
		defaultThought = "Parsed structured response according to custom schema"
	)

	// Placeholders for the QueryResult itself.
	if result.Content == "" {
		result.Content = defaultContent
	}
	if result.Thought == "" {
		result.Thought = defaultThought
	}

	// Only a pointer to a struct can receive the placeholders as well.
	if structuredData == nil {
		return
	}
	pointer := reflect.ValueOf(structuredData)
	if pointer.Kind() != reflect.Ptr {
		return
	}
	target := pointer.Elem()
	if !target.IsValid() || target.Kind() != reflect.Struct {
		return
	}

	if result.Content == defaultContent {
		setDefaultFieldValue(target, "Content", defaultContent)
	}
	if result.Thought == defaultThought {
		setDefaultFieldValue(target, "Thought", defaultThought)
	}
}
// parseCustomSchemaResult converts a model reply into a QueryResult shaped by
// outputSchema.
//
// The JSON part of the reply is located with extractJSONFromContent and passed
// through unwrapPropertiesIfNeeded. It is then parsed by tryDirectUnmarshal
// and, if that fails, by tryGenericParsingAndConversion. When the reply has no
// JSON or neither strategy succeeds, the raw content is returned with an
// explanatory Thought. The returned error is always nil.
func parseCustomSchemaResult(content string, outputSchema interface{}) (*QueryResult, error) {
	extracted := extractJSONFromContent(content)
	if extracted == "" {
		// No JSON at all: the whole reply is the answer.
		return &QueryResult{
			Content: content,
			Thought: "Direct response from model",
		}, nil
	}

	payload := unwrapPropertiesIfNeeded(extracted)

	// Strategies in order of preference: typed decoding first, generic
	// decoding plus conversion second.
	strategies := []func(string, interface{}) (*QueryResult, error){
		tryDirectUnmarshal,
		tryGenericParsingAndConversion,
	}
	for _, parse := range strategies {
		parsed, parseErr := parse(payload, outputSchema)
		if parseErr == nil {
			return parsed, nil
		}
	}

	// Nothing worked: hand back the reply as plain text.
	return &QueryResult{
		Content: content,
		Thought: "Failed to parse as structured data, returning raw content",
	}, nil
}
// unwrapPropertiesIfNeeded strips a "properties" wrapper from a structured
// reply (the wrapper some structured-output responses put around the data).
//
// When jsonContent is a JSON object whose "properties" member is itself a JSON
// object, the re-encoded "properties" object is returned. In every other case
// (invalid JSON, no "properties" member, or a "properties" member that is
// null, a scalar or an array) jsonContent is returned unchanged, so a payload
// that merely has a field named "properties" is not replaced by that field's
// scalar value.
func unwrapPropertiesIfNeeded(jsonContent string) string {
	var wrapper map[string]interface{}
	if err := json.Unmarshal([]byte(jsonContent), &wrapper); err != nil {
		return jsonContent
	}

	// Only an object can be a wrapped payload; a missing key or any other
	// JSON kind fails this assertion.
	properties, ok := wrapper["properties"].(map[string]interface{})
	if !ok {
		return jsonContent
	}

	unwrapped, err := json.Marshal(properties)
	if err != nil {
		return jsonContent
	}
	return string(unwrapped)
}
// tryDirectUnmarshal decodes jsonContent straight into a fresh instance of the
// schema type and wraps it in a QueryResult.
//
// Content and Thought are taken from the decoded struct; if both are still
// empty they are looked up in the raw JSON instead. Placeholders are applied
// last via ensureDefaultValues. A decoding failure is returned as the error.
func tryDirectUnmarshal(jsonContent string, outputSchema interface{}) (*QueryResult, error) {
	target := createSchemaInstance(outputSchema)
	if decodeErr := json.Unmarshal([]byte(jsonContent), target); decodeErr != nil {
		return nil, decodeErr
	}

	parsed := &QueryResult{Data: target}

	// Prefer the typed fields; fall back to the JSON keys when neither is set.
	extractContentAndThoughtFromStruct(parsed, target)
	noText := parsed.Content == "" && parsed.Thought == ""
	if noText {
		extractContentAndThoughtFromJSON(parsed, jsonContent)
	}

	ensureDefaultValues(parsed, target)
	return parsed, nil
}
// tryGenericParsingAndConversion decodes jsonContent into generic data and
// then tries to convert it to the schema type.
//
// On successful conversion the converted value becomes Data. If conversion
// fails but the generic data is a JSON object, the generic data itself becomes
// Data. Content and Thought are read from the generic data in both cases and
// placeholders are applied via ensureDefaultValues. An error is returned when
// the JSON cannot be decoded or when neither path applies.
func tryGenericParsingAndConversion(jsonContent string, outputSchema interface{}) (*QueryResult, error) {
	var generic interface{}
	if decodeErr := json.Unmarshal([]byte(jsonContent), &generic); decodeErr != nil {
		return nil, decodeErr
	}

	// Preferred outcome: data converted to the expected schema type.
	typed, convertErr := convertToSchemaType(generic, outputSchema)
	if convertErr == nil {
		parsed := &QueryResult{Data: typed}
		extractContentAndThoughtFromMap(parsed, generic)
		ensureDefaultValues(parsed, typed)
		return parsed, nil
	}

	// Conversion failed: keep the generic data, but only for JSON objects.
	object, isObject := generic.(map[string]interface{})
	if !isObject {
		return nil, errors.New("failed to parse structured data")
	}
	parsed := &QueryResult{Data: generic}
	extractContentAndThoughtFromMap(parsed, object)
	ensureDefaultValues(parsed, nil)
	return parsed, nil
}
// createSchemaInstance returns a pointer to a new zero value of the type
// described by outputSchema, suitable as a decoding target.
//
// outputSchema may be a value or a pointer (of any depth) to the schema type;
// all pointer levels are stripped, so the result is always a single-level
// pointer to the underlying type. When outputSchema is an untyped nil there is
// no type information, and a *interface{} is returned so that decoding into it
// yields generic data instead of panicking.
func createSchemaInstance(outputSchema interface{}) interface{} {
	schemaType := reflect.TypeOf(outputSchema)
	if schemaType == nil {
		// reflect.TypeOf(nil) is nil; calling Kind on it would panic.
		return new(interface{})
	}
	for schemaType.Kind() == reflect.Ptr {
		schemaType = schemaType.Elem()
	}
	return reflect.New(schemaType).Interface()
}
// extractContentAndThoughtFromStruct copies the string fields named Content
// and Thought from structData into result, using reflection.
//
// structData is normally a pointer to a struct. Pointers and interfaces are
// followed until a concrete value is reached. If that value is not a struct
// (for example the schema was a map or a slice), or a nil pointer is met on
// the way, result is left untouched instead of panicking. Fields that are
// missing or not of string kind are skipped.
func extractContentAndThoughtFromStruct(result *QueryResult, structData interface{}) {
	value := reflect.ValueOf(structData)
	for value.Kind() == reflect.Ptr || value.Kind() == reflect.Interface {
		if value.IsNil() {
			return
		}
		value = value.Elem()
	}
	// FieldByName panics on anything that is not a struct.
	if value.Kind() != reflect.Struct {
		return
	}

	if contentField := value.FieldByName("Content"); contentField.IsValid() && contentField.Kind() == reflect.String {
		result.Content = contentField.String()
	}
	if thoughtField := value.FieldByName("Thought"); thoughtField.IsValid() && thoughtField.Kind() == reflect.String {
		result.Thought = thoughtField.String()
	}
}
// extractContentAndThoughtFromJSON decodes jsonContent as a JSON object and
// copies its "content" and "thought" entries into result through
// extractContentAndThoughtFromMap. If decoding fails, result is left as it is.
func extractContentAndThoughtFromJSON(result *QueryResult, jsonContent string) {
	var fields map[string]interface{}
	if decodeErr := json.Unmarshal([]byte(jsonContent), &fields); decodeErr != nil {
		return
	}
	extractContentAndThoughtFromMap(result, fields)
}
// extractContentAndThoughtFromMap copies the "content" and "thought" entries
// of dataMap into result. dataMap must be a map[string]interface{}; any other
// type is ignored. An entry is only copied when it exists and holds a string.
func extractContentAndThoughtFromMap(result *QueryResult, dataMap interface{}) {
	fields, isMap := dataMap.(map[string]interface{})
	if !isMap {
		return
	}

	// A missing key yields a nil interface, which fails the string assertion
	// just like a non-string value does.
	if text, isString := fields["content"].(string); isString {
		result.Content = text
	}
	if text, isString := fields["thought"].(string); isString {
		result.Thought = text
	}
}
// convertToSchemaType converts generic data to the specified schema type
func convertToSchemaType(data interface{}, outputSchema interface{}) (interface{}, error) {
// Get the type of the output schema
schemaType := reflect.TypeOf(outputSchema)
if schemaType.Kind() == reflect.Ptr {
schemaType = schemaType.Elem()
}
// Create a new instance of the schema type
newInstance := reflect.New(schemaType).Interface()
// Convert via JSON marshaling/unmarshaling
jsonData, err := json.Marshal(data)
if err != nil {
return nil, errors.Wrap(err, "failed to marshal data to JSON")
}
if err := json.Unmarshal(jsonData, newInstance); err != nil {
return nil, errors.Wrap(err, "failed to unmarshal data to target type")
}
return newInstance, nil
}
// ConvertQueryResultData converts QueryResult.Data to the specified type T.
// This is a helper function for type-safe conversion of the structured data.
//
// Note: When using QueryOptions.OutputSchema, the Data field is automatically
// converted to the correct type, so this function is typically not needed.
// This function is mainly useful for:
//  1. Converting data when OutputSchema was not used
//  2. Converting to a different type than the original OutputSchema
//  3. Handling legacy code or edge cases
//
// When Data already is a *T it is returned as-is (no copy). Otherwise Data is
// encoded to JSON and decoded into a new T. An error is returned when result
// is nil, when it carries no data, or when the JSON round trip fails.
func ConvertQueryResultData[T any](result *QueryResult) (*T, error) {
	if result == nil || result.Data == nil {
		return nil, errors.New("no structured data available")
	}

	// If Data is already of the correct type, return it directly
	if typedData, ok := result.Data.(*T); ok {
		return typedData, nil
	}

	// Fallback: try to convert via JSON marshaling/unmarshaling
	jsonData, err := json.Marshal(result.Data)
	if err != nil {
		return nil, errors.Wrap(err, "failed to marshal data to JSON")
	}
	var converted T
	if err := json.Unmarshal(jsonData, &converted); err != nil {
		return nil, errors.Wrap(err, "failed to unmarshal data to target type")
	}
	return &converted, nil
}