mirror of
https://github.com/snailyp/gemini-balance.git
synced 2026-06-05 23:59:45 +08:00
feat(image): 支持多模态模型输入base64格式图片
- 在消息转换中,增加对 `data:image/png;base64,...` 格式图片的支持,允许用户直接在输入中提供 base64 编码的图片。
- 调整图片处理逻辑,使其能够根据模型名称判断是否启用多模态能力,避免非多模态模型错误处理图片链接。
- 当未配置图床时,模型输出的图片将回退为 base64 格式,确保图片内容始终可用。
- 优化了相关函数的参数传递和代码格式,提高了代码的可读性和健壮性。
This commit is contained in:
@@ -27,7 +27,7 @@ class MessageConverter(ABC):
|
||||
|
||||
@abstractmethod
|
||||
def convert(
|
||||
self, messages: List[Dict[str, Any]]
|
||||
self, messages: List[Dict[str, Any]], model: str
|
||||
) -> tuple[List[Dict[str, Any]], Optional[Dict[str, Any]]]:
|
||||
pass
|
||||
|
||||
@@ -84,7 +84,7 @@ def _convert_image_to_base64(url: str) -> str:
|
||||
raise Exception(f"Failed to fetch image: {response.status_code}")
|
||||
|
||||
|
||||
def _process_text_with_image(text: str) -> List[Dict[str, Any]]:
|
||||
def _process_text_with_image(text: str, model: str) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
处理可能包含图片URL的文本,提取图片并转换为base64
|
||||
|
||||
@@ -94,17 +94,31 @@ def _process_text_with_image(text: str) -> List[Dict[str, Any]]:
|
||||
Returns:
|
||||
List[Dict[str, Any]]: 包含文本和图片的部分列表
|
||||
"""
|
||||
# 如果模型名中没有包含image,当作普通文本处理
|
||||
if "image" not in model:
|
||||
return [{"text": text}]
|
||||
parts = []
|
||||
img_url_match = re.search(IMAGE_URL_PATTERN, text)
|
||||
if img_url_match:
|
||||
# 提取URL
|
||||
img_url = img_url_match.group(2)
|
||||
# 将URL对应的图片转换为base64
|
||||
# 先判断是否是 base64 data URL:如果是,直接使用;如果不是,再将 URL 对应的图片转换为 base64
|
||||
try:
|
||||
base64_data = _convert_image_to_base64(img_url)
|
||||
parts.append(
|
||||
{"inline_data": {"mimeType": "image/png", "data": base64_data}}
|
||||
)
|
||||
base64_url_match = re.search(DATA_URL_PATTERN, img_url)
|
||||
if base64_url_match:
|
||||
parts.append(
|
||||
{
|
||||
"inline_data": {
|
||||
"mimeType": base64_url_match.group(1),
|
||||
"data": base64_url_match.group(2),
|
||||
}
|
||||
}
|
||||
)
|
||||
else:
|
||||
base64_data = _convert_image_to_base64(img_url)
|
||||
parts.append(
|
||||
{"inline_data": {"mimeType": "image/png", "data": base64_data}}
|
||||
)
|
||||
except Exception:
|
||||
# 如果转换失败,回退到文本模式
|
||||
parts.append({"text": text})
|
||||
@@ -145,7 +159,7 @@ class OpenAIMessageConverter(MessageConverter):
|
||||
raise
|
||||
|
||||
def convert(
|
||||
self, messages: List[Dict[str, Any]]
|
||||
self, messages: List[Dict[str, Any]], model: str
|
||||
) -> tuple[List[Dict[str, Any]], Optional[Dict[str, Any]]]:
|
||||
converted_messages = []
|
||||
system_instruction_parts = []
|
||||
@@ -296,7 +310,7 @@ class OpenAIMessageConverter(MessageConverter):
|
||||
elif (
|
||||
"content" in msg and isinstance(msg["content"], str) and msg["content"]
|
||||
):
|
||||
parts.extend(_process_text_with_image(msg["content"]))
|
||||
parts.extend(_process_text_with_image(msg["content"], model))
|
||||
elif "tool_calls" in msg and isinstance(msg["tool_calls"], list):
|
||||
# Keep existing tool call processing
|
||||
for tool_call in msg["tool_calls"]:
|
||||
|
||||
@@ -8,9 +8,9 @@ from abc import ABC, abstractmethod
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from app.config.config import settings
|
||||
from app.utils.uploader import ImageUploaderFactory
|
||||
from app.log.logger import get_openai_logger
|
||||
from app.utils.helpers import is_image_upload_configured
|
||||
from app.utils.uploader import ImageUploaderFactory
|
||||
|
||||
logger = get_openai_logger()
|
||||
|
||||
@@ -33,7 +33,11 @@ class GeminiResponseHandler(ResponseHandler):
|
||||
self.thinking_status = False
|
||||
|
||||
def handle_response(
|
||||
self, response: Dict[str, Any], model: str, stream: bool = False, usage_metadata: Optional[Dict[str, Any]] = None
|
||||
self,
|
||||
response: Dict[str, Any],
|
||||
model: str,
|
||||
stream: bool = False,
|
||||
usage_metadata: Optional[Dict[str, Any]] = None,
|
||||
) -> Dict[str, Any]:
|
||||
if stream:
|
||||
return _handle_gemini_stream_response(response, model, stream)
|
||||
@@ -41,7 +45,10 @@ class GeminiResponseHandler(ResponseHandler):
|
||||
|
||||
|
||||
def _handle_openai_stream_response(
|
||||
response: Dict[str, Any], model: str, finish_reason: str, usage_metadata: Optional[Dict[str, Any]]
|
||||
response: Dict[str, Any],
|
||||
model: str,
|
||||
finish_reason: str,
|
||||
usage_metadata: Optional[Dict[str, Any]],
|
||||
) -> Dict[str, Any]:
|
||||
choices = []
|
||||
candidates = response.get("candidates", [])
|
||||
@@ -55,15 +62,15 @@ def _handle_openai_stream_response(
|
||||
if not text and not tool_calls and not reasoning_content:
|
||||
delta = {}
|
||||
else:
|
||||
delta = {"content": text, "reasoning_content": reasoning_content, "role": "assistant"}
|
||||
delta = {
|
||||
"content": text,
|
||||
"reasoning_content": reasoning_content,
|
||||
"role": "assistant",
|
||||
}
|
||||
if tool_calls:
|
||||
delta["tool_calls"] = tool_calls
|
||||
|
||||
choice = {
|
||||
"index": index,
|
||||
"delta": delta,
|
||||
"finish_reason": finish_reason
|
||||
}
|
||||
|
||||
choice = {"index": index, "delta": delta, "finish_reason": finish_reason}
|
||||
choices.append(choice)
|
||||
|
||||
template_chunk = {
|
||||
@@ -74,16 +81,23 @@ def _handle_openai_stream_response(
|
||||
"choices": choices,
|
||||
}
|
||||
if usage_metadata:
|
||||
template_chunk["usage"] = {"prompt_tokens": usage_metadata.get("promptTokenCount", 0), "completion_tokens": usage_metadata.get("candidatesTokenCount",0), "total_tokens": usage_metadata.get("totalTokenCount", 0)}
|
||||
template_chunk["usage"] = {
|
||||
"prompt_tokens": usage_metadata.get("promptTokenCount", 0),
|
||||
"completion_tokens": usage_metadata.get("candidatesTokenCount", 0),
|
||||
"total_tokens": usage_metadata.get("totalTokenCount", 0),
|
||||
}
|
||||
return template_chunk
|
||||
|
||||
|
||||
def _handle_openai_normal_response(
|
||||
response: Dict[str, Any], model: str, finish_reason: str, usage_metadata: Optional[Dict[str, Any]]
|
||||
response: Dict[str, Any],
|
||||
model: str,
|
||||
finish_reason: str,
|
||||
usage_metadata: Optional[Dict[str, Any]],
|
||||
) -> Dict[str, Any]:
|
||||
choices = []
|
||||
candidates = response.get("candidates", [])
|
||||
|
||||
|
||||
for i, candidate in enumerate(candidates):
|
||||
text, reasoning_content, tool_calls, _ = _extract_result(
|
||||
{"candidates": [candidate]}, model, stream=False, gemini_format=False
|
||||
@@ -106,7 +120,11 @@ def _handle_openai_normal_response(
|
||||
"created": int(time.time()),
|
||||
"model": model,
|
||||
"choices": choices,
|
||||
"usage": {"prompt_tokens": usage_metadata.get("promptTokenCount", 0), "completion_tokens": usage_metadata.get("candidatesTokenCount",0), "total_tokens": usage_metadata.get("totalTokenCount", 0)},
|
||||
"usage": {
|
||||
"prompt_tokens": usage_metadata.get("promptTokenCount", 0),
|
||||
"completion_tokens": usage_metadata.get("candidatesTokenCount", 0),
|
||||
"total_tokens": usage_metadata.get("totalTokenCount", 0),
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@@ -127,8 +145,12 @@ class OpenAIResponseHandler(ResponseHandler):
|
||||
usage_metadata: Optional[Dict[str, Any]] = None,
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
if stream:
|
||||
return _handle_openai_stream_response(response, model, finish_reason, usage_metadata)
|
||||
return _handle_openai_normal_response(response, model, finish_reason, usage_metadata)
|
||||
return _handle_openai_stream_response(
|
||||
response, model, finish_reason, usage_metadata
|
||||
)
|
||||
return _handle_openai_normal_response(
|
||||
response, model, finish_reason, usage_metadata
|
||||
)
|
||||
|
||||
def handle_image_chat_response(
|
||||
self, image_str: str, model: str, stream=False, finish_reason="stop"
|
||||
@@ -182,7 +204,7 @@ def _extract_result(
|
||||
gemini_format: bool = False,
|
||||
) -> tuple[str, Optional[str], List[Dict[str, Any]], Optional[bool]]:
|
||||
text, reasoning_content, tool_calls, thought = "", "", [], None
|
||||
|
||||
|
||||
if stream:
|
||||
if response.get("candidates"):
|
||||
candidate = response["candidates"][0]
|
||||
@@ -191,7 +213,7 @@ def _extract_result(
|
||||
if not parts:
|
||||
logger.warning("No parts found in stream response")
|
||||
return "", None, [], None
|
||||
|
||||
|
||||
if "text" in parts[0]:
|
||||
text = parts[0].get("text")
|
||||
if "thought" in parts[0]:
|
||||
@@ -217,13 +239,13 @@ def _extract_result(
|
||||
if response.get("candidates"):
|
||||
candidate = response["candidates"][0]
|
||||
text, reasoning_content = "", ""
|
||||
|
||||
|
||||
# 使用安全的访问方式
|
||||
content = candidate.get("content", {})
|
||||
|
||||
|
||||
if content and isinstance(content, dict):
|
||||
parts = content.get("parts", [])
|
||||
|
||||
|
||||
if parts:
|
||||
for part in parts:
|
||||
if "text" in part:
|
||||
@@ -241,14 +263,14 @@ def _extract_result(
|
||||
logger.error(f"Invalid content structure for model: {model}")
|
||||
|
||||
text = _add_search_link_text(model, candidate, text)
|
||||
|
||||
|
||||
# 安全地获取 parts 用于工具调用提取
|
||||
parts = candidate.get("content", {}).get("parts", [])
|
||||
tool_calls = _extract_tool_calls(parts, gemini_format)
|
||||
else:
|
||||
logger.warning(f"No candidates found in response for model: {model}")
|
||||
text = "暂无返回"
|
||||
|
||||
|
||||
return text, reasoning_content, tool_calls, thought
|
||||
|
||||
|
||||
@@ -264,10 +286,6 @@ def _has_inline_image_part(response: Dict[str, Any]) -> bool:
|
||||
|
||||
|
||||
def _extract_image_data(part: dict) -> str:
|
||||
# Return empty string if no uploader is configured
|
||||
if not is_image_upload_configured():
|
||||
return ""
|
||||
|
||||
image_uploader = None
|
||||
if settings.UPLOAD_PROVIDER == "smms":
|
||||
image_uploader = ImageUploaderFactory.create(
|
||||
@@ -287,13 +305,17 @@ def _extract_image_data(part: dict) -> str:
|
||||
current_date = time.strftime("%Y/%m/%d")
|
||||
filename = f"{current_date}/{uuid.uuid4().hex[:8]}.png"
|
||||
base64_data = part["inlineData"]["data"]
|
||||
mime_type = part["inlineData"]["mimeType"]
|
||||
# 将base64_data转成bytes数组
|
||||
# Fall back to an inline base64 image if no uploader is configured
|
||||
if not is_image_upload_configured(settings):
|
||||
return f"\n\n\n\n"
|
||||
bytes_data = base64.b64decode(base64_data)
|
||||
upload_response = image_uploader.upload(bytes_data, filename)
|
||||
if upload_response.success:
|
||||
text = f"\n\n\n\n"
|
||||
else:
|
||||
text = ""
|
||||
text = f"\n\n\n\n"
|
||||
return text
|
||||
|
||||
|
||||
@@ -306,7 +328,7 @@ def _extract_tool_calls(
|
||||
|
||||
letters = string.ascii_lowercase + string.digits
|
||||
tool_calls = list()
|
||||
|
||||
|
||||
for i in range(len(parts)):
|
||||
part = parts[i]
|
||||
if not part or not isinstance(part, dict):
|
||||
@@ -315,7 +337,7 @@ def _extract_tool_calls(
|
||||
item = part.get("functionCall", {})
|
||||
if not item or not isinstance(item, dict):
|
||||
continue
|
||||
|
||||
|
||||
if gemini_format:
|
||||
tool_calls.append(part)
|
||||
else:
|
||||
@@ -339,9 +361,9 @@ def _handle_gemini_stream_response(
|
||||
response: Dict[str, Any], model: str, stream: bool
|
||||
) -> Dict[str, Any]:
|
||||
# Early return raw Gemini response if no uploader configured and contains inline images
|
||||
if not is_image_upload_configured() and _has_inline_image_part(response):
|
||||
if not is_image_upload_configured(settings) and _has_inline_image_part(response):
|
||||
return response
|
||||
|
||||
|
||||
text, reasoning_content, tool_calls, thought = _extract_result(
|
||||
response, model, stream=stream, gemini_format=True
|
||||
)
|
||||
@@ -360,9 +382,9 @@ def _handle_gemini_normal_response(
|
||||
response: Dict[str, Any], model: str, stream: bool
|
||||
) -> Dict[str, Any]:
|
||||
# Early return raw Gemini response if no uploader configured and contains inline images
|
||||
if not is_image_upload_configured() and _has_inline_image_part(response):
|
||||
if not is_image_upload_configured(settings) and _has_inline_image_part(response):
|
||||
return response
|
||||
|
||||
|
||||
text, reasoning_content, tool_calls, thought = _extract_result(
|
||||
response, model, stream=stream, gemini_format=True
|
||||
)
|
||||
@@ -371,7 +393,7 @@ def _handle_gemini_normal_response(
|
||||
parts = tool_calls
|
||||
else:
|
||||
if thought is not None:
|
||||
parts.append({"text": reasoning_content,"thought": thought})
|
||||
parts.append({"text": reasoning_content, "thought": thought})
|
||||
part = {"text": text}
|
||||
parts.append(part)
|
||||
content = {"parts": parts, "role": "model"}
|
||||
|
||||
Reference in New Issue
Block a user