Vision support: images/PDFs can be passed on to other models as part of analysis or additional context.
- Image processing pipeline
- Added OpenAI GPT-4.1 support
- Chat tool prompt enhancement
- Lint and code quality improvements
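For orientation, here is a hedged call-site sketch of what the new `images` parameter enables; the provider construction and file paths are illustrative, not part of this commit:

    # Hypothetical usage sketch: pass file paths or data URLs via the new images parameter.
    provider = GeminiModelProvider(api_key="...")  # construction details assumed
    response = provider.generate_content(
        prompt="Summarize what this architecture diagram shows.",
        model_name="flash",  # shorthand for gemini-2.5-flash-preview-05-20 (see table below)
        images=[
            "/workspace/docs/diagram.png",           # a file path (translated for Docker)
            "data:image/png;base64,iVBORw0KGgo...",  # or an inline data URL
        ],
    )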
@@ -112,6 +112,8 @@ class ModelCapabilities:
     supports_system_prompts: bool = True
     supports_streaming: bool = True
     supports_function_calling: bool = False
+    supports_images: bool = False  # Whether model can process images
+    max_image_size_mb: float = 0.0  # Maximum total size for all images in MB
 
     # Temperature constraint object - preferred way to define temperature limits
     temperature_constraint: TemperatureConstraint = field(
@@ -1,6 +1,8 @@
 """Gemini model provider implementation."""
 
+import base64
 import logging
+import os
 import time
 from typing import Optional
 
@@ -21,11 +23,15 @@ class GeminiModelProvider(ModelProvider):
             "context_window": 1_048_576,  # 1M tokens
             "supports_extended_thinking": True,
             "max_thinking_tokens": 24576,  # Flash 2.5 thinking budget limit
+            "supports_images": True,  # Vision capability
+            "max_image_size_mb": 20.0,  # Conservative 20MB limit for reliability
         },
         "gemini-2.5-pro-preview-06-05": {
             "context_window": 1_048_576,  # 1M tokens
             "supports_extended_thinking": True,
             "max_thinking_tokens": 32768,  # Pro 2.5 thinking budget limit
+            "supports_images": True,  # Vision capability
+            "max_image_size_mb": 32.0,  # Higher limit for Pro model
         },
         # Shorthands
         "flash": "gemini-2.5-flash-preview-05-20",
@@ -84,6 +90,8 @@ class GeminiModelProvider(ModelProvider):
             supports_system_prompts=True,
             supports_streaming=True,
             supports_function_calling=True,
+            supports_images=config.get("supports_images", False),
+            max_image_size_mb=config.get("max_image_size_mb", 0.0),
             temperature_constraint=temp_constraint,
         )
 
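A hedged sketch of how a caller might gate image input on these capability flags; the `get_capabilities` accessor name and the size-summing helper are assumptions, only the field names come from this diff:

    caps = provider.get_capabilities("flash")  # accessor name assumed
    # total_size_mb() is a hypothetical helper summing the images' sizes in MB.
    if caps.supports_images and total_size_mb(images) <= caps.max_image_size_mb:
        response = provider.generate_content(prompt=prompt, model_name="flash", images=images)
    else:
        response = provider.generate_content(prompt=prompt, model_name="flash")  # omit images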
@@ -95,6 +103,7 @@ class GeminiModelProvider(ModelProvider):
         temperature: float = 0.7,
         max_output_tokens: Optional[int] = None,
         thinking_mode: str = "medium",
+        images: Optional[list[str]] = None,
         **kwargs,
     ) -> ModelResponse:
         """Generate content using Gemini model."""
@@ -102,12 +111,34 @@ class GeminiModelProvider(ModelProvider):
         resolved_name = self._resolve_model_name(model_name)
         self.validate_parameters(resolved_name, temperature)
 
-        # Combine system prompt with user prompt if provided
+        # Prepare content parts (text and potentially images)
+        parts = []
+
+        # Add system and user prompts as text
         if system_prompt:
             full_prompt = f"{system_prompt}\n\n{prompt}"
         else:
             full_prompt = prompt
 
+        parts.append({"text": full_prompt})
+
+        # Add images if provided and model supports vision
+        if images and self._supports_vision(resolved_name):
+            for image_path in images:
+                try:
+                    image_part = self._process_image(image_path)
+                    if image_part:
+                        parts.append(image_part)
+                except Exception as e:
+                    logger.warning(f"Failed to process image {image_path}: {e}")
+                    # Continue with other images and text
+                    continue
+        elif images and not self._supports_vision(resolved_name):
+            logger.warning(f"Model {resolved_name} does not support images, ignoring {len(images)} image(s)")
+
+        # Create contents structure
+        contents = [{"parts": parts}]
+
         # Prepare generation config
         generation_config = types.GenerateContentConfig(
             temperature=temperature,
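Concretely, for one text prompt plus one valid image, the `contents` structure assembled above takes roughly this shape (values illustrative):

    # One text part from the combined prompts, one inline_data part per processed image.
    contents = [
        {
            "parts": [
                {"text": "system prompt\n\nuser prompt"},
                {"inline_data": {"mime_type": "image/png", "data": "<base64>"}},
            ]
        }
    ]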
@@ -139,7 +170,7 @@ class GeminiModelProvider(ModelProvider):
         # Generate content
         response = self.client.models.generate_content(
             model=resolved_name,
-            contents=full_prompt,
+            contents=contents,
             config=generation_config,
         )
 
@@ -274,3 +305,51 @@ class GeminiModelProvider(ModelProvider):
             usage["total_tokens"] = usage["input_tokens"] + usage["output_tokens"]
 
         return usage
+
+    def _supports_vision(self, model_name: str) -> bool:
+        """Check if the model supports vision (image processing)."""
+        # Gemini 2.5 models support vision
+        vision_models = {
+            "gemini-2.5-flash-preview-05-20",
+            "gemini-2.5-pro-preview-06-05",
+            "gemini-2.0-flash",
+            "gemini-1.5-pro",
+            "gemini-1.5-flash",
+        }
+        return model_name in vision_models
+
+    def _process_image(self, image_path: str) -> Optional[dict]:
+        """Process an image for Gemini API."""
+        try:
+            if image_path.startswith("data:image/"):
+                # Handle data URL: data:image/png;base64,iVBORw0...
+                header, data = image_path.split(",", 1)
+                mime_type = header.split(";")[0].split(":")[1]
+                return {"inline_data": {"mime_type": mime_type, "data": data}}
+            else:
+                # Handle file path - translate for Docker environment
+                from utils.file_types import get_image_mime_type
+                from utils.file_utils import translate_path_for_environment
+
+                translated_path = translate_path_for_environment(image_path)
+                logger.debug(f"Translated image path from '{image_path}' to '{translated_path}'")
+
+                if not os.path.exists(translated_path):
+                    logger.warning(f"Image file not found: {translated_path} (original: {image_path})")
+                    return None
+
+                # Use translated path for all subsequent operations
+                image_path = translated_path
+
+                # Detect MIME type from file extension using centralized mappings
+                ext = os.path.splitext(image_path)[1].lower()
+                mime_type = get_image_mime_type(ext)
+
+                # Read and encode the image
+                with open(image_path, "rb") as f:
+                    image_data = base64.b64encode(f.read()).decode()
+
+                return {"inline_data": {"mime_type": mime_type, "data": image_data}}
+        except Exception as e:
+            logger.error(f"Error processing image {image_path}: {e}")
+            return None
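As a worked example of the data-URL branch in `_process_image` above (input illustrative):

    url = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUg"
    header, data = url.split(",", 1)                # header = "data:image/png;base64"
    mime_type = header.split(";")[0].split(":")[1]  # -> "image/png"
    # Returned part: {"inline_data": {"mime_type": "image/png", "data": "iVBORw0KGgoAAAANSUhEUg"}}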
@@ -23,22 +23,38 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
         "o3": {
             "context_window": 200_000,  # 200K tokens
             "supports_extended_thinking": False,
+            "supports_images": True,  # O3 models support vision
+            "max_image_size_mb": 20.0,  # 20MB per OpenAI docs
         },
         "o3-mini": {
             "context_window": 200_000,  # 200K tokens
             "supports_extended_thinking": False,
+            "supports_images": True,  # O3 models support vision
+            "max_image_size_mb": 20.0,  # 20MB per OpenAI docs
         },
         "o3-pro": {
             "context_window": 200_000,  # 200K tokens
             "supports_extended_thinking": False,
+            "supports_images": True,  # O3 models support vision
+            "max_image_size_mb": 20.0,  # 20MB per OpenAI docs
         },
         "o4-mini": {
             "context_window": 200_000,  # 200K tokens
             "supports_extended_thinking": False,
+            "supports_images": True,  # O4 models support vision
+            "max_image_size_mb": 20.0,  # 20MB per OpenAI docs
         },
         "o4-mini-high": {
             "context_window": 200_000,  # 200K tokens
             "supports_extended_thinking": False,
+            "supports_images": True,  # O4 models support vision
+            "max_image_size_mb": 20.0,  # 20MB per OpenAI docs
         },
+        "gpt-4.1-2025-04-14": {
+            "context_window": 1_000_000,  # 1M tokens
+            "supports_extended_thinking": False,
+            "supports_images": True,  # GPT-4.1 supports vision
+            "max_image_size_mb": 20.0,  # 20MB per OpenAI docs
+        },
         # Shorthands
         "mini": "o4-mini",  # Default 'mini' to latest mini model
@@ -46,6 +62,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
         "o4mini": "o4-mini",
         "o4minihigh": "o4-mini-high",
         "o4minihi": "o4-mini-high",
+        "gpt4.1": "gpt-4.1-2025-04-14",
     }
 
     def __init__(self, api_key: str, **kwargs):
@@ -76,7 +93,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
             # O3 and O4 reasoning models only support temperature=1.0
             temp_constraint = FixedTemperatureConstraint(1.0)
         else:
-            # Other OpenAI models support 0.0-2.0 range
+            # Other OpenAI models (including GPT-4.1) support 0.0-2.0 range
             temp_constraint = RangeTemperatureConstraint(0.0, 2.0, 0.7)
 
         return ModelCapabilities(
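To illustrate the two constraint paths (the argument meanings are assumptions; only the constructor calls appear in this diff):

    # O3/O4 reasoning models: temperature is pinned to a single value.
    fixed = FixedTemperatureConstraint(1.0)
    # Other OpenAI models, including GPT-4.1: assumed (min, max, default) ordering.
    ranged = RangeTemperatureConstraint(0.0, 2.0, 0.7)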
@@ -88,6 +105,8 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
             supports_system_prompts=True,
             supports_streaming=True,
             supports_function_calling=True,
+            supports_images=config.get("supports_images", False),
+            max_image_size_mb=config.get("max_image_size_mb", 0.0),
             temperature_constraint=temp_constraint,
         )
 
@@ -1,5 +1,6 @@
 """Base class for OpenAI-compatible API providers."""
 
+import base64
 import ipaddress
 import logging
 import os
@@ -229,6 +230,7 @@ class OpenAICompatibleProvider(ModelProvider):
         system_prompt: Optional[str] = None,
         temperature: float = 0.7,
         max_output_tokens: Optional[int] = None,
+        images: Optional[list[str]] = None,
         **kwargs,
     ) -> ModelResponse:
         """Generate content using the OpenAI-compatible API.
@@ -255,7 +257,32 @@ class OpenAICompatibleProvider(ModelProvider):
         messages = []
         if system_prompt:
             messages.append({"role": "system", "content": system_prompt})
-        messages.append({"role": "user", "content": prompt})
+
+        # Prepare user message with text and potentially images
+        user_content = []
+        user_content.append({"type": "text", "text": prompt})
+
+        # Add images if provided and model supports vision
+        if images and self._supports_vision(model_name):
+            for image_path in images:
+                try:
+                    image_content = self._process_image(image_path)
+                    if image_content:
+                        user_content.append(image_content)
+                except Exception as e:
+                    logging.warning(f"Failed to process image {image_path}: {e}")
+                    # Continue with other images and text
+                    continue
+        elif images and not self._supports_vision(model_name):
+            logging.warning(f"Model {model_name} does not support images, ignoring {len(images)} image(s)")
+
+        # Add user message
+        if len(user_content) == 1:
+            # Only text content, use simple string format for compatibility
+            messages.append({"role": "user", "content": prompt})
+        else:
+            # Text + images, use content array format
+            messages.append({"role": "user", "content": user_content})
 
         # Prepare completion parameters
         completion_params = {
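The resulting Chat Completions payload for text plus one image then looks roughly like this (values illustrative); note the plain-string form is kept when no images survive processing:

    messages = [
        {"role": "system", "content": "You are a code reviewer."},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Review this screenshot for layout bugs."},
                {"type": "image_url", "image_url": {"url": "data:image/png;base64,<base64>"}},
            ],
        },
    ]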
@@ -424,3 +451,66 @@ class OpenAICompatibleProvider(ModelProvider):
         Default is False for OpenAI-compatible providers.
         """
         return False
+
+    def _supports_vision(self, model_name: str) -> bool:
+        """Check if the model supports vision (image processing).
+
+        Default implementation for OpenAI-compatible providers.
+        Subclasses should override with specific model support.
+        """
+        # Common vision-capable models - only include models that actually support images
+        vision_models = {
+            "gpt-4o",
+            "gpt-4o-mini",
+            "gpt-4-turbo",
+            "gpt-4-vision-preview",
+            "gpt-4.1-2025-04-14",  # GPT-4.1 supports vision
+            "o3",
+            "o3-mini",
+            "o3-pro",
+            "o4-mini",
+            "o4-mini-high",
+            # Note: Claude models would be handled by a separate provider
+        }
+        supports = model_name.lower() in vision_models
+        logging.debug(f"Model '{model_name}' vision support: {supports}")
+        return supports
+
+    def _process_image(self, image_path: str) -> Optional[dict]:
+        """Process an image for OpenAI-compatible API."""
+        try:
+            if image_path.startswith("data:image/"):
+                # Handle data URL: data:image/png;base64,iVBORw0...
+                return {"type": "image_url", "image_url": {"url": image_path}}
+            else:
+                # Handle file path - translate for Docker environment
+                from utils.file_utils import translate_path_for_environment
+
+                translated_path = translate_path_for_environment(image_path)
+                logging.debug(f"Translated image path from '{image_path}' to '{translated_path}'")
+
+                if not os.path.exists(translated_path):
+                    logging.warning(f"Image file not found: {translated_path} (original: {image_path})")
+                    return None
+
+                # Use translated path for all subsequent operations
+                image_path = translated_path
+
+                # Detect MIME type from file extension using centralized mappings
+                from utils.file_types import get_image_mime_type
+
+                ext = os.path.splitext(image_path)[1].lower()
+                mime_type = get_image_mime_type(ext)
+                logging.debug(f"Processing image '{image_path}' with extension '{ext}' as MIME type '{mime_type}'")
+
+                # Read and encode the image
+                with open(image_path, "rb") as f:
+                    image_data = base64.b64encode(f.read()).decode()
+
+                # Create data URL for OpenAI API
+                data_url = f"data:{mime_type};base64,{image_data}"
+
+                return {"type": "image_url", "image_url": {"url": data_url}}
+        except Exception as e:
+            logging.error(f"Error processing image {image_path}: {e}")
+            return None
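Both `_process_image` implementations delegate MIME detection to `utils.file_types.get_image_mime_type`, which is not part of this diff; a plausible sketch of such a helper, offered purely as an assumption:

    # Hypothetical sketch; the real mapping in utils.file_types is not shown in this commit.
    _IMAGE_MIME_TYPES = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }

    def get_image_mime_type(ext: str) -> str:
        # Assumed fallback; the real helper may default differently.
        return _IMAGE_MIME_TYPES.get(ext.lower(), "image/png")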
@@ -23,6 +23,8 @@ class OpenRouterModelConfig:
     supports_streaming: bool = True
     supports_function_calling: bool = False
     supports_json_mode: bool = False
+    supports_images: bool = False  # Whether model can process images
+    max_image_size_mb: float = 0.0  # Maximum total size for all images in MB
     is_custom: bool = False  # True for models that should only be used with custom endpoints
     description: str = ""
 
@@ -37,6 +39,8 @@ class OpenRouterModelConfig:
             supports_system_prompts=self.supports_system_prompts,
             supports_streaming=self.supports_streaming,
             supports_function_calling=self.supports_function_calling,
+            supports_images=self.supports_images,
+            max_image_size_mb=self.max_image_size_mb,
             temperature_constraint=RangeTemperatureConstraint(0.0, 2.0, 1.0),
         )
 
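A hedged sketch of how the new OpenRouter fields flow into capabilities; the `model_name` field and `to_capabilities` method name are assumptions, only the field mapping above comes from the diff:

    config = OpenRouterModelConfig(
        model_name="anthropic/claude-3-opus",  # field name assumed
        supports_images=True,
        max_image_size_mb=20.0,
    )
    caps = config.to_capabilities()  # method name assumed
    assert caps.supports_images and caps.max_image_size_mb == 20.0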
@@ -66,7 +70,8 @@ class OpenRouterModelRegistry:
             translated_path = translate_path_for_environment(env_path)
             self.config_path = Path(translated_path)
         else:
-            # Default to conf/custom_models.json (already in container)
+            # Default to conf/custom_models.json - use relative path from this file
+            # This works both in development and container environments
             self.config_path = Path(__file__).parent.parent / "conf" / "custom_models.json"
 
         # Load configuration