Vision support via images / pdfs etc that can be passed on to other models as part of analysis, additional context etc.

Image processing pipeline added
OpenAI GPT-4.1 support
Chat tool prompt enhancement
Lint and code quality improvements
This commit is contained in:
Fahad
2025-06-16 13:14:53 +04:00
parent d498e9854b
commit 97fa6781cf
26 changed files with 1328 additions and 52 deletions

View File

@@ -1,6 +1,8 @@
"""Gemini model provider implementation."""
import base64
import logging
import os
import time
from typing import Optional
@@ -21,11 +23,15 @@ class GeminiModelProvider(ModelProvider):
"context_window": 1_048_576, # 1M tokens
"supports_extended_thinking": True,
"max_thinking_tokens": 24576, # Flash 2.5 thinking budget limit
"supports_images": True, # Vision capability
"max_image_size_mb": 20.0, # Conservative 20MB limit for reliability
},
"gemini-2.5-pro-preview-06-05": {
"context_window": 1_048_576, # 1M tokens
"supports_extended_thinking": True,
"max_thinking_tokens": 32768, # Pro 2.5 thinking budget limit
"supports_images": True, # Vision capability
"max_image_size_mb": 32.0, # Higher limit for Pro model
},
# Shorthands
"flash": "gemini-2.5-flash-preview-05-20",
@@ -84,6 +90,8 @@ class GeminiModelProvider(ModelProvider):
supports_system_prompts=True,
supports_streaming=True,
supports_function_calling=True,
supports_images=config.get("supports_images", False),
max_image_size_mb=config.get("max_image_size_mb", 0.0),
temperature_constraint=temp_constraint,
)
@@ -95,6 +103,7 @@ class GeminiModelProvider(ModelProvider):
temperature: float = 0.7,
max_output_tokens: Optional[int] = None,
thinking_mode: str = "medium",
images: Optional[list[str]] = None,
**kwargs,
) -> ModelResponse:
"""Generate content using Gemini model."""
@@ -102,12 +111,34 @@ class GeminiModelProvider(ModelProvider):
resolved_name = self._resolve_model_name(model_name)
self.validate_parameters(resolved_name, temperature)
# Combine system prompt with user prompt if provided
# Prepare content parts (text and potentially images)
parts = []
# Add system and user prompts as text
if system_prompt:
full_prompt = f"{system_prompt}\n\n{prompt}"
else:
full_prompt = prompt
parts.append({"text": full_prompt})
# Add images if provided and model supports vision
if images and self._supports_vision(resolved_name):
for image_path in images:
try:
image_part = self._process_image(image_path)
if image_part:
parts.append(image_part)
except Exception as e:
logger.warning(f"Failed to process image {image_path}: {e}")
# Continue with other images and text
continue
elif images and not self._supports_vision(resolved_name):
logger.warning(f"Model {resolved_name} does not support images, ignoring {len(images)} image(s)")
# Create contents structure
contents = [{"parts": parts}]
# Prepare generation config
generation_config = types.GenerateContentConfig(
temperature=temperature,
@@ -139,7 +170,7 @@ class GeminiModelProvider(ModelProvider):
# Generate content
response = self.client.models.generate_content(
model=resolved_name,
contents=full_prompt,
contents=contents,
config=generation_config,
)
@@ -274,3 +305,51 @@ class GeminiModelProvider(ModelProvider):
usage["total_tokens"] = usage["input_tokens"] + usage["output_tokens"]
return usage
def _supports_vision(self, model_name: str) -> bool:
"""Check if the model supports vision (image processing)."""
# Gemini 2.5 models support vision
vision_models = {
"gemini-2.5-flash-preview-05-20",
"gemini-2.5-pro-preview-06-05",
"gemini-2.0-flash",
"gemini-1.5-pro",
"gemini-1.5-flash",
}
return model_name in vision_models
def _process_image(self, image_path: str) -> Optional[dict]:
"""Process an image for Gemini API."""
try:
if image_path.startswith("data:image/"):
# Handle data URL: data:image/png;base64,iVBORw0...
header, data = image_path.split(",", 1)
mime_type = header.split(";")[0].split(":")[1]
return {"inline_data": {"mime_type": mime_type, "data": data}}
else:
# Handle file path - translate for Docker environment
from utils.file_types import get_image_mime_type
from utils.file_utils import translate_path_for_environment
translated_path = translate_path_for_environment(image_path)
logger.debug(f"Translated image path from '{image_path}' to '{translated_path}'")
if not os.path.exists(translated_path):
logger.warning(f"Image file not found: {translated_path} (original: {image_path})")
return None
# Use translated path for all subsequent operations
image_path = translated_path
# Detect MIME type from file extension using centralized mappings
ext = os.path.splitext(image_path)[1].lower()
mime_type = get_image_mime_type(ext)
# Read and encode the image
with open(image_path, "rb") as f:
image_data = base64.b64encode(f.read()).decode()
return {"inline_data": {"mime_type": mime_type, "data": image_data}}
except Exception as e:
logger.error(f"Error processing image {image_path}: {e}")
return None