Vision support via images/PDFs that can be passed on to other models for analysis, additional context, etc.

Image processing pipeline added
OpenAI GPT-4.1 support
Chat tool prompt enhancement
Lint and code quality improvements
Fahad
2025-06-16 13:14:53 +04:00
parent d498e9854b
commit 97fa6781cf
26 changed files with 1328 additions and 52 deletions
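A minimal sketch of how the new images parameter is exercised from caller code (the module path and provider constructor are assumptions for illustration; only the images keyword and the two accepted input forms come from this commit):

    # Hypothetical caller; import path and constructor are assumptions.
    from providers.gemini import GeminiModelProvider

    provider = GeminiModelProvider(api_key="...")
    response = provider.generate_content(
        prompt="Describe the attached diagram and flag anything inconsistent.",
        model_name="flash",                          # shorthand for gemini-2.5-flash-preview-05-20
        images=[
            "/workspace/docs/architecture.png",      # file path (translated for the Docker environment)
            "data:image/png;base64,iVBORw0KGgo...",  # or an inline base64 data URL
        ],
    )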

View File

@@ -112,6 +112,8 @@ class ModelCapabilities:
supports_system_prompts: bool = True
supports_streaming: bool = True
supports_function_calling: bool = False
supports_images: bool = False # Whether model can process images
max_image_size_mb: float = 0.0 # Maximum total size for all images in MB
# Temperature constraint object - preferred way to define temperature limits
temperature_constraint: TemperatureConstraint = field(
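The two new capability fields let callers gate image handling before a request is built. A hedged sketch, assuming a get_capabilities()-style lookup that returns ModelCapabilities (the lookup name and the size check are illustrative, not part of this hunk):

    import os

    caps = provider.get_capabilities("flash")  # assumed lookup returning ModelCapabilities
    if caps.supports_images and image_paths:
        total_mb = sum(os.path.getsize(p) for p in image_paths) / (1024 * 1024)
        if total_mb > caps.max_image_size_mb:
            raise ValueError(f"{total_mb:.1f} MB of images exceeds the {caps.max_image_size_mb} MB limit")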

View File

@@ -1,6 +1,8 @@
"""Gemini model provider implementation."""
import base64
import logging
import os
import time
from typing import Optional
@@ -21,11 +23,15 @@ class GeminiModelProvider(ModelProvider):
"context_window": 1_048_576, # 1M tokens
"supports_extended_thinking": True,
"max_thinking_tokens": 24576, # Flash 2.5 thinking budget limit
"supports_images": True, # Vision capability
"max_image_size_mb": 20.0, # Conservative 20MB limit for reliability
},
"gemini-2.5-pro-preview-06-05": {
"context_window": 1_048_576, # 1M tokens
"supports_extended_thinking": True,
"max_thinking_tokens": 32768, # Pro 2.5 thinking budget limit
"supports_images": True, # Vision capability
"max_image_size_mb": 32.0, # Higher limit for Pro model
},
# Shorthands
"flash": "gemini-2.5-flash-preview-05-20",
@@ -84,6 +90,8 @@ class GeminiModelProvider(ModelProvider):
supports_system_prompts=True,
supports_streaming=True,
supports_function_calling=True,
supports_images=config.get("supports_images", False),
max_image_size_mb=config.get("max_image_size_mb", 0.0),
temperature_constraint=temp_constraint,
)
@@ -95,6 +103,7 @@ class GeminiModelProvider(ModelProvider):
temperature: float = 0.7,
max_output_tokens: Optional[int] = None,
thinking_mode: str = "medium",
images: Optional[list[str]] = None,
**kwargs,
) -> ModelResponse:
"""Generate content using Gemini model."""
@@ -102,12 +111,34 @@ class GeminiModelProvider(ModelProvider):
resolved_name = self._resolve_model_name(model_name)
self.validate_parameters(resolved_name, temperature)
# Combine system prompt with user prompt if provided
# Prepare content parts (text and potentially images)
parts = []
# Add system and user prompts as text
if system_prompt:
full_prompt = f"{system_prompt}\n\n{prompt}"
else:
full_prompt = prompt
parts.append({"text": full_prompt})
# Add images if provided and model supports vision
if images and self._supports_vision(resolved_name):
for image_path in images:
try:
image_part = self._process_image(image_path)
if image_part:
parts.append(image_part)
except Exception as e:
logger.warning(f"Failed to process image {image_path}: {e}")
# Continue with other images and text
continue
elif images and not self._supports_vision(resolved_name):
logger.warning(f"Model {resolved_name} does not support images, ignoring {len(images)} image(s)")
# Create contents structure
contents = [{"parts": parts}]
# Prepare generation config
generation_config = types.GenerateContentConfig(
temperature=temperature,
@@ -139,7 +170,7 @@ class GeminiModelProvider(ModelProvider):
# Generate content
response = self.client.models.generate_content(
model=resolved_name,
contents=full_prompt,
contents=contents,
config=generation_config,
)
@@ -274,3 +305,51 @@ class GeminiModelProvider(ModelProvider):
usage["total_tokens"] = usage["input_tokens"] + usage["output_tokens"]
return usage
def _supports_vision(self, model_name: str) -> bool:
"""Check if the model supports vision (image processing)."""
# Gemini models with native vision support (2.5 previews plus the 2.0 and 1.5 families)
vision_models = {
"gemini-2.5-flash-preview-05-20",
"gemini-2.5-pro-preview-06-05",
"gemini-2.0-flash",
"gemini-1.5-pro",
"gemini-1.5-flash",
}
return model_name in vision_models
def _process_image(self, image_path: str) -> Optional[dict]:
"""Process an image for Gemini API."""
try:
if image_path.startswith("data:image/"):
# Handle data URL: data:image/png;base64,iVBORw0...
header, data = image_path.split(",", 1)
mime_type = header.split(";")[0].split(":")[1]
return {"inline_data": {"mime_type": mime_type, "data": data}}
else:
# Handle file path - translate for Docker environment
from utils.file_types import get_image_mime_type
from utils.file_utils import translate_path_for_environment
translated_path = translate_path_for_environment(image_path)
logger.debug(f"Translated image path from '{image_path}' to '{translated_path}'")
if not os.path.exists(translated_path):
logger.warning(f"Image file not found: {translated_path} (original: {image_path})")
return None
# Use translated path for all subsequent operations
image_path = translated_path
# Detect MIME type from file extension using centralized mappings
ext = os.path.splitext(image_path)[1].lower()
mime_type = get_image_mime_type(ext)
# Read and encode the image
with open(image_path, "rb") as f:
image_data = base64.b64encode(f.read()).decode()
return {"inline_data": {"mime_type": mime_type, "data": image_data}}
except Exception as e:
logger.error(f"Error processing image {image_path}: {e}")
return None
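For reference, with a system prompt, a user prompt, and one successfully processed image, the contents structure built above takes this shape (values illustrative):

    contents = [
        {
            "parts": [
                {"text": "You are a code reviewer.\n\nWhat does this screenshot show?"},
                {"inline_data": {"mime_type": "image/png", "data": "<base64-encoded bytes>"}},
            ]
        }
    ]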

View File

@@ -23,22 +23,38 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
"o3": {
"context_window": 200_000, # 200K tokens
"supports_extended_thinking": False,
"supports_images": True, # O3 models support vision
"max_image_size_mb": 20.0, # 20MB per OpenAI docs
},
"o3-mini": {
"context_window": 200_000, # 200K tokens
"supports_extended_thinking": False,
"supports_images": True, # O3 models support vision
"max_image_size_mb": 20.0, # 20MB per OpenAI docs
},
"o3-pro": {
"context_window": 200_000, # 200K tokens
"supports_extended_thinking": False,
"supports_images": True, # O3 models support vision
"max_image_size_mb": 20.0, # 20MB per OpenAI docs
},
"o4-mini": {
"context_window": 200_000, # 200K tokens
"supports_extended_thinking": False,
"supports_images": True, # O4 models support vision
"max_image_size_mb": 20.0, # 20MB per OpenAI docs
},
"o4-mini-high": {
"context_window": 200_000, # 200K tokens
"supports_extended_thinking": False,
"supports_images": True, # O4 models support vision
"max_image_size_mb": 20.0, # 20MB per OpenAI docs
},
"gpt-4.1-2025-04-14": {
"context_window": 1_000_000, # 1M tokens
"supports_extended_thinking": False,
"supports_images": True, # GPT-4.1 supports vision
"max_image_size_mb": 20.0, # 20MB per OpenAI docs
},
# Shorthands
"mini": "o4-mini", # Default 'mini' to latest mini model
@@ -46,6 +62,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
"o4mini": "o4-mini",
"o4minihigh": "o4-mini-high",
"o4minihi": "o4-mini-high",
"gpt4.1": "gpt-4.1-2025-04-14",
}
def __init__(self, api_key: str, **kwargs):
@@ -76,7 +93,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
# O3 and O4 reasoning models only support temperature=1.0
temp_constraint = FixedTemperatureConstraint(1.0)
else:
# Other OpenAI models support 0.0-2.0 range
# Other OpenAI models (including GPT-4.1) support 0.0-2.0 range
temp_constraint = RangeTemperatureConstraint(0.0, 2.0, 0.7)
return ModelCapabilities(
@@ -88,6 +105,8 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
supports_system_prompts=True,
supports_streaming=True,
supports_function_calling=True,
supports_images=config.get("supports_images", False),
max_image_size_mb=config.get("max_image_size_mb", 0.0),
temperature_constraint=temp_constraint,
)
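The new gpt4.1 shorthand resolves to the dated model id and, unlike the o3/o4 reasoning models pinned to temperature 1.0, keeps the standard 0.0-2.0 sampling range. An illustrative check, again assuming a get_capabilities()-style lookup:

    caps = provider.get_capabilities("gpt4.1")  # resolves to gpt-4.1-2025-04-14
    # caps.temperature_constraint -> RangeTemperatureConstraint(0.0, 2.0, 0.7)

    caps = provider.get_capabilities("mini")    # resolves to o4-mini
    # caps.temperature_constraint -> FixedTemperatureConstraint(1.0)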

View File

@@ -1,5 +1,6 @@
"""Base class for OpenAI-compatible API providers."""
import base64
import ipaddress
import logging
import os
@@ -229,6 +230,7 @@ class OpenAICompatibleProvider(ModelProvider):
system_prompt: Optional[str] = None,
temperature: float = 0.7,
max_output_tokens: Optional[int] = None,
images: Optional[list[str]] = None,
**kwargs,
) -> ModelResponse:
"""Generate content using the OpenAI-compatible API.
@@ -255,7 +257,32 @@ class OpenAICompatibleProvider(ModelProvider):
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": prompt})
# Prepare user message with text and potentially images
user_content = []
user_content.append({"type": "text", "text": prompt})
# Add images if provided and model supports vision
if images and self._supports_vision(model_name):
for image_path in images:
try:
image_content = self._process_image(image_path)
if image_content:
user_content.append(image_content)
except Exception as e:
logging.warning(f"Failed to process image {image_path}: {e}")
# Continue with other images and text
continue
elif images and not self._supports_vision(model_name):
logging.warning(f"Model {model_name} does not support images, ignoring {len(images)} image(s)")
# Add user message
if len(user_content) == 1:
# Only text content, use simple string format for compatibility
messages.append({"role": "user", "content": prompt})
else:
# Text + images, use content array format
messages.append({"role": "user", "content": user_content})
# Prepare completion parameters
completion_params = {
@@ -424,3 +451,66 @@ class OpenAICompatibleProvider(ModelProvider):
Default is False for OpenAI-compatible providers.
"""
return False
def _supports_vision(self, model_name: str) -> bool:
"""Check if the model supports vision (image processing).
Default implementation for OpenAI-compatible providers.
Subclasses should override with specific model support.
"""
# Common vision-capable models - only include models that actually support images
vision_models = {
"gpt-4o",
"gpt-4o-mini",
"gpt-4-turbo",
"gpt-4-vision-preview",
"gpt-4.1-2025-04-14", # GPT-4.1 supports vision
"o3",
"o3-mini",
"o3-pro",
"o4-mini",
"o4-mini-high",
# Note: Claude models would be handled by a separate provider
}
supports = model_name.lower() in vision_models
logging.debug(f"Model '{model_name}' vision support: {supports}")
return supports
def _process_image(self, image_path: str) -> Optional[dict]:
"""Process an image for OpenAI-compatible API."""
try:
if image_path.startswith("data:image/"):
# Handle data URL: data:image/png;base64,iVBORw0...
return {"type": "image_url", "image_url": {"url": image_path}}
else:
# Handle file path - translate for Docker environment
from utils.file_utils import translate_path_for_environment
translated_path = translate_path_for_environment(image_path)
logging.debug(f"Translated image path from '{image_path}' to '{translated_path}'")
if not os.path.exists(translated_path):
logging.warning(f"Image file not found: {translated_path} (original: {image_path})")
return None
# Use translated path for all subsequent operations
image_path = translated_path
# Detect MIME type from file extension using centralized mappings
from utils.file_types import get_image_mime_type
ext = os.path.splitext(image_path)[1].lower()
mime_type = get_image_mime_type(ext)
logging.debug(f"Processing image '{image_path}' with extension '{ext}' as MIME type '{mime_type}'")
# Read and encode the image
with open(image_path, "rb") as f:
image_data = base64.b64encode(f.read()).decode()
# Create data URL for OpenAI API
data_url = f"data:{mime_type};base64,{image_data}"
return {"type": "image_url", "image_url": {"url": data_url}}
except Exception as e:
logging.error(f"Error processing image {image_path}: {e}")
return None
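With at least one image attached, the user message switches from a plain string to the content-array form used by vision-capable chat completion endpoints; text-only requests keep the simple string for compatibility. The resulting payload looks roughly like this (values illustrative):

    messages = [
        {"role": "system", "content": "You are a code reviewer."},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What does this screenshot show?"},
                {"type": "image_url", "image_url": {"url": "data:image/png;base64,<payload>"}},
            ],
        },
    ]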

View File

@@ -23,6 +23,8 @@ class OpenRouterModelConfig:
supports_streaming: bool = True
supports_function_calling: bool = False
supports_json_mode: bool = False
supports_images: bool = False # Whether model can process images
max_image_size_mb: float = 0.0 # Maximum total size for all images in MB
is_custom: bool = False # True for models that should only be used with custom endpoints
description: str = ""
@@ -37,6 +39,8 @@ class OpenRouterModelConfig:
supports_system_prompts=self.supports_system_prompts,
supports_streaming=self.supports_streaming,
supports_function_calling=self.supports_function_calling,
supports_images=self.supports_images,
max_image_size_mb=self.max_image_size_mb,
temperature_constraint=RangeTemperatureConstraint(0.0, 2.0, 1.0),
)
@@ -66,7 +70,8 @@ class OpenRouterModelRegistry:
translated_path = translate_path_for_environment(env_path)
self.config_path = Path(translated_path)
else:
# Default to conf/custom_models.json (already in container)
# Default to conf/custom_models.json - use relative path from this file
# This works both in development and container environments
self.config_path = Path(__file__).parent.parent / "conf" / "custom_models.json"
# Load configuration
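With the new fields, a vision-capable registry entry carries its image limits through to ModelCapabilities. A minimal sketch (the model_name field and the to_capabilities() method name are assumptions; only supports_images and max_image_size_mb are introduced by this hunk):

    config = OpenRouterModelConfig(
        model_name="openai/gpt-4o",    # illustrative model id; field name assumed
        supports_images=True,
        max_image_size_mb=20.0,
    )
    caps = config.to_capabilities()    # method name assumed
    assert caps.supports_images is True
    assert caps.max_image_size_mb == 20.0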