Vision support: images, PDFs, and similar attachments can now be passed on to other models for analysis or as additional context.

Image processing pipeline added
OpenAI GPT-4.1 support
Chat tool prompt enhancement
Lint and code quality improvements
Fahad
2025-06-16 13:14:53 +04:00
parent d498e9854b
commit 97fa6781cf
26 changed files with 1328 additions and 52 deletions


@@ -27,6 +27,7 @@ if TYPE_CHECKING:
from config import MCP_PROMPT_SIZE_LIMIT
from providers import ModelProvider, ModelProviderRegistry
from providers.base import ProviderType
from utils import check_token_limit
from utils.conversation_memory import (
MAX_CONVERSATION_TURNS,
@@ -84,6 +85,17 @@ class ToolRequest(BaseModel):
"additional findings, or answers to follow-up questions. Can be used across different tools."
),
)
images: Optional[list[str]] = Field(
None,
description=(
"Optional image(s) for visual context. Accepts absolute file paths or "
"base64 data URLs. Only provide when user explicitly mentions images. "
"When including images, please describe what you believe each image contains "
"(e.g., 'screenshot of error dialog', 'architecture diagram', 'code snippet') "
"to aid with contextual understanding. Useful for UI discussions, diagrams, "
"visual problems, error screens, architecture mockups, and visual analysis tasks."
),
)
class BaseTool(ABC):
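
For illustration, a caller could populate the new field like this (a minimal sketch; the prompt field and the exact construction are assumptions, not part of this diff):

    request = ToolRequest(
        prompt="Why does this error dialog appear?",  # hypothetical field, for illustration
        images=[
            "/abs/path/to/screenshot.png",  # absolute file path
            "data:image/png;base64,iVBORw0KGgoAAAANS...",  # base64 data URL (truncated)
        ],
    )
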
@@ -981,6 +993,139 @@ When recommending searches, be specific about what information you need and why
}
return None
def _validate_image_limits(
self, images: Optional[list[str]], model_name: str, continuation_id: Optional[str] = None
) -> Optional[dict]:
"""
Validate image size against model capabilities at MCP boundary.
This performs strict validation to ensure we don't exceed model-specific
image size limits. Uses capability-based validation with actual model
configuration rather than hard-coded limits.
        Args:
            images: List of image paths/data URLs to validate
            model_name: Name of the model to check limits against
            continuation_id: Optional conversation thread ID; accepted for
                interface consistency but not used during validation
Returns:
Optional[dict]: Error response if validation fails, None if valid
"""
if not images:
return None
# Get model capabilities to check image support and size limits
try:
provider = self.get_model_provider(model_name)
capabilities = provider.get_capabilities(model_name)
except Exception as e:
logger.warning(f"Failed to get capabilities for model {model_name}: {e}")
# Fall back to checking custom models configuration
capabilities = None
# Check if model supports images at all
supports_images = False
max_size_mb = 0.0
if capabilities:
supports_images = capabilities.supports_images
max_size_mb = capabilities.max_image_size_mb
else:
# Fall back to custom models configuration
try:
import json
from pathlib import Path
custom_models_path = Path(__file__).parent.parent / "conf" / "custom_models.json"
if custom_models_path.exists():
with open(custom_models_path) as f:
custom_config = json.load(f)
# Check if model is in custom models list
for model_config in custom_config.get("models", []):
if model_config.get("model_name") == model_name or model_name in model_config.get(
"aliases", []
):
supports_images = model_config.get("supports_images", False)
max_size_mb = model_config.get("max_image_size_mb", 0.0)
break
except Exception as e:
logger.warning(f"Failed to load custom models config: {e}")
# If model doesn't support images, reject
if not supports_images:
return {
"status": "error",
"content": (
f"Image support not available: Model '{model_name}' does not support image processing. "
f"Please use a vision-capable model such as 'gemini-2.5-flash-preview-05-20', 'o3', "
f"or 'claude-3-opus' for image analysis tasks."
),
"content_type": "text",
"metadata": {
"error_type": "validation_error",
"model_name": model_name,
"supports_images": False,
"image_count": len(images),
},
}
# Calculate total size of all images
total_size_mb = 0.0
for image_path in images:
try:
if image_path.startswith("data:image/"):
# Handle data URL: data:image/png;base64,iVBORw0...
_, data = image_path.split(",", 1)
# Base64 encoding increases size by ~33%, so decode to get actual size
import base64
actual_size = len(base64.b64decode(data))
total_size_mb += actual_size / (1024 * 1024)
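                    # Note (descriptive): the decoded size could also be estimated
                    # without decoding as len(data) * 3 // 4 (minus '=' padding),
                    # since base64 maps 3 raw bytes to 4 encoded characters;
                    # decoding is used here for an exact figure.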
else:
# Handle file path
if os.path.exists(image_path):
file_size = os.path.getsize(image_path)
total_size_mb += file_size / (1024 * 1024)
else:
logger.warning(f"Image file not found: {image_path}")
# Assume a reasonable size for missing files to avoid breaking validation
total_size_mb += 1.0 # 1MB assumption
except Exception as e:
logger.warning(f"Failed to get size for image {image_path}: {e}")
# Assume a reasonable size for problematic files
total_size_mb += 1.0 # 1MB assumption
        # Cap custom-provider models at 40MB, regardless of their configured limit
effective_limit_mb = max_size_mb
if hasattr(capabilities, "provider") and capabilities.provider == ProviderType.CUSTOM:
effective_limit_mb = min(max_size_mb, 40.0)
elif not capabilities: # Fallback case for custom models
effective_limit_mb = min(max_size_mb, 40.0)
# Validate against size limit
if total_size_mb > effective_limit_mb:
return {
"status": "error",
"content": (
f"Image size limit exceeded: Model '{model_name}' supports maximum {effective_limit_mb:.1f}MB "
f"for all images combined, but {total_size_mb:.1f}MB was provided. "
f"Please reduce image sizes or count and try again."
),
"content_type": "text",
"metadata": {
"error_type": "validation_error",
"model_name": model_name,
"total_size_mb": round(total_size_mb, 2),
"limit_mb": round(effective_limit_mb, 2),
"image_count": len(images),
"supports_images": supports_images,
},
}
# All validations passed
logger.debug(f"Image validation passed: {len(images)} images")
return None
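
    # Worked example (illustrative, as comments): three 2.5MB screenshots total
    # 7.5MB, which passes any model whose max_image_size_mb is at least 7.5;
    # for a CUSTOM-provider model the effective limit is further capped to
    # min(max_image_size_mb, 40.0) by the logic above.
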
def estimate_tokens_smart(self, file_path: str) -> int:
"""
Estimate tokens for a file using file-type aware ratios.
@@ -1131,6 +1276,9 @@ When recommending searches, be specific about what information you need and why
)
return [TextContent(type="text", text=error_output.model_dump_json())]
# Extract and validate images from request
images = getattr(request, "images", None) or []
# Check if we have continuation_id - if so, conversation history is already embedded
continuation_id = getattr(request, "continuation_id", None)
@@ -1215,6 +1363,12 @@ When recommending searches, be specific about what information you need and why
# Only set this after auto mode validation to prevent "auto" being used as a model name
self._current_model_name = model_name
# Validate images at MCP boundary if any were provided
if images:
image_validation_error = self._validate_image_limits(images, model_name, continuation_id)
if image_validation_error:
return [TextContent(type="text", text=json.dumps(image_validation_error))]
temperature = getattr(request, "temperature", None)
if temperature is None:
temperature = self.get_default_temperature()
@@ -1247,6 +1401,7 @@ When recommending searches, be specific about what information you need and why
system_prompt=system_prompt,
temperature=temperature,
thinking_mode=thinking_mode if provider.supports_thinking_mode(model_name) else None,
images=images if images else None, # Pass images via kwargs
)
logger.info(f"Received response from {provider.get_provider_type().value} API for {self.name}")
@@ -1298,6 +1453,7 @@ When recommending searches, be specific about what information you need and why
system_prompt=system_prompt,
temperature=temperature,
thinking_mode=thinking_mode if provider.supports_thinking_mode(model_name) else None,
images=images if images else None, # Pass images via kwargs in retry too
)
if retry_response.content:
@@ -1398,6 +1554,7 @@ When recommending searches, be specific about what information you need and why
continuation_id = getattr(request, "continuation_id", None)
if continuation_id:
request_files = getattr(request, "files", []) or []
request_images = getattr(request, "images", []) or []
# Extract model metadata for conversation tracking
model_provider = None
model_name = None
@@ -1417,6 +1574,7 @@ When recommending searches, be specific about what information you need and why
"assistant",
formatted_content,
files=request_files,
images=request_images,
tool_name=self.name,
model_provider=model_provider,
model_name=model_name,
@@ -1519,6 +1677,7 @@ When recommending searches, be specific about what information you need and why
# Use actually processed files from file preparation instead of original request files
# This ensures directories are tracked as their individual expanded files
request_files = getattr(self, "_actually_processed_files", []) or getattr(request, "files", []) or []
request_images = getattr(request, "images", []) or []
# Extract model metadata
model_provider = None
model_name = None
@@ -1538,6 +1697,7 @@ When recommending searches, be specific about what information you need and why
"assistant",
content,
files=request_files,
images=request_images,
tool_name=self.name,
model_provider=model_provider,
model_name=model_name,
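
For callers that hold image bytes on disk but want to send a data URL instead of a file path, the accepted format can be built with a small helper (a minimal sketch assuming PNG input; this helper is not part of the commit):

    import base64

    def to_data_url(path: str) -> str:
        # Encode a PNG file into the data:image/png;base64,... form
        # accepted by the images field above.
        with open(path, "rb") as f:
            encoded = base64.b64encode(f.read()).decode("ascii")
        return f"data:image/png;base64,{encoded}"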