Vision support via images/PDFs, etc., that can be passed on to other models as part of analysis, additional context, etc.

Image processing pipeline added
OpenAI GPT-4.1 support
Chat tool prompt enhancement
Lint and code quality improvements
This commit is contained in:
Fahad
2025-06-16 13:14:53 +04:00
parent d498e9854b
commit 97fa6781cf
26 changed files with 1328 additions and 52 deletions

View File

@@ -87,7 +87,13 @@ class AnalyzeTool(BaseTool):
},
"use_websearch": {
"type": "boolean",
"description": "Enable web search for documentation, best practices, and current information. Particularly useful for: brainstorming sessions, architectural design discussions, exploring industry best practices, working with specific frameworks/technologies, researching solutions to complex problems, or when current documentation and community insights would enhance the analysis.",
"description": (
"Enable web search for documentation, best practices, and current information. "
"Particularly useful for: brainstorming sessions, architectural design discussions, "
"exploring industry best practices, working with specific frameworks/technologies, "
"researching solutions to complex problems, or when current documentation and "
"community insights would enhance the analysis."
),
"default": True,
},
"continuation_id": {

View File

@@ -27,6 +27,7 @@ if TYPE_CHECKING:
from config import MCP_PROMPT_SIZE_LIMIT
from providers import ModelProvider, ModelProviderRegistry
from providers.base import ProviderType
from utils import check_token_limit
from utils.conversation_memory import (
MAX_CONVERSATION_TURNS,
@@ -84,6 +85,17 @@ class ToolRequest(BaseModel):
"additional findings, or answers to follow-up questions. Can be used across different tools."
),
)
images: Optional[list[str]] = Field(
None,
description=(
"Optional image(s) for visual context. Accepts absolute file paths or "
"base64 data URLs. Only provide when user explicitly mentions images. "
"When including images, please describe what you believe each image contains "
"(e.g., 'screenshot of error dialog', 'architecture diagram', 'code snippet') "
"to aid with contextual understanding. Useful for UI discussions, diagrams, "
"visual problems, error screens, architecture mockups, and visual analysis tasks."
),
)
class BaseTool(ABC):
@@ -981,6 +993,139 @@ When recommending searches, be specific about what information you need and why
}
return None
def _validate_image_limits(
    self,
    images: Optional[list[str]],
    model_name: str,
    continuation_id: Optional[str] = None,
) -> Optional[dict]:
    """
    Validate total image payload size against model capabilities at the MCP boundary.

    Performs strict, capability-based validation so requests never exceed a
    model-specific image size limit. When the provider registry cannot supply
    capabilities for the model, falls back to conf/custom_models.json; custom
    models are additionally capped at 40MB regardless of their configured limit.

    Args:
        images: Image file paths and/or base64 data URLs to validate
        model_name: Name of the model to check limits against
        continuation_id: Thread continuation ID; currently unused here but
            accepted for interface symmetry with other boundary checks

    Returns:
        Optional[dict]: Standardized error response dict if validation fails,
        None if all images are acceptable (or none were provided)
    """
    if not images:
        return None

    # Local imports keep these off the hot path for tools that never
    # receive images.
    import base64
    import json
    from pathlib import Path

    # Resolve capabilities from the provider; fall back to the custom models
    # configuration when the provider lookup fails.
    try:
        provider = self.get_model_provider(model_name)
        capabilities = provider.get_capabilities(model_name)
    except Exception as e:
        logger.warning(f"Failed to get capabilities for model {model_name}: {e}")
        capabilities = None

    supports_images = False
    max_size_mb = 0.0

    if capabilities:
        supports_images = capabilities.supports_images
        max_size_mb = capabilities.max_image_size_mb
    else:
        # Fall back to the bundled custom models configuration.
        try:
            custom_models_path = Path(__file__).parent.parent / "conf" / "custom_models.json"
            if custom_models_path.exists():
                with open(custom_models_path) as f:
                    custom_config = json.load(f)

                # Match on the canonical model name or any declared alias.
                for model_config in custom_config.get("models", []):
                    if model_config.get("model_name") == model_name or model_name in model_config.get(
                        "aliases", []
                    ):
                        supports_images = model_config.get("supports_images", False)
                        max_size_mb = model_config.get("max_image_size_mb", 0.0)
                        break
        except Exception as e:
            logger.warning(f"Failed to load custom models config: {e}")

    # Reject outright when the model is not vision-capable.
    if not supports_images:
        return {
            "status": "error",
            "content": (
                f"Image support not available: Model '{model_name}' does not support image processing. "
                f"Please use a vision-capable model such as 'gemini-2.5-flash-preview-05-20', 'o3', "
                f"or 'claude-3-opus' for image analysis tasks."
            ),
            "content_type": "text",
            "metadata": {
                "error_type": "validation_error",
                "model_name": model_name,
                "supports_images": False,
                "image_count": len(images),
            },
        }

    # Sum the decoded size of every image: data URLs are base64-decoded to
    # measure the true payload; file paths use on-disk size.
    total_size_mb = 0.0
    for image_path in images:
        try:
            if image_path.startswith("data:image/"):
                # Data URL format: data:image/png;base64,iVBORw0...
                _, data = image_path.split(",", 1)
                # Base64 encoding inflates size by ~33%, so decode to get
                # the actual byte count.
                actual_size = len(base64.b64decode(data))
                total_size_mb += actual_size / (1024 * 1024)
            else:
                # Handle file path
                if os.path.exists(image_path):
                    file_size = os.path.getsize(image_path)
                    total_size_mb += file_size / (1024 * 1024)
                else:
                    logger.warning(f"Image file not found: {image_path}")
                    # Assume a reasonable size for missing files to avoid breaking validation
                    total_size_mb += 1.0  # 1MB assumption
        except Exception as e:
            logger.warning(f"Failed to get size for image {image_path}: {e}")
            # Assume a reasonable size for problematic files
            total_size_mb += 1.0  # 1MB assumption

    # Apply 40MB cap for custom models as requested; the fallback branch
    # (no capabilities object) is treated as a custom model too.
    effective_limit_mb = max_size_mb
    if hasattr(capabilities, "provider") and capabilities.provider == ProviderType.CUSTOM:
        effective_limit_mb = min(max_size_mb, 40.0)
    elif not capabilities:  # Fallback case for custom models
        effective_limit_mb = min(max_size_mb, 40.0)

    # Validate against the combined size limit.
    if total_size_mb > effective_limit_mb:
        return {
            "status": "error",
            "content": (
                f"Image size limit exceeded: Model '{model_name}' supports maximum {effective_limit_mb:.1f}MB "
                f"for all images combined, but {total_size_mb:.1f}MB was provided. "
                f"Please reduce image sizes or count and try again."
            ),
            "content_type": "text",
            "metadata": {
                "error_type": "validation_error",
                "model_name": model_name,
                "total_size_mb": round(total_size_mb, 2),
                "limit_mb": round(effective_limit_mb, 2),
                "image_count": len(images),
                "supports_images": supports_images,
            },
        }

    # All validations passed
    logger.debug(f"Image validation passed: {len(images)} images")
    return None
def estimate_tokens_smart(self, file_path: str) -> int:
"""
Estimate tokens for a file using file-type aware ratios.
@@ -1131,6 +1276,9 @@ When recommending searches, be specific about what information you need and why
)
return [TextContent(type="text", text=error_output.model_dump_json())]
# Extract and validate images from request
images = getattr(request, "images", None) or []
# Check if we have continuation_id - if so, conversation history is already embedded
continuation_id = getattr(request, "continuation_id", None)
@@ -1215,6 +1363,12 @@ When recommending searches, be specific about what information you need and why
# Only set this after auto mode validation to prevent "auto" being used as a model name
self._current_model_name = model_name
# Validate images at MCP boundary if any were provided
if images:
image_validation_error = self._validate_image_limits(images, model_name, continuation_id)
if image_validation_error:
return [TextContent(type="text", text=json.dumps(image_validation_error))]
temperature = getattr(request, "temperature", None)
if temperature is None:
temperature = self.get_default_temperature()
@@ -1247,6 +1401,7 @@ When recommending searches, be specific about what information you need and why
system_prompt=system_prompt,
temperature=temperature,
thinking_mode=thinking_mode if provider.supports_thinking_mode(model_name) else None,
images=images if images else None, # Pass images via kwargs
)
logger.info(f"Received response from {provider.get_provider_type().value} API for {self.name}")
@@ -1298,6 +1453,7 @@ When recommending searches, be specific about what information you need and why
system_prompt=system_prompt,
temperature=temperature,
thinking_mode=thinking_mode if provider.supports_thinking_mode(model_name) else None,
images=images if images else None, # Pass images via kwargs in retry too
)
if retry_response.content:
@@ -1398,6 +1554,7 @@ When recommending searches, be specific about what information you need and why
continuation_id = getattr(request, "continuation_id", None)
if continuation_id:
request_files = getattr(request, "files", []) or []
request_images = getattr(request, "images", []) or []
# Extract model metadata for conversation tracking
model_provider = None
model_name = None
@@ -1417,6 +1574,7 @@ When recommending searches, be specific about what information you need and why
"assistant",
formatted_content,
files=request_files,
images=request_images,
tool_name=self.name,
model_provider=model_provider,
model_name=model_name,
@@ -1519,6 +1677,7 @@ When recommending searches, be specific about what information you need and why
# Use actually processed files from file preparation instead of original request files
# This ensures directories are tracked as their individual expanded files
request_files = getattr(self, "_actually_processed_files", []) or getattr(request, "files", []) or []
request_images = getattr(request, "images", []) or []
# Extract model metadata
model_provider = None
model_name = None
@@ -1538,6 +1697,7 @@ When recommending searches, be specific about what information you need and why
"assistant",
content,
files=request_files,
images=request_images,
tool_name=self.name,
model_provider=model_provider,
model_name=model_name,

View File

@@ -20,12 +20,25 @@ class ChatRequest(ToolRequest):
prompt: str = Field(
...,
description="Your question, topic, or current thinking to discuss",
description=(
"Your thorough, expressive question with as much context as possible. Remember: you're talking to "
"another Claude assistant who has deep expertise and can provide nuanced insights. Include your "
"current thinking, specific challenges, background context, what you've already tried, and what "
"kind of response would be most helpful. The more context and detail you provide, the more "
"valuable and targeted the response will be."
),
)
files: Optional[list[str]] = Field(
default_factory=list,
description="Optional files for context (must be absolute paths)",
)
images: Optional[list[str]] = Field(
default_factory=list,
description=(
"Optional images for visual context. Useful for UI discussions, diagrams, visual problems, "
"error screens, or architectural mockups."
),
)
class ChatTool(BaseTool):
@@ -42,7 +55,8 @@ class ChatTool(BaseTool):
"Also great for: explanations, comparisons, general development questions. "
"Use this when you want to ask questions, brainstorm ideas, get opinions, discuss topics, "
"share your thinking, or need explanations about concepts and approaches. "
"Note: If you're not currently using a top-tier model such as Opus 4 or above, these tools can provide enhanced capabilities."
"Note: If you're not currently using a top-tier model such as Opus 4 or above, these tools can "
"provide enhanced capabilities."
)
def get_input_schema(self) -> dict[str, Any]:
@@ -51,13 +65,27 @@ class ChatTool(BaseTool):
"properties": {
"prompt": {
"type": "string",
"description": "Your question, topic, or current thinking to discuss",
"description": (
"Your thorough, expressive question with as much context as possible. Remember: you're "
"talking to another Claude assistant who has deep expertise and can provide nuanced "
"insights. Include your current thinking, specific challenges, background context, what "
"you've already tried, and what kind of response would be most helpful. The more context "
"and detail you provide, the more valuable and targeted the response will be."
),
},
"files": {
"type": "array",
"items": {"type": "string"},
"description": "Optional files for context (must be absolute paths)",
},
"images": {
"type": "array",
"items": {"type": "string"},
"description": (
"Optional images for visual context. Useful for UI discussions, diagrams, visual "
"problems, error screens, or architectural mockups."
),
},
"model": self.get_model_field_schema(),
"temperature": {
"type": "number",
@@ -68,16 +96,29 @@ class ChatTool(BaseTool):
"thinking_mode": {
"type": "string",
"enum": ["minimal", "low", "medium", "high", "max"],
"description": "Thinking depth: minimal (0.5% of model max), low (8%), medium (33%), high (67%), max (100% of model max)",
"description": (
"Thinking depth: minimal (0.5% of model max), low (8%), medium (33%), high (67%), "
"max (100% of model max)"
),
},
"use_websearch": {
"type": "boolean",
"description": "Enable web search for documentation, best practices, and current information. Particularly useful for: brainstorming sessions, architectural design discussions, exploring industry best practices, working with specific frameworks/technologies, researching solutions to complex problems, or when current documentation and community insights would enhance the analysis.",
"description": (
"Enable web search for documentation, best practices, and current information. "
"Particularly useful for: brainstorming sessions, architectural design discussions, "
"exploring industry best practices, working with specific frameworks/technologies, "
"researching solutions to complex problems, or when current documentation and "
"community insights would enhance the analysis."
),
"default": True,
},
"continuation_id": {
"type": "string",
"description": "Thread continuation ID for multi-turn conversations. Can be used to continue conversations across different tools. Only provide this if continuing a previous conversation thread.",
"description": (
"Thread continuation ID for multi-turn conversations. Can be used to continue "
"conversations across different tools. Only provide this if continuing a previous "
"conversation thread."
),
},
},
"required": ["prompt"] + (["model"] if self.is_effective_auto_mode() else []),
@@ -157,4 +198,7 @@ Please provide a thoughtful, comprehensive response:"""
def format_response(self, response: str, request: ChatRequest, model_info: Optional[dict] = None) -> str:
    """Format the chat response.

    Appends a standard "Claude's Turn" footer instructing the calling
    assistant to integrate this perspective into its own analysis.

    Args:
        response: Raw model response text
        request: The originating chat request (unused; kept for interface parity)
        model_info: Optional provider/model metadata (unused)

    Returns:
        The response text with the handoff footer appended.
    """
    # Exactly one return: the duplicated single-line form was unreachable
    # dead code with identical text.
    return (
        f"{response}\n\n---\n\n**Claude's Turn:** Evaluate this perspective alongside your analysis to "
        "form a comprehensive solution and continue with the user's request and task at hand."
    )

View File

@@ -41,6 +41,10 @@ class CodeReviewRequest(ToolRequest):
...,
description="User's summary of what the code does, expected behavior, constraints, and review objectives",
)
images: Optional[list[str]] = Field(
None,
description="Optional images of architecture diagrams, UI mockups, design documents, or visual references for code review context",
)
review_type: str = Field("full", description="Type of review: full|security|performance|quick")
focus_on: Optional[str] = Field(
None,
@@ -94,6 +98,11 @@ class CodeReviewTool(BaseTool):
"type": "string",
"description": "User's summary of what the code does, expected behavior, constraints, and review objectives",
},
"images": {
"type": "array",
"items": {"type": "string"},
"description": "Optional images of architecture diagrams, UI mockups, design documents, or visual references for code review context",
},
"review_type": {
"type": "string",
"enum": ["full", "security", "performance", "quick"],

View File

@@ -24,6 +24,10 @@ class DebugIssueRequest(ToolRequest):
None,
description="Files or directories that might be related to the issue (must be absolute paths)",
)
images: Optional[list[str]] = Field(
None,
description="Optional images showing error screens, UI issues, logs displays, or visual debugging information",
)
runtime_info: Optional[str] = Field(None, description="Environment, versions, or runtime information")
previous_attempts: Optional[str] = Field(None, description="What has been tried already")
@@ -69,6 +73,11 @@ class DebugIssueTool(BaseTool):
"items": {"type": "string"},
"description": "Files or directories that might be related to the issue (must be absolute paths)",
},
"images": {
"type": "array",
"items": {"type": "string"},
"description": "Optional images showing error screens, UI issues, logs displays, or visual debugging information",
},
"runtime_info": {
"type": "string",
"description": "Environment, versions, or runtime information",

View File

@@ -78,6 +78,10 @@ class PrecommitRequest(ToolRequest):
None,
description="Optional files or directories to provide as context (must be absolute paths). These files are not part of the changes but provide helpful context like configs, docs, or related code.",
)
images: Optional[list[str]] = Field(
None,
description="Optional images showing expected UI changes, design requirements, or visual references for the changes being validated",
)
class Precommit(BaseTool):
@@ -170,6 +174,11 @@ class Precommit(BaseTool):
"items": {"type": "string"},
"description": "Optional files or directories to provide as context (must be absolute paths). These files are not part of the changes but provide helpful context like configs, docs, or related code.",
},
"images": {
"type": "array",
"items": {"type": "string"},
"description": "Optional images showing expected UI changes, design requirements, or visual references for the changes being validated",
},
"use_websearch": {
"type": "boolean",
"description": "Enable web search for documentation, best practices, and current information. Particularly useful for: brainstorming sessions, architectural design discussions, exploring industry best practices, working with specific frameworks/technologies, researching solutions to complex problems, or when current documentation and community insights would enhance the analysis.",

View File

@@ -33,6 +33,10 @@ class ThinkDeepRequest(ToolRequest):
None,
description="Optional file paths or directories for additional context (must be absolute paths)",
)
images: Optional[list[str]] = Field(
None,
description="Optional images for visual analysis - diagrams, charts, system architectures, or any visual information to analyze",
)
class ThinkDeepTool(BaseTool):
@@ -60,7 +64,13 @@ class ThinkDeepTool(BaseTool):
"properties": {
"prompt": {
"type": "string",
"description": "Your current thinking/analysis to extend and validate. IMPORTANT: Before using this tool, Claude MUST first think deeply and establish a deep understanding of the topic and question by thinking through all relevant details, context, constraints, and implications. Share these extended thoughts and ideas in the prompt so the model has comprehensive information to work with for the best analysis.",
"description": (
"Your current thinking/analysis to extend and validate. IMPORTANT: Before using this tool, "
"Claude MUST first think deeply and establish a deep understanding of the topic and question "
"by thinking through all relevant details, context, constraints, and implications. Share "
"these extended thoughts and ideas in the prompt so the model has comprehensive information "
"to work with for the best analysis."
),
},
"model": self.get_model_field_schema(),
"problem_context": {
@@ -77,6 +87,11 @@ class ThinkDeepTool(BaseTool):
"items": {"type": "string"},
"description": "Optional file paths or directories for additional context (must be absolute paths)",
},
"images": {
"type": "array",
"items": {"type": "string"},
"description": "Optional images for visual analysis - diagrams, charts, system architectures, or any visual information to analyze",
},
"temperature": {
"type": "number",
"description": "Temperature for creative thinking (0-1, default 0.7)",

View File

@@ -22,11 +22,29 @@ class TracerRequest(ToolRequest):
prompt: str = Field(
...,
description="Detailed description of what to trace and WHY you need this analysis. Include context about what you're trying to understand, debug, or analyze. For precision mode: describe the specific method/function and what aspect of its execution flow you need to understand. For dependencies mode: describe the class/module and what relationships you need to map. Example: 'I need to understand how BookingManager.finalizeInvoice method is called throughout the system and what side effects it has, as I'm debugging payment processing issues' rather than just 'BookingManager finalizeInvoice method'",
description=(
"Detailed description of what to trace and WHY you need this analysis. Include context about what "
"you're trying to understand, debug, or analyze. For precision mode: describe the specific "
"method/function and what aspect of its execution flow you need to understand. For dependencies "
"mode: describe the class/module and what relationships you need to map. Example: 'I need to "
"understand how BookingManager.finalizeInvoice method is called throughout the system and what "
"side effects it has, as I'm debugging payment processing issues' rather than just "
"'BookingManager finalizeInvoice method'"
),
)
trace_mode: Literal["precision", "dependencies"] = Field(
...,
description="Trace mode: 'precision' (for methods/functions - shows execution flow and usage patterns) or 'dependencies' (for classes/modules/protocols - shows structural relationships)",
description=(
"Trace mode: 'precision' (for methods/functions - shows execution flow and usage patterns) or "
"'dependencies' (for classes/modules/protocols - shows structural relationships)"
),
)
images: list[str] = Field(
default_factory=list,
description=(
"Optional images of system architecture diagrams, flow charts, or visual references to help "
"understand the tracing context"
),
)
@@ -44,11 +62,15 @@ class TracerTool(BaseTool):
def get_description(self) -> str:
return (
"ANALYSIS PROMPT GENERATOR - Creates structured prompts for static code analysis. "
"Helps generate detailed analysis requests with specific method/function names, file paths, and component context. "
"Type 'precision': For methods/functions - traces execution flow, call chains, call stacks, and shows when/how they are used. "
"Type 'dependencies': For classes/modules/protocols - maps structural relationships and bidirectional dependencies. "
"Helps generate detailed analysis requests with specific method/function names, file paths, and "
"component context. "
"Type 'precision': For methods/functions - traces execution flow, call chains, call stacks, and "
"shows when/how they are used. "
"Type 'dependencies': For classes/modules/protocols - maps structural relationships and "
"bidirectional dependencies. "
"Returns detailed instructions on how to perform the analysis and format the results. "
"Use this to create focused analysis requests that can be fed back to Claude with the appropriate code files. "
"Use this to create focused analysis requests that can be fed back to Claude with the appropriate "
"code files. "
)
def get_input_schema(self) -> dict[str, Any]:
@@ -57,13 +79,26 @@ class TracerTool(BaseTool):
"properties": {
"prompt": {
"type": "string",
"description": "Detailed description of what to trace and WHY you need this analysis. Include context about what you're trying to understand, debug, or analyze. For precision mode: describe the specific method/function and what aspect of its execution flow you need to understand. For dependencies mode: describe the class/module and what relationships you need to map. Example: 'I need to understand how BookingManager.finalizeInvoice method is called throughout the system and what side effects it has, as I'm debugging payment processing issues' rather than just 'BookingManager finalizeInvoice method'",
"description": (
"Detailed description of what to trace and WHY you need this analysis. Include context "
"about what you're trying to understand, debug, or analyze. For precision mode: describe "
"the specific method/function and what aspect of its execution flow you need to understand. "
"For dependencies mode: describe the class/module and what relationships you need to map. "
"Example: 'I need to understand how BookingManager.finalizeInvoice method is called "
"throughout the system and what side effects it has, as I'm debugging payment processing "
"issues' rather than just 'BookingManager finalizeInvoice method'"
),
},
"trace_mode": {
"type": "string",
"enum": ["precision", "dependencies"],
"description": "Trace mode: 'precision' (for methods/functions - shows execution flow and usage patterns) or 'dependencies' (for classes/modules/protocols - shows structural relationships)",
},
"images": {
"type": "array",
"items": {"type": "string"},
"description": "Optional images of system architecture diagrams, flow charts, or visual references to help understand the tracing context",
},
},
"required": ["prompt", "trace_mode"],
}