Vision support via images/PDFs, etc., that can be passed on to other models as part of analysis, additional context, etc.

Image processing pipeline added
OpenAI GPT-4.1 support
Chat tool prompt enhancement
Lint and code quality improvements
This commit is contained in:
Fahad
2025-06-16 13:14:53 +04:00
parent d498e9854b
commit 97fa6781cf
26 changed files with 1328 additions and 52 deletions

View File

@@ -87,7 +87,13 @@ class AnalyzeTool(BaseTool):
},
"use_websearch": {
"type": "boolean",
"description": "Enable web search for documentation, best practices, and current information. Particularly useful for: brainstorming sessions, architectural design discussions, exploring industry best practices, working with specific frameworks/technologies, researching solutions to complex problems, or when current documentation and community insights would enhance the analysis.",
"description": (
"Enable web search for documentation, best practices, and current information. "
"Particularly useful for: brainstorming sessions, architectural design discussions, "
"exploring industry best practices, working with specific frameworks/technologies, "
"researching solutions to complex problems, or when current documentation and "
"community insights would enhance the analysis."
),
"default": True,
},
"continuation_id": {

View File

@@ -27,6 +27,7 @@ if TYPE_CHECKING:
from config import MCP_PROMPT_SIZE_LIMIT
from providers import ModelProvider, ModelProviderRegistry
from providers.base import ProviderType
from utils import check_token_limit
from utils.conversation_memory import (
MAX_CONVERSATION_TURNS,
@@ -84,6 +85,17 @@ class ToolRequest(BaseModel):
"additional findings, or answers to follow-up questions. Can be used across different tools."
),
)
images: Optional[list[str]] = Field(
None,
description=(
"Optional image(s) for visual context. Accepts absolute file paths or "
"base64 data URLs. Only provide when user explicitly mentions images. "
"When including images, please describe what you believe each image contains "
"(e.g., 'screenshot of error dialog', 'architecture diagram', 'code snippet') "
"to aid with contextual understanding. Useful for UI discussions, diagrams, "
"visual problems, error screens, architecture mockups, and visual analysis tasks."
),
)
class BaseTool(ABC):
@@ -981,6 +993,139 @@ When recommending searches, be specific about what information you need and why
}
return None
def _validate_image_limits(
    self,
    images: Optional[list[str]],
    model_name: str,
    continuation_id: Optional[str] = None,
) -> Optional[dict]:
    """
    Validate total image payload size against model capabilities at the MCP boundary.

    Performs strict, capability-based validation so requests never exceed a
    model-specific image size limit. When the provider registry cannot supply
    capabilities for the model, falls back to conf/custom_models.json; custom
    models are additionally capped at 40MB regardless of their configured limit.

    Args:
        images: Image file paths and/or base64 data URLs to validate
        model_name: Name of the model to check limits against
        continuation_id: Thread continuation ID; currently unused here but
            accepted for interface symmetry with other boundary checks

    Returns:
        Optional[dict]: Standardized error response dict if validation fails,
        None if all images are acceptable (or none were provided)
    """
    if not images:
        return None

    # Local imports keep these off the hot path for tools that never
    # receive images.
    import base64
    import json
    from pathlib import Path

    # Resolve capabilities from the provider; fall back to the custom models
    # configuration when the provider lookup fails.
    try:
        provider = self.get_model_provider(model_name)
        capabilities = provider.get_capabilities(model_name)
    except Exception as e:
        logger.warning(f"Failed to get capabilities for model {model_name}: {e}")
        capabilities = None

    supports_images = False
    max_size_mb = 0.0

    if capabilities:
        supports_images = capabilities.supports_images
        max_size_mb = capabilities.max_image_size_mb
    else:
        # Fall back to the bundled custom models configuration.
        try:
            custom_models_path = Path(__file__).parent.parent / "conf" / "custom_models.json"
            if custom_models_path.exists():
                with open(custom_models_path) as f:
                    custom_config = json.load(f)

                # Match on the canonical model name or any declared alias.
                for model_config in custom_config.get("models", []):
                    if model_config.get("model_name") == model_name or model_name in model_config.get(
                        "aliases", []
                    ):
                        supports_images = model_config.get("supports_images", False)
                        max_size_mb = model_config.get("max_image_size_mb", 0.0)
                        break
        except Exception as e:
            logger.warning(f"Failed to load custom models config: {e}")

    # Reject outright when the model is not vision-capable.
    if not supports_images:
        return {
            "status": "error",
            "content": (
                f"Image support not available: Model '{model_name}' does not support image processing. "
                f"Please use a vision-capable model such as 'gemini-2.5-flash-preview-05-20', 'o3', "
                f"or 'claude-3-opus' for image analysis tasks."
            ),
            "content_type": "text",
            "metadata": {
                "error_type": "validation_error",
                "model_name": model_name,
                "supports_images": False,
                "image_count": len(images),
            },
        }

    # Sum the decoded size of every image: data URLs are base64-decoded to
    # measure the true payload; file paths use on-disk size.
    total_size_mb = 0.0
    for image_path in images:
        try:
            if image_path.startswith("data:image/"):
                # Data URL format: data:image/png;base64,iVBORw0...
                _, data = image_path.split(",", 1)
                # Base64 encoding inflates size by ~33%, so decode to get
                # the actual byte count.
                actual_size = len(base64.b64decode(data))
                total_size_mb += actual_size / (1024 * 1024)
            else:
                # Handle file path
                if os.path.exists(image_path):
                    file_size = os.path.getsize(image_path)
                    total_size_mb += file_size / (1024 * 1024)
                else:
                    logger.warning(f"Image file not found: {image_path}")
                    # Assume a reasonable size for missing files to avoid breaking validation
                    total_size_mb += 1.0  # 1MB assumption
        except Exception as e:
            logger.warning(f"Failed to get size for image {image_path}: {e}")
            # Assume a reasonable size for problematic files
            total_size_mb += 1.0  # 1MB assumption

    # Apply 40MB cap for custom models as requested; the fallback branch
    # (no capabilities object) is treated as a custom model too.
    effective_limit_mb = max_size_mb
    if hasattr(capabilities, "provider") and capabilities.provider == ProviderType.CUSTOM:
        effective_limit_mb = min(max_size_mb, 40.0)
    elif not capabilities:  # Fallback case for custom models
        effective_limit_mb = min(max_size_mb, 40.0)

    # Validate against the combined size limit.
    if total_size_mb > effective_limit_mb:
        return {
            "status": "error",
            "content": (
                f"Image size limit exceeded: Model '{model_name}' supports maximum {effective_limit_mb:.1f}MB "
                f"for all images combined, but {total_size_mb:.1f}MB was provided. "
                f"Please reduce image sizes or count and try again."
            ),
            "content_type": "text",
            "metadata": {
                "error_type": "validation_error",
                "model_name": model_name,
                "total_size_mb": round(total_size_mb, 2),
                "limit_mb": round(effective_limit_mb, 2),
                "image_count": len(images),
                "supports_images": supports_images,
            },
        }

    # All validations passed
    logger.debug(f"Image validation passed: {len(images)} images")
    return None
def estimate_tokens_smart(self, file_path: str) -> int:
"""
Estimate tokens for a file using file-type aware ratios.
@@ -1131,6 +1276,9 @@ When recommending searches, be specific about what information you need and why
)
return [TextContent(type="text", text=error_output.model_dump_json())]
# Extract and validate images from request
images = getattr(request, "images", None) or []
# Check if we have continuation_id - if so, conversation history is already embedded
continuation_id = getattr(request, "continuation_id", None)
@@ -1215,6 +1363,12 @@ When recommending searches, be specific about what information you need and why
# Only set this after auto mode validation to prevent "auto" being used as a model name
self._current_model_name = model_name
# Validate images at MCP boundary if any were provided
if images:
image_validation_error = self._validate_image_limits(images, model_name, continuation_id)
if image_validation_error:
return [TextContent(type="text", text=json.dumps(image_validation_error))]
temperature = getattr(request, "temperature", None)
if temperature is None:
temperature = self.get_default_temperature()
@@ -1247,6 +1401,7 @@ When recommending searches, be specific about what information you need and why
system_prompt=system_prompt,
temperature=temperature,
thinking_mode=thinking_mode if provider.supports_thinking_mode(model_name) else None,
images=images if images else None, # Pass images via kwargs
)
logger.info(f"Received response from {provider.get_provider_type().value} API for {self.name}")
@@ -1298,6 +1453,7 @@ When recommending searches, be specific about what information you need and why
system_prompt=system_prompt,
temperature=temperature,
thinking_mode=thinking_mode if provider.supports_thinking_mode(model_name) else None,
images=images if images else None, # Pass images via kwargs in retry too
)
if retry_response.content:
@@ -1398,6 +1554,7 @@ When recommending searches, be specific about what information you need and why
continuation_id = getattr(request, "continuation_id", None)
if continuation_id:
request_files = getattr(request, "files", []) or []
request_images = getattr(request, "images", []) or []
# Extract model metadata for conversation tracking
model_provider = None
model_name = None
@@ -1417,6 +1574,7 @@ When recommending searches, be specific about what information you need and why
"assistant",
formatted_content,
files=request_files,
images=request_images,
tool_name=self.name,
model_provider=model_provider,
model_name=model_name,
@@ -1519,6 +1677,7 @@ When recommending searches, be specific about what information you need and why
# Use actually processed files from file preparation instead of original request files
# This ensures directories are tracked as their individual expanded files
request_files = getattr(self, "_actually_processed_files", []) or getattr(request, "files", []) or []
request_images = getattr(request, "images", []) or []
# Extract model metadata
model_provider = None
model_name = None
@@ -1538,6 +1697,7 @@ When recommending searches, be specific about what information you need and why
"assistant",
content,
files=request_files,
images=request_images,
tool_name=self.name,
model_provider=model_provider,
model_name=model_name,

View File

@@ -20,12 +20,25 @@ class ChatRequest(ToolRequest):
prompt: str = Field(
...,
description="Your question, topic, or current thinking to discuss",
description=(
"Your thorough, expressive question with as much context as possible. Remember: you're talking to "
"another Claude assistant who has deep expertise and can provide nuanced insights. Include your "
"current thinking, specific challenges, background context, what you've already tried, and what "
"kind of response would be most helpful. The more context and detail you provide, the more "
"valuable and targeted the response will be."
),
)
files: Optional[list[str]] = Field(
default_factory=list,
description="Optional files for context (must be absolute paths)",
)
images: Optional[list[str]] = Field(
default_factory=list,
description=(
"Optional images for visual context. Useful for UI discussions, diagrams, visual problems, "
"error screens, or architectural mockups."
),
)
class ChatTool(BaseTool):
@@ -42,7 +55,8 @@ class ChatTool(BaseTool):
"Also great for: explanations, comparisons, general development questions. "
"Use this when you want to ask questions, brainstorm ideas, get opinions, discuss topics, "
"share your thinking, or need explanations about concepts and approaches. "
"Note: If you're not currently using a top-tier model such as Opus 4 or above, these tools can provide enhanced capabilities."
"Note: If you're not currently using a top-tier model such as Opus 4 or above, these tools can "
"provide enhanced capabilities."
)
def get_input_schema(self) -> dict[str, Any]:
@@ -51,13 +65,27 @@ class ChatTool(BaseTool):
"properties": {
"prompt": {
"type": "string",
"description": "Your question, topic, or current thinking to discuss",
"description": (
"Your thorough, expressive question with as much context as possible. Remember: you're "
"talking to another Claude assistant who has deep expertise and can provide nuanced "
"insights. Include your current thinking, specific challenges, background context, what "
"you've already tried, and what kind of response would be most helpful. The more context "
"and detail you provide, the more valuable and targeted the response will be."
),
},
"files": {
"type": "array",
"items": {"type": "string"},
"description": "Optional files for context (must be absolute paths)",
},
"images": {
"type": "array",
"items": {"type": "string"},
"description": (
"Optional images for visual context. Useful for UI discussions, diagrams, visual "
"problems, error screens, or architectural mockups."
),
},
"model": self.get_model_field_schema(),
"temperature": {
"type": "number",
@@ -68,16 +96,29 @@ class ChatTool(BaseTool):
"thinking_mode": {
"type": "string",
"enum": ["minimal", "low", "medium", "high", "max"],
"description": "Thinking depth: minimal (0.5% of model max), low (8%), medium (33%), high (67%), max (100% of model max)",
"description": (
"Thinking depth: minimal (0.5% of model max), low (8%), medium (33%), high (67%), "
"max (100% of model max)"
),
},
"use_websearch": {
"type": "boolean",
"description": "Enable web search for documentation, best practices, and current information. Particularly useful for: brainstorming sessions, architectural design discussions, exploring industry best practices, working with specific frameworks/technologies, researching solutions to complex problems, or when current documentation and community insights would enhance the analysis.",
"description": (
"Enable web search for documentation, best practices, and current information. "
"Particularly useful for: brainstorming sessions, architectural design discussions, "
"exploring industry best practices, working with specific frameworks/technologies, "
"researching solutions to complex problems, or when current documentation and "
"community insights would enhance the analysis."
),
"default": True,
},
"continuation_id": {
"type": "string",
"description": "Thread continuation ID for multi-turn conversations. Can be used to continue conversations across different tools. Only provide this if continuing a previous conversation thread.",
"description": (
"Thread continuation ID for multi-turn conversations. Can be used to continue "
"conversations across different tools. Only provide this if continuing a previous "
"conversation thread."
),
},
},
"required": ["prompt"] + (["model"] if self.is_effective_auto_mode() else []),
@@ -157,4 +198,7 @@ Please provide a thoughtful, comprehensive response:"""
def format_response(self, response: str, request: ChatRequest, model_info: Optional[dict] = None) -> str:
    """Format the chat response.

    Appends a standard "Claude's Turn" footer instructing the calling
    assistant to integrate this perspective into its own analysis.

    Args:
        response: Raw model response text
        request: The originating chat request (unused; kept for interface parity)
        model_info: Optional provider/model metadata (unused)

    Returns:
        The response text with the handoff footer appended.
    """
    # Exactly one return: the duplicated single-line form was unreachable
    # dead code with identical text.
    return (
        f"{response}\n\n---\n\n**Claude's Turn:** Evaluate this perspective alongside your analysis to "
        "form a comprehensive solution and continue with the user's request and task at hand."
    )

View File

@@ -41,6 +41,10 @@ class CodeReviewRequest(ToolRequest):
...,
description="User's summary of what the code does, expected behavior, constraints, and review objectives",
)
images: Optional[list[str]] = Field(
None,
description="Optional images of architecture diagrams, UI mockups, design documents, or visual references for code review context",
)
review_type: str = Field("full", description="Type of review: full|security|performance|quick")
focus_on: Optional[str] = Field(
None,
@@ -94,6 +98,11 @@ class CodeReviewTool(BaseTool):
"type": "string",
"description": "User's summary of what the code does, expected behavior, constraints, and review objectives",
},
"images": {
"type": "array",
"items": {"type": "string"},
"description": "Optional images of architecture diagrams, UI mockups, design documents, or visual references for code review context",
},
"review_type": {
"type": "string",
"enum": ["full", "security", "performance", "quick"],

View File

@@ -24,6 +24,10 @@ class DebugIssueRequest(ToolRequest):
None,
description="Files or directories that might be related to the issue (must be absolute paths)",
)
images: Optional[list[str]] = Field(
None,
description="Optional images showing error screens, UI issues, logs displays, or visual debugging information",
)
runtime_info: Optional[str] = Field(None, description="Environment, versions, or runtime information")
previous_attempts: Optional[str] = Field(None, description="What has been tried already")
@@ -69,6 +73,11 @@ class DebugIssueTool(BaseTool):
"items": {"type": "string"},
"description": "Files or directories that might be related to the issue (must be absolute paths)",
},
"images": {
"type": "array",
"items": {"type": "string"},
"description": "Optional images showing error screens, UI issues, logs displays, or visual debugging information",
},
"runtime_info": {
"type": "string",
"description": "Environment, versions, or runtime information",

View File

@@ -78,6 +78,10 @@ class PrecommitRequest(ToolRequest):
None,
description="Optional files or directories to provide as context (must be absolute paths). These files are not part of the changes but provide helpful context like configs, docs, or related code.",
)
images: Optional[list[str]] = Field(
None,
description="Optional images showing expected UI changes, design requirements, or visual references for the changes being validated",
)
class Precommit(BaseTool):
@@ -170,6 +174,11 @@ class Precommit(BaseTool):
"items": {"type": "string"},
"description": "Optional files or directories to provide as context (must be absolute paths). These files are not part of the changes but provide helpful context like configs, docs, or related code.",
},
"images": {
"type": "array",
"items": {"type": "string"},
"description": "Optional images showing expected UI changes, design requirements, or visual references for the changes being validated",
},
"use_websearch": {
"type": "boolean",
"description": "Enable web search for documentation, best practices, and current information. Particularly useful for: brainstorming sessions, architectural design discussions, exploring industry best practices, working with specific frameworks/technologies, researching solutions to complex problems, or when current documentation and community insights would enhance the analysis.",

View File

@@ -33,6 +33,10 @@ class ThinkDeepRequest(ToolRequest):
None,
description="Optional file paths or directories for additional context (must be absolute paths)",
)
images: Optional[list[str]] = Field(
None,
description="Optional images for visual analysis - diagrams, charts, system architectures, or any visual information to analyze",
)
class ThinkDeepTool(BaseTool):
@@ -60,7 +64,13 @@ class ThinkDeepTool(BaseTool):
"properties": {
"prompt": {
"type": "string",
"description": "Your current thinking/analysis to extend and validate. IMPORTANT: Before using this tool, Claude MUST first think deeply and establish a deep understanding of the topic and question by thinking through all relevant details, context, constraints, and implications. Share these extended thoughts and ideas in the prompt so the model has comprehensive information to work with for the best analysis.",
"description": (
"Your current thinking/analysis to extend and validate. IMPORTANT: Before using this tool, "
"Claude MUST first think deeply and establish a deep understanding of the topic and question "
"by thinking through all relevant details, context, constraints, and implications. Share "
"these extended thoughts and ideas in the prompt so the model has comprehensive information "
"to work with for the best analysis."
),
},
"model": self.get_model_field_schema(),
"problem_context": {
@@ -77,6 +87,11 @@ class ThinkDeepTool(BaseTool):
"items": {"type": "string"},
"description": "Optional file paths or directories for additional context (must be absolute paths)",
},
"images": {
"type": "array",
"items": {"type": "string"},
"description": "Optional images for visual analysis - diagrams, charts, system architectures, or any visual information to analyze",
},
"temperature": {
"type": "number",
"description": "Temperature for creative thinking (0-1, default 0.7)",

View File

@@ -22,11 +22,29 @@ class TracerRequest(ToolRequest):
prompt: str = Field(
...,
description="Detailed description of what to trace and WHY you need this analysis. Include context about what you're trying to understand, debug, or analyze. For precision mode: describe the specific method/function and what aspect of its execution flow you need to understand. For dependencies mode: describe the class/module and what relationships you need to map. Example: 'I need to understand how BookingManager.finalizeInvoice method is called throughout the system and what side effects it has, as I'm debugging payment processing issues' rather than just 'BookingManager finalizeInvoice method'",
description=(
"Detailed description of what to trace and WHY you need this analysis. Include context about what "
"you're trying to understand, debug, or analyze. For precision mode: describe the specific "
"method/function and what aspect of its execution flow you need to understand. For dependencies "
"mode: describe the class/module and what relationships you need to map. Example: 'I need to "
"understand how BookingManager.finalizeInvoice method is called throughout the system and what "
"side effects it has, as I'm debugging payment processing issues' rather than just "
"'BookingManager finalizeInvoice method'"
),
)
trace_mode: Literal["precision", "dependencies"] = Field(
...,
description="Trace mode: 'precision' (for methods/functions - shows execution flow and usage patterns) or 'dependencies' (for classes/modules/protocols - shows structural relationships)",
description=(
"Trace mode: 'precision' (for methods/functions - shows execution flow and usage patterns) or "
"'dependencies' (for classes/modules/protocols - shows structural relationships)"
),
)
images: list[str] = Field(
default_factory=list,
description=(
"Optional images of system architecture diagrams, flow charts, or visual references to help "
"understand the tracing context"
),
)
@@ -44,11 +62,15 @@ class TracerTool(BaseTool):
def get_description(self) -> str:
return (
"ANALYSIS PROMPT GENERATOR - Creates structured prompts for static code analysis. "
"Helps generate detailed analysis requests with specific method/function names, file paths, and component context. "
"Type 'precision': For methods/functions - traces execution flow, call chains, call stacks, and shows when/how they are used. "
"Type 'dependencies': For classes/modules/protocols - maps structural relationships and bidirectional dependencies. "
"Helps generate detailed analysis requests with specific method/function names, file paths, and "
"component context. "
"Type 'precision': For methods/functions - traces execution flow, call chains, call stacks, and "
"shows when/how they are used. "
"Type 'dependencies': For classes/modules/protocols - maps structural relationships and "
"bidirectional dependencies. "
"Returns detailed instructions on how to perform the analysis and format the results. "
"Use this to create focused analysis requests that can be fed back to Claude with the appropriate code files. "
"Use this to create focused analysis requests that can be fed back to Claude with the appropriate "
"code files. "
)
def get_input_schema(self) -> dict[str, Any]:
@@ -57,13 +79,26 @@ class TracerTool(BaseTool):
"properties": {
"prompt": {
"type": "string",
"description": "Detailed description of what to trace and WHY you need this analysis. Include context about what you're trying to understand, debug, or analyze. For precision mode: describe the specific method/function and what aspect of its execution flow you need to understand. For dependencies mode: describe the class/module and what relationships you need to map. Example: 'I need to understand how BookingManager.finalizeInvoice method is called throughout the system and what side effects it has, as I'm debugging payment processing issues' rather than just 'BookingManager finalizeInvoice method'",
"description": (
"Detailed description of what to trace and WHY you need this analysis. Include context "
"about what you're trying to understand, debug, or analyze. For precision mode: describe "
"the specific method/function and what aspect of its execution flow you need to understand. "
"For dependencies mode: describe the class/module and what relationships you need to map. "
"Example: 'I need to understand how BookingManager.finalizeInvoice method is called "
"throughout the system and what side effects it has, as I'm debugging payment processing "
"issues' rather than just 'BookingManager finalizeInvoice method'"
),
},
"trace_mode": {
"type": "string",
"enum": ["precision", "dependencies"],
"description": "Trace mode: 'precision' (for methods/functions - shows execution flow and usage patterns) or 'dependencies' (for classes/modules/protocols - shows structural relationships)",
},
"images": {
"type": "array",
"items": {"type": "string"},
"description": "Optional images of system architecture diagrams, flow charts, or visual references to help understand the tracing context",
},
},
"required": ["prompt", "trace_mode"],
}