Vision support: images, PDFs, and similar attachments can now be passed on to other models for analysis or as additional context.

Image processing pipeline added
OpenAI GPT-4.1 support
Chat tool prompt enhancement
Lint and code quality improvements
Fahad
2025-06-16 13:14:53 +04:00
parent d498e9854b
commit 97fa6781cf
26 changed files with 1328 additions and 52 deletions


@@ -27,6 +27,7 @@ if TYPE_CHECKING:
from config import MCP_PROMPT_SIZE_LIMIT
from providers import ModelProvider, ModelProviderRegistry
from providers.base import ProviderType
from utils import check_token_limit
from utils.conversation_memory import (
MAX_CONVERSATION_TURNS,
@@ -84,6 +85,17 @@ class ToolRequest(BaseModel):
"additional findings, or answers to follow-up questions. Can be used across different tools."
),
)
images: Optional[list[str]] = Field(
None,
description=(
"Optional image(s) for visual context. Accepts absolute file paths or "
"base64 data URLs. Only provide when user explicitly mentions images. "
"When including images, please describe what you believe each image contains "
"(e.g., 'screenshot of error dialog', 'architecture diagram', 'code snippet') "
"to aid with contextual understanding. Useful for UI discussions, diagrams, "
"visual problems, error screens, architecture mockups, and visual analysis tasks."
),
)
class BaseTool(ABC):
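
For illustration, a caller could populate the new field like this (a minimal sketch; the prompt field and the exact construction are assumptions, not part of this diff):

    request = ToolRequest(
        prompt="Why does this error dialog appear?",  # hypothetical field, for illustration
        images=[
            "/abs/path/to/screenshot.png",  # absolute file path
            "data:image/png;base64,iVBORw0KGgoAAAANS...",  # base64 data URL (truncated)
        ],
    )
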
@@ -981,6 +993,139 @@ When recommending searches, be specific about what information you need and why
}
return None
def _validate_image_limits(
self, images: Optional[list[str]], model_name: str, continuation_id: Optional[str] = None
) -> Optional[dict]:
"""
Validate image size against model capabilities at MCP boundary.
This performs strict validation to ensure we don't exceed model-specific
image size limits. Uses capability-based validation with actual model
configuration rather than hard-coded limits.
        Args:
            images: List of image paths/data URLs to validate
            model_name: Name of the model to check limits against
            continuation_id: Optional conversation thread ID; accepted for
                interface consistency but not used during validation
Returns:
Optional[dict]: Error response if validation fails, None if valid
"""
if not images:
return None
# Get model capabilities to check image support and size limits
try:
provider = self.get_model_provider(model_name)
capabilities = provider.get_capabilities(model_name)
except Exception as e:
logger.warning(f"Failed to get capabilities for model {model_name}: {e}")
# Fall back to checking custom models configuration
capabilities = None
# Check if model supports images at all
supports_images = False
max_size_mb = 0.0
if capabilities:
supports_images = capabilities.supports_images
max_size_mb = capabilities.max_image_size_mb
else:
# Fall back to custom models configuration
try:
import json
from pathlib import Path
custom_models_path = Path(__file__).parent.parent / "conf" / "custom_models.json"
if custom_models_path.exists():
with open(custom_models_path) as f:
custom_config = json.load(f)
# Check if model is in custom models list
for model_config in custom_config.get("models", []):
if model_config.get("model_name") == model_name or model_name in model_config.get(
"aliases", []
):
supports_images = model_config.get("supports_images", False)
max_size_mb = model_config.get("max_image_size_mb", 0.0)
break
except Exception as e:
logger.warning(f"Failed to load custom models config: {e}")
# If model doesn't support images, reject
if not supports_images:
return {
"status": "error",
"content": (
f"Image support not available: Model '{model_name}' does not support image processing. "
f"Please use a vision-capable model such as 'gemini-2.5-flash-preview-05-20', 'o3', "
f"or 'claude-3-opus' for image analysis tasks."
),
"content_type": "text",
"metadata": {
"error_type": "validation_error",
"model_name": model_name,
"supports_images": False,
"image_count": len(images),
},
}
# Calculate total size of all images
total_size_mb = 0.0
for image_path in images:
try:
if image_path.startswith("data:image/"):
# Handle data URL: data:image/png;base64,iVBORw0...
_, data = image_path.split(",", 1)
# Base64 encoding increases size by ~33%, so decode to get actual size
import base64
actual_size = len(base64.b64decode(data))
total_size_mb += actual_size / (1024 * 1024)
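                    # Note (descriptive): the decoded size could also be estimated
                    # without decoding as len(data) * 3 // 4 (minus '=' padding),
                    # since base64 maps 3 raw bytes to 4 encoded characters;
                    # decoding is used here for an exact figure.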
else:
# Handle file path
if os.path.exists(image_path):
file_size = os.path.getsize(image_path)
total_size_mb += file_size / (1024 * 1024)
else:
logger.warning(f"Image file not found: {image_path}")
# Assume a reasonable size for missing files to avoid breaking validation
total_size_mb += 1.0 # 1MB assumption
except Exception as e:
logger.warning(f"Failed to get size for image {image_path}: {e}")
# Assume a reasonable size for problematic files
total_size_mb += 1.0 # 1MB assumption
        # Cap custom-provider models at 40MB, regardless of their configured limit
effective_limit_mb = max_size_mb
if hasattr(capabilities, "provider") and capabilities.provider == ProviderType.CUSTOM:
effective_limit_mb = min(max_size_mb, 40.0)
elif not capabilities: # Fallback case for custom models
effective_limit_mb = min(max_size_mb, 40.0)
# Validate against size limit
if total_size_mb > effective_limit_mb:
return {
"status": "error",
"content": (
f"Image size limit exceeded: Model '{model_name}' supports maximum {effective_limit_mb:.1f}MB "
f"for all images combined, but {total_size_mb:.1f}MB was provided. "
f"Please reduce image sizes or count and try again."
),
"content_type": "text",
"metadata": {
"error_type": "validation_error",
"model_name": model_name,
"total_size_mb": round(total_size_mb, 2),
"limit_mb": round(effective_limit_mb, 2),
"image_count": len(images),
"supports_images": supports_images,
},
}
# All validations passed
logger.debug(f"Image validation passed: {len(images)} images")
return None
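
    # Worked example (illustrative, as comments): three 2.5MB screenshots total
    # 7.5MB, which passes any model whose max_image_size_mb is at least 7.5;
    # for a CUSTOM-provider model the effective limit is further capped to
    # min(max_image_size_mb, 40.0) by the logic above.
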
def estimate_tokens_smart(self, file_path: str) -> int:
"""
Estimate tokens for a file using file-type aware ratios.
@@ -1131,6 +1276,9 @@ When recommending searches, be specific about what information you need and why
)
return [TextContent(type="text", text=error_output.model_dump_json())]
# Extract and validate images from request
images = getattr(request, "images", None) or []
# Check if we have continuation_id - if so, conversation history is already embedded
continuation_id = getattr(request, "continuation_id", None)
@@ -1215,6 +1363,12 @@ When recommending searches, be specific about what information you need and why
# Only set this after auto mode validation to prevent "auto" being used as a model name
self._current_model_name = model_name
# Validate images at MCP boundary if any were provided
if images:
image_validation_error = self._validate_image_limits(images, model_name, continuation_id)
if image_validation_error:
return [TextContent(type="text", text=json.dumps(image_validation_error))]
temperature = getattr(request, "temperature", None)
if temperature is None:
temperature = self.get_default_temperature()
@@ -1247,6 +1401,7 @@ When recommending searches, be specific about what information you need and why
system_prompt=system_prompt,
temperature=temperature,
thinking_mode=thinking_mode if provider.supports_thinking_mode(model_name) else None,
images=images if images else None, # Pass images via kwargs
)
logger.info(f"Received response from {provider.get_provider_type().value} API for {self.name}")
@@ -1298,6 +1453,7 @@ When recommending searches, be specific about what information you need and why
system_prompt=system_prompt,
temperature=temperature,
thinking_mode=thinking_mode if provider.supports_thinking_mode(model_name) else None,
images=images if images else None, # Pass images via kwargs in retry too
)
if retry_response.content:
@@ -1398,6 +1554,7 @@ When recommending searches, be specific about what information you need and why
continuation_id = getattr(request, "continuation_id", None)
if continuation_id:
request_files = getattr(request, "files", []) or []
request_images = getattr(request, "images", []) or []
# Extract model metadata for conversation tracking
model_provider = None
model_name = None
@@ -1417,6 +1574,7 @@ When recommending searches, be specific about what information you need and why
"assistant",
formatted_content,
files=request_files,
images=request_images,
tool_name=self.name,
model_provider=model_provider,
model_name=model_name,
@@ -1519,6 +1677,7 @@ When recommending searches, be specific about what information you need and why
# Use actually processed files from file preparation instead of original request files
# This ensures directories are tracked as their individual expanded files
request_files = getattr(self, "_actually_processed_files", []) or getattr(request, "files", []) or []
request_images = getattr(request, "images", []) or []
# Extract model metadata
model_provider = None
model_name = None
@@ -1538,6 +1697,7 @@ When recommending searches, be specific about what information you need and why
"assistant",
content,
files=request_files,
images=request_images,
tool_name=self.name,
model_provider=model_provider,
model_name=model_name,
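
For callers that hold image bytes on disk but want to send a data URL instead of a file path, the accepted format can be built with a small helper (a minimal sketch assuming PNG input; this helper is not part of the commit):

    import base64

    def to_data_url(path: str) -> str:
        # Encode a PNG file into the data:image/png;base64,... form
        # accepted by the images field above.
        with open(path, "rb") as f:
            encoded = base64.b64encode(f.read()).decode("ascii")
        return f"data:image/png;base64,{encoded}"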