Vision support: images, PDFs, etc. can be passed on to other models as part of analysis or as additional context.

Image processing pipeline added
OpenAI GPT-4.1 support
Chat tool prompt enhancement
Lint and code quality improvements
Fahad
2025-06-16 13:14:53 +04:00
parent d498e9854b
commit 97fa6781cf
26 changed files with 1328 additions and 52 deletions
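
The image processing pipeline mentioned in the commit message could, at a high level, work along the lines of the sketch below: read a local image or PDF, enforce a size limit, and wrap it in an OpenAI-style content part before forwarding it to the target model. The helper name, the data-URL/`image_url` shape, and the validation step are illustrative assumptions, not code from this commit.

```python
import base64
import mimetypes
from pathlib import Path


def build_image_part(path: str, max_size_mb: float) -> dict:
    """Hypothetical helper: package a local image/PDF for a vision-capable model."""
    file_path = Path(path)
    size_mb = file_path.stat().st_size / (1024 * 1024)
    if size_mb > max_size_mb:
        raise ValueError(f"{path} is {size_mb:.1f}MB, over the {max_size_mb}MB limit")

    # Guess the MIME type from the extension; fall back to a generic binary type.
    mime_type = mimetypes.guess_type(path)[0] or "application/octet-stream"
    encoded = base64.b64encode(file_path.read_bytes()).decode("ascii")

    # OpenAI-style content block (an assumption about the downstream API shape).
    return {
        "type": "image_url",
        "image_url": {"url": f"data:{mime_type};base64,{encoded}"},
    }
```

In the actual pipeline the size limit would come from each model's `max_image_size_mb` entry in the configuration changed in the diff below.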


@@ -25,6 +25,8 @@
"supports_extended_thinking": "Whether the model supports extended reasoning tokens (currently none do via OpenRouter or custom APIs)",
"supports_json_mode": "Whether the model can guarantee valid JSON output",
"supports_function_calling": "Whether the model supports function/tool calling",
"supports_images": "Whether the model can process images/visual input",
"max_image_size_mb": "Maximum total size in MB for all images combined (capped at 40MB max for custom models)",
"is_custom": "Set to true for models that should ONLY be used with custom API endpoints (Ollama, vLLM, etc.). False or omitted for OpenRouter/cloud models.",
"description": "Human-readable description of the model"
},
@@ -35,6 +37,8 @@
"supports_extended_thinking": false,
"supports_json_mode": true,
"supports_function_calling": true,
"supports_images": true,
"max_image_size_mb": 10.0,
"is_custom": true,
"description": "Example custom/local model for Ollama, vLLM, etc."
}
@@ -47,7 +51,9 @@
"supports_extended_thinking": false,
"supports_json_mode": false,
"supports_function_calling": false,
"description": "Claude 3 Opus - Most capable Claude model"
"supports_images": true,
"max_image_size_mb": 5.0,
"description": "Claude 3 Opus - Most capable Claude model with vision"
},
{
"model_name": "anthropic/claude-3-sonnet",
@@ -56,7 +62,9 @@
"supports_extended_thinking": false,
"supports_json_mode": false,
"supports_function_calling": false,
"description": "Claude 3 Sonnet - Balanced performance"
"supports_images": true,
"max_image_size_mb": 5.0,
"description": "Claude 3 Sonnet - Balanced performance with vision"
},
{
"model_name": "anthropic/claude-3-haiku",
@@ -65,7 +73,9 @@
"supports_extended_thinking": false,
"supports_json_mode": false,
"supports_function_calling": false,
"description": "Claude 3 Haiku - Fast and efficient"
"supports_images": true,
"max_image_size_mb": 5.0,
"description": "Claude 3 Haiku - Fast and efficient with vision"
},
{
"model_name": "google/gemini-2.5-pro-preview",
@@ -74,7 +84,9 @@
"supports_extended_thinking": false,
"supports_json_mode": true,
"supports_function_calling": false,
"description": "Google's Gemini 2.5 Pro via OpenRouter"
"supports_images": true,
"max_image_size_mb": 20.0,
"description": "Google's Gemini 2.5 Pro via OpenRouter with vision"
},
{
"model_name": "google/gemini-2.5-flash-preview-05-20",
@@ -83,7 +95,9 @@
"supports_extended_thinking": false,
"supports_json_mode": true,
"supports_function_calling": false,
"description": "Google's Gemini 2.5 Flash via OpenRouter"
"supports_images": true,
"max_image_size_mb": 15.0,
"description": "Google's Gemini 2.5 Flash via OpenRouter with vision"
},
{
"model_name": "mistral/mistral-large",
@@ -92,7 +106,9 @@
"supports_extended_thinking": false,
"supports_json_mode": true,
"supports_function_calling": true,
"description": "Mistral's largest model"
"supports_images": false,
"max_image_size_mb": 0.0,
"description": "Mistral's largest model (text-only)"
},
{
"model_name": "meta-llama/llama-3-70b",
@@ -101,7 +117,9 @@
"supports_extended_thinking": false,
"supports_json_mode": false,
"supports_function_calling": false,
"description": "Meta's Llama 3 70B model"
"supports_images": false,
"max_image_size_mb": 0.0,
"description": "Meta's Llama 3 70B model (text-only)"
},
{
"model_name": "deepseek/deepseek-r1-0528",
@@ -110,7 +128,9 @@
"supports_extended_thinking": true,
"supports_json_mode": true,
"supports_function_calling": false,
"description": "DeepSeek R1 with thinking mode - advanced reasoning capabilities"
"supports_images": false,
"max_image_size_mb": 0.0,
"description": "DeepSeek R1 with thinking mode - advanced reasoning capabilities (text-only)"
},
{
"model_name": "perplexity/llama-3-sonar-large-32k-online",
@@ -119,7 +139,9 @@
"supports_extended_thinking": false,
"supports_json_mode": false,
"supports_function_calling": false,
"description": "Perplexity's online model with web search"
"supports_images": false,
"max_image_size_mb": 0.0,
"description": "Perplexity's online model with web search (text-only)"
},
{
"model_name": "openai/o3",
@@ -128,7 +150,9 @@
"supports_extended_thinking": false,
"supports_json_mode": true,
"supports_function_calling": true,
"description": "OpenAI's o3 model - well-rounded and powerful across domains"
"supports_images": true,
"max_image_size_mb": 20.0,
"description": "OpenAI's o3 model - well-rounded and powerful across domains with vision"
},
{
"model_name": "openai/o3-mini",
@@ -137,7 +161,9 @@
"supports_extended_thinking": false,
"supports_json_mode": true,
"supports_function_calling": true,
"description": "OpenAI's o3-mini model - balanced performance and speed"
"supports_images": true,
"max_image_size_mb": 20.0,
"description": "OpenAI's o3-mini model - balanced performance and speed with vision"
},
{
"model_name": "openai/o3-mini-high",
@@ -146,7 +172,9 @@
"supports_extended_thinking": false,
"supports_json_mode": true,
"supports_function_calling": true,
"description": "OpenAI's o3-mini with high reasoning effort - optimized for complex problems"
"supports_images": true,
"max_image_size_mb": 20.0,
"description": "OpenAI's o3-mini with high reasoning effort - optimized for complex problems with vision"
},
{
"model_name": "openai/o3-pro",
@@ -155,7 +183,9 @@
"supports_extended_thinking": false,
"supports_json_mode": true,
"supports_function_calling": true,
"description": "OpenAI's o3-pro model - professional-grade reasoning and analysis"
"supports_images": true,
"max_image_size_mb": 20.0,
"description": "OpenAI's o3-pro model - professional-grade reasoning and analysis with vision"
},
{
"model_name": "openai/o4-mini",
@@ -164,7 +194,9 @@
"supports_extended_thinking": false,
"supports_json_mode": true,
"supports_function_calling": true,
"description": "OpenAI's o4-mini model - optimized for shorter contexts with rapid reasoning"
"supports_images": true,
"max_image_size_mb": 20.0,
"description": "OpenAI's o4-mini model - optimized for shorter contexts with rapid reasoning and vision"
},
{
"model_name": "openai/o4-mini-high",
@@ -173,7 +205,9 @@
"supports_extended_thinking": false,
"supports_json_mode": true,
"supports_function_calling": true,
"description": "OpenAI's o4-mini with high reasoning effort - enhanced for complex tasks"
"supports_images": true,
"max_image_size_mb": 20.0,
"description": "OpenAI's o4-mini with high reasoning effort - enhanced for complex tasks with vision"
},
{
"model_name": "llama3.2",
@@ -182,8 +216,10 @@
"supports_extended_thinking": false,
"supports_json_mode": false,
"supports_function_calling": false,
"supports_images": false,
"max_image_size_mb": 0.0,
"is_custom": true,
"description": "Local Llama 3.2 model via custom endpoint (Ollama/vLLM) - 128K context window"
"description": "Local Llama 3.2 model via custom endpoint (Ollama/vLLM) - 128K context window (text-only)"
}
]
}
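
A minimal sketch of how a caller might consult the new capability fields before attaching images is shown below. The config filename, the top-level "models" key, and the helper itself are assumptions not visible in this diff; only the field names and the 40MB cap for custom models come from the documentation block above.

```python
import json


def image_limit_mb(model_name: str, config_path: str = "custom_models.json") -> float:
    """Hypothetical lookup: effective image size limit for a model, 0.0 if unsupported."""
    with open(config_path) as f:
        models = json.load(f)["models"]  # assumes entries live under a "models" key

    for entry in models:
        if entry["model_name"] != model_name:
            continue
        if not entry.get("supports_images", False):
            return 0.0
        limit = float(entry.get("max_image_size_mb", 0.0))
        # Per the field documentation, custom-endpoint models are capped at 40MB total.
        if entry.get("is_custom", False):
            limit = min(limit, 40.0)
        return limit
    return 0.0
```

With the entries above, `image_limit_mb("anthropic/claude-3-sonnet")` would return 5.0, while the text-only entries return 0.0.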