feat: added `intelligence_score` to the model capabilities schema; an optional 1-20 rating that influences the sort order of models presented to the CLI in auto selection mode

fix: re-introduced the model definition into the schema, but selectively: only a concise ranked summary is generated per tool. Required so the CLI calls and uses the correct model
fix: removed the `model` param from tools that don't need it
fix: enforced adherence to `*_ALLOWED_MODELS` by advertising only the allowed models to the CLI
fix: removed duplicates across providers when passing canonical names back to the CLI; the first enabled provider wins
Fahad
2025-10-02 21:43:44 +04:00
parent e78fe35a1b
commit 6cab9e56fc
22 changed files with 525 additions and 110 deletions

View File

@@ -283,10 +283,12 @@ DISABLED_TOOLS=
## Quick Links
**📖 Documentation**
- [Docs Overview](docs/index.md) - High-level map of major guides
- [Getting Started](docs/getting-started.md) - Complete setup guide
- [Tools Reference](docs/tools/) - All tools with examples
- [Advanced Usage](docs/advanced-usage.md) - Power user features
- [Configuration](docs/configuration.md) - Environment variables, restrictions
- [Model Ranking Guide](docs/model_ranking.md) - How intelligence scores drive auto-mode suggestions
**🔧 Setup & Support**
- [WSL Setup](docs/wsl-setup.md) - Windows users

View File

@@ -31,7 +31,8 @@
"supports_temperature": "Whether the model accepts temperature parameter in API calls (set to false for O3/O4 reasoning models)",
"temperature_constraint": "Type of temperature constraint: 'fixed' (fixed value), 'range' (continuous range), 'discrete' (specific values), or omit for default range",
"is_custom": "Set to true for models that should ONLY be used with custom API endpoints (Ollama, vLLM, etc.). False or omitted for OpenRouter/cloud models.",
"description": "Human-readable description of the model"
"description": "Human-readable description of the model",
"intelligence_score": "1-20 human rating used as the primary signal for auto-mode model ordering"
},
"example_custom_model": {
"model_name": "my-local-model",
@@ -46,7 +47,8 @@
"supports_temperature": true,
"temperature_constraint": "range",
"is_custom": true,
"description": "Example custom/local model for Ollama, vLLM, etc."
"description": "Example custom/local model for Ollama, vLLM, etc.",
"intelligence_score": 12
}
},
"models": [
@@ -63,7 +65,8 @@
"supports_function_calling": false,
"supports_images": true,
"max_image_size_mb": 5.0,
"description": "Claude Sonnet 4.5 - High-performance model with exceptional reasoning and efficiency"
"description": "Claude Sonnet 4.5 - High-performance model with exceptional reasoning and efficiency",
"intelligence_score": 12
},
{
"model_name": "anthropic/claude-opus-4.1",
@@ -75,7 +78,8 @@
"supports_function_calling": false,
"supports_images": true,
"max_image_size_mb": 5.0,
"description": "Claude Opus 4.1 - Our most capable and intelligent model yet"
"description": "Claude Opus 4.1 - Our most capable and intelligent model yet",
"intelligence_score": 14
},
{
"model_name": "anthropic/claude-sonnet-4.1",
@@ -87,7 +91,8 @@
"supports_function_calling": false,
"supports_images": true,
"max_image_size_mb": 5.0,
"description": "Claude Sonnet 4.1 - Last generation high-performance model with exceptional reasoning and efficiency"
"description": "Claude Sonnet 4.1 - Last generation high-performance model with exceptional reasoning and efficiency",
"intelligence_score": 10
},
{
"model_name": "anthropic/claude-3.5-haiku",
@@ -99,31 +104,34 @@
"supports_function_calling": false,
"supports_images": true,
"max_image_size_mb": 5.0,
"description": "Claude 3 Haiku - Fast and efficient with vision"
"description": "Claude 3 Haiku - Fast and efficient with vision",
"intelligence_score": 8
},
{
"model_name": "google/gemini-2.5-pro",
"aliases": ["pro","gemini-pro", "gemini", "pro-openrouter"],
"context_window": 1048576,
"max_output_tokens": 65536,
"supports_extended_thinking": false,
"supports_extended_thinking": true,
"supports_json_mode": true,
"supports_function_calling": false,
"supports_function_calling": true,
"supports_images": true,
"max_image_size_mb": 20.0,
"description": "Google's Gemini 2.5 Pro via OpenRouter with vision"
"description": "Google's Gemini 2.5 Pro via OpenRouter with vision",
"intelligence_score": 18
},
{
"model_name": "google/gemini-2.5-flash",
"aliases": ["flash","gemini-flash"],
"context_window": 1048576,
"max_output_tokens": 65536,
"supports_extended_thinking": false,
"supports_extended_thinking": true,
"supports_json_mode": true,
"supports_function_calling": false,
"supports_function_calling": true,
"supports_images": true,
"max_image_size_mb": 15.0,
"description": "Google's Gemini 2.5 Flash via OpenRouter with vision"
"description": "Google's Gemini 2.5 Flash via OpenRouter with vision",
"intelligence_score": 10
},
{
"model_name": "mistralai/mistral-large-2411",
@@ -135,7 +143,8 @@
"supports_function_calling": true,
"supports_images": false,
"max_image_size_mb": 0.0,
"description": "Mistral's largest model (text-only)"
"description": "Mistral's largest model (text-only)",
"intelligence_score": 11
},
{
"model_name": "meta-llama/llama-3-70b",
@@ -147,7 +156,8 @@
"supports_function_calling": false,
"supports_images": false,
"max_image_size_mb": 0.0,
"description": "Meta's Llama 3 70B model (text-only)"
"description": "Meta's Llama 3 70B model (text-only)",
"intelligence_score": 9
},
{
"model_name": "deepseek/deepseek-r1-0528",
@@ -159,7 +169,8 @@
"supports_function_calling": false,
"supports_images": false,
"max_image_size_mb": 0.0,
"description": "DeepSeek R1 with thinking mode - advanced reasoning capabilities (text-only)"
"description": "DeepSeek R1 with thinking mode - advanced reasoning capabilities (text-only)",
"intelligence_score": 15
},
{
"model_name": "perplexity/llama-3-sonar-large-32k-online",
@@ -171,7 +182,8 @@
"supports_function_calling": false,
"supports_images": false,
"max_image_size_mb": 0.0,
"description": "Perplexity's online model with web search (text-only)"
"description": "Perplexity's online model with web search (text-only)",
"intelligence_score": 9
},
{
"model_name": "openai/o3",
@@ -185,7 +197,8 @@
"max_image_size_mb": 20.0,
"supports_temperature": false,
"temperature_constraint": "fixed",
"description": "OpenAI's o3 model - well-rounded and powerful across domains with vision"
"description": "OpenAI's o3 model - well-rounded and powerful across domains with vision",
"intelligence_score": 14
},
{
"model_name": "openai/o3-mini",
@@ -199,7 +212,8 @@
"max_image_size_mb": 20.0,
"supports_temperature": false,
"temperature_constraint": "fixed",
"description": "OpenAI's o3-mini model - balanced performance and speed with vision"
"description": "OpenAI's o3-mini model - balanced performance and speed with vision",
"intelligence_score": 12
},
{
"model_name": "openai/o3-mini-high",
@@ -213,7 +227,8 @@
"max_image_size_mb": 20.0,
"supports_temperature": false,
"temperature_constraint": "fixed",
"description": "OpenAI's o3-mini with high reasoning effort - optimized for complex problems with vision"
"description": "OpenAI's o3-mini with high reasoning effort - optimized for complex problems with vision",
"intelligence_score": 13
},
{
"model_name": "openai/o3-pro",
@@ -227,7 +242,8 @@
"max_image_size_mb": 20.0,
"supports_temperature": false,
"temperature_constraint": "fixed",
"description": "OpenAI's o3-pro model - professional-grade reasoning and analysis with vision"
"description": "OpenAI's o3-pro model - professional-grade reasoning and analysis with vision",
"intelligence_score": 15
},
{
"model_name": "openai/o4-mini",
@@ -241,7 +257,8 @@
"max_image_size_mb": 20.0,
"supports_temperature": false,
"temperature_constraint": "fixed",
"description": "OpenAI's o4-mini model - optimized for shorter contexts with rapid reasoning and vision"
"description": "OpenAI's o4-mini model - optimized for shorter contexts with rapid reasoning and vision",
"intelligence_score": 11
},
{
"model_name": "openai/gpt-5",
@@ -255,7 +272,8 @@
"max_image_size_mb": 20.0,
"supports_temperature": true,
"temperature_constraint": "range",
"description": "GPT-5 (400K context, 128K output) - Advanced model with reasoning support"
"description": "GPT-5 (400K context, 128K output) - Advanced model with reasoning support",
"intelligence_score": 16
},
{
"model_name": "openai/gpt-5-mini",
@@ -269,7 +287,8 @@
"max_image_size_mb": 20.0,
"supports_temperature": true,
"temperature_constraint": "fixed",
"description": "GPT-5-mini (400K context, 128K output) - Efficient variant with reasoning support"
"description": "GPT-5-mini (400K context, 128K output) - Efficient variant with reasoning support",
"intelligence_score": 15
},
{
"model_name": "openai/gpt-5-nano",
@@ -283,7 +302,8 @@
"max_image_size_mb": 20.0,
"supports_temperature": true,
"temperature_constraint": "fixed",
"description": "GPT-5 nano (400K context, 128K output) - Fastest, cheapest version of GPT-5 for summarization and classification tasks"
"description": "GPT-5 nano (400K context, 128K output) - Fastest, cheapest version of GPT-5 for summarization and classification tasks",
"intelligence_score": 13
},
{
"model_name": "llama3.2",
@@ -296,7 +316,8 @@
"supports_images": false,
"max_image_size_mb": 0.0,
"is_custom": true,
"description": "Local Llama 3.2 model via custom endpoint (Ollama/vLLM) - 128K context window (text-only)"
"description": "Local Llama 3.2 model via custom endpoint (Ollama/vLLM) - 128K context window (text-only)",
"intelligence_score": 6
}
]
}

View File

@@ -10,6 +10,13 @@ Each provider:
- Implements the minimal abstract hooks (`get_provider_type()` and `generate_content()`)
- Gets registered automatically via environment variables
### Intelligence score cheatsheet
Set `intelligence_score` (1-20) when you want deterministic ordering in auto
mode or the `listmodels` output. The runtime rank starts from this human score
and adds smaller bonuses for context window, extended thinking, and other
features ([details here](model_ranking.md)).
## Choose Your Implementation Path
**Option A: Full Provider (`ModelProvider`)**
@@ -68,6 +75,7 @@ class ExampleModelProvider(ModelProvider):
provider=ProviderType.EXAMPLE,
model_name="example-large",
friendly_name="Example Large",
intelligence_score=18,
context_window=100_000,
max_output_tokens=50_000,
supports_extended_thinking=False,
@@ -79,6 +87,7 @@ class ExampleModelProvider(ModelProvider):
provider=ProviderType.EXAMPLE,
model_name="example-small",
friendly_name="Example Small",
intelligence_score=14,
context_window=32_000,
max_output_tokens=16_000,
temperature_constraint=RangeTemperatureConstraint(0.0, 2.0, 0.7),

View File

@@ -60,6 +60,10 @@ The server uses `conf/custom_models.json` to map convenient aliases to both Open
View the full list in [`conf/custom_models.json`](conf/custom_models.json).
To control ordering in auto mode or the `listmodels` summary, adjust the
[`intelligence_score`](model_ranking.md) for each entry (or rely on the automatic
heuristic described there).
**Note:** While you can use any OpenRouter model by its full name, models not in the config file will use generic capabilities (32K context window, no extended thinking, etc.) which may not match the model's actual capabilities. For best results, add new models to the config file with their proper specifications.
## Quick Start

15
docs/index.md Normal file
View File

@@ -0,0 +1,15 @@
# Zen MCP Server Documentation
| Document | Description |
|----------|-------------|
| [Getting Started](getting-started.md) | Installation paths, prerequisite setup, and first-run guidance. |
| [Adding Providers](adding_providers.md) | How to register new AI providers and advertise capabilities. |
| [Model Ranking](model_ranking.md) | How intelligence scores translate into auto-mode ordering. |
| [Custom Models](custom_models.md) | Configure OpenRouter/custom models and aliases. |
| [Adding Tools](adding_tools.md) | Create new tools using the shared base classes. |
| [Advanced Usage](advanced-usage.md) | Auto-mode tricks, workflow tools, and collaboration tips. |
| [Configuration](configuration.md) | .env options, restriction policies, logging levels. |
| [Testing](testing.md) | Test strategy, command cheats, and coverage notes. |
| [Troubleshooting](troubleshooting.md) | Common issues and resolutions. |
Additional docs live in this directory; start with the table above to orient yourself.

69
docs/model_ranking.md Normal file
View File

@@ -0,0 +1,69 @@
# Model Capability Ranking
Auto mode needs a short, trustworthy list of models to suggest. The server
computes a capability rank for every model at runtime using a simple recipe:
1. Start with the human-supplied `intelligence_score` (1-20). This is the
anchor: multiply it by five to map onto the 0-100 scale the server uses.
2. Add a few light bonuses for hard capabilities:
- **Context window:** up to +5 (log-scale bonus when the model exceeds ~1K tokens).
- **Output budget:** +2 for ≥65K tokens, +1 for ≥32K.
- **Extended thinking:** +3 when the provider supports it.
- **Function calling / JSON / images:** +1 each when available.
- **Custom endpoints:** -1 to nudge cloud-hosted defaults ahead unless tuned.
3. Clamp the final score to 0-100 so downstream callers can rely on the range.
In code this looks like:
```python
base = clamp(intelligence_score, 1, 20) * 5
ctx_bonus = min(5, max(0, log10(context_window) - 3))
output_bonus = 2 if max_output_tokens >= 65_000 else 1 if max_output_tokens >= 32_000 else 0
feature_bonus = (
(3 if supports_extended_thinking else 0)
+ (1 if supports_function_calling else 0)
+ (1 if supports_json_mode else 0)
+ (1 if supports_images else 0)
)
penalty = 1 if is_custom else 0
effective_rank = clamp(base + ctx_bonus + output_bonus + feature_bonus - penalty, 0, 100)
```
The bonuses are intentionally small—the human intelligence score does most
of the work so you can enforce organisational preferences easily.
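As a quick sanity check, here is a standalone sketch of the same recipe (the helper name is illustrative; the values come from the Gemini 2.5 Pro entry above):

```python
import math

def effective_rank(intelligence: int, context_window: int, max_output_tokens: int,
                   thinking: bool, functions: bool, json_mode: bool, images: bool,
                   is_custom: bool = False) -> int:
    """Standalone mirror of the recipe above; illustrative, not the server's implementation."""
    base = max(1, min(20, intelligence)) * 5
    ctx_bonus = int(min(5, max(0.0, math.log10(context_window) - 3))) if context_window > 0 else 0
    output_bonus = 2 if max_output_tokens >= 65_000 else 1 if max_output_tokens >= 32_000 else 0
    feature_bonus = (3 if thinking else 0) + (1 if functions else 0) + (1 if json_mode else 0) + (1 if images else 0)
    penalty = 1 if is_custom else 0
    return max(0, min(100, base + ctx_bonus + output_bonus + feature_bonus - penalty))

# 18*5 = 90, +3 context, +2 output, +3 thinking, +1 functions, +1 JSON, +1 images = 101, clamped to 100
print(effective_rank(18, 1_048_576, 65_536, thinking=True, functions=True, json_mode=True, images=True))
```

Lowering the intelligence score or setting `is_custom=True` pulls the result down quickly, which is why the human rating dominates the ordering.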
## Picking an intelligence score
A straightforward rubric that mirrors typical provider tiers:
| Intelligence | Guidance |
|--------------|----------|
| 18-19 | Frontier reasoning models (Gemini 2.5 Pro, GPT-5) |
| 15-17 | Strong general models with large context (O3 Pro, DeepSeek R1) |
| 12-14 | Balanced assistants (Claude Opus/Sonnet, Mistral Large) |
| 9-11 | Fast distillations (Gemini Flash, GPT-5 Mini, Mistral medium) |
| 6-8 | Local or efficiency-focused models (Llama 3 70B, Claude Haiku) |
| ≤5 | Experimental/lightweight models |
Record the reasoning for your scores so future updates stay consistent.
## How the rank is used
The ranked list is cached per provider and consumed by:
- Tool schemas (`model` parameter descriptions) when auto mode is active.
- The `listmodels` tool's “top models” sections.
- Fallback messaging when a requested model is unavailable.
Because the rank is computed after restriction filters, only allowed models
appear in these summaries.
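For example, a minimal sketch of reading the ranked roster from a registered provider (this assumes the corresponding API key is configured so the provider is actually registered):

```python
from providers.registry import ModelProviderRegistry
from providers.shared import ProviderType

# Highest effective rank first, ties broken alphabetically by model name.
provider = ModelProviderRegistry.get_provider(ProviderType.GOOGLE)
if provider:
    for name, capabilities in provider.get_capabilities_by_rank():
        print(name, capabilities.get_effective_capability_rank())
```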
## Customising further
If you need a different weighting you can:
- Override `intelligence_score` in your provider or custom model config.
- Subclass the provider and override `get_effective_capability_rank()`.
- Post-process the rank via `get_capabilities_by_rank()` before surfacing it.
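As a sketch of the last option, the ranked list can be re-weighted after retrieval, for example to demote custom-endpoint models a little further (the extra penalty of 5 is an arbitrary illustration):

```python
from providers.registry import ModelProviderRegistry
from providers.shared import ProviderType

provider = ModelProviderRegistry.get_provider(ProviderType.OPENROUTER)
if provider:
    # Subtract an extra 5 points from custom/local endpoints before surfacing the roster.
    adjusted = sorted(
        provider.get_capabilities_by_rank(),
        key=lambda item: (
            -(item[1].get_effective_capability_rank() - (5 if item[1].is_custom else 0)),
            item[0],
        ),
    )
    for name, _capabilities in adjusted:
        print(name)
```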
Most teams find that adjusting `intelligence_score` alone is enough to keep
auto mode honest without revisiting code.

View File

@@ -42,6 +42,7 @@ class ModelProvider(ABC):
"""Initialize the provider with API key and optional configuration."""
self.api_key = api_key
self.config = kwargs
self._sorted_capabilities_cache: Optional[list[tuple[str, ModelCapabilities]]] = None
# ------------------------------------------------------------------
# Provider identity & capability surface
@@ -77,6 +78,27 @@ class ModelProvider(ABC):
return {k: v for k, v in model_map.items() if isinstance(v, ModelCapabilities)}
return {}
def get_capabilities_by_rank(self) -> list[tuple[str, ModelCapabilities]]:
"""Return model capabilities sorted by effective capability rank."""
if self._sorted_capabilities_cache is not None:
return list(self._sorted_capabilities_cache)
model_configs = self.get_all_model_capabilities()
if not model_configs:
self._sorted_capabilities_cache = []
return []
items = list(model_configs.items())
items.sort(key=lambda item: (-item[1].get_effective_capability_rank(), item[0]))
self._sorted_capabilities_cache = items
return list(items)
def _invalidate_capability_cache(self) -> None:
"""Clear cached sorted capability data (call after dynamic updates)."""
self._sorted_capabilities_cache = None
def list_models(
self,
*,

View File

@@ -33,6 +33,7 @@ class DIALModelProvider(OpenAICompatibleProvider):
provider=ProviderType.DIAL,
model_name="o3-2025-04-16",
friendly_name="DIAL (O3)",
intelligence_score=14,
context_window=200_000,
max_output_tokens=100_000,
supports_extended_thinking=False,
@@ -51,6 +52,7 @@ class DIALModelProvider(OpenAICompatibleProvider):
provider=ProviderType.DIAL,
model_name="o4-mini-2025-04-16",
friendly_name="DIAL (O4-mini)",
intelligence_score=11,
context_window=200_000,
max_output_tokens=100_000,
supports_extended_thinking=False,
@@ -69,6 +71,7 @@ class DIALModelProvider(OpenAICompatibleProvider):
provider=ProviderType.DIAL,
model_name="anthropic.claude-sonnet-4.1-20250805-v1:0",
friendly_name="DIAL (Sonnet 4.1)",
intelligence_score=10,
context_window=200_000,
max_output_tokens=64_000,
supports_extended_thinking=False,
@@ -87,6 +90,7 @@ class DIALModelProvider(OpenAICompatibleProvider):
provider=ProviderType.DIAL,
model_name="anthropic.claude-sonnet-4.1-20250805-v1:0-with-thinking",
friendly_name="DIAL (Sonnet 4.1 Thinking)",
intelligence_score=11,
context_window=200_000,
max_output_tokens=64_000,
supports_extended_thinking=True, # Thinking mode variant
@@ -105,6 +109,7 @@ class DIALModelProvider(OpenAICompatibleProvider):
provider=ProviderType.DIAL,
model_name="anthropic.claude-opus-4.1-20250805-v1:0",
friendly_name="DIAL (Opus 4.1)",
intelligence_score=14,
context_window=200_000,
max_output_tokens=64_000,
supports_extended_thinking=False,
@@ -123,6 +128,7 @@ class DIALModelProvider(OpenAICompatibleProvider):
provider=ProviderType.DIAL,
model_name="anthropic.claude-opus-4.1-20250805-v1:0-with-thinking",
friendly_name="DIAL (Opus 4.1 Thinking)",
intelligence_score=15,
context_window=200_000,
max_output_tokens=64_000,
supports_extended_thinking=True, # Thinking mode variant
@@ -141,6 +147,7 @@ class DIALModelProvider(OpenAICompatibleProvider):
provider=ProviderType.DIAL,
model_name="gemini-2.5-pro-preview-03-25-google-search",
friendly_name="DIAL (Gemini 2.5 Pro Search)",
intelligence_score=17,
context_window=1_000_000,
max_output_tokens=65_536,
supports_extended_thinking=False, # DIAL doesn't expose thinking mode
@@ -159,6 +166,7 @@ class DIALModelProvider(OpenAICompatibleProvider):
provider=ProviderType.DIAL,
model_name="gemini-2.5-pro-preview-05-06",
friendly_name="DIAL (Gemini 2.5 Pro)",
intelligence_score=18,
context_window=1_000_000,
max_output_tokens=65_536,
supports_extended_thinking=False,
@@ -177,6 +185,7 @@ class DIALModelProvider(OpenAICompatibleProvider):
provider=ProviderType.DIAL,
model_name="gemini-2.5-flash-preview-05-20",
friendly_name="DIAL (Gemini Flash 2.5)",
intelligence_score=10,
context_window=1_000_000,
max_output_tokens=65_536,
supports_extended_thinking=False,

View File

@@ -33,6 +33,7 @@ class GeminiModelProvider(ModelProvider):
provider=ProviderType.GOOGLE,
model_name="gemini-2.5-pro",
friendly_name="Gemini (Pro 2.5)",
intelligence_score=18,
context_window=1_048_576, # 1M tokens
max_output_tokens=65_536,
supports_extended_thinking=True,
@@ -52,6 +53,7 @@ class GeminiModelProvider(ModelProvider):
provider=ProviderType.GOOGLE,
model_name="gemini-2.0-flash",
friendly_name="Gemini (Flash 2.0)",
intelligence_score=9,
context_window=1_048_576, # 1M tokens
max_output_tokens=65_536,
supports_extended_thinking=True, # Experimental thinking mode
@@ -71,6 +73,7 @@ class GeminiModelProvider(ModelProvider):
provider=ProviderType.GOOGLE,
model_name="gemini-2.0-flash-lite",
friendly_name="Gemin (Flash Lite 2.0)",
intelligence_score=7,
context_window=1_048_576, # 1M tokens
max_output_tokens=65_536,
supports_extended_thinking=False, # Not supported per user request
@@ -89,6 +92,7 @@ class GeminiModelProvider(ModelProvider):
provider=ProviderType.GOOGLE,
model_name="gemini-2.5-flash",
friendly_name="Gemini (Flash 2.5)",
intelligence_score=10,
context_window=1_048_576, # 1M tokens
max_output_tokens=65_536,
supports_extended_thinking=True,

View File

@@ -26,6 +26,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
provider=ProviderType.OPENAI,
model_name="gpt-5",
friendly_name="OpenAI (GPT-5)",
intelligence_score=16,
context_window=400_000, # 400K tokens
max_output_tokens=128_000, # 128K max output tokens
supports_extended_thinking=True, # Supports reasoning tokens
@@ -44,6 +45,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
provider=ProviderType.OPENAI,
model_name="gpt-5-mini",
friendly_name="OpenAI (GPT-5-mini)",
intelligence_score=15,
context_window=400_000, # 400K tokens
max_output_tokens=128_000, # 128K max output tokens
supports_extended_thinking=True, # Supports reasoning tokens
@@ -62,6 +64,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
provider=ProviderType.OPENAI,
model_name="gpt-5-nano",
friendly_name="OpenAI (GPT-5 nano)",
intelligence_score=13,
context_window=400_000,
max_output_tokens=128_000,
supports_extended_thinking=True,
@@ -80,6 +83,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
provider=ProviderType.OPENAI,
model_name="o3",
friendly_name="OpenAI (O3)",
intelligence_score=14,
context_window=200_000, # 200K tokens
max_output_tokens=65536, # 64K max output tokens
supports_extended_thinking=False,
@@ -98,6 +102,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
provider=ProviderType.OPENAI,
model_name="o3-mini",
friendly_name="OpenAI (O3-mini)",
intelligence_score=12,
context_window=200_000, # 200K tokens
max_output_tokens=65536, # 64K max output tokens
supports_extended_thinking=False,
@@ -116,6 +121,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
provider=ProviderType.OPENAI,
model_name="o3-pro",
friendly_name="OpenAI (O3-Pro)",
intelligence_score=15,
context_window=200_000, # 200K tokens
max_output_tokens=65536, # 64K max output tokens
supports_extended_thinking=False,
@@ -134,6 +140,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
provider=ProviderType.OPENAI,
model_name="o4-mini",
friendly_name="OpenAI (O4-mini)",
intelligence_score=11,
context_window=200_000, # 200K tokens
max_output_tokens=65536, # 64K max output tokens
supports_extended_thinking=False,
@@ -152,6 +159,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
provider=ProviderType.OPENAI,
model_name="gpt-4.1",
friendly_name="OpenAI (GPT 4.1)",
intelligence_score=13,
context_window=1_000_000, # 1M tokens
max_output_tokens=32_768,
supports_extended_thinking=False,

View File

@@ -85,6 +85,7 @@ class OpenRouterProvider(OpenAICompatibleProvider):
provider=ProviderType.OPENROUTER,
model_name=canonical_name,
friendly_name=self.FRIENDLY_NAME,
intelligence_score=9,
context_window=32_768,
max_output_tokens=32_768,
supports_extended_thinking=False,

View File

@@ -1,5 +1,6 @@
"""Dataclass describing the feature set of a model exposed by a provider."""
import math
from dataclasses import dataclass, field
from typing import Optional
@@ -32,6 +33,7 @@ class ModelCapabilities:
provider: ProviderType
model_name: str
friendly_name: str
intelligence_score: int = 10 # Human-curated 1-20 score reflecting general capability
description: str = ""
aliases: list[str] = field(default_factory=list)
@@ -69,6 +71,42 @@ class ModelCapabilities:
return self.temperature_constraint.get_corrected_value(requested_temperature)
def get_effective_capability_rank(self) -> int:
"""Calculate the runtime capability rank from intelligence + capabilities."""
# Human signal drives the baseline (1-20 → 5-100 after scaling)
base_intelligence = self.intelligence_score if self.intelligence_score else 10
base_intelligence = max(1, min(20, base_intelligence))
score = base_intelligence * 5
# Context window bonus with gentle diminishing returns
ctx_bonus = 0
ctx = max(self.context_window, 0)
if ctx > 0:
ctx_bonus = int(min(5, max(0.0, math.log10(ctx) - 3)))
score += ctx_bonus
# Output token capacity adds a small bonus
if self.max_output_tokens >= 65_000:
score += 2
elif self.max_output_tokens >= 32_000:
score += 1
# Feature-level boosts
if self.supports_extended_thinking:
score += 3
if self.supports_function_calling:
score += 1
if self.supports_json_mode:
score += 1
if self.supports_images:
score += 1
if self.is_custom:
score -= 1
return max(0, min(100, score))
@staticmethod
def collect_aliases(model_configs: dict[str, "ModelCapabilities"]) -> dict[str, list[str]]:
"""Build a mapping of model name to aliases from capability configs."""
@@ -112,7 +150,13 @@ class ModelCapabilities:
formatted_names.append(formatted)
for base_model, capabilities in model_configs.items():
# Sort models by capability rank (descending) then by name for deterministic ordering
sorted_items = sorted(
model_configs.items(),
key=lambda item: (-item[1].get_effective_capability_rank(), item[0]),
)
for base_model, capabilities in sorted_items:
append_name(base_model)
if include_aliases and capabilities.aliases:

View File

@@ -27,6 +27,7 @@ class XAIModelProvider(OpenAICompatibleProvider):
provider=ProviderType.XAI,
model_name="grok-4",
friendly_name="X.AI (Grok 4)",
intelligence_score=16,
context_window=256_000, # 256K tokens
max_output_tokens=256_000, # 256K tokens max output
supports_extended_thinking=True, # Grok-4 supports reasoning mode
@@ -45,6 +46,7 @@ class XAIModelProvider(OpenAICompatibleProvider):
provider=ProviderType.XAI,
model_name="grok-3",
friendly_name="X.AI (Grok 3)",
intelligence_score=13,
context_window=131_072, # 131K tokens
max_output_tokens=131072,
supports_extended_thinking=False,
@@ -63,6 +65,7 @@ class XAIModelProvider(OpenAICompatibleProvider):
provider=ProviderType.XAI,
model_name="grok-3-fast",
friendly_name="X.AI (Grok 3 Fast)",
intelligence_score=12,
context_window=131_072, # 131K tokens
max_output_tokens=131072,
supports_extended_thinking=False,

View File

@@ -122,8 +122,8 @@ class TestConsensusTool:
# relevant_files should be present as it's used by consensus
assert "relevant_files" in schema["properties"]
# model field should be present for Gemini compatibility (consensus uses 'models' as well)
assert "model" in schema["properties"]
# model field should NOT be present as consensus uses 'models' field instead
assert "model" not in schema["properties"]
# Verify workflow fields that should NOT be present
assert "files_checked" not in schema["properties"]

View File

@@ -26,6 +26,7 @@ from utils.conversation_memory import (
get_conversation_image_list,
get_thread,
)
from utils.model_context import ModelContext
@pytest.mark.no_mock_provider
@@ -180,17 +181,18 @@ class TestImageSupportIntegration:
try:
# Test with an invalid model name that doesn't exist in any provider
result = tool._validate_image_limits(small_images, "non-existent-model-12345")
# Use model_context parameter name (not positional)
result = tool._validate_image_limits(small_images, model_context=ModelContext("non-existent-model-12345"))
# Should return error because model not available or doesn't support images
assert result is not None
assert result["status"] == "error"
assert "is not available" in result["content"] or "does not support image processing" in result["content"]
# Test that empty/None images always pass regardless of model
result = tool._validate_image_limits([], "any-model")
result = tool._validate_image_limits([], model_context=ModelContext("gemini-2.5-pro"))
assert result is None
result = tool._validate_image_limits(None, "any-model")
result = tool._validate_image_limits(None, model_context=ModelContext("gemini-2.5-pro"))
assert result is None
finally:
@@ -215,7 +217,7 @@ class TestImageSupportIntegration:
small_image_path = temp_file.name
# Test with the default model from test environment (gemini-2.5-flash)
result = tool._validate_image_limits([small_image_path], "gemini-2.5-flash")
result = tool._validate_image_limits([small_image_path], ModelContext("gemini-2.5-flash"))
assert result is None # Should pass for Gemini models
# Create 150MB image (over typical limits)
@@ -223,7 +225,7 @@ class TestImageSupportIntegration:
temp_file.write(b"\x00" * (150 * 1024 * 1024)) # 150MB
large_image_path = temp_file.name
result = tool._validate_image_limits([large_image_path], "gemini-2.5-flash")
result = tool._validate_image_limits([large_image_path], ModelContext("gemini-2.5-flash"))
# Large images should fail validation
assert result is not None
assert result["status"] == "error"
@@ -429,14 +431,14 @@ class TestImageSupportIntegration:
images = [data_url]
# Test with a dummy model that doesn't exist in any provider
result = tool._validate_image_limits(images, "test-dummy-model-name")
result = tool._validate_image_limits(images, ModelContext("test-dummy-model-name"))
# Should return error because model not available or doesn't support images
assert result is not None
assert result["status"] == "error"
assert "is not available" in result["content"] or "does not support image processing" in result["content"]
# Test with another non-existent model to check error handling
result = tool._validate_image_limits(images, "another-dummy-model")
result = tool._validate_image_limits(images, ModelContext("another-dummy-model"))
# Should return error because model not available
assert result is not None
assert result["status"] == "error"
@@ -446,11 +448,11 @@ class TestImageSupportIntegration:
tool = ChatTool()
# Empty list should not fail validation (no need for provider setup)
result = tool._validate_image_limits([], "test_model")
result = tool._validate_image_limits([], ModelContext("gemini-2.5-pro"))
assert result is None
# None should not fail validation (no need for provider setup)
result = tool._validate_image_limits(None, "test_model")
result = tool._validate_image_limits(None, ModelContext("gemini-2.5-pro"))
assert result is None
@patch("utils.conversation_memory.get_storage")

View File

@@ -70,11 +70,25 @@ class TestListModelsRestrictions(unittest.TestCase):
config = MagicMock()
config.model_name = "anthropic/claude-opus-4-20240229"
config.context_window = 200000
config.get_effective_capability_rank.return_value = 90 # High rank for Opus
return config
elif "sonnet" in model_name.lower():
config = MagicMock()
config.model_name = "anthropic/claude-sonnet-4-20240229"
config.context_window = 200000
config.get_effective_capability_rank.return_value = 80 # Lower rank for Sonnet
return config
elif "deepseek" in model_name.lower():
config = MagicMock()
config.model_name = "deepseek/deepseek-r1-0528:free"
config.context_window = 100000
config.get_effective_capability_rank.return_value = 70
return config
elif "qwen" in model_name.lower():
config = MagicMock()
config.model_name = "qwen/qwen3-235b-a22b-04-28:free"
config.context_window = 100000
config.get_effective_capability_rank.return_value = 60
return config
return None # No config for models without aliases
@@ -90,6 +104,9 @@ class TestListModelsRestrictions(unittest.TestCase):
mock_get_provider.side_effect = get_provider_side_effect
# Ensure registry is cleared before test
ModelProviderRegistry._registry = {}
# Mock available models
mock_get_models.return_value = {
"gemini-2.5-flash": ProviderType.GOOGLE,
@@ -131,6 +148,9 @@ class TestListModelsRestrictions(unittest.TestCase):
# Parse the output
lines = result.split("\n")
# Debug: print the actual result for troubleshooting
# print(f"DEBUG: Full result:\n{result}")
# Check that OpenRouter section exists
openrouter_section_found = False
openrouter_models = []
@@ -141,15 +161,18 @@ class TestListModelsRestrictions(unittest.TestCase):
openrouter_section_found = True
elif "Available Models" in line and openrouter_section_found:
in_openrouter_section = True
elif in_openrouter_section and line.strip().startswith("- "):
# Extract model name from various line formats:
# - `model-name` → `full-name` (context)
# - `model-name`
line_content = line.strip()[2:] # Remove "- "
if "`" in line_content:
# Extract content between first pair of backticks
model_name = line_content.split("`")[1]
openrouter_models.append(model_name)
elif in_openrouter_section:
# Check for lines with model names in backticks
# Format: - `model-name` (score X)
if line.strip().startswith("- ") and "`" in line:
# Extract model name between backticks
parts = line.split("`")
if len(parts) >= 2:
model_name = parts[1]
openrouter_models.append(model_name)
# Stop parsing when we hit the next section
elif "##" in line and in_openrouter_section:
break
self.assertTrue(openrouter_section_found, "OpenRouter section not found")
self.assertEqual(

View File

@@ -174,6 +174,7 @@ class TestOpenRouterAutoMode:
mock_config = Mock()
mock_config.is_custom = False
mock_config.aliases = [] # Empty list of aliases
mock_config.get_effective_capability_rank = Mock(return_value=50) # Add ranking method
return mock_config
return None
@@ -220,6 +221,7 @@ class TestOpenRouterAutoMode:
# Mock the resolve method to return model configs with aliases
mock_model_config = Mock()
mock_model_config.aliases = [] # Empty aliases for simplicity
mock_model_config.get_effective_capability_rank = Mock(return_value=50) # Add ranking method
mock_registry.resolve.return_value = mock_model_config
ModelProviderRegistry.register_provider(ProviderType.OPENROUTER, OpenRouterProvider)

View File

@@ -48,8 +48,9 @@ CONSENSUS_WORKFLOW_FIELD_DESCRIPTIONS = {
),
"relevant_files": "Optional supporting files that help the consensus analysis. Must be absolute full, non-abbreviated paths.",
"models": (
"List of models to consult. Each entry may include model, stance (for/against/neutral), and stance_prompt. "
"Each (model, stance) pair must be unique, e.g. [{'model':'o3','stance':'for'}, {'model':'o3','stance':'against'}]."
"User-specified list of models to consult (provide at least two entries). "
"Each entry may include model, stance (for/against/neutral), and stance_prompt. "
"Each (model, stance) pair must be unique, e.g. [{'model':'gpt5','stance':'for'}, {'model':'pro','stance':'against'}]."
),
"current_model_index": "0-based index of the next model to consult (managed internally).",
"model_responses": "Internal log of responses gathered so far.",
@@ -233,7 +234,11 @@ of the evidence, even when it strongly points in one direction.""",
},
"required": ["model"],
},
"description": CONSENSUS_WORKFLOW_FIELD_DESCRIPTIONS["models"],
"description": (
"User-specified roster of models to consult (provide at least two entries). "
+ CONSENSUS_WORKFLOW_FIELD_DESCRIPTIONS["models"]
),
"minItems": 2,
},
"current_model_index": {
"type": "integer",
@@ -268,17 +273,19 @@ of the evidence, even when it strongly points in one direction.""",
"thinking_mode", # Not used in consensus workflow
]
# Build schema with proper field exclusion
# Include model field for compatibility but don't require it
schema = WorkflowSchemaBuilder.build_schema(
requires_model = self.requires_model()
model_field_schema = self.get_model_field_schema() if requires_model else None
auto_mode = self.is_effective_auto_mode() if requires_model else False
return WorkflowSchemaBuilder.build_schema(
tool_specific_fields=consensus_field_overrides,
model_field_schema=self.get_model_field_schema(),
auto_mode=False, # Consensus doesn't require model at MCP boundary
model_field_schema=model_field_schema,
auto_mode=auto_mode,
tool_name=self.get_name(),
excluded_workflow_fields=excluded_workflow_fields,
excluded_common_fields=excluded_common_fields,
require_model=requires_model,
)
return schema
def get_required_actions(
self, step_number: int, confidence: str, findings: str, total_steps: int, request=None

View File

@@ -40,8 +40,9 @@ class ListModelsTool(BaseTool):
"""Return the JSON schema for the tool's input"""
return {
"type": "object",
"properties": {"model": {"type": "string", "description": "Model to use (ignored by listmodels tool)"}},
"properties": {},
"required": [],
"additionalProperties": False,
}
def get_annotations(self) -> Optional[dict[str, Any]]:
@@ -106,7 +107,7 @@ class ListModelsTool(BaseTool):
output_lines.append("\n**Models**:")
aliases = []
for model_name, capabilities in provider.get_all_model_capabilities().items():
for model_name, capabilities in provider.get_capabilities_by_rank():
description = capabilities.description or "No description available"
context_window = capabilities.context_window
@@ -153,33 +154,44 @@ class ListModelsTool(BaseTool):
available_models = provider.list_models(respect_restrictions=True)
registry = OpenRouterModelRegistry()
# Group by provider for better organization
providers_models = {}
for model_name in available_models: # Show ALL available models
# Try to resolve to get config details
# Group by provider and retain ranking information for consistent ordering
providers_models: dict[str, list[tuple[int, str, Optional[Any]]]] = {}
def _format_context(tokens: int) -> str:
if not tokens:
return "?"
if tokens >= 1_000_000:
return f"{tokens // 1_000_000}M"
if tokens >= 1_000:
return f"{tokens // 1_000}K"
return str(tokens)
for model_name in available_models:
config = registry.resolve(model_name)
if config:
# Extract provider from model_name
provider_name = config.model_name.split("/")[0] if "/" in config.model_name else "other"
if provider_name not in providers_models:
providers_models[provider_name] = []
providers_models[provider_name].append((model_name, config))
else:
# Model without config - add with basic info
provider_name = model_name.split("/")[0] if "/" in model_name else "other"
if provider_name not in providers_models:
providers_models[provider_name] = []
providers_models[provider_name].append((model_name, None))
provider_name = "other"
if config and "/" in config.model_name:
provider_name = config.model_name.split("/")[0]
elif "/" in model_name:
provider_name = model_name.split("/")[0]
providers_models.setdefault(provider_name, [])
rank = config.get_effective_capability_rank() if config else 0
providers_models[provider_name].append((rank, model_name, config))
output_lines.append("\n**Available Models**:")
for provider_name, models in sorted(providers_models.items()):
output_lines.append(f"\n*{provider_name.title()}:*")
for alias, config in models: # Show ALL models from each provider
for rank, alias, config in sorted(models, key=lambda item: (-item[0], item[1])):
if config:
context_str = f"{config.context_window // 1000}K" if config.context_window else "?"
output_lines.append(f"- `{alias}` → `{config.model_name}` ({context_str} context)")
context_str = _format_context(config.context_window)
suffix_parts = [f"{context_str} context"]
if getattr(config, "supports_extended_thinking", False):
suffix_parts.append("thinking")
suffix = ", ".join(suffix_parts)
output_lines.append(f"- `{alias}` → `{config.model_name}` (score {rank}, {suffix})")
else:
output_lines.append(f"- `{alias}`")
output_lines.append(f"- `{alias}` (score {rank})")
total_models = len(available_models)
# Show all models - no truncation message needed

View File

@@ -291,13 +291,161 @@ class BaseTool(ABC):
def _format_available_models_list(self) -> str:
"""Return a human-friendly list of available models or guidance when none found."""
available_models = self._get_available_models()
if not available_models:
summaries, total, has_restrictions = self._get_ranked_model_summaries()
if not summaries:
return (
"No models detected. Configure provider credentials or set DEFAULT_MODEL to a valid option. "
"If the user requested a specific model, respond with this notice instead of substituting another model."
)
return ", ".join(available_models)
display = "; ".join(summaries)
remainder = total - len(summaries)
if remainder > 0:
display = f"{display}; +{remainder} more (use the `listmodels` tool for the full roster)"
return display
@staticmethod
def _format_context_window(tokens: int) -> Optional[str]:
"""Convert a raw context window into a short display string."""
if not tokens or tokens <= 0:
return None
if tokens >= 1_000_000:
if tokens % 1_000_000 == 0:
return f"{tokens // 1_000_000}M ctx"
return f"{tokens / 1_000_000:.1f}M ctx"
if tokens >= 1_000:
if tokens % 1_000 == 0:
return f"{tokens // 1_000}K ctx"
return f"{tokens / 1_000:.1f}K ctx"
return f"{tokens} ctx"
def _collect_ranked_capabilities(self) -> list[tuple[int, str, Any]]:
"""Gather available model capabilities sorted by capability rank."""
from providers.registry import ModelProviderRegistry
ranked: list[tuple[int, str, Any]] = []
available = ModelProviderRegistry.get_available_models(respect_restrictions=True)
for model_name, provider_type in available.items():
provider = ModelProviderRegistry.get_provider(provider_type)
if not provider:
continue
try:
capabilities = provider.get_capabilities(model_name)
except ValueError:
continue
rank = capabilities.get_effective_capability_rank()
ranked.append((rank, model_name, capabilities))
ranked.sort(key=lambda item: (-item[0], item[1]))
return ranked
@staticmethod
def _normalize_model_identifier(name: str) -> str:
"""Normalize model names for deduplication across providers."""
normalized = name.lower()
if ":" in normalized:
normalized = normalized.split(":", 1)[0]
if "/" in normalized:
normalized = normalized.split("/", 1)[-1]
return normalized
def _get_ranked_model_summaries(self, limit: int = 5) -> tuple[list[str], int, bool]:
"""Return formatted, ranked model summaries and restriction status."""
ranked = self._collect_ranked_capabilities()
# Build allowlist map (provider -> lowercase names) when restrictions are active
allowed_map: dict[Any, set[str]] = {}
try:
from utils.model_restrictions import get_restriction_service
restriction_service = get_restriction_service()
if restriction_service:
from providers.shared import ProviderType
for provider_type in ProviderType:
allowed = restriction_service.get_allowed_models(provider_type)
if allowed:
allowed_map[provider_type] = {name.lower() for name in allowed if name}
except Exception:
allowed_map = {}
filtered: list[tuple[int, str, Any]] = []
seen_normalized: set[str] = set()
for rank, model_name, capabilities in ranked:
canonical_name = getattr(capabilities, "model_name", model_name)
canonical_lower = canonical_name.lower()
alias_lower = model_name.lower()
provider_type = getattr(capabilities, "provider", None)
if allowed_map:
if provider_type not in allowed_map:
continue
allowed_set = allowed_map[provider_type]
if canonical_lower not in allowed_set and alias_lower not in allowed_set:
continue
normalized = self._normalize_model_identifier(canonical_name)
if normalized in seen_normalized:
continue
seen_normalized.add(normalized)
filtered.append((rank, canonical_name, capabilities))
summaries: list[str] = []
for rank, canonical_name, capabilities in filtered[:limit]:
details: list[str] = []
context_str = self._format_context_window(getattr(capabilities, "context_window", 0))
if context_str:
details.append(context_str)
if getattr(capabilities, "supports_extended_thinking", False):
details.append("thinking")
base = f"{canonical_name} (score {rank}"
if details:
base = f"{base}, {', '.join(details)}"
summaries.append(f"{base})")
return summaries, len(filtered), bool(allowed_map)
def _get_restriction_note(self) -> Optional[str]:
"""Return a string describing active per-provider allowlists, if any."""
env_labels = {
"OPENAI_ALLOWED_MODELS": "OpenAI",
"GOOGLE_ALLOWED_MODELS": "Google",
"XAI_ALLOWED_MODELS": "X.AI",
"OPENROUTER_ALLOWED_MODELS": "OpenRouter",
"DIAL_ALLOWED_MODELS": "DIAL",
}
notes: list[str] = []
for env_var, label in env_labels.items():
raw = os.getenv(env_var)
if not raw:
continue
models = sorted({token.strip() for token in raw.split(",") if token.strip()})
if not models:
continue
notes.append(f"{label}: {', '.join(models)}")
if not notes:
return None
return "Policy allows only → " + "; ".join(notes)
def _build_model_unavailable_message(self, model_name: str) -> str:
"""Compose a consistent error message for unavailable model scenarios."""
@@ -344,8 +492,23 @@ class BaseTool(ABC):
if self.is_effective_auto_mode():
description = (
"Currently in auto model selection mode. CRITICAL: When the user names a model, you MUST use that exact name unless the server rejects it. "
"If no model is provided, you may call the `listmodels` tool to review options and select an appropriate match."
"If no model is provided, you may use the `listmodels` tool to review options and select an appropriate match."
)
summaries, total, restricted = self._get_ranked_model_summaries()
remainder = max(0, total - len(summaries))
if summaries:
top_line = "; ".join(summaries)
if remainder > 0:
label = "Allowed models" if restricted else "Top models"
top_line = f"{label}: {top_line}; +{remainder} more via `listmodels`."
else:
label = "Allowed models" if restricted else "Top models"
top_line = f"{label}: {top_line}."
description = f"{description} {top_line}"
restriction_note = self._get_restriction_note()
if restriction_note and (remainder > 0 or not summaries):
description = f"{description} {restriction_note}."
return {
"type": "string",
"description": description,
@@ -353,8 +516,23 @@ class BaseTool(ABC):
description = (
f"The default model is '{DEFAULT_MODEL}'. Override only when the user explicitly requests a different model, and use that exact name. "
"If the requested model fails validation, surface the server error instead of substituting another model. When unsure, call the `listmodels` tool for details."
"If the requested model fails validation, surface the server error instead of substituting another model. When unsure, use the `listmodels` tool for details."
)
summaries, total, restricted = self._get_ranked_model_summaries()
remainder = max(0, total - len(summaries))
if summaries:
top_line = "; ".join(summaries)
if remainder > 0:
label = "Allowed models" if restricted else "Preferred alternatives"
top_line = f"{label}: {top_line}; +{remainder} more via `listmodels`."
else:
label = "Allowed models" if restricted else "Preferred alternatives"
top_line = f"{label}: {top_line}."
description = f"{description} {top_line}"
restriction_note = self._get_restriction_note()
if restriction_note and (remainder > 0 or not summaries):
description = f"{description} {restriction_note}."
return {
"type": "string",
@@ -1242,31 +1420,6 @@ When recommending searches, be specific about what information you need and why
import base64
from pathlib import Path
# Handle legacy calls (positional model_name string)
if isinstance(model_context, str):
# Legacy call: _validate_image_limits(images, "model-name")
logger.warning(
"Legacy _validate_image_limits call with model_name string. Use model_context object instead."
)
try:
from utils.model_context import ModelContext
model_context = ModelContext(model_context)
except Exception as e:
logger.warning(f"Failed to create model context from legacy model_name: {e}")
# Generic error response for any unavailable model
return {
"status": "error",
"content": self._build_model_unavailable_message(str(model_context)),
"content_type": "text",
"metadata": {
"error_type": "validation_error",
"model_name": model_context,
"supports_images": None, # Unknown since model doesn't exist
"image_count": len(images) if images else 0,
},
}
if not model_context:
# Get from tool's stored context as fallback
model_context = getattr(self, "_model_context", None)

View File

@@ -146,8 +146,9 @@ class VersionTool(BaseTool):
"""Return the JSON schema for the tool's input"""
return {
"type": "object",
"properties": {"model": {"type": "string", "description": "Model to use (ignored by version tool)"}},
"properties": {},
"required": [],
"additionalProperties": False,
}
def get_annotations(self) -> Optional[dict[str, Any]]:

View File

@@ -139,12 +139,16 @@ class WorkflowTool(BaseTool, BaseWorkflowMixin):
Returns:
Complete JSON schema for the workflow tool
"""
requires_model = self.requires_model()
model_field_schema = self.get_model_field_schema() if requires_model else None
auto_mode = self.is_effective_auto_mode() if requires_model else False
return WorkflowSchemaBuilder.build_schema(
tool_specific_fields=self.get_tool_fields(),
required_fields=self.get_required_fields(),
model_field_schema=self.get_model_field_schema(),
auto_mode=self.is_effective_auto_mode(),
model_field_schema=model_field_schema,
auto_mode=auto_mode,
tool_name=self.get_name(),
require_model=requires_model,
)
def get_workflow_request_model(self):