diff --git a/config.py b/config.py
index cebc4ab..99e0a75 100644
--- a/config.py
+++ b/config.py
@@ -47,13 +47,6 @@ MODEL_CAPABILITIES_DESC = {
 # - "o3" → "openai/gpt-4o"
 # - "o3-mini" → "openai/gpt-4o-mini"
 
-# Token allocation for Gemini Pro (1M total capacity)
-# MAX_CONTEXT_TOKENS: Total model capacity
-# MAX_CONTENT_TOKENS: Available for prompts, conversation history, and files
-# RESPONSE_RESERVE_TOKENS: Reserved for model response generation
-MAX_CONTEXT_TOKENS = 1_000_000  # 1M tokens total capacity for Gemini Pro
-MAX_CONTENT_TOKENS = 800_000  # 800K tokens for content (prompts + files + history)
-RESPONSE_RESERVE_TOKENS = 200_000  # 200K tokens reserved for response generation
 
 # Temperature defaults for different tool types
 # Temperature controls the randomness/creativity of model responses
diff --git a/providers/base.py b/providers/base.py
index 0908fd1..5ef1c25 100644
--- a/providers/base.py
+++ b/providers/base.py
@@ -105,7 +105,7 @@ class ModelCapabilities:
     provider: ProviderType
     model_name: str
     friendly_name: str  # Human-friendly name like "Gemini" or "OpenAI"
-    max_tokens: int
+    context_window: int  # Total context window size in tokens
     supports_extended_thinking: bool = False
     supports_system_prompts: bool = True
     supports_streaming: bool = True
diff --git a/providers/gemini.py b/providers/gemini.py
index 5fe435e..588ad2b 100644
--- a/providers/gemini.py
+++ b/providers/gemini.py
@@ -14,12 +14,12 @@ class GeminiModelProvider(ModelProvider):
     # Model configurations
     SUPPORTED_MODELS = {
         "gemini-2.5-flash-preview-05-20": {
-            "max_tokens": 1_048_576,  # 1M tokens
+            "context_window": 1_048_576,  # 1M tokens
             "supports_extended_thinking": True,
             "max_thinking_tokens": 24576,  # Flash 2.5 thinking budget limit
         },
         "gemini-2.5-pro-preview-06-05": {
-            "max_tokens": 1_048_576,  # 1M tokens
+            "context_window": 1_048_576,  # 1M tokens
             "supports_extended_thinking": True,
             "max_thinking_tokens": 32768,  # Pro 2.5 thinking budget limit
         },
@@ -68,7 +68,7 @@ class GeminiModelProvider(ModelProvider):
             provider=ProviderType.GOOGLE,
             model_name=resolved_name,
             friendly_name="Gemini",
-            max_tokens=config["max_tokens"],
+            context_window=config["context_window"],
             supports_extended_thinking=config["supports_extended_thinking"],
             supports_system_prompts=True,
             supports_streaming=True,
diff --git a/providers/openai.py b/providers/openai.py
index e1875de..9284ff0 100644
--- a/providers/openai.py
+++ b/providers/openai.py
@@ -15,11 +15,11 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
     # Model configurations
     SUPPORTED_MODELS = {
         "o3": {
-            "max_tokens": 200_000,  # 200K tokens
+            "context_window": 200_000,  # 200K tokens
             "supports_extended_thinking": False,
         },
         "o3-mini": {
-            "max_tokens": 200_000,  # 200K tokens
+            "context_window": 200_000,  # 200K tokens
             "supports_extended_thinking": False,
         },
     }
@@ -49,7 +49,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
             provider=ProviderType.OPENAI,
             model_name=model_name,
             friendly_name="OpenAI",
-            max_tokens=config["max_tokens"],
+            context_window=config["context_window"],
             supports_extended_thinking=config["supports_extended_thinking"],
             supports_system_prompts=True,
             supports_streaming=True,
diff --git a/providers/openrouter.py b/providers/openrouter.py
index e82d258..fb55bc9 100644
--- a/providers/openrouter.py
+++ b/providers/openrouter.py
@@ -109,7 +109,7 @@ class OpenRouterProvider(OpenAICompatibleProvider):
             provider=ProviderType.OPENROUTER,
             model_name=resolved_name,
             friendly_name=self.FRIENDLY_NAME,
-            max_tokens=32_768,  # Conservative default context window
+            context_window=32_768,  # Conservative default context window
             supports_extended_thinking=False,
             supports_system_prompts=True,
             supports_streaming=True,
diff --git a/providers/openrouter_registry.py b/providers/openrouter_registry.py
index 2172fcb..fa3f246 100644
--- a/providers/openrouter_registry.py
+++ b/providers/openrouter_registry.py
@@ -30,7 +30,7 @@ class OpenRouterModelConfig:
             provider=ProviderType.OPENROUTER,
             model_name=self.model_name,
             friendly_name="OpenRouter",
-            max_tokens=self.context_window,  # ModelCapabilities still uses max_tokens
+            context_window=self.context_window,
             supports_extended_thinking=self.supports_extended_thinking,
             supports_system_prompts=self.supports_system_prompts,
             supports_streaming=self.supports_streaming,
@@ -103,10 +103,6 @@ class OpenRouterModelRegistry:
         # Parse models
         configs = []
         for model_data in data.get("models", []):
-            # Handle backwards compatibility - rename max_tokens to context_window
-            if "max_tokens" in model_data and "context_window" not in model_data:
-                model_data["context_window"] = model_data.pop("max_tokens")
-
             config = OpenRouterModelConfig(**model_data)
             configs.append(config)
 
diff --git a/server.py b/server.py
index 669145c..64b475b 100644
--- a/server.py
+++ b/server.py
@@ -33,7 +33,6 @@ from mcp.types import ServerCapabilities, TextContent, Tool, ToolsCapability
 
 from config import (
     DEFAULT_MODEL,
-    MAX_CONTEXT_TOKENS,
     __author__,
     __updated__,
     __version__,
@@ -521,7 +520,7 @@ async def handle_get_version() -> list[TextContent]:
         "author": __author__,
         "default_model": DEFAULT_MODEL,
         "default_thinking_mode_thinkdeep": DEFAULT_THINKING_MODE_THINKDEEP,
-        "max_context_tokens": f"{MAX_CONTEXT_TOKENS:,}",
+        "max_context_tokens": "Dynamic (model-specific)",
         "python_version": f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}",
         "server_started": datetime.now().isoformat(),
         "available_tools": list(TOOLS.keys()) + ["get_version"],
@@ -547,7 +546,7 @@ Author: {__author__}
 
 Configuration:
 - Default Model: {DEFAULT_MODEL}
 - Default Thinking Mode (ThinkDeep): {DEFAULT_THINKING_MODE_THINKDEEP}
-- Max Context: {MAX_CONTEXT_TOKENS:,} tokens
+- Max Context: Dynamic (model-specific)
 - Python: {version_info["python_version"]}
 - Started: {version_info["server_started"]}
diff --git a/tests/test_config.py b/tests/test_config.py
index 6220226..e6ad23b 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -4,7 +4,6 @@ Tests for configuration
 
 from config import (
     DEFAULT_MODEL,
-    MAX_CONTEXT_TOKENS,
     TEMPERATURE_ANALYTICAL,
     TEMPERATURE_BALANCED,
     TEMPERATURE_CREATIVE,
@@ -33,7 +32,6 @@ class TestConfig:
         """Test model configuration"""
         # DEFAULT_MODEL is set in conftest.py for tests
         assert DEFAULT_MODEL == "gemini-2.5-flash-preview-05-20"
-        assert MAX_CONTEXT_TOKENS == 1_000_000
 
     def test_temperature_defaults(self):
         """Test temperature constants"""
diff --git a/tools/base.py b/tools/base.py
index 28fc342..12b4812 100644
--- a/tools/base.py
+++ b/tools/base.py
@@ -22,7 +22,7 @@ from typing import Any, Literal, Optional
 from mcp.types import TextContent
 from pydantic import BaseModel, Field
 
-from config import MAX_CONTEXT_TOKENS, MCP_PROMPT_SIZE_LIMIT
+from config import MCP_PROMPT_SIZE_LIMIT
 from providers import ModelProvider, ModelProviderRegistry
 from utils import check_token_limit
 from utils.conversation_memory import (
@@ -414,7 +414,7 @@ class BaseTool(ABC):
             request_files: List of files requested for current tool execution
             continuation_id: Thread continuation ID, or None for new conversations
             context_description: Description for token limit validation (e.g. "Code", "New files")
-            max_tokens: Maximum tokens to use (defaults to remaining budget or MAX_CONTENT_TOKENS)
+            max_tokens: Maximum tokens to use (defaults to remaining budget or model-specific content allocation)
             reserve_tokens: Tokens to reserve for additional prompt content (default 1K)
             remaining_budget: Remaining token budget after conversation history (from server.py)
             arguments: Original tool arguments (used to extract _remaining_tokens if available)
@@ -473,17 +473,17 @@ class BaseTool(ABC):
                 capabilities = provider.get_capabilities(model_name)
 
                 # Calculate content allocation based on model capacity
-                if capabilities.max_tokens < 300_000:
+                if capabilities.context_window < 300_000:
                     # Smaller context models: 60% content, 40% response
-                    model_content_tokens = int(capabilities.max_tokens * 0.6)
+                    model_content_tokens = int(capabilities.context_window * 0.6)
                 else:
                     # Larger context models: 80% content, 20% response
-                    model_content_tokens = int(capabilities.max_tokens * 0.8)
+                    model_content_tokens = int(capabilities.context_window * 0.8)
 
                 effective_max_tokens = model_content_tokens - reserve_tokens
                 logger.debug(
                     f"[FILES] {self.name}: Using model-specific limit for {model_name}: "
-                    f"{model_content_tokens:,} content tokens from {capabilities.max_tokens:,} total"
+                    f"{model_content_tokens:,} content tokens from {capabilities.context_window:,} total"
                 )
             except (ValueError, AttributeError) as e:
                 # Handle specific errors: provider not found, model not supported, missing attributes
@@ -491,17 +491,13 @@ class BaseTool(ABC):
                     f"[FILES] {self.name}: Could not get model capabilities for {model_name}: {type(e).__name__}: {e}"
                 )
                 # Fall back to conservative default for safety
-                from config import MAX_CONTENT_TOKENS
-
-                effective_max_tokens = min(MAX_CONTENT_TOKENS, 100_000) - reserve_tokens
+                effective_max_tokens = 100_000 - reserve_tokens
             except Exception as e:
                 # Catch any other unexpected errors
                 logger.error(
                     f"[FILES] {self.name}: Unexpected error getting model capabilities: {type(e).__name__}: {e}"
                 )
-                from config import MAX_CONTENT_TOKENS
-
-                effective_max_tokens = min(MAX_CONTENT_TOKENS, 100_000) - reserve_tokens
+                effective_max_tokens = 100_000 - reserve_tokens
 
         # Ensure we have a reasonable minimum budget
         effective_max_tokens = max(1000, effective_max_tokens)
@@ -1233,7 +1229,7 @@ When recommending searches, be specific about what information you need and why
 """
         return response
 
-    def _validate_token_limit(self, text: str, context_type: str = "Context") -> None:
+    def _validate_token_limit(self, text: str, context_type: str = "Context", context_window: int = 200_000) -> None:
         """
         Validate token limit and raise ValueError if exceeded.
 
@@ -1243,14 +1239,15 @@ When recommending searches, be specific about what information you need and why
         Args:
             text: The text to check
             context_type: Description of what's being checked (for error message)
+            context_window: The model's context window size
 
         Raises:
-            ValueError: If text exceeds MAX_CONTEXT_TOKENS
+            ValueError: If text exceeds context_window
         """
-        within_limit, estimated_tokens = check_token_limit(text)
+        within_limit, estimated_tokens = check_token_limit(text, context_window)
         if not within_limit:
             raise ValueError(
-                f"{context_type} too large (~{estimated_tokens:,} tokens). Maximum is {MAX_CONTEXT_TOKENS:,} tokens."
+                f"{context_type} too large (~{estimated_tokens:,} tokens). Maximum is {context_window:,} tokens."
             )
 
     def _validate_and_correct_temperature(self, model_name: str, temperature: float) -> tuple[float, list[str]]:
diff --git a/tools/precommit.py b/tools/precommit.py
index 87ea5a5..23bdb99 100644
--- a/tools/precommit.py
+++ b/tools/precommit.py
@@ -14,7 +14,8 @@ from typing import Any, Literal, Optional
 from mcp.types import TextContent
 from pydantic import Field
 
-from config import MAX_CONTEXT_TOKENS
+# Conservative fallback for token limits
+DEFAULT_CONTEXT_WINDOW = 200_000
 from prompts.tool_prompts import PRECOMMIT_PROMPT
 from utils.file_utils import translate_file_paths, translate_path_for_environment
 from utils.git_utils import find_git_repositories, get_git_status, run_git_command
@@ -186,7 +187,7 @@ class Precommit(BaseTool):
         all_diffs = []
         repo_summaries = []
         total_tokens = 0
-        max_tokens = MAX_CONTEXT_TOKENS - 50000  # Reserve tokens for prompt and response
+        max_tokens = DEFAULT_CONTEXT_WINDOW - 50000  # Reserve tokens for prompt and response
 
         for repo_path in repositories:
             repo_name = os.path.basename(repo_path) or "root"
diff --git a/utils/file_utils.py b/utils/file_utils.py
index 8d986c4..fb28c36 100644
--- a/utils/file_utils.py
+++ b/utils/file_utils.py
@@ -23,7 +23,7 @@ import os
 from pathlib import Path
 from typing import Optional
 
-from .token_utils import MAX_CONTEXT_TOKENS, estimate_tokens
+from .token_utils import DEFAULT_CONTEXT_WINDOW, estimate_tokens
 
 logger = logging.getLogger(__name__)
 
@@ -508,14 +508,14 @@ def read_files(
     Args:
         file_paths: List of file or directory paths (absolute paths required)
        code: Optional direct code to include (prioritized over files)
-        max_tokens: Maximum tokens to use (defaults to MAX_CONTEXT_TOKENS)
+        max_tokens: Maximum tokens to use (defaults to DEFAULT_CONTEXT_WINDOW)
        reserve_tokens: Tokens to reserve for prompt and response (default 50K)
 
     Returns:
         str: All file contents formatted for AI consumption
     """
     if max_tokens is None:
-        max_tokens = MAX_CONTEXT_TOKENS
+        max_tokens = DEFAULT_CONTEXT_WINDOW
 
     logger.debug(f"[FILES] read_files called with {len(file_paths)} paths")
     logger.debug(
diff --git a/utils/model_context.py b/utils/model_context.py
index 766d0f8..1055172 100644
--- a/utils/model_context.py
+++ b/utils/model_context.py
@@ -72,7 +72,7 @@ class ModelContext:
         Returns:
             TokenAllocation with calculated budgets
         """
-        total_tokens = self.capabilities.max_tokens
+        total_tokens = self.capabilities.context_window
 
         # Dynamic allocation based on model capacity
         if total_tokens < 300_000:
diff --git a/utils/token_utils.py b/utils/token_utils.py
index 2ea7fa9..393669e 100644
--- a/utils/token_utils.py
+++ b/utils/token_utils.py
@@ -9,7 +9,8 @@ approximate. For production systems requiring precise token counts,
 consider using the actual tokenizer for the specific model.
 """
 
-from config import MAX_CONTEXT_TOKENS
+# Default fallback for token limit (conservative estimate)
+DEFAULT_CONTEXT_WINDOW = 200_000  # Conservative fallback for unknown models
 
 
 def estimate_tokens(text: str) -> int:
@@ -32,9 +33,9 @@ def estimate_tokens(text: str) -> int:
     return len(text) // 4
 
 
-def check_token_limit(text: str) -> tuple[bool, int]:
+def check_token_limit(text: str, context_window: int = DEFAULT_CONTEXT_WINDOW) -> tuple[bool, int]:
     """
-    Check if text exceeds the maximum token limit for Gemini models.
+    Check if text exceeds the specified token limit.
 
     This function is used to validate that prepared prompts will fit
     within the model's context window, preventing API errors and ensuring
@@ -42,11 +43,12 @@ def check_token_limit(text: str) -> tuple[bool, int]:
 
     Args:
         text: The text to check
+        context_window: The model's context window size (defaults to conservative fallback)
 
     Returns:
         Tuple[bool, int]: (is_within_limit, estimated_tokens)
-        - is_within_limit: True if the text fits within MAX_CONTEXT_TOKENS
+        - is_within_limit: True if the text fits within context_window
         - estimated_tokens: The estimated token count
     """
     estimated = estimate_tokens(text)
-    return estimated <= MAX_CONTEXT_TOKENS, estimated
+    return estimated <= context_window, estimated
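
Note: taken together, the tools/base.py and utils/model_context.py hunks replace the old fixed 800K/200K split with a per-model allocation derived from capabilities.context_window. The standalone sketch below illustrates that heuristic; the allocate_tokens helper and the fields on the illustrative TokenAllocation tuple are hypothetical names for this note, not the project's actual API.

```python
from typing import NamedTuple


class TokenAllocation(NamedTuple):
    """Illustrative container only; utils/model_context.py defines the real one."""

    total_tokens: int
    content_tokens: int
    response_tokens: int


def allocate_tokens(context_window: int, reserve_tokens: int = 1_000) -> TokenAllocation:
    """Mirror of the allocation rule in the diff: <300K context -> 60/40, otherwise 80/20."""
    if context_window < 300_000:
        content_tokens = int(context_window * 0.6)  # smaller models: 60% content, 40% response
    else:
        content_tokens = int(context_window * 0.8)  # larger models: 80% content, 20% response
    # Subtract the caller's reserve and keep a sane floor, as tools/base.py does.
    content_tokens = max(1_000, content_tokens - reserve_tokens)
    return TokenAllocation(context_window, content_tokens, context_window - content_tokens)


# o3 (200K) falls on the 60/40 branch; Gemini 2.5 Pro (1,048,576) on the 80/20 branch.
print(allocate_tokens(200_000))    # TokenAllocation(total_tokens=200000, content_tokens=119000, response_tokens=81000)
print(allocate_tokens(1_048_576))  # TokenAllocation(total_tokens=1048576, content_tokens=837860, response_tokens=210716)
```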
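Similarly, check_token_limit now takes the limit as an argument instead of importing a global. A minimal usage sketch, assuming a caller that already knows the model's context_window; the validate_prompt wrapper is hypothetical, while the two helper functions match the patched utils/token_utils.py.

```python
DEFAULT_CONTEXT_WINDOW = 200_000  # conservative fallback for unknown models


def estimate_tokens(text: str) -> int:
    # Rough heuristic used by utils/token_utils.py: ~4 characters per token.
    return len(text) // 4


def check_token_limit(text: str, context_window: int = DEFAULT_CONTEXT_WINDOW) -> tuple[bool, int]:
    estimated = estimate_tokens(text)
    return estimated <= context_window, estimated


def validate_prompt(prompt: str, context_window: int, context_type: str = "Context") -> None:
    # Hypothetical caller mirroring BaseTool._validate_token_limit: pass the
    # model-specific window rather than relying on a module-level constant.
    within_limit, estimated = check_token_limit(prompt, context_window)
    if not within_limit:
        raise ValueError(f"{context_type} too large (~{estimated:,} tokens). Maximum is {context_window:,} tokens.")


validate_prompt("def add(a, b):\n    return a + b\n", context_window=1_048_576)  # Gemini 2.5 Pro window
```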