Use consistent terminology
@@ -47,13 +47,6 @@ MODEL_CAPABILITIES_DESC = {
# - "o3" → "openai/gpt-4o"
# - "o3-mini" → "openai/gpt-4o-mini"

-# Token allocation for Gemini Pro (1M total capacity)
-# MAX_CONTEXT_TOKENS: Total model capacity
-# MAX_CONTENT_TOKENS: Available for prompts, conversation history, and files
-# RESPONSE_RESERVE_TOKENS: Reserved for model response generation
-MAX_CONTEXT_TOKENS = 1_000_000  # 1M tokens total capacity for Gemini Pro
-MAX_CONTENT_TOKENS = 800_000  # 800K tokens for content (prompts + files + history)
-RESPONSE_RESERVE_TOKENS = 200_000  # 200K tokens reserved for response generation

# Temperature defaults for different tool types
# Temperature controls the randomness/creativity of model responses
@@ -105,7 +105,7 @@ class ModelCapabilities:
provider: ProviderType
model_name: str
friendly_name: str  # Human-friendly name like "Gemini" or "OpenAI"
-max_tokens: int
+context_window: int  # Total context window size in tokens
supports_extended_thinking: bool = False
supports_system_prompts: bool = True
supports_streaming: bool = True
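For reference, a minimal self-contained sketch of the shape ModelCapabilities takes after this hunk: callers read context_window instead of max_tokens. ProviderType is stubbed as a plain Enum here purely so the snippet runs on its own; the project's real definition lives in the providers package and may differ.

from dataclasses import dataclass
from enum import Enum


class ProviderType(Enum):
    # Stub for illustration only; not the project's actual enum
    GOOGLE = "google"
    OPENAI = "openai"
    OPENROUTER = "openrouter"


@dataclass
class ModelCapabilities:
    provider: ProviderType
    model_name: str
    friendly_name: str  # Human-friendly name like "Gemini" or "OpenAI"
    context_window: int  # Total context window size in tokens
    supports_extended_thinking: bool = False
    supports_system_prompts: bool = True
    supports_streaming: bool = True


caps = ModelCapabilities(
    provider=ProviderType.GOOGLE,
    model_name="gemini-2.5-pro-preview-06-05",
    friendly_name="Gemini",
    context_window=1_048_576,
)
print(caps.context_window)  # callers now read context_window, not max_tokens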
@@ -14,12 +14,12 @@ class GeminiModelProvider(ModelProvider):
# Model configurations
SUPPORTED_MODELS = {
"gemini-2.5-flash-preview-05-20": {
-"max_tokens": 1_048_576,  # 1M tokens
+"context_window": 1_048_576,  # 1M tokens
"supports_extended_thinking": True,
"max_thinking_tokens": 24576,  # Flash 2.5 thinking budget limit
},
"gemini-2.5-pro-preview-06-05": {
-"max_tokens": 1_048_576,  # 1M tokens
+"context_window": 1_048_576,  # 1M tokens
"supports_extended_thinking": True,
"max_thinking_tokens": 32768,  # Pro 2.5 thinking budget limit
},
@@ -68,7 +68,7 @@ class GeminiModelProvider(ModelProvider):
provider=ProviderType.GOOGLE,
model_name=resolved_name,
friendly_name="Gemini",
-max_tokens=config["max_tokens"],
+context_window=config["context_window"],
supports_extended_thinking=config["supports_extended_thinking"],
supports_system_prompts=True,
supports_streaming=True,
@@ -15,11 +15,11 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
# Model configurations
SUPPORTED_MODELS = {
"o3": {
-"max_tokens": 200_000,  # 200K tokens
+"context_window": 200_000,  # 200K tokens
"supports_extended_thinking": False,
},
"o3-mini": {
-"max_tokens": 200_000,  # 200K tokens
+"context_window": 200_000,  # 200K tokens
"supports_extended_thinking": False,
},
}
@@ -49,7 +49,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
provider=ProviderType.OPENAI,
model_name=model_name,
friendly_name="OpenAI",
-max_tokens=config["max_tokens"],
+context_window=config["context_window"],
supports_extended_thinking=config["supports_extended_thinking"],
supports_system_prompts=True,
supports_streaming=True,
@@ -109,7 +109,7 @@ class OpenRouterProvider(OpenAICompatibleProvider):
provider=ProviderType.OPENROUTER,
model_name=resolved_name,
friendly_name=self.FRIENDLY_NAME,
-max_tokens=32_768,  # Conservative default context window
+context_window=32_768,  # Conservative default context window
supports_extended_thinking=False,
supports_system_prompts=True,
supports_streaming=True,
@@ -30,7 +30,7 @@ class OpenRouterModelConfig:
provider=ProviderType.OPENROUTER,
model_name=self.model_name,
friendly_name="OpenRouter",
-max_tokens=self.context_window,  # ModelCapabilities still uses max_tokens
+context_window=self.context_window,
supports_extended_thinking=self.supports_extended_thinking,
supports_system_prompts=self.supports_system_prompts,
supports_streaming=self.supports_streaming,
@@ -103,10 +103,6 @@ class OpenRouterModelRegistry:
# Parse models
configs = []
for model_data in data.get("models", []):
-# Handle backwards compatibility - rename max_tokens to context_window
-if "max_tokens" in model_data and "context_window" not in model_data:
-model_data["context_window"] = model_data.pop("max_tokens")
-
config = OpenRouterModelConfig(**model_data)
configs.append(config)

@@ -33,7 +33,6 @@ from mcp.types import ServerCapabilities, TextContent, Tool, ToolsCapability

from config import (
DEFAULT_MODEL,
-MAX_CONTEXT_TOKENS,
__author__,
__updated__,
__version__,
@@ -521,7 +520,7 @@ async def handle_get_version() -> list[TextContent]:
"author": __author__,
"default_model": DEFAULT_MODEL,
"default_thinking_mode_thinkdeep": DEFAULT_THINKING_MODE_THINKDEEP,
-"max_context_tokens": f"{MAX_CONTEXT_TOKENS:,}",
+"max_context_tokens": "Dynamic (model-specific)",
"python_version": f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}",
"server_started": datetime.now().isoformat(),
"available_tools": list(TOOLS.keys()) + ["get_version"],
@@ -547,7 +546,7 @@ Author: {__author__}
Configuration:
- Default Model: {DEFAULT_MODEL}
- Default Thinking Mode (ThinkDeep): {DEFAULT_THINKING_MODE_THINKDEEP}
-- Max Context: {MAX_CONTEXT_TOKENS:,} tokens
+- Max Context: Dynamic (model-specific)
- Python: {version_info["python_version"]}
- Started: {version_info["server_started"]}

@@ -4,7 +4,6 @@ Tests for configuration

from config import (
DEFAULT_MODEL,
-MAX_CONTEXT_TOKENS,
TEMPERATURE_ANALYTICAL,
TEMPERATURE_BALANCED,
TEMPERATURE_CREATIVE,
@@ -33,7 +32,6 @@ class TestConfig:
"""Test model configuration"""
# DEFAULT_MODEL is set in conftest.py for tests
assert DEFAULT_MODEL == "gemini-2.5-flash-preview-05-20"
-assert MAX_CONTEXT_TOKENS == 1_000_000

def test_temperature_defaults(self):
"""Test temperature constants"""
@@ -22,7 +22,7 @@ from typing import Any, Literal, Optional
from mcp.types import TextContent
from pydantic import BaseModel, Field

-from config import MAX_CONTEXT_TOKENS, MCP_PROMPT_SIZE_LIMIT
+from config import MCP_PROMPT_SIZE_LIMIT
from providers import ModelProvider, ModelProviderRegistry
from utils import check_token_limit
from utils.conversation_memory import (
@@ -414,7 +414,7 @@ class BaseTool(ABC):
request_files: List of files requested for current tool execution
continuation_id: Thread continuation ID, or None for new conversations
context_description: Description for token limit validation (e.g. "Code", "New files")
-max_tokens: Maximum tokens to use (defaults to remaining budget or MAX_CONTENT_TOKENS)
+max_tokens: Maximum tokens to use (defaults to remaining budget or model-specific content allocation)
reserve_tokens: Tokens to reserve for additional prompt content (default 1K)
remaining_budget: Remaining token budget after conversation history (from server.py)
arguments: Original tool arguments (used to extract _remaining_tokens if available)
@@ -473,17 +473,17 @@ class BaseTool(ABC):
capabilities = provider.get_capabilities(model_name)

# Calculate content allocation based on model capacity
-if capabilities.max_tokens < 300_000:
+if capabilities.context_window < 300_000:
# Smaller context models: 60% content, 40% response
-model_content_tokens = int(capabilities.max_tokens * 0.6)
+model_content_tokens = int(capabilities.context_window * 0.6)
else:
# Larger context models: 80% content, 20% response
-model_content_tokens = int(capabilities.max_tokens * 0.8)
+model_content_tokens = int(capabilities.context_window * 0.8)

effective_max_tokens = model_content_tokens - reserve_tokens
logger.debug(
f"[FILES] {self.name}: Using model-specific limit for {model_name}: "
-f"{model_content_tokens:,} content tokens from {capabilities.max_tokens:,} total"
+f"{model_content_tokens:,} content tokens from {capabilities.context_window:,} total"
)
except (ValueError, AttributeError) as e:
# Handle specific errors: provider not found, model not supported, missing attributes
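The allocation rule in the hunk above can be read as a small standalone helper. This is a sketch with a hypothetical function name, not code from the repository; the thresholds and percentages come straight from the diff, and the 1,000-token floor appears in the following hunk.

def content_token_budget(context_window: int, reserve_tokens: int = 1_000) -> int:
    # Hypothetical helper mirroring the allocation logic above
    if context_window < 300_000:
        # Smaller context models: 60% content, 40% response
        content = int(context_window * 0.6)
    else:
        # Larger context models: 80% content, 20% response
        content = int(context_window * 0.8)
    # Keep a reasonable minimum budget, as the fallback path below does
    return max(1_000, content - reserve_tokens)


print(content_token_budget(200_000))    # o3 (200K window): 119_000
print(content_token_budget(1_048_576))  # Gemini 2.5 Pro (1M window): 837_860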
@@ -491,17 +491,13 @@ class BaseTool(ABC):
f"[FILES] {self.name}: Could not get model capabilities for {model_name}: {type(e).__name__}: {e}"
)
# Fall back to conservative default for safety
-from config import MAX_CONTENT_TOKENS
-
-effective_max_tokens = min(MAX_CONTENT_TOKENS, 100_000) - reserve_tokens
+effective_max_tokens = 100_000 - reserve_tokens
except Exception as e:
# Catch any other unexpected errors
logger.error(
f"[FILES] {self.name}: Unexpected error getting model capabilities: {type(e).__name__}: {e}"
)
-from config import MAX_CONTENT_TOKENS
-
-effective_max_tokens = min(MAX_CONTENT_TOKENS, 100_000) - reserve_tokens
+effective_max_tokens = 100_000 - reserve_tokens

# Ensure we have a reasonable minimum budget
effective_max_tokens = max(1000, effective_max_tokens)
@@ -1233,7 +1229,7 @@ When recommending searches, be specific about what information you need and why
"""
return response

-def _validate_token_limit(self, text: str, context_type: str = "Context") -> None:
+def _validate_token_limit(self, text: str, context_type: str = "Context", context_window: int = 200_000) -> None:
"""
Validate token limit and raise ValueError if exceeded.

@@ -1243,14 +1239,15 @@ When recommending searches, be specific about what information you need and why
Args:
text: The text to check
context_type: Description of what's being checked (for error message)
+context_window: The model's context window size

Raises:
-ValueError: If text exceeds MAX_CONTEXT_TOKENS
+ValueError: If text exceeds context_window
"""
-within_limit, estimated_tokens = check_token_limit(text)
+within_limit, estimated_tokens = check_token_limit(text, context_window)
if not within_limit:
raise ValueError(
-f"{context_type} too large (~{estimated_tokens:,} tokens). Maximum is {MAX_CONTEXT_TOKENS:,} tokens."
+f"{context_type} too large (~{estimated_tokens:,} tokens). Maximum is {context_window:,} tokens."
)

def _validate_and_correct_temperature(self, model_name: str, temperature: float) -> tuple[float, list[str]]:
@@ -14,7 +14,8 @@ from typing import Any, Literal, Optional
from mcp.types import TextContent
from pydantic import Field

-from config import MAX_CONTEXT_TOKENS
+# Conservative fallback for token limits
+DEFAULT_CONTEXT_WINDOW = 200_000
from prompts.tool_prompts import PRECOMMIT_PROMPT
from utils.file_utils import translate_file_paths, translate_path_for_environment
from utils.git_utils import find_git_repositories, get_git_status, run_git_command
@@ -186,7 +187,7 @@ class Precommit(BaseTool):
all_diffs = []
repo_summaries = []
total_tokens = 0
-max_tokens = MAX_CONTEXT_TOKENS - 50000  # Reserve tokens for prompt and response
+max_tokens = DEFAULT_CONTEXT_WINDOW - 50000  # Reserve tokens for prompt and response

for repo_path in repositories:
repo_name = os.path.basename(repo_path) or "root"
@@ -23,7 +23,7 @@ import os
from pathlib import Path
from typing import Optional

-from .token_utils import MAX_CONTEXT_TOKENS, estimate_tokens
+from .token_utils import DEFAULT_CONTEXT_WINDOW, estimate_tokens

logger = logging.getLogger(__name__)

@@ -508,14 +508,14 @@ def read_files(
Args:
file_paths: List of file or directory paths (absolute paths required)
code: Optional direct code to include (prioritized over files)
-max_tokens: Maximum tokens to use (defaults to MAX_CONTEXT_TOKENS)
+max_tokens: Maximum tokens to use (defaults to DEFAULT_CONTEXT_WINDOW)
reserve_tokens: Tokens to reserve for prompt and response (default 50K)

Returns:
str: All file contents formatted for AI consumption
"""
if max_tokens is None:
-max_tokens = MAX_CONTEXT_TOKENS
+max_tokens = DEFAULT_CONTEXT_WINDOW

logger.debug(f"[FILES] read_files called with {len(file_paths)} paths")
logger.debug(
@@ -72,7 +72,7 @@ class ModelContext:
Returns:
TokenAllocation with calculated budgets
"""
-total_tokens = self.capabilities.max_tokens
+total_tokens = self.capabilities.context_window

# Dynamic allocation based on model capacity
if total_tokens < 300_000:
@@ -9,7 +9,8 @@ approximate. For production systems requiring precise token counts,
consider using the actual tokenizer for the specific model.
"""

-from config import MAX_CONTEXT_TOKENS
+# Default fallback for token limit (conservative estimate)
+DEFAULT_CONTEXT_WINDOW = 200_000  # Conservative fallback for unknown models


def estimate_tokens(text: str) -> int:
@@ -32,9 +33,9 @@ def estimate_tokens(text: str) -> int:
return len(text) // 4


-def check_token_limit(text: str) -> tuple[bool, int]:
+def check_token_limit(text: str, context_window: int = DEFAULT_CONTEXT_WINDOW) -> tuple[bool, int]:
"""
-Check if text exceeds the maximum token limit for Gemini models.
+Check if text exceeds the specified token limit.

This function is used to validate that prepared prompts will fit
within the model's context window, preventing API errors and ensuring
@@ -42,11 +43,12 @@ def check_token_limit(text: str) -> tuple[bool, int]:

Args:
text: The text to check
+context_window: The model's context window size (defaults to conservative fallback)

Returns:
Tuple[bool, int]: (is_within_limit, estimated_tokens)
-- is_within_limit: True if the text fits within MAX_CONTEXT_TOKENS
+- is_within_limit: True if the text fits within context_window
- estimated_tokens: The estimated token count
"""
estimated = estimate_tokens(text)
-return estimated <= MAX_CONTEXT_TOKENS, estimated
+return estimated <= context_window, estimated
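To make the new behaviour concrete, a self-contained usage sketch: the helper bodies below are copied from the hunks above, while the example string and the printed values are illustrative only (they assume the ~4 characters per token heuristic from estimate_tokens).

DEFAULT_CONTEXT_WINDOW = 200_000  # conservative fallback from the hunk above


def estimate_tokens(text: str) -> int:
    # ~4 characters per token, as in the utility above
    return len(text) // 4


def check_token_limit(text: str, context_window: int = DEFAULT_CONTEXT_WINDOW) -> tuple[bool, int]:
    estimated = estimate_tokens(text)
    return estimated <= context_window, estimated


prompt = "x" * 1_000_000  # ~250K estimated tokens
print(check_token_limit(prompt))             # (False, 250000) against the 200K fallback
print(check_token_limit(prompt, 1_048_576))  # (True, 250000) against a 1M Gemini window

Callers such as _validate_token_limit now pass the active model's context_window explicitly instead of relying on the old global MAX_CONTEXT_TOKENS.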