Merge remote-tracking branch 'upstream/main' into feat/comprehensive-project-improvements

2025-06-13 07:41:03 +02:00
parent 51e0c554cb a641159a67
commit 3a8e61fa6c
20 changed files with 72 additions and 98 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -165,9 +165,3 @@ test_simulation_files/.claude/
 # Temporary test directories
 test-setup/
 /test_simulation_files/**
-# Remove Claude configuration
-.mcp.json
-Claude.md
-memory-bank
--- a/config.py
+++ b/config.py
@@ -47,13 +47,6 @@ MODEL_CAPABILITIES_DESC = {
 # - "o3" → "openai/gpt-4o"
 # - "o3-mini" → "openai/gpt-4o-mini"
-# Token allocation for Gemini Pro (1M total capacity)
-# MAX_CONTEXT_TOKENS: Total model capacity
-# MAX_CONTENT_TOKENS: Available for prompts, conversation history, and files
-# RESPONSE_RESERVE_TOKENS: Reserved for model response generation
-MAX_CONTEXT_TOKENS = 1_000_000  # 1M tokens total capacity for Gemini Pro
-MAX_CONTENT_TOKENS = 800_000  # 800K tokens for content (prompts + files + history)
-RESPONSE_RESERVE_TOKENS = 200_000  # 200K tokens reserved for response generation
 # Temperature defaults for different tool types
 # Temperature controls the randomness/creativity of model responses
--- a/providers/base.py
+++ b/providers/base.py
@@ -105,7 +105,7 @@ class ModelCapabilities:
    provider: ProviderType
    model_name: str
    friendly_name: str  # Human-friendly name like "Gemini" or "OpenAI"
-    max_tokens: int
+    context_window: int  # Total context window size in tokens
    supports_extended_thinking: bool = False
    supports_system_prompts: bool = True
    supports_streaming: bool = True
--- a/providers/gemini.py
+++ b/providers/gemini.py
@@ -14,12 +14,12 @@ class GeminiModelProvider(ModelProvider):
    # Model configurations
    SUPPORTED_MODELS = {
        "gemini-2.5-flash-preview-05-20": {
-            "max_tokens": 1_048_576,  # 1M tokens
+            "context_window": 1_048_576,  # 1M tokens
            "supports_extended_thinking": True,
            "max_thinking_tokens": 24576,  # Flash 2.5 thinking budget limit
        },
        "gemini-2.5-pro-preview-06-05": {
-            "max_tokens": 1_048_576,  # 1M tokens
+            "context_window": 1_048_576,  # 1M tokens
            "supports_extended_thinking": True,
            "max_thinking_tokens": 32768,  # Pro 2.5 thinking budget limit
        },
@@ -68,7 +68,7 @@ class GeminiModelProvider(ModelProvider):
            provider=ProviderType.GOOGLE,
            model_name=resolved_name,
            friendly_name="Gemini",
-            max_tokens=config["max_tokens"],
+            context_window=config["context_window"],
            supports_extended_thinking=config["supports_extended_thinking"],
            supports_system_prompts=True,
            supports_streaming=True,
--- a/providers/openai.py
+++ b/providers/openai.py
@@ -15,11 +15,11 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
    # Model configurations
    SUPPORTED_MODELS = {
        "o3": {
-            "max_tokens": 200_000,  # 200K tokens
+            "context_window": 200_000,  # 200K tokens
            "supports_extended_thinking": False,
        },
        "o3-mini": {
-            "max_tokens": 200_000,  # 200K tokens
+            "context_window": 200_000,  # 200K tokens
            "supports_extended_thinking": False,
        },
    }
@@ -49,7 +49,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
            provider=ProviderType.OPENAI,
            model_name=model_name,
            friendly_name="OpenAI",
-            max_tokens=config["max_tokens"],
+            context_window=config["context_window"],
            supports_extended_thinking=config["supports_extended_thinking"],
            supports_system_prompts=True,
            supports_streaming=True,
--- a/providers/openrouter.py
+++ b/providers/openrouter.py
@@ -109,7 +109,7 @@ class OpenRouterProvider(OpenAICompatibleProvider):
                provider=ProviderType.OPENROUTER,
                model_name=resolved_name,
                friendly_name=self.FRIENDLY_NAME,
-                max_tokens=32_768,  # Conservative default context window
+                context_window=32_768,  # Conservative default context window
                supports_extended_thinking=False,
                supports_system_prompts=True,
                supports_streaming=True,
--- a/providers/openrouter_registry.py
+++ b/providers/openrouter_registry.py
@@ -30,7 +30,7 @@ class OpenRouterModelConfig:
            provider=ProviderType.OPENROUTER,
            model_name=self.model_name,
            friendly_name="OpenRouter",
-            max_tokens=self.context_window,  # ModelCapabilities still uses max_tokens
+            context_window=self.context_window,
            supports_extended_thinking=self.supports_extended_thinking,
            supports_system_prompts=self.supports_system_prompts,
            supports_streaming=self.supports_streaming,
@@ -103,10 +103,6 @@ class OpenRouterModelRegistry:
            # Parse models
            configs = []
            for model_data in data.get("models", []):
-                # Handle backwards compatibility - rename max_tokens to context_window
-                if "max_tokens" in model_data and "context_window" not in model_data:
-                    model_data["context_window"] = model_data.pop("max_tokens")
                config = OpenRouterModelConfig(**model_data)
                configs.append(config)
--- a/server.py
+++ b/server.py
@@ -33,7 +33,6 @@ from mcp.types import ServerCapabilities, TextContent, Tool, ToolsCapability
 from config import (
    DEFAULT_MODEL,
-    MAX_CONTEXT_TOKENS,
    __author__,
    __updated__,
    __version__,
@@ -158,24 +157,6 @@ def configure_providers():
        has_openrouter = True
        logger.info("OpenRouter API key found - Multiple models available via OpenRouter")
-    # Check for conflicting configuration
-    if has_native_apis and has_openrouter:
-        logger.warning(
-            "\n" + "=" * 70 + "\n"
-            "WARNING: Both OpenRouter and native API keys detected!\n"
-            "\n"
-            "This creates ambiguity about which provider will be used for models\n"
-            "available through both APIs (e.g., 'o3' could come from OpenAI or OpenRouter).\n"
-            "\n"
-            "RECOMMENDATION: Use EITHER OpenRouter OR native APIs, not both.\n"
-            "\n"
-            "To fix this:\n"
-            "1. Use only OpenRouter: unset GEMINI_API_KEY and OPENAI_API_KEY\n"
-            "2. Use only native APIs: unset OPENROUTER_API_KEY\n"
-            "\n"
-            "Current configuration will prioritize native APIs over OpenRouter.\n" + "=" * 70 + "\n"
-        )
    # Register providers - native APIs first to ensure they take priority
    if has_native_apis:
        if gemini_key and gemini_key != "your_gemini_api_key_here":
@@ -539,7 +520,7 @@ async def handle_get_version() -> list[TextContent]:
        "author": __author__,
        "default_model": DEFAULT_MODEL,
        "default_thinking_mode_thinkdeep": DEFAULT_THINKING_MODE_THINKDEEP,
-        "max_context_tokens": f"{MAX_CONTEXT_TOKENS:,}",
+        "max_context_tokens": "Dynamic (model-specific)",
        "python_version": f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}",
        "server_started": datetime.now().isoformat(),
        "available_tools": list(TOOLS.keys()) + ["get_version"],
@@ -565,7 +546,7 @@ Author: {__author__}
 Configuration:
 - Default Model: {DEFAULT_MODEL}
 - Default Thinking Mode (ThinkDeep): {DEFAULT_THINKING_MODE_THINKDEEP}
- Max Context: {MAX_CONTEXT_TOKENS:,} tokens
+- Max Context: Dynamic (model-specific)
 - Python: {version_info["python_version"]}
 - Started: {version_info["server_started"]}
--- a/simulator_tests/test_per_tool_deduplication.py
+++ b/simulator_tests/test_per_tool_deduplication.py
@@ -11,6 +11,7 @@ Validates that:
 4. Docker logs show deduplication behavior
 """
 import os
 import subprocess
 from .base_test import BaseSimulatorTest
@@ -98,14 +99,17 @@ class PerToolDeduplicationTest(BaseSimulatorTest):
            # Setup test files
            self.setup_test_files()
-            # Create a short dummy file for quick testing
+            # Create a short dummy file for quick testing in the current repo
            dummy_content = """def add(a, b):
    return a + b  # Missing type hints
 def divide(x, y):
    return x / y  # No zero check
 """
-            dummy_file_path = self.create_additional_test_file("dummy_code.py", dummy_content)
+            # Create the file in the current git repo directory to make it show up in git status
            dummy_file_path = os.path.join(os.getcwd(), "dummy_code.py")
            with open(dummy_file_path, "w") as f:
                f.write(dummy_content)
            # Get timestamp for log filtering
            import datetime
@@ -162,7 +166,10 @@ def divide(x, y):
 def subtract(a, b):
    return a - b
 """
-            new_file_path = self.create_additional_test_file("new_feature.py", new_file_content)
+            # Create another temp file in the current repo for git changes
            new_file_path = os.path.join(os.getcwd(), "new_feature.py")
            with open(new_file_path, "w") as f:
                f.write(new_file_content)
            # Continue precommit with both files
            continue_params = {
@@ -249,4 +256,11 @@ def subtract(a, b):
            self.logger.error(f"File deduplication workflow test failed: {e}")
            return False
        finally:
            # Clean up temp files created in current repo
            temp_files = ["dummy_code.py", "new_feature.py"]
            for temp_file in temp_files:
                temp_path = os.path.join(os.getcwd(), temp_file)
                if os.path.exists(temp_path):
                    os.remove(temp_path)
                    self.logger.debug(f"Removed temp file: {temp_path}")
            self.cleanup_test_files()
--- a/tests/mock_helpers.py
+++ b/tests/mock_helpers.py
@@ -5,7 +5,7 @@ from unittest.mock import Mock
 from providers.base import ModelCapabilities, ProviderType, RangeTemperatureConstraint
-def create_mock_provider(model_name="gemini-2.5-flash-preview-05-20", max_tokens=1_048_576):
+def create_mock_provider(model_name="gemini-2.5-flash-preview-05-20", context_window=1_048_576):
    """Create a properly configured mock provider."""
    mock_provider = Mock()
@@ -14,7 +14,7 @@ def create_mock_provider(model_name="gemini-2.5-flash-preview-05-20", max_tokens
        provider=ProviderType.GOOGLE,
        model_name=model_name,
        friendly_name="Gemini",
-        max_tokens=max_tokens,
+        context_window=context_window,
        supports_extended_thinking=False,
        supports_system_prompts=True,
        supports_streaming=True,
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -4,7 +4,6 @@ Tests for configuration
 from config import (
    DEFAULT_MODEL,
-    MAX_CONTEXT_TOKENS,
    TEMPERATURE_ANALYTICAL,
    TEMPERATURE_BALANCED,
    TEMPERATURE_CREATIVE,
@@ -33,7 +32,6 @@ class TestConfig:
        """Test model configuration"""
        # DEFAULT_MODEL is set in conftest.py for tests
        assert DEFAULT_MODEL == "gemini-2.5-flash-preview-05-20"
-        assert MAX_CONTEXT_TOKENS == 1_000_000
    def test_temperature_defaults(self):
        """Test temperature constants"""
--- a/tests/test_conversation_field_mapping.py
+++ b/tests/test_conversation_field_mapping.py
@@ -77,7 +77,7 @@ async def test_conversation_history_field_mapping():
                            provider=ProviderType.GOOGLE,
                            model_name="gemini-2.5-flash-preview-05-20",
                            friendly_name="Gemini",
-                            max_tokens=200000,
+                            context_window=200000,
                            supports_extended_thinking=True,
                        )
                        mock_get_provider.return_value = mock_provider
--- a/tests/test_openrouter_provider.py
+++ b/tests/test_openrouter_provider.py
@@ -61,7 +61,7 @@ class TestOpenRouterProvider:
        caps = provider.get_capabilities("unknown-model")
        assert caps.provider == ProviderType.OPENROUTER
        assert caps.model_name == "unknown-model"
-        assert caps.max_tokens == 32_768  # Safe default
+        assert caps.context_window == 32_768  # Safe default
        assert hasattr(caps, "_is_generic") and caps._is_generic is True
    def test_model_alias_resolution(self):
@@ -139,7 +139,7 @@ class TestOpenRouterRegistry:
        caps = registry.get_capabilities("opus")
        assert caps is not None
        assert caps.model_name == "anthropic/claude-3-opus"
-        assert caps.max_tokens == 200000  # Claude's context window
+        assert caps.context_window == 200000  # Claude's context window
        # Test using full model name
        caps = registry.get_capabilities("anthropic/claude-3-opus")
--- a/tests/test_openrouter_registry.py
+++ b/tests/test_openrouter_registry.py
@@ -120,7 +120,7 @@ class TestOpenRouterModelRegistry:
        assert caps.provider == ProviderType.OPENROUTER
        assert caps.model_name == "anthropic/claude-3-opus"
        assert caps.friendly_name == "OpenRouter"
-        assert caps.max_tokens == 200000
+        assert caps.context_window == 200000
        assert not caps.supports_extended_thinking
    def test_duplicate_alias_detection(self):
@@ -147,13 +147,13 @@ class TestOpenRouterModelRegistry:
            os.unlink(temp_path)
    def test_backwards_compatibility_max_tokens(self):
-        """Test backwards compatibility with old max_tokens field."""
+        """Test that old max_tokens field is no longer supported (should result in empty registry)."""
        config_data = {
            "models": [
                {
                    "model_name": "test/old-model",
                    "aliases": ["old"],
-                    "max_tokens": 16384,  # Old field name
+                    "max_tokens": 16384,  # Old field name should cause error
                    "supports_extended_thinking": False,
                }
            ]
@@ -164,15 +164,12 @@ class TestOpenRouterModelRegistry:
            temp_path = f.name
        try:
            # Should gracefully handle the error and result in empty registry
            registry = OpenRouterModelRegistry(config_path=temp_path)
-            config = registry.resolve("old")
+            # Registry should be empty due to config error
-
+            assert len(registry.list_models()) == 0
-            assert config is not None
+            assert len(registry.list_aliases()) == 0
-            assert config.context_window == 16384  # Should be converted
+            assert registry.resolve("old") is None
-            # Check capabilities still work
-            caps = config.to_capabilities()
-            assert caps.max_tokens == 16384
        finally:
            os.unlink(temp_path)
@@ -215,7 +212,7 @@ class TestOpenRouterModelRegistry:
        )
        caps = config.to_capabilities()
-        assert caps.max_tokens == 128000
+        assert caps.context_window == 128000
        assert caps.supports_extended_thinking
        assert caps.supports_system_prompts
        assert caps.supports_streaming
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -84,7 +84,7 @@ class TestGeminiProvider:
        assert capabilities.provider == ProviderType.GOOGLE
        assert capabilities.model_name == "gemini-2.5-flash-preview-05-20"
-        assert capabilities.max_tokens == 1_048_576
+        assert capabilities.context_window == 1_048_576
        assert capabilities.supports_extended_thinking
    def test_get_capabilities_pro_model(self):
@@ -165,7 +165,7 @@ class TestOpenAIProvider:
        assert capabilities.provider == ProviderType.OPENAI
        assert capabilities.model_name == "o3-mini"
-        assert capabilities.max_tokens == 200_000
+        assert capabilities.context_window == 200_000
        assert not capabilities.supports_extended_thinking
    def test_validate_model_names(self):
--- a/tools/base.py
+++ b/tools/base.py
@@ -22,7 +22,7 @@ from typing import Any, Literal, Optional
 from mcp.types import TextContent
 from pydantic import BaseModel, Field
-from config import MAX_CONTEXT_TOKENS, MCP_PROMPT_SIZE_LIMIT
+from config import MCP_PROMPT_SIZE_LIMIT
 from providers import ModelProvider, ModelProviderRegistry
 from utils import check_token_limit
 from utils.conversation_memory import (
@@ -414,7 +414,7 @@ class BaseTool(ABC):
            request_files: List of files requested for current tool execution
            continuation_id: Thread continuation ID, or None for new conversations
            context_description: Description for token limit validation (e.g. "Code", "New files")
-            max_tokens: Maximum tokens to use (defaults to remaining budget or MAX_CONTENT_TOKENS)
+            max_tokens: Maximum tokens to use (defaults to remaining budget or model-specific content allocation)
            reserve_tokens: Tokens to reserve for additional prompt content (default 1K)
            remaining_budget: Remaining token budget after conversation history (from server.py)
            arguments: Original tool arguments (used to extract _remaining_tokens if available)
@@ -473,17 +473,17 @@ class BaseTool(ABC):
                    capabilities = provider.get_capabilities(model_name)
                    # Calculate content allocation based on model capacity
-                    if capabilities.max_tokens < 300_000:
+                    if capabilities.context_window < 300_000:
                        # Smaller context models: 60% content, 40% response
-                        model_content_tokens = int(capabilities.max_tokens * 0.6)
+                        model_content_tokens = int(capabilities.context_window * 0.6)
                    else:
                        # Larger context models: 80% content, 20% response
-                        model_content_tokens = int(capabilities.max_tokens * 0.8)
+                        model_content_tokens = int(capabilities.context_window * 0.8)
                    effective_max_tokens = model_content_tokens - reserve_tokens
                    logger.debug(
                        f"[FILES] {self.name}: Using model-specific limit for {model_name}: "
-                        f"{model_content_tokens:,} content tokens from {capabilities.max_tokens:,} total"
+                        f"{model_content_tokens:,} content tokens from {capabilities.context_window:,} total"
                    )
                except (ValueError, AttributeError) as e:
                    # Handle specific errors: provider not found, model not supported, missing attributes
@@ -491,17 +491,13 @@ class BaseTool(ABC):
                        f"[FILES] {self.name}: Could not get model capabilities for {model_name}: {type(e).__name__}: {e}"
                    )
                    # Fall back to conservative default for safety
-                    from config import MAX_CONTENT_TOKENS
+                    effective_max_tokens = 100_000 - reserve_tokens
-                    effective_max_tokens = min(MAX_CONTENT_TOKENS, 100_000) - reserve_tokens
                except Exception as e:
                    # Catch any other unexpected errors
                    logger.error(
                        f"[FILES] {self.name}: Unexpected error getting model capabilities: {type(e).__name__}: {e}"
                    )
-                    from config import MAX_CONTENT_TOKENS
+                    effective_max_tokens = 100_000 - reserve_tokens
-                    effective_max_tokens = min(MAX_CONTENT_TOKENS, 100_000) - reserve_tokens
        # Ensure we have a reasonable minimum budget
        effective_max_tokens = max(1000, effective_max_tokens)
@@ -1233,7 +1229,7 @@ When recommending searches, be specific about what information you need and why
        """
        return response
-    def _validate_token_limit(self, text: str, context_type: str = "Context") -> None:
+    def _validate_token_limit(self, text: str, context_type: str = "Context", context_window: int = 200_000) -> None:
        """
        Validate token limit and raise ValueError if exceeded.
@@ -1243,14 +1239,15 @@ When recommending searches, be specific about what information you need and why
        Args:
            text: The text to check
            context_type: Description of what's being checked (for error message)
            context_window: The model's context window size
        Raises:
-            ValueError: If text exceeds MAX_CONTEXT_TOKENS
+            ValueError: If text exceeds context_window
        """
-        within_limit, estimated_tokens = check_token_limit(text)
+        within_limit, estimated_tokens = check_token_limit(text, context_window)
        if not within_limit:
            raise ValueError(
-                f"{context_type} too large (~{estimated_tokens:,} tokens). Maximum is {MAX_CONTEXT_TOKENS:,} tokens."
+                f"{context_type} too large (~{estimated_tokens:,} tokens). Maximum is {context_window:,} tokens."
            )
    def _validate_and_correct_temperature(self, model_name: str, temperature: float) -> tuple[float, list[str]]:
--- a/tools/precommit.py
+++ b/tools/precommit.py
@@ -14,7 +14,6 @@ from typing import Any, Literal, Optional
 from mcp.types import TextContent
 from pydantic import Field
-from config import MAX_CONTEXT_TOKENS
 from prompts.tool_prompts import PRECOMMIT_PROMPT
 from utils.file_utils import translate_file_paths, translate_path_for_environment
 from utils.git_utils import find_git_repositories, get_git_status, run_git_command
@@ -23,6 +22,9 @@ from utils.token_utils import estimate_tokens
 from .base import BaseTool, ToolRequest
 from .models import ToolOutput
 # Conservative fallback for token limits
 DEFAULT_CONTEXT_WINDOW = 200_000
 class PrecommitRequest(ToolRequest):
    """Request model for precommit tool"""
@@ -186,7 +188,7 @@ class Precommit(BaseTool):
        all_diffs = []
        repo_summaries = []
        total_tokens = 0
-        max_tokens = MAX_CONTEXT_TOKENS - 50000  # Reserve tokens for prompt and response
+        max_tokens = DEFAULT_CONTEXT_WINDOW - 50000  # Reserve tokens for prompt and response
        for repo_path in repositories:
            repo_name = os.path.basename(repo_path) or "root"
--- a/utils/file_utils.py
+++ b/utils/file_utils.py
@@ -23,7 +23,7 @@ import os
 from pathlib import Path
 from typing import Optional
-from .token_utils import MAX_CONTEXT_TOKENS, estimate_tokens
+from .token_utils import DEFAULT_CONTEXT_WINDOW, estimate_tokens
 logger = logging.getLogger(__name__)
@@ -508,14 +508,14 @@ def read_files(
    Args:
        file_paths: List of file or directory paths (absolute paths required)
        code: Optional direct code to include (prioritized over files)
-        max_tokens: Maximum tokens to use (defaults to MAX_CONTEXT_TOKENS)
+        max_tokens: Maximum tokens to use (defaults to DEFAULT_CONTEXT_WINDOW)
        reserve_tokens: Tokens to reserve for prompt and response (default 50K)
    Returns:
        str: All file contents formatted for AI consumption
    """
    if max_tokens is None:
-        max_tokens = MAX_CONTEXT_TOKENS
+        max_tokens = DEFAULT_CONTEXT_WINDOW
    logger.debug(f"[FILES] read_files called with {len(file_paths)} paths")
    logger.debug(
--- a/utils/model_context.py
+++ b/utils/model_context.py
@@ -72,7 +72,7 @@ class ModelContext:
        Returns:
            TokenAllocation with calculated budgets
        """
-        total_tokens = self.capabilities.max_tokens
+        total_tokens = self.capabilities.context_window
        # Dynamic allocation based on model capacity
        if total_tokens < 300_000:
--- a/utils/token_utils.py
+++ b/utils/token_utils.py
@@ -9,7 +9,8 @@ approximate. For production systems requiring precise token counts,
 consider using the actual tokenizer for the specific model.
 """
-from config import MAX_CONTEXT_TOKENS
+# Default fallback for token limit (conservative estimate)
 DEFAULT_CONTEXT_WINDOW = 200_000  # Conservative fallback for unknown models
 def estimate_tokens(text: str) -> int:
@@ -32,9 +33,9 @@ def estimate_tokens(text: str) -> int:
    return len(text) // 4
-def check_token_limit(text: str) -> tuple[bool, int]:
+def check_token_limit(text: str, context_window: int = DEFAULT_CONTEXT_WINDOW) -> tuple[bool, int]:
    """
-    Check if text exceeds the maximum token limit for Gemini models.
+    Check if text exceeds the specified token limit.
    This function is used to validate that prepared prompts will fit
    within the model's context window, preventing API errors and ensuring
@@ -42,11 +43,12 @@ def check_token_limit(text: str) -> tuple[bool, int]:
    Args:
        text: The text to check
        context_window: The model's context window size (defaults to conservative fallback)
    Returns:
        Tuple[bool, int]: (is_within_limit, estimated_tokens)
-        - is_within_limit: True if the text fits within MAX_CONTEXT_TOKENS
+        - is_within_limit: True if the text fits within context_window
        - estimated_tokens: The estimated token count
    """
    estimated = estimate_tokens(text)
-    return estimated <= MAX_CONTEXT_TOKENS, estimated
+    return estimated <= context_window, estimated