Further fixes to tests
Pass O3 simulation test when API keys are not set, logging a notice instead
Updated docs on testing, simulator tests, and contributing
Support for OpenAI o4-mini and o4-mini-high
Fahad
2025-06-14 09:28:20 +04:00
parent c5f682c7b0
commit 746380eb7f
17 changed files with 324 additions and 53 deletions

View File

@@ -36,10 +36,32 @@ Please provide a clear and concise description of what this PR does.
## Testing
-- [ ] Unit tests pass
-- [ ] Integration tests pass (if applicable)
-- [ ] Manual testing completed
-- [ ] Documentation updated (if needed)
**Please review our [Testing Guide](../docs/testing.md) before submitting.**

### Run all linting and tests (required):
```bash
# Activate virtual environment first
source venv/bin/activate
# Run all linting checks
ruff check .
black --check .
isort --check-only .
# Run all unit tests
python -m pytest -xvs
# If you made tool changes, also run simulator tests
python communication_simulator_test.py
```
- [ ] All linting passes (ruff, black, isort)
- [ ] All unit tests pass
- [ ] **For new features**: Unit tests added in `tests/`
- [ ] **For tool changes**: Simulator tests added in `simulator_tests/`
- [ ] **For bug fixes**: Tests added to prevent regression
- [ ] Simulator tests pass (if applicable)
- [ ] Manual testing completed with realistic scenarios

## Related Issues
@@ -48,11 +70,12 @@ Fixes #(issue number)
## Checklist
- [ ] PR title follows the format guidelines above
-- [ ] Code follows the project's style guidelines
+- [ ] Activated venv and ran all linting: `source venv/bin/activate && ruff check . && black --check . && isort --check-only .`
- [ ] Self-review completed
-- [ ] Tests added/updated as needed
+- [ ] **Tests added for ALL changes** (see Testing section above)
- [ ] Documentation updated as needed
-- [ ] All tests passing
+- [ ] All unit tests passing: `python -m pytest -xvs`
- [ ] Relevant simulator tests passing (if tool changes)
- [ ] Ready for review

## Additional Notes

View File

@@ -124,7 +124,7 @@ git clone https://github.com/BeehiveInnovations/zen-mcp-server.git
cd zen-mcp-server
# One-command setup (includes Redis for AI conversations)
-./setup-docker.sh
+./run-server.sh
```
**What this does:**
@@ -153,6 +153,9 @@ nano .env
# WORKSPACE_ROOT=/Users/your-username (automatically configured)
# Note: At least one API key OR custom URL is required
# After making changes to .env, restart the server:
# ./run-server.sh
```
### 4. Configure Claude
@@ -184,7 +187,7 @@ This will open a folder revealing `claude_desktop_config.json`.
2. **Update Docker Configuration**
-The setup script shows you the exact configuration. It looks like this. When you ran `setup-docker.sh` it should
+The setup script shows you the exact configuration. It looks like this. When you ran `run-server.sh` it should
have produced a configuration for you to copy:
```json
@@ -500,18 +503,24 @@ DEFAULT_MODEL=auto # Claude picks the best model automatically
# API Keys (at least one required)
GEMINI_API_KEY=your-gemini-key # Enables Gemini Pro & Flash
-OPENAI_API_KEY=your-openai-key # Enables O3, O3-mini
+OPENAI_API_KEY=your-openai-key # Enables O3, O3mini, O4-mini, O4-mini-high
```
**Available Models:**
- **`pro`** (Gemini 2.5 Pro): Extended thinking, deep analysis
- **`flash`** (Gemini 2.0 Flash): Ultra-fast responses
- **`o3`**: Strong logical reasoning
-- **`o3-mini`**: Balanced speed/quality
+- **`o3mini`**: Balanced speed/quality
- **`o4-mini`**: Latest reasoning model, optimized for shorter contexts
- **`o4-mini-high`**: Enhanced O4 with higher reasoning effort
- **Custom models**: via OpenRouter or local APIs (Ollama, vLLM, etc.)
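For illustration, a minimal `.env` sketch that pins one of these models as the default (key values are placeholders):

```bash
# .env (minimal example)
DEFAULT_MODEL=o4-mini            # or "auto" to let Claude pick per task
OPENAI_API_KEY=your-openai-key   # enables o3, o3mini, o4-mini, o4-mini-high
```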
For detailed configuration options, see the [Advanced Usage Guide](docs/advanced-usage.md).
## Testing
For information on running tests and contributing, see the [Testing Guide](docs/testing.md).
## License
Apache 2.0 License - see LICENSE file for details.

View File

@@ -17,7 +17,7 @@ Usage:
--tests: Run specific tests only (space-separated)
--list-tests: List all available tests
--individual: Run a single test individually
---rebuild: Force rebuild Docker environment using setup-docker.sh
+--rebuild: Force rebuild Docker environment using run-server.sh
Available tests:
basic_conversation - Basic conversation flow with chat tool
@@ -115,9 +115,9 @@ class CommunicationSimulator:
self.temp_dir = tempfile.mkdtemp(prefix="mcp_test_")
self.logger.debug(f"Created temp directory: {self.temp_dir}")
-# Only run setup-docker.sh if rebuild is requested
+# Only run run-server.sh if rebuild is requested
if self.rebuild:
-if not self._run_setup_docker():
+if not self._run_server_script():
return False
# Always verify containers are running (regardless of rebuild)
@@ -127,34 +127,34 @@ class CommunicationSimulator:
self.logger.error(f"Failed to setup test environment: {e}") self.logger.error(f"Failed to setup test environment: {e}")
return False return False
def _run_setup_docker(self) -> bool: def _run_server_script(self) -> bool:
"""Run the setup-docker.sh script""" """Run the run-server.sh script"""
try: try:
self.logger.info("Running setup-docker.sh...") self.logger.info("Running run-server.sh...")
# Check if setup-docker.sh exists # Check if run-server.sh exists
setup_script = "./setup-docker.sh" setup_script = "./run-server.sh"
if not os.path.exists(setup_script): if not os.path.exists(setup_script):
self.logger.error(f"setup-docker.sh not found at {setup_script}") self.logger.error(f"run-server.sh not found at {setup_script}")
return False return False
# Make sure it's executable # Make sure it's executable
result = self._run_command(["chmod", "+x", setup_script], capture_output=True) result = self._run_command(["chmod", "+x", setup_script], capture_output=True)
if result.returncode != 0: if result.returncode != 0:
self.logger.error(f"Failed to make setup-docker.sh executable: {result.stderr}") self.logger.error(f"Failed to make run-server.sh executable: {result.stderr}")
return False return False
# Run the setup script # Run the setup script
result = self._run_command([setup_script], capture_output=True) result = self._run_command([setup_script], capture_output=True)
if result.returncode != 0: if result.returncode != 0:
self.logger.error(f"setup-docker.sh failed: {result.stderr}") self.logger.error(f"run-server.sh failed: {result.stderr}")
return False return False
self.logger.info("setup-docker.sh completed successfully") self.logger.info("run-server.sh completed successfully")
return True return True
except Exception as e: except Exception as e:
self.logger.error(f"Failed to run setup-docker.sh: {e}") self.logger.error(f"Failed to run run-server.sh: {e}")
return False return False
def _verify_existing_containers(self) -> bool: def _verify_existing_containers(self) -> bool:
@@ -345,9 +345,9 @@ class CommunicationSimulator:
try:
self.logger.info("Cleaning up test environment...")
-# Note: We don't stop Docker services ourselves - let setup-docker.sh handle Docker lifecycle
+# Note: We don't stop Docker services ourselves - let run-server.sh handle Docker lifecycle
if not self.keep_logs:
-self.logger.info("Test completed. Docker containers left running (use setup-docker.sh to manage)")
+self.logger.info("Test completed. Docker containers left running (use run-server.sh to manage)")
else:
self.logger.info("Keeping logs and Docker services running for inspection")
@@ -375,7 +375,7 @@ def parse_arguments():
parser.add_argument("--tests", "-t", nargs="+", help="Specific tests to run (space-separated)") parser.add_argument("--tests", "-t", nargs="+", help="Specific tests to run (space-separated)")
parser.add_argument("--list-tests", action="store_true", help="List available tests and exit") parser.add_argument("--list-tests", action="store_true", help="List available tests and exit")
parser.add_argument("--individual", "-i", help="Run a single test individually") parser.add_argument("--individual", "-i", help="Run a single test individually")
parser.add_argument("--rebuild", action="store_true", help="Force rebuild Docker environment using setup-docker.sh") parser.add_argument("--rebuild", action="store_true", help="Force rebuild Docker environment using run-server.sh")
return parser.parse_args() return parser.parse_args()

View File

@@ -130,15 +130,42 @@
"supports_function_calling": true, "supports_function_calling": true,
"description": "OpenAI's o3 model - well-rounded and powerful across domains" "description": "OpenAI's o3 model - well-rounded and powerful across domains"
}, },
{
"model_name": "openai/o3-mini",
"aliases": ["o3-mini", "o3mini"],
"context_window": 200000,
"supports_extended_thinking": false,
"supports_json_mode": true,
"supports_function_calling": true,
"description": "OpenAI's o3-mini model - balanced performance and speed"
},
{
"model_name": "openai/o3-mini-high",
-"aliases": ["o3-mini", "o3mini", "o3-mini-high", "o3mini-high"],
+"aliases": ["o3-mini-high", "o3mini-high"],
"context_window": 200000,
"supports_extended_thinking": false,
"supports_json_mode": true,
"supports_function_calling": true,
"description": "OpenAI's o3-mini with high reasoning effort - optimized for complex problems"
},
{
"model_name": "openai/o4-mini",
"aliases": ["o4-mini", "o4mini"],
"context_window": 200000,
"supports_extended_thinking": false,
"supports_json_mode": true,
"supports_function_calling": true,
"description": "OpenAI's o4-mini model - optimized for shorter contexts with rapid reasoning"
},
{
"model_name": "openai/o4-mini-high",
"aliases": ["o4-mini-high", "o4mini-high", "o4minihigh", "o4minihi"],
"context_window": 200000,
"supports_extended_thinking": false,
"supports_json_mode": true,
"supports_function_calling": true,
"description": "OpenAI's o4-mini with high reasoning effort - enhanced for complex tasks"
},
{
"model_name": "llama3.2",
"aliases": ["local-llama", "local", "llama3.2", "ollama-llama"],

View File

@@ -14,7 +14,7 @@ import os
# These values are used in server responses and for tracking releases
# IMPORTANT: This is the single source of truth for version and author info
# Semantic versioning: MAJOR.MINOR.PATCH
-__version__ = "4.3.0"
+__version__ = "4.3.1"
# Last update date in ISO format
__updated__ = "2025-06-14"
# Primary maintainer
@@ -32,23 +32,44 @@ IS_AUTO_MODE = DEFAULT_MODEL.lower() == "auto"
# Model capabilities descriptions for auto mode
# These help Claude choose the best model for each task
#
# IMPORTANT: These are the built-in natively supported models:
# - When GEMINI_API_KEY is set: Enables "flash", "pro" (and their full names)
# - When OPENAI_API_KEY is set: Enables "o3", "o3mini", "o4-mini", "o4-mini-high"
# - When both are set: All models below are available
# - When neither is set but OpenRouter/Custom API is configured: These model
# aliases will automatically map to equivalent models via the proxy provider
#
# In auto mode (DEFAULT_MODEL=auto), Claude will see these descriptions and
# intelligently select the best model for each task. The descriptions appear
# in the tool schema to guide Claude's selection based on task requirements.
MODEL_CAPABILITIES_DESC = {
# Gemini models - Available when GEMINI_API_KEY is configured
"flash": "Ultra-fast (1M context) - Quick analysis, simple queries, rapid iterations",
"pro": "Deep reasoning + thinking mode (1M context) - Complex problems, architecture, deep analysis",
# OpenAI models - Available when OPENAI_API_KEY is configured
"o3": "Strong reasoning (200K context) - Logical problems, code generation, systematic analysis",
"o3-mini": "Fast O3 variant (200K context) - Balanced performance/speed, moderate complexity",
"o4-mini": "Latest reasoning model (200K context) - Optimized for shorter contexts, rapid reasoning",
"o4-mini-high": "Enhanced O4 mini (200K context) - Higher reasoning effort for complex tasks",
-# Full model names also supported
+# Full model names also supported (for explicit specification)
"gemini-2.5-flash-preview-05-20": "Ultra-fast (1M context) - Quick analysis, simple queries, rapid iterations",
"gemini-2.5-pro-preview-06-05": (
"Deep reasoning + thinking mode (1M context) - Complex problems, architecture, deep analysis"
),
}
-# Note: When only OpenRouter is configured, these model aliases automatically map to equivalent models:
-# - "flash" → "google/gemini-2.5-flash-preview-05-20"
-# - "pro" → "google/gemini-2.5-pro-preview-06-05"
-# - "o3" → "openai/gpt-4o"
-# - "o3-mini" → "openai/gpt-4o-mini"
+# OpenRouter/Custom API Fallback Behavior:
+# When only OpenRouter or Custom API is configured (no native API keys), these
+# model aliases automatically map to equivalent models through the proxy:
+# - "flash" → "google/gemini-2.5-flash-preview-05-20" (via OpenRouter)
+# - "pro" → "google/gemini-2.5-pro-preview-06-05" (via OpenRouter)
+# - "o3" → "openai/o3" (via OpenRouter)
+# - "o3mini" → "openai/o3-mini" (via OpenRouter)
+# - "o4-mini" → "openai/o4-mini" (via OpenRouter)
+# - "o4-mini-high" → "openai/o4-mini-high" (via OpenRouter)
+#
+# This ensures the same model names work regardless of which provider is configured.
# Temperature defaults for different tool types

View File

@@ -55,6 +55,8 @@ DEFAULT_MODEL=flash # Always use Flash
DEFAULT_MODEL=o3 # Always use O3
```
**Important:** After changing any configuration in `.env` (including `DEFAULT_MODEL`, API keys, or other settings), restart the server with `./run-server.sh` to apply the changes.
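For example, a typical edit-and-restart cycle (a minimal sketch; any editor works):

```bash
# edit the configuration
nano .env            # e.g. change DEFAULT_MODEL or add an API key

# restart so the running server picks up the new values
./run-server.sh
```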
**Per-Request Model Override:**
Regardless of your default setting, you can specify models per request:
- "Use **pro** for deep security analysis of auth.py"

docs/testing.md (new file)
View File

@@ -0,0 +1,126 @@
# Testing Guide
This project includes comprehensive test coverage through unit tests and integration simulator tests.
## Running Tests
### Prerequisites
- Python virtual environment activated: `source venv/bin/activate`
- All dependencies installed: `pip install -r requirements.txt`
- Docker containers running (for simulator tests): `./run-server.sh`
### Unit Tests
Run all unit tests with pytest:
```bash
# Run all tests with verbose output
python -m pytest -xvs
# Run specific test file
python -m pytest tests/test_providers.py -xvs
```
### Simulator Tests
Simulator tests replicate real-world Claude CLI interactions with the MCP server running in Docker. Unlike unit tests that test isolated functions, simulator tests validate the complete end-to-end flow including:
- Actual MCP protocol communication
- Docker container interactions
- Multi-turn conversations across tools
- Log output validation
**Important**: Simulator tests require `LOG_LEVEL=DEBUG` in your `.env` file to validate detailed execution logs.
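A minimal sketch of the relevant `.env` line (alongside whatever API keys you already have configured):

```bash
LOG_LEVEL=DEBUG
```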
#### Running All Simulator Tests
```bash
# Run all simulator tests
python communication_simulator_test.py
# Run with verbose output for debugging
python communication_simulator_test.py --verbose
# Keep Docker logs after tests for inspection
python communication_simulator_test.py --keep-logs
```
#### Running Individual Tests
To run a single simulator test in isolation (useful for debugging or test development):
```bash
# Run a specific test by name
python communication_simulator_test.py --individual basic_conversation
# Examples of available tests:
python communication_simulator_test.py --individual content_validation
python communication_simulator_test.py --individual cross_tool_continuation
python communication_simulator_test.py --individual redis_validation
```
#### Other Options
```bash
# List all available simulator tests with descriptions
python communication_simulator_test.py --list-tests
# Run multiple specific tests (not all)
python communication_simulator_test.py --tests basic_conversation content_validation
# Force Docker environment rebuild before running tests
python communication_simulator_test.py --rebuild
```
### Code Quality Checks
Before committing, ensure all linting passes:
```bash
# Run all linting checks
ruff check .
black --check .
isort --check-only .
# Auto-fix issues
ruff check . --fix
black .
isort .
```
## What Each Test Suite Covers
### Unit Tests (256 tests)
Test isolated components and functions:
- **Provider functionality**: Model initialization, API interactions, capability checks
- **Tool operations**: All MCP tools (chat, analyze, debug, etc.)
- **Conversation memory**: Threading, continuation, history management
- **File handling**: Path validation, token limits, deduplication
- **Auto mode**: Model selection logic and fallback behavior
### Simulator Tests (14 tests)
Validate real-world usage scenarios by simulating actual Claude prompts:
- **Basic conversations**: Multi-turn chat functionality with real prompts
- **Cross-tool continuation**: Context preservation across different tools
- **File deduplication**: Efficient handling of repeated file references
- **Model selection**: Proper routing to configured providers
- **Token allocation**: Context window management in practice
- **Redis validation**: Conversation persistence and retrieval
## Contributing: Test Requirements
When contributing to this project:
1. **New features MUST include tests**:
- Add unit tests in `tests/` for new functions or classes (see the example sketch after this list)
- Test both success and error cases
2. **Tool changes require simulator tests**:
- Add simulator tests in `simulator_tests/` for new or modified tools
- Use realistic prompts that demonstrate the feature
- Validate output through Docker logs
3. **Test naming conventions**:
- Unit tests: `test_<feature>_<scenario>.py`
- Simulator tests: `test_<tool>_<behavior>.py`
4. **Before submitting PR**:
- Run all unit tests: `python -m pytest -xvs`
- Run relevant simulator tests
- Ensure all linting passes
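As a rough illustration of these requirements, a minimal unit-test sketch (the module, function, and file names here are hypothetical, not part of the repository):

```python
# tests/test_example_feature.py  (hypothetical file name, for illustration only)
import pytest

from example_module import parse_model_alias  # hypothetical function under test


def test_parse_model_alias_success():
    # Success case: a known alias resolves to its canonical model name
    assert parse_model_alias("o3mini") == "o3-mini"


def test_parse_model_alias_error():
    # Error case: an unknown alias should raise rather than pass through silently
    with pytest.raises(ValueError):
        parse_model_alias("not-a-model")
```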
Remember: Tests are documentation. They show how features are intended to be used and help prevent regressions.

View File

@@ -43,7 +43,7 @@ cat .env
If you need to update your API keys, edit the `.env` file and then run:
```bash
-./setup-docker.sh
+./run-server.sh
```
This will validate your configuration and restart the services.
@@ -73,7 +73,7 @@ See [Logging Documentation](logging.md) for more details on accessing logs.
**"API key environment variable is required"** **"API key environment variable is required"**
- Add your API key to the `.env` file - Add your API key to the `.env` file
- Run: `./setup-docker.sh` to validate and restart - Run: `./run-server.sh` to validate and restart
**File path errors** **File path errors**
- Always use absolute paths: `/Users/you/project/file.py` - Always use absolute paths: `/Users/you/project/file.py`

View File

@@ -1,7 +1,7 @@
{
"comment": "macOS configuration using Docker",
"comment2": "Ensure Docker is running and containers are started",
-"comment3": "Run './setup-docker.sh' first to set up the environment",
+"comment3": "Run './run-server.sh' first to set up the environment",
"mcpServers": {
"zen": {
"command": "docker",

View File

@@ -1,7 +1,7 @@
{
"comment": "Windows configuration using WSL with Docker",
"comment2": "Ensure Docker Desktop is running and WSL integration is enabled",
-"comment3": "Run './setup-docker.sh' in WSL first to set up the environment",
+"comment3": "Run './run-server.sh' in WSL first to set up the environment",
"mcpServers": {
"zen": {
"command": "wsl.exe",

View File

@@ -22,6 +22,19 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
"context_window": 200_000, # 200K tokens "context_window": 200_000, # 200K tokens
"supports_extended_thinking": False, "supports_extended_thinking": False,
}, },
"o4-mini": {
"context_window": 200_000, # 200K tokens
"supports_extended_thinking": False,
},
"o4-mini-high": {
"context_window": 200_000, # 200K tokens
"supports_extended_thinking": False,
},
# Shorthands
"o3mini": "o3-mini",
"o4mini": "o4-mini",
"o4minihigh": "o4-mini-high",
"o4minihi": "o4-mini-high",
}
def __init__(self, api_key: str, **kwargs):
@@ -32,14 +45,17 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
def get_capabilities(self, model_name: str) -> ModelCapabilities:
"""Get capabilities for a specific OpenAI model."""
-if model_name not in self.SUPPORTED_MODELS:
+# Resolve shorthand
+resolved_name = self._resolve_model_name(model_name)
+if resolved_name not in self.SUPPORTED_MODELS or isinstance(self.SUPPORTED_MODELS[resolved_name], str):
raise ValueError(f"Unsupported OpenAI model: {model_name}")
-config = self.SUPPORTED_MODELS[model_name]
+config = self.SUPPORTED_MODELS[resolved_name]
# Define temperature constraints per model
-if model_name in ["o3", "o3-mini"]:
+if resolved_name in ["o3", "o3-mini", "o4-mini", "o4-mini-high"]:
-# O3 models only support temperature=1.0
+# O3 and O4 reasoning models only support temperature=1.0
temp_constraint = FixedTemperatureConstraint(1.0)
else:
# Other OpenAI models support 0.0-2.0 range
@@ -63,10 +79,19 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
def validate_model_name(self, model_name: str) -> bool:
"""Validate if the model name is supported."""
-return model_name in self.SUPPORTED_MODELS
+resolved_name = self._resolve_model_name(model_name)
+return resolved_name in self.SUPPORTED_MODELS and isinstance(self.SUPPORTED_MODELS[resolved_name], dict)
def supports_thinking_mode(self, model_name: str) -> bool:
"""Check if the model supports extended thinking mode."""
# Currently no OpenAI models support extended thinking
# This may change with future O3 models
return False
def _resolve_model_name(self, model_name: str) -> str:
"""Resolve model shorthand to full name."""
# Check if it's a shorthand
shorthand_value = self.SUPPORTED_MODELS.get(model_name)
if isinstance(shorthand_value, str):
return shorthand_value
return model_name
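For reference, a small usage sketch of the shorthand resolution, mirroring the provider tests later in this commit (the API key value is a placeholder):

```python
provider = OpenAIModelProvider(api_key="test-key")

# Shorthands resolve to their canonical entries before lookup
assert provider.validate_model_name("o4mini")        # alias for "o4-mini"
assert provider.validate_model_name("o4-mini-high")
assert provider.get_capabilities("o4-mini").context_window == 200_000

# Unknown or unsupported names are rejected
assert not provider.validate_model_name("gpt-4o")
```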

View File

@@ -3,8 +3,12 @@
# Exit on any error, undefined variables, and pipe failures
set -euo pipefail
-# Modern Docker setup script for Zen MCP Server with Redis
-# This script sets up the complete Docker environment including Redis for conversation threading
+# Run/Restart script for Zen MCP Server with Redis
+# This script builds, starts, and manages the Docker environment including Redis for conversation threading
# Run this script to:
# - Perform initial setup of the Docker environment
# - Restart services after changing .env configuration
# - Rebuild and restart after code changes
# Spinner function for long-running operations
show_spinner() {

View File

@@ -71,6 +71,15 @@ class O3ModelSelectionTest(BaseSimulatorTest):
self.logger.info(" Only OpenRouter configured - O3 models will be routed through OpenRouter") self.logger.info(" Only OpenRouter configured - O3 models will be routed through OpenRouter")
return self._run_openrouter_o3_test() return self._run_openrouter_o3_test()
# If neither OpenAI nor OpenRouter is configured, skip the test
if not has_openai and not has_openrouter:
self.logger.info(" ⚠️ Neither OpenAI nor OpenRouter API keys configured - skipping test")
self.logger.info(
" This test requires either OPENAI_API_KEY or OPENROUTER_API_KEY to be set in .env"
)
self.logger.info(" ✅ Test skipped (no API keys configured)")
return True # Return True to indicate test passed/skipped
# Original test for when OpenAI is configured
self.logger.info(" OpenAI API configured - expecting direct OpenAI API calls")

View File

@@ -85,7 +85,9 @@ def mock_provider_availability(request, monkeypatch):
the tools don't require model selection unless explicitly testing auto mode.
"""
# Skip this fixture for tests that need real providers
-if hasattr(request, "node") and request.node.get_closest_marker("no_mock_provider"):
+if hasattr(request, "node"):
+marker = request.node.get_closest_marker("no_mock_provider")
+if marker:
return
from unittest.mock import MagicMock
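For reference, a sketch of how a test opts out of this fixture via the marker (assuming `no_mock_provider` is registered in the project's pytest configuration):

```python
import pytest


@pytest.mark.no_mock_provider  # skip the autouse provider mocking for this test
def test_something_with_real_providers():
    # Exercises code paths that need genuinely configured providers
    ...
```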

View File

@@ -2,7 +2,6 @@
Test that conversation history is correctly mapped to tool-specific fields
"""
-import os
from datetime import datetime
from unittest.mock import MagicMock, patch
@@ -130,8 +129,7 @@ async def test_unknown_tool_defaults_to_prompt():
with patch("utils.conversation_memory.get_thread", return_value=mock_context): with patch("utils.conversation_memory.get_thread", return_value=mock_context):
with patch("utils.conversation_memory.add_turn", return_value=True): with patch("utils.conversation_memory.add_turn", return_value=True):
with patch("utils.conversation_memory.build_conversation_history", return_value=("History", 500)): with patch("utils.conversation_memory.build_conversation_history", return_value=("History", 500)):
# The test uses the conftest fixture which should handle provider mocking # The autouse fixture should handle provider mocking
# We just need to ensure the arguments are correct
arguments = { arguments = {
"continuation_id": "test-thread-456", "continuation_id": "test-thread-456",
"prompt": "User input", "prompt": "User input",

View File

@@ -72,7 +72,10 @@ class TestOpenRouterProvider:
assert provider._resolve_model_name("opus") == "anthropic/claude-3-opus" assert provider._resolve_model_name("opus") == "anthropic/claude-3-opus"
assert provider._resolve_model_name("sonnet") == "anthropic/claude-3-sonnet" assert provider._resolve_model_name("sonnet") == "anthropic/claude-3-sonnet"
assert provider._resolve_model_name("o3") == "openai/o3" assert provider._resolve_model_name("o3") == "openai/o3"
assert provider._resolve_model_name("o3-mini") == "openai/o3-mini-high" assert provider._resolve_model_name("o3-mini") == "openai/o3-mini"
assert provider._resolve_model_name("o3mini") == "openai/o3-mini"
assert provider._resolve_model_name("o4-mini") == "openai/o4-mini"
assert provider._resolve_model_name("o4-mini-high") == "openai/o4-mini-high"
assert provider._resolve_model_name("claude") == "anthropic/claude-3-sonnet" assert provider._resolve_model_name("claude") == "anthropic/claude-3-sonnet"
assert provider._resolve_model_name("mistral") == "mistral/mistral-large" assert provider._resolve_model_name("mistral") == "mistral/mistral-large"
assert provider._resolve_model_name("deepseek") == "deepseek/deepseek-r1-0528" assert provider._resolve_model_name("deepseek") == "deepseek/deepseek-r1-0528"

View File

@@ -183,12 +183,31 @@ class TestOpenAIProvider:
assert capabilities.context_window == 200_000
assert not capabilities.supports_extended_thinking
def test_get_capabilities_o4_mini(self):
"""Test getting O4-mini model capabilities"""
provider = OpenAIModelProvider(api_key="test-key")
capabilities = provider.get_capabilities("o4-mini")
assert capabilities.provider == ProviderType.OPENAI
assert capabilities.model_name == "o4-mini"
assert capabilities.context_window == 200_000
assert not capabilities.supports_extended_thinking
# Check temperature constraint is fixed at 1.0
assert capabilities.temperature_constraint.value == 1.0
def test_validate_model_names(self):
"""Test model name validation"""
provider = OpenAIModelProvider(api_key="test-key")
assert provider.validate_model_name("o3")
-assert provider.validate_model_name("o3-mini")
+assert provider.validate_model_name("o3mini")
assert provider.validate_model_name("o3-mini") # Backwards compatibility
assert provider.validate_model_name("o4-mini")
assert provider.validate_model_name("o4mini")
assert provider.validate_model_name("o4-mini-high")
assert provider.validate_model_name("o4minihigh")
assert provider.validate_model_name("o4minihi")
assert not provider.validate_model_name("gpt-4o") assert not provider.validate_model_name("gpt-4o")
assert not provider.validate_model_name("invalid-model") assert not provider.validate_model_name("invalid-model")
@@ -197,4 +216,7 @@ class TestOpenAIProvider:
provider = OpenAIModelProvider(api_key="test-key")
assert not provider.supports_thinking_mode("o3")
assert not provider.supports_thinking_mode("o3mini")
assert not provider.supports_thinking_mode("o3-mini") assert not provider.supports_thinking_mode("o3-mini")
assert not provider.supports_thinking_mode("o4-mini")
assert not provider.supports_thinking_mode("o4-mini-high")