Merge branch 'main' into grok4-support

2025-08-07 23:04:15 -07:00
parent 22b22f7518 fcb0fe3ef2
commit 8a884c57d6
52 changed files with 2400 additions and 637 deletions
--- a/.env.example
+++ b/.env.example
@@ -37,13 +37,13 @@ OPENROUTER_API_KEY=your_openrouter_api_key_here
 # Optional: Default model to use
 # Options: 'auto' (Claude picks best model), 'pro', 'flash', 'o3', 'o3-mini', 'o4-mini', 'o4-mini-high',
-#          'grok', 'opus-4', 'sonnet-4', or any DIAL model if DIAL is configured
+#          'gpt-5', 'gpt-5-mini', 'grok', 'opus-4', 'sonnet-4', or any DIAL model if DIAL is configured
 # When set to 'auto', Claude will select the best model for each task
 # Defaults to 'auto' if not specified
 DEFAULT_MODEL=auto
 # Optional: Default thinking mode for ThinkDeep tool
-# NOTE: Only applies to models that support extended thinking (e.g., Gemini 2.5 Pro)
+# NOTE: Only applies to models that support extended thinking (e.g., Gemini 2.5 Pro, GPT-5 models)
 #       Flash models (2.0) will use system prompt engineering instead
 # Token consumption per mode:
 #   minimal: 128 tokens   - Quick analysis, fastest response
@@ -65,6 +65,8 @@ DEFAULT_THINKING_MODE_THINKDEEP=high
 #   - o3-mini          (200K context, balanced)
 #   - o4-mini          (200K context, latest balanced, temperature=1.0 only)
 #   - o4-mini-high     (200K context, enhanced reasoning, temperature=1.0 only)
 #   - gpt-5            (400K context, 128K output, reasoning tokens)
 #   - gpt-5-mini       (400K context, 128K output, reasoning tokens)
 #   - mini             (shorthand for o4-mini)
 #
 # Supported Google/Gemini models:
--- a/config.py
+++ b/config.py
@@ -14,9 +14,9 @@ import os
 # These values are used in server responses and for tracking releases
 # IMPORTANT: This is the single source of truth for version and author info
 # Semantic versioning: MAJOR.MINOR.PATCH
-__version__ = "5.8.2"
+__version__ = "5.8.3"
 # Last update date in ISO format
-__updated__ = "2025-06-30"
+__updated__ = "2025-08-08"
 # Primary maintainer
 __author__ = "Fahad Gilani"
@@ -75,10 +75,10 @@ DEFAULT_CONSENSUS_MAX_INSTANCES_PER_COMBINATION = 2
 #
 # IMPORTANT: This limit ONLY applies to the Claude CLI ↔ MCP Server transport boundary.
 # It does NOT limit internal MCP Server operations like system prompts, file embeddings,
-# conversation history, or content sent to external models (Gemini/O3/OpenRouter).
+# conversation history, or content sent to external models (Gemini/OpenAI/OpenRouter).
 #
 # MCP Protocol Architecture:
-# Claude CLI ←→ MCP Server ←→ External Model (Gemini/O3/etc.)
+# Claude CLI ←→ MCP Server ←→ External Model (Gemini/OpenAI/etc.)
 #     ↑                              ↑
 #     │                              │
 # MCP transport                Internal processing
--- a/docs/testing.md
+++ b/docs/testing.md
@@ -115,6 +115,14 @@ Test isolated components and functions:
 - **File handling**: Path validation, token limits, deduplication
 - **Auto mode**: Model selection logic and fallback behavior
 ### HTTP Recording/Replay Tests (HTTP Transport Recorder)
 Tests for expensive API calls (like o3-pro) use custom recording/replay:
 - **Real API validation**: Tests against actual provider responses
 - **Cost efficiency**: Record once, replay forever
 - **Provider compatibility**: Validates fixes against real APIs
 - Uses HTTP Transport Recorder for httpx-based API calls
 - See [HTTP Recording/Replay Testing Guide](./vcr-testing.md) for details
 ### Simulator Tests
 Validate real-world usage scenarios by simulating actual Claude prompts:
 - **Basic conversations**: Multi-turn chat functionality with real prompts
--- a/docs/vcr-testing.md
+++ b/docs/vcr-testing.md
@@ -0,0 +1,128 @@
 # HTTP Transport Recorder for Testing
 A custom HTTP recorder for testing expensive API calls (like o3-pro) with real responses.
 ## Overview
 The HTTP Transport Recorder captures and replays HTTP interactions at the transport layer, enabling:
 - Cost-efficient testing of expensive APIs (record once, replay forever)
 - Deterministic tests with real API responses
 - Seamless integration with httpx and OpenAI SDK
 - Automatic PII sanitization for secure recordings
 ## Quick Start
 ```python
 from tests.transport_helpers import inject_transport
 # Simple one-line setup with automatic transport injection
 def test_expensive_api_call(monkeypatch):
    inject_transport(monkeypatch, "tests/openai_cassettes/my_test.json")
    # Make API calls - automatically recorded/replayed with PII sanitization
    result = await chat_tool.execute({"prompt": "2+2?", "model": "o3-pro"})
 ```
 ## How It Works
 1. **First run** (cassette doesn't exist): Records real API calls
 2. **Subsequent runs** (cassette exists): Replays saved responses
 3. **Re-record**: Delete cassette file and run again
 ## Usage in Tests
 The `transport_helpers.inject_transport()` function simplifies test setup:
 ```python
 from tests.transport_helpers import inject_transport
 async def test_with_recording(monkeypatch):
    # One-line setup - handles all transport injection complexity
    inject_transport(monkeypatch, "tests/openai_cassettes/my_test.json")
    # Use API normally - recording/replay happens transparently
    result = await chat_tool.execute({"prompt": "2+2?", "model": "o3-pro"})
 ```
 For manual setup, see `test_o3_pro_output_text_fix.py`.
 ## Automatic PII Sanitization
 All recordings are automatically sanitized to remove sensitive data:
 - **API Keys & Tokens**: Bearer tokens, API keys, and auth headers
 - **Personal Data**: Email addresses, IP addresses, phone numbers
 - **URLs**: Sensitive query parameters and paths
 - **Custom Patterns**: Add your own sanitization rules
 Sanitization is enabled by default in `RecordingTransport`. To disable:
 ```python
 transport = TransportFactory.create_transport(cassette_path, sanitize=False)
 ```
 ## File Structure
 ```
 tests/
 ├── openai_cassettes/           # Recorded API interactions
 │   └── *.json                  # Cassette files
 ├── http_transport_recorder.py  # Transport implementation
 ├── pii_sanitizer.py           # Automatic PII sanitization
 ├── transport_helpers.py       # Simplified transport injection
 ├── sanitize_cassettes.py      # Batch sanitization script
 └── test_o3_pro_output_text_fix.py  # Example usage
 ```
 ## Sanitizing Existing Cassettes
 Use the `sanitize_cassettes.py` script to clean existing recordings:
 ```bash
 # Sanitize all cassettes (creates backups)
 python tests/sanitize_cassettes.py
 # Sanitize specific cassette
 python tests/sanitize_cassettes.py tests/openai_cassettes/my_test.json
 # Skip backup creation
 python tests/sanitize_cassettes.py --no-backup
 ```
 The script will:
 - Create timestamped backups of original files
 - Apply comprehensive PII sanitization
 - Preserve JSON structure and functionality
 ## Cost Management
 - **One-time cost**: Initial recording only
 - **Zero ongoing cost**: Replays are free
 - **CI-friendly**: No API keys needed for replay
 ## Re-recording
 When API changes require new recordings:
 ```bash
 # Delete specific cassette
 rm tests/openai_cassettes/my_test.json
 # Run test with real API key
 python -m pytest tests/test_o3_pro_output_text_fix.py
 ```
 ## Implementation Details
 - **RecordingTransport**: Captures real HTTP calls with automatic PII sanitization
 - **ReplayTransport**: Serves saved responses from cassettes
 - **TransportFactory**: Auto-selects mode based on cassette existence
 - **PIISanitizer**: Comprehensive sanitization of sensitive data (integrated by default)
 **Security Note**: While recordings are automatically sanitized, always review new cassette files before committing. The sanitizer removes known patterns of sensitive data, but domain-specific secrets may need custom rules.
 For implementation details, see:
 - `tests/http_transport_recorder.py` - Core transport implementation
 - `tests/pii_sanitizer.py` - Sanitization patterns and logic
 - `tests/transport_helpers.py` - Simplified test integration
--- a/providers/base.py
+++ b/providers/base.py
@@ -4,7 +4,10 @@ import logging
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import Any, Optional
+from typing import TYPE_CHECKING, Any, Optional
 if TYPE_CHECKING:
    from tools.models import ToolModelCategory
 logger = logging.getLogger(__name__)
@@ -118,10 +121,10 @@ def create_temperature_constraint(constraint_type: str) -> TemperatureConstraint
        return FixedTemperatureConstraint(1.0)
    elif constraint_type == "discrete":
        # For models with specific allowed values - using common OpenAI values as default
-        return DiscreteTemperatureConstraint([0.0, 0.3, 0.7, 1.0, 1.5, 2.0], 0.7)
+        return DiscreteTemperatureConstraint([0.0, 0.3, 0.7, 1.0, 1.5, 2.0], 0.3)
    else:
        # Default range constraint (for "range" or None)
-        return RangeTemperatureConstraint(0.0, 2.0, 0.7)
+        return RangeTemperatureConstraint(0.0, 2.0, 0.3)
@dataclass
@@ -154,24 +157,11 @@ class ModelCapabilities:
    # Custom model flag (for models that only work with custom endpoints)
    is_custom: bool = False  # Whether this model requires custom API endpoints
-    # Temperature constraint object - preferred way to define temperature limits
+    # Temperature constraint object - defines temperature limits and behavior
    temperature_constraint: TemperatureConstraint = field(
-        default_factory=lambda: RangeTemperatureConstraint(0.0, 2.0, 0.7)
+        default_factory=lambda: RangeTemperatureConstraint(0.0, 2.0, 0.3)
    )
    # Backward compatibility property for existing code
    @property
    def temperature_range(self) -> tuple[float, float]:
        """Backward compatibility for existing code that uses temperature_range."""
        if isinstance(self.temperature_constraint, RangeTemperatureConstraint):
            return (self.temperature_constraint.min_temp, self.temperature_constraint.max_temp)
        elif isinstance(self.temperature_constraint, FixedTemperatureConstraint):
            return (self.temperature_constraint.value, self.temperature_constraint.value)
        elif isinstance(self.temperature_constraint, DiscreteTemperatureConstraint):
            values = self.temperature_constraint.allowed_values
            return (min(values), max(values))
        return (0.0, 2.0)  # Fallback
@dataclass
 class ModelResponse:
@@ -212,7 +202,7 @@ class ModelProvider(ABC):
        prompt: str,
        model_name: str,
        system_prompt: Optional[str] = None,
-        temperature: float = 0.7,
+        temperature: float = 0.3,
        max_output_tokens: Optional[int] = None,
        **kwargs,
    ) -> ModelResponse:
@@ -268,18 +258,15 @@ class ModelProvider(ABC):
            if not capabilities.supports_temperature:
                return None
-            # Get temperature range
+            # Use temperature constraint to get corrected value
-            min_temp, max_temp = capabilities.temperature_range
+            corrected_temp = capabilities.temperature_constraint.get_corrected_value(requested_temperature)
-            # Clamp to valid range
+            if corrected_temp != requested_temperature:
-            if requested_temperature < min_temp:
+                logger.debug(
-                logger.debug(f"Clamping temperature from {requested_temperature} to {min_temp} for model {model_name}")
+                    f"Adjusting temperature from {requested_temperature} to {corrected_temp} for model {model_name}"
-                return min_temp
+                )
-            elif requested_temperature > max_temp:
+
-                logger.debug(f"Clamping temperature from {requested_temperature} to {max_temp} for model {model_name}")
+            return corrected_temp
                return max_temp
            else:
                return requested_temperature
        except Exception as e:
            logger.debug(f"Could not determine effective temperature for {model_name}: {e}")
@@ -294,10 +281,10 @@ class ModelProvider(ABC):
        """
        capabilities = self.get_capabilities(model_name)
-        # Validate temperature
+        # Validate temperature using constraint
-        min_temp, max_temp = capabilities.temperature_range
+        if not capabilities.temperature_constraint.validate(temperature):
-        if not min_temp <= temperature <= max_temp:
+            constraint_desc = capabilities.temperature_constraint.get_description()
-            raise ValueError(f"Temperature {temperature} out of range [{min_temp}, {max_temp}] for model {model_name}")
+            raise ValueError(f"Temperature {temperature} is invalid for model {model_name}. {constraint_desc}")
    @abstractmethod
    def supports_thinking_mode(self, model_name: str) -> bool:
@@ -441,3 +428,28 @@ class ModelProvider(ABC):
        """
        # Base implementation: no resources to clean up
        return
    def get_preferred_model(self, category: "ToolModelCategory", allowed_models: list[str]) -> Optional[str]:
        """Get the preferred model from this provider for a given category.
        Args:
            category: The tool category requiring a model
            allowed_models: Pre-filtered list of model names that are allowed by restrictions
        Returns:
            Model name if this provider has a preference, None otherwise
        """
        # Default implementation - providers can override with specific logic
        return None
    def get_model_registry(self) -> Optional[dict[str, Any]]:
        """Get the model registry for providers that maintain one.
        This is a hook method for providers like CustomProvider that maintain
        a dynamic model registry.
        Returns:
            Model registry dict or None if not applicable
        """
        # Default implementation - most providers don't have a registry
        return None
--- a/providers/custom.py
+++ b/providers/custom.py
@@ -236,7 +236,7 @@ class CustomProvider(OpenAICompatibleProvider):
        prompt: str,
        model_name: str,
        system_prompt: Optional[str] = None,
-        temperature: float = 0.7,
+        temperature: float = 0.3,
        max_output_tokens: Optional[int] = None,
        **kwargs,
    ) -> ModelResponse:
--- a/providers/dial.py
+++ b/providers/dial.py
@@ -375,7 +375,7 @@ class DIALModelProvider(OpenAICompatibleProvider):
        prompt: str,
        model_name: str,
        system_prompt: Optional[str] = None,
-        temperature: float = 0.7,
+        temperature: float = 0.3,
        max_output_tokens: Optional[int] = None,
        images: Optional[list[str]] = None,
        **kwargs,
--- a/providers/gemini.py
+++ b/providers/gemini.py
@@ -4,7 +4,10 @@ import base64
 import logging
 import os
 import time
-from typing import Optional
+from typing import TYPE_CHECKING, Optional
 if TYPE_CHECKING:
    from tools.models import ToolModelCategory
 from google import genai
 from google.genai import types
@@ -19,6 +22,25 @@ class GeminiModelProvider(ModelProvider):
    # Model configurations using ModelCapabilities objects
    SUPPORTED_MODELS = {
        "gemini-2.5-pro": ModelCapabilities(
            provider=ProviderType.GOOGLE,
            model_name="gemini-2.5-pro",
            friendly_name="Gemini (Pro 2.5)",
            context_window=1_048_576,  # 1M tokens
            max_output_tokens=65_536,
            supports_extended_thinking=True,
            supports_system_prompts=True,
            supports_streaming=True,
            supports_function_calling=True,
            supports_json_mode=True,
            supports_images=True,  # Vision capability
            max_image_size_mb=32.0,  # Higher limit for Pro model
            supports_temperature=True,
            temperature_constraint=create_temperature_constraint("range"),
            max_thinking_tokens=32768,  # Max thinking tokens for Pro model
            description="Deep reasoning + thinking mode (1M context) - Complex problems, architecture, deep analysis",
            aliases=["pro", "gemini pro", "gemini-pro"],
        ),
        "gemini-2.0-flash": ModelCapabilities(
            provider=ProviderType.GOOGLE,
            model_name="gemini-2.0-flash",
@@ -75,25 +97,6 @@ class GeminiModelProvider(ModelProvider):
            description="Ultra-fast (1M context) - Quick analysis, simple queries, rapid iterations",
            aliases=["flash", "flash2.5"],
        ),
        "gemini-2.5-pro": ModelCapabilities(
            provider=ProviderType.GOOGLE,
            model_name="gemini-2.5-pro",
            friendly_name="Gemini (Pro 2.5)",
            context_window=1_048_576,  # 1M tokens
            max_output_tokens=65_536,
            supports_extended_thinking=True,
            supports_system_prompts=True,
            supports_streaming=True,
            supports_function_calling=True,
            supports_json_mode=True,
            supports_images=True,  # Vision capability
            max_image_size_mb=32.0,  # Higher limit for Pro model
            supports_temperature=True,
            temperature_constraint=create_temperature_constraint("range"),
            max_thinking_tokens=32768,  # Max thinking tokens for Pro model
            description="Deep reasoning + thinking mode (1M context) - Complex problems, architecture, deep analysis",
            aliases=["pro", "gemini pro", "gemini-pro"],
        ),
    }
    # Thinking mode configurations - percentages of model's max_thinking_tokens
@@ -152,7 +155,7 @@ class GeminiModelProvider(ModelProvider):
        prompt: str,
        model_name: str,
        system_prompt: Optional[str] = None,
-        temperature: float = 0.7,
+        temperature: float = 0.3,
        max_output_tokens: Optional[int] = None,
        thinking_mode: str = "medium",
        images: Optional[list[str]] = None,
@@ -465,3 +468,67 @@ class GeminiModelProvider(ModelProvider):
        except Exception as e:
            logger.error(f"Error processing image {image_path}: {e}")
            return None
    def get_preferred_model(self, category: "ToolModelCategory", allowed_models: list[str]) -> Optional[str]:
        """Get Gemini's preferred model for a given category from allowed models.
        Args:
            category: The tool category requiring a model
            allowed_models: Pre-filtered list of models allowed by restrictions
        Returns:
            Preferred model name or None
        """
        from tools.models import ToolModelCategory
        if not allowed_models:
            return None
        # Helper to find best model from candidates
        def find_best(candidates: list[str]) -> Optional[str]:
            """Return best model from candidates (sorted for consistency)."""
            return sorted(candidates, reverse=True)[0] if candidates else None
        if category == ToolModelCategory.EXTENDED_REASONING:
            # For extended reasoning, prefer models with thinking support
            # First try Pro models that support thinking
            pro_thinking = [
                m
                for m in allowed_models
                if "pro" in m and m in self.SUPPORTED_MODELS and self.SUPPORTED_MODELS[m].supports_extended_thinking
            ]
            if pro_thinking:
                return find_best(pro_thinking)
            # Then any model that supports thinking
            any_thinking = [
                m
                for m in allowed_models
                if m in self.SUPPORTED_MODELS and self.SUPPORTED_MODELS[m].supports_extended_thinking
            ]
            if any_thinking:
                return find_best(any_thinking)
            # Finally, just prefer Pro models even without thinking
            pro_models = [m for m in allowed_models if "pro" in m]
            if pro_models:
                return find_best(pro_models)
        elif category == ToolModelCategory.FAST_RESPONSE:
            # Prefer Flash models for speed
            flash_models = [m for m in allowed_models if "flash" in m]
            if flash_models:
                return find_best(flash_models)
        # Default for BALANCED or as fallback
        # Prefer Flash for balanced use, then Pro, then anything
        flash_models = [m for m in allowed_models if "flash" in m]
        if flash_models:
            return find_best(flash_models)
        pro_models = [m for m in allowed_models if "pro" in m]
        if pro_models:
            return find_best(pro_models)
        # Ultimate fallback to best available model
        return find_best(allowed_models)
--- a/providers/openai_compatible.py
+++ b/providers/openai_compatible.py
@@ -1,6 +1,7 @@
 """Base class for OpenAI-compatible API providers."""
 import base64
 import copy
 import ipaddress
 import logging
 import os
@@ -220,10 +221,20 @@ class OpenAICompatibleProvider(ModelProvider):
                # Create httpx client with minimal config to avoid proxy conflicts
                # Note: proxies parameter was removed in httpx 0.28.0
-                http_client = httpx.Client(
+                # Check for test transport injection
-                    timeout=timeout_config,
+                if hasattr(self, "_test_transport"):
-                    follow_redirects=True,
+                    # Use custom transport for testing (HTTP recording/replay)
-                )
+                    http_client = httpx.Client(
                        transport=self._test_transport,
                        timeout=timeout_config,
                        follow_redirects=True,
                    )
                else:
                    # Normal production client
                    http_client = httpx.Client(
                        timeout=timeout_config,
                        follow_redirects=True,
                    )
                # Keep client initialization minimal to avoid proxy parameter conflicts
                client_kwargs = {
@@ -264,6 +275,63 @@ class OpenAICompatibleProvider(ModelProvider):
        return self._client
    def _sanitize_for_logging(self, params: dict) -> dict:
        """Sanitize sensitive data from parameters before logging.
        Args:
            params: Dictionary of API parameters
        Returns:
            dict: Sanitized copy of parameters safe for logging
        """
        sanitized = copy.deepcopy(params)
        # Sanitize messages content
        if "input" in sanitized:
            for msg in sanitized.get("input", []):
                if isinstance(msg, dict) and "content" in msg:
                    for content_item in msg.get("content", []):
                        if isinstance(content_item, dict) and "text" in content_item:
                            # Truncate long text and add ellipsis
                            text = content_item["text"]
                            if len(text) > 100:
                                content_item["text"] = text[:100] + "... [truncated]"
        # Remove any API keys that might be in headers/auth
        sanitized.pop("api_key", None)
        sanitized.pop("authorization", None)
        return sanitized
    def _safe_extract_output_text(self, response) -> str:
        """Safely extract output_text from o3-pro response with validation.
        Args:
            response: Response object from OpenAI SDK
        Returns:
            str: The output text content
        Raises:
            ValueError: If output_text is missing, None, or not a string
        """
        logging.debug(f"Response object type: {type(response)}")
        logging.debug(f"Response attributes: {dir(response)}")
        if not hasattr(response, "output_text"):
            raise ValueError(f"o3-pro response missing output_text field. Response type: {type(response).__name__}")
        content = response.output_text
        logging.debug(f"Extracted output_text: '{content}' (type: {type(content)})")
        if content is None:
            raise ValueError("o3-pro returned None for output_text")
        if not isinstance(content, str):
            raise ValueError(f"o3-pro output_text is not a string. Got type: {type(content).__name__}")
        return content
    def _generate_with_responses_endpoint(
        self,
        model_name: str,
@@ -309,30 +377,23 @@ class OpenAICompatibleProvider(ModelProvider):
        max_retries = 4
        retry_delays = [1, 3, 5, 8]
        last_exception = None
        actual_attempts = 0
        for attempt in range(max_retries):
-            try:  # Log the exact payload being sent for debugging
+            try:  # Log sanitized payload for debugging
                import json
                sanitized_params = self._sanitize_for_logging(completion_params)
                logging.info(
-                    f"o3-pro API request payload: {json.dumps(completion_params, indent=2, ensure_ascii=False)}"
+                    f"o3-pro API request (sanitized): {json.dumps(sanitized_params, indent=2, ensure_ascii=False)}"
                )
                # Use OpenAI client's responses endpoint
                response = self.client.responses.create(**completion_params)
-                # Extract content and usage from responses endpoint format
+                # Extract content from responses endpoint format
-                # The response format is different for responses endpoint
+                # Use validation helper to safely extract output_text
-                content = ""
+                content = self._safe_extract_output_text(response)
                if hasattr(response, "output") and response.output:
                    if hasattr(response.output, "content") and response.output.content:
                        # Look for output_text in content
                        for content_item in response.output.content:
                            if hasattr(content_item, "type") and content_item.type == "output_text":
                                content = content_item.text
                                break
                    elif hasattr(response.output, "text"):
                        content = response.output.text
                # Try to extract usage information
                usage = None
@@ -371,14 +432,13 @@ class OpenAICompatibleProvider(ModelProvider):
                if is_retryable and attempt < max_retries - 1:
                    delay = retry_delays[attempt]
                    logging.warning(
-                        f"Retryable error for o3-pro responses endpoint, attempt {attempt + 1}/{max_retries}: {str(e)}. Retrying in {delay}s..."
+                        f"Retryable error for o3-pro responses endpoint, attempt {actual_attempts}/{max_retries}: {str(e)}. Retrying in {delay}s..."
                    )
                    time.sleep(delay)
                else:
                    break
        # If we get here, all retries failed
        actual_attempts = attempt + 1  # Convert from 0-based index to human-readable count
        error_msg = f"o3-pro responses endpoint error after {actual_attempts} attempt{'s' if actual_attempts > 1 else ''}: {str(last_exception)}"
        logging.error(error_msg)
        raise RuntimeError(error_msg) from last_exception
@@ -388,7 +448,7 @@ class OpenAICompatibleProvider(ModelProvider):
        prompt: str,
        model_name: str,
        system_prompt: Optional[str] = None,
-        temperature: float = 0.7,
+        temperature: float = 0.3,
        max_output_tokens: Optional[int] = None,
        images: Optional[list[str]] = None,
        **kwargs,
@@ -481,7 +541,7 @@ class OpenAICompatibleProvider(ModelProvider):
                completion_params[key] = value
        # Check if this is o3-pro and needs the responses endpoint
-        if resolved_model == "o3-pro-2025-06-10":
+        if resolved_model == "o3-pro":
            # This model requires the /v1/responses endpoint
            # If it fails, we should not fall back to chat/completions
            return self._generate_with_responses_endpoint(
@@ -497,8 +557,10 @@ class OpenAICompatibleProvider(ModelProvider):
        retry_delays = [1, 3, 5, 8]  # Progressive delays: 1s, 3s, 5s, 8s
        last_exception = None
        actual_attempts = 0
        for attempt in range(max_retries):
            actual_attempts = attempt + 1  # Convert from 0-based index to human-readable count
            try:
                # Generate completion
                response = self.client.chat.completions.create(**completion_params)
@@ -536,12 +598,11 @@ class OpenAICompatibleProvider(ModelProvider):
                # Log retry attempt
                logging.warning(
-                    f"{self.FRIENDLY_NAME} error for model {model_name}, attempt {attempt + 1}/{max_retries}: {str(e)}. Retrying in {delay}s..."
+                    f"{self.FRIENDLY_NAME} error for model {model_name}, attempt {actual_attempts}/{max_retries}: {str(e)}. Retrying in {delay}s..."
                )
                time.sleep(delay)
        # If we get here, all retries failed
        actual_attempts = attempt + 1  # Convert from 0-based index to human-readable count
        error_msg = f"{self.FRIENDLY_NAME} API error for model {model_name} after {actual_attempts} attempt{'s' if actual_attempts > 1 else ''}: {str(last_exception)}"
        logging.error(error_msg)
        raise RuntimeError(error_msg) from last_exception
@@ -576,11 +637,7 @@ class OpenAICompatibleProvider(ModelProvider):
            try:
                encoding = tiktoken.encoding_for_model(model_name)
            except KeyError:
-                # Try common encodings based on model patterns
+                encoding = tiktoken.get_encoding("cl100k_base")
                if "gpt-4" in model_name or "gpt-3.5" in model_name:
                    encoding = tiktoken.get_encoding("cl100k_base")
                else:
                    encoding = tiktoken.get_encoding("cl100k_base")  # Default
            return len(encoding.encode(text))
@@ -679,11 +736,13 @@ class OpenAICompatibleProvider(ModelProvider):
        """
        # Common vision-capable models - only include models that actually support images
        vision_models = {
            "gpt-5",
            "gpt-5-mini",
            "gpt-4o",
            "gpt-4o-mini",
            "gpt-4-turbo",
            "gpt-4-vision-preview",
-            "gpt-4.1-2025-04-14",  # GPT-4.1 supports vision
+            "gpt-4.1-2025-04-14",
            "o3",
            "o3-mini",
            "o3-pro",
--- a/providers/openai_provider.py
+++ b/providers/openai_provider.py
@@ -1,7 +1,10 @@
 """OpenAI model provider implementation."""
 import logging
-from typing import Optional
+from typing import TYPE_CHECKING, Optional
 if TYPE_CHECKING:
    from tools.models import ToolModelCategory
 from .base import (
    ModelCapabilities,
@@ -19,6 +22,60 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
    # Model configurations using ModelCapabilities objects
    SUPPORTED_MODELS = {
        "gpt-5": ModelCapabilities(
            provider=ProviderType.OPENAI,
            model_name="gpt-5",
            friendly_name="OpenAI (GPT-5)",
            context_window=400_000,  # 400K tokens
            max_output_tokens=128_000,  # 128K max output tokens
            supports_extended_thinking=True,  # Supports reasoning tokens
            supports_system_prompts=True,
            supports_streaming=True,
            supports_function_calling=True,
            supports_json_mode=True,
            supports_images=True,  # GPT-5 supports vision
            max_image_size_mb=20.0,  # 20MB per OpenAI docs
            supports_temperature=True,  # Regular models accept temperature parameter
            temperature_constraint=create_temperature_constraint("fixed"),
            description="GPT-5 (400K context, 128K output) - Advanced model with reasoning support",
            aliases=["gpt5", "gpt-5"],
        ),
        "gpt-5-mini": ModelCapabilities(
            provider=ProviderType.OPENAI,
            model_name="gpt-5-mini",
            friendly_name="OpenAI (GPT-5-mini)",
            context_window=400_000,  # 400K tokens
            max_output_tokens=128_000,  # 128K max output tokens
            supports_extended_thinking=True,  # Supports reasoning tokens
            supports_system_prompts=True,
            supports_streaming=True,
            supports_function_calling=True,
            supports_json_mode=True,
            supports_images=True,  # GPT-5-mini supports vision
            max_image_size_mb=20.0,  # 20MB per OpenAI docs
            supports_temperature=True,
            temperature_constraint=create_temperature_constraint("fixed"),
            description="GPT-5-mini (400K context, 128K output) - Efficient variant with reasoning support",
            aliases=["gpt5-mini", "gpt5mini", "mini"],
        ),
        "gpt-5-nano": ModelCapabilities(
            provider=ProviderType.OPENAI,
            model_name="gpt-5-nano",
            friendly_name="OpenAI (GPT-5 nano)",
            context_window=400_000,
            max_output_tokens=128_000,
            supports_extended_thinking=True,
            supports_system_prompts=True,
            supports_streaming=True,
            supports_function_calling=True,
            supports_json_mode=True,
            supports_images=True,
            max_image_size_mb=20.0,
            supports_temperature=True,
            temperature_constraint=create_temperature_constraint("fixed"),
            description="GPT-5 nano (400K context) - Fastest, cheapest version of GPT-5 for summarization and classification tasks",
            aliases=["gpt5nano", "gpt5-nano", "nano"],
        ),
        "o3": ModelCapabilities(
            provider=ProviderType.OPENAI,
            model_name="o3",
@@ -55,9 +112,9 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
            description="Fast O3 variant (200K context) - Balanced performance/speed, moderate complexity",
            aliases=["o3mini", "o3-mini"],
        ),
-        "o3-pro-2025-06-10": ModelCapabilities(
+        "o3-pro": ModelCapabilities(
            provider=ProviderType.OPENAI,
-            model_name="o3-pro-2025-06-10",
+            model_name="o3-pro",
            friendly_name="OpenAI (O3-Pro)",
            context_window=200_000,  # 200K tokens
            max_output_tokens=65536,  # 64K max output tokens
@@ -89,11 +146,11 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
            supports_temperature=False,  # O4 models don't accept temperature parameter
            temperature_constraint=create_temperature_constraint("fixed"),
            description="Latest reasoning model (200K context) - Optimized for shorter contexts, rapid reasoning",
-            aliases=["mini", "o4mini", "o4-mini"],
+            aliases=["o4mini", "o4-mini"],
        ),
-        "gpt-4.1-2025-04-14": ModelCapabilities(
+        "gpt-4.1": ModelCapabilities(
            provider=ProviderType.OPENAI,
-            model_name="gpt-4.1-2025-04-14",
+            model_name="gpt-4.1",
            friendly_name="OpenAI (GPT 4.1)",
            context_window=1_000_000,  # 1M tokens
            max_output_tokens=32_768,
@@ -107,7 +164,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
            supports_temperature=True,  # Regular models accept temperature parameter
            temperature_constraint=create_temperature_constraint("range"),
            description="GPT-4.1 (1M context) - Advanced reasoning model with large context window",
-            aliases=["gpt4.1"],
+            aliases=["gpt4.1", "gpt-4.1"],
        ),
    }
@@ -119,21 +176,41 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
    def get_capabilities(self, model_name: str) -> ModelCapabilities:
        """Get capabilities for a specific OpenAI model."""
-        # Resolve shorthand
+        # First check if it's a key in SUPPORTED_MODELS
        if model_name in self.SUPPORTED_MODELS:
            # Check if model is allowed by restrictions
            from utils.model_restrictions import get_restriction_service
            restriction_service = get_restriction_service()
            if not restriction_service.is_allowed(ProviderType.OPENAI, model_name, model_name):
                raise ValueError(f"OpenAI model '{model_name}' is not allowed by restriction policy.")
            return self.SUPPORTED_MODELS[model_name]
        # Try resolving as alias
        resolved_name = self._resolve_model_name(model_name)
-        if resolved_name not in self.SUPPORTED_MODELS:
+        # Check if resolved name is a key
-            raise ValueError(f"Unsupported OpenAI model: {model_name}")
+        if resolved_name in self.SUPPORTED_MODELS:
            # Check if model is allowed by restrictions
            from utils.model_restrictions import get_restriction_service
-        # Check if model is allowed by restrictions
+            restriction_service = get_restriction_service()
-        from utils.model_restrictions import get_restriction_service
+            if not restriction_service.is_allowed(ProviderType.OPENAI, resolved_name, model_name):
                raise ValueError(f"OpenAI model '{model_name}' is not allowed by restriction policy.")
            return self.SUPPORTED_MODELS[resolved_name]
-        restriction_service = get_restriction_service()
+        # Finally check if resolved name matches any API model name
-        if not restriction_service.is_allowed(ProviderType.OPENAI, resolved_name, model_name):
+        for key, capabilities in self.SUPPORTED_MODELS.items():
-            raise ValueError(f"OpenAI model '{model_name}' is not allowed by restriction policy.")
+            if resolved_name == capabilities.model_name:
                # Check if model is allowed by restrictions
                from utils.model_restrictions import get_restriction_service
-        # Return the ModelCapabilities object directly from SUPPORTED_MODELS
+                restriction_service = get_restriction_service()
-        return self.SUPPORTED_MODELS[resolved_name]
+                if not restriction_service.is_allowed(ProviderType.OPENAI, key, model_name):
                    raise ValueError(f"OpenAI model '{model_name}' is not allowed by restriction policy.")
                return capabilities
        raise ValueError(f"Unsupported OpenAI model: {model_name}")
    def get_provider_type(self) -> ProviderType:
        """Get the provider type."""
@@ -162,7 +239,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
        prompt: str,
        model_name: str,
        system_prompt: Optional[str] = None,
-        temperature: float = 0.7,
+        temperature: float = 0.3,
        max_output_tokens: Optional[int] = None,
        **kwargs,
    ) -> ModelResponse:
@@ -182,6 +259,47 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
    def supports_thinking_mode(self, model_name: str) -> bool:
        """Check if the model supports extended thinking mode."""
-        # Currently no OpenAI models support extended thinking
+        # GPT-5 models support reasoning tokens (extended thinking)
-        # This may change with future O3 models
+        resolved_name = self._resolve_model_name(model_name)
        if resolved_name in ["gpt-5", "gpt-5-mini"]:
            return True
        # O3 models don't support extended thinking yet
        return False
    def get_preferred_model(self, category: "ToolModelCategory", allowed_models: list[str]) -> Optional[str]:
        """Get OpenAI's preferred model for a given category from allowed models.
        Args:
            category: The tool category requiring a model
            allowed_models: Pre-filtered list of models allowed by restrictions
        Returns:
            Preferred model name or None
        """
        from tools.models import ToolModelCategory
        if not allowed_models:
            return None
        # Helper to find first available from preference list
        def find_first(preferences: list[str]) -> Optional[str]:
            """Return first available model from preference list."""
            for model in preferences:
                if model in allowed_models:
                    return model
            return None
        if category == ToolModelCategory.EXTENDED_REASONING:
            # Prefer models with extended thinking support
            preferred = find_first(["o3", "o3-pro", "gpt-5"])
            return preferred if preferred else allowed_models[0]
        elif category == ToolModelCategory.FAST_RESPONSE:
            # Prefer fast, cost-efficient models
            preferred = find_first(["gpt-5", "gpt-5-mini", "o4-mini", "o3-mini"])
            return preferred if preferred else allowed_models[0]
        else:  # BALANCED or default
            # Prefer balanced performance/cost models
            preferred = find_first(["gpt-5", "gpt-5-mini", "o4-mini", "o3-mini"])
            return preferred if preferred else allowed_models[0]
--- a/providers/openrouter.py
+++ b/providers/openrouter.py
@@ -158,7 +158,7 @@ class OpenRouterProvider(OpenAICompatibleProvider):
        prompt: str,
        model_name: str,
        system_prompt: Optional[str] = None,
-        temperature: float = 0.7,
+        temperature: float = 0.3,
        max_output_tokens: Optional[int] = None,
        **kwargs,
    ) -> ModelResponse:
--- a/providers/registry.py
+++ b/providers/registry.py
@@ -15,6 +15,17 @@ class ModelProviderRegistry:
    _instance = None
    # Provider priority order for model selection
    # Native APIs first, then custom endpoints, then catch-all providers
    PROVIDER_PRIORITY_ORDER = [
        ProviderType.GOOGLE,  # Direct Gemini access
        ProviderType.OPENAI,  # Direct OpenAI access
        ProviderType.XAI,  # Direct X.AI GROK access
        ProviderType.DIAL,  # DIAL unified API access
        ProviderType.CUSTOM,  # Local/self-hosted models
        ProviderType.OPENROUTER,  # Catch-all for cloud models
    ]
    def __new__(cls):
        """Singleton pattern for registry."""
        if cls._instance is None:
@@ -103,30 +114,19 @@ class ModelProviderRegistry:
        3. OPENROUTER - Catch-all for cloud models via unified API
        Args:
-            model_name: Name of the model (e.g., "gemini-2.5-flash", "o3-mini")
+            model_name: Name of the model (e.g., "gemini-2.5-flash", "gpt5")
        Returns:
            ModelProvider instance that supports this model
        """
        logging.debug(f"get_provider_for_model called with model_name='{model_name}'")
        # Define explicit provider priority order
        # Native APIs first, then custom endpoints, then catch-all providers
        PROVIDER_PRIORITY_ORDER = [
            ProviderType.GOOGLE,  # Direct Gemini access
            ProviderType.OPENAI,  # Direct OpenAI access
            ProviderType.XAI,  # Direct X.AI GROK access
            ProviderType.DIAL,  # DIAL unified API access
            ProviderType.CUSTOM,  # Local/self-hosted models
            ProviderType.OPENROUTER,  # Catch-all for cloud models
        ]
        # Check providers in priority order
        instance = cls()
        logging.debug(f"Registry instance: {instance}")
        logging.debug(f"Available providers in registry: {list(instance._providers.keys())}")
-        for provider_type in PROVIDER_PRIORITY_ORDER:
+        for provider_type in cls.PROVIDER_PRIORITY_ORDER:
            if provider_type in instance._providers:
                logging.debug(f"Found {provider_type} in registry")
                # Get or create provider instance
@@ -244,14 +244,49 @@ class ModelProviderRegistry:
        return os.getenv(env_var)
    @classmethod
    def _get_allowed_models_for_provider(cls, provider: ModelProvider, provider_type: ProviderType) -> list[str]:
        """Get a list of allowed canonical model names for a given provider.
        Args:
            provider: The provider instance to get models for
            provider_type: The provider type for restriction checking
        Returns:
            List of model names that are both supported and allowed
        """
        from utils.model_restrictions import get_restriction_service
        restriction_service = get_restriction_service()
        allowed_models = []
        # Get the provider's supported models
        try:
            # Use list_models to get all supported models (handles both regular and custom providers)
            supported_models = provider.list_models(respect_restrictions=False)
        except (NotImplementedError, AttributeError):
            # Fallback to SUPPORTED_MODELS if list_models not implemented
            try:
                supported_models = list(provider.SUPPORTED_MODELS.keys())
            except AttributeError:
                supported_models = []
        # Filter by restrictions
        for model_name in supported_models:
            if restriction_service.is_allowed(provider_type, model_name):
                allowed_models.append(model_name)
        return allowed_models
    @classmethod
    def get_preferred_fallback_model(cls, tool_category: Optional["ToolModelCategory"] = None) -> str:
-        """Get the preferred fallback model based on available API keys and tool category.
+        """Get the preferred fallback model based on provider priority and tool category.
-        This method checks which providers have valid API keys and returns
+        This method orchestrates model selection by:
-        a sensible default model for auto mode fallback situations.
+        1. Getting allowed models for each provider (respecting restrictions)
-
+        2. Asking providers for their preference from the allowed list
-        Takes into account model restrictions when selecting fallback models.
+        3. Falling back to first available model if no preference given
        Args:
            tool_category: Optional category to influence model selection
@@ -259,167 +294,42 @@ class ModelProviderRegistry:
        Returns:
            Model name string for fallback use
        """
        # Import here to avoid circular import
        from tools.models import ToolModelCategory
-        # Get available models respecting restrictions
+        effective_category = tool_category or ToolModelCategory.BALANCED
-        available_models = cls.get_available_models(respect_restrictions=True)
+        first_available_model = None
-        # Group by provider
+        # Ask each provider for their preference in priority order
-        openai_models = [m for m, p in available_models.items() if p == ProviderType.OPENAI]
+        for provider_type in cls.PROVIDER_PRIORITY_ORDER:
-        gemini_models = [m for m, p in available_models.items() if p == ProviderType.GOOGLE]
+            provider = cls.get_provider(provider_type)
-        xai_models = [m for m, p in available_models.items() if p == ProviderType.XAI]
+            if provider:
-        openrouter_models = [m for m, p in available_models.items() if p == ProviderType.OPENROUTER]
+                # 1. Registry filters the models first
-        custom_models = [m for m, p in available_models.items() if p == ProviderType.CUSTOM]
+                allowed_models = cls._get_allowed_models_for_provider(provider, provider_type)
-        openai_available = bool(openai_models)
+                if not allowed_models:
        gemini_available = bool(gemini_models)
        xai_available = bool(xai_models)
        openrouter_available = bool(openrouter_models)
        custom_available = bool(custom_models)
        if tool_category == ToolModelCategory.EXTENDED_REASONING:
            # Prefer thinking-capable models for deep reasoning tools
            if openai_available and "o3" in openai_models:
                return "o3"  # O3 for deep reasoning
            elif openai_available and openai_models:
                # Fall back to any available OpenAI model
                return openai_models[0]
            elif xai_available and "grok-3" in xai_models:
                return "grok-3"  # GROK-3 for deep reasoning
            elif xai_available and xai_models:
                # Fall back to any available XAI model
                return xai_models[0]
            elif gemini_available and any("pro" in m for m in gemini_models):
                # Find the pro model (handles full names)
                return next(m for m in gemini_models if "pro" in m)
            elif gemini_available and gemini_models:
                # Fall back to any available Gemini model
                return gemini_models[0]
            elif openrouter_available:
                # Try to find thinking-capable model from openrouter
                thinking_model = cls._find_extended_thinking_model()
                if thinking_model:
                    return thinking_model
                # Fallback to first available OpenRouter model
                return openrouter_models[0]
            elif custom_available:
                # Fallback to custom models when available
                return custom_models[0]
            else:
                # Fallback to pro if nothing found
                return "gemini-2.5-pro"
        elif tool_category == ToolModelCategory.FAST_RESPONSE:
            # Prefer fast, cost-efficient models
            if openai_available and "o4-mini" in openai_models:
                return "o4-mini"  # Latest, fast and efficient
            elif openai_available and "o3-mini" in openai_models:
                return "o3-mini"  # Second choice
            elif openai_available and openai_models:
                # Fall back to any available OpenAI model
                return openai_models[0]
            elif xai_available and "grok-3-fast" in xai_models:
                return "grok-3-fast"  # GROK-3 Fast for speed
            elif xai_available and xai_models:
                # Fall back to any available XAI model
                return xai_models[0]
            elif gemini_available and any("flash" in m for m in gemini_models):
                # Find the flash model (handles full names)
                # Prefer 2.5 over 2.0 for backward compatibility
                flash_models = [m for m in gemini_models if "flash" in m]
                # Sort to ensure 2.5 comes before 2.0
                flash_models_sorted = sorted(flash_models, reverse=True)
                return flash_models_sorted[0]
            elif gemini_available and gemini_models:
                # Fall back to any available Gemini model
                return gemini_models[0]
            elif openrouter_available:
                # Fallback to first available OpenRouter model
                return openrouter_models[0]
            elif custom_available:
                # Fallback to custom models when available
                return custom_models[0]
            else:
                # Default to flash
                return "gemini-2.5-flash"
        # BALANCED or no category specified - use existing balanced logic
        if openai_available and "o4-mini" in openai_models:
            return "o4-mini"  # Latest balanced performance/cost
        elif openai_available and "o3-mini" in openai_models:
            return "o3-mini"  # Second choice
        elif openai_available and openai_models:
            return openai_models[0]
        elif xai_available and "grok-3" in xai_models:
            return "grok-3"  # GROK-3 as balanced choice
        elif xai_available and xai_models:
            return xai_models[0]
        elif gemini_available and any("flash" in m for m in gemini_models):
            # Prefer 2.5 over 2.0 for backward compatibility
            flash_models = [m for m in gemini_models if "flash" in m]
            flash_models_sorted = sorted(flash_models, reverse=True)
            return flash_models_sorted[0]
        elif gemini_available and gemini_models:
            return gemini_models[0]
        elif openrouter_available:
            return openrouter_models[0]
        elif custom_available:
            # Fallback to custom models when available
            return custom_models[0]
        else:
            # No models available due to restrictions - check if any providers exist
            if not available_models:
                # This might happen if all models are restricted
                logging.warning("No models available due to restrictions")
            # Return a reasonable default for backward compatibility
            return "gemini-2.5-flash"
    @classmethod
    def _find_extended_thinking_model(cls) -> Optional[str]:
        """Find a model suitable for extended reasoning from custom/openrouter providers.
        Returns:
            Model name if found, None otherwise
        """
        # Check custom provider first
        custom_provider = cls.get_provider(ProviderType.CUSTOM)
        if custom_provider:
            # Check if it's a CustomModelProvider and has thinking models
            try:
                from providers.custom import CustomProvider
                if isinstance(custom_provider, CustomProvider) and hasattr(custom_provider, "model_registry"):
                    for model_name, config in custom_provider.model_registry.items():
                        if config.get("supports_extended_thinking", False):
                            return model_name
            except ImportError:
                pass
        # Then check OpenRouter for high-context/powerful models
        openrouter_provider = cls.get_provider(ProviderType.OPENROUTER)
        if openrouter_provider:
            # Prefer models known for deep reasoning
            preferred_models = [
                "anthropic/claude-sonnet-4",
                "anthropic/claude-opus-4",
                "google/gemini-2.5-pro",
                "google/gemini-pro-1.5",
                "meta-llama/llama-3.1-70b-instruct",
                "mistralai/mixtral-8x7b-instruct",
            ]
            for model in preferred_models:
                try:
                    if openrouter_provider.validate_model_name(model):
                        return model
                except Exception as e:
                    # Log the error for debugging purposes but continue searching
                    import logging
                    logging.warning(f"Model validation for '{model}' on OpenRouter failed: {e}")
                    continue
-        return None
+                # 2. Keep track of the first available model as fallback
                if not first_available_model:
                    first_available_model = sorted(allowed_models)[0]
                # 3. Ask provider to pick from allowed list
                preferred_model = provider.get_preferred_model(effective_category, allowed_models)
                if preferred_model:
                    logging.debug(
                        f"Provider {provider_type.value} selected '{preferred_model}' for category '{effective_category.value}'"
                    )
                    return preferred_model
        # If no provider returned a preference, use first available model
        if first_available_model:
            logging.debug(f"No provider preference, using first available: {first_available_model}")
            return first_available_model
        # Ultimate fallback if no providers have models
        logging.warning("No models available from any provider, using default fallback")
        return "gemini-2.5-flash"
    @classmethod
    def get_available_providers_with_keys(cls) -> list[ProviderType]:
@@ -441,6 +351,17 @@ class ModelProviderRegistry:
        instance = cls()
        instance._initialized_providers.clear()
    @classmethod
    def reset_for_testing(cls) -> None:
        """Reset the registry to a clean state for testing.
        This provides a safe, public API for tests to clean up registry state
        without directly manipulating private attributes.
        """
        cls._instance = None
        if hasattr(cls, "_providers"):
            cls._providers = {}
    @classmethod
    def unregister_provider(cls, provider_type: ProviderType) -> None:
        """Unregister a provider (mainly for testing)."""
--- a/providers/xai.py
+++ b/providers/xai.py
@@ -1,7 +1,10 @@
 """X.AI (GROK) model provider implementation."""
 import logging
-from typing import Optional
+from typing import TYPE_CHECKING, Optional
 if TYPE_CHECKING:
    from tools.models import ToolModelCategory
 from .base import (
    ModelCapabilities,
@@ -21,23 +24,23 @@ class XAIModelProvider(OpenAICompatibleProvider):
    # Model configurations using ModelCapabilities objects
    SUPPORTED_MODELS = {
-        "grok-4-0709": ModelCapabilities(
+        "grok-4": ModelCapabilities(
            provider=ProviderType.XAI,
-            model_name="grok-4-0709",
+            model_name="grok-4",
            friendly_name="X.AI (Grok 4)",
            context_window=256_000,  # 256K tokens
-            max_output_tokens=16_384,
+            max_output_tokens=256_000,  # 256K tokens max output
-            supports_extended_thinking=True,  # Supports reasoning mode
+            supports_extended_thinking=True,  # Grok-4 supports reasoning mode
            supports_system_prompts=True,
            supports_streaming=True,
-            supports_function_calling=True,
+            supports_function_calling=True,  # Function calling supported
-            supports_json_mode=True,  # Supports structured outputs
+            supports_json_mode=True,  # Structured outputs supported
-            supports_images=True,  # Supports vision/image analysis
+            supports_images=True,  # Multimodal capabilities
-            max_image_size_mb=20.0,  # Assuming standard limit
+            max_image_size_mb=20.0,  # Standard image size limit
            supports_temperature=True,
            temperature_constraint=create_temperature_constraint("range"),
-            description="GROK-4 (256K context) - Latest flagship model with reasoning, vision, and structured outputs",
+            description="GROK-4 (256K context) - Frontier multimodal reasoning model with advanced capabilities",
-            aliases=["grok-4", "grok-4-latest", "grok4", "grok"],
+            aliases=["grok", "grok4", "grok-4"],
        ),
        "grok-3": ModelCapabilities(
            provider=ProviderType.XAI,
@@ -128,7 +131,7 @@ class XAIModelProvider(OpenAICompatibleProvider):
        prompt: str,
        model_name: str,
        system_prompt: Optional[str] = None,
-        temperature: float = 0.7,
+        temperature: float = 0.3,
        max_output_tokens: Optional[int] = None,
        **kwargs,
    ) -> ModelResponse:
@@ -148,10 +151,52 @@ class XAIModelProvider(OpenAICompatibleProvider):
    def supports_thinking_mode(self, model_name: str) -> bool:
        """Check if the model supports extended thinking mode."""
-        # Check capabilities to determine thinking mode support
+        resolved_name = self._resolve_model_name(model_name)
-        try:
+        capabilities = self.SUPPORTED_MODELS.get(resolved_name)
-            capabilities = self.get_capabilities(model_name)
+        if capabilities:
            return capabilities.supports_extended_thinking
-        except ValueError:
+        return False
-            # If the model is not supported, it doesn't support thinking mode.
+
-            return False
+    def get_preferred_model(self, category: "ToolModelCategory", allowed_models: list[str]) -> Optional[str]:
        """Get XAI's preferred model for a given category from allowed models.
        Args:
            category: The tool category requiring a model
            allowed_models: Pre-filtered list of models allowed by restrictions
        Returns:
            Preferred model name or None
        """
        from tools.models import ToolModelCategory
        if not allowed_models:
            return None
        if category == ToolModelCategory.EXTENDED_REASONING:
            # Prefer GROK-4 for advanced reasoning with thinking mode
            if "grok-4" in allowed_models:
                return "grok-4"
            elif "grok-3" in allowed_models:
                return "grok-3"
            # Fall back to any available model
            return allowed_models[0]
        elif category == ToolModelCategory.FAST_RESPONSE:
            # Prefer GROK-3-Fast for speed, then GROK-4
            if "grok-3-fast" in allowed_models:
                return "grok-3-fast"
            elif "grok-4" in allowed_models:
                return "grok-4"
            # Fall back to any available model
            return allowed_models[0]
        else:  # BALANCED or default
            # Prefer GROK-4 for balanced use (best overall capabilities)
            if "grok-4" in allowed_models:
                return "grok-4"
            elif "grok-3" in allowed_models:
                return "grok-3"
            elif "grok-3-fast" in allowed_models:
                return "grok-3-fast"
            # Fall back to any available model
            return allowed_models[0]
--- a/run-server.sh
+++ b/run-server.sh
@@ -125,7 +125,7 @@ get_claude_config_path() {
                win_appdata=$(wslvar APPDATA 2>/dev/null)
            fi
-            if [[ -n "$win_appdata" ]]; then
+            if [[ -n "${win_appdata:-}" ]]; then
                echo "$(wslpath "$win_appdata")/Claude/claude_desktop_config.json"
            else
                print_warning "Could not determine Windows user path automatically. Please ensure APPDATA is set correctly or provide the full path manually."
--- a/server.py
+++ b/server.py
@@ -409,9 +409,9 @@ def configure_providers():
    openai_key = os.getenv("OPENAI_API_KEY")
    logger.debug(f"OpenAI key check: key={'[PRESENT]' if openai_key else '[MISSING]'}")
    if openai_key and openai_key != "your_openai_api_key_here":
-        valid_providers.append("OpenAI (o3)")
+        valid_providers.append("OpenAI")
        has_native_apis = True
-        logger.info("OpenAI API key found - o3 model available")
+        logger.info("OpenAI API key found")
    else:
        if not openai_key:
            logger.debug("OpenAI API key not found in environment")
@@ -493,7 +493,7 @@ def configure_providers():
        raise ValueError(
            "At least one API configuration is required. Please set either:\n"
            "- GEMINI_API_KEY for Gemini models\n"
-            "- OPENAI_API_KEY for OpenAI o3 model\n"
+            "- OPENAI_API_KEY for OpenAI models\n"
            "- XAI_API_KEY for X.AI GROK models\n"
            "- DIAL_API_KEY for DIAL models\n"
            "- OPENROUTER_API_KEY for OpenRouter (multiple models)\n"
@@ -742,7 +742,9 @@ async def handle_call_tool(name: str, arguments: dict[str, Any]) -> list[TextCon
        # Parse model:option format if present
        model_name, model_option = parse_model_option(model_name)
        if model_option:
-            logger.debug(f"Parsed model format - model: '{model_name}', option: '{model_option}'")
+            logger.info(f"Parsed model format - model: '{model_name}', option: '{model_option}'")
        else:
            logger.info(f"Parsed model format - model: '{model_name}'")
        # Consensus tool handles its own model configuration validation
        # No special handling needed at server level
@@ -1190,16 +1192,16 @@ async def handle_get_prompt(name: str, arguments: dict[str, Any] = None) -> GetP
    """
    Get prompt details and generate the actual prompt text.
-    This handler is called when a user invokes a prompt (e.g., /zen:thinkdeeper or /zen:chat:o3).
+    This handler is called when a user invokes a prompt (e.g., /zen:thinkdeeper or /zen:chat:gpt5).
    It generates the appropriate text that Claude will then use to call the
    underlying tool.
-    Supports structured prompt names like "chat:o3" where:
+    Supports structured prompt names like "chat:gpt5" where:
    - "chat" is the tool name
-    - "o3" is the model to use
+    - "gpt5" is the model to use
    Args:
-        name: The name of the prompt to execute (can include model like "chat:o3")
+        name: The name of the prompt to execute (can include model like "chat:gpt5")
        arguments: Optional arguments for the prompt (e.g., model, thinking_mode)
    Returns:
@@ -1268,7 +1270,12 @@ async def handle_get_prompt(name: str, arguments: dict[str, Any] = None) -> GetP
    # Generate tool call instruction
    if name.lower() == "continue":
        # "/zen:continue" case
-        tool_instruction = f"Continue the previous conversation using the {tool_name} tool"
+        tool_instruction = (
            f"Continue the previous conversation using the {tool_name} tool. "
            "CRITICAL: You MUST provide the continuation_id from the previous response to maintain conversation context. "
            "Additionally, you should reuse the same model that was used in the previous exchange for consistency, unless "
            "the user specifically asks for a different model name to be used."
        )
    else:
        # Simple prompt case
        tool_instruction = prompt_text
--- a/simulator_tests/conversation_base_test.py
+++ b/simulator_tests/conversation_base_test.py
@@ -24,8 +24,12 @@ EXAMPLE:
            # Step 2: Continue with codereview tool - memory is preserved!
            result2, _ = self.call_mcp_tool_direct("codereview", {
-                "files": ["/path/to/file.py"],
+                "step": "Focus on security issues in this code",
-                "prompt": "Focus on security issues",
+                "step_number": 1,
                "total_steps": 1,
                "next_step_required": False,
                "findings": "Starting security-focused code review",
                "relevant_files": ["/path/to/file.py"],
                "continuation_id": continuation_id
            })
 """
--- a/simulator_tests/test_content_validation.py
+++ b/simulator_tests/test_content_validation.py
@@ -104,8 +104,12 @@ DATABASE_CONFIG = {
            response3, _ = self.call_mcp_tool(
                "codereview",
                {
-                    "files": [validation_file],
+                    "step": "Review this configuration file for quality and potential issues",
-                    "prompt": "Review this configuration file",
+                    "step_number": 1,
                    "total_steps": 1,
                    "next_step_required": False,
                    "findings": "Starting code review of configuration file",
                    "relevant_files": [validation_file],
                    "model": "flash",
                },
            )
--- a/simulator_tests/test_o3_model_selection.py
+++ b/simulator_tests/test_o3_model_selection.py
@@ -108,8 +108,12 @@ def multiply(x, y):
            response3, _ = self.call_mcp_tool(
                "codereview",
                {
-                    "files": [test_file],
+                    "step": "Review this simple code for quality and potential issues",
-                    "prompt": "Quick review of this simple code",
+                    "step_number": 1,
                    "total_steps": 1,
                    "next_step_required": False,
                    "findings": "Starting code review analysis",
                    "relevant_files": [test_file],
                    "model": "o3",
                    "temperature": 1.0,  # O3 only supports default temperature of 1.0
                },
@@ -145,12 +149,15 @@ def multiply(x, y):
                line for line in logs.split("\n") if "Sending request to openai API for codereview" in line
            ]
-            # Validation criteria - we expect 3 OpenAI calls (2 chat + 1 codereview)
+            # Validation criteria - check for OpenAI usage evidence (more flexible than exact counts)
-            openai_api_called = len(openai_api_logs) >= 3  # Should see 3 OpenAI API calls
+            openai_api_called = len(openai_api_logs) >= 1  # Should see at least 1 OpenAI API call
-            openai_model_usage = len(openai_model_logs) >= 3  # Should see 3 model usage logs
+            openai_model_usage = len(openai_model_logs) >= 1  # Should see at least 1 model usage log
-            openai_responses_received = len(openai_response_logs) >= 3  # Should see 3 responses
+            openai_responses_received = len(openai_response_logs) >= 1  # Should see at least 1 response
-            chat_calls_to_openai = len(chat_openai_logs) >= 2  # Should see 2 chat calls (o3 + o3-mini)
+            some_chat_calls_to_openai = len(chat_openai_logs) >= 1  # Should see at least 1 chat call
-            codereview_calls_to_openai = len(codereview_openai_logs) >= 1  # Should see 1 codereview call (o3)
+            some_workflow_calls_to_openai = (
                len(codereview_openai_logs) >= 1
                or len([line for line in logs.split("\n") if "openai" in line and "codereview" in line]) > 0
            )  # Should see evidence of workflow tool usage
            self.logger.info(f"   OpenAI API call logs: {len(openai_api_logs)}")
            self.logger.info(f"   OpenAI model usage logs: {len(openai_model_logs)}")
@@ -174,8 +181,11 @@ def multiply(x, y):
                ("OpenAI API calls made", openai_api_called),
                ("OpenAI model usage logged", openai_model_usage),
                ("OpenAI responses received", openai_responses_received),
-                ("Chat tool used OpenAI", chat_calls_to_openai),
+                ("Chat tool used OpenAI", some_chat_calls_to_openai),
-                ("Codereview tool used OpenAI", codereview_calls_to_openai),
+                (
                    "Workflow tool attempted",
                    some_workflow_calls_to_openai or response3 is not None,
                ),  # More flexible check
            ]
            passed_criteria = sum(1 for _, passed in success_criteria if passed)
@@ -185,7 +195,7 @@ def multiply(x, y):
                status = "✅" if passed else "❌"
                self.logger.info(f"    {status} {criterion}")
-            if passed_criteria >= 3:  # At least 3 out of 4 criteria
+            if passed_criteria >= 3:  # At least 3 out of 5 criteria
                self.logger.info("  ✅ O3 model selection validation passed")
                return True
            else:
@@ -254,8 +264,12 @@ def multiply(x, y):
            response3, _ = self.call_mcp_tool(
                "codereview",
                {
-                    "files": [test_file],
+                    "step": "Review this simple code for quality and potential issues",
-                    "prompt": "Quick review of this simple code",
+                    "step_number": 1,
                    "total_steps": 1,
                    "next_step_required": False,
                    "findings": "Starting code review analysis",
                    "relevant_files": [test_file],
                    "model": "o3",
                    "temperature": 1.0,
                },
--- a/simulator_tests/test_openrouter_fallback.py
+++ b/simulator_tests/test_openrouter_fallback.py
@@ -82,8 +82,12 @@ class OpenRouterFallbackTest(BaseSimulatorTest):
            response2, _ = self.call_mcp_tool(
                "codereview",
                {
-                    "files": [test_file],
+                    "step": "Quick review of this sum function for quality and potential issues",
-                    "prompt": "Quick review of this sum function",
+                    "step_number": 1,
                    "total_steps": 1,
                    "next_step_required": False,
                    "findings": "Starting code review of sum function",
                    "relevant_files": [test_file],
                    "model": "flash",
                    "temperature": 0.1,
                },
@@ -101,8 +105,12 @@ class OpenRouterFallbackTest(BaseSimulatorTest):
            response3, _ = self.call_mcp_tool(
                "analyze",
                {
-                    "files": [self.test_files["python"]],
+                    "step": "Analyze the structure of this Python code",
-                    "prompt": "Analyze the structure of this Python code",
+                    "step_number": 1,
                    "total_steps": 1,
                    "next_step_required": False,
                    "findings": "Starting code structure analysis",
                    "relevant_files": [self.test_files["python"]],
                    "model": "pro",
                    "temperature": 0.1,
                },
@@ -120,7 +128,11 @@ class OpenRouterFallbackTest(BaseSimulatorTest):
            response4, _ = self.call_mcp_tool(
                "debug",
                {
-                    "prompt": "Why might a function return None instead of a value?",
+                    "step": "Why might a function return None instead of a value?",
                    "step_number": 1,
                    "total_steps": 1,
                    "next_step_required": False,
                    "findings": "Starting debug investigation of None return values",
                    "model": "flash",  # Should map to OpenRouter
                    "temperature": 0.1,
                },
--- a/simulator_tests/test_xai_models.py
+++ b/simulator_tests/test_xai_models.py
@@ -43,8 +43,8 @@ class XAIModelsTest(BaseSimulatorTest):
            # Setup test files for later use
            self.setup_test_files()
-            # Test 1: 'grok' alias (should map to grok-3)
+            # Test 1: 'grok' alias (should map to grok-4)
-            self.logger.info("  1: Testing 'grok' alias (should map to grok-3)")
+            self.logger.info("  1: Testing 'grok' alias (should map to grok-4)")
            response1, continuation_id = self.call_mcp_tool(
                "chat",
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -15,13 +15,6 @@ parent_dir = Path(__file__).resolve().parent.parent
 if str(parent_dir) not in sys.path:
    sys.path.insert(0, str(parent_dir))
 # Set dummy API keys for tests if not already set or if empty
 if not os.environ.get("GEMINI_API_KEY"):
    os.environ["GEMINI_API_KEY"] = "dummy-key-for-tests"
 if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = "dummy-key-for-tests"
 if not os.environ.get("XAI_API_KEY"):
    os.environ["XAI_API_KEY"] = "dummy-key-for-tests"
 # Set default model to a specific value for tests to avoid auto mode
 # This prevents all tests from failing due to missing model parameter
@@ -77,11 +70,27 @@ def project_path(tmp_path):
    return test_dir
 def _set_dummy_keys_if_missing():
    """Set dummy API keys only when they are completely absent."""
    for var in ("GEMINI_API_KEY", "OPENAI_API_KEY", "XAI_API_KEY"):
        if not os.environ.get(var):
            os.environ[var] = "dummy-key-for-tests"
 # Pytest configuration
 def pytest_configure(config):
    """Configure pytest with custom markers"""
    config.addinivalue_line("markers", "asyncio: mark test as async")
    config.addinivalue_line("markers", "no_mock_provider: disable automatic provider mocking")
    # Assume we need dummy keys until we learn otherwise
    config._needs_dummy_keys = True
 def pytest_collection_modifyitems(session, config, items):
    """Hook that runs after test collection to check for no_mock_provider markers."""
    # Always set dummy keys if real keys are missing
    # This ensures tests work in CI even with no_mock_provider marker
    _set_dummy_keys_if_missing()
@pytest.fixture(autouse=True)
--- a/tests/http_transport_recorder.py
+++ b/tests/http_transport_recorder.py
@@ -0,0 +1,376 @@
 #!/usr/bin/env python3
 """
 HTTP Transport Recorder for O3-Pro Testing
 Custom httpx transport solution that replaces respx for recording/replaying
 HTTP interactions. Provides full control over the recording process without
 respx limitations.
 Key Features:
 - RecordingTransport: Wraps default transport, captures real HTTP calls
 - ReplayTransport: Serves saved responses from cassettes
 - TransportFactory: Auto-selects record vs replay mode
 - JSON cassette format with data sanitization
 """
 import base64
 import hashlib
 import json
 import logging
 from pathlib import Path
 from typing import Any, Optional
 import httpx
 from .pii_sanitizer import PIISanitizer
 logger = logging.getLogger(__name__)
 class RecordingTransport(httpx.HTTPTransport):
    """Transport that wraps default httpx transport and records all interactions."""
    def __init__(self, cassette_path: str, capture_content: bool = True, sanitize: bool = True):
        super().__init__()
        self.cassette_path = Path(cassette_path)
        self.recorded_interactions = []
        self.capture_content = capture_content
        self.sanitizer = PIISanitizer() if sanitize else None
    def handle_request(self, request: httpx.Request) -> httpx.Response:
        """Handle request by recording interaction and delegating to real transport."""
        logger.debug(f"RecordingTransport: Making request to {request.method} {request.url}")
        # Record request BEFORE making the call
        request_data = self._serialize_request(request)
        # Make real HTTP call using parent transport
        response = super().handle_request(request)
        logger.debug(f"RecordingTransport: Got response {response.status_code}")
        # Post-response content capture (proper approach)
        if self.capture_content:
            try:
                # Consume the response stream to capture content
                # Note: httpx automatically handles gzip decompression
                content_bytes = response.read()
                response.close()  # Close the original stream
                logger.debug(f"RecordingTransport: Captured {len(content_bytes)} bytes")
                # Serialize response with captured content
                response_data = self._serialize_response_with_content(response, content_bytes)
                # Create a new response with the same metadata but buffered content
                # If the original response was gzipped, we need to re-compress
                response_content = content_bytes
                if response.headers.get("content-encoding") == "gzip":
                    import gzip
                    response_content = gzip.compress(content_bytes)
                    logger.debug(f"Re-compressed content: {len(content_bytes)} → {len(response_content)} bytes")
                new_response = httpx.Response(
                    status_code=response.status_code,
                    headers=response.headers,  # Keep original headers intact
                    content=response_content,
                    request=request,
                    extensions=response.extensions,
                    history=response.history,
                )
                # Record the interaction
                self._record_interaction(request_data, response_data)
                return new_response
            except Exception:
                logger.warning("Content capture failed, falling back to stub", exc_info=True)
                response_data = self._serialize_response(response)
                self._record_interaction(request_data, response_data)
                return response
        else:
            # Legacy mode: record with stub content
            response_data = self._serialize_response(response)
            self._record_interaction(request_data, response_data)
            return response
    def _record_interaction(self, request_data: dict[str, Any], response_data: dict[str, Any]):
        """Helper method to record interaction and save cassette."""
        interaction = {"request": request_data, "response": response_data}
        self.recorded_interactions.append(interaction)
        self._save_cassette()
        logger.debug(f"Saved cassette to {self.cassette_path}")
    def _serialize_request(self, request: httpx.Request) -> dict[str, Any]:
        """Serialize httpx.Request to JSON-compatible format."""
        # For requests, we can safely read the content since it's already been prepared
        # httpx.Request.content is safe to access multiple times
        content = request.content
        # Convert bytes to string for JSON serialization
        if isinstance(content, bytes):
            try:
                content_str = content.decode("utf-8")
            except UnicodeDecodeError:
                # Handle binary content (shouldn't happen for o3-pro API)
                content_str = content.hex()
        else:
            content_str = str(content) if content else ""
        request_data = {
            "method": request.method,
            "url": str(request.url),
            "path": request.url.path,
            "headers": dict(request.headers),
            "content": self._sanitize_request_content(content_str),
        }
        # Apply PII sanitization if enabled
        if self.sanitizer:
            request_data = self.sanitizer.sanitize_request(request_data)
        return request_data
    def _serialize_response(self, response: httpx.Response) -> dict[str, Any]:
        """Serialize httpx.Response to JSON-compatible format (legacy method without content)."""
        # Legacy method for backward compatibility when content capture is disabled
        return {
            "status_code": response.status_code,
            "headers": dict(response.headers),
            "content": {"note": "Response content not recorded to avoid httpx.ResponseNotRead exception"},
            "reason_phrase": response.reason_phrase,
        }
    def _serialize_response_with_content(self, response: httpx.Response, content_bytes: bytes) -> dict[str, Any]:
        """Serialize httpx.Response with captured content."""
        try:
            # Debug: check what we got
            # Ensure we have bytes for base64 encoding
            if not isinstance(content_bytes, bytes):
                logger.warning(f"Content is not bytes, converting from {type(content_bytes)}")
                if isinstance(content_bytes, str):
                    content_bytes = content_bytes.encode("utf-8")
                else:
                    content_bytes = str(content_bytes).encode("utf-8")
            # Encode content as base64 for JSON storage
            content_b64 = base64.b64encode(content_bytes).decode("utf-8")
            logger.debug(f"Base64 encoded {len(content_bytes)} bytes → {len(content_b64)} chars")
            response_data = {
                "status_code": response.status_code,
                "headers": dict(response.headers),
                "content": {"data": content_b64, "encoding": "base64", "size": len(content_bytes)},
                "reason_phrase": response.reason_phrase,
            }
            # Apply PII sanitization if enabled
            if self.sanitizer:
                response_data = self.sanitizer.sanitize_response(response_data)
            return response_data
        except Exception as e:
            logger.exception("Error in _serialize_response_with_content")
            # Fall back to minimal info
            return {
                "status_code": response.status_code,
                "headers": dict(response.headers),
                "content": {"error": f"Failed to serialize content: {e}"},
                "reason_phrase": response.reason_phrase,
            }
    def _sanitize_request_content(self, content: str) -> Any:
        """Sanitize request content to remove sensitive data."""
        try:
            if content.strip():
                data = json.loads(content)
                # Don't sanitize request content for now - it's user input
                return data
        except json.JSONDecodeError:
            pass
        return content
    def _save_cassette(self):
        """Save recorded interactions to cassette file."""
        # Ensure directory exists
        self.cassette_path.parent.mkdir(parents=True, exist_ok=True)
        # Save cassette
        cassette_data = {"interactions": self.recorded_interactions}
        self.cassette_path.write_text(json.dumps(cassette_data, indent=2, sort_keys=True))
 class ReplayTransport(httpx.MockTransport):
    """Transport that replays saved HTTP interactions from cassettes."""
    def __init__(self, cassette_path: str):
        self.cassette_path = Path(cassette_path)
        self.interactions = self._load_cassette()
        super().__init__(self._handle_request)
    def _load_cassette(self) -> list:
        """Load interactions from cassette file."""
        if not self.cassette_path.exists():
            raise FileNotFoundError(f"Cassette file not found: {self.cassette_path}")
        try:
            cassette_data = json.loads(self.cassette_path.read_text())
            return cassette_data.get("interactions", [])
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid cassette file format: {e}")
    def _handle_request(self, request: httpx.Request) -> httpx.Response:
        """Handle request by finding matching interaction and returning saved response."""
        logger.debug(f"ReplayTransport: Looking for {request.method} {request.url}")
        # Debug: show what we're trying to match
        request_signature = self._get_request_signature(request)
        logger.debug(f"Request signature: {request_signature}")
        # Find matching interaction
        interaction = self._find_matching_interaction(request)
        if not interaction:
            logger.warning("No matching interaction found in cassette")
            raise ValueError(f"No matching interaction found for {request.method} {request.url}")
        logger.debug("Found matching interaction in cassette")
        # Build response from saved data
        response_data = interaction["response"]
        # Convert content back to appropriate format
        content = response_data.get("content", {})
        if isinstance(content, dict):
            # Check if this is base64-encoded content
            if content.get("encoding") == "base64" and "data" in content:
                # Decode base64 content
                try:
                    content_bytes = base64.b64decode(content["data"])
                    logger.debug(f"Decoded {len(content_bytes)} bytes from base64")
                except Exception as e:
                    logger.warning(f"Failed to decode base64 content: {e}")
                    content_bytes = json.dumps(content).encode("utf-8")
            else:
                # Legacy format or stub content
                content_bytes = json.dumps(content).encode("utf-8")
        else:
            content_bytes = str(content).encode("utf-8")
        # Check if response expects gzipped content
        headers = response_data.get("headers", {})
        if headers.get("content-encoding") == "gzip":
            # Re-compress the content for httpx
            import gzip
            content_bytes = gzip.compress(content_bytes)
            logger.debug(f"Re-compressed for replay: {len(content_bytes)} bytes")
        logger.debug(f"Returning cassette response ({len(content_bytes)} bytes)")
        # Create httpx.Response
        return httpx.Response(
            status_code=response_data["status_code"],
            headers=response_data.get("headers", {}),
            content=content_bytes,
            request=request,
        )
    def _find_matching_interaction(self, request: httpx.Request) -> Optional[dict[str, Any]]:
        """Find interaction that matches the request."""
        request_signature = self._get_request_signature(request)
        for interaction in self.interactions:
            saved_signature = self._get_saved_request_signature(interaction["request"])
            if request_signature == saved_signature:
                return interaction
        return None
    def _get_request_signature(self, request: httpx.Request) -> str:
        """Generate signature for request matching."""
        # Use method, path, and content hash for matching
        content = request.content
        if hasattr(content, "read"):
            content = content.read()
        if isinstance(content, bytes):
            content_str = content.decode("utf-8", errors="ignore")
        else:
            content_str = str(content) if content else ""
        # Parse JSON and re-serialize with sorted keys for consistent hashing
        try:
            if content_str.strip():
                content_dict = json.loads(content_str)
                content_str = json.dumps(content_dict, sort_keys=True)
        except json.JSONDecodeError:
            # Not JSON, use as-is
            pass
        # Create hash of content for stable matching
        content_hash = hashlib.md5(content_str.encode()).hexdigest()
        return f"{request.method}:{request.url.path}:{content_hash}"
    def _get_saved_request_signature(self, saved_request: dict[str, Any]) -> str:
        """Generate signature for saved request."""
        method = saved_request["method"]
        path = saved_request["path"]
        # Hash the saved content
        content = saved_request.get("content", "")
        if isinstance(content, dict):
            content_str = json.dumps(content, sort_keys=True)
        else:
            content_str = str(content)
        content_hash = hashlib.md5(content_str.encode()).hexdigest()
        return f"{method}:{path}:{content_hash}"
 class TransportFactory:
    """Factory for creating appropriate transport based on cassette availability."""
    @staticmethod
    def create_transport(cassette_path: str) -> httpx.HTTPTransport:
        """Create transport based on cassette existence and API key availability."""
        cassette_file = Path(cassette_path)
        # Check if we should record or replay
        if cassette_file.exists():
            # Cassette exists - use replay mode
            return ReplayTransport(cassette_path)
        else:
            # No cassette - use recording mode
            # Note: We'll check for API key in the test itself
            return RecordingTransport(cassette_path)
    @staticmethod
    def should_record(cassette_path: str, api_key: Optional[str] = None) -> bool:
        """Determine if we should record based on cassette and API key availability."""
        cassette_file = Path(cassette_path)
        # Record if cassette doesn't exist AND we have API key
        return not cassette_file.exists() and bool(api_key)
    @staticmethod
    def should_replay(cassette_path: str) -> bool:
        """Determine if we should replay based on cassette availability."""
        cassette_file = Path(cassette_path)
        return cassette_file.exists()
 # Example usage:
 #
 # # In test setup:
 # cassette_path = "tests/cassettes/o3_pro_basic_math.json"
 # transport = TransportFactory.create_transport(cassette_path)
 #
 # # Inject into OpenAI client:
 # provider._test_transport = transport
 #
 # # The provider's client property will detect _test_transport and use it
--- a/tests/openai_cassettes/o3_pro_basic_math.json
+++ b/tests/openai_cassettes/o3_pro_basic_math.json
--- a/tests/pii_sanitizer.py
+++ b/tests/pii_sanitizer.py
@@ -0,0 +1,290 @@
 #!/usr/bin/env python3
 """
 PII (Personally Identifiable Information) Sanitizer for HTTP recordings.
 This module provides comprehensive sanitization of sensitive data in HTTP
 request/response recordings to prevent accidental exposure of API keys,
 tokens, personal information, and other sensitive data.
 """
 import logging
 import re
 from copy import deepcopy
 from dataclasses import dataclass
 from re import Pattern
 from typing import Any, Optional
 logger = logging.getLogger(__name__)
@dataclass
 class PIIPattern:
    """Defines a pattern for detecting and sanitizing PII."""
    name: str
    pattern: Pattern[str]
    replacement: str
    description: str
    @classmethod
    def create(cls, name: str, pattern: str, replacement: str, description: str) -> "PIIPattern":
        """Create a PIIPattern with compiled regex."""
        return cls(name=name, pattern=re.compile(pattern), replacement=replacement, description=description)
 class PIISanitizer:
    """Sanitizes PII from various data structures while preserving format."""
    def __init__(self, patterns: Optional[list[PIIPattern]] = None):
        """Initialize with optional custom patterns."""
        self.patterns: list[PIIPattern] = patterns or []
        self.sanitize_enabled = True
        # Add default patterns if none provided
        if not patterns:
            self._add_default_patterns()
    def _add_default_patterns(self):
        """Add comprehensive default PII patterns."""
        default_patterns = [
            # API Keys - Core patterns (Bearer tokens handled in sanitize_headers)
            PIIPattern.create(
                name="openai_api_key_proj",
                pattern=r"sk-proj-[A-Za-z0-9\-_]{48,}",
                replacement="sk-proj-SANITIZED",
                description="OpenAI project API keys",
            ),
            PIIPattern.create(
                name="openai_api_key",
                pattern=r"sk-[A-Za-z0-9]{48,}",
                replacement="sk-SANITIZED",
                description="OpenAI API keys",
            ),
            PIIPattern.create(
                name="anthropic_api_key",
                pattern=r"sk-ant-[A-Za-z0-9\-_]{48,}",
                replacement="sk-ant-SANITIZED",
                description="Anthropic API keys",
            ),
            PIIPattern.create(
                name="google_api_key",
                pattern=r"AIza[A-Za-z0-9\-_]{35,}",
                replacement="AIza-SANITIZED",
                description="Google API keys",
            ),
            PIIPattern.create(
                name="github_tokens",
                pattern=r"gh[psr]_[A-Za-z0-9]{36}",
                replacement="gh_SANITIZED",
                description="GitHub tokens (all types)",
            ),
            # JWT tokens
            PIIPattern.create(
                name="jwt_token",
                pattern=r"eyJ[A-Za-z0-9\-_]+\.eyJ[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_]+",
                replacement="eyJ-SANITIZED",
                description="JSON Web Tokens",
            ),
            # Personal Information
            PIIPattern.create(
                name="email_address",
                pattern=r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}",
                replacement="user@example.com",
                description="Email addresses",
            ),
            PIIPattern.create(
                name="ipv4_address",
                pattern=r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b",
                replacement="0.0.0.0",
                description="IPv4 addresses",
            ),
            PIIPattern.create(
                name="ssn",
                pattern=r"\b\d{3}-\d{2}-\d{4}\b",
                replacement="XXX-XX-XXXX",
                description="Social Security Numbers",
            ),
            PIIPattern.create(
                name="credit_card",
                pattern=r"\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b",
                replacement="XXXX-XXXX-XXXX-XXXX",
                description="Credit card numbers",
            ),
            PIIPattern.create(
                name="phone_number",
                pattern=r"(?:\+\d{1,3}[\s\-]?)?\(?\d{3}\)?[\s\-]?\d{3}[\s\-]?\d{4}\b(?![\d\.\,\]\}])",
                replacement="(XXX) XXX-XXXX",
                description="Phone numbers (all formats)",
            ),
            # AWS
            PIIPattern.create(
                name="aws_access_key",
                pattern=r"AKIA[0-9A-Z]{16}",
                replacement="AKIA-SANITIZED",
                description="AWS access keys",
            ),
            # Other common patterns
            PIIPattern.create(
                name="slack_token",
                pattern=r"xox[baprs]-[0-9]{10,13}-[0-9]{10,13}-[a-zA-Z0-9]{24,34}",
                replacement="xox-SANITIZED",
                description="Slack tokens",
            ),
            PIIPattern.create(
                name="stripe_key",
                pattern=r"(?:sk|pk)_(?:test|live)_[0-9a-zA-Z]{24,99}",
                replacement="sk_SANITIZED",
                description="Stripe API keys",
            ),
        ]
        self.patterns.extend(default_patterns)
    def add_pattern(self, pattern: PIIPattern):
        """Add a custom PII pattern."""
        self.patterns.append(pattern)
        logger.info(f"Added PII pattern: {pattern.name}")
    def sanitize_string(self, text: str) -> str:
        """Apply all patterns to sanitize a string."""
        if not self.sanitize_enabled or not isinstance(text, str):
            return text
        sanitized = text
        for pattern in self.patterns:
            if pattern.pattern.search(sanitized):
                sanitized = pattern.pattern.sub(pattern.replacement, sanitized)
                logger.debug(f"Applied {pattern.name} sanitization")
        return sanitized
    def sanitize_headers(self, headers: dict[str, str]) -> dict[str, str]:
        """Special handling for HTTP headers."""
        if not self.sanitize_enabled:
            return headers
        sanitized_headers = {}
        for key, value in headers.items():
            # Special case for Authorization headers to preserve auth type
            if key.lower() == "authorization" and " " in value:
                auth_type = value.split(" ", 1)[0]
                if auth_type in ("Bearer", "Basic"):
                    sanitized_headers[key] = f"{auth_type} SANITIZED"
                else:
                    sanitized_headers[key] = self.sanitize_string(value)
            else:
                # Apply standard sanitization to all other headers
                sanitized_headers[key] = self.sanitize_string(value)
        return sanitized_headers
    def sanitize_value(self, value: Any) -> Any:
        """Recursively sanitize any value (string, dict, list, etc)."""
        if not self.sanitize_enabled:
            return value
        if isinstance(value, str):
            return self.sanitize_string(value)
        elif isinstance(value, dict):
            return {k: self.sanitize_value(v) for k, v in value.items()}
        elif isinstance(value, list):
            return [self.sanitize_value(item) for item in value]
        elif isinstance(value, tuple):
            return tuple(self.sanitize_value(item) for item in value)
        else:
            # For other types (int, float, bool, None), return as-is
            return value
    def sanitize_url(self, url: str) -> str:
        """Sanitize sensitive data from URLs (query params, etc)."""
        if not self.sanitize_enabled:
            return url
        # First apply general string sanitization
        url = self.sanitize_string(url)
        # Parse and sanitize query parameters
        if "?" in url:
            base, query = url.split("?", 1)
            params = []
            for param in query.split("&"):
                if "=" in param:
                    key, value = param.split("=", 1)
                    # Sanitize common sensitive parameter names
                    sensitive_params = {"key", "token", "api_key", "secret", "password"}
                    if key.lower() in sensitive_params:
                        params.append(f"{key}=SANITIZED")
                    else:
                        # Still sanitize the value for PII
                        params.append(f"{key}={self.sanitize_string(value)}")
                else:
                    params.append(param)
            return f"{base}?{'&'.join(params)}"
        return url
    def sanitize_request(self, request_data: dict[str, Any]) -> dict[str, Any]:
        """Sanitize a complete request dictionary."""
        sanitized = deepcopy(request_data)
        # Sanitize headers
        if "headers" in sanitized:
            sanitized["headers"] = self.sanitize_headers(sanitized["headers"])
        # Sanitize URL
        if "url" in sanitized:
            sanitized["url"] = self.sanitize_url(sanitized["url"])
        # Sanitize content
        if "content" in sanitized:
            sanitized["content"] = self.sanitize_value(sanitized["content"])
        return sanitized
    def sanitize_response(self, response_data: dict[str, Any]) -> dict[str, Any]:
        """Sanitize a complete response dictionary."""
        sanitized = deepcopy(response_data)
        # Sanitize headers
        if "headers" in sanitized:
            sanitized["headers"] = self.sanitize_headers(sanitized["headers"])
        # Sanitize content
        if "content" in sanitized:
            # Handle base64 encoded content specially
            if isinstance(sanitized["content"], dict) and sanitized["content"].get("encoding") == "base64":
                if "data" in sanitized["content"]:
                    import base64
                    try:
                        # Decode, sanitize, and re-encode the actual response body
                        decoded_bytes = base64.b64decode(sanitized["content"]["data"])
                        # Attempt to decode as UTF-8 for sanitization. If it fails, it's likely binary.
                        try:
                            decoded_str = decoded_bytes.decode("utf-8")
                            sanitized_str = self.sanitize_string(decoded_str)
                            sanitized["content"]["data"] = base64.b64encode(sanitized_str.encode("utf-8")).decode(
                                "utf-8"
                            )
                        except UnicodeDecodeError:
                            # Content is not text, leave as is.
                            pass
                    except (base64.binascii.Error, TypeError):
                        # Handle cases where data is not valid base64
                        pass
                    # Sanitize other metadata fields
                    for key, value in sanitized["content"].items():
                        if key != "data":
                            sanitized["content"][key] = self.sanitize_value(value)
            else:
                sanitized["content"] = self.sanitize_value(sanitized["content"])
        return sanitized
 # Global instance for convenience
 default_sanitizer = PIISanitizer()
--- a/tests/sanitize_cassettes.py
+++ b/tests/sanitize_cassettes.py
@@ -0,0 +1,110 @@
 #!/usr/bin/env python3
 """
 Script to sanitize existing cassettes by applying PII sanitization.
 This script will:
 1. Load existing cassettes
 2. Apply PII sanitization to all interactions
 3. Create backups of originals
 4. Save sanitized versions
 """
 import json
 import shutil
 import sys
 from datetime import datetime
 from pathlib import Path
 # Add tests directory to path to import our modules
 sys.path.insert(0, str(Path(__file__).parent))
 from pii_sanitizer import PIISanitizer
 def sanitize_cassette(cassette_path: Path, backup: bool = True) -> bool:
    """Sanitize a single cassette file."""
    print(f"\n🔍 Processing: {cassette_path}")
    if not cassette_path.exists():
        print(f"❌ File not found: {cassette_path}")
        return False
    try:
        # Load cassette
        with open(cassette_path) as f:
            cassette_data = json.load(f)
        # Create backup if requested
        if backup:
            backup_path = cassette_path.with_suffix(f'.backup-{datetime.now().strftime("%Y%m%d-%H%M%S")}.json')
            shutil.copy2(cassette_path, backup_path)
            print(f"📦 Backup created: {backup_path}")
        # Initialize sanitizer
        sanitizer = PIISanitizer()
        # Sanitize interactions
        if "interactions" in cassette_data:
            sanitized_interactions = []
            for interaction in cassette_data["interactions"]:
                sanitized_interaction = {}
                # Sanitize request
                if "request" in interaction:
                    sanitized_interaction["request"] = sanitizer.sanitize_request(interaction["request"])
                # Sanitize response
                if "response" in interaction:
                    sanitized_interaction["response"] = sanitizer.sanitize_response(interaction["response"])
                sanitized_interactions.append(sanitized_interaction)
            cassette_data["interactions"] = sanitized_interactions
        # Save sanitized cassette
        with open(cassette_path, "w") as f:
            json.dump(cassette_data, f, indent=2, sort_keys=True)
        print(f"✅ Sanitized: {cassette_path}")
        return True
    except Exception as e:
        print(f"❌ Error processing {cassette_path}: {e}")
        import traceback
        traceback.print_exc()
        return False
 def main():
    """Sanitize all cassettes in the openai_cassettes directory."""
    cassettes_dir = Path(__file__).parent / "openai_cassettes"
    if not cassettes_dir.exists():
        print(f"❌ Directory not found: {cassettes_dir}")
        sys.exit(1)
    # Find all JSON cassettes
    cassette_files = list(cassettes_dir.glob("*.json"))
    if not cassette_files:
        print(f"❌ No cassette files found in {cassettes_dir}")
        sys.exit(1)
    print(f"🎬 Found {len(cassette_files)} cassette(s) to sanitize")
    # Process each cassette
    success_count = 0
    for cassette_path in cassette_files:
        if sanitize_cassette(cassette_path):
            success_count += 1
    print(f"\n✨ Sanitization complete: {success_count}/{len(cassette_files)} cassettes processed successfully")
    if success_count < len(cassette_files):
        sys.exit(1)
 if __name__ == "__main__":
    main()
--- a/tests/test_alias_target_restrictions.py
+++ b/tests/test_alias_target_restrictions.py
@@ -48,7 +48,8 @@ class TestAliasTargetRestrictions:
        """Test that restriction policy allows alias when target model is allowed.
        This is the correct user-friendly behavior - if you allow 'o4-mini',
-        you should be able to use its alias 'mini' as well.
+        you should be able to use its aliases 'o4mini' and 'o4-mini'.
        Note: 'mini' is now an alias for 'gpt-5-mini', not 'o4-mini'.
        """
        # Clear cached restriction service
        import utils.model_restrictions
@@ -57,15 +58,16 @@ class TestAliasTargetRestrictions:
        provider = OpenAIModelProvider(api_key="test-key")
-        # Both target and alias should be allowed
+        # Both target and its actual aliases should be allowed
        assert provider.validate_model_name("o4-mini")
-        assert provider.validate_model_name("mini")
+        assert provider.validate_model_name("o4mini")
    @patch.dict(os.environ, {"OPENAI_ALLOWED_MODELS": "mini"})  # Allow alias only
    def test_restriction_policy_allows_only_alias_when_alias_specified(self):
        """Test that restriction policy allows only the alias when just alias is specified.
-        If you restrict to 'mini', only the alias should work, not the direct target.
+        If you restrict to 'mini' (which is an alias for gpt-5-mini),
        only the alias should work, not other models.
        This is the correct restrictive behavior.
        """
        # Clear cached restriction service
@@ -77,7 +79,9 @@ class TestAliasTargetRestrictions:
        # Only the alias should be allowed
        assert provider.validate_model_name("mini")
-        # Direct target should NOT be allowed
+        # Direct target for this alias should NOT be allowed (mini -> gpt-5-mini)
        assert not provider.validate_model_name("gpt-5-mini")
        # Other models should NOT be allowed
        assert not provider.validate_model_name("o4-mini")
    @patch.dict(os.environ, {"GOOGLE_ALLOWED_MODELS": "gemini-2.5-flash"})  # Allow target
@@ -127,12 +131,15 @@ class TestAliasTargetRestrictions:
                # The warning should include both aliases and targets in known models
                warning_message = str(warning_calls[0])
-                assert "mini" in warning_message  # alias should be in known models
+                assert "o4mini" in warning_message or "o4-mini" in warning_message  # aliases should be in known models
                assert "o4-mini" in warning_message  # target should be in known models
-    @patch.dict(os.environ, {"OPENAI_ALLOWED_MODELS": "mini,o4-mini"})  # Allow both alias and target
+    @patch.dict(os.environ, {"OPENAI_ALLOWED_MODELS": "mini,gpt-5-mini,o4-mini,o4mini"})  # Allow different models
    def test_both_alias_and_target_allowed_when_both_specified(self):
-        """Test that both alias and target work when both are explicitly allowed."""
+        """Test that both alias and target work when both are explicitly allowed.
        mini -> gpt-5-mini
        o4mini -> o4-mini
        """
        # Clear cached restriction service
        import utils.model_restrictions
@@ -140,9 +147,11 @@ class TestAliasTargetRestrictions:
        provider = OpenAIModelProvider(api_key="test-key")
-        # Both should be allowed
+        # All should be allowed since we explicitly allowed them
-        assert provider.validate_model_name("mini")
+        assert provider.validate_model_name("mini")  # alias for gpt-5-mini
-        assert provider.validate_model_name("o4-mini")
+        assert provider.validate_model_name("gpt-5-mini")  # target
        assert provider.validate_model_name("o4-mini")  # target
        assert provider.validate_model_name("o4mini")  # alias for o4-mini
    def test_alias_target_policy_regression_prevention(self):
        """Regression test to ensure aliases and targets are both validated properly.
--- a/tests/test_auto_mode_comprehensive.py
+++ b/tests/test_auto_mode_comprehensive.py
@@ -95,8 +95,8 @@ class TestAutoModeComprehensive:
                },
                {
                    "EXTENDED_REASONING": "o3",  # O3 for deep reasoning
-                    "FAST_RESPONSE": "o4-mini",  # O4-mini for speed
+                    "FAST_RESPONSE": "gpt-5",  # Prefer gpt-5 for speed
-                    "BALANCED": "o4-mini",  # O4-mini as balanced
+                    "BALANCED": "gpt-5",  # Prefer gpt-5 for balanced
                },
            ),
            # Only X.AI API available
@@ -108,12 +108,12 @@ class TestAutoModeComprehensive:
                    "OPENROUTER_API_KEY": None,
                },
                {
-                    "EXTENDED_REASONING": "grok-3",  # GROK-3 for reasoning
+                    "EXTENDED_REASONING": "grok-4",  # GROK-4 for reasoning (now preferred)
                    "FAST_RESPONSE": "grok-3-fast",  # GROK-3-fast for speed
-                    "BALANCED": "grok-3",  # GROK-3 as balanced
+                    "BALANCED": "grok-4",  # GROK-4 as balanced (now preferred)
                },
            ),
-            # Both Gemini and OpenAI available - should prefer based on tool category
+            # Both Gemini and OpenAI available - Google comes first in priority
            (
                {
                    "GEMINI_API_KEY": "real-key",
@@ -122,12 +122,12 @@ class TestAutoModeComprehensive:
                    "OPENROUTER_API_KEY": None,
                },
                {
-                    "EXTENDED_REASONING": "o3",  # Prefer O3 for deep reasoning
+                    "EXTENDED_REASONING": "gemini-2.5-pro",  # Gemini comes first in priority
-                    "FAST_RESPONSE": "o4-mini",  # Prefer O4-mini for speed
+                    "FAST_RESPONSE": "gemini-2.5-flash",  # Prefer flash for speed
-                    "BALANCED": "o4-mini",  # Prefer OpenAI for balanced
+                    "BALANCED": "gemini-2.5-flash",  # Prefer flash for balanced
                },
            ),
-            # All native APIs available - should prefer based on tool category
+            # All native APIs available - Google still comes first
            (
                {
                    "GEMINI_API_KEY": "real-key",
@@ -136,9 +136,9 @@ class TestAutoModeComprehensive:
                    "OPENROUTER_API_KEY": None,
                },
                {
-                    "EXTENDED_REASONING": "o3",  # Prefer O3 for deep reasoning
+                    "EXTENDED_REASONING": "gemini-2.5-pro",  # Gemini comes first in priority
-                    "FAST_RESPONSE": "o4-mini",  # Prefer O4-mini for speed
+                    "FAST_RESPONSE": "gemini-2.5-flash",  # Prefer flash for speed
-                    "BALANCED": "o4-mini",  # Prefer OpenAI for balanced
+                    "BALANCED": "gemini-2.5-flash",  # Prefer flash for balanced
                },
            ),
        ],
--- a/tests/test_auto_mode_provider_selection.py
+++ b/tests/test_auto_mode_provider_selection.py
@@ -97,10 +97,10 @@ class TestAutoModeProviderSelection:
            fast_response = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.FAST_RESPONSE)
            balanced = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.BALANCED)
-            # Should select appropriate OpenAI models
+            # Should select appropriate OpenAI models based on new preference order
-            assert extended_reasoning in ["o3", "o3-mini", "o4-mini"]  # Any available OpenAI model for reasoning
+            assert extended_reasoning == "o3"  # O3 for extended reasoning
-            assert fast_response in ["o4-mini", "o3-mini"]  # Prefer faster models
+            assert fast_response == "gpt-5"  # gpt-5 comes first in fast response preference
-            assert balanced in ["o4-mini", "o3-mini"]  # Balanced selection
+            assert balanced == "gpt-5"  # gpt-5 for balanced
        finally:
            # Restore original environment
@@ -138,11 +138,11 @@ class TestAutoModeProviderSelection:
            )
            fast_response = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.FAST_RESPONSE)
-            # Should prefer OpenAI for reasoning (based on fallback logic)
+            # Should prefer Gemini now (based on new provider priority: Gemini before OpenAI)
-            assert extended_reasoning == "o3"  # Should prefer O3 for extended reasoning
+            assert extended_reasoning == "gemini-2.5-pro"  # Gemini has higher priority now
-            # Should prefer OpenAI for fast response
+            # Should prefer Gemini for fast response
-            assert fast_response == "o4-mini"  # Should prefer O4-mini for fast response
+            assert fast_response == "gemini-2.5-flash"  # Gemini has higher priority now
        finally:
            # Restore original environment
@@ -318,10 +318,9 @@ class TestAutoModeProviderSelection:
            test_cases = [
                ("flash", ProviderType.GOOGLE, "gemini-2.5-flash"),
                ("pro", ProviderType.GOOGLE, "gemini-2.5-pro"),
-                ("mini", ProviderType.OPENAI, "o4-mini"),
+                ("mini", ProviderType.OPENAI, "gpt-5-mini"),  # "mini" now resolves to gpt-5-mini
                ("o3mini", ProviderType.OPENAI, "o3-mini"),
-                ("grok", ProviderType.XAI, "grok-4-0709"),
+                ("grok", ProviderType.XAI, "grok-4"),
                ("grok3", ProviderType.XAI, "grok-3"),
                ("grokfast", ProviderType.XAI, "grok-3-fast"),
            ]
--- a/tests/test_buggy_behavior_prevention.py
+++ b/tests/test_buggy_behavior_prevention.py
@@ -132,8 +132,11 @@ class TestBuggyBehaviorPrevention:
            assert not provider.validate_model_name("o3-pro")  # Not in allowed list
            assert not provider.validate_model_name("o3")  # Not in allowed list
-            # This should be ALLOWED because it resolves to o4-mini which is in the allowed list
+            # "mini" now resolves to gpt-5-mini, not o4-mini, so it should be blocked
-            assert provider.validate_model_name("mini")  # Resolves to o4-mini, which IS allowed
+            assert not provider.validate_model_name("mini")  # Resolves to gpt-5-mini, which is NOT allowed
            # But o4mini (the actual alias for o4-mini) should work
            assert provider.validate_model_name("o4mini")  # Resolves to o4-mini, which IS allowed
            # Verify our list_all_known_models includes the restricted models
            all_known = provider.list_all_known_models()
--- a/tests/test_challenge.py
+++ b/tests/test_challenge.py
@@ -93,7 +93,7 @@ class TestChallengeTool:
        response_data = json.loads(result[0].text)
        # Check response structure
-        assert response_data["status"] == "challenge_created"
+        assert response_data["status"] == "challenge_accepted"
        assert response_data["original_statement"] == "All software bugs are caused by syntax errors"
        assert "challenge_prompt" in response_data
        assert "instructions" in response_data
--- a/tests/test_dial_provider.py
+++ b/tests/test_dial_provider.py
@@ -113,7 +113,7 @@ class TestDIALProvider:
        # Test temperature constraint
        assert capabilities.temperature_constraint.min_temp == 0.0
        assert capabilities.temperature_constraint.max_temp == 2.0
-        assert capabilities.temperature_constraint.default_temp == 0.7
+        assert capabilities.temperature_constraint.default_temp == 0.3
    @patch.dict(os.environ, {"DIAL_ALLOWED_MODELS": ""}, clear=False)
    @patch("utils.model_restrictions._restriction_service", None)
--- a/tests/test_intelligent_fallback.py
+++ b/tests/test_intelligent_fallback.py
@@ -37,14 +37,14 @@ class TestIntelligentFallback:
    @patch.dict(os.environ, {"OPENAI_API_KEY": "sk-test-key", "GEMINI_API_KEY": ""}, clear=False)
    def test_prefers_openai_o3_mini_when_available(self):
-        """Test that o4-mini is preferred when OpenAI API key is available"""
+        """Test that gpt-5 is preferred when OpenAI API key is available (based on new preference order)"""
        # Register only OpenAI provider for this test
        from providers.openai_provider import OpenAIModelProvider
        ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
        fallback_model = ModelProviderRegistry.get_preferred_fallback_model()
-        assert fallback_model == "o4-mini"
+        assert fallback_model == "gpt-5"  # Based on new preference order: gpt-5 before o4-mini
    @patch.dict(os.environ, {"OPENAI_API_KEY": "", "GEMINI_API_KEY": "test-gemini-key"}, clear=False)
    def test_prefers_gemini_flash_when_openai_unavailable(self):
@@ -68,7 +68,7 @@ class TestIntelligentFallback:
        ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
        fallback_model = ModelProviderRegistry.get_preferred_fallback_model()
-        assert fallback_model == "o4-mini"  # OpenAI has priority
+        assert fallback_model == "gemini-2.5-flash"  # Gemini has priority now (based on new PROVIDER_PRIORITY_ORDER)
    @patch.dict(os.environ, {"OPENAI_API_KEY": "", "GEMINI_API_KEY": ""}, clear=False)
    def test_fallback_when_no_keys_available(self):
@@ -147,8 +147,8 @@ class TestIntelligentFallback:
                history, tokens = build_conversation_history(context, model_context=None)
-                # Verify that ModelContext was called with o4-mini (the intelligent fallback)
+                # Verify that ModelContext was called with gpt-5 (the intelligent fallback based on new preference order)
-                mock_context_class.assert_called_once_with("o4-mini")
+                mock_context_class.assert_called_once_with("gpt-5")
    def test_auto_mode_with_gemini_only(self):
        """Test auto mode behavior when only Gemini API key is available"""
--- a/tests/test_model_restrictions.py
+++ b/tests/test_model_restrictions.py
@@ -635,6 +635,13 @@ class TestAutoModeWithRestrictions:
        mock_openai.list_models = openai_list_models
        mock_openai.list_all_known_models.return_value = ["o3", "o3-mini", "o4-mini"]
        # Add get_preferred_model method to mock to match new implementation
        def get_preferred_model(category, allowed_models):
            # Simple preference logic for testing - just return first allowed model
            return allowed_models[0] if allowed_models else None
        mock_openai.get_preferred_model = get_preferred_model
        def get_provider_side_effect(provider_type):
            if provider_type == ProviderType.OPENAI:
                return mock_openai
@@ -656,9 +663,13 @@ class TestAutoModeWithRestrictions:
            model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.FAST_RESPONSE)
            assert model == "o4-mini"
-    @patch.dict(os.environ, {"OPENAI_ALLOWED_MODELS": "mini", "GEMINI_API_KEY": "", "OPENAI_API_KEY": "test-key"})
+    def test_fallback_with_shorthand_restrictions(self, monkeypatch):
    def test_fallback_with_shorthand_restrictions(self):
        """Test fallback model selection with shorthand restrictions."""
        # Use monkeypatch to set environment variables with automatic cleanup
        monkeypatch.setenv("OPENAI_ALLOWED_MODELS", "mini")
        monkeypatch.setenv("GEMINI_API_KEY", "")
        monkeypatch.setenv("OPENAI_API_KEY", "test-key")
        # Clear caches and reset registry
        import utils.model_restrictions
        from providers.registry import ModelProviderRegistry
@@ -685,8 +696,9 @@ class TestAutoModeWithRestrictions:
            model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.FAST_RESPONSE)
            # The fallback will depend on how get_available_models handles aliases
-            # For now, we accept either behavior and document it
+            # When "mini" is allowed, it's returned as the allowed model
-            assert model in ["o4-mini", "gemini-2.5-flash"]
+            # "mini" is now an alias for gpt-5-mini, but the list shows "mini" itself
            assert model in ["mini", "gpt-5-mini", "o4-mini", "gemini-2.5-flash"]
        finally:
            # Restore original registry state
            registry = ModelProviderRegistry()
--- a/tests/test_o3_pro_output_text_fix.py
+++ b/tests/test_o3_pro_output_text_fix.py
@@ -0,0 +1,124 @@
 """
 Tests for o3-pro output_text parsing fix using HTTP transport recording.
 This test validates the fix that uses `response.output_text` convenience field
 instead of manually parsing `response.output.content[].text`.
 Uses HTTP transport recorder to record real o3-pro API responses at the HTTP level while allowing
 the OpenAI SDK to create real response objects that we can test.
 RECORDING: To record new responses, delete the cassette file and run with real API keys.
 """
 import logging
 import os
 from pathlib import Path
 from unittest.mock import patch
 import pytest
 from dotenv import load_dotenv
 from providers import ModelProviderRegistry
 from tests.transport_helpers import inject_transport
 from tools.chat import ChatTool
 logger = logging.getLogger(__name__)
 # Load environment variables from .env file
 load_dotenv()
 # Use absolute path for cassette directory
 cassette_dir = Path(__file__).parent / "openai_cassettes"
 cassette_dir.mkdir(exist_ok=True)
@pytest.mark.asyncio
 class TestO3ProOutputTextFix:
    """Test o3-pro response parsing fix using respx for HTTP recording/replay."""
    def setup_method(self):
        """Set up the test by ensuring clean registry state."""
        # Use the new public API for registry cleanup
        ModelProviderRegistry.reset_for_testing()
        # Provider registration is now handled by inject_transport helper
        # Clear restriction service to ensure it re-reads environment
        # This is necessary because previous tests may have set restrictions
        # that are cached in the singleton
        import utils.model_restrictions
        utils.model_restrictions._restriction_service = None
    def teardown_method(self):
        """Clean up after test to ensure no state pollution."""
        # Use the new public API for registry cleanup
        ModelProviderRegistry.reset_for_testing()
    @pytest.mark.no_mock_provider  # Disable provider mocking for this test
    @patch.dict(os.environ, {"OPENAI_ALLOWED_MODELS": "o3-pro", "LOCALE": ""})
    async def test_o3_pro_uses_output_text_field(self, monkeypatch):
        """Test that o3-pro parsing uses the output_text convenience field via ChatTool."""
        cassette_path = cassette_dir / "o3_pro_basic_math.json"
        # Check if we need to record or replay
        if not cassette_path.exists():
            # Recording mode - check for real API key
            real_api_key = os.getenv("OPENAI_API_KEY", "").strip()
            if not real_api_key or real_api_key.startswith("dummy"):
                pytest.fail(
                    f"Cassette file not found at {cassette_path}. "
                    "To record: Set OPENAI_API_KEY environment variable to a valid key and run this test. "
                    "Note: Recording will make a real API call to OpenAI."
                )
            # Real API key is available, we'll record the cassette
            logger.debug("🎬 Recording mode: Using real API key to record cassette")
        else:
            # Replay mode - use dummy key
            monkeypatch.setenv("OPENAI_API_KEY", "dummy-key-for-replay")
            logger.debug("📼 Replay mode: Using recorded cassette")
        # Simplified transport injection - just one line!
        inject_transport(monkeypatch, cassette_path)
        # Execute ChatTool test with custom transport
        result = await self._execute_chat_tool_test()
        # Verify the response works correctly
        self._verify_chat_tool_response(result)
        # Verify cassette exists
        assert cassette_path.exists()
    async def _execute_chat_tool_test(self):
        """Execute the ChatTool with o3-pro and return the result."""
        chat_tool = ChatTool()
        arguments = {"prompt": "What is 2 + 2?", "model": "o3-pro", "temperature": 1.0}
        return await chat_tool.execute(arguments)
    def _verify_chat_tool_response(self, result):
        """Verify the ChatTool response contains expected data."""
        # Basic response validation
        assert result is not None
        assert isinstance(result, list)
        assert len(result) > 0
        assert result[0].type == "text"
        # Parse JSON response
        import json
        response_data = json.loads(result[0].text)
        # Debug log the response
        logger.debug(f"Response data: {json.dumps(response_data, indent=2)}")
        # Verify response structure - no cargo culting
        if response_data["status"] == "error":
            pytest.fail(f"Chat tool returned error: {response_data.get('error', 'Unknown error')}")
        assert response_data["status"] in ["success", "continuation_available"]
        assert "4" in response_data["content"]
        # Verify o3-pro was actually used
        metadata = response_data["metadata"]
        assert metadata["model_used"] == "o3-pro"
        assert metadata["provider_used"] == "openai"
--- a/tests/test_o3_temperature_fix_simple.py
+++ b/tests/test_o3_temperature_fix_simple.py
@@ -230,7 +230,7 @@ class TestO3TemperatureParameterFixSimple:
        assert temp_constraint.validate(0.5) is False
        # Test regular model constraints - use gpt-4.1 which is supported
-        gpt41_capabilities = provider.get_capabilities("gpt-4.1-2025-04-14")
+        gpt41_capabilities = provider.get_capabilities("gpt-4.1")
        assert gpt41_capabilities.temperature_constraint is not None
        # Regular models should allow a range
--- a/tests/test_openai_provider.py
+++ b/tests/test_openai_provider.py
@@ -48,12 +48,17 @@ class TestOpenAIProvider:
        assert provider.validate_model_name("o3-pro") is True
        assert provider.validate_model_name("o4-mini") is True
        assert provider.validate_model_name("o4-mini") is True
        assert provider.validate_model_name("gpt-5") is True
        assert provider.validate_model_name("gpt-5-mini") is True
        # Test valid aliases
        assert provider.validate_model_name("mini") is True
        assert provider.validate_model_name("o3mini") is True
        assert provider.validate_model_name("o4mini") is True
        assert provider.validate_model_name("o4mini") is True
        assert provider.validate_model_name("gpt5") is True
        assert provider.validate_model_name("gpt5-mini") is True
        assert provider.validate_model_name("gpt5mini") is True
        # Test invalid model
        assert provider.validate_model_name("invalid-model") is False
@@ -65,17 +70,22 @@ class TestOpenAIProvider:
        provider = OpenAIModelProvider("test-key")
        # Test shorthand resolution
-        assert provider._resolve_model_name("mini") == "o4-mini"
+        assert provider._resolve_model_name("mini") == "gpt-5-mini"  # "mini" now resolves to gpt-5-mini
        assert provider._resolve_model_name("o3mini") == "o3-mini"
        assert provider._resolve_model_name("o4mini") == "o4-mini"
        assert provider._resolve_model_name("o4mini") == "o4-mini"
        assert provider._resolve_model_name("gpt5") == "gpt-5"
        assert provider._resolve_model_name("gpt5-mini") == "gpt-5-mini"
        assert provider._resolve_model_name("gpt5mini") == "gpt-5-mini"
        # Test full name passthrough
        assert provider._resolve_model_name("o3") == "o3"
        assert provider._resolve_model_name("o3-mini") == "o3-mini"
-        assert provider._resolve_model_name("o3-pro") == "o3-pro-2025-06-10"
+        assert provider._resolve_model_name("o3-pro") == "o3-pro"
        assert provider._resolve_model_name("o4-mini") == "o4-mini"
        assert provider._resolve_model_name("o4-mini") == "o4-mini"
        assert provider._resolve_model_name("gpt-5") == "gpt-5"
        assert provider._resolve_model_name("gpt-5-mini") == "gpt-5-mini"
    def test_get_capabilities_o3(self):
        """Test getting model capabilities for O3."""
@@ -99,11 +109,43 @@ class TestOpenAIProvider:
        provider = OpenAIModelProvider("test-key")
        capabilities = provider.get_capabilities("mini")
-        assert capabilities.model_name == "o4-mini"  # Capabilities should show resolved model name
+        assert capabilities.model_name == "gpt-5-mini"  # "mini" now resolves to gpt-5-mini
-        assert capabilities.friendly_name == "OpenAI (O4-mini)"
+        assert capabilities.friendly_name == "OpenAI (GPT-5-mini)"
-        assert capabilities.context_window == 200_000
+        assert capabilities.context_window == 400_000
        assert capabilities.provider == ProviderType.OPENAI
    def test_get_capabilities_gpt5(self):
        """Test getting model capabilities for GPT-5."""
        provider = OpenAIModelProvider("test-key")
        capabilities = provider.get_capabilities("gpt-5")
        assert capabilities.model_name == "gpt-5"
        assert capabilities.friendly_name == "OpenAI (GPT-5)"
        assert capabilities.context_window == 400_000
        assert capabilities.max_output_tokens == 128_000
        assert capabilities.provider == ProviderType.OPENAI
        assert capabilities.supports_extended_thinking is True
        assert capabilities.supports_system_prompts is True
        assert capabilities.supports_streaming is True
        assert capabilities.supports_function_calling is True
        assert capabilities.supports_temperature is True
    def test_get_capabilities_gpt5_mini(self):
        """Test getting model capabilities for GPT-5-mini."""
        provider = OpenAIModelProvider("test-key")
        capabilities = provider.get_capabilities("gpt-5-mini")
        assert capabilities.model_name == "gpt-5-mini"
        assert capabilities.friendly_name == "OpenAI (GPT-5-mini)"
        assert capabilities.context_window == 400_000
        assert capabilities.max_output_tokens == 128_000
        assert capabilities.provider == ProviderType.OPENAI
        assert capabilities.supports_extended_thinking is True
        assert capabilities.supports_system_prompts is True
        assert capabilities.supports_streaming is True
        assert capabilities.supports_function_calling is True
        assert capabilities.supports_temperature is True
    @patch("providers.openai_compatible.OpenAI")
    def test_generate_content_resolves_alias_before_api_call(self, mock_openai_class):
        """Test that generate_content resolves aliases before making API calls.
@@ -132,21 +174,19 @@ class TestOpenAIProvider:
        provider = OpenAIModelProvider("test-key")
-        # Call generate_content with alias 'gpt4.1' (resolves to gpt-4.1-2025-04-14, supports temperature)
+        # Call generate_content with alias 'gpt4.1' (resolves to gpt-4.1, supports temperature)
        result = provider.generate_content(
            prompt="Test prompt",
            model_name="gpt4.1",
-            temperature=1.0,  # This should be resolved to "gpt-4.1-2025-04-14"
+            temperature=1.0,  # This should be resolved to "gpt-4.1"
        )
        # Verify the API was called with the RESOLVED model name
        mock_client.chat.completions.create.assert_called_once()
        call_kwargs = mock_client.chat.completions.create.call_args[1]
-        # CRITICAL ASSERTION: The API should receive "gpt-4.1-2025-04-14", not "gpt4.1"
+        # CRITICAL ASSERTION: The API should receive "gpt-4.1", not "gpt4.1"
-        assert (
+        assert call_kwargs["model"] == "gpt-4.1", f"Expected 'gpt-4.1' but API received '{call_kwargs['model']}'"
            call_kwargs["model"] == "gpt-4.1-2025-04-14"
        ), f"Expected 'gpt-4.1-2025-04-14' but API received '{call_kwargs['model']}'"
        # Verify other parameters (gpt-4.1 supports temperature unlike O3/O4 models)
        assert call_kwargs["temperature"] == 1.0
@@ -156,7 +196,7 @@ class TestOpenAIProvider:
        # Verify response
        assert result.content == "Test response"
-        assert result.model_name == "gpt-4.1-2025-04-14"  # Should be the resolved name
+        assert result.model_name == "gpt-4.1"  # Should be the resolved name
    @patch("providers.openai_compatible.OpenAI")
    def test_generate_content_other_aliases(self, mock_openai_class):
@@ -213,14 +253,22 @@ class TestOpenAIProvider:
        assert call_kwargs["model"] == "o3-mini"  # Should be unchanged
    def test_supports_thinking_mode(self):
-        """Test thinking mode support (currently False for all OpenAI models)."""
+        """Test thinking mode support."""
        provider = OpenAIModelProvider("test-key")
-        # All OpenAI models currently don't support thinking mode
+        # GPT-5 models support thinking mode (reasoning tokens)
        assert provider.supports_thinking_mode("gpt-5") is True
        assert provider.supports_thinking_mode("gpt-5-mini") is True
        assert provider.supports_thinking_mode("gpt5") is True  # Test with alias
        assert provider.supports_thinking_mode("gpt5mini") is True  # Test with alias
        # O3/O4 models don't support thinking mode
        assert provider.supports_thinking_mode("o3") is False
        assert provider.supports_thinking_mode("o3-mini") is False
        assert provider.supports_thinking_mode("o4-mini") is False
-        assert provider.supports_thinking_mode("mini") is False  # Test with alias too
+        assert (
            provider.supports_thinking_mode("mini") is True
        )  # "mini" now resolves to gpt-5-mini which supports thinking
    @patch("providers.openai_compatible.OpenAI")
    def test_o3_pro_routes_to_responses_endpoint(self, mock_openai_class):
@@ -230,11 +278,9 @@ class TestOpenAIProvider:
        mock_openai_class.return_value = mock_client
        mock_response = MagicMock()
-        mock_response.output = MagicMock()
+        # New o3-pro format: direct output_text field
-        mock_response.output.content = [MagicMock()]
+        mock_response.output_text = "4"
-        mock_response.output.content[0].type = "output_text"
+        mock_response.model = "o3-pro"
        mock_response.output.content[0].text = "4"
        mock_response.model = "o3-pro-2025-06-10"
        mock_response.id = "test-id"
        mock_response.created_at = 1234567890
        mock_response.usage = MagicMock()
@@ -252,13 +298,13 @@ class TestOpenAIProvider:
        # Verify responses.create was called
        mock_client.responses.create.assert_called_once()
        call_args = mock_client.responses.create.call_args[1]
-        assert call_args["model"] == "o3-pro-2025-06-10"
+        assert call_args["model"] == "o3-pro"
        assert call_args["input"][0]["role"] == "user"
        assert "What is 2 + 2?" in call_args["input"][0]["content"][0]["text"]
        # Verify the response
        assert result.content == "4"
-        assert result.model_name == "o3-pro-2025-06-10"
+        assert result.model_name == "o3-pro"
        assert result.metadata["endpoint"] == "responses"
    @patch("providers.openai_compatible.OpenAI")
--- a/tests/test_per_tool_model_defaults.py
+++ b/tests/test_per_tool_model_defaults.py
@@ -3,6 +3,7 @@ Test per-tool model default selection functionality
 """
 import json
 import os
 from unittest.mock import MagicMock, patch
 import pytest
@@ -73,154 +74,194 @@ class TestToolModelCategories:
 class TestModelSelection:
    """Test model selection based on tool categories."""
    def teardown_method(self):
        """Clean up after each test to prevent state pollution."""
        ModelProviderRegistry.clear_cache()
        # Unregister all providers
        for provider_type in list(ProviderType):
            ModelProviderRegistry.unregister_provider(provider_type)
    def test_extended_reasoning_with_openai(self):
-        """Test EXTENDED_REASONING prefers o3 when OpenAI is available."""
+        """Test EXTENDED_REASONING with OpenAI provider."""
-        with patch.object(ModelProviderRegistry, "get_available_models") as mock_get_available:
+        # Setup with only OpenAI provider
-            # Mock OpenAI models available
+        ModelProviderRegistry.clear_cache()
-            mock_get_available.return_value = {
+        # First unregister all providers to ensure isolation
-                "o3": ProviderType.OPENAI,
+        for provider_type in list(ProviderType):
-                "o3-mini": ProviderType.OPENAI,
+            ModelProviderRegistry.unregister_provider(provider_type)
-                "o4-mini": ProviderType.OPENAI,
+
-            }
+        with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}, clear=False):
            from providers.openai_provider import OpenAIModelProvider
            ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
            model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.EXTENDED_REASONING)
            # OpenAI prefers o3 for extended reasoning
            assert model == "o3"
    def test_extended_reasoning_with_gemini_only(self):
        """Test EXTENDED_REASONING prefers pro when only Gemini is available."""
-        with patch.object(ModelProviderRegistry, "get_available_models") as mock_get_available:
+        # Clear cache and unregister all providers first
-            # Mock only Gemini models available
+        ModelProviderRegistry.clear_cache()
-            mock_get_available.return_value = {
+        for provider_type in list(ProviderType):
-                "gemini-2.5-pro": ProviderType.GOOGLE,
+            ModelProviderRegistry.unregister_provider(provider_type)
-                "gemini-2.5-flash": ProviderType.GOOGLE,
+
-            }
+        # Register only Gemini provider
        with patch.dict(os.environ, {"GOOGLE_API_KEY": "test-key"}, clear=False):
            from providers.gemini import GeminiModelProvider
            ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
            model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.EXTENDED_REASONING)
-            # Should find the pro model for extended reasoning
+            # Gemini should return one of its models for extended reasoning
-            assert "pro" in model or model == "gemini-2.5-pro"
+            # The default behavior may return flash when pro is not explicitly preferred
            assert model in ["gemini-2.5-pro", "gemini-2.5-flash", "gemini-2.0-flash"]
    def test_fast_response_with_openai(self):
-        """Test FAST_RESPONSE prefers o4-mini when OpenAI is available."""
+        """Test FAST_RESPONSE with OpenAI provider."""
-        with patch.object(ModelProviderRegistry, "get_available_models") as mock_get_available:
+        # Setup with only OpenAI provider
-            # Mock OpenAI models available
+        ModelProviderRegistry.clear_cache()
-            mock_get_available.return_value = {
+        # First unregister all providers to ensure isolation
-                "o3": ProviderType.OPENAI,
+        for provider_type in list(ProviderType):
-                "o3-mini": ProviderType.OPENAI,
+            ModelProviderRegistry.unregister_provider(provider_type)
-                "o4-mini": ProviderType.OPENAI,
+
-            }
+        with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}, clear=False):
            from providers.openai_provider import OpenAIModelProvider
            ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
            model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.FAST_RESPONSE)
-            assert model == "o4-mini"
+            # OpenAI now prefers gpt-5 for fast response (based on our new preference order)
            assert model == "gpt-5"
    def test_fast_response_with_gemini_only(self):
        """Test FAST_RESPONSE prefers flash when only Gemini is available."""
-        with patch.object(ModelProviderRegistry, "get_available_models") as mock_get_available:
+        # Clear cache and unregister all providers first
-            # Mock only Gemini models available
+        ModelProviderRegistry.clear_cache()
-            mock_get_available.return_value = {
+        for provider_type in list(ProviderType):
-                "gemini-2.5-pro": ProviderType.GOOGLE,
+            ModelProviderRegistry.unregister_provider(provider_type)
-                "gemini-2.5-flash": ProviderType.GOOGLE,
+
-            }
+        # Register only Gemini provider
        with patch.dict(os.environ, {"GOOGLE_API_KEY": "test-key"}, clear=False):
            from providers.gemini import GeminiModelProvider
            ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
            model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.FAST_RESPONSE)
-            # Should find the flash model for fast response
+            # Gemini should return one of its models for fast response
-            assert "flash" in model or model == "gemini-2.5-flash"
+            assert model in ["gemini-2.5-flash", "gemini-2.0-flash", "gemini-2.5-pro"]
    def test_balanced_category_fallback(self):
        """Test BALANCED category uses existing logic."""
-        with patch.object(ModelProviderRegistry, "get_available_models") as mock_get_available:
+        # Setup with only OpenAI provider
-            # Mock OpenAI models available
+        ModelProviderRegistry.clear_cache()
-            mock_get_available.return_value = {
+        # First unregister all providers to ensure isolation
-                "o3": ProviderType.OPENAI,
+        for provider_type in list(ProviderType):
-                "o3-mini": ProviderType.OPENAI,
+            ModelProviderRegistry.unregister_provider(provider_type)
-                "o4-mini": ProviderType.OPENAI,
+
-            }
+        with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}, clear=False):
            from providers.openai_provider import OpenAIModelProvider
            ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
            model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.BALANCED)
-            assert model == "o4-mini"  # Balanced prefers o4-mini when OpenAI available
+            # OpenAI prefers gpt-5 for balanced (based on our new preference order)
            assert model == "gpt-5"
    def test_no_category_uses_balanced_logic(self):
        """Test that no category specified uses balanced logic."""
-        with patch.object(ModelProviderRegistry, "get_available_models") as mock_get_available:
+        # Setup with only Gemini provider
-            # Mock only Gemini models available
+        with patch.dict(os.environ, {"GEMINI_API_KEY": "test-key"}, clear=False):
-            mock_get_available.return_value = {
+            from providers.gemini import GeminiModelProvider
-                "gemini-2.5-pro": ProviderType.GOOGLE,
+
-                "gemini-2.5-flash": ProviderType.GOOGLE,
+            ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
            }
            model = ModelProviderRegistry.get_preferred_fallback_model()
-            # Should pick a reasonable default, preferring flash for balanced use
+            # Should pick flash for balanced use
-            assert "flash" in model or model == "gemini-2.5-flash"
+            assert model == "gemini-2.5-flash"
 class TestFlexibleModelSelection:
    """Test that model selection handles various naming scenarios."""
    def test_fallback_handles_mixed_model_names(self):
-        """Test that fallback selection works with mix of full names and shorthands."""
+        """Test that fallback selection works with different providers."""
-        # Test with mix of full names and shorthands
+        # Test with different provider configurations
        test_cases = [
-            # Case 1: Mix of OpenAI shorthands and full names
+            # Case 1: OpenAI provider for extended reasoning
            {
-                "available": {"o3": ProviderType.OPENAI, "o4-mini": ProviderType.OPENAI},
+                "env": {"OPENAI_API_KEY": "test-key"},
                "provider_type": ProviderType.OPENAI,
                "category": ToolModelCategory.EXTENDED_REASONING,
                "expected": "o3",
            },
-            # Case 2: Mix of Gemini shorthands and full names
+            # Case 2: Gemini provider for fast response
            {
-                "available": {
+                "env": {"GEMINI_API_KEY": "test-key"},
-                    "gemini-2.5-flash": ProviderType.GOOGLE,
+                "provider_type": ProviderType.GOOGLE,
                    "gemini-2.5-pro": ProviderType.GOOGLE,
                },
                "category": ToolModelCategory.FAST_RESPONSE,
-                "expected_contains": "flash",
+                "expected": "gemini-2.5-flash",
            },
-            # Case 3: Only shorthands available
+            # Case 3: OpenAI provider for fast response
            {
-                "available": {"o4-mini": ProviderType.OPENAI, "o3-mini": ProviderType.OPENAI},
+                "env": {"OPENAI_API_KEY": "test-key"},
                "provider_type": ProviderType.OPENAI,
                "category": ToolModelCategory.FAST_RESPONSE,
-                "expected": "o4-mini",
+                "expected": "gpt-5",  # Based on new preference order
            },
        ]
        for case in test_cases:
-            with patch.object(ModelProviderRegistry, "get_available_models") as mock_get_available:
+            # Clear registry for clean test
-                mock_get_available.return_value = case["available"]
+            ModelProviderRegistry.clear_cache()
            # First unregister all providers to ensure isolation
            for provider_type in list(ProviderType):
                ModelProviderRegistry.unregister_provider(provider_type)
            with patch.dict(os.environ, case["env"], clear=False):
                # Register the appropriate provider
                if case["provider_type"] == ProviderType.OPENAI:
                    from providers.openai_provider import OpenAIModelProvider
                    ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
                elif case["provider_type"] == ProviderType.GOOGLE:
                    from providers.gemini import GeminiModelProvider
                    ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
                model = ModelProviderRegistry.get_preferred_fallback_model(case["category"])
-
+                assert model == case["expected"], f"Failed for case: {case}, got {model}"
                if "expected" in case:
                    assert model == case["expected"], f"Failed for case: {case}"
                elif "expected_contains" in case:
                    assert (
                        case["expected_contains"] in model
                    ), f"Expected '{case['expected_contains']}' in '{model}' for case: {case}"
 class TestCustomProviderFallback:
    """Test fallback to custom/openrouter providers."""
-    @patch.object(ModelProviderRegistry, "_find_extended_thinking_model")
+    def test_extended_reasoning_custom_fallback(self):
-    def test_extended_reasoning_custom_fallback(self, mock_find_thinking):
+        """Test EXTENDED_REASONING with custom provider."""
-        """Test EXTENDED_REASONING falls back to custom thinking model."""
+        # Setup with custom provider
-        with patch.object(ModelProviderRegistry, "get_available_models") as mock_get_available:
+        ModelProviderRegistry.clear_cache()
-            # No native models available, but OpenRouter is available
+        with patch.dict(os.environ, {"CUSTOM_API_URL": "http://localhost:11434", "CUSTOM_API_KEY": ""}, clear=False):
-            mock_get_available.return_value = {"openrouter-model": ProviderType.OPENROUTER}
+            from providers.custom import CustomProvider
            mock_find_thinking.return_value = "custom/thinking-model"
-            model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.EXTENDED_REASONING)
+            ModelProviderRegistry.register_provider(ProviderType.CUSTOM, CustomProvider)
            assert model == "custom/thinking-model"
            mock_find_thinking.assert_called_once()
-    @patch.object(ModelProviderRegistry, "_find_extended_thinking_model")
+            provider = ModelProviderRegistry.get_provider(ProviderType.CUSTOM)
-    def test_extended_reasoning_final_fallback(self, mock_find_thinking):
+            if provider:
-        """Test EXTENDED_REASONING falls back to pro when no custom found."""
+                model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.EXTENDED_REASONING)
-        with patch.object(ModelProviderRegistry, "get_provider") as mock_get_provider:
+                # Should get a model from custom provider
-            # No providers available
+                assert model is not None
            mock_get_provider.return_value = None
            mock_find_thinking.return_value = None
-            model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.EXTENDED_REASONING)
+    def test_extended_reasoning_final_fallback(self):
-            assert model == "gemini-2.5-pro"
+        """Test EXTENDED_REASONING falls back to default when no providers."""
        # Clear all providers
        ModelProviderRegistry.clear_cache()
        for provider_type in list(
            ModelProviderRegistry._instance._providers.keys() if ModelProviderRegistry._instance else []
        ):
            ModelProviderRegistry.unregister_provider(provider_type)
        model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.EXTENDED_REASONING)
        # Should fall back to hardcoded default
        assert model == "gemini-2.5-flash"
 class TestAutoModeErrorMessages:
@@ -266,42 +307,45 @@ class TestAutoModeErrorMessages:
 class TestProviderHelperMethods:
    """Test the helper methods for finding models from custom/openrouter."""
-    def test_find_extended_thinking_model_custom(self):
+    def test_extended_reasoning_with_custom_provider(self):
-        """Test finding thinking model from custom provider."""
+        """Test extended reasoning model selection with custom provider."""
-        with patch.object(ModelProviderRegistry, "get_provider") as mock_get_provider:
+        # Setup with custom provider
        with patch.dict(os.environ, {"CUSTOM_API_URL": "http://localhost:11434", "CUSTOM_API_KEY": ""}, clear=False):
            from providers.custom import CustomProvider
-            # Mock custom provider with thinking model
+            ModelProviderRegistry.register_provider(ProviderType.CUSTOM, CustomProvider)
            mock_custom = MagicMock(spec=CustomProvider)
            mock_custom.model_registry = {
                "model1": {"supports_extended_thinking": False},
                "model2": {"supports_extended_thinking": True},
                "model3": {"supports_extended_thinking": False},
            }
            mock_get_provider.side_effect = lambda ptype: mock_custom if ptype == ProviderType.CUSTOM else None
-            model = ModelProviderRegistry._find_extended_thinking_model()
+            provider = ModelProviderRegistry.get_provider(ProviderType.CUSTOM)
-            assert model == "model2"
+            if provider:
                # Custom provider should return a model for extended reasoning
                model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.EXTENDED_REASONING)
                assert model is not None
-    def test_find_extended_thinking_model_openrouter(self):
+    def test_extended_reasoning_with_openrouter(self):
-        """Test finding thinking model from openrouter."""
+        """Test extended reasoning model selection with OpenRouter."""
-        with patch.object(ModelProviderRegistry, "get_provider") as mock_get_provider:
+        # Setup with OpenRouter provider
-            # Mock openrouter provider
+        with patch.dict(os.environ, {"OPENROUTER_API_KEY": "test-key"}, clear=False):
-            mock_openrouter = MagicMock()
+            from providers.openrouter import OpenRouterProvider
            mock_openrouter.validate_model_name.side_effect = lambda m: m == "anthropic/claude-sonnet-4"
            mock_get_provider.side_effect = lambda ptype: mock_openrouter if ptype == ProviderType.OPENROUTER else None
-            model = ModelProviderRegistry._find_extended_thinking_model()
+            ModelProviderRegistry.register_provider(ProviderType.OPENROUTER, OpenRouterProvider)
            assert model == "anthropic/claude-sonnet-4"
-    def test_find_extended_thinking_model_none_found(self):
+            # OpenRouter should provide a model for extended reasoning
-        """Test when no thinking model is found."""
+            model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.EXTENDED_REASONING)
-        with patch.object(ModelProviderRegistry, "get_provider") as mock_get_provider:
+            # Should return first available OpenRouter model
-            # No providers available
+            assert model is not None
            mock_get_provider.return_value = None
-            model = ModelProviderRegistry._find_extended_thinking_model()
+    def test_fallback_when_no_providers_available(self):
-            assert model is None
+        """Test fallback when no providers are available."""
        # Clear all providers
        ModelProviderRegistry.clear_cache()
        for provider_type in list(
            ModelProviderRegistry._instance._providers.keys() if ModelProviderRegistry._instance else []
        ):
            ModelProviderRegistry.unregister_provider(provider_type)
        # Should return hardcoded fallback
        model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.EXTENDED_REASONING)
        assert model == "gemini-2.5-flash"
 class TestEffectiveAutoMode:
--- a/tests/test_pii_sanitizer.py
+++ b/tests/test_pii_sanitizer.py
@@ -0,0 +1,143 @@
 #!/usr/bin/env python3
 """Test cases for PII sanitizer."""
 import unittest
 from .pii_sanitizer import PIIPattern, PIISanitizer
 class TestPIISanitizer(unittest.TestCase):
    """Test PII sanitization functionality."""
    def setUp(self):
        """Set up test sanitizer."""
        self.sanitizer = PIISanitizer()
    def test_api_key_sanitization(self):
        """Test various API key formats are sanitized."""
        test_cases = [
            # OpenAI keys
            ("sk-proj-abcd1234567890ABCD1234567890abcd1234567890ABCD12", "sk-proj-SANITIZED"),
            ("sk-1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN", "sk-SANITIZED"),
            # Anthropic keys
            ("sk-ant-abcd1234567890ABCD1234567890abcd1234567890ABCD12", "sk-ant-SANITIZED"),
            # Google keys
            ("AIzaSyD-1234567890abcdefghijklmnopqrstuv", "AIza-SANITIZED"),
            # GitHub tokens
            ("ghp_1234567890abcdefghijklmnopqrstuvwxyz", "gh_SANITIZED"),
            ("ghs_1234567890abcdefghijklmnopqrstuvwxyz", "gh_SANITIZED"),
        ]
        for original, expected in test_cases:
            with self.subTest(original=original):
                result = self.sanitizer.sanitize_string(original)
                self.assertEqual(result, expected)
    def test_personal_info_sanitization(self):
        """Test personal information is sanitized."""
        test_cases = [
            # Email addresses
            ("john.doe@example.com", "user@example.com"),
            ("test123@company.org", "user@example.com"),
            # Phone numbers (all now use the same pattern)
            ("(555) 123-4567", "(XXX) XXX-XXXX"),
            ("555-123-4567", "(XXX) XXX-XXXX"),
            ("+1-555-123-4567", "(XXX) XXX-XXXX"),
            # SSN
            ("123-45-6789", "XXX-XX-XXXX"),
            # Credit card
            ("1234 5678 9012 3456", "XXXX-XXXX-XXXX-XXXX"),
            ("1234-5678-9012-3456", "XXXX-XXXX-XXXX-XXXX"),
        ]
        for original, expected in test_cases:
            with self.subTest(original=original):
                result = self.sanitizer.sanitize_string(original)
                self.assertEqual(result, expected)
    def test_header_sanitization(self):
        """Test HTTP header sanitization."""
        headers = {
            "Authorization": "Bearer sk-proj-abcd1234567890ABCD1234567890abcd1234567890ABCD12",
            "API-Key": "sk-1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN",
            "Content-Type": "application/json",
            "User-Agent": "MyApp/1.0",
            "Cookie": "session=abc123; user=john.doe@example.com",
        }
        sanitized = self.sanitizer.sanitize_headers(headers)
        self.assertEqual(sanitized["Authorization"], "Bearer SANITIZED")
        self.assertEqual(sanitized["API-Key"], "sk-SANITIZED")
        self.assertEqual(sanitized["Content-Type"], "application/json")
        self.assertEqual(sanitized["User-Agent"], "MyApp/1.0")
        self.assertIn("user@example.com", sanitized["Cookie"])
    def test_nested_structure_sanitization(self):
        """Test sanitization of nested data structures."""
        data = {
            "user": {
                "email": "john.doe@example.com",
                "api_key": "sk-proj-abcd1234567890ABCD1234567890abcd1234567890ABCD12",
            },
            "tokens": [
                "ghp_1234567890abcdefghijklmnopqrstuvwxyz",
                "Bearer sk-ant-abcd1234567890ABCD1234567890abcd1234567890ABCD12",
            ],
            "metadata": {"ip": "192.168.1.100", "phone": "(555) 123-4567"},
        }
        sanitized = self.sanitizer.sanitize_value(data)
        self.assertEqual(sanitized["user"]["email"], "user@example.com")
        self.assertEqual(sanitized["user"]["api_key"], "sk-proj-SANITIZED")
        self.assertEqual(sanitized["tokens"][0], "gh_SANITIZED")
        self.assertEqual(sanitized["tokens"][1], "Bearer sk-ant-SANITIZED")
        self.assertEqual(sanitized["metadata"]["ip"], "0.0.0.0")
        self.assertEqual(sanitized["metadata"]["phone"], "(XXX) XXX-XXXX")
    def test_url_sanitization(self):
        """Test URL parameter sanitization."""
        urls = [
            (
                "https://api.example.com/v1/users?api_key=sk-1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN",
                "https://api.example.com/v1/users?api_key=SANITIZED",
            ),
            (
                "https://example.com/login?token=ghp_1234567890abcdefghijklmnopqrstuvwxyz&user=test",
                "https://example.com/login?token=SANITIZED&user=test",
            ),
        ]
        for original, expected in urls:
            with self.subTest(url=original):
                result = self.sanitizer.sanitize_url(original)
                self.assertEqual(result, expected)
    def test_disable_sanitization(self):
        """Test that sanitization can be disabled."""
        self.sanitizer.sanitize_enabled = False
        sensitive_data = "sk-proj-abcd1234567890ABCD1234567890abcd1234567890ABCD12"
        result = self.sanitizer.sanitize_string(sensitive_data)
        # Should return original when disabled
        self.assertEqual(result, sensitive_data)
    def test_custom_pattern(self):
        """Test adding custom PII patterns."""
        # Add custom pattern for internal employee IDs
        custom_pattern = PIIPattern.create(
            name="employee_id", pattern=r"EMP\d{6}", replacement="EMP-REDACTED", description="Internal employee IDs"
        )
        self.sanitizer.add_pattern(custom_pattern)
        text = "Employee EMP123456 has access to the system"
        result = self.sanitizer.sanitize_string(text)
        self.assertEqual(result, "Employee EMP-REDACTED has access to the system")
 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_provider_utf8.py
+++ b/tests/test_provider_utf8.py
@@ -126,7 +126,7 @@ class TestProviderUTF8Encoding(unittest.TestCase):
        mock_response.usage = Mock()
        mock_response.usage.input_tokens = 50
        mock_response.usage.output_tokens = 25
-        mock_response.model = "o3-pro-2025-06-10"
+        mock_response.model = "o3-pro"
        mock_response.id = "test-id"
        mock_response.created_at = 1234567890
@@ -141,7 +141,7 @@ class TestProviderUTF8Encoding(unittest.TestCase):
        with patch("logging.info") as mock_logging:
            response = provider.generate_content(
                prompt="Analyze this Python code for issues",
-                model_name="o3-pro-2025-06-10",
+                model_name="o3-pro",
                system_prompt="You are a code review expert.",
            )
@@ -351,7 +351,7 @@ class TestLocaleModelIntegration(unittest.TestCase):
    def test_model_name_resolution_utf8(self):
        """Test model name resolution with UTF-8."""
        provider = OpenAIModelProvider(api_key="test")
-        model_names = ["gpt-4", "gemini-2.5-flash", "claude-3-opus", "o3-pro-2025-06-10"]
+        model_names = ["gpt-4", "gemini-2.5-flash", "claude-3-opus", "o3-pro"]
        for model_name in model_names:
            resolved = provider._resolve_model_name(model_name)
            self.assertIsInstance(resolved, str)
--- a/tests/test_supported_models_aliases.py
+++ b/tests/test_supported_models_aliases.py
@@ -47,22 +47,23 @@ class TestSupportedModelsAliases:
            assert isinstance(config.aliases, list), f"{model_name} aliases must be a list"
        # Test specific aliases
-        assert "mini" in provider.SUPPORTED_MODELS["o4-mini"].aliases
+        # "mini" is now an alias for gpt-5-mini, not o4-mini
        assert "mini" in provider.SUPPORTED_MODELS["gpt-5-mini"].aliases
        assert "o4mini" in provider.SUPPORTED_MODELS["o4-mini"].aliases
        assert "o4-mini" in provider.SUPPORTED_MODELS["o4-mini"].aliases
        assert "o3mini" in provider.SUPPORTED_MODELS["o3-mini"].aliases
-        assert "o3-pro" in provider.SUPPORTED_MODELS["o3-pro-2025-06-10"].aliases
+        assert "o3-pro" in provider.SUPPORTED_MODELS["o3-pro"].aliases
-        assert "o4mini" in provider.SUPPORTED_MODELS["o4-mini"].aliases
+        assert "gpt4.1" in provider.SUPPORTED_MODELS["gpt-4.1"].aliases
        assert "gpt4.1" in provider.SUPPORTED_MODELS["gpt-4.1-2025-04-14"].aliases
        # Test alias resolution
-        assert provider._resolve_model_name("mini") == "o4-mini"
+        assert provider._resolve_model_name("mini") == "gpt-5-mini"  # mini -> gpt-5-mini now
        assert provider._resolve_model_name("o3mini") == "o3-mini"
-        assert provider._resolve_model_name("o3-pro") == "o3-pro-2025-06-10"
+        assert provider._resolve_model_name("o3-pro") == "o3-pro"  # o3-pro is already the base model name
        assert provider._resolve_model_name("o4mini") == "o4-mini"
-        assert provider._resolve_model_name("gpt4.1") == "gpt-4.1-2025-04-14"
+        assert provider._resolve_model_name("gpt4.1") == "gpt-4.1"  # gpt4.1 resolves to gpt-4.1
        # Test case insensitive resolution
-        assert provider._resolve_model_name("Mini") == "o4-mini"
+        assert provider._resolve_model_name("Mini") == "gpt-5-mini"  # mini -> gpt-5-mini now
        assert provider._resolve_model_name("O3MINI") == "o3-mini"
    def test_xai_provider_aliases(self):
@@ -75,25 +76,21 @@ class TestSupportedModelsAliases:
            assert isinstance(config.aliases, list), f"{model_name} aliases must be a list"
        # Test specific aliases
-        assert "grok" in provider.SUPPORTED_MODELS["grok-4-0709"].aliases
+        assert "grok" in provider.SUPPORTED_MODELS["grok-4"].aliases
-        assert "grok-4" in provider.SUPPORTED_MODELS["grok-4-0709"].aliases
+        assert "grok4" in provider.SUPPORTED_MODELS["grok-4"].aliases
        assert "grok-4-latest" in provider.SUPPORTED_MODELS["grok-4-0709"].aliases
        assert "grok4" in provider.SUPPORTED_MODELS["grok-4-0709"].aliases
        assert "grok3" in provider.SUPPORTED_MODELS["grok-3"].aliases
        assert "grok3fast" in provider.SUPPORTED_MODELS["grok-3-fast"].aliases
        assert "grokfast" in provider.SUPPORTED_MODELS["grok-3-fast"].aliases
        # Test alias resolution
-        assert provider._resolve_model_name("grok") == "grok-4-0709"
+        assert provider._resolve_model_name("grok") == "grok-4"
-        assert provider._resolve_model_name("grok4") == "grok-4-0709"
+        assert provider._resolve_model_name("grok4") == "grok-4"
        assert provider._resolve_model_name("grok-4") == "grok-4-0709"
        assert provider._resolve_model_name("grok3") == "grok-3"
        assert provider._resolve_model_name("grok3fast") == "grok-3-fast"
        assert provider._resolve_model_name("grokfast") == "grok-3-fast"
        # Test case insensitive resolution
-        assert provider._resolve_model_name("Grok") == "grok-4-0709"
+        assert provider._resolve_model_name("Grok") == "grok-4"
        assert provider._resolve_model_name("GROK4") == "grok-4-0709"
        assert provider._resolve_model_name("GROKFAST") == "grok-3-fast"
    def test_dial_provider_aliases(self):
--- a/tests/test_xai_provider.py
+++ b/tests/test_xai_provider.py
@@ -66,10 +66,8 @@ class TestXAIProvider:
        provider = XAIModelProvider("test-key")
        # Test shorthand resolution
-        assert provider._resolve_model_name("grok") == "grok-4-0709"
+        assert provider._resolve_model_name("grok") == "grok-4"
-        assert provider._resolve_model_name("grok4") == "grok-4-0709"
+        assert provider._resolve_model_name("grok4") == "grok-4"
        assert provider._resolve_model_name("grok-4") == "grok-4-0709"
        assert provider._resolve_model_name("grok-4-latest") == "grok-4-0709"
        assert provider._resolve_model_name("grok3") == "grok-3"
        assert provider._resolve_model_name("grokfast") == "grok-3-fast"
        assert provider._resolve_model_name("grok3fast") == "grok-3-fast"
@@ -96,7 +94,7 @@ class TestXAIProvider:
        # Test temperature range
        assert capabilities.temperature_constraint.min_temp == 0.0
        assert capabilities.temperature_constraint.max_temp == 2.0
-        assert capabilities.temperature_constraint.default_temp == 0.7
+        assert capabilities.temperature_constraint.default_temp == 0.3
    def test_get_capabilities_grok4(self):
        """Test getting model capabilities for GROK-4."""
@@ -135,13 +133,9 @@ class TestXAIProvider:
        provider = XAIModelProvider("test-key")
        capabilities = provider.get_capabilities("grok")
-        assert capabilities.model_name == "grok-4-0709"  # Should resolve to full name
+        assert capabilities.model_name == "grok-4"  # Should resolve to full name
        assert capabilities.context_window == 256_000
        capabilities_3 = provider.get_capabilities("grok3")
        assert capabilities_3.model_name == "grok-3"  # Should resolve to full name
        assert capabilities_3.context_window == 131_072
        capabilities_fast = provider.get_capabilities("grokfast")
        assert capabilities_fast.model_name == "grok-3-fast"  # Should resolve to full name
@@ -164,7 +158,9 @@ class TestXAIProvider:
        # Grok-3 models don't support thinking mode
        assert not provider.supports_thinking_mode("grok-3")
        assert not provider.supports_thinking_mode("grok-3-fast")
-        assert not provider.supports_thinking_mode("grok3")
+        assert provider.supports_thinking_mode("grok-4")  # grok-4 supports thinking mode
        assert provider.supports_thinking_mode("grok")  # resolves to grok-4
        assert provider.supports_thinking_mode("grok4")  # resolves to grok-4
        assert not provider.supports_thinking_mode("grokfast")
    def test_provider_type(self):
@@ -186,9 +182,8 @@ class TestXAIProvider:
        assert provider.validate_model_name("grok-3") is True
        assert provider.validate_model_name("grok3") is True  # Shorthand for grok-3
-        # grok-4 and its aliases should be blocked
+        # grok should be blocked (resolves to grok-4 which is not allowed)
-        assert provider.validate_model_name("grok-4-0709") is False
+        assert provider.validate_model_name("grok") is False
        assert provider.validate_model_name("grok") is False  # Now resolves to grok-4
        # grok-3-fast should be blocked by restrictions
        assert provider.validate_model_name("grok-3-fast") is False
@@ -204,7 +199,7 @@ class TestXAIProvider:
        provider = XAIModelProvider("test-key")
-        # Shorthand "grok" should be allowed (resolves to grok-4-0709)
+        # Shorthand "grok" should be allowed (resolves to grok-4)
        assert provider.validate_model_name("grok") is True
        # Full name "grok-4-0709" should NOT be allowed (only shorthand "grok" is in restriction list)
@@ -268,7 +263,7 @@ class TestXAIProvider:
        provider = XAIModelProvider("test-key")
        # Check that all expected base models are present
-        assert "grok-4-0709" in provider.SUPPORTED_MODELS
+        assert "grok-4" in provider.SUPPORTED_MODELS
        assert "grok-3" in provider.SUPPORTED_MODELS
        assert "grok-3-fast" in provider.SUPPORTED_MODELS
@@ -292,7 +287,13 @@ class TestXAIProvider:
        grok3_config = provider.SUPPORTED_MODELS["grok-3"]
        assert grok3_config.context_window == 131_072
        assert grok3_config.supports_extended_thinking is False
-        assert "grok3" in grok3_config.aliases
+        # Check aliases are correctly structured
        assert "grok3" in grok3_config.aliases  # grok3 resolves to grok-3
        # Check grok-4 aliases
        grok4_config = provider.SUPPORTED_MODELS["grok-4"]
        assert "grok" in grok4_config.aliases  # grok resolves to grok-4
        assert "grok4" in grok4_config.aliases
        grok3fast_config = provider.SUPPORTED_MODELS["grok-3-fast"]
        assert "grok3fast" in grok3fast_config.aliases
@@ -303,7 +304,7 @@ class TestXAIProvider:
        """Test that generate_content resolves aliases before making API calls.
        This is the CRITICAL test that ensures aliases like 'grok' get resolved
-        to 'grok-3' before being sent to X.AI API.
+        to 'grok-4' before being sent to X.AI API.
        """
        # Set up mock OpenAI client
        mock_client = MagicMock()
@@ -328,17 +329,15 @@ class TestXAIProvider:
        # Call generate_content with alias 'grok'
        result = provider.generate_content(
-            prompt="Test prompt", model_name="grok", temperature=0.7  # This should be resolved to "grok-4-0709"
+            prompt="Test prompt", model_name="grok", temperature=0.7  # This should be resolved to "grok-4"
        )
        # Verify the API was called with the RESOLVED model name
        mock_client.chat.completions.create.assert_called_once()
        call_kwargs = mock_client.chat.completions.create.call_args[1]
-        # CRITICAL ASSERTION: The API should receive "grok-4-0709", not "grok"
+        # CRITICAL ASSERTION: The API should receive "grok-4", not "grok"
-        assert (
+        assert call_kwargs["model"] == "grok-4", f"Expected 'grok-4' but API received '{call_kwargs['model']}'"
            call_kwargs["model"] == "grok-4-0709"
        ), f"Expected 'grok-4-0709' but API received '{call_kwargs['model']}'"
        # Verify other parameters
        assert call_kwargs["temperature"] == 0.7
@@ -348,7 +347,7 @@ class TestXAIProvider:
        # Verify response
        assert result.content == "Test response"
-        assert result.model_name == "grok-4-0709"  # Should be the resolved name
+        assert result.model_name == "grok-4"  # Should be the resolved name
    @patch("providers.openai_compatible.OpenAI")
    def test_generate_content_other_aliases(self, mock_openai_class):
--- a/tests/transport_helpers.py
+++ b/tests/transport_helpers.py
@@ -0,0 +1,47 @@
 """Helper functions for HTTP transport injection in tests."""
 from tests.http_transport_recorder import TransportFactory
 def inject_transport(monkeypatch, cassette_path: str):
    """Inject HTTP transport into OpenAICompatibleProvider for testing.
    This helper simplifies the monkey patching pattern used across tests
    to inject custom HTTP transports for recording/replaying API calls.
    Also ensures OpenAI provider is properly registered for tests that need it.
    Args:
        monkeypatch: pytest monkeypatch fixture
        cassette_path: Path to cassette file for recording/replay
    Returns:
        The created transport instance
    Example:
        transport = inject_transport(monkeypatch, "path/to/cassette.json")
    """
    # Ensure OpenAI provider is registered - always needed for transport injection
    from providers.base import ProviderType
    from providers.openai_provider import OpenAIModelProvider
    from providers.registry import ModelProviderRegistry
    # Always register OpenAI provider for transport tests (API key might be dummy)
    ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
    # Create transport
    transport = TransportFactory.create_transport(str(cassette_path))
    # Inject transport using the established pattern
    from providers.openai_compatible import OpenAICompatibleProvider
    original_client_property = OpenAICompatibleProvider.client
    def patched_client_getter(self):
        if self._client is None:
            self._test_transport = transport
        return original_client_property.fget(self)
    monkeypatch.setattr(OpenAICompatibleProvider, "client", property(patched_client_getter))
    return transport
--- a/tools/challenge.py
+++ b/tools/challenge.py
@@ -152,7 +152,7 @@ class ChallengeTool(SimpleTool):
            # Return the wrapped prompt as the response
            response_data = {
-                "status": "challenge_created",
+                "status": "challenge_accepted",
                "original_statement": request.prompt,
                "challenge_prompt": wrapped_prompt,
                "instructions": (
--- a/tools/chat.py
+++ b/tools/chat.py
@@ -23,6 +23,9 @@ from .simple.base import SimpleTool
 CHAT_FIELD_DESCRIPTIONS = {
    "prompt": (
        "You MUST provide a thorough, expressive question or share an idea with as much context as possible. "
        "IMPORTANT: When referring to code, use the files parameter to pass relevant files and only use the prompt to refer to "
        "function / method names or very small code snippets if absolutely necessary to explain the issue. Do NOT "
        "pass large code snippets in the prompt as this is exclusively reserved for descriptive text only. "
        "Remember: you're talking to an assistant who has deep expertise and can provide nuanced insights. Include your "
        "current thinking, specific challenges, background context, what you've already tried, and what "
        "kind of response would be most helpful. The more context and detail you provide, the more "
--- a/tools/codereview.py
+++ b/tools/codereview.py
@@ -45,6 +45,9 @@ CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS = {
        "and ways to reduce complexity while maintaining functionality. Map out the codebase structure, understand "
        "the business logic, and identify areas requiring deeper analysis. In all later steps, continue exploring "
        "with precision: trace dependencies, verify assumptions, and adapt your understanding as you uncover more evidence."
        "IMPORTANT: When referring to code, use the relevant_files parameter to pass relevant files and only use the prompt to refer to "
        "function / method names or very small code snippets if absolutely necessary to explain the issue. Do NOT "
        "pass large code snippets in the prompt as this is exclusively reserved for descriptive text only. "
    ),
    "step_number": (
        "The index of the current step in the code review sequence, beginning at 1. Each step should build upon or "
@@ -52,11 +55,13 @@ CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS = {
    ),
    "total_steps": (
        "Your current estimate for how many steps will be needed to complete the code review. "
-        "Adjust as new findings emerge."
+        "Adjust as new findings emerge. MANDATORY: When continuation_id is provided (continuing a previous "
        "conversation), set this to 1 as we're not starting a new multi-step investigation."
    ),
    "next_step_required": (
        "Set to true if you plan to continue the investigation with another step. False means you believe the "
-        "code review analysis is complete and ready for expert validation."
+        "code review analysis is complete and ready for expert validation. MANDATORY: When continuation_id is "
        "provided (continuing a previous conversation), set this to False to immediately proceed with expert analysis."
    ),
    "findings": (
        "Summarize everything discovered in this step about the code being reviewed. Include analysis of code quality, "
@@ -91,13 +96,14 @@ CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS = {
        "unnecessary complexity, etc."
    ),
    "confidence": (
-        "Indicate your current confidence in the code review assessment. Use: 'exploring' (starting analysis), 'low' "
+        "Indicate your current confidence in the assessment. Use: 'exploring' (starting analysis), 'low' (early "
-        "(early investigation), 'medium' (some evidence gathered), 'high' (strong evidence), "
+        "investigation), 'medium' (some evidence gathered), 'high' (strong evidence), "
-        "'very_high' (very strong evidence), 'almost_certain' (nearly complete review), 'certain' (100% confidence - "
+        "'very_high' (very strong evidence), 'almost_certain' (nearly complete validation), 'certain' (200% confidence - "
-        "code review is thoroughly complete and all significant issues are identified with no need for external model validation). "
+        "analysis is complete and all issues are identified with no need for external model validation). "
-        "Do NOT use 'certain' unless the code review is comprehensively complete, use 'very_high' or 'almost_certain' instead if not 100% sure. "
+        "Do NOT use 'certain' unless the pre-commit validation is thoroughly complete, use 'very_high' or 'almost_certain' "
-        "Using 'certain' means you have complete confidence locally and prevents external model validation. Also do "
+        "instead if not 200% sure. "
-        "NOT set confidence to 'certain' if the user has strongly requested that external review must be performed."
+        "Using 'certain' means you have complete confidence locally and prevents external model validation. Also "
        "do NOT set confidence to 'certain' if the user has strongly requested that external validation MUST be performed."
    ),
    "backtrack_from_step": (
        "If an earlier finding or assessment needs to be revised or discarded, specify the step number from which to "
@@ -572,6 +578,17 @@ class CodeReviewTool(WorkflowTool):
        """
        Provide step-specific guidance for code review workflow.
        """
        # Check if this is a continuation - if so, skip workflow and go to expert analysis
        continuation_id = self.get_request_continuation_id(request)
        if continuation_id:
            return {
                "next_steps": (
                    "Continuing previous conversation. The expert analysis will now be performed based on the "
                    "accumulated context from the previous conversation. The analysis will build upon the prior "
                    "findings without repeating the investigation steps."
                )
            }
        # Generate the next steps instruction based on required actions
        required_actions = self.get_required_actions(step_number, confidence, request.findings, request.total_steps)
--- a/tools/consensus.py
+++ b/tools/consensus.py
@@ -537,11 +537,13 @@ of the evidence, even when it strongly points in one direction.""",
            provider = self.get_model_provider(model_name)
            # Prepare the prompt with any relevant files
            # Use continuation_id=None for blinded consensus - each model should only see
            # original prompt + files, not conversation history or other model responses
            prompt = self.initial_prompt
            if request.relevant_files:
                file_content, _ = self._prepare_file_content_for_prompt(
                    request.relevant_files,
-                    request.continuation_id,
+                    None,  # Use None instead of request.continuation_id for blinded consensus
                    "Context files",
                )
                if file_content:
--- a/tools/debug.py
+++ b/tools/debug.py
@@ -45,6 +45,9 @@ DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS = {
        "could cause instability. In concurrent systems, watch for race conditions, shared state, or timing "
        "dependencies. In all later steps, continue exploring with precision: trace deeper dependencies, verify "
        "hypotheses, and adapt your understanding as you uncover more evidence."
        "IMPORTANT: When referring to code, use the relevant_files parameter to pass relevant files and only use the prompt to refer to "
        "function / method names or very small code snippets if absolutely necessary to explain the issue. Do NOT "
        "pass large code snippets in the prompt as this is exclusively reserved for descriptive text only. "
    ),
    "step_number": (
        "The index of the current step in the investigation sequence, beginning at 1. Each step should build upon or "
@@ -52,11 +55,13 @@ DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS = {
    ),
    "total_steps": (
        "Your current estimate for how many steps will be needed to complete the investigation. "
-        "Adjust as new findings emerge."
+        "Adjust as new findings emerge. IMPORTANT: When continuation_id is provided (continuing a previous "
        "conversation), set this to 1 as we're not starting a new multi-step investigation."
    ),
    "next_step_required": (
        "Set to true if you plan to continue the investigation with another step. False means you believe the root "
-        "cause is known or the investigation is complete."
+        "cause is known or the investigation is complete. IMPORTANT: When continuation_id is "
        "provided (continuing a previous conversation), set this to False to immediately proceed with expert analysis."
    ),
    "findings": (
        "Summarize everything discovered in this step. Include new clues, unexpected behavior, evidence from code or "
@@ -92,10 +97,10 @@ DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS = {
    "confidence": (
        "Indicate your current confidence in the hypothesis. Use: 'exploring' (starting out), 'low' (early idea), "
        "'medium' (some supporting evidence), 'high' (strong evidence), 'very_high' (very strong evidence), "
-        "'almost_certain' (nearly confirmed), 'certain' (100% confidence - root cause and minimal fix are both "
+        "'almost_certain' (nearly confirmed), 'certain' (200% confidence - root cause and minimal fix are both "
        "confirmed locally with no need for external model validation). Do NOT use 'certain' unless the issue can be "
-        "fully resolved with a fix, use 'very_high' or 'almost_certain' instead when not 100% sure. Using 'certain' "
+        "fully resolved with a fix, use 'very_high' or 'almost_certain' instead when not 200% sure. Using 'certain' "
-        "means you have complete confidence locally and prevents external model validation. Also do "
+        "means you have ABSOLUTE confidence locally and prevents external model validation. Also do "
        "NOT set confidence to 'certain' if the user has strongly requested that external validation MUST be performed."
    ),
    "backtrack_from_step": (
@@ -165,7 +170,7 @@ class DebugIssueTool(WorkflowTool):
    def get_description(self) -> str:
        return (
-            "DEBUG & ROOT CAUSE ANALYSIS - Systematic self-investigation followed by expert analysis. "
+            "DEBUG & ROOT CAUSE ANALYSIS - Use this tool to perform any kind of debugging, bug hunting, or issue tracking. "
            "This tool guides you through a step-by-step investigation process where you:\n\n"
            "1. Start with step 1: describe the issue to investigate\n"
            "2. STOP and investigate using appropriate tools\n"
--- a/tools/listmodels.py
+++ b/tools/listmodels.py
@@ -225,7 +225,7 @@ class ListModelsTool(BaseTool):
                output_lines.append(f"**Error loading models**: {str(e)}")
        else:
            output_lines.append("**Status**: Not configured (set OPENROUTER_API_KEY)")
-            output_lines.append("**Note**: Provides access to GPT-4, O3, Mistral, and many more")
+            output_lines.append("**Note**: Provides access to GPT-5, O3, Mistral, and many more")
        output_lines.append("")
@@ -295,7 +295,7 @@ class ListModelsTool(BaseTool):
        # Add usage tips
        output_lines.append("\n**Usage Tips**:")
-        output_lines.append("- Use model aliases (e.g., 'flash', 'o3', 'opus') for convenience")
+        output_lines.append("- Use model aliases (e.g., 'flash', 'gpt5', 'opus') for convenience")
        output_lines.append("- In auto mode, the CLI Agent will select the best model for each task")
        output_lines.append("- Custom models are only available when CUSTOM_API_URL is set")
        output_lines.append("- OpenRouter provides access to many cloud models with one API key")
--- a/tools/precommit.py
+++ b/tools/precommit.py
@@ -42,6 +42,9 @@ PRECOMMIT_WORKFLOW_FIELD_DESCRIPTIONS = {
        "performance impacts, and maintainability concerns. Map out changed files, understand the business logic, "
        "and identify areas requiring deeper analysis. In all later steps, continue exploring with precision: "
        "trace dependencies, verify hypotheses, and adapt your understanding as you uncover more evidence."
        "IMPORTANT: When referring to code, use the relevant_files parameter to pass relevant files and only use the prompt to refer to "
        "function / method names or very small code snippets if absolutely necessary to explain the issue. Do NOT "
        "pass large code snippets in the prompt as this is exclusively reserved for descriptive text only. "
    ),
    "step_number": (
        "The index of the current step in the pre-commit investigation sequence, beginning at 1. Each step should "
@@ -49,11 +52,13 @@ PRECOMMIT_WORKFLOW_FIELD_DESCRIPTIONS = {
    ),
    "total_steps": (
        "Your current estimate for how many steps will be needed to complete the pre-commit investigation. "
-        "Adjust as new findings emerge."
+        "Adjust as new findings emerge. IMPORTANT: When continuation_id is provided (continuing a previous "
        "conversation), set this to 1 as we're not starting a new multi-step investigation."
    ),
    "next_step_required": (
        "Set to true if you plan to continue the investigation with another step. False means you believe the "
-        "pre-commit analysis is complete and ready for expert validation."
+        "pre-commit analysis is complete and ready for expert validation. IMPORTANT: When continuation_id is "
        "provided (continuing a previous conversation), set this to False to immediately proceed with expert analysis."
    ),
    "findings": (
        "Summarize everything discovered in this step about the changes being committed. Include analysis of git diffs, "
@@ -87,9 +92,10 @@ PRECOMMIT_WORKFLOW_FIELD_DESCRIPTIONS = {
    "confidence": (
        "Indicate your current confidence in the assessment. Use: 'exploring' (starting analysis), 'low' (early "
        "investigation), 'medium' (some evidence gathered), 'high' (strong evidence), "
-        "'very_high' (very strong evidence), 'almost_certain' (nearly complete validation), 'certain' (100% confidence - "
+        "'very_high' (very strong evidence), 'almost_certain' (nearly complete validation), 'certain' (200% confidence - "
        "analysis is complete and all issues are identified with no need for external model validation). "
-        "Do NOT use 'certain' unless the pre-commit validation is thoroughly complete, use 'very_high' or 'almost_certain' instead if not 100% sure. "
+        "Do NOT use 'certain' unless the pre-commit validation is thoroughly complete, use 'very_high' or 'almost_certain' "
        "instead if not 200% sure. "
        "Using 'certain' means you have complete confidence locally and prevents external model validation. Also "
        "do NOT set confidence to 'certain' if the user has strongly requested that external validation MUST be performed."
    ),
@@ -584,6 +590,17 @@ class PrecommitTool(WorkflowTool):
        """
        Provide step-specific guidance for precommit workflow.
        """
        # Check if this is a continuation - if so, skip workflow and go to expert analysis
        continuation_id = self.get_request_continuation_id(request)
        if continuation_id:
            return {
                "next_steps": (
                    "Continuing previous conversation. The expert analysis will now be performed based on the "
                    "accumulated context from the previous conversation. The analysis will build upon the prior "
                    "findings without repeating the investigation steps."
                )
            }
        # Generate the next steps instruction based on required actions
        required_actions = self.get_required_actions(step_number, confidence, request.findings, request.total_steps)
--- a/tools/refactor.py
+++ b/tools/refactor.py
@@ -44,6 +44,9 @@ REFACTOR_FIELD_DESCRIPTIONS = {
        "structure, understand the business logic, and identify areas requiring refactoring. In all later steps, continue "
        "exploring with precision: trace dependencies, verify assumptions, and adapt your understanding as you uncover "
        "more refactoring opportunities."
        "IMPORTANT: When referring to code, use the relevant_files parameter to pass relevant files and only use the prompt to refer to "
        "function / method names or very small code snippets if absolutely necessary to explain the issue. Do NOT "
        "pass large code snippets in the prompt as this is exclusively reserved for descriptive text only. "
    ),
    "step_number": (
        "The index of the current step in the refactoring investigation sequence, beginning at 1. Each step should "
--- a/tools/workflow/base.py
+++ b/tools/workflow/base.py
@@ -390,6 +390,23 @@ class WorkflowTool(BaseTool, BaseWorkflowMixin):
        """Get status for skipped expert analysis. Override for tool-specific status."""
        return "skipped_by_tool_design"
    def is_continuation_workflow(self, request) -> bool:
        """
        Check if this is a continuation workflow that should skip multi-step investigation.
        When continuation_id is provided, the workflow typically continues from a previous
        conversation and should go directly to expert analysis rather than starting a new
        multi-step investigation.
        Args:
            request: The workflow request object
        Returns:
            True if this is a continuation that should skip multi-step workflow
        """
        continuation_id = self.get_request_continuation_id(request)
        return bool(continuation_id)
    # Abstract methods that must be implemented by specific workflow tools
    # (These are inherited from BaseWorkflowMixin and must be implemented)
--- a/tools/workflow/workflow_mixin.py
+++ b/tools/workflow/workflow_mixin.py
@@ -89,6 +89,11 @@ class BaseWorkflowMixin(ABC):
        """Return the system prompt for this tool. Usually provided by BaseTool."""
        pass
    @abstractmethod
    def get_language_instruction(self) -> str:
        """Return the language instruction for localization. Usually provided by BaseTool."""
        pass
    @abstractmethod
    def get_default_temperature(self) -> float:
        """Return the default temperature for this tool. Usually provided by BaseTool."""
@@ -107,9 +112,11 @@ class BaseWorkflowMixin(ABC):
    @abstractmethod
    def _prepare_file_content_for_prompt(
        self,
-        files: list[str],
+        request_files: list[str],
        continuation_id: Optional[str],
-        description: str,
+        context_description: str = "New files",
        max_tokens: Optional[int] = None,
        reserve_tokens: int = 1_000,
        remaining_budget: Optional[int] = None,
        arguments: Optional[dict[str, Any]] = None,
        model_context: Optional[Any] = None,
@@ -299,7 +306,7 @@ class BaseWorkflowMixin(ABC):
            f"MANDATORY: DO NOT call the {self.get_name()} tool again immediately. "
            f"You MUST first work using appropriate tools. "
            f"REQUIRED ACTIONS before calling {self.get_name()} step {next_step_number}:\n"
-            + "\n".join(f"{i+1}. {action}" for i, action in enumerate(required_actions))
+            + "\n".join(f"{i + 1}. {action}" for i, action in enumerate(required_actions))
            + f"\n\nOnly call {self.get_name()} again with step_number: {next_step_number} "
            f"AFTER completing this work."
        )
@@ -663,13 +670,13 @@ class BaseWorkflowMixin(ABC):
                self._current_model_name = None
                self._model_context = None
            # Handle continuation
            continuation_id = request.continuation_id
            # Adjust total steps if needed
            if request.step_number > request.total_steps:
                request.total_steps = request.step_number
            # Handle continuation
            continuation_id = request.continuation_id
            # Create thread for first step
            if not continuation_id and request.step_number == 1:
                clean_args = {k: v for k, v in arguments.items() if k not in ["_model_context", "_resolved_model_name"]}
@@ -818,8 +825,9 @@ class BaseWorkflowMixin(ABC):
        Default implementation provides generic response.
        """
        work_summary = self.prepare_work_summary()
        continuation_id = self.get_request_continuation_id(request)
-        return {
+        response_data = {
            "status": self.get_completion_status(),
            f"complete_{self.get_name()}": {
                "initial_request": self.get_initial_request(request.step),
@@ -839,6 +847,11 @@ class BaseWorkflowMixin(ABC):
            },
        }
        if continuation_id:
            response_data["continuation_id"] = continuation_id
        return response_data
    # ================================================================================
    # Inheritance Hook Methods - Replace hasattr/getattr Anti-patterns
    # ================================================================================
@@ -1447,8 +1460,10 @@ class BaseWorkflowMixin(ABC):
                if file_content:
                    expert_context = self._add_files_to_expert_context(expert_context, file_content)
-            # Get system prompt for this tool
+            # Get system prompt for this tool with localization support
-            system_prompt = self.get_system_prompt()
+            base_system_prompt = self.get_system_prompt()
            language_instruction = self.get_language_instruction()
            system_prompt = language_instruction + base_system_prompt
            # Check if tool wants system prompt embedded in main prompt
            if self.should_embed_system_prompt():
@@ -1547,36 +1562,21 @@ class BaseWorkflowMixin(ABC):
    # Default implementations for methods that workflow-based tools typically don't need
-    def prepare_prompt(self, request, continuation_id=None, max_tokens=None, reserve_tokens=0):
+    async def prepare_prompt(self, request) -> str:
        """
-        Base implementation for workflow tools.
+        Base implementation for workflow tools - compatible with BaseTool signature.
-        Allows subclasses to customize prompt preparation behavior by overriding
+        Workflow tools typically don't need to return a prompt since they handle
-        customize_prompt_preparation().
+        their own prompt preparation internally through the workflow execution.
        """
        # Allow subclasses to customize the prompt preparation
        self.customize_prompt_preparation(request, continuation_id, max_tokens, reserve_tokens)
        # Workflow tools typically don't need to return a prompt
        # since they handle their own prompt preparation internally
        return "", ""
    def customize_prompt_preparation(self, request, continuation_id=None, max_tokens=None, reserve_tokens=0):
        """
        Override this method in subclasses to customize prompt preparation.
        Base implementation does nothing - subclasses can extend this to add
        custom prompt preparation logic without the base class needing to
        know about specific tool capabilities.
        Args:
-            request: The request object (may have files, prompt, etc.)
+            request: The validated request object
-            continuation_id: Optional continuation ID
+
-            max_tokens: Optional max token limit
+        Returns:
-            reserve_tokens: Optional reserved token count
+            Empty string since workflow tools manage prompts internally
        """
-        # Base implementation does nothing - subclasses override as needed
+        # Workflow tools handle their prompts internally during workflow execution
-        return None
+        return ""
    def format_response(self, response: str, request, model_info=None):
        """