Merge branch 'main' into grok4-support

Beehive Innovations
2025-08-07 23:04:15 -07:00
committed by GitHub
52 changed files with 2400 additions and 637 deletions

View File

@@ -37,13 +37,13 @@ OPENROUTER_API_KEY=your_openrouter_api_key_here
# Optional: Default model to use
# Options: 'auto' (Claude picks best model), 'pro', 'flash', 'o3', 'o3-mini', 'o4-mini', 'o4-mini-high',
# 'grok', 'opus-4', 'sonnet-4', or any DIAL model if DIAL is configured
# 'gpt-5', 'gpt-5-mini', 'grok', 'opus-4', 'sonnet-4', or any DIAL model if DIAL is configured
# When set to 'auto', Claude will select the best model for each task
# Defaults to 'auto' if not specified
DEFAULT_MODEL=auto
# Optional: Default thinking mode for ThinkDeep tool
# NOTE: Only applies to models that support extended thinking (e.g., Gemini 2.5 Pro)
# NOTE: Only applies to models that support extended thinking (e.g., Gemini 2.5 Pro, GPT-5 models)
# Flash models (2.0) will use system prompt engineering instead
# Token consumption per mode:
# minimal: 128 tokens - Quick analysis, fastest response
@@ -65,6 +65,8 @@ DEFAULT_THINKING_MODE_THINKDEEP=high
# - o3-mini (200K context, balanced)
# - o4-mini (200K context, latest balanced, temperature=1.0 only)
# - o4-mini-high (200K context, enhanced reasoning, temperature=1.0 only)
# - gpt-5 (400K context, 128K output, reasoning tokens)
# - gpt-5-mini (400K context, 128K output, reasoning tokens)
# - mini (shorthand for o4-mini)
#
# Supported Google/Gemini models:

View File

@@ -14,9 +14,9 @@ import os
# These values are used in server responses and for tracking releases
# IMPORTANT: This is the single source of truth for version and author info
# Semantic versioning: MAJOR.MINOR.PATCH
__version__ = "5.8.2"
__version__ = "5.8.3"
# Last update date in ISO format
__updated__ = "2025-06-30"
__updated__ = "2025-08-08"
# Primary maintainer
__author__ = "Fahad Gilani"
@@ -75,10 +75,10 @@ DEFAULT_CONSENSUS_MAX_INSTANCES_PER_COMBINATION = 2
#
# IMPORTANT: This limit ONLY applies to the Claude CLI ↔ MCP Server transport boundary.
# It does NOT limit internal MCP Server operations like system prompts, file embeddings,
# conversation history, or content sent to external models (Gemini/O3/OpenRouter).
# conversation history, or content sent to external models (Gemini/OpenAI/OpenRouter).
#
# MCP Protocol Architecture:
# Claude CLI ←→ MCP Server ←→ External Model (Gemini/O3/etc.)
# Claude CLI ←→ MCP Server ←→ External Model (Gemini/OpenAI/etc.)
#        ↑                        ↑
#        │                        │
#   MCP transport          Internal processing

View File

@@ -115,6 +115,14 @@ Test isolated components and functions:
- **File handling**: Path validation, token limits, deduplication
- **Auto mode**: Model selection logic and fallback behavior
### HTTP Recording/Replay Tests (HTTP Transport Recorder)
Tests for expensive API calls (like o3-pro) use custom recording/replay:
- **Real API validation**: Tests against actual provider responses
- **Cost efficiency**: Record once, replay forever
- **Provider compatibility**: Validates fixes against real APIs
- Uses HTTP Transport Recorder for httpx-based API calls
- See [HTTP Recording/Replay Testing Guide](./vcr-testing.md) for details
### Simulator Tests
Validate real-world usage scenarios by simulating actual Claude prompts:
- **Basic conversations**: Multi-turn chat functionality with real prompts

docs/vcr-testing.md (new file, 128 lines)
View File

@@ -0,0 +1,128 @@
# HTTP Transport Recorder for Testing
A custom HTTP recorder for testing expensive API calls (like o3-pro) with real responses.
## Overview
The HTTP Transport Recorder captures and replays HTTP interactions at the transport layer, enabling:
- Cost-efficient testing of expensive APIs (record once, replay forever)
- Deterministic tests with real API responses
- Seamless integration with httpx and OpenAI SDK
- Automatic PII sanitization for secure recordings
## Quick Start
```python
from tests.transport_helpers import inject_transport

# Simple one-line setup with automatic transport injection
async def test_expensive_api_call(monkeypatch):
    inject_transport(monkeypatch, "tests/openai_cassettes/my_test.json")

    # Make API calls - automatically recorded/replayed with PII sanitization
    result = await chat_tool.execute({"prompt": "2+2?", "model": "o3-pro"})
```
## How It Works
1. **First run** (cassette doesn't exist): Records real API calls
2. **Subsequent runs** (cassette exists): Replays saved responses
3. **Re-record**: Delete cassette file and run again
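Conceptually, the mode selection is just a file-existence check; a minimal sketch (the real logic lives in `tests/http_transport_recorder.py`, and `ReplayTransport`'s constructor signature is assumed here):
```python
# Sketch only: auto-select record vs replay mode based on cassette existence.
from pathlib import Path

from tests.http_transport_recorder import RecordingTransport, ReplayTransport


def create_transport(cassette_path: str):
    if Path(cassette_path).exists():
        # Cassette found: serve saved responses, no API key required
        return ReplayTransport(cassette_path)
    # No cassette yet: call the real API and record the interaction
    return RecordingTransport(cassette_path)
```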
## Usage in Tests
The `transport_helpers.inject_transport()` function simplifies test setup:
```python
from tests.transport_helpers import inject_transport

async def test_with_recording(monkeypatch):
    # One-line setup - handles all transport injection complexity
    inject_transport(monkeypatch, "tests/openai_cassettes/my_test.json")

    # Use API normally - recording/replay happens transparently
    result = await chat_tool.execute({"prompt": "2+2?", "model": "o3-pro"})
```
For manual setup, see `test_o3_pro_output_text_fix.py`.
## Automatic PII Sanitization
All recordings are automatically sanitized to remove sensitive data:
- **API Keys & Tokens**: Bearer tokens, API keys, and auth headers
- **Personal Data**: Email addresses, IP addresses, phone numbers
- **URLs**: Sensitive query parameters and paths
- **Custom Patterns**: Add your own sanitization rules
Sanitization is enabled by default in `RecordingTransport`. To disable:
```python
transport = TransportFactory.create_transport(cassette_path, sanitize=False)
```
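The exact rule format is defined in `tests/pii_sanitizer.py`; as a purely hypothetical illustration, a custom rule boils down to a regex paired with a replacement:
```python
# Hypothetical custom rules, for illustration only - see tests/pii_sanitizer.py
# for the real pattern format used by PIISanitizer.
import re

CUSTOM_RULES = [
    (re.compile(r"sk-[A-Za-z0-9]{20,}"), "sk-REDACTED"),  # OpenAI-style API keys
    (re.compile(r"Bearer\s+\S+"), "Bearer REDACTED"),     # bearer auth headers
]


def apply_rules(text: str) -> str:
    for pattern, replacement in CUSTOM_RULES:
        text = pattern.sub(replacement, text)
    return text
```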
## File Structure
```
tests/
├── openai_cassettes/ # Recorded API interactions
│ └── *.json # Cassette files
├── http_transport_recorder.py # Transport implementation
├── pii_sanitizer.py # Automatic PII sanitization
├── transport_helpers.py # Simplified transport injection
├── sanitize_cassettes.py # Batch sanitization script
└── test_o3_pro_output_text_fix.py # Example usage
```
## Sanitizing Existing Cassettes
Use the `sanitize_cassettes.py` script to clean existing recordings:
```bash
# Sanitize all cassettes (creates backups)
python tests/sanitize_cassettes.py
# Sanitize specific cassette
python tests/sanitize_cassettes.py tests/openai_cassettes/my_test.json
# Skip backup creation
python tests/sanitize_cassettes.py --no-backup
```
The script will:
- Create timestamped backups of original files
- Apply comprehensive PII sanitization
- Preserve JSON structure and functionality
## Cost Management
- **One-time cost**: Initial recording only
- **Zero ongoing cost**: Replays are free
- **CI-friendly**: No API keys needed for replay
## Re-recording
When API changes require new recordings:
```bash
# Delete specific cassette
rm tests/openai_cassettes/my_test.json
# Run test with real API key
python -m pytest tests/test_o3_pro_output_text_fix.py
```
## Implementation Details
- **RecordingTransport**: Captures real HTTP calls with automatic PII sanitization
- **ReplayTransport**: Serves saved responses from cassettes
- **TransportFactory**: Auto-selects mode based on cassette existence
- **PIISanitizer**: Comprehensive sanitization of sensitive data (integrated by default)
**Security Note**: While recordings are automatically sanitized, always review new cassette files before committing. The sanitizer removes known patterns of sensitive data, but domain-specific secrets may need custom rules.
For implementation details, see:
- `tests/http_transport_recorder.py` - Core transport implementation
- `tests/pii_sanitizer.py` - Sanitization patterns and logic
- `tests/transport_helpers.py` - Simplified test integration
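As a rough mental model, replay needs a deterministic way to match an incoming request to a recorded entry. One plausible scheme (an assumption; the real key derivation is in `tests/http_transport_recorder.py`) hashes the request's method, URL, and body:
```python
# Assumed request-matching scheme, for illustration only.
import hashlib


def request_key(method: str, url: str, body: bytes) -> str:
    digest = hashlib.sha256()
    digest.update(method.encode())
    digest.update(url.encode())
    digest.update(body or b"")
    return digest.hexdigest()  # used to look up the matching cassette entry
```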

View File

@@ -4,7 +4,10 @@ import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Optional
from typing import TYPE_CHECKING, Any, Optional
if TYPE_CHECKING:
from tools.models import ToolModelCategory
logger = logging.getLogger(__name__)
@@ -118,10 +121,10 @@ def create_temperature_constraint(constraint_type: str) -> TemperatureConstraint
return FixedTemperatureConstraint(1.0)
elif constraint_type == "discrete":
# For models with specific allowed values - using common OpenAI values as default
return DiscreteTemperatureConstraint([0.0, 0.3, 0.7, 1.0, 1.5, 2.0], 0.7)
return DiscreteTemperatureConstraint([0.0, 0.3, 0.7, 1.0, 1.5, 2.0], 0.3)
else:
# Default range constraint (for "range" or None)
return RangeTemperatureConstraint(0.0, 2.0, 0.7)
return RangeTemperatureConstraint(0.0, 2.0, 0.3)
@dataclass
@@ -154,24 +157,11 @@ class ModelCapabilities:
# Custom model flag (for models that only work with custom endpoints)
is_custom: bool = False # Whether this model requires custom API endpoints
# Temperature constraint object - preferred way to define temperature limits
# Temperature constraint object - defines temperature limits and behavior
temperature_constraint: TemperatureConstraint = field(
default_factory=lambda: RangeTemperatureConstraint(0.0, 2.0, 0.7)
default_factory=lambda: RangeTemperatureConstraint(0.0, 2.0, 0.3)
)
# Backward compatibility property for existing code
@property
def temperature_range(self) -> tuple[float, float]:
"""Backward compatibility for existing code that uses temperature_range."""
if isinstance(self.temperature_constraint, RangeTemperatureConstraint):
return (self.temperature_constraint.min_temp, self.temperature_constraint.max_temp)
elif isinstance(self.temperature_constraint, FixedTemperatureConstraint):
return (self.temperature_constraint.value, self.temperature_constraint.value)
elif isinstance(self.temperature_constraint, DiscreteTemperatureConstraint):
values = self.temperature_constraint.allowed_values
return (min(values), max(values))
return (0.0, 2.0) # Fallback
@dataclass
class ModelResponse:
@@ -212,7 +202,7 @@ class ModelProvider(ABC):
prompt: str,
model_name: str,
system_prompt: Optional[str] = None,
temperature: float = 0.7,
temperature: float = 0.3,
max_output_tokens: Optional[int] = None,
**kwargs,
) -> ModelResponse:
@@ -268,18 +258,15 @@ class ModelProvider(ABC):
if not capabilities.supports_temperature:
return None
# Get temperature range
min_temp, max_temp = capabilities.temperature_range
# Use temperature constraint to get corrected value
corrected_temp = capabilities.temperature_constraint.get_corrected_value(requested_temperature)
# Clamp to valid range
if requested_temperature < min_temp:
logger.debug(f"Clamping temperature from {requested_temperature} to {min_temp} for model {model_name}")
return min_temp
elif requested_temperature > max_temp:
logger.debug(f"Clamping temperature from {requested_temperature} to {max_temp} for model {model_name}")
return max_temp
else:
return requested_temperature
if corrected_temp != requested_temperature:
logger.debug(
f"Adjusting temperature from {requested_temperature} to {corrected_temp} for model {model_name}"
)
return corrected_temp
except Exception as e:
logger.debug(f"Could not determine effective temperature for {model_name}: {e}")
@@ -294,10 +281,10 @@ class ModelProvider(ABC):
"""
capabilities = self.get_capabilities(model_name)
# Validate temperature
min_temp, max_temp = capabilities.temperature_range
if not min_temp <= temperature <= max_temp:
raise ValueError(f"Temperature {temperature} out of range [{min_temp}, {max_temp}] for model {model_name}")
# Validate temperature using constraint
if not capabilities.temperature_constraint.validate(temperature):
constraint_desc = capabilities.temperature_constraint.get_description()
raise ValueError(f"Temperature {temperature} is invalid for model {model_name}. {constraint_desc}")
@abstractmethod
def supports_thinking_mode(self, model_name: str) -> bool:
@@ -441,3 +428,28 @@ class ModelProvider(ABC):
"""
# Base implementation: no resources to clean up
return
def get_preferred_model(self, category: "ToolModelCategory", allowed_models: list[str]) -> Optional[str]:
"""Get the preferred model from this provider for a given category.
Args:
category: The tool category requiring a model
allowed_models: Pre-filtered list of model names that are allowed by restrictions
Returns:
Model name if this provider has a preference, None otherwise
"""
# Default implementation - providers can override with specific logic
return None
def get_model_registry(self) -> Optional[dict[str, Any]]:
"""Get the model registry for providers that maintain one.
This is a hook method for providers like CustomProvider that maintain
a dynamic model registry.
Returns:
Model registry dict or None if not applicable
"""
# Default implementation - most providers don't have a registry
return None
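A provider opts into category-aware selection by overriding this hook; a minimal sketch with a hypothetical provider (other abstract methods elided):
```python
# Hypothetical override, illustrating the hook's contract only.
from typing import Optional


class ExampleProvider(ModelProvider):
    def get_preferred_model(self, category, allowed_models: list[str]) -> Optional[str]:
        from tools.models import ToolModelCategory

        if not allowed_models:
            return None
        if category == ToolModelCategory.FAST_RESPONSE:
            # Prefer the cheapest/fastest model this provider offers, if allowed
            for candidate in ("example-mini", "example-base"):
                if candidate in allowed_models:
                    return candidate
        # No preference: the registry falls back to the first allowed model
        return None
```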

View File

@@ -236,7 +236,7 @@ class CustomProvider(OpenAICompatibleProvider):
prompt: str,
model_name: str,
system_prompt: Optional[str] = None,
temperature: float = 0.7,
temperature: float = 0.3,
max_output_tokens: Optional[int] = None,
**kwargs,
) -> ModelResponse:

View File

@@ -375,7 +375,7 @@ class DIALModelProvider(OpenAICompatibleProvider):
prompt: str,
model_name: str,
system_prompt: Optional[str] = None,
temperature: float = 0.7,
temperature: float = 0.3,
max_output_tokens: Optional[int] = None,
images: Optional[list[str]] = None,
**kwargs,

View File

@@ -4,7 +4,10 @@ import base64
import logging
import os
import time
from typing import Optional
from typing import TYPE_CHECKING, Optional
if TYPE_CHECKING:
from tools.models import ToolModelCategory
from google import genai
from google.genai import types
@@ -19,6 +22,25 @@ class GeminiModelProvider(ModelProvider):
# Model configurations using ModelCapabilities objects
SUPPORTED_MODELS = {
"gemini-2.5-pro": ModelCapabilities(
provider=ProviderType.GOOGLE,
model_name="gemini-2.5-pro",
friendly_name="Gemini (Pro 2.5)",
context_window=1_048_576, # 1M tokens
max_output_tokens=65_536,
supports_extended_thinking=True,
supports_system_prompts=True,
supports_streaming=True,
supports_function_calling=True,
supports_json_mode=True,
supports_images=True, # Vision capability
max_image_size_mb=32.0, # Higher limit for Pro model
supports_temperature=True,
temperature_constraint=create_temperature_constraint("range"),
max_thinking_tokens=32768, # Max thinking tokens for Pro model
description="Deep reasoning + thinking mode (1M context) - Complex problems, architecture, deep analysis",
aliases=["pro", "gemini pro", "gemini-pro"],
),
"gemini-2.0-flash": ModelCapabilities(
provider=ProviderType.GOOGLE,
model_name="gemini-2.0-flash",
@@ -75,25 +97,6 @@ class GeminiModelProvider(ModelProvider):
description="Ultra-fast (1M context) - Quick analysis, simple queries, rapid iterations",
aliases=["flash", "flash2.5"],
),
"gemini-2.5-pro": ModelCapabilities(
provider=ProviderType.GOOGLE,
model_name="gemini-2.5-pro",
friendly_name="Gemini (Pro 2.5)",
context_window=1_048_576, # 1M tokens
max_output_tokens=65_536,
supports_extended_thinking=True,
supports_system_prompts=True,
supports_streaming=True,
supports_function_calling=True,
supports_json_mode=True,
supports_images=True, # Vision capability
max_image_size_mb=32.0, # Higher limit for Pro model
supports_temperature=True,
temperature_constraint=create_temperature_constraint("range"),
max_thinking_tokens=32768, # Max thinking tokens for Pro model
description="Deep reasoning + thinking mode (1M context) - Complex problems, architecture, deep analysis",
aliases=["pro", "gemini pro", "gemini-pro"],
),
}
# Thinking mode configurations - percentages of model's max_thinking_tokens
@@ -152,7 +155,7 @@ class GeminiModelProvider(ModelProvider):
prompt: str,
model_name: str,
system_prompt: Optional[str] = None,
temperature: float = 0.7,
temperature: float = 0.3,
max_output_tokens: Optional[int] = None,
thinking_mode: str = "medium",
images: Optional[list[str]] = None,
@@ -465,3 +468,67 @@ class GeminiModelProvider(ModelProvider):
except Exception as e:
logger.error(f"Error processing image {image_path}: {e}")
return None
def get_preferred_model(self, category: "ToolModelCategory", allowed_models: list[str]) -> Optional[str]:
"""Get Gemini's preferred model for a given category from allowed models.
Args:
category: The tool category requiring a model
allowed_models: Pre-filtered list of models allowed by restrictions
Returns:
Preferred model name or None
"""
from tools.models import ToolModelCategory
if not allowed_models:
return None
# Helper to find best model from candidates
def find_best(candidates: list[str]) -> Optional[str]:
"""Return best model from candidates (sorted for consistency)."""
return sorted(candidates, reverse=True)[0] if candidates else None
if category == ToolModelCategory.EXTENDED_REASONING:
# For extended reasoning, prefer models with thinking support
# First try Pro models that support thinking
pro_thinking = [
m
for m in allowed_models
if "pro" in m and m in self.SUPPORTED_MODELS and self.SUPPORTED_MODELS[m].supports_extended_thinking
]
if pro_thinking:
return find_best(pro_thinking)
# Then any model that supports thinking
any_thinking = [
m
for m in allowed_models
if m in self.SUPPORTED_MODELS and self.SUPPORTED_MODELS[m].supports_extended_thinking
]
if any_thinking:
return find_best(any_thinking)
# Finally, just prefer Pro models even without thinking
pro_models = [m for m in allowed_models if "pro" in m]
if pro_models:
return find_best(pro_models)
elif category == ToolModelCategory.FAST_RESPONSE:
# Prefer Flash models for speed
flash_models = [m for m in allowed_models if "flash" in m]
if flash_models:
return find_best(flash_models)
# Default for BALANCED or as fallback
# Prefer Flash for balanced use, then Pro, then anything
flash_models = [m for m in allowed_models if "flash" in m]
if flash_models:
return find_best(flash_models)
pro_models = [m for m in allowed_models if "pro" in m]
if pro_models:
return find_best(pro_models)
# Ultimate fallback to best available model
return find_best(allowed_models)
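Given the logic above, selection is deterministic for a given allowed list; for example (constructor signature assumed):
```python
from tools.models import ToolModelCategory

provider = GeminiModelProvider(api_key="...")  # api_key argument assumed
allowed = ["gemini-2.5-flash", "gemini-2.5-pro"]

provider.get_preferred_model(ToolModelCategory.EXTENDED_REASONING, allowed)
# -> "gemini-2.5-pro" (Pro supports extended thinking)
provider.get_preferred_model(ToolModelCategory.FAST_RESPONSE, allowed)
# -> "gemini-2.5-flash"
```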

View File

@@ -1,6 +1,7 @@
"""Base class for OpenAI-compatible API providers."""
import base64
import copy
import ipaddress
import logging
import os
@@ -220,6 +221,16 @@ class OpenAICompatibleProvider(ModelProvider):
# Create httpx client with minimal config to avoid proxy conflicts
# Note: proxies parameter was removed in httpx 0.28.0
# Check for test transport injection
if hasattr(self, "_test_transport"):
# Use custom transport for testing (HTTP recording/replay)
http_client = httpx.Client(
transport=self._test_transport,
timeout=timeout_config,
follow_redirects=True,
)
else:
# Normal production client
http_client = httpx.Client(
timeout=timeout_config,
follow_redirects=True,
@@ -264,6 +275,63 @@ class OpenAICompatibleProvider(ModelProvider):
return self._client
def _sanitize_for_logging(self, params: dict) -> dict:
"""Sanitize sensitive data from parameters before logging.
Args:
params: Dictionary of API parameters
Returns:
dict: Sanitized copy of parameters safe for logging
"""
sanitized = copy.deepcopy(params)
# Sanitize messages content
if "input" in sanitized:
for msg in sanitized.get("input", []):
if isinstance(msg, dict) and "content" in msg:
for content_item in msg.get("content", []):
if isinstance(content_item, dict) and "text" in content_item:
# Truncate long text and add ellipsis
text = content_item["text"]
if len(text) > 100:
content_item["text"] = text[:100] + "... [truncated]"
# Remove any API keys that might be in headers/auth
sanitized.pop("api_key", None)
sanitized.pop("authorization", None)
return sanitized
def _safe_extract_output_text(self, response) -> str:
"""Safely extract output_text from o3-pro response with validation.
Args:
response: Response object from OpenAI SDK
Returns:
str: The output text content
Raises:
ValueError: If output_text is missing, None, or not a string
"""
logging.debug(f"Response object type: {type(response)}")
logging.debug(f"Response attributes: {dir(response)}")
if not hasattr(response, "output_text"):
raise ValueError(f"o3-pro response missing output_text field. Response type: {type(response).__name__}")
content = response.output_text
logging.debug(f"Extracted output_text: '{content}' (type: {type(content)})")
if content is None:
raise ValueError("o3-pro returned None for output_text")
if not isinstance(content, str):
raise ValueError(f"o3-pro output_text is not a string. Got type: {type(content).__name__}")
return content
def _generate_with_responses_endpoint(
self,
model_name: str,
@@ -309,30 +377,23 @@ class OpenAICompatibleProvider(ModelProvider):
max_retries = 4
retry_delays = [1, 3, 5, 8]
last_exception = None
actual_attempts = 0
for attempt in range(max_retries):
try: # Log the exact payload being sent for debugging
try: # Log sanitized payload for debugging
import json
sanitized_params = self._sanitize_for_logging(completion_params)
logging.info(
f"o3-pro API request payload: {json.dumps(completion_params, indent=2, ensure_ascii=False)}"
f"o3-pro API request (sanitized): {json.dumps(sanitized_params, indent=2, ensure_ascii=False)}"
)
# Use OpenAI client's responses endpoint
response = self.client.responses.create(**completion_params)
# Extract content and usage from responses endpoint format
# The response format is different for responses endpoint
content = ""
if hasattr(response, "output") and response.output:
if hasattr(response.output, "content") and response.output.content:
# Look for output_text in content
for content_item in response.output.content:
if hasattr(content_item, "type") and content_item.type == "output_text":
content = content_item.text
break
elif hasattr(response.output, "text"):
content = response.output.text
# Extract content from responses endpoint format
# Use validation helper to safely extract output_text
content = self._safe_extract_output_text(response)
# Try to extract usage information
usage = None
@@ -371,14 +432,13 @@ class OpenAICompatibleProvider(ModelProvider):
if is_retryable and attempt < max_retries - 1:
delay = retry_delays[attempt]
logging.warning(
f"Retryable error for o3-pro responses endpoint, attempt {attempt + 1}/{max_retries}: {str(e)}. Retrying in {delay}s..."
f"Retryable error for o3-pro responses endpoint, attempt {actual_attempts}/{max_retries}: {str(e)}. Retrying in {delay}s..."
)
time.sleep(delay)
else:
break
# If we get here, all retries failed
actual_attempts = attempt + 1 # Convert from 0-based index to human-readable count
error_msg = f"o3-pro responses endpoint error after {actual_attempts} attempt{'s' if actual_attempts > 1 else ''}: {str(last_exception)}"
logging.error(error_msg)
raise RuntimeError(error_msg) from last_exception
@@ -388,7 +448,7 @@ class OpenAICompatibleProvider(ModelProvider):
prompt: str,
model_name: str,
system_prompt: Optional[str] = None,
temperature: float = 0.7,
temperature: float = 0.3,
max_output_tokens: Optional[int] = None,
images: Optional[list[str]] = None,
**kwargs,
@@ -481,7 +541,7 @@ class OpenAICompatibleProvider(ModelProvider):
completion_params[key] = value
# Check if this is o3-pro and needs the responses endpoint
if resolved_model == "o3-pro-2025-06-10":
if resolved_model == "o3-pro":
# This model requires the /v1/responses endpoint
# If it fails, we should not fall back to chat/completions
return self._generate_with_responses_endpoint(
@@ -497,8 +557,10 @@ class OpenAICompatibleProvider(ModelProvider):
retry_delays = [1, 3, 5, 8] # Progressive delays: 1s, 3s, 5s, 8s
last_exception = None
actual_attempts = 0
for attempt in range(max_retries):
actual_attempts = attempt + 1 # Convert from 0-based index to human-readable count
try:
# Generate completion
response = self.client.chat.completions.create(**completion_params)
@@ -536,12 +598,11 @@ class OpenAICompatibleProvider(ModelProvider):
# Log retry attempt
logging.warning(
f"{self.FRIENDLY_NAME} error for model {model_name}, attempt {attempt + 1}/{max_retries}: {str(e)}. Retrying in {delay}s..."
f"{self.FRIENDLY_NAME} error for model {model_name}, attempt {actual_attempts}/{max_retries}: {str(e)}. Retrying in {delay}s..."
)
time.sleep(delay)
# If we get here, all retries failed
actual_attempts = attempt + 1 # Convert from 0-based index to human-readable count
error_msg = f"{self.FRIENDLY_NAME} API error for model {model_name} after {actual_attempts} attempt{'s' if actual_attempts > 1 else ''}: {str(last_exception)}"
logging.error(error_msg)
raise RuntimeError(error_msg) from last_exception
@@ -576,11 +637,7 @@ class OpenAICompatibleProvider(ModelProvider):
try:
encoding = tiktoken.encoding_for_model(model_name)
except KeyError:
# Try common encodings based on model patterns
if "gpt-4" in model_name or "gpt-3.5" in model_name:
encoding = tiktoken.get_encoding("cl100k_base")
else:
encoding = tiktoken.get_encoding("cl100k_base") # Default
return len(encoding.encode(text))
@@ -679,11 +736,13 @@ class OpenAICompatibleProvider(ModelProvider):
"""
# Common vision-capable models - only include models that actually support images
vision_models = {
"gpt-5",
"gpt-5-mini",
"gpt-4o",
"gpt-4o-mini",
"gpt-4-turbo",
"gpt-4-vision-preview",
"gpt-4.1-2025-04-14", # GPT-4.1 supports vision
"gpt-4.1-2025-04-14",
"o3",
"o3-mini",
"o3-pro",

View File

@@ -1,7 +1,10 @@
"""OpenAI model provider implementation."""
import logging
from typing import Optional
from typing import TYPE_CHECKING, Optional
if TYPE_CHECKING:
from tools.models import ToolModelCategory
from .base import (
ModelCapabilities,
@@ -19,6 +22,60 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
# Model configurations using ModelCapabilities objects
SUPPORTED_MODELS = {
"gpt-5": ModelCapabilities(
provider=ProviderType.OPENAI,
model_name="gpt-5",
friendly_name="OpenAI (GPT-5)",
context_window=400_000, # 400K tokens
max_output_tokens=128_000, # 128K max output tokens
supports_extended_thinking=True, # Supports reasoning tokens
supports_system_prompts=True,
supports_streaming=True,
supports_function_calling=True,
supports_json_mode=True,
supports_images=True, # GPT-5 supports vision
max_image_size_mb=20.0, # 20MB per OpenAI docs
supports_temperature=True, # Regular models accept temperature parameter
temperature_constraint=create_temperature_constraint("fixed"),
description="GPT-5 (400K context, 128K output) - Advanced model with reasoning support",
aliases=["gpt5", "gpt-5"],
),
"gpt-5-mini": ModelCapabilities(
provider=ProviderType.OPENAI,
model_name="gpt-5-mini",
friendly_name="OpenAI (GPT-5-mini)",
context_window=400_000, # 400K tokens
max_output_tokens=128_000, # 128K max output tokens
supports_extended_thinking=True, # Supports reasoning tokens
supports_system_prompts=True,
supports_streaming=True,
supports_function_calling=True,
supports_json_mode=True,
supports_images=True, # GPT-5-mini supports vision
max_image_size_mb=20.0, # 20MB per OpenAI docs
supports_temperature=True,
temperature_constraint=create_temperature_constraint("fixed"),
description="GPT-5-mini (400K context, 128K output) - Efficient variant with reasoning support",
aliases=["gpt5-mini", "gpt5mini", "mini"],
),
"gpt-5-nano": ModelCapabilities(
provider=ProviderType.OPENAI,
model_name="gpt-5-nano",
friendly_name="OpenAI (GPT-5 nano)",
context_window=400_000,
max_output_tokens=128_000,
supports_extended_thinking=True,
supports_system_prompts=True,
supports_streaming=True,
supports_function_calling=True,
supports_json_mode=True,
supports_images=True,
max_image_size_mb=20.0,
supports_temperature=True,
temperature_constraint=create_temperature_constraint("fixed"),
description="GPT-5 nano (400K context) - Fastest, cheapest version of GPT-5 for summarization and classification tasks",
aliases=["gpt5nano", "gpt5-nano", "nano"],
),
"o3": ModelCapabilities(
provider=ProviderType.OPENAI,
model_name="o3",
@@ -55,9 +112,9 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
description="Fast O3 variant (200K context) - Balanced performance/speed, moderate complexity",
aliases=["o3mini", "o3-mini"],
),
"o3-pro-2025-06-10": ModelCapabilities(
"o3-pro": ModelCapabilities(
provider=ProviderType.OPENAI,
model_name="o3-pro-2025-06-10",
model_name="o3-pro",
friendly_name="OpenAI (O3-Pro)",
context_window=200_000, # 200K tokens
max_output_tokens=65536, # 64K max output tokens
@@ -89,11 +146,11 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
supports_temperature=False, # O4 models don't accept temperature parameter
temperature_constraint=create_temperature_constraint("fixed"),
description="Latest reasoning model (200K context) - Optimized for shorter contexts, rapid reasoning",
aliases=["mini", "o4mini", "o4-mini"],
aliases=["o4mini", "o4-mini"],
),
"gpt-4.1-2025-04-14": ModelCapabilities(
"gpt-4.1": ModelCapabilities(
provider=ProviderType.OPENAI,
model_name="gpt-4.1-2025-04-14",
model_name="gpt-4.1",
friendly_name="OpenAI (GPT 4.1)",
context_window=1_000_000, # 1M tokens
max_output_tokens=32_768,
@@ -107,7 +164,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
supports_temperature=True, # Regular models accept temperature parameter
temperature_constraint=create_temperature_constraint("range"),
description="GPT-4.1 (1M context) - Advanced reasoning model with large context window",
aliases=["gpt4.1"],
aliases=["gpt4.1", "gpt-4.1"],
),
}
@@ -119,22 +176,42 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
def get_capabilities(self, model_name: str) -> ModelCapabilities:
"""Get capabilities for a specific OpenAI model."""
# Resolve shorthand
# First check if it's a key in SUPPORTED_MODELS
if model_name in self.SUPPORTED_MODELS:
# Check if model is allowed by restrictions
from utils.model_restrictions import get_restriction_service
restriction_service = get_restriction_service()
if not restriction_service.is_allowed(ProviderType.OPENAI, model_name, model_name):
raise ValueError(f"OpenAI model '{model_name}' is not allowed by restriction policy.")
return self.SUPPORTED_MODELS[model_name]
# Try resolving as alias
resolved_name = self._resolve_model_name(model_name)
if resolved_name not in self.SUPPORTED_MODELS:
raise ValueError(f"Unsupported OpenAI model: {model_name}")
# Check if resolved name is a key
if resolved_name in self.SUPPORTED_MODELS:
# Check if model is allowed by restrictions
from utils.model_restrictions import get_restriction_service
restriction_service = get_restriction_service()
if not restriction_service.is_allowed(ProviderType.OPENAI, resolved_name, model_name):
raise ValueError(f"OpenAI model '{model_name}' is not allowed by restriction policy.")
# Return the ModelCapabilities object directly from SUPPORTED_MODELS
return self.SUPPORTED_MODELS[resolved_name]
# Finally check if resolved name matches any API model name
for key, capabilities in self.SUPPORTED_MODELS.items():
if resolved_name == capabilities.model_name:
# Check if model is allowed by restrictions
from utils.model_restrictions import get_restriction_service
restriction_service = get_restriction_service()
if not restriction_service.is_allowed(ProviderType.OPENAI, key, model_name):
raise ValueError(f"OpenAI model '{model_name}' is not allowed by restriction policy.")
return capabilities
raise ValueError(f"Unsupported OpenAI model: {model_name}")
def get_provider_type(self) -> ProviderType:
"""Get the provider type."""
return ProviderType.OPENAI
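With this three-step lookup, canonical keys, aliases, and full API model names all resolve to the same capabilities object; for example (constructor signature assumed):
```python
provider = OpenAIModelProvider(api_key="...")  # api_key argument assumed

provider.get_capabilities("gpt5").model_name    # "gpt-5" (alias)
provider.get_capabilities("gpt-5").model_name   # "gpt-5" (canonical key)
provider.get_capabilities("gpt4.1").model_name  # "gpt-4.1" (alias)
provider.get_capabilities("unknown-model")      # raises ValueError
```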
@@ -162,7 +239,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
prompt: str,
model_name: str,
system_prompt: Optional[str] = None,
temperature: float = 0.7,
temperature: float = 0.3,
max_output_tokens: Optional[int] = None,
**kwargs,
) -> ModelResponse:
@@ -182,6 +259,47 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
def supports_thinking_mode(self, model_name: str) -> bool:
"""Check if the model supports extended thinking mode."""
# Currently no OpenAI models support extended thinking
# This may change with future O3 models
# GPT-5 models support reasoning tokens (extended thinking)
resolved_name = self._resolve_model_name(model_name)
if resolved_name in ["gpt-5", "gpt-5-mini"]:
return True
# O3 models don't support extended thinking yet
return False
def get_preferred_model(self, category: "ToolModelCategory", allowed_models: list[str]) -> Optional[str]:
"""Get OpenAI's preferred model for a given category from allowed models.
Args:
category: The tool category requiring a model
allowed_models: Pre-filtered list of models allowed by restrictions
Returns:
Preferred model name or None
"""
from tools.models import ToolModelCategory
if not allowed_models:
return None
# Helper to find first available from preference list
def find_first(preferences: list[str]) -> Optional[str]:
"""Return first available model from preference list."""
for model in preferences:
if model in allowed_models:
return model
return None
if category == ToolModelCategory.EXTENDED_REASONING:
# Prefer models with extended thinking support
preferred = find_first(["o3", "o3-pro", "gpt-5"])
return preferred if preferred else allowed_models[0]
elif category == ToolModelCategory.FAST_RESPONSE:
# Prefer fast, cost-efficient models
preferred = find_first(["gpt-5", "gpt-5-mini", "o4-mini", "o3-mini"])
return preferred if preferred else allowed_models[0]
else: # BALANCED or default
# Prefer balanced performance/cost models
preferred = find_first(["gpt-5", "gpt-5-mini", "o4-mini", "o3-mini"])
return preferred if preferred else allowed_models[0]

View File

@@ -158,7 +158,7 @@ class OpenRouterProvider(OpenAICompatibleProvider):
prompt: str,
model_name: str,
system_prompt: Optional[str] = None,
temperature: float = 0.7,
temperature: float = 0.3,
max_output_tokens: Optional[int] = None,
**kwargs,
) -> ModelResponse:

View File

@@ -15,6 +15,17 @@ class ModelProviderRegistry:
_instance = None
# Provider priority order for model selection
# Native APIs first, then custom endpoints, then catch-all providers
PROVIDER_PRIORITY_ORDER = [
ProviderType.GOOGLE, # Direct Gemini access
ProviderType.OPENAI, # Direct OpenAI access
ProviderType.XAI, # Direct X.AI GROK access
ProviderType.DIAL, # DIAL unified API access
ProviderType.CUSTOM, # Local/self-hosted models
ProviderType.OPENROUTER, # Catch-all for cloud models
]
def __new__(cls):
"""Singleton pattern for registry."""
if cls._instance is None:
@@ -103,30 +114,19 @@ class ModelProviderRegistry:
3. OPENROUTER - Catch-all for cloud models via unified API
Args:
model_name: Name of the model (e.g., "gemini-2.5-flash", "o3-mini")
model_name: Name of the model (e.g., "gemini-2.5-flash", "gpt5")
Returns:
ModelProvider instance that supports this model
"""
logging.debug(f"get_provider_for_model called with model_name='{model_name}'")
# Define explicit provider priority order
# Native APIs first, then custom endpoints, then catch-all providers
PROVIDER_PRIORITY_ORDER = [
ProviderType.GOOGLE, # Direct Gemini access
ProviderType.OPENAI, # Direct OpenAI access
ProviderType.XAI, # Direct X.AI GROK access
ProviderType.DIAL, # DIAL unified API access
ProviderType.CUSTOM, # Local/self-hosted models
ProviderType.OPENROUTER, # Catch-all for cloud models
]
# Check providers in priority order
instance = cls()
logging.debug(f"Registry instance: {instance}")
logging.debug(f"Available providers in registry: {list(instance._providers.keys())}")
for provider_type in PROVIDER_PRIORITY_ORDER:
for provider_type in cls.PROVIDER_PRIORITY_ORDER:
if provider_type in instance._providers:
logging.debug(f"Found {provider_type} in registry")
# Get or create provider instance
@@ -244,14 +244,49 @@ class ModelProviderRegistry:
return os.getenv(env_var)
@classmethod
def _get_allowed_models_for_provider(cls, provider: ModelProvider, provider_type: ProviderType) -> list[str]:
"""Get a list of allowed canonical model names for a given provider.
Args:
provider: The provider instance to get models for
provider_type: The provider type for restriction checking
Returns:
List of model names that are both supported and allowed
"""
from utils.model_restrictions import get_restriction_service
restriction_service = get_restriction_service()
allowed_models = []
# Get the provider's supported models
try:
# Use list_models to get all supported models (handles both regular and custom providers)
supported_models = provider.list_models(respect_restrictions=False)
except (NotImplementedError, AttributeError):
# Fallback to SUPPORTED_MODELS if list_models not implemented
try:
supported_models = list(provider.SUPPORTED_MODELS.keys())
except AttributeError:
supported_models = []
# Filter by restrictions
for model_name in supported_models:
if restriction_service.is_allowed(provider_type, model_name):
allowed_models.append(model_name)
return allowed_models
@classmethod
def get_preferred_fallback_model(cls, tool_category: Optional["ToolModelCategory"] = None) -> str:
"""Get the preferred fallback model based on available API keys and tool category.
"""Get the preferred fallback model based on provider priority and tool category.
This method checks which providers have valid API keys and returns
a sensible default model for auto mode fallback situations.
Takes into account model restrictions when selecting fallback models.
This method orchestrates model selection by:
1. Getting allowed models for each provider (respecting restrictions)
2. Asking providers for their preference from the allowed list
3. Falling back to first available model if no preference given
Args:
tool_category: Optional category to influence model selection
@@ -259,167 +294,42 @@ class ModelProviderRegistry:
Returns:
Model name string for fallback use
"""
# Import here to avoid circular import
from tools.models import ToolModelCategory
# Get available models respecting restrictions
available_models = cls.get_available_models(respect_restrictions=True)
effective_category = tool_category or ToolModelCategory.BALANCED
first_available_model = None
# Group by provider
openai_models = [m for m, p in available_models.items() if p == ProviderType.OPENAI]
gemini_models = [m for m, p in available_models.items() if p == ProviderType.GOOGLE]
xai_models = [m for m, p in available_models.items() if p == ProviderType.XAI]
openrouter_models = [m for m, p in available_models.items() if p == ProviderType.OPENROUTER]
custom_models = [m for m, p in available_models.items() if p == ProviderType.CUSTOM]
# Ask each provider for their preference in priority order
for provider_type in cls.PROVIDER_PRIORITY_ORDER:
provider = cls.get_provider(provider_type)
if provider:
# 1. Registry filters the models first
allowed_models = cls._get_allowed_models_for_provider(provider, provider_type)
openai_available = bool(openai_models)
gemini_available = bool(gemini_models)
xai_available = bool(xai_models)
openrouter_available = bool(openrouter_models)
custom_available = bool(custom_models)
if tool_category == ToolModelCategory.EXTENDED_REASONING:
# Prefer thinking-capable models for deep reasoning tools
if openai_available and "o3" in openai_models:
return "o3" # O3 for deep reasoning
elif openai_available and openai_models:
# Fall back to any available OpenAI model
return openai_models[0]
elif xai_available and "grok-3" in xai_models:
return "grok-3" # GROK-3 for deep reasoning
elif xai_available and xai_models:
# Fall back to any available XAI model
return xai_models[0]
elif gemini_available and any("pro" in m for m in gemini_models):
# Find the pro model (handles full names)
return next(m for m in gemini_models if "pro" in m)
elif gemini_available and gemini_models:
# Fall back to any available Gemini model
return gemini_models[0]
elif openrouter_available:
# Try to find thinking-capable model from openrouter
thinking_model = cls._find_extended_thinking_model()
if thinking_model:
return thinking_model
# Fallback to first available OpenRouter model
return openrouter_models[0]
elif custom_available:
# Fallback to custom models when available
return custom_models[0]
else:
# Fallback to pro if nothing found
return "gemini-2.5-pro"
elif tool_category == ToolModelCategory.FAST_RESPONSE:
# Prefer fast, cost-efficient models
if openai_available and "o4-mini" in openai_models:
return "o4-mini" # Latest, fast and efficient
elif openai_available and "o3-mini" in openai_models:
return "o3-mini" # Second choice
elif openai_available and openai_models:
# Fall back to any available OpenAI model
return openai_models[0]
elif xai_available and "grok-3-fast" in xai_models:
return "grok-3-fast" # GROK-3 Fast for speed
elif xai_available and xai_models:
# Fall back to any available XAI model
return xai_models[0]
elif gemini_available and any("flash" in m for m in gemini_models):
# Find the flash model (handles full names)
# Prefer 2.5 over 2.0 for backward compatibility
flash_models = [m for m in gemini_models if "flash" in m]
# Sort to ensure 2.5 comes before 2.0
flash_models_sorted = sorted(flash_models, reverse=True)
return flash_models_sorted[0]
elif gemini_available and gemini_models:
# Fall back to any available Gemini model
return gemini_models[0]
elif openrouter_available:
# Fallback to first available OpenRouter model
return openrouter_models[0]
elif custom_available:
# Fallback to custom models when available
return custom_models[0]
else:
# Default to flash
return "gemini-2.5-flash"
# BALANCED or no category specified - use existing balanced logic
if openai_available and "o4-mini" in openai_models:
return "o4-mini" # Latest balanced performance/cost
elif openai_available and "o3-mini" in openai_models:
return "o3-mini" # Second choice
elif openai_available and openai_models:
return openai_models[0]
elif xai_available and "grok-3" in xai_models:
return "grok-3" # GROK-3 as balanced choice
elif xai_available and xai_models:
return xai_models[0]
elif gemini_available and any("flash" in m for m in gemini_models):
# Prefer 2.5 over 2.0 for backward compatibility
flash_models = [m for m in gemini_models if "flash" in m]
flash_models_sorted = sorted(flash_models, reverse=True)
return flash_models_sorted[0]
elif gemini_available and gemini_models:
return gemini_models[0]
elif openrouter_available:
return openrouter_models[0]
elif custom_available:
# Fallback to custom models when available
return custom_models[0]
else:
# No models available due to restrictions - check if any providers exist
if not available_models:
# This might happen if all models are restricted
logging.warning("No models available due to restrictions")
# Return a reasonable default for backward compatibility
return "gemini-2.5-flash"
@classmethod
def _find_extended_thinking_model(cls) -> Optional[str]:
"""Find a model suitable for extended reasoning from custom/openrouter providers.
Returns:
Model name if found, None otherwise
"""
# Check custom provider first
custom_provider = cls.get_provider(ProviderType.CUSTOM)
if custom_provider:
# Check if it's a CustomModelProvider and has thinking models
try:
from providers.custom import CustomProvider
if isinstance(custom_provider, CustomProvider) and hasattr(custom_provider, "model_registry"):
for model_name, config in custom_provider.model_registry.items():
if config.get("supports_extended_thinking", False):
return model_name
except ImportError:
pass
# Then check OpenRouter for high-context/powerful models
openrouter_provider = cls.get_provider(ProviderType.OPENROUTER)
if openrouter_provider:
# Prefer models known for deep reasoning
preferred_models = [
"anthropic/claude-sonnet-4",
"anthropic/claude-opus-4",
"google/gemini-2.5-pro",
"google/gemini-pro-1.5",
"meta-llama/llama-3.1-70b-instruct",
"mistralai/mixtral-8x7b-instruct",
]
for model in preferred_models:
try:
if openrouter_provider.validate_model_name(model):
return model
except Exception as e:
# Log the error for debugging purposes but continue searching
import logging
logging.warning(f"Model validation for '{model}' on OpenRouter failed: {e}")
if not allowed_models:
continue
return None
# 2. Keep track of the first available model as fallback
if not first_available_model:
first_available_model = sorted(allowed_models)[0]
# 3. Ask provider to pick from allowed list
preferred_model = provider.get_preferred_model(effective_category, allowed_models)
if preferred_model:
logging.debug(
f"Provider {provider_type.value} selected '{preferred_model}' for category '{effective_category.value}'"
)
return preferred_model
# If no provider returned a preference, use first available model
if first_available_model:
logging.debug(f"No provider preference, using first available: {first_available_model}")
return first_available_model
# Ultimate fallback if no providers have models
logging.warning("No models available from any provider, using default fallback")
return "gemini-2.5-flash"
@classmethod
def get_available_providers_with_keys(cls) -> list[ProviderType]:
@@ -441,6 +351,17 @@ class ModelProviderRegistry:
instance = cls()
instance._initialized_providers.clear()
@classmethod
def reset_for_testing(cls) -> None:
"""Reset the registry to a clean state for testing.
This provides a safe, public API for tests to clean up registry state
without directly manipulating private attributes.
"""
cls._instance = None
if hasattr(cls, "_providers"):
cls._providers = {}
@classmethod
def unregister_provider(cls, provider_type: ProviderType) -> None:
"""Unregister a provider (mainly for testing)."""

View File

@@ -1,7 +1,10 @@
"""X.AI (GROK) model provider implementation."""
import logging
from typing import Optional
from typing import TYPE_CHECKING, Optional
if TYPE_CHECKING:
from tools.models import ToolModelCategory
from .base import (
ModelCapabilities,
@@ -21,23 +24,23 @@ class XAIModelProvider(OpenAICompatibleProvider):
# Model configurations using ModelCapabilities objects
SUPPORTED_MODELS = {
"grok-4-0709": ModelCapabilities(
"grok-4": ModelCapabilities(
provider=ProviderType.XAI,
model_name="grok-4-0709",
model_name="grok-4",
friendly_name="X.AI (Grok 4)",
context_window=256_000, # 256K tokens
max_output_tokens=16_384,
supports_extended_thinking=True, # Supports reasoning mode
max_output_tokens=256_000, # 256K tokens max output
supports_extended_thinking=True, # Grok-4 supports reasoning mode
supports_system_prompts=True,
supports_streaming=True,
supports_function_calling=True,
supports_json_mode=True, # Supports structured outputs
supports_images=True, # Supports vision/image analysis
max_image_size_mb=20.0, # Assuming standard limit
supports_function_calling=True, # Function calling supported
supports_json_mode=True, # Structured outputs supported
supports_images=True, # Multimodal capabilities
max_image_size_mb=20.0, # Standard image size limit
supports_temperature=True,
temperature_constraint=create_temperature_constraint("range"),
description="GROK-4 (256K context) - Latest flagship model with reasoning, vision, and structured outputs",
aliases=["grok-4", "grok-4-latest", "grok4", "grok"],
description="GROK-4 (256K context) - Frontier multimodal reasoning model with advanced capabilities",
aliases=["grok", "grok4", "grok-4"],
),
"grok-3": ModelCapabilities(
provider=ProviderType.XAI,
@@ -128,7 +131,7 @@ class XAIModelProvider(OpenAICompatibleProvider):
prompt: str,
model_name: str,
system_prompt: Optional[str] = None,
temperature: float = 0.7,
temperature: float = 0.3,
max_output_tokens: Optional[int] = None,
**kwargs,
) -> ModelResponse:
@@ -148,10 +151,52 @@ class XAIModelProvider(OpenAICompatibleProvider):
def supports_thinking_mode(self, model_name: str) -> bool:
"""Check if the model supports extended thinking mode."""
# Check capabilities to determine thinking mode support
try:
capabilities = self.get_capabilities(model_name)
resolved_name = self._resolve_model_name(model_name)
capabilities = self.SUPPORTED_MODELS.get(resolved_name)
if capabilities:
return capabilities.supports_extended_thinking
except ValueError:
# If the model is not supported, it doesn't support thinking mode.
return False
def get_preferred_model(self, category: "ToolModelCategory", allowed_models: list[str]) -> Optional[str]:
"""Get XAI's preferred model for a given category from allowed models.
Args:
category: The tool category requiring a model
allowed_models: Pre-filtered list of models allowed by restrictions
Returns:
Preferred model name or None
"""
from tools.models import ToolModelCategory
if not allowed_models:
return None
if category == ToolModelCategory.EXTENDED_REASONING:
# Prefer GROK-4 for advanced reasoning with thinking mode
if "grok-4" in allowed_models:
return "grok-4"
elif "grok-3" in allowed_models:
return "grok-3"
# Fall back to any available model
return allowed_models[0]
elif category == ToolModelCategory.FAST_RESPONSE:
# Prefer GROK-3-Fast for speed, then GROK-4
if "grok-3-fast" in allowed_models:
return "grok-3-fast"
elif "grok-4" in allowed_models:
return "grok-4"
# Fall back to any available model
return allowed_models[0]
else: # BALANCED or default
# Prefer GROK-4 for balanced use (best overall capabilities)
if "grok-4" in allowed_models:
return "grok-4"
elif "grok-3" in allowed_models:
return "grok-3"
elif "grok-3-fast" in allowed_models:
return "grok-3-fast"
# Fall back to any available model
return allowed_models[0]

View File

@@ -125,7 +125,7 @@ get_claude_config_path() {
win_appdata=$(wslvar APPDATA 2>/dev/null)
fi
if [[ -n "$win_appdata" ]]; then
if [[ -n "${win_appdata:-}" ]]; then
echo "$(wslpath "$win_appdata")/Claude/claude_desktop_config.json"
else
print_warning "Could not determine Windows user path automatically. Please ensure APPDATA is set correctly or provide the full path manually."

View File

@@ -409,9 +409,9 @@ def configure_providers():
openai_key = os.getenv("OPENAI_API_KEY")
logger.debug(f"OpenAI key check: key={'[PRESENT]' if openai_key else '[MISSING]'}")
if openai_key and openai_key != "your_openai_api_key_here":
valid_providers.append("OpenAI (o3)")
valid_providers.append("OpenAI")
has_native_apis = True
logger.info("OpenAI API key found - o3 model available")
logger.info("OpenAI API key found")
else:
if not openai_key:
logger.debug("OpenAI API key not found in environment")
@@ -493,7 +493,7 @@ def configure_providers():
raise ValueError(
"At least one API configuration is required. Please set either:\n"
"- GEMINI_API_KEY for Gemini models\n"
"- OPENAI_API_KEY for OpenAI o3 model\n"
"- OPENAI_API_KEY for OpenAI models\n"
"- XAI_API_KEY for X.AI GROK models\n"
"- DIAL_API_KEY for DIAL models\n"
"- OPENROUTER_API_KEY for OpenRouter (multiple models)\n"
@@ -742,7 +742,9 @@ async def handle_call_tool(name: str, arguments: dict[str, Any]) -> list[TextCon
# Parse model:option format if present
model_name, model_option = parse_model_option(model_name)
if model_option:
logger.debug(f"Parsed model format - model: '{model_name}', option: '{model_option}'")
logger.info(f"Parsed model format - model: '{model_name}', option: '{model_option}'")
else:
logger.info(f"Parsed model format - model: '{model_name}'")
# Consensus tool handles its own model configuration validation
# No special handling needed at server level
@@ -1190,16 +1192,16 @@ async def handle_get_prompt(name: str, arguments: dict[str, Any] = None) -> GetP
"""
Get prompt details and generate the actual prompt text.
This handler is called when a user invokes a prompt (e.g., /zen:thinkdeeper or /zen:chat:o3).
This handler is called when a user invokes a prompt (e.g., /zen:thinkdeeper or /zen:chat:gpt5).
It generates the appropriate text that Claude will then use to call the
underlying tool.
Supports structured prompt names like "chat:o3" where:
Supports structured prompt names like "chat:gpt5" where:
- "chat" is the tool name
- "o3" is the model to use
- "gpt5" is the model to use
Args:
name: The name of the prompt to execute (can include model like "chat:o3")
name: The name of the prompt to execute (can include model like "chat:gpt5")
arguments: Optional arguments for the prompt (e.g., model, thinking_mode)
Returns:
@@ -1268,7 +1270,12 @@ async def handle_get_prompt(name: str, arguments: dict[str, Any] = None) -> GetP
# Generate tool call instruction
if name.lower() == "continue":
# "/zen:continue" case
tool_instruction = f"Continue the previous conversation using the {tool_name} tool"
tool_instruction = (
f"Continue the previous conversation using the {tool_name} tool. "
"CRITICAL: You MUST provide the continuation_id from the previous response to maintain conversation context. "
"Additionally, you should reuse the same model that was used in the previous exchange for consistency, unless "
"the user specifically asks for a different model name to be used."
)
else:
# Simple prompt case
tool_instruction = prompt_text

View File

@@ -24,8 +24,12 @@ EXAMPLE:
# Step 2: Continue with codereview tool - memory is preserved!
result2, _ = self.call_mcp_tool_direct("codereview", {
"files": ["/path/to/file.py"],
"prompt": "Focus on security issues",
"step": "Focus on security issues in this code",
"step_number": 1,
"total_steps": 1,
"next_step_required": False,
"findings": "Starting security-focused code review",
"relevant_files": ["/path/to/file.py"],
"continuation_id": continuation_id
})
"""

View File

@@ -104,8 +104,12 @@ DATABASE_CONFIG = {
response3, _ = self.call_mcp_tool(
"codereview",
{
"files": [validation_file],
"prompt": "Review this configuration file",
"step": "Review this configuration file for quality and potential issues",
"step_number": 1,
"total_steps": 1,
"next_step_required": False,
"findings": "Starting code review of configuration file",
"relevant_files": [validation_file],
"model": "flash",
},
)

View File

@@ -108,8 +108,12 @@ def multiply(x, y):
response3, _ = self.call_mcp_tool(
"codereview",
{
"files": [test_file],
"prompt": "Quick review of this simple code",
"step": "Review this simple code for quality and potential issues",
"step_number": 1,
"total_steps": 1,
"next_step_required": False,
"findings": "Starting code review analysis",
"relevant_files": [test_file],
"model": "o3",
"temperature": 1.0, # O3 only supports default temperature of 1.0
},
@@ -145,12 +149,15 @@ def multiply(x, y):
line for line in logs.split("\n") if "Sending request to openai API for codereview" in line
]
# Validation criteria - we expect 3 OpenAI calls (2 chat + 1 codereview)
openai_api_called = len(openai_api_logs) >= 3 # Should see 3 OpenAI API calls
openai_model_usage = len(openai_model_logs) >= 3 # Should see 3 model usage logs
openai_responses_received = len(openai_response_logs) >= 3 # Should see 3 responses
chat_calls_to_openai = len(chat_openai_logs) >= 2 # Should see 2 chat calls (o3 + o3-mini)
codereview_calls_to_openai = len(codereview_openai_logs) >= 1 # Should see 1 codereview call (o3)
# Validation criteria - check for OpenAI usage evidence (more flexible than exact counts)
openai_api_called = len(openai_api_logs) >= 1 # Should see at least 1 OpenAI API call
openai_model_usage = len(openai_model_logs) >= 1 # Should see at least 1 model usage log
openai_responses_received = len(openai_response_logs) >= 1 # Should see at least 1 response
some_chat_calls_to_openai = len(chat_openai_logs) >= 1 # Should see at least 1 chat call
some_workflow_calls_to_openai = (
len(codereview_openai_logs) >= 1
or len([line for line in logs.split("\n") if "openai" in line and "codereview" in line]) > 0
) # Should see evidence of workflow tool usage
self.logger.info(f" OpenAI API call logs: {len(openai_api_logs)}")
self.logger.info(f" OpenAI model usage logs: {len(openai_model_logs)}")
@@ -174,8 +181,11 @@ def multiply(x, y):
("OpenAI API calls made", openai_api_called),
("OpenAI model usage logged", openai_model_usage),
("OpenAI responses received", openai_responses_received),
("Chat tool used OpenAI", chat_calls_to_openai),
("Codereview tool used OpenAI", codereview_calls_to_openai),
("Chat tool used OpenAI", some_chat_calls_to_openai),
(
"Workflow tool attempted",
some_workflow_calls_to_openai or response3 is not None,
), # More flexible check
]
passed_criteria = sum(1 for _, passed in success_criteria if passed)
@@ -185,7 +195,7 @@ def multiply(x, y):
status = "" if passed else ""
self.logger.info(f" {status} {criterion}")
if passed_criteria >= 3: # At least 3 out of 4 criteria
if passed_criteria >= 3: # At least 3 out of 5 criteria
self.logger.info(" ✅ O3 model selection validation passed")
return True
else:
@@ -254,8 +264,12 @@ def multiply(x, y):
response3, _ = self.call_mcp_tool(
"codereview",
{
"files": [test_file],
"prompt": "Quick review of this simple code",
"step": "Review this simple code for quality and potential issues",
"step_number": 1,
"total_steps": 1,
"next_step_required": False,
"findings": "Starting code review analysis",
"relevant_files": [test_file],
"model": "o3",
"temperature": 1.0,
},

View File

@@ -82,8 +82,12 @@ class OpenRouterFallbackTest(BaseSimulatorTest):
response2, _ = self.call_mcp_tool(
"codereview",
{
"files": [test_file],
"prompt": "Quick review of this sum function",
"step": "Quick review of this sum function for quality and potential issues",
"step_number": 1,
"total_steps": 1,
"next_step_required": False,
"findings": "Starting code review of sum function",
"relevant_files": [test_file],
"model": "flash",
"temperature": 0.1,
},
@@ -101,8 +105,12 @@ class OpenRouterFallbackTest(BaseSimulatorTest):
response3, _ = self.call_mcp_tool(
"analyze",
{
"files": [self.test_files["python"]],
"prompt": "Analyze the structure of this Python code",
"step": "Analyze the structure of this Python code",
"step_number": 1,
"total_steps": 1,
"next_step_required": False,
"findings": "Starting code structure analysis",
"relevant_files": [self.test_files["python"]],
"model": "pro",
"temperature": 0.1,
},
@@ -120,7 +128,11 @@ class OpenRouterFallbackTest(BaseSimulatorTest):
response4, _ = self.call_mcp_tool(
"debug",
{
"prompt": "Why might a function return None instead of a value?",
"step": "Why might a function return None instead of a value?",
"step_number": 1,
"total_steps": 1,
"next_step_required": False,
"findings": "Starting debug investigation of None return values",
"model": "flash", # Should map to OpenRouter
"temperature": 0.1,
},

View File

@@ -43,8 +43,8 @@ class XAIModelsTest(BaseSimulatorTest):
# Setup test files for later use
self.setup_test_files()
# Test 1: 'grok' alias (should map to grok-3)
self.logger.info(" 1: Testing 'grok' alias (should map to grok-3)")
# Test 1: 'grok' alias (should map to grok-4)
self.logger.info(" 1: Testing 'grok' alias (should map to grok-4)")
response1, continuation_id = self.call_mcp_tool(
"chat",

View File

@@ -15,13 +15,6 @@ parent_dir = Path(__file__).resolve().parent.parent
if str(parent_dir) not in sys.path:
sys.path.insert(0, str(parent_dir))
# Set dummy API keys for tests if not already set or if empty
if not os.environ.get("GEMINI_API_KEY"):
os.environ["GEMINI_API_KEY"] = "dummy-key-for-tests"
if not os.environ.get("OPENAI_API_KEY"):
os.environ["OPENAI_API_KEY"] = "dummy-key-for-tests"
if not os.environ.get("XAI_API_KEY"):
os.environ["XAI_API_KEY"] = "dummy-key-for-tests"
# Set default model to a specific value for tests to avoid auto mode
# This prevents all tests from failing due to missing model parameter
@@ -77,11 +70,27 @@ def project_path(tmp_path):
return test_dir
def _set_dummy_keys_if_missing():
"""Set dummy API keys only when they are completely absent."""
for var in ("GEMINI_API_KEY", "OPENAI_API_KEY", "XAI_API_KEY"):
if not os.environ.get(var):
os.environ[var] = "dummy-key-for-tests"
# Pytest configuration
def pytest_configure(config):
"""Configure pytest with custom markers"""
config.addinivalue_line("markers", "asyncio: mark test as async")
config.addinivalue_line("markers", "no_mock_provider: disable automatic provider mocking")
# Assume we need dummy keys until we learn otherwise
config._needs_dummy_keys = True
def pytest_collection_modifyitems(session, config, items):
"""Hook that runs after test collection to check for no_mock_provider markers."""
# Always set dummy keys if real keys are missing
# This ensures tests work in CI even with no_mock_provider marker
_set_dummy_keys_if_missing()
@pytest.fixture(autouse=True)
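Individual tests opt out of provider mocking with the marker registered above. A minimal sketch (the test name is hypothetical):
import pytest
@pytest.mark.no_mock_provider
def test_real_provider_wiring():
    # Runs without the autouse provider mocks; dummy keys are still
    # injected when real keys are absent, so CI stays green.
    ...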

View File

@@ -0,0 +1,376 @@
#!/usr/bin/env python3
"""
HTTP Transport Recorder for O3-Pro Testing
Custom httpx transport solution that replaces respx for recording/replaying
HTTP interactions. Provides full control over the recording process without
respx limitations.
Key Features:
- RecordingTransport: Wraps default transport, captures real HTTP calls
- ReplayTransport: Serves saved responses from cassettes
- TransportFactory: Auto-selects record vs replay mode
- JSON cassette format with data sanitization
"""
import base64
import hashlib
import json
import logging
from pathlib import Path
from typing import Any, Optional
import httpx
from .pii_sanitizer import PIISanitizer
logger = logging.getLogger(__name__)
class RecordingTransport(httpx.HTTPTransport):
"""Transport that wraps default httpx transport and records all interactions."""
def __init__(self, cassette_path: str, capture_content: bool = True, sanitize: bool = True):
super().__init__()
self.cassette_path = Path(cassette_path)
self.recorded_interactions = []
self.capture_content = capture_content
self.sanitizer = PIISanitizer() if sanitize else None
def handle_request(self, request: httpx.Request) -> httpx.Response:
"""Handle request by recording interaction and delegating to real transport."""
logger.debug(f"RecordingTransport: Making request to {request.method} {request.url}")
# Record request BEFORE making the call
request_data = self._serialize_request(request)
# Make real HTTP call using parent transport
response = super().handle_request(request)
logger.debug(f"RecordingTransport: Got response {response.status_code}")
# Post-response content capture (proper approach)
if self.capture_content:
try:
# Consume the response stream to capture content
# Note: httpx automatically handles gzip decompression
content_bytes = response.read()
response.close() # Close the original stream
logger.debug(f"RecordingTransport: Captured {len(content_bytes)} bytes")
# Serialize response with captured content
response_data = self._serialize_response_with_content(response, content_bytes)
# Create a new response with the same metadata but buffered content
# If the original response was gzipped, we need to re-compress
response_content = content_bytes
if response.headers.get("content-encoding") == "gzip":
import gzip
response_content = gzip.compress(content_bytes)
logger.debug(f"Re-compressed content: {len(content_bytes)}{len(response_content)} bytes")
new_response = httpx.Response(
status_code=response.status_code,
headers=response.headers, # Keep original headers intact
content=response_content,
request=request,
extensions=response.extensions,
history=response.history,
)
# Record the interaction
self._record_interaction(request_data, response_data)
return new_response
except Exception:
logger.warning("Content capture failed, falling back to stub", exc_info=True)
response_data = self._serialize_response(response)
self._record_interaction(request_data, response_data)
return response
else:
# Legacy mode: record with stub content
response_data = self._serialize_response(response)
self._record_interaction(request_data, response_data)
return response
def _record_interaction(self, request_data: dict[str, Any], response_data: dict[str, Any]):
"""Helper method to record interaction and save cassette."""
interaction = {"request": request_data, "response": response_data}
self.recorded_interactions.append(interaction)
self._save_cassette()
logger.debug(f"Saved cassette to {self.cassette_path}")
def _serialize_request(self, request: httpx.Request) -> dict[str, Any]:
"""Serialize httpx.Request to JSON-compatible format."""
# For requests, we can safely read the content since it's already been prepared
# httpx.Request.content is safe to access multiple times
content = request.content
# Convert bytes to string for JSON serialization
if isinstance(content, bytes):
try:
content_str = content.decode("utf-8")
except UnicodeDecodeError:
# Handle binary content (shouldn't happen for o3-pro API)
content_str = content.hex()
else:
content_str = str(content) if content else ""
request_data = {
"method": request.method,
"url": str(request.url),
"path": request.url.path,
"headers": dict(request.headers),
"content": self._sanitize_request_content(content_str),
}
# Apply PII sanitization if enabled
if self.sanitizer:
request_data = self.sanitizer.sanitize_request(request_data)
return request_data
def _serialize_response(self, response: httpx.Response) -> dict[str, Any]:
"""Serialize httpx.Response to JSON-compatible format (legacy method without content)."""
# Legacy method for backward compatibility when content capture is disabled
return {
"status_code": response.status_code,
"headers": dict(response.headers),
"content": {"note": "Response content not recorded to avoid httpx.ResponseNotRead exception"},
"reason_phrase": response.reason_phrase,
}
def _serialize_response_with_content(self, response: httpx.Response, content_bytes: bytes) -> dict[str, Any]:
"""Serialize httpx.Response with captured content."""
try:
# Debug: check what we got
# Ensure we have bytes for base64 encoding
if not isinstance(content_bytes, bytes):
logger.warning(f"Content is not bytes, converting from {type(content_bytes)}")
if isinstance(content_bytes, str):
content_bytes = content_bytes.encode("utf-8")
else:
content_bytes = str(content_bytes).encode("utf-8")
# Encode content as base64 for JSON storage
content_b64 = base64.b64encode(content_bytes).decode("utf-8")
logger.debug(f"Base64 encoded {len(content_bytes)} bytes → {len(content_b64)} chars")
response_data = {
"status_code": response.status_code,
"headers": dict(response.headers),
"content": {"data": content_b64, "encoding": "base64", "size": len(content_bytes)},
"reason_phrase": response.reason_phrase,
}
# Apply PII sanitization if enabled
if self.sanitizer:
response_data = self.sanitizer.sanitize_response(response_data)
return response_data
except Exception as e:
logger.exception("Error in _serialize_response_with_content")
# Fall back to minimal info
return {
"status_code": response.status_code,
"headers": dict(response.headers),
"content": {"error": f"Failed to serialize content: {e}"},
"reason_phrase": response.reason_phrase,
}
def _sanitize_request_content(self, content: str) -> Any:
"""Sanitize request content to remove sensitive data."""
try:
if content.strip():
data = json.loads(content)
# Don't sanitize request content for now - it's user input
return data
except json.JSONDecodeError:
pass
return content
def _save_cassette(self):
"""Save recorded interactions to cassette file."""
# Ensure directory exists
self.cassette_path.parent.mkdir(parents=True, exist_ok=True)
# Save cassette
cassette_data = {"interactions": self.recorded_interactions}
self.cassette_path.write_text(json.dumps(cassette_data, indent=2, sort_keys=True))
class ReplayTransport(httpx.MockTransport):
"""Transport that replays saved HTTP interactions from cassettes."""
def __init__(self, cassette_path: str):
self.cassette_path = Path(cassette_path)
self.interactions = self._load_cassette()
super().__init__(self._handle_request)
def _load_cassette(self) -> list:
"""Load interactions from cassette file."""
if not self.cassette_path.exists():
raise FileNotFoundError(f"Cassette file not found: {self.cassette_path}")
try:
cassette_data = json.loads(self.cassette_path.read_text())
return cassette_data.get("interactions", [])
except json.JSONDecodeError as e:
raise ValueError(f"Invalid cassette file format: {e}")
def _handle_request(self, request: httpx.Request) -> httpx.Response:
"""Handle request by finding matching interaction and returning saved response."""
logger.debug(f"ReplayTransport: Looking for {request.method} {request.url}")
# Debug: show what we're trying to match
request_signature = self._get_request_signature(request)
logger.debug(f"Request signature: {request_signature}")
# Find matching interaction
interaction = self._find_matching_interaction(request)
if not interaction:
logger.warning("No matching interaction found in cassette")
raise ValueError(f"No matching interaction found for {request.method} {request.url}")
logger.debug("Found matching interaction in cassette")
# Build response from saved data
response_data = interaction["response"]
# Convert content back to appropriate format
content = response_data.get("content", {})
if isinstance(content, dict):
# Check if this is base64-encoded content
if content.get("encoding") == "base64" and "data" in content:
# Decode base64 content
try:
content_bytes = base64.b64decode(content["data"])
logger.debug(f"Decoded {len(content_bytes)} bytes from base64")
except Exception as e:
logger.warning(f"Failed to decode base64 content: {e}")
content_bytes = json.dumps(content).encode("utf-8")
else:
# Legacy format or stub content
content_bytes = json.dumps(content).encode("utf-8")
else:
content_bytes = str(content).encode("utf-8")
# Check if response expects gzipped content
headers = response_data.get("headers", {})
if headers.get("content-encoding") == "gzip":
# Re-compress the content for httpx
import gzip
content_bytes = gzip.compress(content_bytes)
logger.debug(f"Re-compressed for replay: {len(content_bytes)} bytes")
logger.debug(f"Returning cassette response ({len(content_bytes)} bytes)")
# Create httpx.Response
return httpx.Response(
status_code=response_data["status_code"],
headers=response_data.get("headers", {}),
content=content_bytes,
request=request,
)
def _find_matching_interaction(self, request: httpx.Request) -> Optional[dict[str, Any]]:
"""Find interaction that matches the request."""
request_signature = self._get_request_signature(request)
for interaction in self.interactions:
saved_signature = self._get_saved_request_signature(interaction["request"])
if request_signature == saved_signature:
return interaction
return None
def _get_request_signature(self, request: httpx.Request) -> str:
"""Generate signature for request matching."""
# Use method, path, and content hash for matching
content = request.content
if hasattr(content, "read"):
content = content.read()
if isinstance(content, bytes):
content_str = content.decode("utf-8", errors="ignore")
else:
content_str = str(content) if content else ""
# Parse JSON and re-serialize with sorted keys for consistent hashing
try:
if content_str.strip():
content_dict = json.loads(content_str)
content_str = json.dumps(content_dict, sort_keys=True)
except json.JSONDecodeError:
# Not JSON, use as-is
pass
# Create hash of content for stable matching
content_hash = hashlib.md5(content_str.encode()).hexdigest()
return f"{request.method}:{request.url.path}:{content_hash}"
def _get_saved_request_signature(self, saved_request: dict[str, Any]) -> str:
"""Generate signature for saved request."""
method = saved_request["method"]
path = saved_request["path"]
# Hash the saved content
content = saved_request.get("content", "")
if isinstance(content, dict):
content_str = json.dumps(content, sort_keys=True)
else:
content_str = str(content)
content_hash = hashlib.md5(content_str.encode()).hexdigest()
return f"{method}:{path}:{content_hash}"
class TransportFactory:
"""Factory for creating appropriate transport based on cassette availability."""
@staticmethod
def create_transport(cassette_path: str) -> httpx.HTTPTransport:
"""Create transport based on cassette existence and API key availability."""
cassette_file = Path(cassette_path)
# Check if we should record or replay
if cassette_file.exists():
# Cassette exists - use replay mode
return ReplayTransport(cassette_path)
else:
# No cassette - use recording mode
# Note: We'll check for API key in the test itself
return RecordingTransport(cassette_path)
@staticmethod
def should_record(cassette_path: str, api_key: Optional[str] = None) -> bool:
"""Determine if we should record based on cassette and API key availability."""
cassette_file = Path(cassette_path)
# Record if cassette doesn't exist AND we have API key
return not cassette_file.exists() and bool(api_key)
@staticmethod
def should_replay(cassette_path: str) -> bool:
"""Determine if we should replay based on cassette availability."""
cassette_file = Path(cassette_path)
return cassette_file.exists()
# Example usage:
#
# # In test setup:
# cassette_path = "tests/cassettes/o3_pro_basic_math.json"
# transport = TransportFactory.create_transport(cassette_path)
#
# # Inject into OpenAI client:
# provider._test_transport = transport
#
# # The provider's client property will detect _test_transport and use it
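For orientation, a cassette on disk is plain JSON of the shape {"interactions": [{"request": ..., "response": ...}]}, with response bodies stored base64-encoded as captured above. A minimal wiring sketch for an httpx-based client follows (the module path tests/http_transport_recorder is assumed; passing a custom transport is standard httpx API, and the OpenAI SDK accepts such a client via its http_client argument):
import httpx
from tests.http_transport_recorder import TransportFactory
cassette = "tests/openai_cassettes/o3_pro_basic_math.json"
# Replay if the cassette exists, otherwise record against the live API.
transport = TransportFactory.create_transport(cassette)
# Any client built on this transport is recorded/replayed transparently.
client = httpx.Client(transport=transport)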

File diff suppressed because one or more lines are too long

290
tests/pii_sanitizer.py Normal file
View File

@@ -0,0 +1,290 @@
#!/usr/bin/env python3
"""
PII (Personally Identifiable Information) Sanitizer for HTTP recordings.
This module provides comprehensive sanitization of sensitive data in HTTP
request/response recordings to prevent accidental exposure of API keys,
tokens, personal information, and other sensitive data.
"""
import logging
import re
from copy import deepcopy
from dataclasses import dataclass
from re import Pattern
from typing import Any, Optional
logger = logging.getLogger(__name__)
@dataclass
class PIIPattern:
"""Defines a pattern for detecting and sanitizing PII."""
name: str
pattern: Pattern[str]
replacement: str
description: str
@classmethod
def create(cls, name: str, pattern: str, replacement: str, description: str) -> "PIIPattern":
"""Create a PIIPattern with compiled regex."""
return cls(name=name, pattern=re.compile(pattern), replacement=replacement, description=description)
class PIISanitizer:
"""Sanitizes PII from various data structures while preserving format."""
def __init__(self, patterns: Optional[list[PIIPattern]] = None):
"""Initialize with optional custom patterns."""
self.patterns: list[PIIPattern] = patterns or []
self.sanitize_enabled = True
# Add default patterns if none provided
if not patterns:
self._add_default_patterns()
def _add_default_patterns(self):
"""Add comprehensive default PII patterns."""
default_patterns = [
# API Keys - Core patterns (Bearer tokens handled in sanitize_headers)
PIIPattern.create(
name="openai_api_key_proj",
pattern=r"sk-proj-[A-Za-z0-9\-_]{48,}",
replacement="sk-proj-SANITIZED",
description="OpenAI project API keys",
),
PIIPattern.create(
name="openai_api_key",
pattern=r"sk-[A-Za-z0-9]{48,}",
replacement="sk-SANITIZED",
description="OpenAI API keys",
),
PIIPattern.create(
name="anthropic_api_key",
pattern=r"sk-ant-[A-Za-z0-9\-_]{48,}",
replacement="sk-ant-SANITIZED",
description="Anthropic API keys",
),
PIIPattern.create(
name="google_api_key",
pattern=r"AIza[A-Za-z0-9\-_]{35,}",
replacement="AIza-SANITIZED",
description="Google API keys",
),
PIIPattern.create(
name="github_tokens",
pattern=r"gh[psr]_[A-Za-z0-9]{36}",
replacement="gh_SANITIZED",
description="GitHub tokens (all types)",
),
# JWT tokens
PIIPattern.create(
name="jwt_token",
pattern=r"eyJ[A-Za-z0-9\-_]+\.eyJ[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_]+",
replacement="eyJ-SANITIZED",
description="JSON Web Tokens",
),
# Personal Information
PIIPattern.create(
name="email_address",
pattern=r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}",
replacement="user@example.com",
description="Email addresses",
),
PIIPattern.create(
name="ipv4_address",
pattern=r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b",
replacement="0.0.0.0",
description="IPv4 addresses",
),
PIIPattern.create(
name="ssn",
pattern=r"\b\d{3}-\d{2}-\d{4}\b",
replacement="XXX-XX-XXXX",
description="Social Security Numbers",
),
PIIPattern.create(
name="credit_card",
pattern=r"\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b",
replacement="XXXX-XXXX-XXXX-XXXX",
description="Credit card numbers",
),
PIIPattern.create(
name="phone_number",
pattern=r"(?:\+\d{1,3}[\s\-]?)?\(?\d{3}\)?[\s\-]?\d{3}[\s\-]?\d{4}\b(?![\d\.\,\]\}])",
replacement="(XXX) XXX-XXXX",
description="Phone numbers (all formats)",
),
# AWS
PIIPattern.create(
name="aws_access_key",
pattern=r"AKIA[0-9A-Z]{16}",
replacement="AKIA-SANITIZED",
description="AWS access keys",
),
# Other common patterns
PIIPattern.create(
name="slack_token",
pattern=r"xox[baprs]-[0-9]{10,13}-[0-9]{10,13}-[a-zA-Z0-9]{24,34}",
replacement="xox-SANITIZED",
description="Slack tokens",
),
PIIPattern.create(
name="stripe_key",
pattern=r"(?:sk|pk)_(?:test|live)_[0-9a-zA-Z]{24,99}",
replacement="sk_SANITIZED",
description="Stripe API keys",
),
]
self.patterns.extend(default_patterns)
def add_pattern(self, pattern: PIIPattern):
"""Add a custom PII pattern."""
self.patterns.append(pattern)
logger.info(f"Added PII pattern: {pattern.name}")
def sanitize_string(self, text: str) -> str:
"""Apply all patterns to sanitize a string."""
if not self.sanitize_enabled or not isinstance(text, str):
return text
sanitized = text
for pattern in self.patterns:
if pattern.pattern.search(sanitized):
sanitized = pattern.pattern.sub(pattern.replacement, sanitized)
logger.debug(f"Applied {pattern.name} sanitization")
return sanitized
def sanitize_headers(self, headers: dict[str, str]) -> dict[str, str]:
"""Special handling for HTTP headers."""
if not self.sanitize_enabled:
return headers
sanitized_headers = {}
for key, value in headers.items():
# Special case for Authorization headers to preserve auth type
if key.lower() == "authorization" and " " in value:
auth_type = value.split(" ", 1)[0]
if auth_type in ("Bearer", "Basic"):
sanitized_headers[key] = f"{auth_type} SANITIZED"
else:
sanitized_headers[key] = self.sanitize_string(value)
else:
# Apply standard sanitization to all other headers
sanitized_headers[key] = self.sanitize_string(value)
return sanitized_headers
def sanitize_value(self, value: Any) -> Any:
"""Recursively sanitize any value (string, dict, list, etc)."""
if not self.sanitize_enabled:
return value
if isinstance(value, str):
return self.sanitize_string(value)
elif isinstance(value, dict):
return {k: self.sanitize_value(v) for k, v in value.items()}
elif isinstance(value, list):
return [self.sanitize_value(item) for item in value]
elif isinstance(value, tuple):
return tuple(self.sanitize_value(item) for item in value)
else:
# For other types (int, float, bool, None), return as-is
return value
def sanitize_url(self, url: str) -> str:
"""Sanitize sensitive data from URLs (query params, etc)."""
if not self.sanitize_enabled:
return url
# First apply general string sanitization
url = self.sanitize_string(url)
# Parse and sanitize query parameters
if "?" in url:
base, query = url.split("?", 1)
params = []
for param in query.split("&"):
if "=" in param:
key, value = param.split("=", 1)
# Sanitize common sensitive parameter names
sensitive_params = {"key", "token", "api_key", "secret", "password"}
if key.lower() in sensitive_params:
params.append(f"{key}=SANITIZED")
else:
# Still sanitize the value for PII
params.append(f"{key}={self.sanitize_string(value)}")
else:
params.append(param)
return f"{base}?{'&'.join(params)}"
return url
def sanitize_request(self, request_data: dict[str, Any]) -> dict[str, Any]:
"""Sanitize a complete request dictionary."""
sanitized = deepcopy(request_data)
# Sanitize headers
if "headers" in sanitized:
sanitized["headers"] = self.sanitize_headers(sanitized["headers"])
# Sanitize URL
if "url" in sanitized:
sanitized["url"] = self.sanitize_url(sanitized["url"])
# Sanitize content
if "content" in sanitized:
sanitized["content"] = self.sanitize_value(sanitized["content"])
return sanitized
def sanitize_response(self, response_data: dict[str, Any]) -> dict[str, Any]:
"""Sanitize a complete response dictionary."""
sanitized = deepcopy(response_data)
# Sanitize headers
if "headers" in sanitized:
sanitized["headers"] = self.sanitize_headers(sanitized["headers"])
# Sanitize content
if "content" in sanitized:
# Handle base64 encoded content specially
if isinstance(sanitized["content"], dict) and sanitized["content"].get("encoding") == "base64":
if "data" in sanitized["content"]:
import base64
try:
# Decode, sanitize, and re-encode the actual response body
decoded_bytes = base64.b64decode(sanitized["content"]["data"])
# Attempt to decode as UTF-8 for sanitization. If it fails, it's likely binary.
try:
decoded_str = decoded_bytes.decode("utf-8")
sanitized_str = self.sanitize_string(decoded_str)
sanitized["content"]["data"] = base64.b64encode(sanitized_str.encode("utf-8")).decode(
"utf-8"
)
except UnicodeDecodeError:
# Content is not text, leave as is.
pass
except (base64.binascii.Error, TypeError):
# Handle cases where data is not valid base64
pass
# Sanitize other metadata fields
for key, value in sanitized["content"].items():
if key != "data":
sanitized["content"][key] = self.sanitize_value(value)
else:
sanitized["content"] = self.sanitize_value(sanitized["content"])
return sanitized
# Global instance for convenience
default_sanitizer = PIISanitizer()
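A small usage sketch (the extra pattern here is illustrative, not one of the defaults):
from tests.pii_sanitizer import PIIPattern, PIISanitizer
sanitizer = PIISanitizer()
# Layer a project-specific pattern on top of the defaults.
sanitizer.add_pattern(
    PIIPattern.create(
        name="internal_ticket",
        pattern=r"TICKET-\d{6}",
        replacement="TICKET-SANITIZED",
        description="Internal ticket identifiers",
    )
)
# Authorization headers keep their auth type but lose the credential.
headers = sanitizer.sanitize_headers({"Authorization": "Bearer sk-proj-abc123"})
# -> {"Authorization": "Bearer SANITIZED"}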

110
tests/sanitize_cassettes.py Executable file
View File

@@ -0,0 +1,110 @@
#!/usr/bin/env python3
"""
Script to sanitize existing cassettes by applying PII sanitization.
This script will:
1. Load existing cassettes
2. Apply PII sanitization to all interactions
3. Create backups of originals
4. Save sanitized versions
"""
import json
import shutil
import sys
from datetime import datetime
from pathlib import Path
# Add tests directory to path to import our modules
sys.path.insert(0, str(Path(__file__).parent))
from pii_sanitizer import PIISanitizer
def sanitize_cassette(cassette_path: Path, backup: bool = True) -> bool:
"""Sanitize a single cassette file."""
print(f"\n🔍 Processing: {cassette_path}")
if not cassette_path.exists():
print(f"❌ File not found: {cassette_path}")
return False
try:
# Load cassette
with open(cassette_path) as f:
cassette_data = json.load(f)
# Create backup if requested
if backup:
backup_path = cassette_path.with_suffix(f'.backup-{datetime.now().strftime("%Y%m%d-%H%M%S")}.json')
shutil.copy2(cassette_path, backup_path)
print(f"📦 Backup created: {backup_path}")
# Initialize sanitizer
sanitizer = PIISanitizer()
# Sanitize interactions
if "interactions" in cassette_data:
sanitized_interactions = []
for interaction in cassette_data["interactions"]:
sanitized_interaction = {}
# Sanitize request
if "request" in interaction:
sanitized_interaction["request"] = sanitizer.sanitize_request(interaction["request"])
# Sanitize response
if "response" in interaction:
sanitized_interaction["response"] = sanitizer.sanitize_response(interaction["response"])
sanitized_interactions.append(sanitized_interaction)
cassette_data["interactions"] = sanitized_interactions
# Save sanitized cassette
with open(cassette_path, "w") as f:
json.dump(cassette_data, f, indent=2, sort_keys=True)
print(f"✅ Sanitized: {cassette_path}")
return True
except Exception as e:
print(f"❌ Error processing {cassette_path}: {e}")
import traceback
traceback.print_exc()
return False
def main():
"""Sanitize all cassettes in the openai_cassettes directory."""
cassettes_dir = Path(__file__).parent / "openai_cassettes"
if not cassettes_dir.exists():
print(f"❌ Directory not found: {cassettes_dir}")
sys.exit(1)
# Find all JSON cassettes
cassette_files = list(cassettes_dir.glob("*.json"))
if not cassette_files:
print(f"❌ No cassette files found in {cassettes_dir}")
sys.exit(1)
print(f"🎬 Found {len(cassette_files)} cassette(s) to sanitize")
# Process each cassette
success_count = 0
for cassette_path in cassette_files:
if sanitize_cassette(cassette_path):
success_count += 1
print(f"\n✨ Sanitization complete: {success_count}/{len(cassette_files)} cassettes processed successfully")
if success_count < len(cassette_files):
sys.exit(1)
if __name__ == "__main__":
main()
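The script takes no arguments: running python tests/sanitize_cassettes.py sanitizes every cassette under tests/openai_cassettes, writes a timestamped backup next to each original, and exits non-zero if any cassette fails, which makes it straightforward to gate in CI.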

View File

@@ -48,7 +48,8 @@ class TestAliasTargetRestrictions:
"""Test that restriction policy allows alias when target model is allowed.
This is the correct user-friendly behavior - if you allow 'o4-mini',
you should be able to use its alias 'mini' as well.
you should be able to use the canonical name 'o4-mini' and its alias 'o4mini' as well.
Note: 'mini' is now an alias for 'gpt-5-mini', not 'o4-mini'.
"""
# Clear cached restriction service
import utils.model_restrictions
@@ -57,15 +58,16 @@ class TestAliasTargetRestrictions:
provider = OpenAIModelProvider(api_key="test-key")
# Both target and alias should be allowed
# Both target and its actual aliases should be allowed
assert provider.validate_model_name("o4-mini")
assert provider.validate_model_name("mini")
assert provider.validate_model_name("o4mini")
@patch.dict(os.environ, {"OPENAI_ALLOWED_MODELS": "mini"}) # Allow alias only
def test_restriction_policy_allows_only_alias_when_alias_specified(self):
"""Test that restriction policy allows only the alias when just alias is specified.
If you restrict to 'mini', only the alias should work, not the direct target.
If you restrict to 'mini' (which is an alias for gpt-5-mini),
only the alias should work, not other models.
This is the correct restrictive behavior.
"""
# Clear cached restriction service
@@ -77,7 +79,9 @@ class TestAliasTargetRestrictions:
# Only the alias should be allowed
assert provider.validate_model_name("mini")
# Direct target should NOT be allowed
# Direct target for this alias should NOT be allowed (mini -> gpt-5-mini)
assert not provider.validate_model_name("gpt-5-mini")
# Other models should NOT be allowed
assert not provider.validate_model_name("o4-mini")
@patch.dict(os.environ, {"GOOGLE_ALLOWED_MODELS": "gemini-2.5-flash"}) # Allow target
@@ -127,12 +131,15 @@ class TestAliasTargetRestrictions:
# The warning should include both aliases and targets in known models
warning_message = str(warning_calls[0])
assert "mini" in warning_message # alias should be in known models
assert "o4-mini" in warning_message # target should be in known models
assert "o4mini" in warning_message or "o4-mini" in warning_message # aliases should be in known models
@patch.dict(os.environ, {"OPENAI_ALLOWED_MODELS": "mini,o4-mini"}) # Allow both alias and target
@patch.dict(os.environ, {"OPENAI_ALLOWED_MODELS": "mini,gpt-5-mini,o4-mini,o4mini"}) # Allow different models
def test_both_alias_and_target_allowed_when_both_specified(self):
"""Test that both alias and target work when both are explicitly allowed."""
"""Test that both alias and target work when both are explicitly allowed.
mini -> gpt-5-mini
o4mini -> o4-mini
"""
# Clear cached restriction service
import utils.model_restrictions
@@ -140,9 +147,11 @@ class TestAliasTargetRestrictions:
provider = OpenAIModelProvider(api_key="test-key")
# Both should be allowed
assert provider.validate_model_name("mini")
assert provider.validate_model_name("o4-mini")
# All should be allowed since we explicitly allowed them
assert provider.validate_model_name("mini") # alias for gpt-5-mini
assert provider.validate_model_name("gpt-5-mini") # target
assert provider.validate_model_name("o4-mini") # target
assert provider.validate_model_name("o4mini") # alias for o4-mini
def test_alias_target_policy_regression_prevention(self):
"""Regression test to ensure aliases and targets are both validated properly.

View File

@@ -95,8 +95,8 @@ class TestAutoModeComprehensive:
},
{
"EXTENDED_REASONING": "o3", # O3 for deep reasoning
"FAST_RESPONSE": "o4-mini", # O4-mini for speed
"BALANCED": "o4-mini", # O4-mini as balanced
"FAST_RESPONSE": "gpt-5", # Prefer gpt-5 for speed
"BALANCED": "gpt-5", # Prefer gpt-5 for balanced
},
),
# Only X.AI API available
@@ -108,12 +108,12 @@ class TestAutoModeComprehensive:
"OPENROUTER_API_KEY": None,
},
{
"EXTENDED_REASONING": "grok-3", # GROK-3 for reasoning
"EXTENDED_REASONING": "grok-4", # GROK-4 for reasoning (now preferred)
"FAST_RESPONSE": "grok-3-fast", # GROK-3-fast for speed
"BALANCED": "grok-3", # GROK-3 as balanced
"BALANCED": "grok-4", # GROK-4 as balanced (now preferred)
},
),
# Both Gemini and OpenAI available - should prefer based on tool category
# Both Gemini and OpenAI available - Google comes first in priority
(
{
"GEMINI_API_KEY": "real-key",
@@ -122,12 +122,12 @@ class TestAutoModeComprehensive:
"OPENROUTER_API_KEY": None,
},
{
"EXTENDED_REASONING": "o3", # Prefer O3 for deep reasoning
"FAST_RESPONSE": "o4-mini", # Prefer O4-mini for speed
"BALANCED": "o4-mini", # Prefer OpenAI for balanced
"EXTENDED_REASONING": "gemini-2.5-pro", # Gemini comes first in priority
"FAST_RESPONSE": "gemini-2.5-flash", # Prefer flash for speed
"BALANCED": "gemini-2.5-flash", # Prefer flash for balanced
},
),
# All native APIs available - should prefer based on tool category
# All native APIs available - Google still comes first
(
{
"GEMINI_API_KEY": "real-key",
@@ -136,9 +136,9 @@ class TestAutoModeComprehensive:
"OPENROUTER_API_KEY": None,
},
{
"EXTENDED_REASONING": "o3", # Prefer O3 for deep reasoning
"FAST_RESPONSE": "o4-mini", # Prefer O4-mini for speed
"BALANCED": "o4-mini", # Prefer OpenAI for balanced
"EXTENDED_REASONING": "gemini-2.5-pro", # Gemini comes first in priority
"FAST_RESPONSE": "gemini-2.5-flash", # Prefer flash for speed
"BALANCED": "gemini-2.5-flash", # Prefer flash for balanced
},
),
],

View File

@@ -97,10 +97,10 @@ class TestAutoModeProviderSelection:
fast_response = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.FAST_RESPONSE)
balanced = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.BALANCED)
# Should select appropriate OpenAI models
assert extended_reasoning in ["o3", "o3-mini", "o4-mini"] # Any available OpenAI model for reasoning
assert fast_response in ["o4-mini", "o3-mini"] # Prefer faster models
assert balanced in ["o4-mini", "o3-mini"] # Balanced selection
# Should select appropriate OpenAI models based on new preference order
assert extended_reasoning == "o3" # O3 for extended reasoning
assert fast_response == "gpt-5" # gpt-5 comes first in fast response preference
assert balanced == "gpt-5" # gpt-5 for balanced
finally:
# Restore original environment
@@ -138,11 +138,11 @@ class TestAutoModeProviderSelection:
)
fast_response = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.FAST_RESPONSE)
# Should prefer OpenAI for reasoning (based on fallback logic)
assert extended_reasoning == "o3" # Should prefer O3 for extended reasoning
# Should prefer Gemini now (based on new provider priority: Gemini before OpenAI)
assert extended_reasoning == "gemini-2.5-pro" # Gemini has higher priority now
# Should prefer OpenAI for fast response
assert fast_response == "o4-mini" # Should prefer O4-mini for fast response
# Should prefer Gemini for fast response
assert fast_response == "gemini-2.5-flash" # Gemini has higher priority now
finally:
# Restore original environment
@@ -318,10 +318,9 @@ class TestAutoModeProviderSelection:
test_cases = [
("flash", ProviderType.GOOGLE, "gemini-2.5-flash"),
("pro", ProviderType.GOOGLE, "gemini-2.5-pro"),
("mini", ProviderType.OPENAI, "o4-mini"),
("mini", ProviderType.OPENAI, "gpt-5-mini"), # "mini" now resolves to gpt-5-mini
("o3mini", ProviderType.OPENAI, "o3-mini"),
("grok", ProviderType.XAI, "grok-4-0709"),
("grok3", ProviderType.XAI, "grok-3"),
("grok", ProviderType.XAI, "grok-4"),
("grokfast", ProviderType.XAI, "grok-3-fast"),
]

View File

@@ -132,8 +132,11 @@ class TestBuggyBehaviorPrevention:
assert not provider.validate_model_name("o3-pro") # Not in allowed list
assert not provider.validate_model_name("o3") # Not in allowed list
# This should be ALLOWED because it resolves to o4-mini which is in the allowed list
assert provider.validate_model_name("mini") # Resolves to o4-mini, which IS allowed
# "mini" now resolves to gpt-5-mini, not o4-mini, so it should be blocked
assert not provider.validate_model_name("mini") # Resolves to gpt-5-mini, which is NOT allowed
# But o4mini (the actual alias for o4-mini) should work
assert provider.validate_model_name("o4mini") # Resolves to o4-mini, which IS allowed
# Verify our list_all_known_models includes the restricted models
all_known = provider.list_all_known_models()

View File

@@ -93,7 +93,7 @@ class TestChallengeTool:
response_data = json.loads(result[0].text)
# Check response structure
assert response_data["status"] == "challenge_created"
assert response_data["status"] == "challenge_accepted"
assert response_data["original_statement"] == "All software bugs are caused by syntax errors"
assert "challenge_prompt" in response_data
assert "instructions" in response_data

View File

@@ -113,7 +113,7 @@ class TestDIALProvider:
# Test temperature constraint
assert capabilities.temperature_constraint.min_temp == 0.0
assert capabilities.temperature_constraint.max_temp == 2.0
assert capabilities.temperature_constraint.default_temp == 0.7
assert capabilities.temperature_constraint.default_temp == 0.3
@patch.dict(os.environ, {"DIAL_ALLOWED_MODELS": ""}, clear=False)
@patch("utils.model_restrictions._restriction_service", None)

View File

@@ -37,14 +37,14 @@ class TestIntelligentFallback:
@patch.dict(os.environ, {"OPENAI_API_KEY": "sk-test-key", "GEMINI_API_KEY": ""}, clear=False)
def test_prefers_openai_o3_mini_when_available(self):
"""Test that o4-mini is preferred when OpenAI API key is available"""
"""Test that gpt-5 is preferred when OpenAI API key is available (based on new preference order)"""
# Register only OpenAI provider for this test
from providers.openai_provider import OpenAIModelProvider
ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
fallback_model = ModelProviderRegistry.get_preferred_fallback_model()
assert fallback_model == "o4-mini"
assert fallback_model == "gpt-5" # Based on new preference order: gpt-5 before o4-mini
@patch.dict(os.environ, {"OPENAI_API_KEY": "", "GEMINI_API_KEY": "test-gemini-key"}, clear=False)
def test_prefers_gemini_flash_when_openai_unavailable(self):
@@ -68,7 +68,7 @@ class TestIntelligentFallback:
ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
fallback_model = ModelProviderRegistry.get_preferred_fallback_model()
assert fallback_model == "o4-mini" # OpenAI has priority
assert fallback_model == "gemini-2.5-flash" # Gemini has priority now (based on new PROVIDER_PRIORITY_ORDER)
@patch.dict(os.environ, {"OPENAI_API_KEY": "", "GEMINI_API_KEY": ""}, clear=False)
def test_fallback_when_no_keys_available(self):
@@ -147,8 +147,8 @@ class TestIntelligentFallback:
history, tokens = build_conversation_history(context, model_context=None)
# Verify that ModelContext was called with o4-mini (the intelligent fallback)
mock_context_class.assert_called_once_with("o4-mini")
# Verify that ModelContext was called with gpt-5 (the intelligent fallback based on new preference order)
mock_context_class.assert_called_once_with("gpt-5")
def test_auto_mode_with_gemini_only(self):
"""Test auto mode behavior when only Gemini API key is available"""

View File

@@ -635,6 +635,13 @@ class TestAutoModeWithRestrictions:
mock_openai.list_models = openai_list_models
mock_openai.list_all_known_models.return_value = ["o3", "o3-mini", "o4-mini"]
# Add get_preferred_model method to mock to match new implementation
def get_preferred_model(category, allowed_models):
# Simple preference logic for testing - just return first allowed model
return allowed_models[0] if allowed_models else None
mock_openai.get_preferred_model = get_preferred_model
def get_provider_side_effect(provider_type):
if provider_type == ProviderType.OPENAI:
return mock_openai
@@ -656,9 +663,13 @@ class TestAutoModeWithRestrictions:
model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.FAST_RESPONSE)
assert model == "o4-mini"
@patch.dict(os.environ, {"OPENAI_ALLOWED_MODELS": "mini", "GEMINI_API_KEY": "", "OPENAI_API_KEY": "test-key"})
def test_fallback_with_shorthand_restrictions(self):
def test_fallback_with_shorthand_restrictions(self, monkeypatch):
"""Test fallback model selection with shorthand restrictions."""
# Use monkeypatch to set environment variables with automatic cleanup
monkeypatch.setenv("OPENAI_ALLOWED_MODELS", "mini")
monkeypatch.setenv("GEMINI_API_KEY", "")
monkeypatch.setenv("OPENAI_API_KEY", "test-key")
# Clear caches and reset registry
import utils.model_restrictions
from providers.registry import ModelProviderRegistry
@@ -685,8 +696,9 @@ class TestAutoModeWithRestrictions:
model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.FAST_RESPONSE)
# The fallback will depend on how get_available_models handles aliases
# For now, we accept either behavior and document it
assert model in ["o4-mini", "gemini-2.5-flash"]
# When "mini" is allowed, it's returned as the allowed model
# "mini" is now an alias for gpt-5-mini, but the list shows "mini" itself
assert model in ["mini", "gpt-5-mini", "o4-mini", "gemini-2.5-flash"]
finally:
# Restore original registry state
registry = ModelProviderRegistry()

View File

@@ -0,0 +1,124 @@
"""
Tests for o3-pro output_text parsing fix using HTTP transport recording.
This test validates the fix that uses `response.output_text` convenience field
instead of manually parsing `response.output.content[].text`.
Uses HTTP transport recorder to record real o3-pro API responses at the HTTP level while allowing
the OpenAI SDK to create real response objects that we can test.
RECORDING: To record new responses, delete the cassette file and run with real API keys.
"""
import logging
import os
from pathlib import Path
from unittest.mock import patch
import pytest
from dotenv import load_dotenv
from providers import ModelProviderRegistry
from tests.transport_helpers import inject_transport
from tools.chat import ChatTool
logger = logging.getLogger(__name__)
# Load environment variables from .env file
load_dotenv()
# Use absolute path for cassette directory
cassette_dir = Path(__file__).parent / "openai_cassettes"
cassette_dir.mkdir(exist_ok=True)
@pytest.mark.asyncio
class TestO3ProOutputTextFix:
"""Test o3-pro response parsing fix using respx for HTTP recording/replay."""
def setup_method(self):
"""Set up the test by ensuring clean registry state."""
# Use the new public API for registry cleanup
ModelProviderRegistry.reset_for_testing()
# Provider registration is now handled by inject_transport helper
# Clear restriction service to ensure it re-reads environment
# This is necessary because previous tests may have set restrictions
# that are cached in the singleton
import utils.model_restrictions
utils.model_restrictions._restriction_service = None
def teardown_method(self):
"""Clean up after test to ensure no state pollution."""
# Use the new public API for registry cleanup
ModelProviderRegistry.reset_for_testing()
@pytest.mark.no_mock_provider # Disable provider mocking for this test
@patch.dict(os.environ, {"OPENAI_ALLOWED_MODELS": "o3-pro", "LOCALE": ""})
async def test_o3_pro_uses_output_text_field(self, monkeypatch):
"""Test that o3-pro parsing uses the output_text convenience field via ChatTool."""
cassette_path = cassette_dir / "o3_pro_basic_math.json"
# Check if we need to record or replay
if not cassette_path.exists():
# Recording mode - check for real API key
real_api_key = os.getenv("OPENAI_API_KEY", "").strip()
if not real_api_key or real_api_key.startswith("dummy"):
pytest.fail(
f"Cassette file not found at {cassette_path}. "
"To record: Set OPENAI_API_KEY environment variable to a valid key and run this test. "
"Note: Recording will make a real API call to OpenAI."
)
# Real API key is available, we'll record the cassette
logger.debug("🎬 Recording mode: Using real API key to record cassette")
else:
# Replay mode - use dummy key
monkeypatch.setenv("OPENAI_API_KEY", "dummy-key-for-replay")
logger.debug("📼 Replay mode: Using recorded cassette")
# Simplified transport injection - just one line!
inject_transport(monkeypatch, cassette_path)
# Execute ChatTool test with custom transport
result = await self._execute_chat_tool_test()
# Verify the response works correctly
self._verify_chat_tool_response(result)
# Verify cassette exists
assert cassette_path.exists()
async def _execute_chat_tool_test(self):
"""Execute the ChatTool with o3-pro and return the result."""
chat_tool = ChatTool()
arguments = {"prompt": "What is 2 + 2?", "model": "o3-pro", "temperature": 1.0}
return await chat_tool.execute(arguments)
def _verify_chat_tool_response(self, result):
"""Verify the ChatTool response contains expected data."""
# Basic response validation
assert result is not None
assert isinstance(result, list)
assert len(result) > 0
assert result[0].type == "text"
# Parse JSON response
import json
response_data = json.loads(result[0].text)
# Debug log the response
logger.debug(f"Response data: {json.dumps(response_data, indent=2)}")
# Verify response structure - no cargo culting
if response_data["status"] == "error":
pytest.fail(f"Chat tool returned error: {response_data.get('error', 'Unknown error')}")
assert response_data["status"] in ["success", "continuation_available"]
assert "4" in response_data["content"]
# Verify o3-pro was actually used
metadata = response_data["metadata"]
assert metadata["model_used"] == "o3-pro"
assert metadata["provider_used"] == "openai"

View File

@@ -230,7 +230,7 @@ class TestO3TemperatureParameterFixSimple:
assert temp_constraint.validate(0.5) is False
# Test regular model constraints - use gpt-4.1 which is supported
gpt41_capabilities = provider.get_capabilities("gpt-4.1-2025-04-14")
gpt41_capabilities = provider.get_capabilities("gpt-4.1")
assert gpt41_capabilities.temperature_constraint is not None
# Regular models should allow a range

View File

@@ -48,12 +48,17 @@ class TestOpenAIProvider:
assert provider.validate_model_name("o3-pro") is True
assert provider.validate_model_name("o4-mini") is True
assert provider.validate_model_name("o4-mini") is True
assert provider.validate_model_name("gpt-5") is True
assert provider.validate_model_name("gpt-5-mini") is True
# Test valid aliases
assert provider.validate_model_name("mini") is True
assert provider.validate_model_name("o3mini") is True
assert provider.validate_model_name("o4mini") is True
assert provider.validate_model_name("o4mini") is True
assert provider.validate_model_name("gpt5") is True
assert provider.validate_model_name("gpt5-mini") is True
assert provider.validate_model_name("gpt5mini") is True
# Test invalid model
assert provider.validate_model_name("invalid-model") is False
@@ -65,17 +70,22 @@ class TestOpenAIProvider:
provider = OpenAIModelProvider("test-key")
# Test shorthand resolution
assert provider._resolve_model_name("mini") == "o4-mini"
assert provider._resolve_model_name("mini") == "gpt-5-mini" # "mini" now resolves to gpt-5-mini
assert provider._resolve_model_name("o3mini") == "o3-mini"
assert provider._resolve_model_name("o4mini") == "o4-mini"
assert provider._resolve_model_name("o4mini") == "o4-mini"
assert provider._resolve_model_name("gpt5") == "gpt-5"
assert provider._resolve_model_name("gpt5-mini") == "gpt-5-mini"
assert provider._resolve_model_name("gpt5mini") == "gpt-5-mini"
# Test full name passthrough
assert provider._resolve_model_name("o3") == "o3"
assert provider._resolve_model_name("o3-mini") == "o3-mini"
assert provider._resolve_model_name("o3-pro") == "o3-pro-2025-06-10"
assert provider._resolve_model_name("o3-pro") == "o3-pro"
assert provider._resolve_model_name("o4-mini") == "o4-mini"
assert provider._resolve_model_name("o4-mini") == "o4-mini"
assert provider._resolve_model_name("gpt-5") == "gpt-5"
assert provider._resolve_model_name("gpt-5-mini") == "gpt-5-mini"
def test_get_capabilities_o3(self):
"""Test getting model capabilities for O3."""
@@ -99,11 +109,43 @@ class TestOpenAIProvider:
provider = OpenAIModelProvider("test-key")
capabilities = provider.get_capabilities("mini")
assert capabilities.model_name == "o4-mini" # Capabilities should show resolved model name
assert capabilities.friendly_name == "OpenAI (O4-mini)"
assert capabilities.context_window == 200_000
assert capabilities.model_name == "gpt-5-mini" # "mini" now resolves to gpt-5-mini
assert capabilities.friendly_name == "OpenAI (GPT-5-mini)"
assert capabilities.context_window == 400_000
assert capabilities.provider == ProviderType.OPENAI
def test_get_capabilities_gpt5(self):
"""Test getting model capabilities for GPT-5."""
provider = OpenAIModelProvider("test-key")
capabilities = provider.get_capabilities("gpt-5")
assert capabilities.model_name == "gpt-5"
assert capabilities.friendly_name == "OpenAI (GPT-5)"
assert capabilities.context_window == 400_000
assert capabilities.max_output_tokens == 128_000
assert capabilities.provider == ProviderType.OPENAI
assert capabilities.supports_extended_thinking is True
assert capabilities.supports_system_prompts is True
assert capabilities.supports_streaming is True
assert capabilities.supports_function_calling is True
assert capabilities.supports_temperature is True
def test_get_capabilities_gpt5_mini(self):
"""Test getting model capabilities for GPT-5-mini."""
provider = OpenAIModelProvider("test-key")
capabilities = provider.get_capabilities("gpt-5-mini")
assert capabilities.model_name == "gpt-5-mini"
assert capabilities.friendly_name == "OpenAI (GPT-5-mini)"
assert capabilities.context_window == 400_000
assert capabilities.max_output_tokens == 128_000
assert capabilities.provider == ProviderType.OPENAI
assert capabilities.supports_extended_thinking is True
assert capabilities.supports_system_prompts is True
assert capabilities.supports_streaming is True
assert capabilities.supports_function_calling is True
assert capabilities.supports_temperature is True
@patch("providers.openai_compatible.OpenAI")
def test_generate_content_resolves_alias_before_api_call(self, mock_openai_class):
"""Test that generate_content resolves aliases before making API calls.
@@ -132,21 +174,19 @@ class TestOpenAIProvider:
provider = OpenAIModelProvider("test-key")
# Call generate_content with alias 'gpt4.1' (resolves to gpt-4.1-2025-04-14, supports temperature)
# Call generate_content with alias 'gpt4.1' (resolves to gpt-4.1, supports temperature)
result = provider.generate_content(
prompt="Test prompt",
model_name="gpt4.1",
temperature=1.0, # This should be resolved to "gpt-4.1-2025-04-14"
temperature=1.0, # This should be resolved to "gpt-4.1"
)
# Verify the API was called with the RESOLVED model name
mock_client.chat.completions.create.assert_called_once()
call_kwargs = mock_client.chat.completions.create.call_args[1]
# CRITICAL ASSERTION: The API should receive "gpt-4.1-2025-04-14", not "gpt4.1"
assert (
call_kwargs["model"] == "gpt-4.1-2025-04-14"
), f"Expected 'gpt-4.1-2025-04-14' but API received '{call_kwargs['model']}'"
# CRITICAL ASSERTION: The API should receive "gpt-4.1", not "gpt4.1"
assert call_kwargs["model"] == "gpt-4.1", f"Expected 'gpt-4.1' but API received '{call_kwargs['model']}'"
# Verify other parameters (gpt-4.1 supports temperature unlike O3/O4 models)
assert call_kwargs["temperature"] == 1.0
@@ -156,7 +196,7 @@ class TestOpenAIProvider:
# Verify response
assert result.content == "Test response"
assert result.model_name == "gpt-4.1-2025-04-14" # Should be the resolved name
assert result.model_name == "gpt-4.1" # Should be the resolved name
@patch("providers.openai_compatible.OpenAI")
def test_generate_content_other_aliases(self, mock_openai_class):
@@ -213,14 +253,22 @@ class TestOpenAIProvider:
assert call_kwargs["model"] == "o3-mini" # Should be unchanged
def test_supports_thinking_mode(self):
"""Test thinking mode support (currently False for all OpenAI models)."""
"""Test thinking mode support."""
provider = OpenAIModelProvider("test-key")
# All OpenAI models currently don't support thinking mode
# GPT-5 models support thinking mode (reasoning tokens)
assert provider.supports_thinking_mode("gpt-5") is True
assert provider.supports_thinking_mode("gpt-5-mini") is True
assert provider.supports_thinking_mode("gpt5") is True # Test with alias
assert provider.supports_thinking_mode("gpt5mini") is True # Test with alias
# O3/O4 models don't support thinking mode
assert provider.supports_thinking_mode("o3") is False
assert provider.supports_thinking_mode("o3-mini") is False
assert provider.supports_thinking_mode("o4-mini") is False
assert provider.supports_thinking_mode("mini") is False # Test with alias too
assert (
provider.supports_thinking_mode("mini") is True
) # "mini" now resolves to gpt-5-mini which supports thinking
@patch("providers.openai_compatible.OpenAI")
def test_o3_pro_routes_to_responses_endpoint(self, mock_openai_class):
@@ -230,11 +278,9 @@ class TestOpenAIProvider:
mock_openai_class.return_value = mock_client
mock_response = MagicMock()
mock_response.output = MagicMock()
mock_response.output.content = [MagicMock()]
mock_response.output.content[0].type = "output_text"
mock_response.output.content[0].text = "4"
mock_response.model = "o3-pro-2025-06-10"
# New o3-pro format: direct output_text field
mock_response.output_text = "4"
mock_response.model = "o3-pro"
mock_response.id = "test-id"
mock_response.created_at = 1234567890
mock_response.usage = MagicMock()
@@ -252,13 +298,13 @@ class TestOpenAIProvider:
# Verify responses.create was called
mock_client.responses.create.assert_called_once()
call_args = mock_client.responses.create.call_args[1]
assert call_args["model"] == "o3-pro-2025-06-10"
assert call_args["model"] == "o3-pro"
assert call_args["input"][0]["role"] == "user"
assert "What is 2 + 2?" in call_args["input"][0]["content"][0]["text"]
# Verify the response
assert result.content == "4"
assert result.model_name == "o3-pro-2025-06-10"
assert result.model_name == "o3-pro"
assert result.metadata["endpoint"] == "responses"
@patch("providers.openai_compatible.OpenAI")

View File

@@ -3,6 +3,7 @@ Test per-tool model default selection functionality
"""
import json
import os
from unittest.mock import MagicMock, patch
import pytest
@@ -73,154 +74,194 @@ class TestToolModelCategories:
class TestModelSelection:
"""Test model selection based on tool categories."""
def teardown_method(self):
"""Clean up after each test to prevent state pollution."""
ModelProviderRegistry.clear_cache()
# Unregister all providers
for provider_type in list(ProviderType):
ModelProviderRegistry.unregister_provider(provider_type)
def test_extended_reasoning_with_openai(self):
"""Test EXTENDED_REASONING prefers o3 when OpenAI is available."""
with patch.object(ModelProviderRegistry, "get_available_models") as mock_get_available:
# Mock OpenAI models available
mock_get_available.return_value = {
"o3": ProviderType.OPENAI,
"o3-mini": ProviderType.OPENAI,
"o4-mini": ProviderType.OPENAI,
}
"""Test EXTENDED_REASONING with OpenAI provider."""
# Setup with only OpenAI provider
ModelProviderRegistry.clear_cache()
# First unregister all providers to ensure isolation
for provider_type in list(ProviderType):
ModelProviderRegistry.unregister_provider(provider_type)
with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}, clear=False):
from providers.openai_provider import OpenAIModelProvider
ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.EXTENDED_REASONING)
# OpenAI prefers o3 for extended reasoning
assert model == "o3"
def test_extended_reasoning_with_gemini_only(self):
"""Test EXTENDED_REASONING prefers pro when only Gemini is available."""
with patch.object(ModelProviderRegistry, "get_available_models") as mock_get_available:
# Mock only Gemini models available
mock_get_available.return_value = {
"gemini-2.5-pro": ProviderType.GOOGLE,
"gemini-2.5-flash": ProviderType.GOOGLE,
}
# Clear cache and unregister all providers first
ModelProviderRegistry.clear_cache()
for provider_type in list(ProviderType):
ModelProviderRegistry.unregister_provider(provider_type)
# Register only Gemini provider
with patch.dict(os.environ, {"GOOGLE_API_KEY": "test-key"}, clear=False):
from providers.gemini import GeminiModelProvider
ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.EXTENDED_REASONING)
# Should find the pro model for extended reasoning
assert "pro" in model or model == "gemini-2.5-pro"
# Gemini should return one of its models for extended reasoning
# The default behavior may return flash when pro is not explicitly preferred
assert model in ["gemini-2.5-pro", "gemini-2.5-flash", "gemini-2.0-flash"]
def test_fast_response_with_openai(self):
"""Test FAST_RESPONSE prefers o4-mini when OpenAI is available."""
with patch.object(ModelProviderRegistry, "get_available_models") as mock_get_available:
# Mock OpenAI models available
mock_get_available.return_value = {
"o3": ProviderType.OPENAI,
"o3-mini": ProviderType.OPENAI,
"o4-mini": ProviderType.OPENAI,
}
"""Test FAST_RESPONSE with OpenAI provider."""
# Setup with only OpenAI provider
ModelProviderRegistry.clear_cache()
# First unregister all providers to ensure isolation
for provider_type in list(ProviderType):
ModelProviderRegistry.unregister_provider(provider_type)
with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}, clear=False):
from providers.openai_provider import OpenAIModelProvider
ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.FAST_RESPONSE)
assert model == "o4-mini"
# OpenAI now prefers gpt-5 for fast response (based on our new preference order)
assert model == "gpt-5"
def test_fast_response_with_gemini_only(self):
"""Test FAST_RESPONSE prefers flash when only Gemini is available."""
with patch.object(ModelProviderRegistry, "get_available_models") as mock_get_available:
# Mock only Gemini models available
mock_get_available.return_value = {
"gemini-2.5-pro": ProviderType.GOOGLE,
"gemini-2.5-flash": ProviderType.GOOGLE,
}
# Clear cache and unregister all providers first
ModelProviderRegistry.clear_cache()
for provider_type in list(ProviderType):
ModelProviderRegistry.unregister_provider(provider_type)
# Register only Gemini provider
with patch.dict(os.environ, {"GOOGLE_API_KEY": "test-key"}, clear=False):
from providers.gemini import GeminiModelProvider
ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.FAST_RESPONSE)
# Should find the flash model for fast response
assert "flash" in model or model == "gemini-2.5-flash"
# Gemini should return one of its models for fast response
assert model in ["gemini-2.5-flash", "gemini-2.0-flash", "gemini-2.5-pro"]
def test_balanced_category_fallback(self):
"""Test BALANCED category uses existing logic."""
with patch.object(ModelProviderRegistry, "get_available_models") as mock_get_available:
# Mock OpenAI models available
mock_get_available.return_value = {
"o3": ProviderType.OPENAI,
"o3-mini": ProviderType.OPENAI,
"o4-mini": ProviderType.OPENAI,
}
# Setup with only OpenAI provider
ModelProviderRegistry.clear_cache()
# First unregister all providers to ensure isolation
for provider_type in list(ProviderType):
ModelProviderRegistry.unregister_provider(provider_type)
with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}, clear=False):
from providers.openai_provider import OpenAIModelProvider
ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.BALANCED)
assert model == "o4-mini" # Balanced prefers o4-mini when OpenAI available
# OpenAI prefers gpt-5 for balanced (based on our new preference order)
assert model == "gpt-5"
def test_no_category_uses_balanced_logic(self):
"""Test that no category specified uses balanced logic."""
with patch.object(ModelProviderRegistry, "get_available_models") as mock_get_available:
# Mock only Gemini models available
mock_get_available.return_value = {
"gemini-2.5-pro": ProviderType.GOOGLE,
"gemini-2.5-flash": ProviderType.GOOGLE,
}
# Setup with only Gemini provider
with patch.dict(os.environ, {"GEMINI_API_KEY": "test-key"}, clear=False):
from providers.gemini import GeminiModelProvider
ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
model = ModelProviderRegistry.get_preferred_fallback_model()
# Should pick a reasonable default, preferring flash for balanced use
assert "flash" in model or model == "gemini-2.5-flash"
# Should pick flash for balanced use
assert model == "gemini-2.5-flash"
class TestFlexibleModelSelection:
"""Test that model selection handles various naming scenarios."""
def test_fallback_handles_mixed_model_names(self):
"""Test that fallback selection works with mix of full names and shorthands."""
# Test with mix of full names and shorthands
"""Test that fallback selection works with different providers."""
# Test with different provider configurations
test_cases = [
# Case 1: Mix of OpenAI shorthands and full names
# Case 1: OpenAI provider for extended reasoning
{
"available": {"o3": ProviderType.OPENAI, "o4-mini": ProviderType.OPENAI},
"env": {"OPENAI_API_KEY": "test-key"},
"provider_type": ProviderType.OPENAI,
"category": ToolModelCategory.EXTENDED_REASONING,
"expected": "o3",
},
# Case 2: Mix of Gemini shorthands and full names
# Case 2: Gemini provider for fast response
{
"available": {
"gemini-2.5-flash": ProviderType.GOOGLE,
"gemini-2.5-pro": ProviderType.GOOGLE,
},
"env": {"GEMINI_API_KEY": "test-key"},
"provider_type": ProviderType.GOOGLE,
"category": ToolModelCategory.FAST_RESPONSE,
"expected_contains": "flash",
"expected": "gemini-2.5-flash",
},
# Case 3: Only shorthands available
# Case 3: OpenAI provider for fast response
{
"available": {"o4-mini": ProviderType.OPENAI, "o3-mini": ProviderType.OPENAI},
"env": {"OPENAI_API_KEY": "test-key"},
"provider_type": ProviderType.OPENAI,
"category": ToolModelCategory.FAST_RESPONSE,
"expected": "o4-mini",
"expected": "gpt-5", # Based on new preference order
},
]
for case in test_cases:
with patch.object(ModelProviderRegistry, "get_available_models") as mock_get_available:
mock_get_available.return_value = case["available"]
# Clear registry for clean test
ModelProviderRegistry.clear_cache()
# First unregister all providers to ensure isolation
for provider_type in list(ProviderType):
ModelProviderRegistry.unregister_provider(provider_type)
with patch.dict(os.environ, case["env"], clear=False):
# Register the appropriate provider
if case["provider_type"] == ProviderType.OPENAI:
from providers.openai_provider import OpenAIModelProvider
ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
elif case["provider_type"] == ProviderType.GOOGLE:
from providers.gemini import GeminiModelProvider
ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
model = ModelProviderRegistry.get_preferred_fallback_model(case["category"])
if "expected" in case:
assert model == case["expected"], f"Failed for case: {case}"
elif "expected_contains" in case:
assert (
case["expected_contains"] in model
), f"Expected '{case['expected_contains']}' in '{model}' for case: {case}"
assert model == case["expected"], f"Failed for case: {case}, got {model}"
class TestCustomProviderFallback:
"""Test fallback to custom/openrouter providers."""
@patch.object(ModelProviderRegistry, "_find_extended_thinking_model")
def test_extended_reasoning_custom_fallback(self, mock_find_thinking):
"""Test EXTENDED_REASONING falls back to custom thinking model."""
with patch.object(ModelProviderRegistry, "get_available_models") as mock_get_available:
# No native models available, but OpenRouter is available
mock_get_available.return_value = {"openrouter-model": ProviderType.OPENROUTER}
mock_find_thinking.return_value = "custom/thinking-model"
def test_extended_reasoning_custom_fallback(self):
"""Test EXTENDED_REASONING with custom provider."""
# Setup with custom provider
ModelProviderRegistry.clear_cache()
with patch.dict(os.environ, {"CUSTOM_API_URL": "http://localhost:11434", "CUSTOM_API_KEY": ""}, clear=False):
from providers.custom import CustomProvider
ModelProviderRegistry.register_provider(ProviderType.CUSTOM, CustomProvider)
provider = ModelProviderRegistry.get_provider(ProviderType.CUSTOM)
if provider:
model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.EXTENDED_REASONING)
# Should get a model from custom provider
assert model is not None
def test_extended_reasoning_final_fallback(self):
"""Test EXTENDED_REASONING falls back to default when no providers."""
# Clear all providers
ModelProviderRegistry.clear_cache()
for provider_type in list(
ModelProviderRegistry._instance._providers.keys() if ModelProviderRegistry._instance else []
):
ModelProviderRegistry.unregister_provider(provider_type)
model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.EXTENDED_REASONING)
assert model == "custom/thinking-model"
mock_find_thinking.assert_called_once()
@patch.object(ModelProviderRegistry, "_find_extended_thinking_model")
def test_extended_reasoning_final_fallback(self, mock_find_thinking):
"""Test EXTENDED_REASONING falls back to pro when no custom found."""
with patch.object(ModelProviderRegistry, "get_provider") as mock_get_provider:
# No providers available
mock_get_provider.return_value = None
mock_find_thinking.return_value = None
model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.EXTENDED_REASONING)
assert model == "gemini-2.5-pro"
# Should fall back to hardcoded default
assert model == "gemini-2.5-flash"
class TestAutoModeErrorMessages:
@@ -266,42 +307,45 @@ class TestAutoModeErrorMessages:
class TestProviderHelperMethods:
"""Test the helper methods for finding models from custom/openrouter."""
def test_find_extended_thinking_model_custom(self):
"""Test finding thinking model from custom provider."""
with patch.object(ModelProviderRegistry, "get_provider") as mock_get_provider:
def test_extended_reasoning_with_custom_provider(self):
"""Test extended reasoning model selection with custom provider."""
# Setup with custom provider
with patch.dict(os.environ, {"CUSTOM_API_URL": "http://localhost:11434", "CUSTOM_API_KEY": ""}, clear=False):
from providers.custom import CustomProvider
# Mock custom provider with thinking model
mock_custom = MagicMock(spec=CustomProvider)
mock_custom.model_registry = {
"model1": {"supports_extended_thinking": False},
"model2": {"supports_extended_thinking": True},
"model3": {"supports_extended_thinking": False},
}
mock_get_provider.side_effect = lambda ptype: mock_custom if ptype == ProviderType.CUSTOM else None
ModelProviderRegistry.register_provider(ProviderType.CUSTOM, CustomProvider)
model = ModelProviderRegistry._find_extended_thinking_model()
assert model == "model2"
provider = ModelProviderRegistry.get_provider(ProviderType.CUSTOM)
if provider:
# Custom provider should return a model for extended reasoning
model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.EXTENDED_REASONING)
assert model is not None
def test_find_extended_thinking_model_openrouter(self):
"""Test finding thinking model from openrouter."""
with patch.object(ModelProviderRegistry, "get_provider") as mock_get_provider:
# Mock openrouter provider
mock_openrouter = MagicMock()
mock_openrouter.validate_model_name.side_effect = lambda m: m == "anthropic/claude-sonnet-4"
mock_get_provider.side_effect = lambda ptype: mock_openrouter if ptype == ProviderType.OPENROUTER else None
def test_extended_reasoning_with_openrouter(self):
"""Test extended reasoning model selection with OpenRouter."""
# Setup with OpenRouter provider
with patch.dict(os.environ, {"OPENROUTER_API_KEY": "test-key"}, clear=False):
from providers.openrouter import OpenRouterProvider
model = ModelProviderRegistry._find_extended_thinking_model()
assert model == "anthropic/claude-sonnet-4"
ModelProviderRegistry.register_provider(ProviderType.OPENROUTER, OpenRouterProvider)
def test_find_extended_thinking_model_none_found(self):
"""Test when no thinking model is found."""
with patch.object(ModelProviderRegistry, "get_provider") as mock_get_provider:
# No providers available
mock_get_provider.return_value = None
# OpenRouter should provide a model for extended reasoning
model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.EXTENDED_REASONING)
# Should return first available OpenRouter model
assert model is not None
model = ModelProviderRegistry._find_extended_thinking_model()
assert model is None
def test_fallback_when_no_providers_available(self):
"""Test fallback when no providers are available."""
# Clear all providers
ModelProviderRegistry.clear_cache()
for provider_type in list(
ModelProviderRegistry._instance._providers.keys() if ModelProviderRegistry._instance else []
):
ModelProviderRegistry.unregister_provider(provider_type)
# Should return hardcoded fallback
model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.EXTENDED_REASONING)
assert model == "gemini-2.5-flash"
class TestEffectiveAutoMode:

143
tests/test_pii_sanitizer.py Normal file
View File

@@ -0,0 +1,143 @@
#!/usr/bin/env python3
"""Test cases for PII sanitizer."""
import unittest
from .pii_sanitizer import PIIPattern, PIISanitizer
class TestPIISanitizer(unittest.TestCase):
"""Test PII sanitization functionality."""
def setUp(self):
"""Set up test sanitizer."""
self.sanitizer = PIISanitizer()
def test_api_key_sanitization(self):
"""Test various API key formats are sanitized."""
test_cases = [
# OpenAI keys
("sk-proj-abcd1234567890ABCD1234567890abcd1234567890ABCD12", "sk-proj-SANITIZED"),
("sk-1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN", "sk-SANITIZED"),
# Anthropic keys
("sk-ant-abcd1234567890ABCD1234567890abcd1234567890ABCD12", "sk-ant-SANITIZED"),
# Google keys
("AIzaSyD-1234567890abcdefghijklmnopqrstuv", "AIza-SANITIZED"),
# GitHub tokens
("ghp_1234567890abcdefghijklmnopqrstuvwxyz", "gh_SANITIZED"),
("ghs_1234567890abcdefghijklmnopqrstuvwxyz", "gh_SANITIZED"),
]
for original, expected in test_cases:
with self.subTest(original=original):
result = self.sanitizer.sanitize_string(original)
self.assertEqual(result, expected)
def test_personal_info_sanitization(self):
"""Test personal information is sanitized."""
test_cases = [
# Email addresses
("john.doe@example.com", "user@example.com"),
("test123@company.org", "user@example.com"),
# Phone numbers (all now use the same pattern)
("(555) 123-4567", "(XXX) XXX-XXXX"),
("555-123-4567", "(XXX) XXX-XXXX"),
("+1-555-123-4567", "(XXX) XXX-XXXX"),
# SSN
("123-45-6789", "XXX-XX-XXXX"),
# Credit card
("1234 5678 9012 3456", "XXXX-XXXX-XXXX-XXXX"),
("1234-5678-9012-3456", "XXXX-XXXX-XXXX-XXXX"),
]
for original, expected in test_cases:
with self.subTest(original=original):
result = self.sanitizer.sanitize_string(original)
self.assertEqual(result, expected)
def test_header_sanitization(self):
"""Test HTTP header sanitization."""
headers = {
"Authorization": "Bearer sk-proj-abcd1234567890ABCD1234567890abcd1234567890ABCD12",
"API-Key": "sk-1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN",
"Content-Type": "application/json",
"User-Agent": "MyApp/1.0",
"Cookie": "session=abc123; user=john.doe@example.com",
}
sanitized = self.sanitizer.sanitize_headers(headers)
self.assertEqual(sanitized["Authorization"], "Bearer SANITIZED")
self.assertEqual(sanitized["API-Key"], "sk-SANITIZED")
self.assertEqual(sanitized["Content-Type"], "application/json")
self.assertEqual(sanitized["User-Agent"], "MyApp/1.0")
self.assertIn("user@example.com", sanitized["Cookie"])
def test_nested_structure_sanitization(self):
"""Test sanitization of nested data structures."""
data = {
"user": {
"email": "john.doe@example.com",
"api_key": "sk-proj-abcd1234567890ABCD1234567890abcd1234567890ABCD12",
},
"tokens": [
"ghp_1234567890abcdefghijklmnopqrstuvwxyz",
"Bearer sk-ant-abcd1234567890ABCD1234567890abcd1234567890ABCD12",
],
"metadata": {"ip": "192.168.1.100", "phone": "(555) 123-4567"},
}
sanitized = self.sanitizer.sanitize_value(data)
self.assertEqual(sanitized["user"]["email"], "user@example.com")
self.assertEqual(sanitized["user"]["api_key"], "sk-proj-SANITIZED")
self.assertEqual(sanitized["tokens"][0], "gh_SANITIZED")
self.assertEqual(sanitized["tokens"][1], "Bearer sk-ant-SANITIZED")
self.assertEqual(sanitized["metadata"]["ip"], "0.0.0.0")
self.assertEqual(sanitized["metadata"]["phone"], "(XXX) XXX-XXXX")
def test_url_sanitization(self):
"""Test URL parameter sanitization."""
urls = [
(
"https://api.example.com/v1/users?api_key=sk-1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN",
"https://api.example.com/v1/users?api_key=SANITIZED",
),
(
"https://example.com/login?token=ghp_1234567890abcdefghijklmnopqrstuvwxyz&user=test",
"https://example.com/login?token=SANITIZED&user=test",
),
]
for original, expected in urls:
with self.subTest(url=original):
result = self.sanitizer.sanitize_url(original)
self.assertEqual(result, expected)
def test_disable_sanitization(self):
"""Test that sanitization can be disabled."""
self.sanitizer.sanitize_enabled = False
sensitive_data = "sk-proj-abcd1234567890ABCD1234567890abcd1234567890ABCD12"
result = self.sanitizer.sanitize_string(sensitive_data)
# Should return original when disabled
self.assertEqual(result, sensitive_data)
def test_custom_pattern(self):
"""Test adding custom PII patterns."""
# Add custom pattern for internal employee IDs
custom_pattern = PIIPattern.create(
name="employee_id", pattern=r"EMP\d{6}", replacement="EMP-REDACTED", description="Internal employee IDs"
)
self.sanitizer.add_pattern(custom_pattern)
text = "Employee EMP123456 has access to the system"
result = self.sanitizer.sanitize_string(text)
self.assertEqual(result, "Employee EMP-REDACTED has access to the system")
if __name__ == "__main__":
unittest.main()
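For orientation, the API these tests exercise can be satisfied by a small regex-driven sanitizer. A minimal sketch, assuming patterns are registered explicitly (the real pii_sanitizer module also ships default patterns plus header/URL/nested-value handling, elided here):

import re
from dataclasses import dataclass

@dataclass(frozen=True)
class PIIPattern:
    name: str
    regex: re.Pattern
    replacement: str
    description: str = ""

    @classmethod
    def create(cls, name: str, pattern: str, replacement: str, description: str = "") -> "PIIPattern":
        return cls(name, re.compile(pattern), replacement, description)

class PIISanitizer:
    def __init__(self):
        self.patterns: list[PIIPattern] = []
        self.sanitize_enabled: bool = True

    def add_pattern(self, pattern: PIIPattern) -> None:
        self.patterns.append(pattern)

    def sanitize_string(self, text: str) -> str:
        if not self.sanitize_enabled:
            return text  # disabled: return input unchanged, as the test expects
        for p in self.patterns:
            text = p.regex.sub(p.replacement, text)
        return text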

View File

@@ -126,7 +126,7 @@ class TestProviderUTF8Encoding(unittest.TestCase):
mock_response.usage = Mock()
mock_response.usage.input_tokens = 50
mock_response.usage.output_tokens = 25
mock_response.model = "o3-pro-2025-06-10"
mock_response.model = "o3-pro"
mock_response.id = "test-id"
mock_response.created_at = 1234567890
@@ -141,7 +141,7 @@ class TestProviderUTF8Encoding(unittest.TestCase):
with patch("logging.info") as mock_logging:
response = provider.generate_content(
prompt="Analyze this Python code for issues",
model_name="o3-pro-2025-06-10",
model_name="o3-pro",
system_prompt="You are a code review expert.",
)
@@ -351,7 +351,7 @@ class TestLocaleModelIntegration(unittest.TestCase):
def test_model_name_resolution_utf8(self):
"""Test model name resolution with UTF-8."""
provider = OpenAIModelProvider(api_key="test")
model_names = ["gpt-4", "gemini-2.5-flash", "claude-3-opus", "o3-pro-2025-06-10"]
model_names = ["gpt-4", "gemini-2.5-flash", "claude-3-opus", "o3-pro"]
for model_name in model_names:
resolved = provider._resolve_model_name(model_name)
self.assertIsInstance(resolved, str)

View File

@@ -47,22 +47,23 @@ class TestSupportedModelsAliases:
assert isinstance(config.aliases, list), f"{model_name} aliases must be a list"
# Test specific aliases
assert "mini" in provider.SUPPORTED_MODELS["o4-mini"].aliases
# "mini" is now an alias for gpt-5-mini, not o4-mini
assert "mini" in provider.SUPPORTED_MODELS["gpt-5-mini"].aliases
assert "o4mini" in provider.SUPPORTED_MODELS["o4-mini"].aliases
assert "o4-mini" in provider.SUPPORTED_MODELS["o4-mini"].aliases
assert "o3mini" in provider.SUPPORTED_MODELS["o3-mini"].aliases
assert "o3-pro" in provider.SUPPORTED_MODELS["o3-pro-2025-06-10"].aliases
assert "o4mini" in provider.SUPPORTED_MODELS["o4-mini"].aliases
assert "gpt4.1" in provider.SUPPORTED_MODELS["gpt-4.1-2025-04-14"].aliases
assert "o3-pro" in provider.SUPPORTED_MODELS["o3-pro"].aliases
assert "gpt4.1" in provider.SUPPORTED_MODELS["gpt-4.1"].aliases
# Test alias resolution
assert provider._resolve_model_name("mini") == "o4-mini"
assert provider._resolve_model_name("mini") == "gpt-5-mini" # mini -> gpt-5-mini now
assert provider._resolve_model_name("o3mini") == "o3-mini"
assert provider._resolve_model_name("o3-pro") == "o3-pro-2025-06-10"
assert provider._resolve_model_name("o3-pro") == "o3-pro" # o3-pro is already the base model name
assert provider._resolve_model_name("o4mini") == "o4-mini"
assert provider._resolve_model_name("gpt4.1") == "gpt-4.1-2025-04-14"
assert provider._resolve_model_name("gpt4.1") == "gpt-4.1" # gpt4.1 resolves to gpt-4.1
# Test case insensitive resolution
assert provider._resolve_model_name("Mini") == "o4-mini"
assert provider._resolve_model_name("Mini") == "gpt-5-mini" # mini -> gpt-5-mini now
assert provider._resolve_model_name("O3MINI") == "o3-mini"
def test_xai_provider_aliases(self):
@@ -75,25 +76,21 @@ class TestSupportedModelsAliases:
assert isinstance(config.aliases, list), f"{model_name} aliases must be a list"
# Test specific aliases
assert "grok" in provider.SUPPORTED_MODELS["grok-4-0709"].aliases
assert "grok-4" in provider.SUPPORTED_MODELS["grok-4-0709"].aliases
assert "grok-4-latest" in provider.SUPPORTED_MODELS["grok-4-0709"].aliases
assert "grok4" in provider.SUPPORTED_MODELS["grok-4-0709"].aliases
assert "grok" in provider.SUPPORTED_MODELS["grok-4"].aliases
assert "grok4" in provider.SUPPORTED_MODELS["grok-4"].aliases
assert "grok3" in provider.SUPPORTED_MODELS["grok-3"].aliases
assert "grok3fast" in provider.SUPPORTED_MODELS["grok-3-fast"].aliases
assert "grokfast" in provider.SUPPORTED_MODELS["grok-3-fast"].aliases
# Test alias resolution
assert provider._resolve_model_name("grok") == "grok-4-0709"
assert provider._resolve_model_name("grok4") == "grok-4-0709"
assert provider._resolve_model_name("grok-4") == "grok-4-0709"
assert provider._resolve_model_name("grok") == "grok-4"
assert provider._resolve_model_name("grok4") == "grok-4"
assert provider._resolve_model_name("grok3") == "grok-3"
assert provider._resolve_model_name("grok3fast") == "grok-3-fast"
assert provider._resolve_model_name("grokfast") == "grok-3-fast"
# Test case insensitive resolution
assert provider._resolve_model_name("Grok") == "grok-4-0709"
assert provider._resolve_model_name("GROK4") == "grok-4-0709"
assert provider._resolve_model_name("Grok") == "grok-4"
assert provider._resolve_model_name("GROKFAST") == "grok-3-fast"
def test_dial_provider_aliases(self):

View File

@@ -66,10 +66,8 @@ class TestXAIProvider:
provider = XAIModelProvider("test-key")
# Test shorthand resolution
assert provider._resolve_model_name("grok") == "grok-4-0709"
assert provider._resolve_model_name("grok4") == "grok-4-0709"
assert provider._resolve_model_name("grok-4") == "grok-4-0709"
assert provider._resolve_model_name("grok-4-latest") == "grok-4-0709"
assert provider._resolve_model_name("grok") == "grok-4"
assert provider._resolve_model_name("grok4") == "grok-4"
assert provider._resolve_model_name("grok3") == "grok-3"
assert provider._resolve_model_name("grokfast") == "grok-3-fast"
assert provider._resolve_model_name("grok3fast") == "grok-3-fast"
@@ -96,7 +94,7 @@ class TestXAIProvider:
# Test temperature range
assert capabilities.temperature_constraint.min_temp == 0.0
assert capabilities.temperature_constraint.max_temp == 2.0
assert capabilities.temperature_constraint.default_temp == 0.7
assert capabilities.temperature_constraint.default_temp == 0.3
def test_get_capabilities_grok4(self):
"""Test getting model capabilities for GROK-4."""
@@ -135,13 +133,9 @@ class TestXAIProvider:
provider = XAIModelProvider("test-key")
capabilities = provider.get_capabilities("grok")
assert capabilities.model_name == "grok-4-0709" # Should resolve to full name
assert capabilities.model_name == "grok-4" # Should resolve to full name
assert capabilities.context_window == 256_000
capabilities_3 = provider.get_capabilities("grok3")
assert capabilities_3.model_name == "grok-3" # Should resolve to full name
assert capabilities_3.context_window == 131_072
capabilities_fast = provider.get_capabilities("grokfast")
assert capabilities_fast.model_name == "grok-3-fast" # Should resolve to full name
@@ -164,7 +158,9 @@ class TestXAIProvider:
# Grok-3 models don't support thinking mode
assert not provider.supports_thinking_mode("grok-3")
assert not provider.supports_thinking_mode("grok-3-fast")
assert not provider.supports_thinking_mode("grok3")
assert provider.supports_thinking_mode("grok-4") # grok-4 supports thinking mode
assert provider.supports_thinking_mode("grok") # resolves to grok-4
assert provider.supports_thinking_mode("grok4") # resolves to grok-4
assert not provider.supports_thinking_mode("grokfast")
def test_provider_type(self):
@@ -186,9 +182,8 @@ class TestXAIProvider:
assert provider.validate_model_name("grok-3") is True
assert provider.validate_model_name("grok3") is True # Shorthand for grok-3
# grok-4 and its aliases should be blocked
assert provider.validate_model_name("grok-4-0709") is False
assert provider.validate_model_name("grok") is False # Now resolves to grok-4
# grok should be blocked (resolves to grok-4 which is not allowed)
assert provider.validate_model_name("grok") is False
# grok-3-fast should be blocked by restrictions
assert provider.validate_model_name("grok-3-fast") is False
@@ -204,7 +199,7 @@ class TestXAIProvider:
provider = XAIModelProvider("test-key")
# Shorthand "grok" should be allowed (resolves to grok-4-0709)
# Shorthand "grok" should be allowed (resolves to grok-4)
assert provider.validate_model_name("grok") is True
# Full name "grok-4-0709" should NOT be allowed (only shorthand "grok" is in restriction list)
@@ -268,7 +263,7 @@ class TestXAIProvider:
provider = XAIModelProvider("test-key")
# Check that all expected base models are present
assert "grok-4-0709" in provider.SUPPORTED_MODELS
assert "grok-4" in provider.SUPPORTED_MODELS
assert "grok-3" in provider.SUPPORTED_MODELS
assert "grok-3-fast" in provider.SUPPORTED_MODELS
@@ -292,7 +287,13 @@ class TestXAIProvider:
grok3_config = provider.SUPPORTED_MODELS["grok-3"]
assert grok3_config.context_window == 131_072
assert grok3_config.supports_extended_thinking is False
assert "grok3" in grok3_config.aliases
# Check aliases are correctly structured
assert "grok3" in grok3_config.aliases # grok3 resolves to grok-3
# Check grok-4 aliases
grok4_config = provider.SUPPORTED_MODELS["grok-4"]
assert "grok" in grok4_config.aliases # grok resolves to grok-4
assert "grok4" in grok4_config.aliases
grok3fast_config = provider.SUPPORTED_MODELS["grok-3-fast"]
assert "grok3fast" in grok3fast_config.aliases
@@ -303,7 +304,7 @@ class TestXAIProvider:
"""Test that generate_content resolves aliases before making API calls.
This is the CRITICAL test that ensures aliases like 'grok' get resolved
to 'grok-3' before being sent to X.AI API.
to 'grok-4' before being sent to X.AI API.
"""
# Set up mock OpenAI client
mock_client = MagicMock()
@@ -328,17 +329,15 @@ class TestXAIProvider:
# Call generate_content with alias 'grok'
result = provider.generate_content(
prompt="Test prompt", model_name="grok", temperature=0.7 # This should be resolved to "grok-4-0709"
prompt="Test prompt", model_name="grok", temperature=0.7 # This should be resolved to "grok-4"
)
# Verify the API was called with the RESOLVED model name
mock_client.chat.completions.create.assert_called_once()
call_kwargs = mock_client.chat.completions.create.call_args[1]
# CRITICAL ASSERTION: The API should receive "grok-4-0709", not "grok"
assert (
call_kwargs["model"] == "grok-4-0709"
), f"Expected 'grok-4-0709' but API received '{call_kwargs['model']}'"
# CRITICAL ASSERTION: The API should receive "grok-4", not "grok"
assert call_kwargs["model"] == "grok-4", f"Expected 'grok-4' but API received '{call_kwargs['model']}'"
# Verify other parameters
assert call_kwargs["temperature"] == 0.7
@@ -348,7 +347,7 @@ class TestXAIProvider:
# Verify response
assert result.content == "Test response"
assert result.model_name == "grok-4-0709" # Should be the resolved name
assert result.model_name == "grok-4" # Should be the resolved name
@patch("providers.openai_compatible.OpenAI")
def test_generate_content_other_aliases(self, mock_openai_class):

View File

@@ -0,0 +1,47 @@
"""Helper functions for HTTP transport injection in tests."""
from tests.http_transport_recorder import TransportFactory
def inject_transport(monkeypatch, cassette_path: str):
"""Inject HTTP transport into OpenAICompatibleProvider for testing.
This helper simplifies the monkey patching pattern used across tests
to inject custom HTTP transports for recording/replaying API calls.
Also ensures OpenAI provider is properly registered for tests that need it.
Args:
monkeypatch: pytest monkeypatch fixture
cassette_path: Path to cassette file for recording/replay
Returns:
The created transport instance
Example:
transport = inject_transport(monkeypatch, "path/to/cassette.json")
"""
# Ensure OpenAI provider is registered - always needed for transport injection
from providers.base import ProviderType
from providers.openai_provider import OpenAIModelProvider
from providers.registry import ModelProviderRegistry
# Always register OpenAI provider for transport tests (API key might be dummy)
ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
# Create transport
transport = TransportFactory.create_transport(str(cassette_path))
# Inject transport using the established pattern
from providers.openai_compatible import OpenAICompatibleProvider
original_client_property = OpenAICompatibleProvider.client
def patched_client_getter(self):
if self._client is None:
self._test_transport = transport
return original_client_property.fget(self)
monkeypatch.setattr(OpenAICompatibleProvider, "client", property(patched_client_getter))
return transport
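A typical call site then looks like this (cassette path and assertions are illustrative):

def test_o3_pro_output_text_replay(monkeypatch):
    # Records against the real API on first run, replays the cassette afterwards.
    from tests.transport_helpers import inject_transport

    inject_transport(monkeypatch, "tests/openai_cassettes/o3_pro_basic_math.json")  # hypothetical path

    from providers.base import ProviderType
    from providers.registry import ModelProviderRegistry

    provider = ModelProviderRegistry.get_provider(ProviderType.OPENAI)
    response = provider.generate_content(prompt="What is 2 + 2?", model_name="o3-pro")
    assert response.content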

View File

@@ -152,7 +152,7 @@ class ChallengeTool(SimpleTool):
# Return the wrapped prompt as the response
response_data = {
"status": "challenge_created",
"status": "challenge_accepted",
"original_statement": request.prompt,
"challenge_prompt": wrapped_prompt,
"instructions": (

View File

@@ -23,6 +23,9 @@ from .simple.base import SimpleTool
CHAT_FIELD_DESCRIPTIONS = {
"prompt": (
"You MUST provide a thorough, expressive question or share an idea with as much context as possible. "
"IMPORTANT: When referring to code, use the files parameter to pass relevant files and only use the prompt to refer to "
"function / method names or very small code snippets if absolutely necessary to explain the issue. Do NOT "
"pass large code snippets in the prompt as this is exclusively reserved for descriptive text only. "
"Remember: you're talking to an assistant who has deep expertise and can provide nuanced insights. Include your "
"current thinking, specific challenges, background context, what you've already tried, and what "
"kind of response would be most helpful. The more context and detail you provide, the more "

View File

@@ -45,6 +45,9 @@ CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS = {
"and ways to reduce complexity while maintaining functionality. Map out the codebase structure, understand "
"the business logic, and identify areas requiring deeper analysis. In all later steps, continue exploring "
"with precision: trace dependencies, verify assumptions, and adapt your understanding as you uncover more evidence."
"IMPORTANT: When referring to code, use the relevant_files parameter to pass relevant files and only use the prompt to refer to "
"function / method names or very small code snippets if absolutely necessary to explain the issue. Do NOT "
"pass large code snippets in the prompt as this is exclusively reserved for descriptive text only. "
),
"step_number": (
"The index of the current step in the code review sequence, beginning at 1. Each step should build upon or "
@@ -52,11 +55,13 @@ CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS = {
),
"total_steps": (
"Your current estimate for how many steps will be needed to complete the code review. "
"Adjust as new findings emerge."
"Adjust as new findings emerge. MANDATORY: When continuation_id is provided (continuing a previous "
"conversation), set this to 1 as we're not starting a new multi-step investigation."
),
"next_step_required": (
"Set to true if you plan to continue the investigation with another step. False means you believe the "
"code review analysis is complete and ready for expert validation."
"code review analysis is complete and ready for expert validation. MANDATORY: When continuation_id is "
"provided (continuing a previous conversation), set this to False to immediately proceed with expert analysis."
),
"findings": (
"Summarize everything discovered in this step about the code being reviewed. Include analysis of code quality, "
@@ -91,13 +96,14 @@ CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS = {
"unnecessary complexity, etc."
),
"confidence": (
"Indicate your current confidence in the code review assessment. Use: 'exploring' (starting analysis), 'low' "
"(early investigation), 'medium' (some evidence gathered), 'high' (strong evidence), "
"'very_high' (very strong evidence), 'almost_certain' (nearly complete review), 'certain' (100% confidence - "
"code review is thoroughly complete and all significant issues are identified with no need for external model validation). "
"Do NOT use 'certain' unless the code review is comprehensively complete, use 'very_high' or 'almost_certain' instead if not 100% sure. "
"Using 'certain' means you have complete confidence locally and prevents external model validation. Also do "
"NOT set confidence to 'certain' if the user has strongly requested that external review must be performed."
"Indicate your current confidence in the assessment. Use: 'exploring' (starting analysis), 'low' (early "
"investigation), 'medium' (some evidence gathered), 'high' (strong evidence), "
"'very_high' (very strong evidence), 'almost_certain' (nearly complete validation), 'certain' (200% confidence - "
"analysis is complete and all issues are identified with no need for external model validation). "
"Do NOT use 'certain' unless the pre-commit validation is thoroughly complete, use 'very_high' or 'almost_certain' "
"instead if not 200% sure. "
"Using 'certain' means you have complete confidence locally and prevents external model validation. Also "
"do NOT set confidence to 'certain' if the user has strongly requested that external validation MUST be performed."
),
"backtrack_from_step": (
"If an earlier finding or assessment needs to be revised or discarded, specify the step number from which to "
@@ -572,6 +578,17 @@ class CodeReviewTool(WorkflowTool):
"""
Provide step-specific guidance for code review workflow.
"""
# Check if this is a continuation - if so, skip workflow and go to expert analysis
continuation_id = self.get_request_continuation_id(request)
if continuation_id:
return {
"next_steps": (
"Continuing previous conversation. The expert analysis will now be performed based on the "
"accumulated context from the previous conversation. The analysis will build upon the prior "
"findings without repeating the investigation steps."
)
}
# Generate the next steps instruction based on required actions
required_actions = self.get_required_actions(step_number, confidence, request.findings, request.total_steps)
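Put together with the field rules above, a continuation call collapses to a single step, roughly (payload illustrative):

arguments = {
    "step": "Re-review the auth module after applying the fix discussed earlier.",
    "step_number": 1,
    "total_steps": 1,             # MANDATORY: 1 when continuing a conversation
    "next_step_required": False,  # MANDATORY: False to go straight to expert analysis
    "findings": "Previous review flagged a race in token refresh; fix has been applied.",
    "relevant_files": ["/abs/path/auth/tokens.py"],  # hypothetical
    "continuation_id": "<id from the previous response>",
}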

View File

@@ -537,11 +537,13 @@ of the evidence, even when it strongly points in one direction.""",
provider = self.get_model_provider(model_name)
# Prepare the prompt with any relevant files
# Use continuation_id=None for blinded consensus - each model should only see
# original prompt + files, not conversation history or other model responses
prompt = self.initial_prompt
if request.relevant_files:
file_content, _ = self._prepare_file_content_for_prompt(
request.relevant_files,
request.continuation_id,
None, # Use None instead of request.continuation_id for blinded consensus
"Context files",
)
if file_content:

View File

@@ -45,6 +45,9 @@ DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS = {
"could cause instability. In concurrent systems, watch for race conditions, shared state, or timing "
"dependencies. In all later steps, continue exploring with precision: trace deeper dependencies, verify "
"hypotheses, and adapt your understanding as you uncover more evidence."
"IMPORTANT: When referring to code, use the relevant_files parameter to pass relevant files and only use the prompt to refer to "
"function / method names or very small code snippets if absolutely necessary to explain the issue. Do NOT "
"pass large code snippets in the prompt as this is exclusively reserved for descriptive text only. "
),
"step_number": (
"The index of the current step in the investigation sequence, beginning at 1. Each step should build upon or "
@@ -52,11 +55,13 @@ DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS = {
),
"total_steps": (
"Your current estimate for how many steps will be needed to complete the investigation. "
"Adjust as new findings emerge."
"Adjust as new findings emerge. IMPORTANT: When continuation_id is provided (continuing a previous "
"conversation), set this to 1 as we're not starting a new multi-step investigation."
),
"next_step_required": (
"Set to true if you plan to continue the investigation with another step. False means you believe the root "
"cause is known or the investigation is complete."
"cause is known or the investigation is complete. IMPORTANT: When continuation_id is "
"provided (continuing a previous conversation), set this to False to immediately proceed with expert analysis."
),
"findings": (
"Summarize everything discovered in this step. Include new clues, unexpected behavior, evidence from code or "
@@ -92,10 +97,10 @@ DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS = {
"confidence": (
"Indicate your current confidence in the hypothesis. Use: 'exploring' (starting out), 'low' (early idea), "
"'medium' (some supporting evidence), 'high' (strong evidence), 'very_high' (very strong evidence), "
"'almost_certain' (nearly confirmed), 'certain' (100% confidence - root cause and minimal fix are both "
"'almost_certain' (nearly confirmed), 'certain' (200% confidence - root cause and minimal fix are both "
"confirmed locally with no need for external model validation). Do NOT use 'certain' unless the issue can be "
"fully resolved with a fix, use 'very_high' or 'almost_certain' instead when not 100% sure. Using 'certain' "
"means you have complete confidence locally and prevents external model validation. Also do "
"fully resolved with a fix, use 'very_high' or 'almost_certain' instead when not 200% sure. Using 'certain' "
"means you have ABSOLUTE confidence locally and prevents external model validation. Also do "
"NOT set confidence to 'certain' if the user has strongly requested that external validation MUST be performed."
),
"backtrack_from_step": (
@@ -165,7 +170,7 @@ class DebugIssueTool(WorkflowTool):
def get_description(self) -> str:
return (
"DEBUG & ROOT CAUSE ANALYSIS - Systematic self-investigation followed by expert analysis. "
"DEBUG & ROOT CAUSE ANALYSIS - Use this tool to perform any kind of debugging, bug hunting, or issue tracking. "
"This tool guides you through a step-by-step investigation process where you:\n\n"
"1. Start with step 1: describe the issue to investigate\n"
"2. STOP and investigate using appropriate tools\n"

View File

@@ -225,7 +225,7 @@ class ListModelsTool(BaseTool):
output_lines.append(f"**Error loading models**: {str(e)}")
else:
output_lines.append("**Status**: Not configured (set OPENROUTER_API_KEY)")
output_lines.append("**Note**: Provides access to GPT-4, O3, Mistral, and many more")
output_lines.append("**Note**: Provides access to GPT-5, O3, Mistral, and many more")
output_lines.append("")
@@ -295,7 +295,7 @@ class ListModelsTool(BaseTool):
# Add usage tips
output_lines.append("\n**Usage Tips**:")
output_lines.append("- Use model aliases (e.g., 'flash', 'o3', 'opus') for convenience")
output_lines.append("- Use model aliases (e.g., 'flash', 'gpt5', 'opus') for convenience")
output_lines.append("- In auto mode, the CLI Agent will select the best model for each task")
output_lines.append("- Custom models are only available when CUSTOM_API_URL is set")
output_lines.append("- OpenRouter provides access to many cloud models with one API key")

View File

@@ -42,6 +42,9 @@ PRECOMMIT_WORKFLOW_FIELD_DESCRIPTIONS = {
"performance impacts, and maintainability concerns. Map out changed files, understand the business logic, "
"and identify areas requiring deeper analysis. In all later steps, continue exploring with precision: "
"trace dependencies, verify hypotheses, and adapt your understanding as you uncover more evidence."
"IMPORTANT: When referring to code, use the relevant_files parameter to pass relevant files and only use the prompt to refer to "
"function / method names or very small code snippets if absolutely necessary to explain the issue. Do NOT "
"pass large code snippets in the prompt as this is exclusively reserved for descriptive text only. "
),
"step_number": (
"The index of the current step in the pre-commit investigation sequence, beginning at 1. Each step should "
@@ -49,11 +52,13 @@ PRECOMMIT_WORKFLOW_FIELD_DESCRIPTIONS = {
),
"total_steps": (
"Your current estimate for how many steps will be needed to complete the pre-commit investigation. "
"Adjust as new findings emerge."
"Adjust as new findings emerge. IMPORTANT: When continuation_id is provided (continuing a previous "
"conversation), set this to 1 as we're not starting a new multi-step investigation."
),
"next_step_required": (
"Set to true if you plan to continue the investigation with another step. False means you believe the "
"pre-commit analysis is complete and ready for expert validation."
"pre-commit analysis is complete and ready for expert validation. IMPORTANT: When continuation_id is "
"provided (continuing a previous conversation), set this to False to immediately proceed with expert analysis."
),
"findings": (
"Summarize everything discovered in this step about the changes being committed. Include analysis of git diffs, "
@@ -87,9 +92,10 @@ PRECOMMIT_WORKFLOW_FIELD_DESCRIPTIONS = {
"confidence": (
"Indicate your current confidence in the assessment. Use: 'exploring' (starting analysis), 'low' (early "
"investigation), 'medium' (some evidence gathered), 'high' (strong evidence), "
"'very_high' (very strong evidence), 'almost_certain' (nearly complete validation), 'certain' (100% confidence - "
"'very_high' (very strong evidence), 'almost_certain' (nearly complete validation), 'certain' (200% confidence - "
"analysis is complete and all issues are identified with no need for external model validation). "
"Do NOT use 'certain' unless the pre-commit validation is thoroughly complete, use 'very_high' or 'almost_certain' instead if not 100% sure. "
"Do NOT use 'certain' unless the pre-commit validation is thoroughly complete, use 'very_high' or 'almost_certain' "
"instead if not 200% sure. "
"Using 'certain' means you have complete confidence locally and prevents external model validation. Also "
"do NOT set confidence to 'certain' if the user has strongly requested that external validation MUST be performed."
),
@@ -584,6 +590,17 @@ class PrecommitTool(WorkflowTool):
"""
Provide step-specific guidance for precommit workflow.
"""
# Check if this is a continuation - if so, skip workflow and go to expert analysis
continuation_id = self.get_request_continuation_id(request)
if continuation_id:
return {
"next_steps": (
"Continuing previous conversation. The expert analysis will now be performed based on the "
"accumulated context from the previous conversation. The analysis will build upon the prior "
"findings without repeating the investigation steps."
)
}
# Generate the next steps instruction based on required actions
required_actions = self.get_required_actions(step_number, confidence, request.findings, request.total_steps)

View File

@@ -44,6 +44,9 @@ REFACTOR_FIELD_DESCRIPTIONS = {
"structure, understand the business logic, and identify areas requiring refactoring. In all later steps, continue "
"exploring with precision: trace dependencies, verify assumptions, and adapt your understanding as you uncover "
"more refactoring opportunities."
"IMPORTANT: When referring to code, use the relevant_files parameter to pass relevant files and only use the prompt to refer to "
"function / method names or very small code snippets if absolutely necessary to explain the issue. Do NOT "
"pass large code snippets in the prompt as this is exclusively reserved for descriptive text only. "
),
"step_number": (
"The index of the current step in the refactoring investigation sequence, beginning at 1. Each step should "

View File

@@ -390,6 +390,23 @@ class WorkflowTool(BaseTool, BaseWorkflowMixin):
"""Get status for skipped expert analysis. Override for tool-specific status."""
return "skipped_by_tool_design"
def is_continuation_workflow(self, request) -> bool:
"""
Check if this is a continuation workflow that should skip multi-step investigation.
When continuation_id is provided, the workflow typically continues from a previous
conversation and should go directly to expert analysis rather than starting a new
multi-step investigation.
Args:
request: The workflow request object
Returns:
True if this is a continuation that should skip multi-step workflow
"""
continuation_id = self.get_request_continuation_id(request)
return bool(continuation_id)
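A concrete tool can then branch on this helper in its guidance hook; a sketch (method names illustrative, get_required_actions as used elsewhere in this diff):

def get_step_guidance_message(self, step_number, confidence, request) -> dict:
    # Continuations skip the multi-step investigation entirely.
    if self.is_continuation_workflow(request):
        return {"next_steps": "Continuing previous conversation; proceeding directly to expert analysis."}
    required_actions = self.get_required_actions(step_number, confidence, request.findings, request.total_steps)
    return {"next_steps": "\n".join(f"{i + 1}. {a}" for i, a in enumerate(required_actions))}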
# Abstract methods that must be implemented by specific workflow tools
# (These are inherited from BaseWorkflowMixin and must be implemented)

View File

@@ -89,6 +89,11 @@ class BaseWorkflowMixin(ABC):
"""Return the system prompt for this tool. Usually provided by BaseTool."""
pass
@abstractmethod
def get_language_instruction(self) -> str:
"""Return the language instruction for localization. Usually provided by BaseTool."""
pass
@abstractmethod
def get_default_temperature(self) -> float:
"""Return the default temperature for this tool. Usually provided by BaseTool."""
@@ -107,9 +112,11 @@ class BaseWorkflowMixin(ABC):
@abstractmethod
def _prepare_file_content_for_prompt(
self,
files: list[str],
request_files: list[str],
continuation_id: Optional[str],
description: str,
context_description: str = "New files",
max_tokens: Optional[int] = None,
reserve_tokens: int = 1_000,
remaining_budget: Optional[int] = None,
arguments: Optional[dict[str, Any]] = None,
model_context: Optional[Any] = None,
@@ -299,7 +306,7 @@ class BaseWorkflowMixin(ABC):
f"MANDATORY: DO NOT call the {self.get_name()} tool again immediately. "
f"You MUST first work using appropriate tools. "
f"REQUIRED ACTIONS before calling {self.get_name()} step {next_step_number}:\n"
+ "\n".join(f"{i+1}. {action}" for i, action in enumerate(required_actions))
+ "\n".join(f"{i + 1}. {action}" for i, action in enumerate(required_actions))
+ f"\n\nOnly call {self.get_name()} again with step_number: {next_step_number} "
f"AFTER completing this work."
)
@@ -663,13 +670,13 @@ class BaseWorkflowMixin(ABC):
self._current_model_name = None
self._model_context = None
# Handle continuation
continuation_id = request.continuation_id
# Adjust total steps if needed
if request.step_number > request.total_steps:
request.total_steps = request.step_number
# Handle continuation
continuation_id = request.continuation_id
# Create thread for first step
if not continuation_id and request.step_number == 1:
clean_args = {k: v for k, v in arguments.items() if k not in ["_model_context", "_resolved_model_name"]}
@@ -818,8 +825,9 @@ class BaseWorkflowMixin(ABC):
Default implementation provides generic response.
"""
work_summary = self.prepare_work_summary()
continuation_id = self.get_request_continuation_id(request)
return {
response_data = {
"status": self.get_completion_status(),
f"complete_{self.get_name()}": {
"initial_request": self.get_initial_request(request.step),
@@ -839,6 +847,11 @@ class BaseWorkflowMixin(ABC):
},
}
if continuation_id:
response_data["continuation_id"] = continuation_id
return response_data
# ================================================================================
# Inheritance Hook Methods - Replace hasattr/getattr Anti-patterns
# ================================================================================
@@ -1447,8 +1460,10 @@ class BaseWorkflowMixin(ABC):
if file_content:
expert_context = self._add_files_to_expert_context(expert_context, file_content)
# Get system prompt for this tool
system_prompt = self.get_system_prompt()
# Get system prompt for this tool with localization support
base_system_prompt = self.get_system_prompt()
language_instruction = self.get_language_instruction()
system_prompt = language_instruction + base_system_prompt
# Check if tool wants system prompt embedded in main prompt
if self.should_embed_system_prompt():
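The language instruction itself is just a prefix string; a plausible BaseTool-side implementation, assuming a LOCALE setting drives localization:

import os

def get_language_instruction(self) -> str:
    # Assumed: a LOCALE env var selects the response language; empty means no prefix.
    locale = os.getenv("LOCALE", "").strip()
    if not locale:
        return ""
    return f"Always respond in {locale}.\n\n"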
@@ -1547,36 +1562,21 @@ class BaseWorkflowMixin(ABC):
# Default implementations for methods that workflow-based tools typically don't need
def prepare_prompt(self, request, continuation_id=None, max_tokens=None, reserve_tokens=0):
async def prepare_prompt(self, request) -> str:
"""
Base implementation for workflow tools.
Base implementation for workflow tools - compatible with BaseTool signature.
Allows subclasses to customize prompt preparation behavior by overriding
customize_prompt_preparation().
"""
# Allow subclasses to customize the prompt preparation
self.customize_prompt_preparation(request, continuation_id, max_tokens, reserve_tokens)
# Workflow tools typically don't need to return a prompt
# since they handle their own prompt preparation internally
return "", ""
def customize_prompt_preparation(self, request, continuation_id=None, max_tokens=None, reserve_tokens=0):
"""
Override this method in subclasses to customize prompt preparation.
Base implementation does nothing - subclasses can extend this to add
custom prompt preparation logic without the base class needing to
know about specific tool capabilities.
Workflow tools typically don't need to return a prompt since they handle
their own prompt preparation internally through the workflow execution.
Args:
request: The request object (may have files, prompt, etc.)
continuation_id: Optional continuation ID
max_tokens: Optional max token limit
reserve_tokens: Optional reserved token count
request: The validated request object
Returns:
Empty string since workflow tools manage prompts internally
"""
# Base implementation does nothing - subclasses override as needed
return None
# Workflow tools handle their prompts internally during workflow execution
return ""
def format_response(self, response: str, request, model_info=None):
"""