Merge pull request #331 from BjornMelin/feat/openai-gpt-5.1-support

feat: add OpenAI GPT-5.1 family support
2025-11-18 10:49:30 +04:00
parent 7a1de6477a 698d391b26
commit ba63892ae2
31 changed files with 497 additions and 68 deletions
--- a/.env.example
+++ b/.env.example
@@ -55,7 +55,8 @@ OPENROUTER_API_KEY=your_openrouter_api_key_here
 # Optional: Default model to use
 # Options: 'auto' (Claude picks best model), 'pro', 'flash', 'o3', 'o3-mini', 'o4-mini', 'o4-mini-high',
-#          'gpt-5', 'gpt-5-mini', 'grok', 'opus-4.1', 'sonnet-4.1', or any DIAL model if DIAL is configured
+#          'gpt-5.1', 'gpt-5.1-codex', 'gpt-5.1-codex-mini', 'gpt-5', 'gpt-5-mini', 'grok',
 #          'opus-4.1', 'sonnet-4.1', or any DIAL model if DIAL is configured
 # When set to 'auto', Claude will select the best model for each task
 # Defaults to 'auto' if not specified
 DEFAULT_MODEL=auto
@@ -79,12 +80,15 @@ DEFAULT_THINKING_MODE_THINKDEEP=high
 # If you want to disable a provider entirely, don't set its API key
 #
 # Supported OpenAI models:
+#   - gpt-5.1          (400K context, 128K output, reasoning tokens, streaming enabled)
+#   - gpt-5.1-codex    (400K context, 128K output, coding specialization, Responses API only)
+#   - gpt-5.1-codex-mini (400K context, 128K output, cost-efficient Codex with streaming)
+#   - gpt-5            (400K context, 128K output, reasoning tokens)
+#   - gpt-5-mini       (400K context, 128K output, reasoning tokens)
 #   - o3               (200K context, high reasoning)
 #   - o3-mini          (200K context, balanced)
 #   - o4-mini          (200K context, latest balanced, temperature=1.0 only)
 #   - o4-mini-high     (200K context, enhanced reasoning, temperature=1.0 only)
-#   - gpt-5            (400K context, 128K output, reasoning tokens)
-#   - gpt-5-mini       (400K context, 128K output, reasoning tokens)
 #   - mini             (shorthand for o4-mini)
 #
 # Supported Google/Gemini models:
@@ -122,6 +126,7 @@ DEFAULT_THINKING_MODE_THINKDEEP=high
 #
 # Examples:
 #   OPENAI_ALLOWED_MODELS=o3-mini,o4-mini,mini  # Only allow mini models (cost control)
 #   OPENAI_ALLOWED_MODELS=gpt-5.1,gpt-5.1-codex  # Pin to GPT-5.1 family
 #   GOOGLE_ALLOWED_MODELS=flash                  # Only allow Flash (fast responses)
 #   XAI_ALLOWED_MODELS=grok-3                    # Only allow standard GROK (not fast variant)
 #   OPENAI_ALLOWED_MODELS=o4-mini                # Single model standardization
--- a/.gitignore
+++ b/.gitignore
@@ -183,6 +183,7 @@ CLAUDE.local.md
 .docker_cleaned
 logs/
 *.backup
 *.backup-*.json
 /.desktop_configured
 /worktrees/
--- a/conf/openai_models.json
+++ b/conf/openai_models.json
@@ -232,6 +232,81 @@
      "supports_temperature": true,
      "max_image_size_mb": 20.0,
      "use_openai_response_api": true
    },
    {
      "model_name": "gpt-5.1",
      "friendly_name": "OpenAI (GPT-5.1)",
      "aliases": [
        "gpt5.1",
        "gpt-5.1",
        "5.1"
      ],
      "intelligence_score": 18,
      "description": "GPT-5.1 (400K context, 128K output) - Flagship reasoning model with configurable thinking effort and vision support.",
      "context_window": 400000,
      "max_output_tokens": 128000,
      "supports_extended_thinking": true,
      "supports_system_prompts": true,
      "supports_streaming": true,
      "supports_function_calling": true,
      "supports_json_mode": true,
      "supports_images": true,
      "supports_temperature": true,
      "max_image_size_mb": 20.0,
      "default_reasoning_effort": "medium",
      "allow_code_generation": true,
      "temperature_constraint": "fixed"
    },
    {
      "model_name": "gpt-5.1-codex",
      "friendly_name": "OpenAI (GPT-5.1 Codex)",
      "aliases": [
        "gpt5.1-codex",
        "gpt-5.1-codex",
        "gpt5.1code",
        "gpt-5.1-code",
        "codex-5.1"
      ],
      "intelligence_score": 19,
      "description": "GPT-5.1 Codex (400K context, 128K output) - Agentic coding specialization available through the Responses API.",
      "context_window": 400000,
      "max_output_tokens": 128000,
      "supports_extended_thinking": true,
      "supports_system_prompts": true,
      "supports_streaming": false,
      "supports_function_calling": true,
      "supports_json_mode": true,
      "supports_images": true,
      "supports_temperature": true,
      "max_image_size_mb": 20.0,
      "use_openai_response_api": true,
      "default_reasoning_effort": "high",
      "allow_code_generation": true,
      "temperature_constraint": "fixed"
    },
    {
      "model_name": "gpt-5.1-codex-mini",
      "friendly_name": "OpenAI (GPT-5.1 Codex mini)",
      "aliases": [
        "gpt5.1-codex-mini",
        "gpt-5.1-codex-mini",
        "codex-mini",
        "5.1-codex-mini"
      ],
      "intelligence_score": 16,
      "description": "GPT-5.1 Codex mini (400K context, 128K output) - Cost-efficient Codex variant with streaming support.",
      "context_window": 400000,
      "max_output_tokens": 128000,
      "supports_extended_thinking": true,
      "supports_system_prompts": true,
      "supports_streaming": true,
      "supports_function_calling": true,
      "supports_json_mode": true,
      "supports_images": true,
      "supports_temperature": true,
      "max_image_size_mb": 20.0,
      "allow_code_generation": true,
      "temperature_constraint": "fixed"
    }
  ]
 }
--- a/conf/openrouter_models.json
+++ b/conf/openrouter_models.json
@@ -366,6 +366,72 @@
      "description": "GPT-5 nano (400K context, 128K output) - Fastest, cheapest version of GPT-5 for summarization and classification tasks",
      "intelligence_score": 8
    },
    {
      "model_name": "openai/gpt-5.1",
      "aliases": [
        "gpt5.1",
        "gpt-5.1",
        "5.1"
      ],
      "context_window": 400000,
      "max_output_tokens": 128000,
      "supports_extended_thinking": true,
      "supports_json_mode": true,
      "supports_function_calling": true,
      "supports_images": true,
      "max_image_size_mb": 20.0,
      "supports_temperature": true,
      "temperature_constraint": "fixed",
      "default_reasoning_effort": "medium",
      "allow_code_generation": true,
      "description": "GPT-5.1 (400K context, 128K output) - Flagship reasoning model with configurable thinking effort and vision support",
      "intelligence_score": 18
    },
    {
      "model_name": "openai/gpt-5.1-codex",
      "aliases": [
        "gpt5.1-codex",
        "gpt-5.1-codex",
        "gpt5.1code",
        "gpt-5.1-code",
        "codex-5.1"
      ],
      "context_window": 400000,
      "max_output_tokens": 128000,
      "supports_extended_thinking": true,
      "supports_json_mode": true,
      "supports_function_calling": true,
      "supports_images": true,
      "max_image_size_mb": 20.0,
      "supports_temperature": true,
      "temperature_constraint": "fixed",
      "use_openai_response_api": true,
      "default_reasoning_effort": "high",
      "allow_code_generation": true,
      "description": "GPT-5.1 Codex (400K context, 128K output) - Agentic coding specialization available through the Responses API",
      "intelligence_score": 19
    },
    {
      "model_name": "openai/gpt-5.1-codex-mini",
      "aliases": [
        "gpt5.1-codex-mini",
        "gpt-5.1-codex-mini",
        "codex-mini",
        "5.1-codex-mini"
      ],
      "context_window": 400000,
      "max_output_tokens": 128000,
      "supports_extended_thinking": true,
      "supports_json_mode": true,
      "supports_function_calling": true,
      "supports_images": true,
      "max_image_size_mb": 20.0,
      "supports_temperature": true,
      "temperature_constraint": "fixed",
      "allow_code_generation": true,
      "description": "GPT-5.1 Codex mini (400K context, 128K output) - Cost-efficient Codex variant with streaming support",
      "intelligence_score": 16
    },
    {
      "model_name": "x-ai/grok-4",
      "aliases": [
--- a/docs/advanced-usage.md
+++ b/docs/advanced-usage.md
@@ -41,6 +41,9 @@ Regardless of your default configuration, you can specify models per request:
 | **`o3-mini`** | OpenAI | 200K tokens | Balanced speed/quality | Moderate complexity tasks |
 | **`o4-mini`** | OpenAI | 200K tokens | Latest reasoning model | Optimized for shorter contexts |
 | **`gpt4.1`** | OpenAI | 1M tokens | Latest GPT-4 with extended context | Large codebase analysis, comprehensive reviews |
 | **`gpt5.1`** (GPT-5.1) | OpenAI | 400K tokens | Flagship reasoning model with configurable thinking effort | Complex problems, balanced agent/coding flows |
 | **`gpt5.1-codex`** (GPT-5.1 Codex) | OpenAI | 400K tokens | Agentic coding specialization (Responses API) | Advanced coding tasks, structured code generation |
 | **`gpt5.1-codex-mini`** (GPT-5.1 Codex mini) | OpenAI | 400K tokens | Cost-efficient Codex variant with streaming | Balanced coding tasks, cost-conscious development |
 | **`gpt5`** (GPT-5) | OpenAI | 400K tokens | Advanced model with reasoning support | Complex problems requiring advanced reasoning |
 | **`gpt5-mini`** (GPT-5 Mini) | OpenAI | 400K tokens | Efficient variant with reasoning | Balanced performance and capability |
 | **`gpt5-nano`** (GPT-5 Nano) | OpenAI | 400K tokens | Fastest, cheapest GPT-5 variant | Summarization and classification tasks |
@@ -61,6 +64,10 @@ cloud models (expensive/powerful) AND local models (free/private) in the same co
  - **Flash Lite 2.0**: Text-only lightweight model (no thinking support)
 - **O3/O4 Models**: Excellent reasoning, systematic analysis, 200K context
 - **GPT-4.1**: Extended context window (1M tokens), general capabilities
 - **GPT-5.1 Series**: Latest flagship reasoning models, 400K context
  - **GPT-5.1**: Flagship model with configurable thinking effort and vision
  - **GPT-5.1 Codex**: Agentic coding specialization (Responses API, non-streaming)
  - **GPT-5.1 Codex mini**: Cost-efficient Codex variant with streaming support
 - **GPT-5 Series**: Advanced reasoning models, 400K context
  - **GPT-5**: Full-featured with reasoning support and vision
  - **GPT-5 Mini**: Balanced efficiency and capability
@@ -161,7 +168,7 @@ All tools that work with files support **both individual files and entire direct
 **`analyze`** - Analyze files or directories
 - `files`: List of file paths or directories (required)
 - `question`: What to analyze (required)  
- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
+- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5.1|gpt5.1-codex|gpt5.1-codex-mini|gpt5|gpt5-mini|gpt5-nano (default: server default)
 - `analysis_type`: architecture|performance|security|quality|general
 - `output_format`: summary|detailed|actionable
 - `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only)
@@ -176,7 +183,7 @@ All tools that work with files support **both individual files and entire direct
 **`codereview`** - Review code files or directories
 - `files`: List of file paths or directories (required)
- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
+- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5.1|gpt5.1-codex|gpt5.1-codex-mini|gpt5|gpt5-mini|gpt5-nano (default: server default)
 - `review_type`: full|security|performance|quick
 - `focus_on`: Specific aspects to focus on
 - `standards`: Coding standards to enforce
@@ -192,7 +199,7 @@ All tools that work with files support **both individual files and entire direct
 **`debug`** - Debug with file context
 - `error_description`: Description of the issue (required)
- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
+- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5.1|gpt5.1-codex|gpt5.1-codex-mini|gpt5|gpt5-mini|gpt5-nano (default: server default)
 - `error_context`: Stack trace or logs
 - `files`: Files or directories related to the issue
 - `runtime_info`: Environment details
@@ -208,7 +215,7 @@ All tools that work with files support **both individual files and entire direct
 **`thinkdeep`** - Extended analysis with file context
 - `current_analysis`: Your current thinking (required)
- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
+- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5.1|gpt5.1-codex|gpt5.1-codex-mini|gpt5|gpt5-mini|gpt5-nano (default: server default)
 - `problem_context`: Additional context
 - `focus_areas`: Specific aspects to focus on
 - `files`: Files or directories for context
@@ -224,7 +231,7 @@ All tools that work with files support **both individual files and entire direct
 **`testgen`** - Comprehensive test generation with edge case coverage
 - `files`: Code files or directories to generate tests for (required)
 - `prompt`: Description of what to test, testing objectives, and scope (required)
- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
+- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5.1|gpt5.1-codex|gpt5.1-codex-mini|gpt5|gpt5-mini|gpt5-nano (default: server default)
 - `test_examples`: Optional existing test files as style/pattern reference
 - `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only)
@@ -239,7 +246,7 @@ All tools that work with files support **both individual files and entire direct
 - `files`: Code files or directories to analyze for refactoring opportunities (required)
 - `prompt`: Description of refactoring goals, context, and specific areas of focus (required)
 - `refactor_type`: codesmells|decompose|modernize|organization (required)
- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
+- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5.1|gpt5.1-codex|gpt5.1-codex-mini|gpt5|gpt5-mini|gpt5-nano (default: server default)
 - `focus_areas`: Specific areas to focus on (e.g., 'performance', 'readability', 'maintainability', 'security')
 - `style_guide_examples`: Optional existing code files to use as style/pattern reference
 - `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only)
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -63,7 +63,7 @@ CUSTOM_MODEL_NAME=llama3.2                          # Default model
 **Default Model Selection:**
 ```env
-# Options: 'auto', 'pro', 'flash', 'o3', 'o3-mini', 'o4-mini', etc.
+# Options: 'auto', 'pro', 'flash', 'gpt5.1', 'gpt5.1-codex', 'gpt5.1-codex-mini', 'o3', 'o3-mini', 'o4-mini', etc.
 DEFAULT_MODEL=auto  # Claude picks best model for each task (recommended)
 ```
@@ -81,12 +81,14 @@ DEFAULT_MODEL=auto  # Claude picks best model for each task (recommended)
  | Provider | Canonical Models | Notable Aliases |
  |----------|-----------------|-----------------|
-  | OpenAI | `gpt-5`, `gpt-5-pro`, `gpt-5-mini`, `gpt-5-nano`, `gpt-5-codex`, `gpt-4.1`, `o3`, `o3-mini`, `o3-pro`, `o4-mini` | `gpt5`, `gpt5pro`, `mini`, `nano`, `codex`, `o3mini`, `o3pro`, `o4mini` |
+  | OpenAI | `gpt-5.1`, `gpt-5.1-codex`, `gpt-5.1-codex-mini`, `gpt-5`, `gpt-5-pro`, `gpt-5-mini`, `gpt-5-nano`, `gpt-5-codex`, `gpt-4.1`, `o3`, `o3-mini`, `o3-pro`, `o4-mini` | `gpt5.1`, `gpt-5.1`, `5.1`, `gpt5.1-codex`, `codex-5.1`, `codex-mini`, `gpt5`, `gpt5pro`, `mini`, `nano`, `codex`, `o3mini`, `o3pro`, `o4mini` |
  | Gemini | `gemini-2.5-pro`, `gemini-2.5-flash`, `gemini-2.0-flash`, `gemini-2.0-flash-lite` | `pro`, `gemini-pro`, `flash`, `flash-2.0`, `flashlite` |
  | X.AI | `grok-4`, `grok-3`, `grok-3-fast` | `grok`, `grok4`, `grok3`, `grok3fast`, `grokfast` |
  | OpenRouter | See `conf/openrouter_models.json` for the continually evolving catalogue | e.g., `opus`, `sonnet`, `flash`, `pro`, `mistral` |
  | Custom | User-managed entries such as `llama3.2` | Define your own aliases per entry |
  Latest OpenAI entries (`gpt-5.1`, `gpt-5.1-codex`, `gpt-5.1-codex-mini`) mirror the official model cards released on November 13, 2025: all three expose 400K-token contexts with 128K-token outputs, reasoning-token support, and multimodal inputs. `gpt-5.1-codex` is Responses-only with streaming disabled, while the base `gpt-5.1` and Codex mini support streaming; all three set `allow_code_generation`. Update your manifests if you run custom deployments so these capability bits stay accurate.
  > **Tip:** Copy the JSON file you need, customise it, and point the corresponding `*_MODELS_CONFIG_PATH` environment variable to your version. This lets you enable or disable capabilities (JSON mode, function calling, temperature support, code generation) without editing Python.
 ### Code Generation Capability
@@ -105,7 +107,7 @@ The `allow_code_generation` capability enables models to generate complete, prod
 **When to Enable:**
- **Enable for**: Models MORE capable than your primary CLI's model (e.g., GPT-5, GPT-5 Pro when using Claude Code with Sonnet 4.5)
+- **Enable for**: Models MORE capable than your primary CLI's model (e.g., GPT-5.1 Codex, GPT-5 Pro, GPT-5.1 when using Claude Code with Sonnet 4.5)
 - **Purpose**: Get complete implementations from a more powerful reasoning model that your primary CLI can then review and apply
 - **Use case**: Large-scale implementations, major refactoring, complete module creation
@@ -169,7 +171,7 @@ Control which models can be used from each provider for cost control, compliance
 # Empty or unset = all models allowed (default)
 # OpenAI model restrictions
-OPENAI_ALLOWED_MODELS=o3-mini,o4-mini,mini
+OPENAI_ALLOWED_MODELS=gpt-5.1-codex-mini,gpt-5-mini,o3-mini,o4-mini,mini
 # Gemini model restrictions  
 GOOGLE_ALLOWED_MODELS=flash,pro
@@ -193,12 +195,17 @@ OPENROUTER_ALLOWED_MODELS=opus,sonnet,mistral
 OPENAI_ALLOWED_MODELS=o4-mini
 GOOGLE_ALLOWED_MODELS=flash
 # High-performance setup
 OPENAI_ALLOWED_MODELS=gpt-5.1-codex,gpt-5.1
 GOOGLE_ALLOWED_MODELS=pro
 # Single model standardization
 OPENAI_ALLOWED_MODELS=o4-mini
 GOOGLE_ALLOWED_MODELS=pro
 # Balanced selection
 GOOGLE_ALLOWED_MODELS=flash,pro
 OPENAI_ALLOWED_MODELS=gpt-5.1-codex-mini,gpt-5-mini,o4-mini
 XAI_ALLOWED_MODELS=grok,grok-3-fast
 ```
@@ -240,6 +247,8 @@ LOG_LEVEL=DEBUG  # Default: shows detailed operational messages
 DEFAULT_MODEL=auto
 GEMINI_API_KEY=your-gemini-key
 OPENAI_API_KEY=your-openai-key
 GOOGLE_ALLOWED_MODELS=flash,pro
 OPENAI_ALLOWED_MODELS=gpt-5.1-codex-mini,gpt-5-mini,o4-mini
 XAI_API_KEY=your-xai-key
 LOG_LEVEL=DEBUG
 CONVERSATION_TIMEOUT_HOURS=1
@@ -252,7 +261,7 @@ DEFAULT_MODEL=auto
 GEMINI_API_KEY=your-gemini-key
 OPENAI_API_KEY=your-openai-key
 GOOGLE_ALLOWED_MODELS=flash
-OPENAI_ALLOWED_MODELS=o4-mini
+OPENAI_ALLOWED_MODELS=gpt-5.1-codex-mini,o4-mini
 LOG_LEVEL=INFO
 CONVERSATION_TIMEOUT_HOURS=3
 ```
--- a/docs/custom_models.md
+++ b/docs/custom_models.md
@@ -61,6 +61,9 @@ The curated defaults in `conf/openrouter_models.json` include popular entries su
 | `llama3` | `meta-llama/llama-3-70b` | Large open-weight text model |
 | `deepseek-r1` | `deepseek/deepseek-r1-0528` | DeepSeek reasoning model |
 | `perplexity` | `perplexity/llama-3-sonar-large-32k-online` | Search-augmented model |
 | `gpt5.1`, `gpt-5.1`, `5.1` | `openai/gpt-5.1` | Flagship GPT-5.1 with reasoning and vision |
 | `gpt5.1-codex`, `codex-5.1` | `openai/gpt-5.1-codex` | Agentic coding specialization (Responses API) |
 | `codex-mini`, `gpt5.1-codex-mini` | `openai/gpt-5.1-codex-mini` | Cost-efficient Codex variant with streaming |
 Consult the JSON file for the full list, aliases, and capability flags. Add new entries as OpenRouter releases additional models.
@@ -78,6 +81,18 @@ Native catalogues (`conf/openai_models.json`, `conf/gemini_models.json`, `conf/x
 - Advertise support for JSON mode or vision if the upstream provider adds it
 - Adjust token limits when providers increase context windows
 ### Latest OpenAI releases
 OpenAI's November 13, 2025 release introduced `gpt-5.1`, `gpt-5.1-codex`, and `gpt-5.1-codex-mini`, all of which now ship in `conf/openai_models.json`:
 | Model | Highlights | Notes |
 |-------|------------|-------|
 | `gpt-5.1` | 400K context, 128K output, multimodal IO, configurable reasoning effort | Streaming enabled; use for balanced agent/coding flows |
 | `gpt-5.1-codex` | Responses-only agentic coding version of GPT-5.1 | Streaming disabled; `use_openai_response_api=true`; `allow_code_generation=true` |
 | `gpt-5.1-codex-mini` | Cost-efficient Codex variant | Streaming enabled, retains 400K context and code-generation flag |
 These entries include convenient short aliases (`gpt5.1`, `codex-5.1`, `codex-mini`) plus updated capability flags (`supports_extended_thinking`, `allow_code_generation`). Copy the manifest if you operate custom deployment names so downstream providers inherit the same metadata.
 Because providers load the manifests on import, you can tweak capabilities without touching Python. Restart the server after editing the JSON files so changes are picked up.
 To control ordering in auto mode or the `listmodels` summary, adjust the
--- a/docs/getting-started.md
+++ b/docs/getting-started.md
@@ -29,7 +29,7 @@ You need at least one API key. Choose based on your needs:
 **OpenAI:**
 - Visit [OpenAI Platform](https://platform.openai.com/api-keys)
- Generate an API key for O3, GPT-5 access
+- Generate an API key for GPT-5.1, GPT-5.1-Codex, GPT-5, O3 access
 **X.AI (Grok):**
 - Visit [X.AI Console](https://console.x.ai/)
@@ -287,7 +287,7 @@ Add your API keys (at least one required):
 ```env
 # Choose your providers (at least one required)
 GEMINI_API_KEY=your-gemini-api-key-here      # For Gemini models  
-OPENAI_API_KEY=your-openai-api-key-here      # For O3, GPT-5
+OPENAI_API_KEY=your-openai-api-key-here      # For GPT-5.1, GPT-5.1-Codex, O3
 XAI_API_KEY=your-xai-api-key-here            # For Grok models
 OPENROUTER_API_KEY=your-openrouter-key       # For multiple models
@@ -498,7 +498,7 @@ DEFAULT_MODEL=auto
 GEMINI_API_KEY=your-key
 OPENAI_API_KEY=your-key
 GOOGLE_ALLOWED_MODELS=flash,pro
-OPENAI_ALLOWED_MODELS=o4-mini,o3-mini
+OPENAI_ALLOWED_MODELS=gpt-5.1-codex-mini,gpt-5-mini,o4-mini
 ```
 ### Cost-Optimized Setup
@@ -514,7 +514,7 @@ DEFAULT_MODEL=auto
 GEMINI_API_KEY=your-key
 OPENAI_API_KEY=your-key
 GOOGLE_ALLOWED_MODELS=pro
-OPENAI_ALLOWED_MODELS=o3
+OPENAI_ALLOWED_MODELS=gpt-5.1-codex,gpt-5.1
 ```
 ### Local-First Setup
--- a/docs/model_ranking.md
+++ b/docs/model_ranking.md
@@ -39,7 +39,7 @@ A straightforward rubric that mirrors typical provider tiers:
 | Intelligence | Guidance |
 |--------------|----------|
-| 18–19 | Frontier reasoning models (Gemini 2.5 Pro, GPT‑5) |
+| 18–19 | Frontier reasoning models (Gemini 2.5 Pro, GPT‑5.1 Codex, GPT‑5.1, GPT‑5) |
 | 15–17 | Strong general models with large context (O3 Pro, DeepSeek R1) |
 | 12–14 | Balanced assistants (Claude Opus/Sonnet, Mistral Large) |
 | 9–11  | Fast distillations (Gemini Flash, GPT-5 Mini, Mistral medium) |
--- a/docs/tools/analyze.md
+++ b/docs/tools/analyze.md
@@ -64,7 +64,7 @@ This workflow ensures methodical analysis before expert insights, resulting in d
 **Initial Configuration (used in step 1):**
 - `prompt`: What to analyze or look for (required)
- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
+- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5.1|gpt5.1-codex|gpt5.1-codex-mini|gpt5|gpt5-mini|gpt5-nano (default: server default)
 - `analysis_type`: architecture|performance|security|quality|general (default: general)
 - `output_format`: summary|detailed|actionable (default: detailed)
 - `temperature`: Temperature for analysis (0-1, default 0.2)
--- a/docs/tools/chat.md
+++ b/docs/tools/chat.md
@@ -52,7 +52,7 @@ word verdict in the end.
 ## Tool Parameters
 - `prompt`: Your question or discussion topic (required)
- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
+- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5.1|gpt5.1-codex|gpt5.1-codex-mini|gpt5|gpt5-mini|gpt5-nano (default: server default)
 - `absolute_file_paths`: Optional absolute file or directory paths for additional context
 - `images`: Optional images for visual context (absolute paths)
 - `working_directory_absolute_path`: **Required** - Absolute path to an existing directory where generated code artifacts will be saved
--- a/docs/tools/codereview.md
+++ b/docs/tools/codereview.md
@@ -79,7 +79,7 @@ The above prompt will simultaneously run two separate `codereview` tools with tw
 **Initial Review Configuration (used in step 1):**
 - `prompt`: User's summary of what the code does, expected behavior, constraints, and review objectives (required)
- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
+- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5.1|gpt5.1-codex|gpt5.1-codex-mini|gpt5|gpt5-mini|gpt5-nano (default: server default)
 - `review_type`: full|security|performance|quick (default: full)
 - `focus_on`: Specific aspects to focus on (e.g., "security vulnerabilities", "performance bottlenecks")
 - `standards`: Coding standards to enforce (e.g., "PEP8", "ESLint", "Google Style Guide")
--- a/docs/tools/debug.md
+++ b/docs/tools/debug.md
@@ -72,7 +72,7 @@ This structured approach ensures Claude performs methodical groundwork before ex
 - `images`: Visual debugging materials (error screenshots, logs, etc.)
 **Model Selection:**
- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
+- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5.1|gpt5.1-codex|gpt5.1-codex-mini|gpt5|gpt5-mini|gpt5-nano (default: server default)
 - `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only)
 - `use_assistant_model`: Whether to use expert analysis phase (default: true, set to false to use Claude only)
--- a/docs/tools/precommit.md
+++ b/docs/tools/precommit.md
@@ -140,7 +140,7 @@ Use zen and perform a thorough precommit ensuring there aren't any new regressio
 **Initial Configuration (used in step 1):**
 - `path`: Starting directory to search for repos (REQUIRED for step 1, must be absolute path)
 - `prompt`: The original user request description for the changes (required for context)
- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
+- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5.1|gpt5.1-codex|gpt5.1-codex-mini|gpt5|gpt5-mini|gpt5-nano (default: server default)
 - `compare_to`: Compare against a branch/tag instead of local changes (optional)
 - `severity_filter`: critical|high|medium|low|all (default: all)
 - `include_staged`: Include staged changes in the review (default: true)
--- a/docs/tools/refactor.md
+++ b/docs/tools/refactor.md
@@ -102,7 +102,7 @@ This results in Claude first performing its own expert analysis, encouraging it
 **Initial Configuration (used in step 1):**
 - `prompt`: Description of refactoring goals, context, and specific areas of focus (required)
 - `refactor_type`: codesmells|decompose|modernize|organization (default: codesmells)
- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
+- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5.1|gpt5.1-codex|gpt5.1-codex-mini|gpt5|gpt5-mini|gpt5-nano (default: server default)
 - `focus_areas`: Specific areas to focus on (e.g., 'performance', 'readability', 'maintainability', 'security')
 - `style_guide_examples`: Optional existing code files to use as style/pattern reference (absolute paths)
 - `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only)
--- a/docs/tools/secaudit.md
+++ b/docs/tools/secaudit.md
@@ -85,7 +85,7 @@ security remediation plan using planner
 - `images`: Architecture diagrams, security documentation, or visual references
 **Initial Security Configuration (used in step 1):**
- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
+- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5.1|gpt5.1-codex|gpt5.1-codex-mini|gpt5|gpt5-mini|gpt5-nano (default: server default)
 - `security_scope`: Application context, technology stack, and security boundary definition (required)
 - `threat_level`: low|medium|high|critical (default: medium) - determines assessment depth and urgency
 - `compliance_requirements`: List of compliance frameworks to assess against (e.g., ["PCI DSS", "SOC2"])
--- a/docs/tools/testgen.md
+++ b/docs/tools/testgen.md
@@ -69,7 +69,7 @@ Test generation excels with extended reasoning models like Gemini Pro or O3, whi
 **Initial Configuration (used in step 1):**
 - `prompt`: Description of what to test, testing objectives, and specific scope/focus areas (required)
- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
+- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5.1|gpt5.1-codex|gpt5.1-codex-mini|gpt5|gpt5-mini|gpt5-nano (default: server default)
 - `test_examples`: Optional existing test files or directories to use as style/pattern reference (absolute paths)
 - `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only)
 - `use_assistant_model`: Whether to use expert test generation phase (default: true, set to false to use Claude only)
--- a/docs/tools/thinkdeep.md
+++ b/docs/tools/thinkdeep.md
@@ -30,7 +30,7 @@ with the best architecture for my project
 ## Tool Parameters
 - `prompt`: Your current thinking/analysis to extend and validate (required)
- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
+- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5.1|gpt5.1-codex|gpt5.1-codex-mini|gpt5|gpt5-mini|gpt5-nano (default: server default)
 - `problem_context`: Additional context about the problem or goal
 - `focus_areas`: Specific aspects to focus on (architecture, performance, security, etc.)
 - `files`: Optional file paths or directories for additional context (absolute paths)
--- a/providers/openai.py
+++ b/providers/openai.py
@@ -115,20 +115,51 @@ class OpenAIModelProvider(RegistryBackedProviderMixin, OpenAICompatibleProvider)
        if category == ToolModelCategory.EXTENDED_REASONING:
            # Prefer models with extended thinking support
-            # GPT-5-Codex first for coding tasks
+            # GPT-5.1 Codex first for coding tasks
-            preferred = find_first(["gpt-5-codex", "gpt-5-pro", "o3", "o3-pro", "gpt-5"])
+            preferred = find_first(
                [
                    "gpt-5.1-codex",
                    "gpt-5.1",
                    "gpt-5-codex",
                    "gpt-5-pro",
                    "o3-pro",
                    "gpt-5",
                    "o3",
                ]
            )
            return preferred if preferred else allowed_models[0]
        elif category == ToolModelCategory.FAST_RESPONSE:
            # Prefer fast, cost-efficient models
-            # GPT-5 models for speed, GPT-5-Codex after (premium pricing but cached)
+            # GPT-5.1 and GPT-5.1 Codex mini first for speed; GPT-5-Codex later (premium pricing but cached)
-            preferred = find_first(["gpt-5", "gpt-5-mini", "gpt-5-codex", "o4-mini", "o3-mini"])
+            preferred = find_first(
                [
                    "gpt-5.1",
                    "gpt-5.1-codex-mini",
                    "gpt-5",
                    "gpt-5-mini",
                    "gpt-5-codex",
                    "o4-mini",
                    "o3-mini",
                ]
            )
            return preferred if preferred else allowed_models[0]
        else:  # BALANCED or default
            # Prefer balanced performance/cost models
-            # Include GPT-5-Codex for coding workflows
+            # Include GPT-5.1 family for latest capabilities
-            preferred = find_first(["gpt-5", "gpt-5-codex", "gpt-5-pro", "gpt-5-mini", "o4-mini", "o3-mini"])
+            preferred = find_first(
                [
                    "gpt-5.1",
                    "gpt-5.1-codex",
                    "gpt-5",
                    "gpt-5-codex",
                    "gpt-5-pro",
                    "gpt-5-mini",
                    "o4-mini",
                    "o3-mini",
                ]
            )
            return preferred if preferred else allowed_models[0]
--- a/tests/CASSETTE_MAINTENANCE.md
+++ b/tests/CASSETTE_MAINTENANCE.md
@@ -222,10 +222,45 @@ If you encounter issues with cassette testing:
 3. Run semantic matching tests to verify the system
 4. Open an issue if you find a bug in the matching logic
 ## Dual-Model Cassette Coverage
 Some integration tests maintain cassettes for multiple model variants to ensure regression coverage across model families. For example:
 ### Consensus Tool Cassettes
 The `test_consensus_integration.py` test is parametrized with `@pytest.mark.parametrize` so it runs against both the `gpt-5` and `gpt-5.1` models, each replaying its own cassette:
 - `tests/openai_cassettes/consensus_step1_gpt5_for.json` - Cassette for gpt-5 model
 - `tests/openai_cassettes/consensus_step1_gpt51_for.json` - Cassette for gpt-5.1 model
 **When updating consensus cassettes:**
 1. Both cassettes should be updated if the test logic changes
 2. If only one model's behavior changes, update only that cassette
 3. The test uses `@pytest.mark.parametrize` to run against both models
 4. Each cassette path is mapped in the `CONSENSUS_CASSETTES` dictionary
 **To re-record a specific model's cassette:**
 ```bash
 # Delete the specific cassette
 rm tests/openai_cassettes/consensus_step1_gpt5_for.json
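 # Run the test with a real API key (it will record for gpt-5).
 # Note: when a cassette is missing the test skips unless GEMINI_API_KEY is also a real key.
 # The node ID is quoted so the shell does not treat "[gpt-5]" as a glob pattern.
 OPENAI_API_KEY="your-real-key" python -m pytest "tests/test_consensus_integration.py::test_consensus_multi_model_consultations[gpt-5]" -v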
 # Or for gpt-5.1
 rm tests/openai_cassettes/consensus_step1_gpt51_for.json
 OPENAI_API_KEY="your-real-key" python -m pytest "tests/test_consensus_integration.py::test_consensus_multi_model_consultations[gpt-5.1]" -v
 ```
 This dual-coverage approach ensures that both model families continue to work correctly as the codebase evolves.
 ## Related Files
 - `tests/http_transport_recorder.py` - Cassette recording/replay implementation
 - `tests/transport_helpers.py` - Helper functions for injecting transports
 - `tests/test_cassette_semantic_matching.py` - Tests for semantic matching
 - `tests/test_o3_pro_output_text_fix.py` - Example of cassette usage
 - `tests/test_consensus_integration.py` - Example of dual-model cassette coverage
 - `tests/openai_cassettes/` - Directory containing recorded cassettes
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -193,6 +193,7 @@ def disable_force_env_override(monkeypatch):
    monkeypatch.setenv("MAX_CONVERSATION_TURNS", "50")
    import importlib
    import sys
    import config
    import utils.conversation_memory as conversation_memory
@@ -200,6 +201,10 @@ def disable_force_env_override(monkeypatch):
    importlib.reload(config)
    importlib.reload(conversation_memory)
    test_conversation_module = sys.modules.get("tests.test_conversation_memory")
    if test_conversation_module is not None:
        test_conversation_module.MAX_CONVERSATION_TURNS = conversation_memory.MAX_CONVERSATION_TURNS
    try:
        yield
    finally:
--- a/tests/openai_cassettes/consensus_step1_gpt51_for.json
+++ b/tests/openai_cassettes/consensus_step1_gpt51_for.json
--- a/tests/test_auto_mode_comprehensive.py
+++ b/tests/test_auto_mode_comprehensive.py
@@ -94,9 +94,9 @@ class TestAutoModeComprehensive:
                    "OPENROUTER_API_KEY": None,
                },
                {
-                    "EXTENDED_REASONING": "gpt-5-codex",  # GPT-5-Codex prioritized for coding tasks
+                    "EXTENDED_REASONING": "gpt-5.1-codex",  # GPT-5.1 Codex prioritized for coding tasks
-                    "FAST_RESPONSE": "gpt-5",  # Prefer gpt-5 for speed
+                    "FAST_RESPONSE": "gpt-5.1",  # Prefer gpt-5.1 for speed
-                    "BALANCED": "gpt-5",  # Prefer gpt-5 for balanced
+                    "BALANCED": "gpt-5.1",  # Prefer gpt-5.1 for balanced
                },
            ),
            # Only X.AI API available
--- a/tests/test_auto_mode_model_listing.py
+++ b/tests/test_auto_mode_model_listing.py
@@ -83,7 +83,7 @@ def test_error_listing_respects_env_restrictions(monkeypatch, reset_registry):
        pass
    monkeypatch.setenv("GOOGLE_ALLOWED_MODELS", "gemini-2.5-pro")
-    monkeypatch.setenv("OPENAI_ALLOWED_MODELS", "gpt-5")
+    monkeypatch.setenv("OPENAI_ALLOWED_MODELS", "gpt-5.1")
    monkeypatch.setenv("OPENROUTER_ALLOWED_MODELS", "gpt5nano")
    monkeypatch.setenv("XAI_ALLOWED_MODELS", "")
@@ -104,7 +104,7 @@ def test_error_listing_respects_env_restrictions(monkeypatch, reset_registry):
        ("OPENAI_API_KEY", "test-openai"),
        ("OPENROUTER_API_KEY", "test-openrouter"),
        ("GOOGLE_ALLOWED_MODELS", "gemini-2.5-pro"),
-        ("OPENAI_ALLOWED_MODELS", "gpt-5"),
+        ("OPENAI_ALLOWED_MODELS", "gpt-5.1"),
        ("OPENROUTER_ALLOWED_MODELS", "gpt5nano"),
        ("XAI_ALLOWED_MODELS", ""),
    ):
@@ -139,7 +139,7 @@ def test_error_listing_respects_env_restrictions(monkeypatch, reset_registry):
    assert payload["status"] == "error"
    available_models = _extract_available_models(payload["content"])
-    assert set(available_models) == {"gemini-2.5-pro", "gpt-5", "gpt5nano", "openai/gpt-5-nano"}
+    assert set(available_models) == {"gemini-2.5-pro", "gpt-5.1", "gpt5nano", "openai/gpt-5-nano"}
@pytest.mark.no_mock_provider
@@ -225,6 +225,6 @@ def test_error_listing_without_restrictions_shows_full_catalog(monkeypatch, rese
    available_models = _extract_available_models(payload["content"])
    assert "gemini-2.5-pro" in available_models
-    assert "gpt-5" in available_models
+    assert any(model in available_models for model in {"gpt-5.1", "gpt-5"})
    assert "grok-4" in available_models
    assert len(available_models) >= 5
--- a/tests/test_auto_mode_provider_selection.py
+++ b/tests/test_auto_mode_provider_selection.py
@@ -98,9 +98,9 @@ class TestAutoModeProviderSelection:
            balanced = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.BALANCED)
            # Should select appropriate OpenAI models based on new preference order
-            assert extended_reasoning == "gpt-5-codex"  # GPT-5-Codex prioritized for extended reasoning
+            assert extended_reasoning == "gpt-5.1-codex"  # GPT-5.1 Codex prioritized for extended reasoning
-            assert fast_response == "gpt-5"  # gpt-5 comes first in fast response preference
+            assert fast_response == "gpt-5.1"  # gpt-5.1 comes first in fast response preference
-            assert balanced == "gpt-5"  # gpt-5 for balanced
+            assert balanced == "gpt-5.1"  # gpt-5.1 for balanced
        finally:
            # Restore original environment
--- a/tests/test_consensus_integration.py
+++ b/tests/test_consensus_integration.py
@@ -16,7 +16,12 @@ from tools.consensus import ConsensusTool
 # Directories for recorded HTTP interactions
 CASSETTE_DIR = Path(__file__).parent / "openai_cassettes"
 CASSETTE_DIR.mkdir(exist_ok=True)
-CONSENSUS_CASSETTE_PATH = CASSETTE_DIR / "consensus_step1_gpt5_for.json"
+
 # Mapping of OpenAI model names to their cassette files
 CONSENSUS_CASSETTES = {
    "gpt-5": CASSETTE_DIR / "consensus_step1_gpt5_for.json",
    "gpt-5.1": CASSETTE_DIR / "consensus_step1_gpt51_for.json",
 }
 GEMINI_REPLAY_DIR = Path(__file__).parent / "gemini_cassettes"
 GEMINI_REPLAY_DIR.mkdir(exist_ok=True)
@@ -26,8 +31,15 @@ GEMINI_REPLAY_PATH = GEMINI_REPLAY_DIR / "consensus" / "step2_gemini25_flash_aga
@pytest.mark.asyncio
@pytest.mark.no_mock_provider
-async def test_consensus_multi_model_consultations(monkeypatch):
+@pytest.mark.parametrize("openai_model", ["gpt-5", "gpt-5.1"])
-    """Exercise ConsensusTool against gpt-5 (supporting) and gemini-2.0-flash (critical)."""
+async def test_consensus_multi_model_consultations(monkeypatch, openai_model):
    """Exercise ConsensusTool against OpenAI model (supporting) and gemini-2.5-flash (critical).
    Tests both gpt-5 and gpt-5.1 to ensure regression coverage for both model families.
    """
    # Get the cassette path for this model
    consensus_cassette_path = CONSENSUS_CASSETTES[openai_model]
    env_updates = {
        "DEFAULT_MODEL": "auto",
@@ -43,13 +55,14 @@ async def test_consensus_multi_model_consultations(monkeypatch):
        "CUSTOM_API_URL",
    ]
-    recording_mode = not CONSENSUS_CASSETTE_PATH.exists() or not GEMINI_REPLAY_PATH.exists()
+    recording_mode = not consensus_cassette_path.exists() or not GEMINI_REPLAY_PATH.exists()
    if recording_mode:
        openai_key = env_updates["OPENAI_API_KEY"].strip()
        gemini_key = env_updates["GEMINI_API_KEY"].strip()
        if (not openai_key or openai_key.startswith("dummy")) or (not gemini_key or gemini_key.startswith("dummy")):
            pytest.skip(
-                "Consensus cassette missing and OPENAI_API_KEY/GEMINI_API_KEY not configured. Provide real keys to record."
+                "Consensus cassette missing and OPENAI_API_KEY/GEMINI_API_KEY "
                "not configured. Provide real keys to record."
            )
    GEMINI_REPLAY_PATH.parent.mkdir(parents=True, exist_ok=True)
@@ -66,27 +79,43 @@ async def test_consensus_multi_model_consultations(monkeypatch):
            m.setenv("GEMINI_API_KEY", "dummy-key-for-replay")
            m.setenv("GOOGLE_GENAI_CLIENT_MODE", "replay")
        # Ensure restriction policies allow the latest OpenAI models under test
        m.setenv("OPENAI_ALLOWED_MODELS", openai_model)
        m.setenv("GOOGLE_GENAI_REPLAYS_DIRECTORY", str(GEMINI_REPLAY_DIR))
        m.setenv("GOOGLE_GENAI_REPLAY_ID", GEMINI_REPLAY_ID)
        for key in keys_to_clear:
            m.delenv(key, raising=False)
-        # Reset providers and register only OpenAI & Gemini for deterministic behavior
+        # Ensure we use the built-in OpenAI catalogue rather than leftovers from
        # other tests that patch OPENAI_MODELS_CONFIG_PATH.
        m.delenv("OPENAI_MODELS_CONFIG_PATH", raising=False)
        # Reset providers/restrictions and register only OpenAI & Gemini for deterministic behavior
        ModelProviderRegistry.reset_for_testing()
        import utils.model_restrictions as model_restrictions
        model_restrictions._restriction_service = None
        from providers.gemini import GeminiModelProvider
        from providers.openai import OpenAIModelProvider
        # Earlier tests may override the OpenAI provider's registry by pointing
        # OPENAI_MODELS_CONFIG_PATH at fixtures. Force a reload so model
        # metadata is restored from conf/openai_models.json.
        OpenAIModelProvider.reload_registry()
        assert openai_model in OpenAIModelProvider.MODEL_CAPABILITIES
        ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
        ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
        # Inject HTTP transport for OpenAI interactions
-        inject_transport(monkeypatch, CONSENSUS_CASSETTE_PATH)
+        inject_transport(monkeypatch, str(consensus_cassette_path))
        tool = ConsensusTool()
        models_to_consult = [
-            {"model": "gpt-5", "stance": "for"},
+            {"model": openai_model, "stance": "for"},
            {"model": "gemini-2.5-flash", "stance": "against"},
        ]
@@ -105,7 +134,7 @@ async def test_consensus_multi_model_consultations(monkeypatch):
        step1_data = json.loads(step1_response[0].text)
        assert step1_data["status"] == "analysis_and_first_model_consulted"
-        assert step1_data["model_consulted"] == "gpt-5"
+        assert step1_data["model_consulted"] == openai_model
        assert step1_data["model_response"]["status"] == "success"
        assert step1_data["model_response"]["metadata"]["provider"] == "openai"
        assert step1_data["model_response"]["verdict"]
@@ -118,7 +147,7 @@ async def test_consensus_multi_model_consultations(monkeypatch):
        summary_for_step2 = step1_data["model_response"]["verdict"][:200]
        step2_arguments = {
-            "step": f"Incorporated gpt-5 perspective: {summary_for_step2}",
+            "step": f"Incorporated {openai_model} perspective: {summary_for_step2}",
            "step_number": 2,
            "total_steps": len(models_to_consult),
            "next_step_required": False,
@@ -138,7 +167,7 @@ async def test_consensus_multi_model_consultations(monkeypatch):
    assert step2_data["model_response"]["metadata"]["provider"] == "google"
    assert step2_data["model_response"]["verdict"]
    assert step2_data["complete_consensus"]["models_consulted"] == [
-        "gpt-5:for",
+        f"{openai_model}:for",
        "gemini-2.5-flash:against",
    ]
    assert step2_data["consensus_complete"] is True
@@ -159,7 +188,7 @@ async def test_consensus_multi_model_consultations(monkeypatch):
                gemini_provider._client = None
    # Ensure cassettes exist for future replays
-    assert CONSENSUS_CASSETTE_PATH.exists()
+    assert consensus_cassette_path.exists()
    assert GEMINI_REPLAY_PATH.exists()
    # Clean up provider registry state after test
--- a/tests/test_intelligent_fallback.py
+++ b/tests/test_intelligent_fallback.py
@@ -37,14 +37,14 @@ class TestIntelligentFallback:
    @patch.dict(os.environ, {"OPENAI_API_KEY": "sk-test-key", "GEMINI_API_KEY": ""}, clear=False)
    def test_prefers_openai_o3_mini_when_available(self):
-        """Test that gpt-5 is preferred when OpenAI API key is available (based on new preference order)"""
+        """Test that gpt-5.1 is preferred when OpenAI API key is available (based on new preference order)"""
        # Register only OpenAI provider for this test
        from providers.openai import OpenAIModelProvider
        ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
        fallback_model = ModelProviderRegistry.get_preferred_fallback_model()
-        assert fallback_model == "gpt-5"  # Based on new preference order: gpt-5 before o4-mini
+        assert fallback_model == "gpt-5.1"  # Based on new preference order: gpt-5.1 before o4-mini
    @patch.dict(os.environ, {"OPENAI_API_KEY": "", "GEMINI_API_KEY": "test-gemini-key"}, clear=False)
    def test_prefers_gemini_flash_when_openai_unavailable(self):
@@ -147,8 +147,8 @@ class TestIntelligentFallback:
                history, tokens = build_conversation_history(context, model_context=None)
-                # Verify that ModelContext was called with gpt-5 (the intelligent fallback based on new preference order)
+                # Verify that ModelContext was called with gpt-5.1 (the intelligent fallback based on new preference order)
-                mock_context_class.assert_called_once_with("gpt-5")
+                mock_context_class.assert_called_once_with("gpt-5.1")
    def test_auto_mode_with_gemini_only(self):
        """Test auto mode behavior when only Gemini API key is available"""
--- a/tests/test_openai_provider.py
+++ b/tests/test_openai_provider.py
@@ -50,6 +50,9 @@ class TestOpenAIProvider:
        assert provider.validate_model_name("o4-mini") is True
        assert provider.validate_model_name("gpt-5") is True
        assert provider.validate_model_name("gpt-5-mini") is True
        assert provider.validate_model_name("gpt-5.1") is True
        assert provider.validate_model_name("gpt-5.1-codex") is True
        assert provider.validate_model_name("gpt-5.1-codex-mini") is True
        # Test valid aliases
        assert provider.validate_model_name("mini") is True
@@ -59,6 +62,9 @@ class TestOpenAIProvider:
        assert provider.validate_model_name("gpt5") is True
        assert provider.validate_model_name("gpt5-mini") is True
        assert provider.validate_model_name("gpt5mini") is True
        assert provider.validate_model_name("gpt5.1") is True
        assert provider.validate_model_name("gpt5.1-codex") is True
        assert provider.validate_model_name("codex-mini") is True
        # Test invalid model
        assert provider.validate_model_name("invalid-model") is False
@@ -77,6 +83,9 @@ class TestOpenAIProvider:
        assert provider._resolve_model_name("gpt5") == "gpt-5"
        assert provider._resolve_model_name("gpt5-mini") == "gpt-5-mini"
        assert provider._resolve_model_name("gpt5mini") == "gpt-5-mini"
        assert provider._resolve_model_name("gpt5.1") == "gpt-5.1"
        assert provider._resolve_model_name("gpt5.1-codex") == "gpt-5.1-codex"
        assert provider._resolve_model_name("codex-mini") == "gpt-5.1-codex-mini"
        # Test full name passthrough
        assert provider._resolve_model_name("o3") == "o3"
@@ -86,6 +95,9 @@ class TestOpenAIProvider:
        assert provider._resolve_model_name("o4-mini") == "o4-mini"
        assert provider._resolve_model_name("gpt-5") == "gpt-5"
        assert provider._resolve_model_name("gpt-5-mini") == "gpt-5-mini"
        assert provider._resolve_model_name("gpt-5.1") == "gpt-5.1"
        assert provider._resolve_model_name("gpt-5.1-codex") == "gpt-5.1-codex"
        assert provider._resolve_model_name("gpt-5.1-codex-mini") == "gpt-5.1-codex-mini"
    def test_get_capabilities_o3(self):
        """Test getting model capabilities for O3."""
@@ -146,6 +158,36 @@ class TestOpenAIProvider:
        assert capabilities.supports_function_calling is True
        assert capabilities.supports_temperature is True
    def test_get_capabilities_gpt51(self):
        """Check the capability flags the provider reports for ``gpt-5.1``."""
        caps = OpenAIModelProvider("test-key").get_capabilities("gpt-5.1")

        assert caps.model_name == "gpt-5.1"
        # Every flag below must be the literal ``True``, not merely truthy.
        for flag_name in (
            "supports_streaming",
            "supports_function_calling",
            "supports_json_mode",
            "allow_code_generation",
        ):
            assert getattr(caps, flag_name) is True
    def test_get_capabilities_gpt51_codex(self):
        """Check that ``gpt-5.1-codex`` reports no streaming and Responses API use."""
        openai_provider = OpenAIModelProvider("test-key")
        codex_caps = openai_provider.get_capabilities("gpt-5.1-codex")

        # (attribute, exact boolean expected) pairs, checked in this order.
        expected_flags = (
            ("supports_streaming", False),
            ("use_openai_response_api", True),
            ("allow_code_generation", True),
        )
        assert codex_caps.model_name == "gpt-5.1-codex"
        for attr, expected in expected_flags:
            assert getattr(codex_caps, attr) is expected
    def test_get_capabilities_gpt51_codex_mini(self):
        """Check that ``gpt-5.1-codex-mini`` reports streaming and code generation."""
        mini_caps = OpenAIModelProvider("test-key").get_capabilities("gpt-5.1-codex-mini")

        assert mini_caps.model_name == "gpt-5.1-codex-mini"
        # Both flags must be the literal ``True``.
        for flag_name in ("supports_streaming", "allow_code_generation"):
            assert getattr(mini_caps, flag_name) is True
    @patch("providers.openai_compatible.OpenAI")
    def test_generate_content_resolves_alias_before_api_call(self, mock_openai_class):
        """Test that generate_content resolves aliases before making API calls.
--- a/tests/test_per_tool_model_defaults.py
+++ b/tests/test_per_tool_model_defaults.py
@@ -98,8 +98,8 @@ class TestModelSelection:
            ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
            model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.EXTENDED_REASONING)
-            # OpenAI prefers GPT-5-Codex for extended reasoning (coding tasks)
+            # OpenAI prefers GPT-5.1-Codex for extended reasoning (coding tasks)
-            assert model == "gpt-5-codex"
+            assert model == "gpt-5.1-codex"
    def test_extended_reasoning_with_gemini_only(self):
        """Test EXTENDED_REASONING prefers pro when only Gemini is available."""
@@ -133,8 +133,8 @@ class TestModelSelection:
            ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
            model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.FAST_RESPONSE)
-            # OpenAI now prefers gpt-5 for fast response (based on our new preference order)
+            # OpenAI now prefers gpt-5.1 for fast response (based on our new preference order)
-            assert model == "gpt-5"
+            assert model == "gpt-5.1"
    def test_fast_response_with_gemini_only(self):
        """Test FAST_RESPONSE prefers flash when only Gemini is available."""
@@ -167,8 +167,8 @@ class TestModelSelection:
            ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
            model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.BALANCED)
-            # OpenAI prefers gpt-5 for balanced (based on our new preference order)
+            # OpenAI prefers gpt-5.1 for balanced (based on our new preference order)
-            assert model == "gpt-5"
+            assert model == "gpt-5.1"
    def test_no_category_uses_balanced_logic(self):
        """Test that no category specified uses balanced logic."""
@@ -195,7 +195,7 @@ class TestFlexibleModelSelection:
                "env": {"OPENAI_API_KEY": "test-key"},
                "provider_type": ProviderType.OPENAI,
                "category": ToolModelCategory.EXTENDED_REASONING,
-                "expected": "gpt-5-codex",  # GPT-5-Codex prioritized for coding tasks
+                "expected": "gpt-5.1-codex",  # GPT-5.1-Codex prioritized for coding tasks
            },
            # Case 2: Gemini provider for fast response
            {
@@ -209,7 +209,7 @@ class TestFlexibleModelSelection:
                "env": {"OPENAI_API_KEY": "test-key"},
                "provider_type": ProviderType.OPENAI,
                "category": ToolModelCategory.FAST_RESPONSE,
-                "expected": "gpt-5",  # Based on new preference order
+                "expected": "gpt-5.1",  # Based on new preference order
            },
        ]
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -209,6 +209,9 @@ class TestOpenAIProvider:
        assert provider.validate_model_name("o4-mini")
        assert provider.validate_model_name("o4mini")
        assert provider.validate_model_name("o4-mini")
        assert provider.validate_model_name("gpt-5.1")
        assert provider.validate_model_name("gpt-5.1-codex")
        assert provider.validate_model_name("gpt-5.1-codex-mini")
        assert not provider.validate_model_name("gpt-4o")
        assert not provider.validate_model_name("invalid-model")
@@ -219,3 +222,20 @@ class TestOpenAIProvider:
        aliases = ["o3", "o3mini", "o3-mini", "o4-mini", "o4mini"]
        for alias in aliases:
            assert not provider.get_capabilities(alias).supports_extended_thinking
    def test_gpt51_family_capabilities(self):
        """Verify the streaming, Responses API and code-generation flags of each GPT-5.1 model."""
        openai_provider = OpenAIModelProvider(api_key="test-key")

        # Base model: streaming and code generation are both enabled.
        gpt51 = openai_provider.get_capabilities("gpt-5.1")
        assert gpt51.supports_streaming
        assert gpt51.allow_code_generation

        # Codex: streaming is off, the Responses API flag is set.
        gpt51_codex = openai_provider.get_capabilities("gpt-5.1-codex")
        assert not gpt51_codex.supports_streaming
        assert gpt51_codex.use_openai_response_api
        assert gpt51_codex.allow_code_generation

        # Codex mini: streaming and code generation are both enabled.
        gpt51_codex_mini = openai_provider.get_capabilities("gpt-5.1-codex-mini")
        assert gpt51_codex_mini.supports_streaming
        assert gpt51_codex_mini.allow_code_generation
--- a/tests/test_supported_models_aliases.py
+++ b/tests/test_supported_models_aliases.py
@@ -54,6 +54,9 @@ class TestSupportedModelsAliases:
        assert "o3mini" in provider.MODEL_CAPABILITIES["o3-mini"].aliases
        assert "o3pro" in provider.MODEL_CAPABILITIES["o3-pro"].aliases
        assert "gpt4.1" in provider.MODEL_CAPABILITIES["gpt-4.1"].aliases
        assert "gpt5.1" in provider.MODEL_CAPABILITIES["gpt-5.1"].aliases
        assert "gpt5.1-codex" in provider.MODEL_CAPABILITIES["gpt-5.1-codex"].aliases
        assert "codex-mini" in provider.MODEL_CAPABILITIES["gpt-5.1-codex-mini"].aliases
        # Test alias resolution
        assert provider._resolve_model_name("mini") == "gpt-5-mini"  # mini -> gpt-5-mini now
@@ -61,10 +64,14 @@ class TestSupportedModelsAliases:
        assert provider._resolve_model_name("o3pro") == "o3-pro"  # o3pro resolves to o3-pro
        assert provider._resolve_model_name("o4mini") == "o4-mini"
        assert provider._resolve_model_name("gpt4.1") == "gpt-4.1"  # gpt4.1 resolves to gpt-4.1
        assert provider._resolve_model_name("gpt5.1") == "gpt-5.1"
        assert provider._resolve_model_name("gpt5.1-codex") == "gpt-5.1-codex"
        assert provider._resolve_model_name("codex-mini") == "gpt-5.1-codex-mini"
        # Test case insensitive resolution
        assert provider._resolve_model_name("Mini") == "gpt-5-mini"  # mini -> gpt-5-mini now
        assert provider._resolve_model_name("O3MINI") == "o3-mini"
        assert provider._resolve_model_name("Gpt5.1") == "gpt-5.1"
    def test_xai_provider_aliases(self):
        """Test XAI provider's alias structure."""