From 69ec38d1af368a4f7ed3eb48b3feb4e1fb4755cf Mon Sep 17 00:00:00 2001
From: Lachlan Donald
Date: Sat, 14 Jun 2025 13:02:44 +1000
Subject: [PATCH 1/4] Add o3-pro model support and extend test coverage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Added o3-pro model configuration to custom_models.json with 200K context
- Updated OpenAI provider to support o3-pro with fixed temperature constraint
- Extended simulator tests to include o3-pro validation scenarios

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 conf/custom_models.json                    |  9 ++++
 config.py                                  |  1 +
 providers/openai.py                        |  9 ++++
 simulator_tests/test_o3_model_selection.py | 50 +++++++++++++++++++---
 tests/test_auto_mode.py                    |  4 +-
 5 files changed, 64 insertions(+), 9 deletions(-)

diff --git a/conf/custom_models.json b/conf/custom_models.json
index 0838930..8c32dd8 100644
--- a/conf/custom_models.json
+++ b/conf/custom_models.json
@@ -148,6 +148,15 @@
       "supports_function_calling": true,
       "description": "OpenAI's o3-mini with high reasoning effort - optimized for complex problems"
     },
+    {
+      "model_name": "openai/o3-pro",
+      "aliases": ["o3-pro", "o3pro"],
+      "context_window": 200000,
+      "supports_extended_thinking": false,
+      "supports_json_mode": true,
+      "supports_function_calling": true,
+      "description": "OpenAI's o3-pro model - professional-grade reasoning and analysis"
+    },
     {
       "model_name": "openai/o4-mini",
       "aliases": ["o4-mini", "o4mini"],
diff --git a/config.py b/config.py
index 389d86f..c1f5efb 100644
--- a/config.py
+++ b/config.py
@@ -50,6 +50,7 @@ MODEL_CAPABILITIES_DESC = {
     # OpenAI models - Available when OPENAI_API_KEY is configured
     "o3": "Strong reasoning (200K context) - Logical problems, code generation, systematic analysis",
     "o3-mini": "Fast O3 variant (200K context) - Balanced performance/speed, moderate complexity",
+    "o3-pro": "Professional-grade reasoning (200K context) - EXTREMELY EXPENSIVE: Only for the most complex problems requiring universe-scale complexity analysis OR when the user explicitly asks for this model. Use sparingly for critical architectural decisions or exceptionally complex debugging that other models cannot handle.",
     "o4-mini": "Latest reasoning model (200K context) - Optimized for shorter contexts, rapid reasoning",
     "o4-mini-high": "Enhanced O4 mini (200K context) - Higher reasoning effort for complex tasks",
     # Full model names also supported (for explicit specification)
diff --git a/providers/openai.py b/providers/openai.py
index c8d73ea..0672301 100644
--- a/providers/openai.py
+++ b/providers/openai.py
@@ -22,6 +22,10 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
             "context_window": 200_000,  # 200K tokens
             "supports_extended_thinking": False,
         },
+        "o3-pro": {
+            "context_window": 200_000,  # 200K tokens
+            "supports_extended_thinking": False,
+        },
         "o4-mini": {
             "context_window": 200_000,  # 200K tokens
             "supports_extended_thinking": False,
@@ -54,8 +58,13 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
         config = self.SUPPORTED_MODELS[resolved_name]
 
         # Define temperature constraints per model
+<<<<<<< HEAD
         if resolved_name in ["o3", "o3-mini", "o4-mini", "o4-mini-high"]:
             # O3 and O4 reasoning models only support temperature=1.0
+=======
+        if model_name in ["o3", "o3-mini", "o3-pro"]:
+            # O3 models only support temperature=1.0
+>>>>>>> 155c4ec (Add o3-pro model support and extend test coverage)
             temp_constraint = FixedTemperatureConstraint(1.0)
         else:
             # Other OpenAI models support 0.0-2.0 range
diff --git a/simulator_tests/test_o3_model_selection.py b/simulator_tests/test_o3_model_selection.py
index 1feef35..d7164bd 100644
--- a/simulator_tests/test_o3_model_selection.py
+++ b/simulator_tests/test_o3_model_selection.py
@@ -125,6 +125,24 @@ class O3ModelSelectionTest(BaseSimulatorTest):
 
         self.logger.info("  ✅ O3-mini model call completed")
 
+        # Test 2.5: Explicit O3-pro model selection
+        self.logger.info("  2.5: Testing explicit O3-pro model selection")
+
+        response2_5, _ = self.call_mcp_tool(
+            "chat",
+            {
+                "prompt": "Simple test: What is 4 + 4? Just give a brief answer.",
+                "model": "o3-pro",
+                "temperature": 1.0,  # O3-pro only supports default temperature of 1.0
+            },
+        )
+
+        if not response2_5:
+            self.logger.error("  ❌ O3-pro model test failed")
+            return False
+
+        self.logger.info("  ✅ O3-pro model call completed")
+
         # Test 3: Another tool with O3 to ensure it works across tools
         self.logger.info("  3: Testing O3 with different tool (codereview)")
 
@@ -177,11 +195,11 @@ def multiply(x, y):
             line for line in logs.split("\n") if "Sending request to openai API for codereview" in line
         ]
 
-        # Validation criteria - we expect 3 OpenAI calls (2 chat + 1 codereview)
-        openai_api_called = len(openai_api_logs) >= 3  # Should see 3 OpenAI API calls
-        openai_model_usage = len(openai_model_logs) >= 3  # Should see 3 model usage logs
-        openai_responses_received = len(openai_response_logs) >= 3  # Should see 3 responses
-        chat_calls_to_openai = len(chat_openai_logs) >= 2  # Should see 2 chat calls (o3 + o3-mini)
+        # Validation criteria - we expect 4 OpenAI calls (3 chat + 1 codereview)
+        openai_api_called = len(openai_api_logs) >= 4  # Should see 4 OpenAI API calls
+        openai_model_usage = len(openai_model_logs) >= 4  # Should see 4 model usage logs
+        openai_responses_received = len(openai_response_logs) >= 4  # Should see 4 responses
+        chat_calls_to_openai = len(chat_openai_logs) >= 3  # Should see 3 chat calls (o3 + o3-mini + o3-pro)
         codereview_calls_to_openai = len(codereview_openai_logs) >= 1  # Should see 1 codereview call
 
         self.logger.info(f"   OpenAI API call logs: {len(openai_api_logs)}")
@@ -272,6 +290,24 @@ def multiply(x, y):
 
         self.logger.info("  ✅ O3-mini model call via OpenRouter completed")
 
+        # Test 2.5: O3-pro model via OpenRouter
+        self.logger.info("  2.5: Testing O3-pro model via OpenRouter")
+
+        response2_5, _ = self.call_mcp_tool(
+            "chat",
+            {
+                "prompt": "Simple test: What is 4 + 4? Just give a brief answer.",
+                "model": "o3-pro",
+                "temperature": 1.0,
+            },
+        )
+
+        if not response2_5:
+            self.logger.error("  ❌ O3-pro model test via OpenRouter failed")
+            return False
+
+        self.logger.info("  ✅ O3-pro model call via OpenRouter completed")
+
         # Test 3: Codereview with O3 via OpenRouter
         self.logger.info("  3: Testing O3 with codereview tool via OpenRouter")
 
@@ -325,8 +361,8 @@ def multiply(x, y):
         self.logger.info(f"   OpenRouter response logs: {len(openrouter_response_logs)}")
 
         # Success criteria for OpenRouter
-        openrouter_used = len(openrouter_api_logs) >= 3 or len(openrouter_model_logs) >= 3
-        all_calls_succeeded = response1 and response2 and response3
+        openrouter_used = len(openrouter_api_logs) >= 4 or len(openrouter_model_logs) >= 4
+        all_calls_succeeded = response1 and response2 and response2_5 and response3
 
         success_criteria = [
             ("All O3 model calls succeeded", all_calls_succeeded),
diff --git a/tests/test_auto_mode.py b/tests/test_auto_mode.py
index 4bfbd58..33ef49f 100644
--- a/tests/test_auto_mode.py
+++ b/tests/test_auto_mode.py
@@ -47,7 +47,7 @@ class TestAutoMode:
         from config import MODEL_CAPABILITIES_DESC
 
         # Check all expected models are present
-        expected_models = ["flash", "pro", "o3", "o3-mini"]
+        expected_models = ["flash", "pro", "o3", "o3-mini", "o3-pro", "o4-mini", "o4-mini-high"]
         for model in expected_models:
             assert model in MODEL_CAPABILITIES_DESC
             assert isinstance(MODEL_CAPABILITIES_DESC[model], str)
@@ -225,7 +225,7 @@ class TestAutoMode:
         schema = tool.get_model_field_schema()
 
         assert "enum" in schema
-        assert all(model in schema["enum"] for model in ["flash", "pro", "o3"])
+        assert all(model in schema["enum"] for model in ["flash", "pro", "o3", "o3-mini", "o3-pro", "o4-mini", "o4-mini-high"])
         assert "select the most suitable model" in schema["description"]
 
         # Test normal mode

From a3aaf6f79bdbdbda8d84e859a66ddd04a2826698 Mon Sep 17 00:00:00 2001
From: Lachlan Donald
Date: Sat, 14 Jun 2025 13:51:37 +1000
Subject: [PATCH 2/4] Enhance o3-pro test coverage with comprehensive codereview testing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Added o3-pro codereview tests for both direct OpenAI and OpenRouter paths
- Updated validation criteria to account for additional test cases (5 total calls)
- Addresses Gemini Code Assist feedback about incomplete test coverage
- Ensures o3-pro functionality is thoroughly validated across all tools

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 simulator_tests/test_o3_model_selection.py | 68 +++++++++++++++++++---
 1 file changed, 61 insertions(+), 7 deletions(-)

diff --git a/simulator_tests/test_o3_model_selection.py b/simulator_tests/test_o3_model_selection.py
index d7164bd..089f13f 100644
--- a/simulator_tests/test_o3_model_selection.py
+++ b/simulator_tests/test_o3_model_selection.py
@@ -143,6 +143,33 @@ class O3ModelSelectionTest(BaseSimulatorTest):
 
         self.logger.info("  ✅ O3-pro model call completed")
 
+        # Test 2.6: O3-pro with codereview tool
+        self.logger.info("  2.6: Testing O3-pro with codereview tool")
+
+        test_code_pro = """def calculate_tax(amount, rate):
+    return amount * rate
+
+def format_currency(value):
+    return f"${value:.2f}"
+"""
+        test_file_pro = self.create_additional_test_file("tax_calc.py", test_code_pro)
+
+        response2_6, _ = self.call_mcp_tool(
+            "codereview",
+            {
+                "files": [test_file_pro],
+                "prompt": "Quick review of this tax calculation code",
+                "model": "o3-pro",
+                "temperature": 1.0,  # O3-pro only supports default temperature of 1.0
+            },
+        )
+
+        if not response2_6:
+            self.logger.error("  ❌ O3-pro with codereview tool failed")
+            return False
+
+        self.logger.info("  ✅ O3-pro with codereview tool completed")
+
         # Test 3: Another tool with O3 to ensure it works across tools
         self.logger.info("  3: Testing O3 with different tool (codereview)")
 
@@ -195,12 +222,12 @@ def multiply(x, y):
             line for line in logs.split("\n") if "Sending request to openai API for codereview" in line
         ]
 
-        # Validation criteria - we expect 4 OpenAI calls (3 chat + 1 codereview)
-        openai_api_called = len(openai_api_logs) >= 4  # Should see 4 OpenAI API calls
-        openai_model_usage = len(openai_model_logs) >= 4  # Should see 4 model usage logs
-        openai_responses_received = len(openai_response_logs) >= 4  # Should see 4 responses
+        # Validation criteria - we expect 5 OpenAI calls (3 chat + 2 codereview)
+        openai_api_called = len(openai_api_logs) >= 5  # Should see 5 OpenAI API calls
+        openai_model_usage = len(openai_model_logs) >= 5  # Should see 5 model usage logs
+        openai_responses_received = len(openai_response_logs) >= 5  # Should see 5 responses
         chat_calls_to_openai = len(chat_openai_logs) >= 3  # Should see 3 chat calls (o3 + o3-mini + o3-pro)
-        codereview_calls_to_openai = len(codereview_openai_logs) >= 1  # Should see 1 codereview call
+        codereview_calls_to_openai = len(codereview_openai_logs) >= 2  # Should see 2 codereview calls (o3-pro + o3)
 
         self.logger.info(f"   OpenAI API call logs: {len(openai_api_logs)}")
         self.logger.info(f"   OpenAI model usage logs: {len(openai_model_logs)}")
@@ -308,6 +335,33 @@ def multiply(x, y):
 
         self.logger.info("  ✅ O3-pro model call via OpenRouter completed")
 
+        # Test 2.6: O3-pro with codereview tool via OpenRouter
+        self.logger.info("  2.6: Testing O3-pro with codereview tool via OpenRouter")
+
+        test_code_pro_or = """def calculate_discount(price, discount_rate):
+    return price * (1 - discount_rate)
+
+def validate_price(price):
+    return price > 0
+"""
+        test_file_pro_or = self.create_additional_test_file("discount_calc.py", test_code_pro_or)
+
+        response2_6, _ = self.call_mcp_tool(
+            "codereview",
+            {
+                "files": [test_file_pro_or],
+                "prompt": "Quick review of this discount calculation code",
+                "model": "o3-pro",
+                "temperature": 1.0,
+            },
+        )
+
+        if not response2_6:
+            self.logger.error("  ❌ O3-pro with codereview tool via OpenRouter failed")
+            return False
+
+        self.logger.info("  ✅ O3-pro with codereview tool via OpenRouter completed")
+
         # Test 3: Codereview with O3 via OpenRouter
         self.logger.info("  3: Testing O3 with codereview tool via OpenRouter")
 
@@ -361,8 +415,8 @@ def multiply(x, y):
         self.logger.info(f"   OpenRouter response logs: {len(openrouter_response_logs)}")
 
         # Success criteria for OpenRouter
-        openrouter_used = len(openrouter_api_logs) >= 4 or len(openrouter_model_logs) >= 4
-        all_calls_succeeded = response1 and response2 and response2_5 and response3
+        openrouter_used = len(openrouter_api_logs) >= 5 or len(openrouter_model_logs) >= 5
+        all_calls_succeeded = response1 and response2 and response2_5 and response2_6 and response3
 
         success_criteria = [
             ("All O3 model calls succeeded", all_calls_succeeded),

From c12dc1d765ec499a314c81793b771e3ece34ec75 Mon Sep 17 00:00:00 2001
From: Lachlan Donald
Date: Sat, 14 Jun 2025 15:50:40 +1000
Subject: [PATCH 3/4] Fix syntax error from incomplete merge conflict resolution
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove merge conflict markers from providers/openai.py
- Include o3-pro in temperature constraint check for O3/O4 models

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 providers/openai.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/providers/openai.py b/providers/openai.py
index 0672301..4e6f944 100644
--- a/providers/openai.py
+++ b/providers/openai.py
@@ -58,13 +58,8 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
         config = self.SUPPORTED_MODELS[resolved_name]
 
         # Define temperature constraints per model
-<<<<<<< HEAD
-        if resolved_name in ["o3", "o3-mini", "o4-mini", "o4-mini-high"]:
+        if resolved_name in ["o3", "o3-mini", "o3-pro", "o4-mini", "o4-mini-high"]:
             # O3 and O4 reasoning models only support temperature=1.0
-=======
-        if model_name in ["o3", "o3-mini", "o3-pro"]:
-            # O3 models only support temperature=1.0
->>>>>>> 155c4ec (Add o3-pro model support and extend test coverage)
             temp_constraint = FixedTemperatureConstraint(1.0)
         else:
             # Other OpenAI models support 0.0-2.0 range

From 40aa1eaeb6fe9eb67580badf45516e859aea317b Mon Sep 17 00:00:00 2001
From: Lachlan Donald
Date: Sat, 14 Jun 2025 21:09:47 +1000
Subject: [PATCH 4/4] Format test_auto_mode.py with black
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix code formatting to comply with black style requirements.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 tests/test_auto_mode.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/test_auto_mode.py b/tests/test_auto_mode.py
index 33ef49f..3a2be41 100644
--- a/tests/test_auto_mode.py
+++ b/tests/test_auto_mode.py
@@ -225,7 +225,10 @@ class TestAutoMode:
         schema = tool.get_model_field_schema()
 
         assert "enum" in schema
-        assert all(model in schema["enum"] for model in ["flash", "pro", "o3", "o3-mini", "o3-pro", "o4-mini", "o4-mini-high"])
+        assert all(
+            model in schema["enum"]
+            for model in ["flash", "pro", "o3", "o3-mini", "o3-pro", "o4-mini", "o4-mini-high"]
+        )
         assert "select the most suitable model" in schema["description"]
 
         # Test normal mode
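
A quick end-state check for this series, sketched below as a reviewer note rather than part of the patches. It assumes OpenAIModelProvider accepts an api_key argument, exposes a get_capabilities(model_name) accessor, and that FixedTemperatureConstraint can be imported from providers.base; the diffs above only confirm the SUPPORTED_MODELS entry and the FixedTemperatureConstraint(1.0) branch, so the other names are assumptions.

# sketch.py - illustrative only; items marked "assumed" are not confirmed by the diffs above
from providers.base import FixedTemperatureConstraint  # assumed import path
from providers.openai import OpenAIModelProvider


def check_o3_pro_end_state() -> None:
    provider = OpenAIModelProvider(api_key="sk-test")  # assumed constructor signature

    # Confirmed by PATCH 1: o3-pro is registered with a 200K-token context
    # window and no extended-thinking support.
    entry = provider.SUPPORTED_MODELS["o3-pro"]
    assert entry["context_window"] == 200_000
    assert entry["supports_extended_thinking"] is False

    # Expected after PATCH 3: o3-pro falls into the fixed temperature=1.0
    # branch alongside the other O3/O4 reasoning models.
    capabilities = provider.get_capabilities("o3-pro")  # assumed accessor
    assert isinstance(capabilities.temperature_constraint, FixedTemperatureConstraint)  # assumed attribute


if __name__ == "__main__":
    check_o3_pro_end_state()
    print("o3-pro wiring is consistent with the series")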