From 69ec38d1af368a4f7ed3eb48b3feb4e1fb4755cf Mon Sep 17 00:00:00 2001
From: Lachlan Donald
Date: Sat, 14 Jun 2025 13:02:44 +1000
Subject: [PATCH 1/4] Add o3-pro model support and extend test coverage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Added o3-pro model configuration to custom_models.json with 200K context
- Updated OpenAI provider to support o3-pro with fixed temperature constraint
- Extended simulator tests to include o3-pro validation scenarios

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 conf/custom_models.json                    |  9 ++++
 config.py                                  |  1 +
 providers/openai.py                        |  9 ++++
 simulator_tests/test_o3_model_selection.py | 50 +++++++++++++++++++---
 tests/test_auto_mode.py                    |  4 +-
 5 files changed, 64 insertions(+), 9 deletions(-)

diff --git a/conf/custom_models.json b/conf/custom_models.json
index 0838930..8c32dd8 100644
--- a/conf/custom_models.json
+++ b/conf/custom_models.json
@@ -148,6 +148,15 @@
       "supports_function_calling": true,
       "description": "OpenAI's o3-mini with high reasoning effort - optimized for complex problems"
     },
+    {
+      "model_name": "openai/o3-pro",
+      "aliases": ["o3-pro", "o3pro"],
+      "context_window": 200000,
+      "supports_extended_thinking": false,
+      "supports_json_mode": true,
+      "supports_function_calling": true,
+      "description": "OpenAI's o3-pro model - professional-grade reasoning and analysis"
+    },
     {
       "model_name": "openai/o4-mini",
       "aliases": ["o4-mini", "o4mini"],
diff --git a/config.py b/config.py
index 389d86f..c1f5efb 100644
--- a/config.py
+++ b/config.py
@@ -50,6 +50,7 @@ MODEL_CAPABILITIES_DESC = {
     # OpenAI models - Available when OPENAI_API_KEY is configured
     "o3": "Strong reasoning (200K context) - Logical problems, code generation, systematic analysis",
     "o3-mini": "Fast O3 variant (200K context) - Balanced performance/speed, moderate complexity",
+    "o3-pro": "Professional-grade reasoning (200K context) - EXTREMELY EXPENSIVE: Only for the most complex problems requiring universe-scale complexity analysis OR when the user explicitly asks for this model. Use sparingly for critical architectural decisions or exceptionally complex debugging that other models cannot handle.",
     "o4-mini": "Latest reasoning model (200K context) - Optimized for shorter contexts, rapid reasoning",
     "o4-mini-high": "Enhanced O4 mini (200K context) - Higher reasoning effort for complex tasks",
     # Full model names also supported (for explicit specification)
diff --git a/providers/openai.py b/providers/openai.py
index c8d73ea..0672301 100644
--- a/providers/openai.py
+++ b/providers/openai.py
@@ -22,6 +22,10 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
             "context_window": 200_000,  # 200K tokens
             "supports_extended_thinking": False,
         },
+        "o3-pro": {
+            "context_window": 200_000,  # 200K tokens
+            "supports_extended_thinking": False,
+        },
         "o4-mini": {
             "context_window": 200_000,  # 200K tokens
             "supports_extended_thinking": False,
@@ -54,8 +58,13 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
         config = self.SUPPORTED_MODELS[resolved_name]
 
         # Define temperature constraints per model
+<<<<<<< HEAD
         if resolved_name in ["o3", "o3-mini", "o4-mini", "o4-mini-high"]:
             # O3 and O4 reasoning models only support temperature=1.0
+=======
+        if model_name in ["o3", "o3-mini", "o3-pro"]:
+            # O3 models only support temperature=1.0
+>>>>>>> 155c4ec (Add o3-pro model support and extend test coverage)
             temp_constraint = FixedTemperatureConstraint(1.0)
         else:
             # Other OpenAI models support 0.0-2.0 range
diff --git a/simulator_tests/test_o3_model_selection.py b/simulator_tests/test_o3_model_selection.py
index 1feef35..d7164bd 100644
--- a/simulator_tests/test_o3_model_selection.py
+++ b/simulator_tests/test_o3_model_selection.py
@@ -125,6 +125,24 @@ class O3ModelSelectionTest(BaseSimulatorTest):
 
         self.logger.info("  ✅ O3-mini model call completed")
 
+        # Test 2.5: Explicit O3-pro model selection
+        self.logger.info("  2.5: Testing explicit O3-pro model selection")
+
+        response2_5, _ = self.call_mcp_tool(
+            "chat",
+            {
+                "prompt": "Simple test: What is 4 + 4? Just give a brief answer.",
+                "model": "o3-pro",
+                "temperature": 1.0,  # O3-pro only supports default temperature of 1.0
+            },
+        )
+
+        if not response2_5:
+            self.logger.error("  ❌ O3-pro model test failed")
+            return False
+
+        self.logger.info("  ✅ O3-pro model call completed")
+
         # Test 3: Another tool with O3 to ensure it works across tools
         self.logger.info("  3: Testing O3 with different tool (codereview)")
 
@@ -177,11 +195,11 @@ def multiply(x, y):
             line for line in logs.split("\n") if "Sending request to openai API for codereview" in line
         ]
 
-        # Validation criteria - we expect 3 OpenAI calls (2 chat + 1 codereview)
-        openai_api_called = len(openai_api_logs) >= 3  # Should see 3 OpenAI API calls
-        openai_model_usage = len(openai_model_logs) >= 3  # Should see 3 model usage logs
-        openai_responses_received = len(openai_response_logs) >= 3  # Should see 3 responses
-        chat_calls_to_openai = len(chat_openai_logs) >= 2  # Should see 2 chat calls (o3 + o3-mini)
+        # Validation criteria - we expect 4 OpenAI calls (3 chat + 1 codereview)
+        openai_api_called = len(openai_api_logs) >= 4  # Should see 4 OpenAI API calls
+        openai_model_usage = len(openai_model_logs) >= 4  # Should see 4 model usage logs
+        openai_responses_received = len(openai_response_logs) >= 4  # Should see 4 responses
+        chat_calls_to_openai = len(chat_openai_logs) >= 3  # Should see 3 chat calls (o3 + o3-mini + o3-pro)
         codereview_calls_to_openai = len(codereview_openai_logs) >= 1  # Should see 1 codereview call
 
         self.logger.info(f"   OpenAI API call logs: {len(openai_api_logs)}")
@@ -272,6 +290,24 @@ def multiply(x, y):
 
         self.logger.info("  ✅ O3-mini model call via OpenRouter completed")
 
+        # Test 2.5: O3-pro model via OpenRouter
+        self.logger.info("  2.5: Testing O3-pro model via OpenRouter")
+
+        response2_5, _ = self.call_mcp_tool(
+            "chat",
+            {
+                "prompt": "Simple test: What is 4 + 4? Just give a brief answer.",
+                "model": "o3-pro",
+                "temperature": 1.0,
+            },
+        )
+
+        if not response2_5:
+            self.logger.error("  ❌ O3-pro model test via OpenRouter failed")
+            return False
+
+        self.logger.info("  ✅ O3-pro model call via OpenRouter completed")
+
         # Test 3: Codereview with O3 via OpenRouter
         self.logger.info("  3: Testing O3 with codereview tool via OpenRouter")
 
@@ -325,8 +361,8 @@ def multiply(x, y):
         self.logger.info(f"   OpenRouter response logs: {len(openrouter_response_logs)}")
 
         # Success criteria for OpenRouter
-        openrouter_used = len(openrouter_api_logs) >= 3 or len(openrouter_model_logs) >= 3
-        all_calls_succeeded = response1 and response2 and response3
+        openrouter_used = len(openrouter_api_logs) >= 4 or len(openrouter_model_logs) >= 4
+        all_calls_succeeded = response1 and response2 and response2_5 and response3
 
         success_criteria = [
             ("All O3 model calls succeeded", all_calls_succeeded),
diff --git a/tests/test_auto_mode.py b/tests/test_auto_mode.py
index 4bfbd58..33ef49f 100644
--- a/tests/test_auto_mode.py
+++ b/tests/test_auto_mode.py
@@ -47,7 +47,7 @@ class TestAutoMode:
         from config import MODEL_CAPABILITIES_DESC
 
         # Check all expected models are present
-        expected_models = ["flash", "pro", "o3", "o3-mini"]
+        expected_models = ["flash", "pro", "o3", "o3-mini", "o3-pro", "o4-mini", "o4-mini-high"]
         for model in expected_models:
             assert model in MODEL_CAPABILITIES_DESC
             assert isinstance(MODEL_CAPABILITIES_DESC[model], str)
@@ -225,7 +225,7 @@ class TestAutoMode:
         schema = tool.get_model_field_schema()
 
         assert "enum" in schema
-        assert all(model in schema["enum"] for model in ["flash", "pro", "o3"])
+        assert all(model in schema["enum"] for model in ["flash", "pro", "o3", "o3-mini", "o3-pro", "o4-mini", "o4-mini-high"])
         assert "select the most suitable model" in schema["description"]
 
         # Test normal mode

From a3aaf6f79bdbdbda8d84e859a66ddd04a2826698 Mon Sep 17 00:00:00 2001
From: Lachlan Donald
Date: Sat, 14 Jun 2025 13:51:37 +1000
Subject: [PATCH 2/4] Enhance o3-pro test coverage with comprehensive codereview testing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Added o3-pro codereview tests for both direct OpenAI and OpenRouter paths
- Updated validation criteria to account for additional test cases (5 total calls)
- Addresses Gemini Code Assist feedback about incomplete test coverage
- Ensures o3-pro functionality is thoroughly validated across all tools

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 simulator_tests/test_o3_model_selection.py | 68 +++++++++++++++++++---
 1 file changed, 61 insertions(+), 7 deletions(-)

diff --git a/simulator_tests/test_o3_model_selection.py b/simulator_tests/test_o3_model_selection.py
index d7164bd..089f13f 100644
--- a/simulator_tests/test_o3_model_selection.py
+++ b/simulator_tests/test_o3_model_selection.py
@@ -143,6 +143,33 @@ class O3ModelSelectionTest(BaseSimulatorTest):
 
         self.logger.info("  ✅ O3-pro model call completed")
 
+        # Test 2.6: O3-pro with codereview tool
+        self.logger.info("  2.6: Testing O3-pro with codereview tool")
+
+        test_code_pro = """def calculate_tax(amount, rate):
+    return amount * rate
+
+def format_currency(value):
+    return f"${value:.2f}"
+"""
+        test_file_pro = self.create_additional_test_file("tax_calc.py", test_code_pro)
+
+        response2_6, _ = self.call_mcp_tool(
+            "codereview",
+            {
+                "files": [test_file_pro],
+                "prompt": "Quick review of this tax calculation code",
+                "model": "o3-pro",
+                "temperature": 1.0,  # O3-pro only supports default temperature of 1.0
+            },
+        )
+
+        if not response2_6:
+            self.logger.error("  ❌ O3-pro with codereview tool failed")
+            return False
+
+        self.logger.info("  ✅ O3-pro with codereview tool completed")
+
         # Test 3: Another tool with O3 to ensure it works across tools
         self.logger.info("  3: Testing O3 with different tool (codereview)")
 
@@ -195,12 +222,12 @@ def multiply(x, y):
             line for line in logs.split("\n") if "Sending request to openai API for codereview" in line
         ]
 
-        # Validation criteria - we expect 4 OpenAI calls (3 chat + 1 codereview)
-        openai_api_called = len(openai_api_logs) >= 4  # Should see 4 OpenAI API calls
-        openai_model_usage = len(openai_model_logs) >= 4  # Should see 4 model usage logs
-        openai_responses_received = len(openai_response_logs) >= 4  # Should see 4 responses
+        # Validation criteria - we expect 5 OpenAI calls (3 chat + 2 codereview)
+        openai_api_called = len(openai_api_logs) >= 5  # Should see 5 OpenAI API calls
+        openai_model_usage = len(openai_model_logs) >= 5  # Should see 5 model usage logs
+        openai_responses_received = len(openai_response_logs) >= 5  # Should see 5 responses
         chat_calls_to_openai = len(chat_openai_logs) >= 3  # Should see 3 chat calls (o3 + o3-mini + o3-pro)
-        codereview_calls_to_openai = len(codereview_openai_logs) >= 1  # Should see 1 codereview call
+        codereview_calls_to_openai = len(codereview_openai_logs) >= 2  # Should see 2 codereview calls (o3-pro + o3)
 
         self.logger.info(f"   OpenAI API call logs: {len(openai_api_logs)}")
         self.logger.info(f"   OpenAI model usage logs: {len(openai_model_logs)}")
@@ -308,6 +335,33 @@ def multiply(x, y):
 
         self.logger.info("  ✅ O3-pro model call via OpenRouter completed")
 
+        # Test 2.6: O3-pro with codereview tool via OpenRouter
+        self.logger.info("  2.6: Testing O3-pro with codereview tool via OpenRouter")
+
+        test_code_pro_or = """def calculate_discount(price, discount_rate):
+    return price * (1 - discount_rate)
+
+def validate_price(price):
+    return price > 0
+"""
+        test_file_pro_or = self.create_additional_test_file("discount_calc.py", test_code_pro_or)
+
+        response2_6, _ = self.call_mcp_tool(
+            "codereview",
+            {
+                "files": [test_file_pro_or],
+                "prompt": "Quick review of this discount calculation code",
+                "model": "o3-pro",
+                "temperature": 1.0,
+            },
+        )
+
+        if not response2_6:
+            self.logger.error("  ❌ O3-pro with codereview tool via OpenRouter failed")
+            return False
+
+        self.logger.info("  ✅ O3-pro with codereview tool via OpenRouter completed")
+
         # Test 3: Codereview with O3 via OpenRouter
         self.logger.info("  3: Testing O3 with codereview tool via OpenRouter")
 
@@ -361,8 +415,8 @@ def multiply(x, y):
         self.logger.info(f"   OpenRouter response logs: {len(openrouter_response_logs)}")
 
         # Success criteria for OpenRouter
-        openrouter_used = len(openrouter_api_logs) >= 4 or len(openrouter_model_logs) >= 4
-        all_calls_succeeded = response1 and response2 and response2_5 and response3
+        openrouter_used = len(openrouter_api_logs) >= 5 or len(openrouter_model_logs) >= 5
+        all_calls_succeeded = response1 and response2 and response2_5 and response2_6 and response3
 
         success_criteria = [
             ("All O3 model calls succeeded", all_calls_succeeded),

From c12dc1d765ec499a314c81793b771e3ece34ec75 Mon Sep 17 00:00:00 2001
From: Lachlan Donald
Date: Sat, 14 Jun 2025 15:50:40 +1000
Subject: [PATCH 3/4] Fix syntax error from incomplete merge conflict resolution
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove merge conflict markers from providers/openai.py
- Include o3-pro in temperature constraint check for O3/O4 models

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 providers/openai.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/providers/openai.py b/providers/openai.py
index 0672301..4e6f944 100644
--- a/providers/openai.py
+++ b/providers/openai.py
@@ -58,13 +58,8 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
         config = self.SUPPORTED_MODELS[resolved_name]
 
         # Define temperature constraints per model
-<<<<<<< HEAD
-        if resolved_name in ["o3", "o3-mini", "o4-mini", "o4-mini-high"]:
+        if resolved_name in ["o3", "o3-mini", "o3-pro", "o4-mini", "o4-mini-high"]:
             # O3 and O4 reasoning models only support temperature=1.0
-=======
-        if model_name in ["o3", "o3-mini", "o3-pro"]:
-            # O3 models only support temperature=1.0
->>>>>>> 155c4ec (Add o3-pro model support and extend test coverage)
             temp_constraint = FixedTemperatureConstraint(1.0)
         else:
             # Other OpenAI models support 0.0-2.0 range

From 40aa1eaeb6fe9eb67580badf45516e859aea317b Mon Sep 17 00:00:00 2001
From: Lachlan Donald
Date: Sat, 14 Jun 2025 21:09:47 +1000
Subject: [PATCH 4/4] Format test_auto_mode.py with black
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix code formatting to comply with black style requirements.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 tests/test_auto_mode.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/test_auto_mode.py b/tests/test_auto_mode.py
index 33ef49f..3a2be41 100644
--- a/tests/test_auto_mode.py
+++ b/tests/test_auto_mode.py
@@ -225,7 +225,10 @@ class TestAutoMode:
         schema = tool.get_model_field_schema()
 
         assert "enum" in schema
-        assert all(model in schema["enum"] for model in ["flash", "pro", "o3", "o3-mini", "o3-pro", "o4-mini", "o4-mini-high"])
+        assert all(
+            model in schema["enum"]
+            for model in ["flash", "pro", "o3", "o3-mini", "o3-pro", "o4-mini", "o4-mini-high"]
+        )
         assert "select the most suitable model" in schema["description"]
 
         # Test normal mode
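
A quick end-state check for this series, sketched below as a reviewer note rather than part of the patches. It assumes OpenAIModelProvider accepts an api_key argument, exposes a get_capabilities(model_name) accessor, and that FixedTemperatureConstraint can be imported from providers.base; the diffs above only confirm the SUPPORTED_MODELS entry and the FixedTemperatureConstraint(1.0) branch, so the other names are assumptions.

# sketch.py - illustrative only; items marked "assumed" are not confirmed by the diffs above
from providers.base import FixedTemperatureConstraint  # assumed import path
from providers.openai import OpenAIModelProvider


def check_o3_pro_end_state() -> None:
    provider = OpenAIModelProvider(api_key="sk-test")  # assumed constructor signature

    # Confirmed by PATCH 1: o3-pro is registered with a 200K-token context
    # window and no extended-thinking support.
    entry = provider.SUPPORTED_MODELS["o3-pro"]
    assert entry["context_window"] == 200_000
    assert entry["supports_extended_thinking"] is False

    # Expected after PATCH 3: o3-pro falls into the fixed temperature=1.0
    # branch alongside the other O3/O4 reasoning models.
    capabilities = provider.get_capabilities("o3-pro")  # assumed accessor
    assert isinstance(capabilities.temperature_constraint, FixedTemperatureConstraint)  # assumed attribute


if __name__ == "__main__":
    check_o3_pro_end_state()
    print("o3-pro wiring is consistent with the series")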