diff --git a/simulator_tests/test_o3_model_selection.py b/simulator_tests/test_o3_model_selection.py
index 089f13f..035e262 100644
--- a/simulator_tests/test_o3_model_selection.py
+++ b/simulator_tests/test_o3_model_selection.py
@@ -125,51 +125,6 @@ class O3ModelSelectionTest(BaseSimulatorTest):
 
         self.logger.info(" ✅ O3-mini model call completed")
 
-        # Test 2.5: Explicit O3-pro model selection
-        self.logger.info(" 2.5: Testing explicit O3-pro model selection")
-
-        response2_5, _ = self.call_mcp_tool(
-            "chat",
-            {
-                "prompt": "Simple test: What is 4 + 4? Just give a brief answer.",
-                "model": "o3-pro",
-                "temperature": 1.0,  # O3-pro only supports default temperature of 1.0
-            },
-        )
-
-        if not response2_5:
-            self.logger.error(" ❌ O3-pro model test failed")
-            return False
-
-        self.logger.info(" ✅ O3-pro model call completed")
-
-        # Test 2.6: O3-pro with codereview tool
-        self.logger.info(" 2.6: Testing O3-pro with codereview tool")
-
-        test_code_pro = """def calculate_tax(amount, rate):
-    return amount * rate
-
-def format_currency(value):
-    return f"${value:.2f}"
-"""
-        test_file_pro = self.create_additional_test_file("tax_calc.py", test_code_pro)
-
-        response2_6, _ = self.call_mcp_tool(
-            "codereview",
-            {
-                "files": [test_file_pro],
-                "prompt": "Quick review of this tax calculation code",
-                "model": "o3-pro",
-                "temperature": 1.0,  # O3-pro only supports default temperature of 1.0
-            },
-        )
-
-        if not response2_6:
-            self.logger.error(" ❌ O3-pro with codereview tool failed")
-            return False
-
-        self.logger.info(" ✅ O3-pro with codereview tool completed")
-
         # Test 3: Another tool with O3 to ensure it works across tools
         self.logger.info(" 3: Testing O3 with different tool (codereview)")
 
@@ -222,12 +177,12 @@ def multiply(x, y):
             line for line in logs.split("\n") if "Sending request to openai API for codereview" in line
         ]
 
-        # Validation criteria - we expect 5 OpenAI calls (3 chat + 2 codereview)
-        openai_api_called = len(openai_api_logs) >= 5  # Should see 5 OpenAI API calls
-        openai_model_usage = len(openai_model_logs) >= 5  # Should see 5 model usage logs
-        openai_responses_received = len(openai_response_logs) >= 5  # Should see 5 responses
-        chat_calls_to_openai = len(chat_openai_logs) >= 3  # Should see 3 chat calls (o3 + o3-mini + o3-pro)
-        codereview_calls_to_openai = len(codereview_openai_logs) >= 2  # Should see 2 codereview calls (o3-pro + o3)
+        # Validation criteria - we expect 3 OpenAI calls (2 chat + 1 codereview)
+        openai_api_called = len(openai_api_logs) >= 3  # Should see 3 OpenAI API calls
+        openai_model_usage = len(openai_model_logs) >= 3  # Should see 3 model usage logs
+        openai_responses_received = len(openai_response_logs) >= 3  # Should see 3 responses
+        chat_calls_to_openai = len(chat_openai_logs) >= 2  # Should see 2 chat calls (o3 + o3-mini)
+        codereview_calls_to_openai = len(codereview_openai_logs) >= 1  # Should see 1 codereview call (o3)
 
         self.logger.info(f" OpenAI API call logs: {len(openai_api_logs)}")
         self.logger.info(f" OpenAI model usage logs: {len(openai_model_logs)}")
@@ -317,51 +272,6 @@ def multiply(x, y):
 
         self.logger.info(" ✅ O3-mini model call via OpenRouter completed")
 
-        # Test 2.5: O3-pro model via OpenRouter
-        self.logger.info(" 2.5: Testing O3-pro model via OpenRouter")
-
-        response2_5, _ = self.call_mcp_tool(
-            "chat",
-            {
-                "prompt": "Simple test: What is 4 + 4? Just give a brief answer.",
-                "model": "o3-pro",
-                "temperature": 1.0,
-            },
-        )
-
-        if not response2_5:
-            self.logger.error(" ❌ O3-pro model test via OpenRouter failed")
-            return False
-
-        self.logger.info(" ✅ O3-pro model call via OpenRouter completed")
-
-        # Test 2.6: O3-pro with codereview tool via OpenRouter
-        self.logger.info(" 2.6: Testing O3-pro with codereview tool via OpenRouter")
-
-        test_code_pro_or = """def calculate_discount(price, discount_rate):
-    return price * (1 - discount_rate)
-
-def validate_price(price):
-    return price > 0
-"""
-        test_file_pro_or = self.create_additional_test_file("discount_calc.py", test_code_pro_or)
-
-        response2_6, _ = self.call_mcp_tool(
-            "codereview",
-            {
-                "files": [test_file_pro_or],
-                "prompt": "Quick review of this discount calculation code",
-                "model": "o3-pro",
-                "temperature": 1.0,
-            },
-        )
-
-        if not response2_6:
-            self.logger.error(" ❌ O3-pro with codereview tool via OpenRouter failed")
-            return False
-
-        self.logger.info(" ✅ O3-pro with codereview tool via OpenRouter completed")
-
         # Test 3: Codereview with O3 via OpenRouter
         self.logger.info(" 3: Testing O3 with codereview tool via OpenRouter")
 
@@ -415,8 +325,8 @@ def multiply(x, y):
         self.logger.info(f" OpenRouter response logs: {len(openrouter_response_logs)}")
 
         # Success criteria for OpenRouter
-        openrouter_used = len(openrouter_api_logs) >= 5 or len(openrouter_model_logs) >= 5
-        all_calls_succeeded = response1 and response2 and response2_5 and response2_6 and response3
+        openrouter_used = len(openrouter_api_logs) >= 3 or len(openrouter_model_logs) >= 3
+        all_calls_succeeded = response1 and response2 and response3
 
         success_criteria = [
             ("All O3 model calls succeeded", all_calls_succeeded),
diff --git a/simulator_tests/test_o3_pro_expensive.py b/simulator_tests/test_o3_pro_expensive.py
new file mode 100644
index 0000000..78cb7fa
--- /dev/null
+++ b/simulator_tests/test_o3_pro_expensive.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+"""
+O3-Pro Expensive Model Test
+
+⚠️ WARNING: This test uses o3-pro which is EXTREMELY EXPENSIVE! ⚠️
+
+This test is intentionally NOT added to TEST_REGISTRY to prevent accidental execution.
+It can only be run manually using:
+    python communication_simulator_test.py --individual o3_pro_expensive
+
+Tests that o3-pro model works with one simple chat call. That's it.
+"""
+
+from .base_test import BaseSimulatorTest
+
+
+class O3ProExpensiveTest(BaseSimulatorTest):
+    """Test o3-pro model basic functionality - EXPENSIVE, manual only"""
+
+    @property
+    def test_name(self) -> str:
+        return "o3_pro_expensive"
+
+    @property
+    def test_description(self) -> str:
+        return "⚠️ EXPENSIVE O3-Pro basic validation (manual only)"
+
+    def run_test(self) -> bool:
+        """Test o3-pro model with one simple chat call - EXPENSIVE!"""
+        try:
+            self.logger.warning("⚠️ ⚠️ ⚠️ EXPENSIVE TEST - O3-PRO COSTS ~$15-60 PER 1K TOKENS! ⚠️ ⚠️ ⚠️")
+            self.logger.info("Test: O3-Pro basic chat test")
+
+            # One simple chat call
+            response, _ = self.call_mcp_tool(
+                "chat",
+                {
+                    "prompt": "What is 2 + 2?",
+                    "model": "o3-pro",
+                    "temperature": 1.0,
+                },
+            )
+
+            if response:
+                self.logger.info("✅ O3-Pro chat call succeeded")
+                self.logger.warning("💰 Test completed - check your billing!")
+                return True
+            else:
+                self.logger.error("❌ O3-Pro chat call failed")
+                return False
+
+        except Exception as e:
+            self.logger.error(f"O3-Pro test failed: {e}")
+            return False
+
+
+def main():
+    """Run the O3-Pro expensive test"""
+    import sys
+
+    print("⚠️ ⚠️ ⚠️ WARNING: This test uses O3-PRO which is EXTREMELY EXPENSIVE! ⚠️ ⚠️ ⚠️")
+    print("O3-Pro can cost $15-60 per 1K tokens!")
+    print("This is a MINIMAL test but may still cost $5-15!")
+    print()
+
+    response = input("Are you absolutely sure you want to run this expensive test? Type 'YES_I_UNDERSTAND_THE_COST': ")
+    if response != "YES_I_UNDERSTAND_THE_COST":
+        print("❌ Test cancelled")
+        sys.exit(1)
+
+    print("💰 Running minimal O3-Pro test...")
+
+    verbose = "--verbose" in sys.argv or "-v" in sys.argv
+    test = O3ProExpensiveTest(verbose=verbose)
+
+    success = test.run_test()
+
+    if success:
+        print("✅ O3-Pro test completed successfully")
+        print("💰 Don't forget to check your billing!")
+    else:
+        print("❌ O3-Pro test failed")
+
+    sys.exit(0 if success else 1)
+
+
+if __name__ == "__main__":
+    main()