Move o3-pro test into its own standalone file

Author: Fahad
Date:   2025-06-14 19:53:33 +04:00
parent 9f973b90e5
commit b2489409eb
2 changed files with 96 additions and 98 deletions

@@ -125,51 +125,6 @@ class O3ModelSelectionTest(BaseSimulatorTest):
self.logger.info(" ✅ O3-mini model call completed")
# Test 2.5: Explicit O3-pro model selection
self.logger.info(" 2.5: Testing explicit O3-pro model selection")
response2_5, _ = self.call_mcp_tool(
"chat",
{
"prompt": "Simple test: What is 4 + 4? Just give a brief answer.",
"model": "o3-pro",
"temperature": 1.0, # O3-pro only supports default temperature of 1.0
},
)
if not response2_5:
self.logger.error(" ❌ O3-pro model test failed")
return False
self.logger.info(" ✅ O3-pro model call completed")
# Test 2.6: O3-pro with codereview tool
self.logger.info(" 2.6: Testing O3-pro with codereview tool")
test_code_pro = """def calculate_tax(amount, rate):
return amount * rate
def format_currency(value):
return f"${value:.2f}"
"""
test_file_pro = self.create_additional_test_file("tax_calc.py", test_code_pro)
response2_6, _ = self.call_mcp_tool(
"codereview",
{
"files": [test_file_pro],
"prompt": "Quick review of this tax calculation code",
"model": "o3-pro",
"temperature": 1.0, # O3-pro only supports default temperature of 1.0
},
)
if not response2_6:
self.logger.error(" ❌ O3-pro with codereview tool failed")
return False
self.logger.info(" ✅ O3-pro with codereview tool completed")
# Test 3: Another tool with O3 to ensure it works across tools
self.logger.info(" 3: Testing O3 with different tool (codereview)")
@@ -222,12 +177,12 @@ def multiply(x, y):
                 line for line in logs.split("\n") if "Sending request to openai API for codereview" in line
             ]

-            # Validation criteria - we expect 5 OpenAI calls (3 chat + 2 codereview)
-            openai_api_called = len(openai_api_logs) >= 5  # Should see 5 OpenAI API calls
-            openai_model_usage = len(openai_model_logs) >= 5  # Should see 5 model usage logs
-            openai_responses_received = len(openai_response_logs) >= 5  # Should see 5 responses
-            chat_calls_to_openai = len(chat_openai_logs) >= 3  # Should see 3 chat calls (o3 + o3-mini + o3-pro)
-            codereview_calls_to_openai = len(codereview_openai_logs) >= 2  # Should see 2 codereview calls (o3-pro + o3)
+            # Validation criteria - we expect 3 OpenAI calls (2 chat + 1 codereview)
+            openai_api_called = len(openai_api_logs) >= 3  # Should see 3 OpenAI API calls
+            openai_model_usage = len(openai_model_logs) >= 3  # Should see 3 model usage logs
+            openai_responses_received = len(openai_response_logs) >= 3  # Should see 3 responses
+            chat_calls_to_openai = len(chat_openai_logs) >= 2  # Should see 2 chat calls (o3 + o3-mini)
+            codereview_calls_to_openai = len(codereview_openai_logs) >= 1  # Should see 1 codereview call (o3)

             self.logger.info(f" OpenAI API call logs: {len(openai_api_logs)}")
             self.logger.info(f" OpenAI model usage logs: {len(openai_model_logs)}")
@@ -317,51 +272,6 @@ def multiply(x, y):
self.logger.info(" ✅ O3-mini model call via OpenRouter completed")
# Test 2.5: O3-pro model via OpenRouter
self.logger.info(" 2.5: Testing O3-pro model via OpenRouter")
response2_5, _ = self.call_mcp_tool(
"chat",
{
"prompt": "Simple test: What is 4 + 4? Just give a brief answer.",
"model": "o3-pro",
"temperature": 1.0,
},
)
if not response2_5:
self.logger.error(" ❌ O3-pro model test via OpenRouter failed")
return False
self.logger.info(" ✅ O3-pro model call via OpenRouter completed")
# Test 2.6: O3-pro with codereview tool via OpenRouter
self.logger.info(" 2.6: Testing O3-pro with codereview tool via OpenRouter")
test_code_pro_or = """def calculate_discount(price, discount_rate):
return price * (1 - discount_rate)
def validate_price(price):
return price > 0
"""
test_file_pro_or = self.create_additional_test_file("discount_calc.py", test_code_pro_or)
response2_6, _ = self.call_mcp_tool(
"codereview",
{
"files": [test_file_pro_or],
"prompt": "Quick review of this discount calculation code",
"model": "o3-pro",
"temperature": 1.0,
},
)
if not response2_6:
self.logger.error(" ❌ O3-pro with codereview tool via OpenRouter failed")
return False
self.logger.info(" ✅ O3-pro with codereview tool via OpenRouter completed")
# Test 3: Codereview with O3 via OpenRouter
self.logger.info(" 3: Testing O3 with codereview tool via OpenRouter")
@@ -415,8 +325,8 @@ def multiply(x, y):
self.logger.info(f" OpenRouter response logs: {len(openrouter_response_logs)}")
# Success criteria for OpenRouter
openrouter_used = len(openrouter_api_logs) >= 5 or len(openrouter_model_logs) >= 5
all_calls_succeeded = response1 and response2 and response2_5 and response2_6 and response3
openrouter_used = len(openrouter_api_logs) >= 3 or len(openrouter_model_logs) >= 3
all_calls_succeeded = response1 and response2 and response3
success_criteria = [
("All O3 model calls succeeded", all_calls_succeeded),

@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+"""
+O3-Pro Expensive Model Test
+
+⚠️ WARNING: This test uses o3-pro which is EXTREMELY EXPENSIVE! ⚠️
+
+This test is intentionally NOT added to TEST_REGISTRY to prevent accidental execution.
+It can only be run manually using:
+
+    python communication_simulator_test.py --individual o3_pro_expensive
+
+Tests that o3-pro model works with one simple chat call. That's it.
+"""
+
+from .base_test import BaseSimulatorTest
+
+
+class O3ProExpensiveTest(BaseSimulatorTest):
+    """Test o3-pro model basic functionality - EXPENSIVE, manual only"""
+
+    @property
+    def test_name(self) -> str:
+        return "o3_pro_expensive"
+
+    @property
+    def test_description(self) -> str:
+        return "⚠️ EXPENSIVE O3-Pro basic validation (manual only)"
+
+    def run_test(self) -> bool:
+        """Test o3-pro model with one simple chat call - EXPENSIVE!"""
+        try:
+            self.logger.warning("⚠️ ⚠️ ⚠️ EXPENSIVE TEST - O3-PRO COSTS ~$15-60 PER 1K TOKENS! ⚠️ ⚠️ ⚠️")
+            self.logger.info("Test: O3-Pro basic chat test")
+
+            # One simple chat call
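+            # call_mcp_tool returns a two-element tuple; the second element is
+            # unused here, and a falsy response is treated as failure below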
+            response, _ = self.call_mcp_tool(
+                "chat",
+                {
+                    "prompt": "What is 2 + 2?",
+                    "model": "o3-pro",
+                    "temperature": 1.0,
+                },
+            )
+
+            if response:
+                self.logger.info("✅ O3-Pro chat call succeeded")
+                self.logger.warning("💰 Test completed - check your billing!")
+                return True
+            else:
+                self.logger.error("❌ O3-Pro chat call failed")
+                return False
+
+        except Exception as e:
+            self.logger.error(f"O3-Pro test failed: {e}")
+            return False
+
+
+def main():
+    """Run the O3-Pro expensive test"""
+    import sys
+
+    print("⚠️ ⚠️ ⚠️ WARNING: This test uses O3-PRO which is EXTREMELY EXPENSIVE! ⚠️ ⚠️ ⚠️")
+    print("O3-Pro can cost $15-60 per 1K tokens!")
+    print("This is a MINIMAL test but may still cost $5-15!")
+    print()
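+    # Deliberate friction: proceed only if the operator types the exact
+    # confirmation string, so the test cannot be started by accident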
+    response = input("Are you absolutely sure you want to run this expensive test? Type 'YES_I_UNDERSTAND_THE_COST': ")
+    if response != "YES_I_UNDERSTAND_THE_COST":
+        print("❌ Test cancelled")
+        sys.exit(1)
+
+    print("💰 Running minimal O3-Pro test...")
+
+    verbose = "--verbose" in sys.argv or "-v" in sys.argv
+    test = O3ProExpensiveTest(verbose=verbose)
+    success = test.run_test()
+
+    if success:
+        print("✅ O3-Pro test completed successfully")
+        print("💰 Don't forget to check your billing!")
+    else:
+        print("❌ O3-Pro test failed")
+
+    sys.exit(0 if success else 1)
+
+
+if __name__ == "__main__":
+    main()
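
Note: since the new module uses a relative import (from .base_test import BaseSimulatorTest), running the file directly as a script would presumably fail with an ImportError unless executed with package context (e.g. via python -m). The supported entry point is the command named in the docstring:

    python communication_simulator_test.py --individual o3_pro_expensive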