Move o3-pro test into its own file
This commit is contained in:
@@ -125,51 +125,6 @@ class O3ModelSelectionTest(BaseSimulatorTest):
|
||||
|
||||
self.logger.info(" ✅ O3-mini model call completed")
|
||||
|
||||
# Test 2.5: Explicit O3-pro model selection
|
||||
self.logger.info(" 2.5: Testing explicit O3-pro model selection")
|
||||
|
||||
response2_5, _ = self.call_mcp_tool(
|
||||
"chat",
|
||||
{
|
||||
"prompt": "Simple test: What is 4 + 4? Just give a brief answer.",
|
||||
"model": "o3-pro",
|
||||
"temperature": 1.0, # O3-pro only supports default temperature of 1.0
|
||||
},
|
||||
)
|
||||
|
||||
if not response2_5:
|
||||
self.logger.error(" ❌ O3-pro model test failed")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ O3-pro model call completed")
|
||||
|
||||
# Test 2.6: O3-pro with codereview tool
|
||||
self.logger.info(" 2.6: Testing O3-pro with codereview tool")
|
||||
|
||||
test_code_pro = """def calculate_tax(amount, rate):
|
||||
return amount * rate
|
||||
|
||||
def format_currency(value):
|
||||
return f"${value:.2f}"
|
||||
"""
|
||||
test_file_pro = self.create_additional_test_file("tax_calc.py", test_code_pro)
|
||||
|
||||
response2_6, _ = self.call_mcp_tool(
|
||||
"codereview",
|
||||
{
|
||||
"files": [test_file_pro],
|
||||
"prompt": "Quick review of this tax calculation code",
|
||||
"model": "o3-pro",
|
||||
"temperature": 1.0, # O3-pro only supports default temperature of 1.0
|
||||
},
|
||||
)
|
||||
|
||||
if not response2_6:
|
||||
self.logger.error(" ❌ O3-pro with codereview tool failed")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ O3-pro with codereview tool completed")
|
||||
|
||||
# Test 3: Another tool with O3 to ensure it works across tools
|
||||
self.logger.info(" 3: Testing O3 with different tool (codereview)")
|
||||
|
||||
@@ -222,12 +177,12 @@ def multiply(x, y):
|
||||
line for line in logs.split("\n") if "Sending request to openai API for codereview" in line
|
||||
]
|
||||
|
||||
# Validation criteria - we expect 5 OpenAI calls (3 chat + 2 codereview)
|
||||
openai_api_called = len(openai_api_logs) >= 5 # Should see 5 OpenAI API calls
|
||||
openai_model_usage = len(openai_model_logs) >= 5 # Should see 5 model usage logs
|
||||
openai_responses_received = len(openai_response_logs) >= 5 # Should see 5 responses
|
||||
chat_calls_to_openai = len(chat_openai_logs) >= 3 # Should see 3 chat calls (o3 + o3-mini + o3-pro)
|
||||
codereview_calls_to_openai = len(codereview_openai_logs) >= 2 # Should see 2 codereview calls (o3-pro + o3)
|
||||
# Validation criteria - we expect 3 OpenAI calls (2 chat + 1 codereview)
|
||||
openai_api_called = len(openai_api_logs) >= 3 # Should see 3 OpenAI API calls
|
||||
openai_model_usage = len(openai_model_logs) >= 3 # Should see 3 model usage logs
|
||||
openai_responses_received = len(openai_response_logs) >= 3 # Should see 3 responses
|
||||
chat_calls_to_openai = len(chat_openai_logs) >= 2 # Should see 2 chat calls (o3 + o3-mini)
|
||||
codereview_calls_to_openai = len(codereview_openai_logs) >= 1 # Should see 1 codereview call (o3)
|
||||
|
||||
self.logger.info(f" OpenAI API call logs: {len(openai_api_logs)}")
|
||||
self.logger.info(f" OpenAI model usage logs: {len(openai_model_logs)}")
|
||||
@@ -317,51 +272,6 @@ def multiply(x, y):
|
||||
|
||||
self.logger.info(" ✅ O3-mini model call via OpenRouter completed")
|
||||
|
||||
# Test 2.5: O3-pro model via OpenRouter
|
||||
self.logger.info(" 2.5: Testing O3-pro model via OpenRouter")
|
||||
|
||||
response2_5, _ = self.call_mcp_tool(
|
||||
"chat",
|
||||
{
|
||||
"prompt": "Simple test: What is 4 + 4? Just give a brief answer.",
|
||||
"model": "o3-pro",
|
||||
"temperature": 1.0,
|
||||
},
|
||||
)
|
||||
|
||||
if not response2_5:
|
||||
self.logger.error(" ❌ O3-pro model test via OpenRouter failed")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ O3-pro model call via OpenRouter completed")
|
||||
|
||||
# Test 2.6: O3-pro with codereview tool via OpenRouter
|
||||
self.logger.info(" 2.6: Testing O3-pro with codereview tool via OpenRouter")
|
||||
|
||||
test_code_pro_or = """def calculate_discount(price, discount_rate):
|
||||
return price * (1 - discount_rate)
|
||||
|
||||
def validate_price(price):
|
||||
return price > 0
|
||||
"""
|
||||
test_file_pro_or = self.create_additional_test_file("discount_calc.py", test_code_pro_or)
|
||||
|
||||
response2_6, _ = self.call_mcp_tool(
|
||||
"codereview",
|
||||
{
|
||||
"files": [test_file_pro_or],
|
||||
"prompt": "Quick review of this discount calculation code",
|
||||
"model": "o3-pro",
|
||||
"temperature": 1.0,
|
||||
},
|
||||
)
|
||||
|
||||
if not response2_6:
|
||||
self.logger.error(" ❌ O3-pro with codereview tool via OpenRouter failed")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ O3-pro with codereview tool via OpenRouter completed")
|
||||
|
||||
# Test 3: Codereview with O3 via OpenRouter
|
||||
self.logger.info(" 3: Testing O3 with codereview tool via OpenRouter")
|
||||
|
||||
@@ -415,8 +325,8 @@ def multiply(x, y):
|
||||
self.logger.info(f" OpenRouter response logs: {len(openrouter_response_logs)}")
|
||||
|
||||
# Success criteria for OpenRouter
|
||||
openrouter_used = len(openrouter_api_logs) >= 5 or len(openrouter_model_logs) >= 5
|
||||
all_calls_succeeded = response1 and response2 and response2_5 and response2_6 and response3
|
||||
openrouter_used = len(openrouter_api_logs) >= 3 or len(openrouter_model_logs) >= 3
|
||||
all_calls_succeeded = response1 and response2 and response3
|
||||
|
||||
success_criteria = [
|
||||
("All O3 model calls succeeded", all_calls_succeeded),
|
||||
|
||||
88
simulator_tests/test_o3_pro_expensive.py
Normal file
88
simulator_tests/test_o3_pro_expensive.py
Normal file
@@ -0,0 +1,88 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
O3-Pro Expensive Model Test
|
||||
|
||||
⚠️ WARNING: This test uses o3-pro which is EXTREMELY EXPENSIVE! ⚠️
|
||||
|
||||
This test is intentionally NOT added to TEST_REGISTRY to prevent accidental execution.
|
||||
It can only be run manually using:
|
||||
python communication_simulator_test.py --individual o3_pro_expensive
|
||||
|
||||
Tests that o3-pro model works with one simple chat call. That's it.
|
||||
"""
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
|
||||
|
||||
class O3ProExpensiveTest(BaseSimulatorTest):
    """Minimal, manual-only check that the o3-pro model answers one chat call.

    Deliberately kept out of TEST_REGISTRY because o3-pro is extremely
    expensive; it is meant to be run only via the simulator's
    ``--individual o3_pro_expensive`` entry point.
    """

    @property
    def test_name(self) -> str:
        # Key the simulator runner uses to select this test individually.
        return "o3_pro_expensive"

    @property
    def test_description(self) -> str:
        # Human-readable label shown in test listings.
        return "⚠️ EXPENSIVE O3-Pro basic validation (manual only)"

    def run_test(self) -> bool:
        """Issue a single o3-pro chat request; return True on a non-empty response."""
        try:
            self.logger.warning("⚠️ ⚠️ ⚠️ EXPENSIVE TEST - O3-PRO COSTS ~$15-60 PER 1K TOKENS! ⚠️ ⚠️ ⚠️")
            self.logger.info("Test: O3-Pro basic chat test")

            # Single minimal request; o3-pro only supports its default
            # temperature of 1.0, so that is what we send.
            request_args = {
                "prompt": "What is 2 + 2?",
                "model": "o3-pro",
                "temperature": 1.0,
            }
            reply, _ = self.call_mcp_tool("chat", request_args)

            if not reply:
                self.logger.error("❌ O3-Pro chat call failed")
                return False

            self.logger.info("✅ O3-Pro chat call succeeded")
            self.logger.warning("💰 Test completed - check your billing!")
            return True
        except Exception as exc:
            # Broad catch is intentional: any harness failure should report
            # False to the simulator instead of crashing the run.
            self.logger.error(f"O3-Pro test failed: {exc}")
            return False
|
||||
|
||||
|
||||
def main():
    """Interactive entry point: confirm the cost, then run the o3-pro test once."""
    import sys

    # Up-front cost warning before anything else happens.
    print("⚠️ ⚠️ ⚠️ WARNING: This test uses O3-PRO which is EXTREMELY EXPENSIVE! ⚠️ ⚠️ ⚠️")
    print("O3-Pro can cost $15-60 per 1K tokens!")
    print("This is a MINIMAL test but may still cost $5-15!")
    print()

    # Require an exact confirmation phrase so the test cannot start by accident.
    confirmation = input("Are you absolutely sure you want to run this expensive test? Type 'YES_I_UNDERSTAND_THE_COST': ")
    if confirmation != "YES_I_UNDERSTAND_THE_COST":
        print("❌ Test cancelled")
        sys.exit(1)

    print("💰 Running minimal O3-Pro test...")

    verbose = any(flag in sys.argv for flag in ("--verbose", "-v"))
    runner = O3ProExpensiveTest(verbose=verbose)
    ok = runner.run_test()

    if ok:
        print("✅ O3-Pro test completed successfully")
        print("💰 Don't forget to check your billing!")
    else:
        print("❌ O3-Pro test failed")

    # Exit status mirrors the test outcome for shell integration.
    sys.exit(0 if ok else 1)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user