my-pal-mcp-server/simulator_tests/test_o3_model_selection.py

#!/usr/bin/env python3
"""
O3 Model Selection Test
Tests that O3 models are properly selected and used when explicitly specified,
regardless of the default model configuration (even when set to auto).
Validates model selection via Docker logs.
"""

import datetime
import subprocess

from .base_test import BaseSimulatorTest


class O3ModelSelectionTest(BaseSimulatorTest):
    """Test O3 model selection and usage"""

    @property
    def test_name(self) -> str:
        return "o3_model_selection"

    @property
    def test_description(self) -> str:
        return "O3 model selection and usage validation"

    def get_recent_server_logs(self) -> str:
        """Get recent server logs from the log file directly"""
        try:
            # Read logs directly from the log file - use more lines to ensure we get all test-related logs
            cmd = ["docker", "exec", self.container_name, "tail", "-n", "500", "/tmp/mcp_server.log"]
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode == 0:
                return result.stdout
            else:
                self.logger.warning(f"Failed to read server logs: {result.stderr}")
                return ""
        except Exception as e:
            self.logger.error(f"Failed to get server logs: {e}")
            return ""

    def run_test(self) -> bool:
        """Test O3 model selection and usage"""
        try:
            self.logger.info(" Test: O3 model selection and usage validation")

            # Check which API keys are configured
            check_cmd = [
                "docker",
                "exec",
                self.container_name,
                "python",
                "-c",
                'import os; print(f\'OPENAI_KEY:{bool(os.environ.get("OPENAI_API_KEY"))}|OPENROUTER_KEY:{bool(os.environ.get("OPENROUTER_API_KEY"))}\')',
            ]
            result = subprocess.run(check_cmd, capture_output=True, text=True)

            has_openai = False
            has_openrouter = False
            if result.returncode == 0:
                output = result.stdout.strip()
                if "OPENAI_KEY:True" in output:
                    has_openai = True
                if "OPENROUTER_KEY:True" in output:
                    has_openrouter = True

            # If only OpenRouter is configured, adjust test expectations
            if has_openrouter and not has_openai:
                self.logger.info(" Only OpenRouter configured - O3 models will be routed through OpenRouter")
                return self._run_openrouter_o3_test()

            # If neither OpenAI nor OpenRouter is configured, skip the test
            if not has_openai and not has_openrouter:
                self.logger.info(" ⚠️ Neither OpenAI nor OpenRouter API keys configured - skipping test")
                self.logger.info(
                    " This test requires either OPENAI_API_KEY or OPENROUTER_API_KEY to be set in .env"
                )
                self.logger.info(" ✅ Test skipped (no API keys configured)")
                return True  # Return True to indicate test passed/skipped

            # Original test for when OpenAI is configured
            self.logger.info(" OpenAI API configured - expecting direct OpenAI API calls")

            # Setup test files for later use
            self.setup_test_files()

            # Get timestamp for log filtering (currently computed but not used)
            datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")

            # Test 1: Explicit O3 model selection
            self.logger.info(" 1: Testing explicit O3 model selection")
            response1, _ = self.call_mcp_tool(
                "chat",
                {
                    "prompt": "Simple test: What is 2 + 2? Just give a brief answer.",
                    "model": "o3",
                    "temperature": 1.0,  # O3 only supports default temperature of 1.0
                },
            )
            if not response1:
                self.logger.error(" ❌ O3 model test failed")
                return False
            self.logger.info(" ✅ O3 model call completed")

            # Test 2: Explicit O3-mini model selection
            self.logger.info(" 2: Testing explicit O3-mini model selection")
            response2, _ = self.call_mcp_tool(
                "chat",
                {
                    "prompt": "Simple test: What is 3 + 3? Just give a brief answer.",
                    "model": "o3-mini",
                    "temperature": 1.0,  # O3-mini only supports default temperature of 1.0
                },
            )
            if not response2:
                self.logger.error(" ❌ O3-mini model test failed")
                return False
            self.logger.info(" ✅ O3-mini model call completed")

            # Test 2.5: Explicit O3-pro model selection
            self.logger.info(" 2.5: Testing explicit O3-pro model selection")
            response2_5, _ = self.call_mcp_tool(
                "chat",
                {
                    "prompt": "Simple test: What is 4 + 4? Just give a brief answer.",
                    "model": "o3-pro",
                    "temperature": 1.0,  # O3-pro only supports default temperature of 1.0
                },
            )
            if not response2_5:
                self.logger.error(" ❌ O3-pro model test failed")
                return False
            self.logger.info(" ✅ O3-pro model call completed")

            # Test 2.6: O3-pro with codereview tool
            self.logger.info(" 2.6: Testing O3-pro with codereview tool")
            test_code_pro = """def calculate_tax(amount, rate):
    return amount * rate

def format_currency(value):
    return f"${value:.2f}"
"""
            test_file_pro = self.create_additional_test_file("tax_calc.py", test_code_pro)
            response2_6, _ = self.call_mcp_tool(
                "codereview",
                {
                    "files": [test_file_pro],
                    "prompt": "Quick review of this tax calculation code",
                    "model": "o3-pro",
                    "temperature": 1.0,  # O3-pro only supports default temperature of 1.0
                },
            )
            if not response2_6:
                self.logger.error(" ❌ O3-pro with codereview tool failed")
                return False
            self.logger.info(" ✅ O3-pro with codereview tool completed")

            # Test 3: Another tool with O3 to ensure it works across tools
            self.logger.info(" 3: Testing O3 with different tool (codereview)")
            # Create a simple test file
            test_code = """def add(a, b):
    return a + b

def multiply(x, y):
    return x * y
"""
            test_file = self.create_additional_test_file("simple_math.py", test_code)
            response3, _ = self.call_mcp_tool(
                "codereview",
                {
                    "files": [test_file],
                    "prompt": "Quick review of this simple code",
                    "model": "o3",
                    "temperature": 1.0,  # O3 only supports default temperature of 1.0
                },
            )
            if not response3:
                self.logger.error(" ❌ O3 with codereview tool failed")
                return False
            self.logger.info(" ✅ O3 with codereview tool completed")

            # Validate model usage from server logs
            self.logger.info(" 4: Validating model usage in logs")
            logs = self.get_recent_server_logs()

            # Check for OpenAI API calls (this proves O3 models are being used)
            openai_api_logs = [line for line in logs.split("\n") if "Sending request to openai API for" in line]

            # Check for OpenAI model usage logs
            openai_model_logs = [
                line for line in logs.split("\n") if "Using model:" in line and "openai provider" in line
            ]

            # Check for successful OpenAI responses
            openai_response_logs = [
                line for line in logs.split("\n") if "openai provider" in line and "Using model:" in line
            ]

            # Check that we have both chat and codereview tool calls to OpenAI
            chat_openai_logs = [line for line in logs.split("\n") if "Sending request to openai API for chat" in line]
            codereview_openai_logs = [
                line for line in logs.split("\n") if "Sending request to openai API for codereview" in line
            ]

            # Validation criteria - we expect 5 OpenAI calls (3 chat + 2 codereview)
            openai_api_called = len(openai_api_logs) >= 5  # Should see 5 OpenAI API calls
            openai_model_usage = len(openai_model_logs) >= 5  # Should see 5 model usage logs
            openai_responses_received = len(openai_response_logs) >= 5  # Should see 5 responses
            chat_calls_to_openai = len(chat_openai_logs) >= 3  # Should see 3 chat calls (o3 + o3-mini + o3-pro)
            codereview_calls_to_openai = len(codereview_openai_logs) >= 2  # Should see 2 codereview calls (o3-pro + o3)

            self.logger.info(f" OpenAI API call logs: {len(openai_api_logs)}")
            self.logger.info(f" OpenAI model usage logs: {len(openai_model_logs)}")
            self.logger.info(f" OpenAI response logs: {len(openai_response_logs)}")
            self.logger.info(f" Chat calls to OpenAI: {len(chat_openai_logs)}")
            self.logger.info(f" Codereview calls to OpenAI: {len(codereview_openai_logs)}")

            # Log sample evidence for debugging
            if self.verbose and openai_api_logs:
                self.logger.debug(" 📋 Sample OpenAI API logs:")
                for log in openai_api_logs[:5]:
                    self.logger.debug(f" {log}")
            if self.verbose and chat_openai_logs:
                self.logger.debug(" 📋 Sample chat OpenAI logs:")
                for log in chat_openai_logs[:3]:
                    self.logger.debug(f" {log}")

            # Success criteria
            success_criteria = [
                ("OpenAI API calls made", openai_api_called),
                ("OpenAI model usage logged", openai_model_usage),
                ("OpenAI responses received", openai_responses_received),
                ("Chat tool used OpenAI", chat_calls_to_openai),
                ("Codereview tool used OpenAI", codereview_calls_to_openai),
            ]
            passed_criteria = sum(1 for _, passed in success_criteria if passed)
            self.logger.info(f" Success criteria met: {passed_criteria}/{len(success_criteria)}")
            for criterion, passed in success_criteria:
status = "" if passed else ""
self.logger.info(f" {status} {criterion}")

            if passed_criteria >= 3:  # At least 3 out of 5 criteria
                self.logger.info(" ✅ O3 model selection validation passed")
                return True
            else:
                self.logger.error(" ❌ O3 model selection validation failed")
                return False
        except Exception as e:
            self.logger.error(f"O3 model selection test failed: {e}")
            return False
        finally:
            self.cleanup_test_files()

    def _run_openrouter_o3_test(self) -> bool:
        """Test O3 model selection when using OpenRouter"""
        try:
            # Setup test files
            self.setup_test_files()

            # Test 1: O3 model via OpenRouter
            self.logger.info(" 1: Testing O3 model via OpenRouter")
            response1, _ = self.call_mcp_tool(
                "chat",
                {
                    "prompt": "Simple test: What is 2 + 2? Just give a brief answer.",
                    "model": "o3",
                    "temperature": 1.0,
                },
            )
            if not response1:
                self.logger.error(" ❌ O3 model test via OpenRouter failed")
                return False
            self.logger.info(" ✅ O3 model call via OpenRouter completed")

            # Test 2: O3-mini model via OpenRouter
            self.logger.info(" 2: Testing O3-mini model via OpenRouter")
            response2, _ = self.call_mcp_tool(
                "chat",
                {
                    "prompt": "Simple test: What is 3 + 3? Just give a brief answer.",
                    "model": "o3-mini",
                    "temperature": 1.0,
                },
            )
            if not response2:
                self.logger.error(" ❌ O3-mini model test via OpenRouter failed")
                return False
            self.logger.info(" ✅ O3-mini model call via OpenRouter completed")

            # Test 2.5: O3-pro model via OpenRouter
            self.logger.info(" 2.5: Testing O3-pro model via OpenRouter")
            response2_5, _ = self.call_mcp_tool(
                "chat",
                {
                    "prompt": "Simple test: What is 4 + 4? Just give a brief answer.",
                    "model": "o3-pro",
                    "temperature": 1.0,
                },
            )
            if not response2_5:
                self.logger.error(" ❌ O3-pro model test via OpenRouter failed")
                return False
            self.logger.info(" ✅ O3-pro model call via OpenRouter completed")

            # Test 2.6: O3-pro with codereview tool via OpenRouter
            self.logger.info(" 2.6: Testing O3-pro with codereview tool via OpenRouter")
            test_code_pro_or = """def calculate_discount(price, discount_rate):
    return price * (1 - discount_rate)

def validate_price(price):
    return price > 0
"""
            test_file_pro_or = self.create_additional_test_file("discount_calc.py", test_code_pro_or)
            response2_6, _ = self.call_mcp_tool(
                "codereview",
                {
                    "files": [test_file_pro_or],
                    "prompt": "Quick review of this discount calculation code",
                    "model": "o3-pro",
                    "temperature": 1.0,
                },
            )
            if not response2_6:
                self.logger.error(" ❌ O3-pro with codereview tool via OpenRouter failed")
                return False
            self.logger.info(" ✅ O3-pro with codereview tool via OpenRouter completed")

            # Test 3: Codereview with O3 via OpenRouter
            self.logger.info(" 3: Testing O3 with codereview tool via OpenRouter")
            test_code = """def add(a, b):
    return a + b

def multiply(x, y):
    return x * y
"""
            test_file = self.create_additional_test_file("simple_math.py", test_code)
            response3, _ = self.call_mcp_tool(
                "codereview",
                {
                    "files": [test_file],
                    "prompt": "Quick review of this simple code",
                    "model": "o3",
                    "temperature": 1.0,
                },
            )
            if not response3:
                self.logger.error(" ❌ O3 with codereview tool via OpenRouter failed")
                return False
            self.logger.info(" ✅ O3 with codereview tool via OpenRouter completed")

            # Validate OpenRouter usage in logs
            self.logger.info(" 4: Validating OpenRouter usage in logs")
            logs = self.get_recent_server_logs()

            # Check for OpenRouter API calls
            openrouter_api_logs = [
                line
                for line in logs.split("\n")
                if "openrouter" in line.lower() and ("API" in line or "request" in line)
            ]

            # Check for model resolution through OpenRouter
            openrouter_model_logs = [
                line for line in logs.split("\n") if "openrouter" in line.lower() and ("o3" in line or "model" in line)
            ]

            # Check for successful responses
            openrouter_response_logs = [
                line for line in logs.split("\n") if "openrouter" in line.lower() and "response" in line
            ]

            self.logger.info(f" OpenRouter API logs: {len(openrouter_api_logs)}")
            self.logger.info(f" OpenRouter model logs: {len(openrouter_model_logs)}")
            self.logger.info(f" OpenRouter response logs: {len(openrouter_response_logs)}")

            # Success criteria for OpenRouter
            openrouter_used = len(openrouter_api_logs) >= 5 or len(openrouter_model_logs) >= 5
            all_calls_succeeded = response1 and response2 and response2_5 and response2_6 and response3
            success_criteria = [
                ("All O3 model calls succeeded", all_calls_succeeded),
                ("OpenRouter provider was used", openrouter_used),
            ]
            passed_criteria = sum(1 for _, passed in success_criteria if passed)
            self.logger.info(f" Success criteria met: {passed_criteria}/{len(success_criteria)}")
            for criterion, passed in success_criteria:
status = "" if passed else ""
self.logger.info(f" {status} {criterion}")

            if passed_criteria == len(success_criteria):
                self.logger.info(" ✅ O3 model selection via OpenRouter passed")
                return True
            else:
                self.logger.error(" ❌ O3 model selection via OpenRouter failed")
                return False
        except Exception as e:
            self.logger.error(f"OpenRouter O3 test failed: {e}")
            return False
        finally:
            self.cleanup_test_files()


def main():
    """Run the O3 model selection tests"""
    import sys

    verbose = "--verbose" in sys.argv or "-v" in sys.argv
    test = O3ModelSelectionTest(verbose=verbose)
    success = test.run_test()
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()
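
# Example standalone invocation (a sketch: assumes this file is run as a module from
# the directory that contains simulator_tests/ so the relative import of
# BaseSimulatorTest resolves, and that the MCP server container is already running):
#
#   python -m simulator_tests.test_o3_model_selection --verbose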