Merge pull request #36 from lox/add-o3-pro-support
feat: Add o3-pro model support
@@ -148,6 +148,15 @@
       "supports_function_calling": true,
       "description": "OpenAI's o3-mini with high reasoning effort - optimized for complex problems"
     },
+    {
+      "model_name": "openai/o3-pro",
+      "aliases": ["o3-pro", "o3pro"],
+      "context_window": 200000,
+      "supports_extended_thinking": false,
+      "supports_json_mode": true,
+      "supports_function_calling": true,
+      "description": "OpenAI's o3-pro model - professional-grade reasoning and analysis"
+    },
     {
       "model_name": "openai/o4-mini",
       "aliases": ["o4-mini", "o4mini"],
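For context, resolving one of these registry entries from an alias is straightforward. The sketch below is illustrative only: the registry file name, the top-level "models" key, and resolve_model_name are assumptions, not this repo's API.

    import json

    def resolve_model_name(registry_path: str, name_or_alias: str) -> str:
        """Map a user-supplied alias (e.g. "o3pro") to its canonical model name."""
        with open(registry_path) as f:
            models = json.load(f)["models"]  # assumes a top-level "models" array
        for entry in models:
            if name_or_alias == entry["model_name"] or name_or_alias in entry.get("aliases", []):
                return entry["model_name"]
        raise ValueError(f"Unknown model: {name_or_alias}")

    # e.g. resolve_model_name("custom_models.json", "o3pro") -> "openai/o3-pro"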
@@ -50,6 +50,7 @@ MODEL_CAPABILITIES_DESC = {
     # OpenAI models - Available when OPENAI_API_KEY is configured
     "o3": "Strong reasoning (200K context) - Logical problems, code generation, systematic analysis",
     "o3-mini": "Fast O3 variant (200K context) - Balanced performance/speed, moderate complexity",
+    "o3-pro": "Professional-grade reasoning (200K context) - EXTREMELY EXPENSIVE: Only for the most complex problems requiring universe-scale complexity analysis OR when the user explicitly asks for this model. Use sparingly for critical architectural decisions or exceptionally complex debugging that other models cannot handle.",
     "o4-mini": "Latest reasoning model (200K context) - Optimized for shorter contexts, rapid reasoning",
     "o4-mini-high": "Enhanced O4 mini (200K context) - Higher reasoning effort for complex tasks",
     # Full model names also supported (for explicit specification)
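These one-line descriptions are what auto mode reads when choosing a model, which is why the o3-pro entry shouts about cost. A minimal sketch of how such a map might be rendered into a selection prompt follows; build_model_menu and the prompt shape are assumptions, not this repo's code.

    def build_model_menu(capabilities: dict[str, str]) -> str:
        """Render the capability map as a bulleted list for a model-selection prompt."""
        return "\n".join(f"- {name}: {desc}" for name, desc in capabilities.items())

    # Example with a trimmed-down map:
    print(build_model_menu({
        "o3-mini": "Fast O3 variant (200K context) - moderate complexity",
        "o3-pro": "Professional-grade reasoning (200K context) - EXTREMELY EXPENSIVE",
    }))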
@@ -26,6 +26,10 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
             "context_window": 200_000,  # 200K tokens
             "supports_extended_thinking": False,
         },
+        "o3-pro": {
+            "context_window": 200_000,  # 200K tokens
+            "supports_extended_thinking": False,
+        },
         "o4-mini": {
             "context_window": 200_000,  # 200K tokens
             "supports_extended_thinking": False,
@@ -66,7 +70,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
         config = self.SUPPORTED_MODELS[resolved_name]
 
         # Define temperature constraints per model
-        if resolved_name in ["o3", "o3-mini", "o4-mini", "o4-mini-high"]:
+        if resolved_name in ["o3", "o3-mini", "o3-pro", "o4-mini", "o4-mini-high"]:
            # O3 and O4 reasoning models only support temperature=1.0
            temp_constraint = FixedTemperatureConstraint(1.0)
        else:
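FixedTemperatureConstraint itself is defined elsewhere in the codebase; as a rough illustration, a fixed constraint could look something like the following. The validate/get_corrected_value method names are assumptions, not the repo's actual interface.

    class FixedTemperatureConstraint:
        """Sketch of a constraint that pins temperature to one value.

        O-series reasoning models reject any temperature other than the default,
        so the provider substitutes the fixed value for whatever was requested.
        """

        def __init__(self, value: float):
            self.value = value

        def validate(self, requested: float) -> bool:
            # Only the fixed value passes validation.
            return requested == self.value

        def get_corrected_value(self, requested: float) -> float:
            # Regardless of the request, return the single supported value.
            return self.value

    # FixedTemperatureConstraint(1.0).get_corrected_value(0.2)  # -> 1.0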
@@ -125,6 +125,51 @@ class O3ModelSelectionTest(BaseSimulatorTest):
 
         self.logger.info(" ✅ O3-mini model call completed")
 
+        # Test 2.5: Explicit O3-pro model selection
+        self.logger.info(" 2.5: Testing explicit O3-pro model selection")
+
+        response2_5, _ = self.call_mcp_tool(
+            "chat",
+            {
+                "prompt": "Simple test: What is 4 + 4? Just give a brief answer.",
+                "model": "o3-pro",
+                "temperature": 1.0,  # O3-pro only supports default temperature of 1.0
+            },
+        )
+
+        if not response2_5:
+            self.logger.error(" ❌ O3-pro model test failed")
+            return False
+
+        self.logger.info(" ✅ O3-pro model call completed")
+
+        # Test 2.6: O3-pro with codereview tool
+        self.logger.info(" 2.6: Testing O3-pro with codereview tool")
+
+        test_code_pro = """def calculate_tax(amount, rate):
+    return amount * rate
+
+def format_currency(value):
+    return f"${value:.2f}"
+"""
+        test_file_pro = self.create_additional_test_file("tax_calc.py", test_code_pro)
+
+        response2_6, _ = self.call_mcp_tool(
+            "codereview",
+            {
+                "files": [test_file_pro],
+                "prompt": "Quick review of this tax calculation code",
+                "model": "o3-pro",
+                "temperature": 1.0,  # O3-pro only supports default temperature of 1.0
+            },
+        )
+
+        if not response2_6:
+            self.logger.error(" ❌ O3-pro with codereview tool failed")
+            return False
+
+        self.logger.info(" ✅ O3-pro with codereview tool completed")
+
         # Test 3: Another tool with O3 to ensure it works across tools
         self.logger.info(" 3: Testing O3 with different tool (codereview)")
 
@@ -177,12 +222,12 @@ def multiply(x, y):
             line for line in logs.split("\n") if "Sending request to openai API for codereview" in line
         ]
 
-        # Validation criteria - we expect 3 OpenAI calls (2 chat + 1 codereview)
-        openai_api_called = len(openai_api_logs) >= 3  # Should see 3 OpenAI API calls
-        openai_model_usage = len(openai_model_logs) >= 3  # Should see 3 model usage logs
-        openai_responses_received = len(openai_response_logs) >= 3  # Should see 3 responses
-        chat_calls_to_openai = len(chat_openai_logs) >= 2  # Should see 2 chat calls (o3 + o3-mini)
-        codereview_calls_to_openai = len(codereview_openai_logs) >= 1  # Should see 1 codereview call
+        # Validation criteria - we expect 5 OpenAI calls (3 chat + 2 codereview)
+        openai_api_called = len(openai_api_logs) >= 5  # Should see 5 OpenAI API calls
+        openai_model_usage = len(openai_model_logs) >= 5  # Should see 5 model usage logs
+        openai_responses_received = len(openai_response_logs) >= 5  # Should see 5 responses
+        chat_calls_to_openai = len(chat_openai_logs) >= 3  # Should see 3 chat calls (o3 + o3-mini + o3-pro)
+        codereview_calls_to_openai = len(codereview_openai_logs) >= 2  # Should see 2 codereview calls (o3-pro + o3)
 
         self.logger.info(f" OpenAI API call logs: {len(openai_api_logs)}")
         self.logger.info(f" OpenAI model usage logs: {len(openai_model_logs)}")
@@ -272,6 +317,51 @@ def multiply(x, y):
 
         self.logger.info(" ✅ O3-mini model call via OpenRouter completed")
 
+        # Test 2.5: O3-pro model via OpenRouter
+        self.logger.info(" 2.5: Testing O3-pro model via OpenRouter")
+
+        response2_5, _ = self.call_mcp_tool(
+            "chat",
+            {
+                "prompt": "Simple test: What is 4 + 4? Just give a brief answer.",
+                "model": "o3-pro",
+                "temperature": 1.0,
+            },
+        )
+
+        if not response2_5:
+            self.logger.error(" ❌ O3-pro model test via OpenRouter failed")
+            return False
+
+        self.logger.info(" ✅ O3-pro model call via OpenRouter completed")
+
+        # Test 2.6: O3-pro with codereview tool via OpenRouter
+        self.logger.info(" 2.6: Testing O3-pro with codereview tool via OpenRouter")
+
+        test_code_pro_or = """def calculate_discount(price, discount_rate):
+    return price * (1 - discount_rate)
+
+def validate_price(price):
+    return price > 0
+"""
+        test_file_pro_or = self.create_additional_test_file("discount_calc.py", test_code_pro_or)
+
+        response2_6, _ = self.call_mcp_tool(
+            "codereview",
+            {
+                "files": [test_file_pro_or],
+                "prompt": "Quick review of this discount calculation code",
+                "model": "o3-pro",
+                "temperature": 1.0,
+            },
+        )
+
+        if not response2_6:
+            self.logger.error(" ❌ O3-pro with codereview tool via OpenRouter failed")
+            return False
+
+        self.logger.info(" ✅ O3-pro with codereview tool via OpenRouter completed")
+
         # Test 3: Codereview with O3 via OpenRouter
         self.logger.info(" 3: Testing O3 with codereview tool via OpenRouter")
 
@@ -325,8 +415,8 @@ def multiply(x, y):
         self.logger.info(f" OpenRouter response logs: {len(openrouter_response_logs)}")
 
         # Success criteria for OpenRouter
-        openrouter_used = len(openrouter_api_logs) >= 3 or len(openrouter_model_logs) >= 3
-        all_calls_succeeded = response1 and response2 and response3
+        openrouter_used = len(openrouter_api_logs) >= 5 or len(openrouter_model_logs) >= 5
+        all_calls_succeeded = response1 and response2 and response2_5 and response2_6 and response3
 
         success_criteria = [
            ("All O3 model calls succeeded", all_calls_succeeded),
@@ -47,7 +47,7 @@ class TestAutoMode:
         from config import MODEL_CAPABILITIES_DESC
 
         # Check all expected models are present
-        expected_models = ["flash", "pro", "o3", "o3-mini"]
+        expected_models = ["flash", "pro", "o3", "o3-mini", "o3-pro", "o4-mini", "o4-mini-high"]
         for model in expected_models:
             assert model in MODEL_CAPABILITIES_DESC
             assert isinstance(MODEL_CAPABILITIES_DESC[model], str)
@@ -225,7 +225,10 @@ class TestAutoMode:
 
         schema = tool.get_model_field_schema()
         assert "enum" in schema
-        assert all(model in schema["enum"] for model in ["flash", "pro", "o3"])
+        assert all(
+            model in schema["enum"]
+            for model in ["flash", "pro", "o3", "o3-mini", "o3-pro", "o4-mini", "o4-mini-high"]
+        )
         assert "select the most suitable model" in schema["description"]
 
         # Test normal mode
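For reference, the assertions in that last hunk expect get_model_field_schema() to return an enum-style schema fragment shaped roughly like the dict below, inferred from the test rather than taken from the tool's verbatim output.

    # Shape the test asserts against; the exact description text is an assumption
    # beyond the "select the most suitable model" substring the test checks for.
    expected_shape = {
        "type": "string",
        "enum": ["flash", "pro", "o3", "o3-mini", "o3-pro", "o4-mini", "o4-mini-high"],
        "description": "In auto mode, select the most suitable model for the task",
    }

    assert all(m in expected_shape["enum"] for m in ["flash", "pro", "o3", "o3-pro"])
    assert "select the most suitable model" in expected_shape["description"]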