New tests for O3-pro
Improved prompts for shorthand input
Fahad
2025-06-16 20:00:08 +04:00
parent 5f69ad4049
commit 9b98df650b
8 changed files with 400 additions and 50 deletions

View File

@@ -592,10 +592,41 @@ For detailed tool parameters and configuration options, see the [Advanced Usage
### Prompt Support

Zen supports powerful structured prompts in Claude Code for quick access to tools and models:

#### Basic Tool Prompts
- `/zen:thinkdeeper` - Use thinkdeep tool with auto-selected model
- `/zen:chat` - Use chat tool with auto-selected model
- `/zen:codereview` - Use codereview tool with auto-selected model
- `/zen:analyze` - Use analyze tool with auto-selected model

#### Model-Specific Tool Prompts
- `/zen:chat:o3 hello there` - Use chat tool specifically with the O3 model
- `/zen:thinkdeep:flash analyze this quickly` - Use thinkdeep tool with Flash for speed
- `/zen:codereview:pro review for security` - Use codereview tool with Gemini Pro for thorough analysis
- `/zen:debug:grok help with this error` - Use debug tool with the GROK model
- `/zen:analyze:gemini-2.5-flash-preview-05-20 examine these files` - Use analyze tool with a specific Gemini model

#### Continuation Prompts
- `/zen:continue` - Continue the previous conversation using the chat tool
- `/zen:chat:continue` - Continue the previous conversation using the chat tool specifically
- `/zen:thinkdeep:continue` - Continue the previous conversation using the thinkdeep tool
- `/zen:analyze:continue` - Continue the previous conversation using the analyze tool

#### Advanced Examples
- `/zen:thinkdeeper:o3 check if the algorithm in @sort.py is performant and if there are alternatives we could explore`
- `/zen:precommit:pro confirm these changes match our requirements in COOL_FEATURE.md`
- `/zen:testgen:flash write me tests for class ABC`
- `/zen:refactor:local-llama propose a decomposition strategy, make a plan and save it in FIXES.md then share this with o3 to confirm along with large_file.swift`

#### Syntax Format
The structured prompt format is: `/zen:[tool]:[model / continue] [your_message]`
- `[tool]` - Any available tool name (chat, thinkdeep, codereview, debug, analyze, etc.)
- `[model / continue]` - Either a specific model name (o3, flash, pro, grok, etc.) or the keyword `continue` to continue the conversation using this tool
- `[your_message]` - Your actual prompt or question

**Note**: When using `:continue`, it intelligently resumes the previous conversation with the specified tool, maintaining full context and conversation history.
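For illustration, here is a minimal sketch of how a structured prompt name such as `chat:o3` or `chat:continue` can be split into its parts; the helper name is hypothetical and simply mirrors the parsing rules described above:

```python
# Hypothetical helper, mirroring the syntax rules described above.
def parse_prompt_name(name):
    """Return (tool, model, is_continuation) for a structured prompt name."""
    parsed_model = None
    is_continuation = False
    base_name = name
    if ":" in name:
        base_name, second_part = name.split(":", 1)
        if second_part.lower() == "continue":
            is_continuation = True        # e.g. "chat:continue"
        else:
            parsed_model = second_part    # e.g. "chat:o3" -> model "o3"
    return base_name, parsed_model, is_continuation

# parse_prompt_name("chat:o3")            -> ("chat", "o3", False)
# parse_prompt_name("thinkdeep:continue") -> ("thinkdeep", None, True)
# parse_prompt_name("codereview")         -> ("codereview", None, False)
```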
### Add Your Own Tools

View File

@@ -14,7 +14,7 @@ import os
# These values are used in server responses and for tracking releases
# IMPORTANT: This is the single source of truth for version and author info
# Semantic versioning: MAJOR.MINOR.PATCH
__version__ = "4.8.3"
# Last update date in ISO format
__updated__ = "2025-06-16"
# Primary maintainer

View File

@@ -32,12 +32,14 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
"supports_images": True, # O3 models support vision "supports_images": True, # O3 models support vision
"max_image_size_mb": 20.0, # 20MB per OpenAI docs "max_image_size_mb": 20.0, # 20MB per OpenAI docs
}, },
"o3-pro": { "o3-pro-2025-06-10": {
"context_window": 200_000, # 200K tokens "context_window": 200_000, # 200K tokens
"supports_extended_thinking": False, "supports_extended_thinking": False,
"supports_images": True, # O3 models support vision "supports_images": True, # O3 models support vision
"max_image_size_mb": 20.0, # 20MB per OpenAI docs "max_image_size_mb": 20.0, # 20MB per OpenAI docs
}, },
# Aliases
"o3-pro": "o3-pro-2025-06-10",
"o4-mini": { "o4-mini": {
"context_window": 200_000, # 200K tokens "context_window": 200_000, # 200K tokens
"supports_extended_thinking": False, "supports_extended_thinking": False,
@@ -89,7 +91,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
        config = self.SUPPORTED_MODELS[resolved_name]

        # Define temperature constraints per model
        if resolved_name in ["o3", "o3-mini", "o3-pro", "o3-pro-2025-06-10", "o4-mini", "o4-mini-high"]:
            # O3 and O4 reasoning models only support temperature=1.0
            temp_constraint = FixedTemperatureConstraint(1.0)
        else:
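To make the alias mechanism concrete, here is a minimal sketch of alias resolution, assuming a `SUPPORTED_MODELS` dict in which string values point at the canonical dated entry (illustrative only, not the provider's exact code):

```python
# Illustrative sketch: string values in SUPPORTED_MODELS act as aliases
# that point at the canonical (dated) model entry.
SUPPORTED_MODELS = {
    "o3-pro-2025-06-10": {"context_window": 200_000, "supports_extended_thinking": False},
    "o3-pro": "o3-pro-2025-06-10",  # alias -> canonical name
}

def resolve_model_name(name: str) -> str:
    entry = SUPPORTED_MODELS.get(name)
    if isinstance(entry, str):  # alias: follow the pointer
        return entry
    return name  # canonical (or unknown) names pass through unchanged

assert resolve_model_name("o3-pro") == "o3-pro-2025-06-10"
assert resolve_model_name("o3-pro-2025-06-10") == "o3-pro-2025-06-10"
```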

View File

@@ -224,6 +224,138 @@ class OpenAICompatibleProvider(ModelProvider):
        return self._client

    def _generate_with_responses_endpoint(
        self,
        model_name: str,
        messages: list,
        temperature: float,
        max_output_tokens: Optional[int] = None,
        **kwargs,
    ) -> ModelResponse:
        """Generate content using the /v1/responses endpoint for o3-pro via OpenAI library."""
        # Convert messages to the correct format for the responses endpoint
        input_messages = []
        for message in messages:
            role = message.get("role", "")
            content = message.get("content", "")

            if role == "system":
                # System messages can be treated as user messages for o3-pro
                input_messages.append(
                    {"role": "user", "content": [{"type": "input_text", "text": f"System: {content}"}]}
                )
            elif role == "user":
                input_messages.append({"role": "user", "content": [{"type": "input_text", "text": content}]})
            elif role == "assistant":
                input_messages.append({"role": "assistant", "content": [{"type": "output_text", "text": content}]})

        # Prepare completion parameters for the responses endpoint
        completion_params = {
            "model": model_name,
            "input": input_messages,
            "text": {"format": {"type": "text"}},
            "reasoning": {"effort": "medium", "summary": "auto"},
            "tools": [],
            "store": True,
        }

        # Temperature is not in the documented parameters for the responses endpoint,
        # so it is not forwarded here.

        # Add max tokens if specified
        if max_output_tokens:
            completion_params["max_tokens"] = max_output_tokens

        # Add any additional OpenAI-specific parameters
        for key, value in kwargs.items():
            if key in ["top_p", "frequency_penalty", "presence_penalty", "seed", "stop"]:
                completion_params[key] = value

        # Retry logic with progressive delays
        max_retries = 4
        retry_delays = [1, 3, 5, 8]
        last_exception = None

        for attempt in range(max_retries):
            try:
                # Use the OpenAI client's responses endpoint
                response = self.client.responses.create(**completion_params)

                # Extract content and usage from the responses endpoint format,
                # which differs from chat/completions
                content = ""
                if hasattr(response, "output") and response.output:
                    if hasattr(response.output, "content") and response.output.content:
                        # Look for output_text in content
                        for content_item in response.output.content:
                            if hasattr(content_item, "type") and content_item.type == "output_text":
                                content = content_item.text
                                break
                    elif hasattr(response.output, "text"):
                        content = response.output.text

                # Try to extract usage information
                usage = None
                if hasattr(response, "usage"):
                    usage = self._extract_usage(response)
                elif hasattr(response, "input_tokens") and hasattr(response, "output_tokens"):
                    usage = {
                        "input_tokens": getattr(response, "input_tokens", 0),
                        "output_tokens": getattr(response, "output_tokens", 0),
                        "total_tokens": getattr(response, "input_tokens", 0) + getattr(response, "output_tokens", 0),
                    }

                return ModelResponse(
                    content=content,
                    usage=usage,
                    model_name=model_name,
                    friendly_name=self.FRIENDLY_NAME,
                    provider=self.get_provider_type(),
                    metadata={
                        "model": getattr(response, "model", model_name),
                        "id": getattr(response, "id", ""),
                        "created": getattr(response, "created_at", 0),
                        "endpoint": "responses",
                    },
                )

            except Exception as e:
                last_exception = e

                # Check if this is a retryable error
                error_str = str(e).lower()
                is_retryable = any(
                    term in error_str
                    for term in [
                        "timeout",
                        "connection",
                        "network",
                        "temporary",
                        "unavailable",
                        "retry",
                        "429",
                        "500",
                        "502",
                        "503",
                        "504",
                    ]
                )

                if is_retryable and attempt < max_retries - 1:
                    delay = retry_delays[attempt]
                    logging.warning(
                        f"Retryable error for o3-pro responses endpoint, attempt {attempt + 1}/{max_retries}: {str(e)}. Retrying in {delay}s..."
                    )
                    time.sleep(delay)
                else:
                    break

        # If we get here, all retries failed
        error_msg = f"o3-pro responses endpoint error after {max_retries} attempts: {str(last_exception)}"
        logging.error(error_msg)
        raise RuntimeError(error_msg) from last_exception
    def generate_content(
        self,
        prompt: str,
@@ -301,6 +433,22 @@ class OpenAICompatibleProvider(ModelProvider):
if key in ["top_p", "frequency_penalty", "presence_penalty", "seed", "stop", "stream"]: if key in ["top_p", "frequency_penalty", "presence_penalty", "seed", "stop", "stream"]:
completion_params[key] = value completion_params[key] = value
# Check if this is o3-pro and needs the responses endpoint
resolved_model = model_name
if hasattr(self, "_resolve_model_name"):
resolved_model = self._resolve_model_name(model_name)
if resolved_model == "o3-pro-2025-06-10":
# This model requires the /v1/responses endpoint
# If it fails, we should not fall back to chat/completions
return self._generate_with_responses_endpoint(
model_name=resolved_model,
messages=messages,
temperature=temperature,
max_output_tokens=max_output_tokens,
**kwargs,
)
# Retry logic with progressive delays # Retry logic with progressive delays
max_retries = 4 # Total of 4 attempts max_retries = 4 # Total of 4 attempts
retry_delays = [1, 3, 5, 8] # Progressive delays: 1s, 3s, 5s, 8s retry_delays = [1, 3, 5, 8] # Progressive delays: 1s, 3s, 5s, 8s
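As a usage sketch (assuming an `OpenAIModelProvider` constructed with a valid API key and the import path shown), the routing above is transparent to callers; only the provider's internal endpoint choice changes:

```python
# Hypothetical usage sketch; the import path and API key are assumptions.
from providers.openai import OpenAIModelProvider

provider = OpenAIModelProvider("sk-...")  # a real OpenAI API key in practice

# "o3-pro" resolves to "o3-pro-2025-06-10" and is routed through
# client.responses.create(...) instead of chat/completions.
reply = provider.generate_content(prompt="What is 2 + 2?", model_name="o3-pro", temperature=1.0)
print(reply.metadata.get("endpoint"))  # expected: "responses"

# Any other model continues to use client.chat.completions.create(...).
reply = provider.generate_content(prompt="Say hello", model_name="o3-mini", temperature=1.0)
```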

server.py
View File

@@ -925,6 +925,15 @@ async def handle_list_prompts() -> list[Prompt]:
            )
        )

    # Add special "continue" prompt
    prompts.append(
        Prompt(
            name="continue",
            description="Continue the previous conversation using the chat tool",
            arguments=[],
        )
    )

    logger.debug(f"Returning {len(prompts)} prompts to MCP client")
    return prompts
@@ -934,12 +943,16 @@ async def handle_get_prompt(name: str, arguments: dict[str, Any] = None) -> GetP
""" """
Get prompt details and generate the actual prompt text. Get prompt details and generate the actual prompt text.
This handler is called when a user invokes a prompt (e.g., /zen:thinkdeeper). This handler is called when a user invokes a prompt (e.g., /zen:thinkdeeper or /zen:chat:o3).
It generates the appropriate text that Claude will then use to call the It generates the appropriate text that Claude will then use to call the
underlying tool. underlying tool.
Supports structured prompt names like "chat:o3" where:
- "chat" is the tool name
- "o3" is the model to use
Args: Args:
name: The name of the prompt to execute name: The name of the prompt to execute (can include model like "chat:o3")
arguments: Optional arguments for the prompt (e.g., model, thinking_mode) arguments: Optional arguments for the prompt (e.g., model, thinking_mode)
Returns: Returns:
@@ -950,39 +963,74 @@ async def handle_get_prompt(name: str, arguments: dict[str, Any] = None) -> GetP
""" """
logger.debug(f"MCP client requested prompt: {name} with args: {arguments}") logger.debug(f"MCP client requested prompt: {name} with args: {arguments}")
# Find the corresponding tool by checking prompt names # Parse structured prompt names like "chat:o3" or "chat:continue"
tool_name = None parsed_model = None
template_info = None is_continuation = False
base_name = name
# Check if it's a known prompt name if ":" in name:
for t_name, t_info in PROMPT_TEMPLATES.items(): parts = name.split(":", 1)
if t_info["name"] == name: base_name = parts[0]
tool_name = t_name second_part = parts[1]
template_info = t_info
break
# If not found, check if it's a direct tool name # Check if the second part is "continue" (special keyword)
if not tool_name and name in TOOLS: if second_part.lower() == "continue":
tool_name = name is_continuation = True
logger.debug(f"Parsed continuation prompt: tool='{base_name}', continue=True")
else:
parsed_model = second_part
logger.debug(f"Parsed structured prompt: tool='{base_name}', model='{parsed_model}'")
# Handle special "continue" cases
if base_name.lower() == "continue":
# This is "/zen:continue" - use chat tool as default for continuation
tool_name = "chat"
is_continuation = True
template_info = { template_info = {
"name": name, "name": "continue",
"description": f"Use {name} tool", "description": "Continue the previous conversation",
"template": f"Use {name}", "template": "Continue the conversation",
} }
logger.debug("Using /zen:continue - defaulting to chat tool with continuation")
else:
# Find the corresponding tool by checking prompt names
tool_name = None
template_info = None
if not tool_name: # Check if it's a known prompt name (using base_name)
logger.error(f"Unknown prompt requested: {name}") for t_name, t_info in PROMPT_TEMPLATES.items():
raise ValueError(f"Unknown prompt: {name}") if t_info["name"] == base_name:
tool_name = t_name
template_info = t_info
break
# If not found, check if it's a direct tool name
if not tool_name and base_name in TOOLS:
tool_name = base_name
template_info = {
"name": base_name,
"description": f"Use {base_name} tool",
"template": f"Use {base_name}",
}
if not tool_name:
logger.error(f"Unknown prompt requested: {name} (base: {base_name})")
raise ValueError(f"Unknown prompt: {name}")
# Get the template # Get the template
template = template_info.get("template", f"Use {tool_name}") template = template_info.get("template", f"Use {tool_name}")
# Safe template expansion with defaults # Safe template expansion with defaults
# Prioritize: parsed model > arguments model > "auto"
final_model = parsed_model or (arguments.get("model", "auto") if arguments else "auto")
prompt_args = { prompt_args = {
"model": arguments.get("model", "auto") if arguments else "auto", "model": final_model,
"thinking_mode": arguments.get("thinking_mode", "medium") if arguments else "medium", "thinking_mode": arguments.get("thinking_mode", "medium") if arguments else "medium",
} }
logger.debug(f"Using model '{final_model}' for prompt '{name}'")
# Safely format the template # Safely format the template
try: try:
prompt_text = template.format(**prompt_args) prompt_text = template.format(**prompt_args)
@@ -990,6 +1038,21 @@ async def handle_get_prompt(name: str, arguments: dict[str, Any] = None) -> GetP
logger.warning(f"Missing template argument {e} for prompt {name}, using raw template") logger.warning(f"Missing template argument {e} for prompt {name}, using raw template")
prompt_text = template # Fallback to raw template prompt_text = template # Fallback to raw template
# Generate tool call instruction based on the type of prompt
if is_continuation:
if base_name.lower() == "continue":
# "/zen:continue" case
tool_instruction = f"Continue the previous conversation using the {tool_name} tool"
else:
# "/zen:chat:continue" case
tool_instruction = f"Continue the previous conversation using the {tool_name} tool"
elif parsed_model:
# "/zen:chat:o3" case
tool_instruction = f"Use the {tool_name} tool with model '{parsed_model}'"
else:
# "/zen:chat" case
tool_instruction = prompt_text
return GetPromptResult( return GetPromptResult(
prompt=Prompt( prompt=Prompt(
name=name, name=name,
@@ -999,7 +1062,7 @@ async def handle_get_prompt(name: str, arguments: dict[str, Any] = None) -> GetP
        messages=[
            PromptMessage(
                role="user",
                content={"type": "text", "text": tool_instruction},
            )
        ],
    )
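Concretely, a prompt name of `chat:o3` produces the instruction "Use the chat tool with model 'o3'", while `chat:continue` and the bare `continue` prompt both produce "Continue the previous conversation using the chat tool"; a plain `chat` prompt falls back to the expanded template text.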

View File

@@ -8,7 +8,10 @@ This test is intentionally NOT added to TEST_REGISTRY to prevent accidental exec
It can only be run manually using:
    python communication_simulator_test.py --individual o3_pro_expensive

Tests that o3-pro model:
1. Uses the correct /v1/responses endpoint (not /v1/chat/completions)
2. Successfully completes a chat call
3. Returns properly formatted response
"""
from .base_test import BaseSimulatorTest from .base_test import BaseSimulatorTest
@@ -26,13 +29,16 @@ class O3ProExpensiveTest(BaseSimulatorTest):
return "⚠️ EXPENSIVE O3-Pro basic validation (manual only)" return "⚠️ EXPENSIVE O3-Pro basic validation (manual only)"
def run_test(self) -> bool: def run_test(self) -> bool:
"""Test o3-pro model with one simple chat call - EXPENSIVE!""" """Test o3-pro model with endpoint verification - EXPENSIVE!"""
try: try:
self.logger.warning("⚠️ ⚠️ ⚠️ EXPENSIVE TEST - O3-PRO COSTS ~$15-60 PER 1K TOKENS! ⚠️ ⚠️ ⚠️") self.logger.warning("⚠️ ⚠️ ⚠️ EXPENSIVE TEST - O3-PRO COSTS ~$15-60 PER 1K TOKENS! ⚠️ ⚠️ ⚠️")
self.logger.info("Test: O3-Pro basic chat test") self.logger.info("Test: O3-Pro endpoint and functionality test")
# First, verify we're hitting the right endpoint by checking logs
self.logger.info("Step 1: Testing o3-pro with chat tool")
# One simple chat call # One simple chat call
response, _ = self.call_mcp_tool( response, tool_result = self.call_mcp_tool(
"chat", "chat",
{ {
"prompt": "What is 2 + 2?", "prompt": "What is 2 + 2?",
@@ -41,16 +47,44 @@ class O3ProExpensiveTest(BaseSimulatorTest):
                },
            )

            if not response:
                self.logger.error("❌ O3-Pro chat call failed - no response")
                if tool_result and "error" in tool_result:
                    error_msg = tool_result["error"]
                    self.logger.error(f"Error details: {error_msg}")
                    # Check if it's the endpoint error we're trying to fix
                    if "v1/responses" in str(error_msg) and "v1/chat/completions" in str(error_msg):
                        self.logger.error(
                            "❌ ENDPOINT BUG DETECTED: o3-pro is trying to use chat/completions instead of responses endpoint!"
                        )
                return False

            # Check the metadata to verify endpoint was used
            if tool_result and isinstance(tool_result, dict):
                metadata = tool_result.get("metadata", {})
                endpoint_used = metadata.get("endpoint", "unknown")
                if endpoint_used == "responses":
                    self.logger.info("✅ Correct endpoint used: /v1/responses")
                else:
                    self.logger.warning(f"⚠️ Endpoint used: {endpoint_used} (expected: responses)")

            # Verify the response content
            if response and "4" in str(response):
                self.logger.info("✅ O3-Pro response is mathematically correct")
            else:
                self.logger.warning(f"⚠️ Unexpected response: {response}")

            self.logger.info("✅ O3-Pro test completed successfully")
            self.logger.warning("💰 Test completed - check your billing!")
            return True

        except Exception as e:
            self.logger.error(f"O3-Pro test failed with exception: {e}")
            # Log the full error for debugging endpoint issues
            import traceback

            self.logger.error(f"Full traceback: {traceback.format_exc()}")
            return False

View File

@@ -263,17 +263,18 @@ class TestProviderIntegration:
    @patch.dict(os.environ, {"GOOGLE_ALLOWED_MODELS": "flash"})
    def test_gemini_parameter_order_regression_protection(self):
        """Test that prevents regression of parameter order bug in is_allowed calls.

        This test specifically catches the bug where parameters were incorrectly
        passed as (provider, user_input, resolved_name) instead of
        (provider, resolved_name, user_input).

        The bug was subtle because the is_allowed method uses OR logic, so it
        worked in most cases by accident. This test creates a scenario where
        the parameter order matters.
        """
        # Clear any cached restriction service
        import utils.model_restrictions

        utils.model_restrictions._restriction_service = None

        provider = GeminiModelProvider(api_key="test-key")
@@ -295,13 +296,14 @@ class TestProviderIntegration:
    @patch.dict(os.environ, {"GOOGLE_ALLOWED_MODELS": "gemini-2.5-flash-preview-05-20"})
    def test_gemini_parameter_order_edge_case_full_name_only(self):
        """Test parameter order with only full name allowed, not alias.

        This is the reverse scenario - only the full canonical name is allowed,
        not the shorthand alias. This tests that the parameter order is correct
        when resolving aliases.
        """
        # Clear any cached restriction service
        import utils.model_restrictions

        utils.model_restrictions._restriction_service = None

        provider = GeminiModelProvider(api_key="test-key")

View File

@@ -75,7 +75,7 @@ class TestOpenAIProvider:
        # Test full name passthrough
        assert provider._resolve_model_name("o3") == "o3"
        assert provider._resolve_model_name("o3-mini") == "o3-mini"
        assert provider._resolve_model_name("o3-pro") == "o3-pro-2025-06-10"
        assert provider._resolve_model_name("o4-mini") == "o4-mini"
        assert provider._resolve_model_name("o4-mini-high") == "o4-mini-high"
@@ -196,7 +196,7 @@ class TestOpenAIProvider:
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = "Test response"
        mock_response.choices[0].finish_reason = "stop"
        mock_response.model = "o3-mini"
        mock_response.usage = MagicMock()
        mock_response.usage.prompt_tokens = 10
        mock_response.usage.completion_tokens = 5
@@ -205,10 +205,10 @@ class TestOpenAIProvider:
provider = OpenAIModelProvider("test-key") provider = OpenAIModelProvider("test-key")
# Test full model name passes through unchanged # Test full model name passes through unchanged (use o3-mini since o3-pro has special handling)
provider.generate_content(prompt="Test", model_name="o3-pro", temperature=1.0) provider.generate_content(prompt="Test", model_name="o3-mini", temperature=1.0)
call_kwargs = mock_client.chat.completions.create.call_args[1] call_kwargs = mock_client.chat.completions.create.call_args[1]
assert call_kwargs["model"] == "o3-pro" # Should be unchanged assert call_kwargs["model"] == "o3-mini" # Should be unchanged
def test_supports_thinking_mode(self): def test_supports_thinking_mode(self):
"""Test thinking mode support (currently False for all OpenAI models).""" """Test thinking mode support (currently False for all OpenAI models)."""
@@ -219,3 +219,73 @@ class TestOpenAIProvider:
assert provider.supports_thinking_mode("o3-mini") is False assert provider.supports_thinking_mode("o3-mini") is False
assert provider.supports_thinking_mode("o4-mini") is False assert provider.supports_thinking_mode("o4-mini") is False
assert provider.supports_thinking_mode("mini") is False # Test with alias too assert provider.supports_thinking_mode("mini") is False # Test with alias too
@patch("providers.openai_compatible.OpenAI")
def test_o3_pro_routes_to_responses_endpoint(self, mock_openai_class):
"""Test that o3-pro model routes to the /v1/responses endpoint (mock test)."""
# Set up mock for OpenAI client responses endpoint
mock_client = MagicMock()
mock_openai_class.return_value = mock_client
mock_response = MagicMock()
mock_response.output = MagicMock()
mock_response.output.content = [MagicMock()]
mock_response.output.content[0].type = "output_text"
mock_response.output.content[0].text = "4"
mock_response.model = "o3-pro-2025-06-10"
mock_response.id = "test-id"
mock_response.created_at = 1234567890
mock_response.usage = MagicMock()
mock_response.usage.prompt_tokens = 10
mock_response.usage.completion_tokens = 5
mock_response.usage.total_tokens = 15
mock_client.responses.create.return_value = mock_response
provider = OpenAIModelProvider("test-key")
# Generate content with o3-pro
result = provider.generate_content(prompt="What is 2 + 2?", model_name="o3-pro", temperature=1.0)
# Verify responses.create was called
mock_client.responses.create.assert_called_once()
call_args = mock_client.responses.create.call_args[1]
assert call_args["model"] == "o3-pro-2025-06-10"
assert call_args["input"][0]["role"] == "user"
assert "What is 2 + 2?" in call_args["input"][0]["content"][0]["text"]
# Verify the response
assert result.content == "4"
assert result.model_name == "o3-pro-2025-06-10"
assert result.metadata["endpoint"] == "responses"
@patch("providers.openai_compatible.OpenAI")
def test_non_o3_pro_uses_chat_completions(self, mock_openai_class):
"""Test that non-o3-pro models use the standard chat completions endpoint."""
# Set up mock
mock_client = MagicMock()
mock_openai_class.return_value = mock_client
mock_response = MagicMock()
mock_response.choices = [MagicMock()]
mock_response.choices[0].message.content = "Test response"
mock_response.choices[0].finish_reason = "stop"
mock_response.model = "o3-mini"
mock_response.id = "test-id"
mock_response.created = 1234567890
mock_response.usage = MagicMock()
mock_response.usage.prompt_tokens = 10
mock_response.usage.completion_tokens = 5
mock_response.usage.total_tokens = 15
mock_client.chat.completions.create.return_value = mock_response
provider = OpenAIModelProvider("test-key")
# Generate content with o3-mini (not o3-pro)
result = provider.generate_content(prompt="Test prompt", model_name="o3-mini", temperature=1.0)
# Verify chat.completions.create was called
mock_client.chat.completions.create.assert_called_once()
# Verify the response
assert result.content == "Test response"
assert result.model_name == "o3-mini"