Fixes O3-Pro connection https://github.com/BeehiveInnovations/zen-mcp-server/issues/56

New tests for O3-pro Improved prompts for shorthand input
2025-06-16 20:00:08 +04:00
parent 5f69ad4049
commit 9b98df650b
8 changed files with 400 additions and 50 deletions
--- a/README.md
+++ b/README.md
@@ -592,10 +592,41 @@ For detailed tool parameters and configuration options, see the [Advanced Usage

 ### Prompt Support

- `/zen:thinkdeeper with o3 check if the algorithm in @sort.py is performant and if there are alternatives we could explore`
- `/zen:precommit use gemini pro and confirm these changes match our requirements in COOL_FEATURE.md`
- `/zen:testgen write me tests for class ABC`
- `/zen:refactor using local-llama propose a decomposition strategy, make a plan and save it in FIXES.md then share this with o3 to confirm along with large_file.swift`
+Zen supports powerful structured prompts in Claude Code for quick access to tools and models:
+
+#### Basic Tool Prompts
+- `/zen:thinkdeeper` - Use thinkdeep tool with auto-selected model
+- `/zen:chat` - Use chat tool with auto-selected model  
+- `/zen:codereview` - Use codereview tool with auto-selected model
+- `/zen:analyze` - Use analyze tool with auto-selected model
+
+#### Model-Specific Tool Prompts
+- `/zen:chat:o3 hello there` - Use chat tool specifically with O3 model
+- `/zen:thinkdeep:flash analyze this quickly` - Use thinkdeep tool with Flash for speed
+- `/zen:codereview:pro review for security` - Use codereview tool with Gemini Pro for thorough analysis
+- `/zen:debug:grok help with this error` - Use debug tool with GROK model
+- `/zen:analyze:gemini-2.5-flash-preview-05-20 examine these files` - Use analyze tool with specific Gemini model
+
+#### Continuation Prompts
+- `/zen:continue` - Continue previous conversation using chat tool
+- `/zen:chat:continue` - Continue previous conversation using chat tool specifically
+- `/zen:thinkdeep:continue` - Continue previous conversation using thinkdeep tool
+- `/zen:analyze:continue` - Continue previous conversation using analyze tool
+
+#### Advanced Examples
+- `/zen:thinkdeeper:o3 check if the algorithm in @sort.py is performant and if there are alternatives we could explore`
+- `/zen:precommit:pro confirm these changes match our requirements in COOL_FEATURE.md`
+- `/zen:testgen:flash write me tests for class ABC`
+- `/zen:refactor:local-llama propose a decomposition strategy, make a plan and save it in FIXES.md then share this with o3 to confirm along with large_file.swift`
+
+#### Syntax Format
+The structured prompt format is: `/zen:[tool]:[model / continue] [your_message]`
+
+- `[tool]` - Any available tool name (chat, thinkdeep, codereview, debug, analyze, etc.)
+- `[model / continue]` - Either a specific model name (o3, flash, pro, grok, etc.) or the keyword `continue` to continue the conversation using this tool
+- `[your_message]` - Your actual prompt or question
+
+**Note**: When using `:continue`, it intelligently resumes the previous conversation with the specified tool, maintaining full context and conversation history.

 ### Add Your Own Tools

--- a/config.py
+++ b/config.py
@@ -14,7 +14,7 @@ import os
 # These values are used in server responses and for tracking releases
 # IMPORTANT: This is the single source of truth for version and author info
 # Semantic versioning: MAJOR.MINOR.PATCH
-__version__ = "4.8.2"
+__version__ = "4.8.3"
 # Last update date in ISO format
 __updated__ = "2025-06-16"
 # Primary maintainer
--- a/providers/openai.py
+++ b/providers/openai.py
@@ -32,12 +32,14 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
            "supports_images": True,  # O3 models support vision
            "max_image_size_mb": 20.0,  # 20MB per OpenAI docs
        },
-        "o3-pro": {
+        "o3-pro-2025-06-10": {
            "context_window": 200_000,  # 200K tokens
            "supports_extended_thinking": False,
            "supports_images": True,  # O3 models support vision
            "max_image_size_mb": 20.0,  # 20MB per OpenAI docs
        },
+        # Aliases
+        "o3-pro": "o3-pro-2025-06-10",
        "o4-mini": {
            "context_window": 200_000,  # 200K tokens
            "supports_extended_thinking": False,
@@ -89,7 +91,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
        config = self.SUPPORTED_MODELS[resolved_name]

        # Define temperature constraints per model
-        if resolved_name in ["o3", "o3-mini", "o3-pro", "o4-mini", "o4-mini-high"]:
+        if resolved_name in ["o3", "o3-mini", "o3-pro", "o3-pro-2025-06-10", "o4-mini", "o4-mini-high"]:
            # O3 and O4 reasoning models only support temperature=1.0
            temp_constraint = FixedTemperatureConstraint(1.0)
        else:
--- a/providers/openai_compatible.py
+++ b/providers/openai_compatible.py
@@ -224,6 +224,138 @@ class OpenAICompatibleProvider(ModelProvider):

        return self._client

+    def _generate_with_responses_endpoint(
+        self,
+        model_name: str,
+        messages: list,
+        temperature: float,
+        max_output_tokens: Optional[int] = None,
+        **kwargs,
+    ) -> ModelResponse:
+        """Generate content using the /v1/responses endpoint for o3-pro via OpenAI library."""
+        # Convert messages to the correct format for responses endpoint
+        input_messages = []
+
+        for message in messages:
+            role = message.get("role", "")
+            content = message.get("content", "")
+
+            if role == "system":
+                # System messages can be treated as user messages for o3-pro
+                input_messages.append(
+                    {"role": "user", "content": [{"type": "input_text", "text": f"System: {content}"}]}
+                )
+            elif role == "user":
+                input_messages.append({"role": "user", "content": [{"type": "input_text", "text": content}]})
+            elif role == "assistant":
+                input_messages.append({"role": "assistant", "content": [{"type": "output_text", "text": content}]})
+
+        # Prepare completion parameters for responses endpoint
+        completion_params = {
+            "model": model_name,
+            "input": input_messages,
+            "text": {"format": {"type": "text"}},
+            "reasoning": {"effort": "medium", "summary": "auto"},
+            "tools": [],
+            "store": True,
+        }
+
+        # Temperature is not in the documented parameters for responses endpoint
+        # but we'll try to add it in case it's supported
+
+        # Add max tokens if specified
+        if max_output_tokens:
+            completion_params["max_tokens"] = max_output_tokens
+
+        # Add any additional OpenAI-specific parameters
+        for key, value in kwargs.items():
+            if key in ["top_p", "frequency_penalty", "presence_penalty", "seed", "stop"]:
+                completion_params[key] = value
+
+        # Retry logic with progressive delays
+        max_retries = 4
+        retry_delays = [1, 3, 5, 8]
+        last_exception = None
+
+        for attempt in range(max_retries):
+            try:
+                # Use OpenAI client's responses endpoint
+                response = self.client.responses.create(**completion_params)
+
+                # Extract content and usage from responses endpoint format
+                # The response format is different for responses endpoint
+                content = ""
+                if hasattr(response, "output") and response.output:
+                    if hasattr(response.output, "content") and response.output.content:
+                        # Look for output_text in content
+                        for content_item in response.output.content:
+                            if hasattr(content_item, "type") and content_item.type == "output_text":
+                                content = content_item.text
+                                break
+                    elif hasattr(response.output, "text"):
+                        content = response.output.text
+
+                # Try to extract usage information
+                usage = None
+                if hasattr(response, "usage"):
+                    usage = self._extract_usage(response)
+                elif hasattr(response, "input_tokens") and hasattr(response, "output_tokens"):
+                    usage = {
+                        "input_tokens": getattr(response, "input_tokens", 0),
+                        "output_tokens": getattr(response, "output_tokens", 0),
+                        "total_tokens": getattr(response, "input_tokens", 0) + getattr(response, "output_tokens", 0),
+                    }
+
+                return ModelResponse(
+                    content=content,
+                    usage=usage,
+                    model_name=model_name,
+                    friendly_name=self.FRIENDLY_NAME,
+                    provider=self.get_provider_type(),
+                    metadata={
+                        "model": getattr(response, "model", model_name),
+                        "id": getattr(response, "id", ""),
+                        "created": getattr(response, "created_at", 0),
+                        "endpoint": "responses",
+                    },
+                )
+
+            except Exception as e:
+                last_exception = e
+
+                # Check if this is a retryable error
+                error_str = str(e).lower()
+                is_retryable = any(
+                    term in error_str
+                    for term in [
+                        "timeout",
+                        "connection",
+                        "network",
+                        "temporary",
+                        "unavailable",
+                        "retry",
+                        "429",
+                        "500",
+                        "502",
+                        "503",
+                        "504",
+                    ]
+                )
+
+                if is_retryable and attempt < max_retries - 1:
+                    delay = retry_delays[attempt]
+                    logging.warning(
+                        f"Retryable error for o3-pro responses endpoint, attempt {attempt + 1}/{max_retries}: {str(e)}. Retrying in {delay}s..."
+                    )
+                    time.sleep(delay)
+                else:
+                    break
+
+        # If we get here, all retries failed
+        error_msg = f"o3-pro responses endpoint error after {max_retries} attempts: {str(last_exception)}"
+        logging.error(error_msg)
+        raise RuntimeError(error_msg) from last_exception
+
    def generate_content(
        self,
        prompt: str,
@@ -301,6 +433,22 @@ class OpenAICompatibleProvider(ModelProvider):
            if key in ["top_p", "frequency_penalty", "presence_penalty", "seed", "stop", "stream"]:
                completion_params[key] = value

+        # Check if this is o3-pro and needs the responses endpoint
+        resolved_model = model_name
+        if hasattr(self, "_resolve_model_name"):
+            resolved_model = self._resolve_model_name(model_name)
+
+        if resolved_model == "o3-pro-2025-06-10":
+            # This model requires the /v1/responses endpoint
+            # If it fails, we should not fall back to chat/completions
+            return self._generate_with_responses_endpoint(
+                model_name=resolved_model,
+                messages=messages,
+                temperature=temperature,
+                max_output_tokens=max_output_tokens,
+                **kwargs,
+            )
+
        # Retry logic with progressive delays
        max_retries = 4  # Total of 4 attempts
        retry_delays = [1, 3, 5, 8]  # Progressive delays: 1s, 3s, 5s, 8s
--- a/server.py
+++ b/server.py
@@ -925,6 +925,15 @@ async def handle_list_prompts() -> list[Prompt]:
                )
            )

+    # Add special "continue" prompt
+    prompts.append(
+        Prompt(
+            name="continue",
+            description="Continue the previous conversation using the chat tool",
+            arguments=[],
+        )
+    )
+
    logger.debug(f"Returning {len(prompts)} prompts to MCP client")
    return prompts

@@ -934,12 +943,16 @@ async def handle_get_prompt(name: str, arguments: dict[str, Any] = None) -> GetP
    """
    Get prompt details and generate the actual prompt text.

-    This handler is called when a user invokes a prompt (e.g., /zen:thinkdeeper).
+    This handler is called when a user invokes a prompt (e.g., /zen:thinkdeeper or /zen:chat:o3).
    It generates the appropriate text that Claude will then use to call the
    underlying tool.

+    Supports structured prompt names like "chat:o3" where:
+    - "chat" is the tool name
+    - "o3" is the model to use
+
    Args:
-        name: The name of the prompt to execute
+        name: The name of the prompt to execute (can include model like "chat:o3")
        arguments: Optional arguments for the prompt (e.g., model, thinking_mode)

    Returns:
@@ -950,39 +963,74 @@ async def handle_get_prompt(name: str, arguments: dict[str, Any] = None) -> GetP
    """
    logger.debug(f"MCP client requested prompt: {name} with args: {arguments}")

-    # Find the corresponding tool by checking prompt names
-    tool_name = None
-    template_info = None
+    # Parse structured prompt names like "chat:o3" or "chat:continue"
+    parsed_model = None
+    is_continuation = False
+    base_name = name

-    # Check if it's a known prompt name
-    for t_name, t_info in PROMPT_TEMPLATES.items():
-        if t_info["name"] == name:
-            tool_name = t_name
-            template_info = t_info
-            break
+    if ":" in name:
+        parts = name.split(":", 1)
+        base_name = parts[0]
+        second_part = parts[1]

-    # If not found, check if it's a direct tool name
-    if not tool_name and name in TOOLS:
-        tool_name = name
+        # Check if the second part is "continue" (special keyword)
+        if second_part.lower() == "continue":
+            is_continuation = True
+            logger.debug(f"Parsed continuation prompt: tool='{base_name}', continue=True")
+        else:
+            parsed_model = second_part
+            logger.debug(f"Parsed structured prompt: tool='{base_name}', model='{parsed_model}'")
+
+    # Handle special "continue" cases
+    if base_name.lower() == "continue":
+        # This is "/zen:continue" - use chat tool as default for continuation
+        tool_name = "chat"
+        is_continuation = True
        template_info = {
-            "name": name,
-            "description": f"Use {name} tool",
-            "template": f"Use {name}",
+            "name": "continue",
+            "description": "Continue the previous conversation",
+            "template": "Continue the conversation",
        }
+        logger.debug("Using /zen:continue - defaulting to chat tool with continuation")
+    else:
+        # Find the corresponding tool by checking prompt names
+        tool_name = None
+        template_info = None

-    if not tool_name:
-        logger.error(f"Unknown prompt requested: {name}")
-        raise ValueError(f"Unknown prompt: {name}")
+        # Check if it's a known prompt name (using base_name)
+        for t_name, t_info in PROMPT_TEMPLATES.items():
+            if t_info["name"] == base_name:
+                tool_name = t_name
+                template_info = t_info
+                break
+
+        # If not found, check if it's a direct tool name
+        if not tool_name and base_name in TOOLS:
+            tool_name = base_name
+            template_info = {
+                "name": base_name,
+                "description": f"Use {base_name} tool",
+                "template": f"Use {base_name}",
+            }
+
+        if not tool_name:
+            logger.error(f"Unknown prompt requested: {name} (base: {base_name})")
+            raise ValueError(f"Unknown prompt: {name}")

    # Get the template
    template = template_info.get("template", f"Use {tool_name}")

    # Safe template expansion with defaults
+    # Prioritize: parsed model > arguments model > "auto"
+    final_model = parsed_model or (arguments.get("model", "auto") if arguments else "auto")
+
    prompt_args = {
-        "model": arguments.get("model", "auto") if arguments else "auto",
+        "model": final_model,
        "thinking_mode": arguments.get("thinking_mode", "medium") if arguments else "medium",
    }

+    logger.debug(f"Using model '{final_model}' for prompt '{name}'")
+
    # Safely format the template
    try:
        prompt_text = template.format(**prompt_args)
@@ -990,6 +1038,21 @@ async def handle_get_prompt(name: str, arguments: dict[str, Any] = None) -> GetP
        logger.warning(f"Missing template argument {e} for prompt {name}, using raw template")
        prompt_text = template  # Fallback to raw template

+    # Generate tool call instruction based on the type of prompt
+    if is_continuation:
+        if base_name.lower() == "continue":
+            # "/zen:continue" case
+            tool_instruction = f"Continue the previous conversation using the {tool_name} tool"
+        else:
+            # "/zen:chat:continue" case
+            tool_instruction = f"Continue the previous conversation using the {tool_name} tool"
+    elif parsed_model:
+        # "/zen:chat:o3" case
+        tool_instruction = f"Use the {tool_name} tool with model '{parsed_model}'"
+    else:
+        # "/zen:chat" case
+        tool_instruction = prompt_text
+
    return GetPromptResult(
        prompt=Prompt(
            name=name,
@@ -999,7 +1062,7 @@ async def handle_get_prompt(name: str, arguments: dict[str, Any] = None) -> GetP
        messages=[
            PromptMessage(
                role="user",
-                content={"type": "text", "text": prompt_text},
+                content={"type": "text", "text": tool_instruction},
            )
        ],
    )
--- a/simulator_tests/test_o3_pro_expensive.py
+++ b/simulator_tests/test_o3_pro_expensive.py
@@ -8,7 +8,10 @@ This test is intentionally NOT added to TEST_REGISTRY to prevent accidental exec
 It can only be run manually using:
    python communication_simulator_test.py --individual o3_pro_expensive

-Tests that o3-pro model works with one simple chat call. That's it.
+Tests that o3-pro model:
+1. Uses the correct /v1/responses endpoint (not /v1/chat/completions)
+2. Successfully completes a chat call
+3. Returns properly formatted response
 """

 from .base_test import BaseSimulatorTest
@@ -26,13 +29,16 @@ class O3ProExpensiveTest(BaseSimulatorTest):
        return "⚠️ EXPENSIVE O3-Pro basic validation (manual only)"

    def run_test(self) -> bool:
-        """Test o3-pro model with one simple chat call - EXPENSIVE!"""
+        """Test o3-pro model with endpoint verification - EXPENSIVE!"""
        try:
            self.logger.warning("⚠️ ⚠️ ⚠️  EXPENSIVE TEST - O3-PRO COSTS ~$15-60 PER 1K TOKENS! ⚠️ ⚠️ ⚠️")
-            self.logger.info("Test: O3-Pro basic chat test")
+            self.logger.info("Test: O3-Pro endpoint and functionality test")
+
+            # First, verify we're hitting the right endpoint by checking logs
+            self.logger.info("Step 1: Testing o3-pro with chat tool")

            # One simple chat call
-            response, _ = self.call_mcp_tool(
+            response, tool_result = self.call_mcp_tool(
                "chat",
                {
                    "prompt": "What is 2 + 2?",
@@ -41,16 +47,44 @@ class O3ProExpensiveTest(BaseSimulatorTest):
                },
            )

-            if response:
-                self.logger.info("✅ O3-Pro chat call succeeded")
-                self.logger.warning("💰 Test completed - check your billing!")
-                return True
-            else:
-                self.logger.error("❌ O3-Pro chat call failed")
+            if not response:
+                self.logger.error("❌ O3-Pro chat call failed - no response")
+                if tool_result and "error" in tool_result:
+                    error_msg = tool_result["error"]
+                    self.logger.error(f"Error details: {error_msg}")
+                    # Check if it's the endpoint error we're trying to fix
+                    if "v1/responses" in str(error_msg) and "v1/chat/completions" in str(error_msg):
+                        self.logger.error(
+                            "❌ ENDPOINT BUG DETECTED: o3-pro is trying to use chat/completions instead of responses endpoint!"
+                        )
                return False

+            # Check the metadata to verify endpoint was used
+            if tool_result and isinstance(tool_result, dict):
+                metadata = tool_result.get("metadata", {})
+                endpoint_used = metadata.get("endpoint", "unknown")
+
+                if endpoint_used == "responses":
+                    self.logger.info("✅ Correct endpoint used: /v1/responses")
+                else:
+                    self.logger.warning(f"⚠️ Endpoint used: {endpoint_used} (expected: responses)")
+
+            # Verify the response content
+            if response and "4" in str(response):
+                self.logger.info("✅ O3-Pro response is mathematically correct")
+            else:
+                self.logger.warning(f"⚠️ Unexpected response: {response}")
+
+            self.logger.info("✅ O3-Pro test completed successfully")
+            self.logger.warning("💰 Test completed - check your billing!")
+            return True
+
        except Exception as e:
-            self.logger.error(f"O3-Pro test failed: {e}")
+            self.logger.error(f"O3-Pro test failed with exception: {e}")
+            # Log the full error for debugging endpoint issues
+            import traceback
+
+            self.logger.error(f"Full traceback: {traceback.format_exc()}")
            return False


--- a/tests/test_model_restrictions.py
+++ b/tests/test_model_restrictions.py
@@ -263,17 +263,18 @@ class TestProviderIntegration:
    @patch.dict(os.environ, {"GOOGLE_ALLOWED_MODELS": "flash"})
    def test_gemini_parameter_order_regression_protection(self):
        """Test that prevents regression of parameter order bug in is_allowed calls.
-        
-        This test specifically catches the bug where parameters were incorrectly 
-        passed as (provider, user_input, resolved_name) instead of 
+
+        This test specifically catches the bug where parameters were incorrectly
+        passed as (provider, user_input, resolved_name) instead of
        (provider, resolved_name, user_input).
-        
+
        The bug was subtle because the is_allowed method uses OR logic, so it
        worked in most cases by accident. This test creates a scenario where
        the parameter order matters.
        """
        # Clear any cached restriction service
        import utils.model_restrictions
+
        utils.model_restrictions._restriction_service = None

        provider = GeminiModelProvider(api_key="test-key")
@@ -295,13 +296,14 @@ class TestProviderIntegration:
    @patch.dict(os.environ, {"GOOGLE_ALLOWED_MODELS": "gemini-2.5-flash-preview-05-20"})
    def test_gemini_parameter_order_edge_case_full_name_only(self):
        """Test parameter order with only full name allowed, not alias.
-        
+
        This is the reverse scenario - only the full canonical name is allowed,
        not the shorthand alias. This tests that the parameter order is correct
        when resolving aliases.
        """
        # Clear any cached restriction service
        import utils.model_restrictions
+
        utils.model_restrictions._restriction_service = None

        provider = GeminiModelProvider(api_key="test-key")
--- a/tests/test_openai_provider.py
+++ b/tests/test_openai_provider.py
@@ -75,7 +75,7 @@ class TestOpenAIProvider:
        # Test full name passthrough
        assert provider._resolve_model_name("o3") == "o3"
        assert provider._resolve_model_name("o3-mini") == "o3-mini"
-        assert provider._resolve_model_name("o3-pro") == "o3-pro"
+        assert provider._resolve_model_name("o3-pro") == "o3-pro-2025-06-10"
        assert provider._resolve_model_name("o4-mini") == "o4-mini"
        assert provider._resolve_model_name("o4-mini-high") == "o4-mini-high"

@@ -196,7 +196,7 @@ class TestOpenAIProvider:
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = "Test response"
        mock_response.choices[0].finish_reason = "stop"
-        mock_response.model = "o3-pro"
+        mock_response.model = "o3-mini"
        mock_response.usage = MagicMock()
        mock_response.usage.prompt_tokens = 10
        mock_response.usage.completion_tokens = 5
@@ -205,10 +205,10 @@ class TestOpenAIProvider:

        provider = OpenAIModelProvider("test-key")

-        # Test full model name passes through unchanged
-        provider.generate_content(prompt="Test", model_name="o3-pro", temperature=1.0)
+        # Test full model name passes through unchanged (use o3-mini since o3-pro has special handling)
+        provider.generate_content(prompt="Test", model_name="o3-mini", temperature=1.0)
        call_kwargs = mock_client.chat.completions.create.call_args[1]
-        assert call_kwargs["model"] == "o3-pro"  # Should be unchanged
+        assert call_kwargs["model"] == "o3-mini"  # Should be unchanged

    def test_supports_thinking_mode(self):
        """Test thinking mode support (currently False for all OpenAI models)."""
@@ -219,3 +219,73 @@ class TestOpenAIProvider:
        assert provider.supports_thinking_mode("o3-mini") is False
        assert provider.supports_thinking_mode("o4-mini") is False
        assert provider.supports_thinking_mode("mini") is False  # Test with alias too
+
+    @patch("providers.openai_compatible.OpenAI")
+    def test_o3_pro_routes_to_responses_endpoint(self, mock_openai_class):
+        """Test that o3-pro model routes to the /v1/responses endpoint (mock test)."""
+        # Set up mock for OpenAI client responses endpoint
+        mock_client = MagicMock()
+        mock_openai_class.return_value = mock_client
+
+        mock_response = MagicMock()
+        mock_response.output = MagicMock()
+        mock_response.output.content = [MagicMock()]
+        mock_response.output.content[0].type = "output_text"
+        mock_response.output.content[0].text = "4"
+        mock_response.model = "o3-pro-2025-06-10"
+        mock_response.id = "test-id"
+        mock_response.created_at = 1234567890
+        mock_response.usage = MagicMock()
+        mock_response.usage.prompt_tokens = 10
+        mock_response.usage.completion_tokens = 5
+        mock_response.usage.total_tokens = 15
+
+        mock_client.responses.create.return_value = mock_response
+
+        provider = OpenAIModelProvider("test-key")
+
+        # Generate content with o3-pro
+        result = provider.generate_content(prompt="What is 2 + 2?", model_name="o3-pro", temperature=1.0)
+
+        # Verify responses.create was called
+        mock_client.responses.create.assert_called_once()
+        call_args = mock_client.responses.create.call_args[1]
+        assert call_args["model"] == "o3-pro-2025-06-10"
+        assert call_args["input"][0]["role"] == "user"
+        assert "What is 2 + 2?" in call_args["input"][0]["content"][0]["text"]
+
+        # Verify the response
+        assert result.content == "4"
+        assert result.model_name == "o3-pro-2025-06-10"
+        assert result.metadata["endpoint"] == "responses"
+
+    @patch("providers.openai_compatible.OpenAI")
+    def test_non_o3_pro_uses_chat_completions(self, mock_openai_class):
+        """Test that non-o3-pro models use the standard chat completions endpoint."""
+        # Set up mock
+        mock_client = MagicMock()
+        mock_openai_class.return_value = mock_client
+        mock_response = MagicMock()
+        mock_response.choices = [MagicMock()]
+        mock_response.choices[0].message.content = "Test response"
+        mock_response.choices[0].finish_reason = "stop"
+        mock_response.model = "o3-mini"
+        mock_response.id = "test-id"
+        mock_response.created = 1234567890
+        mock_response.usage = MagicMock()
+        mock_response.usage.prompt_tokens = 10
+        mock_response.usage.completion_tokens = 5
+        mock_response.usage.total_tokens = 15
+        mock_client.chat.completions.create.return_value = mock_response
+
+        provider = OpenAIModelProvider("test-key")
+
+        # Generate content with o3-mini (not o3-pro)
+        result = provider.generate_content(prompt="Test prompt", model_name="o3-mini", temperature=1.0)
+
+        # Verify chat.completions.create was called
+        mock_client.chat.completions.create.assert_called_once()
+
+        # Verify the response
+        assert result.content == "Test response"
+        assert result.model_name == "o3-mini"