Fixes O3-Pro connection (https://github.com/BeehiveInnovations/zen-mcp-server/issues/56)

New tests for O3-Pro. Improved prompts for shorthand input.
2025-06-16 20:00:08 +04:00
parent 5f69ad4049
commit 9b98df650b
8 changed files with 400 additions and 50 deletions
--- a/simulator_tests/test_o3_pro_expensive.py
+++ b/simulator_tests/test_o3_pro_expensive.py
@@ -8,7 +8,10 @@ This test is intentionally NOT added to TEST_REGISTRY to prevent accidental exec
 It can only be run manually using:
    python communication_simulator_test.py --individual o3_pro_expensive

-Tests that o3-pro model works with one simple chat call. That's it.
+Tests that o3-pro model:
+1. Uses the correct /v1/responses endpoint (not /v1/chat/completions)
+2. Successfully completes a chat call
+3. Returns properly formatted response
 """

 from .base_test import BaseSimulatorTest
@@ -26,13 +29,16 @@ class O3ProExpensiveTest(BaseSimulatorTest):
        return "⚠️ EXPENSIVE O3-Pro basic validation (manual only)"

    def run_test(self) -> bool:
-        """Test o3-pro model with one simple chat call - EXPENSIVE!"""
+        """Test o3-pro model with endpoint verification - EXPENSIVE!"""
        try:
            self.logger.warning("⚠️ ⚠️ ⚠️  EXPENSIVE TEST - O3-PRO COSTS ~$15-60 PER 1K TOKENS! ⚠️ ⚠️ ⚠️")
-            self.logger.info("Test: O3-Pro basic chat test")
+            self.logger.info("Test: O3-Pro endpoint and functionality test")
+
+            # First, verify we're hitting the right endpoint by checking logs
+            self.logger.info("Step 1: Testing o3-pro with chat tool")

            # One simple chat call
-            response, _ = self.call_mcp_tool(
+            response, tool_result = self.call_mcp_tool(
                "chat",
                {
                    "prompt": "What is 2 + 2?",
@@ -41,16 +47,44 @@ class O3ProExpensiveTest(BaseSimulatorTest):
                },
            )

-            if response:
-                self.logger.info("✅ O3-Pro chat call succeeded")
-                self.logger.warning("💰 Test completed - check your billing!")
-                return True
-            else:
-                self.logger.error("❌ O3-Pro chat call failed")
+            if not response:
+                self.logger.error("❌ O3-Pro chat call failed - no response")
+                if tool_result and "error" in tool_result:
+                    error_msg = tool_result["error"]
+                    self.logger.error(f"Error details: {error_msg}")
+                    # Check if it's the endpoint error we're trying to fix
+                    if "v1/responses" in str(error_msg) and "v1/chat/completions" in str(error_msg):
+                        self.logger.error(
+                            "❌ ENDPOINT BUG DETECTED: o3-pro is trying to use chat/completions instead of responses endpoint!"
+                        )
                return False

+            # Check the metadata to verify endpoint was used
+            if tool_result and isinstance(tool_result, dict):
+                metadata = tool_result.get("metadata", {})
+                endpoint_used = metadata.get("endpoint", "unknown")
+
+                if endpoint_used == "responses":
+                    self.logger.info("✅ Correct endpoint used: /v1/responses")
+                else:
+                    self.logger.warning(f"⚠️ Endpoint used: {endpoint_used} (expected: responses)")
+
+            # Verify the response content
+            if response and "4" in str(response):
+                self.logger.info("✅ O3-Pro response is mathematically correct")
+            else:
+                self.logger.warning(f"⚠️ Unexpected response: {response}")
+
+            self.logger.info("✅ O3-Pro test completed successfully")
+            self.logger.warning("💰 Test completed - check your billing!")
+            return True
+
        except Exception as e:
-            self.logger.error(f"O3-Pro test failed: {e}")
+            self.logger.error(f"O3-Pro test failed with exception: {e}")
+            # Log the full error for debugging endpoint issues
+            import traceback
+
+            self.logger.error(f"Full traceback: {traceback.format_exc()}")
            return False