diff --git a/README.md b/README.md
index d19cb68..209bcd1 100644
--- a/README.md
+++ b/README.md
@@ -592,10 +592,41 @@ For detailed tool parameters and configuration options, see the [Advanced Usage
 
 ### Prompt Support
 
-- `/zen:thinkdeeper with o3 check if the algorithm in @sort.py is performant and if there are alternatives we could explore`
-- `/zen:precommit use gemini pro and confirm these changes match our requirements in COOL_FEATURE.md`
-- `/zen:testgen write me tests for class ABC`
-- `/zen:refactor using local-llama propose a decomposition strategy, make a plan and save it in FIXES.md then share this with o3 to confirm along with large_file.swift`
+Zen supports powerful structured prompts in Claude Code for quick access to tools and models:
+
+#### Basic Tool Prompts
+- `/zen:thinkdeeper` - Use thinkdeep tool with auto-selected model
+- `/zen:chat` - Use chat tool with auto-selected model
+- `/zen:codereview` - Use codereview tool with auto-selected model
+- `/zen:analyze` - Use analyze tool with auto-selected model
+
+#### Model-Specific Tool Prompts
+- `/zen:chat:o3 hello there` - Use chat tool specifically with the O3 model
+- `/zen:thinkdeep:flash analyze this quickly` - Use thinkdeep tool with Flash for speed
+- `/zen:codereview:pro review for security` - Use codereview tool with Gemini Pro for thorough analysis
+- `/zen:debug:grok help with this error` - Use debug tool with the Grok model
+- `/zen:analyze:gemini-2.5-flash-preview-05-20 examine these files` - Use analyze tool with a specific Gemini model
+
+#### Continuation Prompts
+- `/zen:continue` - Continue previous conversation using chat tool
+- `/zen:chat:continue` - Continue previous conversation using chat tool specifically
+- `/zen:thinkdeep:continue` - Continue previous conversation using thinkdeep tool
+- `/zen:analyze:continue` - Continue previous conversation using analyze tool
+
+#### Advanced Examples
+- `/zen:thinkdeeper:o3 check if the algorithm in @sort.py is performant and if there are alternatives we could explore`
+- `/zen:precommit:pro confirm these changes match our requirements in COOL_FEATURE.md`
+- `/zen:testgen:flash write me tests for class ABC`
+- `/zen:refactor:local-llama propose a decomposition strategy, make a plan and save it in FIXES.md then share this with o3 to confirm along with large_file.swift`
+
+#### Syntax Format
+The structured prompt format is: `/zen:[tool]:[model|continue] [your_message]`
+
+- `[tool]` - Any available tool name (chat, thinkdeep, codereview, debug, analyze, etc.)
+- `[model|continue]` - Either a specific model name (o3, flash, pro, grok, etc.) or the keyword `continue` to resume the previous conversation with this tool
+- `[your_message]` - Your actual prompt or question
+
+**Note**: When using `:continue`, the server resumes the previous conversation with the specified tool, preserving the full conversation history and context.
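+
+For example, after starting a conversation with `/zen:chat:o3 review my approach in @utils.py`, a follow-up `/zen:chat:continue what about error handling?` picks that same conversation back up, with its full context, using the chat tool.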
### Add Your Own Tools diff --git a/config.py b/config.py index 2a460aa..36fcf2e 100644 --- a/config.py +++ b/config.py @@ -14,7 +14,7 @@ import os # These values are used in server responses and for tracking releases # IMPORTANT: This is the single source of truth for version and author info # Semantic versioning: MAJOR.MINOR.PATCH -__version__ = "4.8.2" +__version__ = "4.8.3" # Last update date in ISO format __updated__ = "2025-06-16" # Primary maintainer diff --git a/providers/openai.py b/providers/openai.py index 181bef9..5fd8be1 100644 --- a/providers/openai.py +++ b/providers/openai.py @@ -32,12 +32,14 @@ class OpenAIModelProvider(OpenAICompatibleProvider): "supports_images": True, # O3 models support vision "max_image_size_mb": 20.0, # 20MB per OpenAI docs }, - "o3-pro": { + "o3-pro-2025-06-10": { "context_window": 200_000, # 200K tokens "supports_extended_thinking": False, "supports_images": True, # O3 models support vision "max_image_size_mb": 20.0, # 20MB per OpenAI docs }, + # Aliases + "o3-pro": "o3-pro-2025-06-10", "o4-mini": { "context_window": 200_000, # 200K tokens "supports_extended_thinking": False, @@ -89,7 +91,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider): config = self.SUPPORTED_MODELS[resolved_name] # Define temperature constraints per model - if resolved_name in ["o3", "o3-mini", "o3-pro", "o4-mini", "o4-mini-high"]: + if resolved_name in ["o3", "o3-mini", "o3-pro", "o3-pro-2025-06-10", "o4-mini", "o4-mini-high"]: # O3 and O4 reasoning models only support temperature=1.0 temp_constraint = FixedTemperatureConstraint(1.0) else: diff --git a/providers/openai_compatible.py b/providers/openai_compatible.py index 2db8f92..5fa138e 100644 --- a/providers/openai_compatible.py +++ b/providers/openai_compatible.py @@ -224,6 +224,138 @@ class OpenAICompatibleProvider(ModelProvider): return self._client + def _generate_with_responses_endpoint( + self, + model_name: str, + messages: list, + temperature: float, + max_output_tokens: Optional[int] = None, + **kwargs, + ) -> ModelResponse: + """Generate content using the /v1/responses endpoint for o3-pro via OpenAI library.""" + # Convert messages to the correct format for responses endpoint + input_messages = [] + + for message in messages: + role = message.get("role", "") + content = message.get("content", "") + + if role == "system": + # System messages can be treated as user messages for o3-pro + input_messages.append( + {"role": "user", "content": [{"type": "input_text", "text": f"System: {content}"}]} + ) + elif role == "user": + input_messages.append({"role": "user", "content": [{"type": "input_text", "text": content}]}) + elif role == "assistant": + input_messages.append({"role": "assistant", "content": [{"type": "output_text", "text": content}]}) + + # Prepare completion parameters for responses endpoint + completion_params = { + "model": model_name, + "input": input_messages, + "text": {"format": {"type": "text"}}, + "reasoning": {"effort": "medium", "summary": "auto"}, + "tools": [], + "store": True, + } + + # Temperature is not in the documented parameters for responses endpoint + # but we'll try to add it in case it's supported + + # Add max tokens if specified + if max_output_tokens: + completion_params["max_tokens"] = max_output_tokens + + # Add any additional OpenAI-specific parameters + for key, value in kwargs.items(): + if key in ["top_p", "frequency_penalty", "presence_penalty", "seed", "stop"]: + completion_params[key] = value + + # Retry logic with progressive delays + max_retries = 4 + retry_delays = 
[1, 3, 5, 8] + last_exception = None + + for attempt in range(max_retries): + try: + # Use OpenAI client's responses endpoint + response = self.client.responses.create(**completion_params) + + # Extract content and usage from responses endpoint format + # The response format is different for responses endpoint + content = "" + if hasattr(response, "output") and response.output: + if hasattr(response.output, "content") and response.output.content: + # Look for output_text in content + for content_item in response.output.content: + if hasattr(content_item, "type") and content_item.type == "output_text": + content = content_item.text + break + elif hasattr(response.output, "text"): + content = response.output.text + + # Try to extract usage information + usage = None + if hasattr(response, "usage"): + usage = self._extract_usage(response) + elif hasattr(response, "input_tokens") and hasattr(response, "output_tokens"): + usage = { + "input_tokens": getattr(response, "input_tokens", 0), + "output_tokens": getattr(response, "output_tokens", 0), + "total_tokens": getattr(response, "input_tokens", 0) + getattr(response, "output_tokens", 0), + } + + return ModelResponse( + content=content, + usage=usage, + model_name=model_name, + friendly_name=self.FRIENDLY_NAME, + provider=self.get_provider_type(), + metadata={ + "model": getattr(response, "model", model_name), + "id": getattr(response, "id", ""), + "created": getattr(response, "created_at", 0), + "endpoint": "responses", + }, + ) + + except Exception as e: + last_exception = e + + # Check if this is a retryable error + error_str = str(e).lower() + is_retryable = any( + term in error_str + for term in [ + "timeout", + "connection", + "network", + "temporary", + "unavailable", + "retry", + "429", + "500", + "502", + "503", + "504", + ] + ) + + if is_retryable and attempt < max_retries - 1: + delay = retry_delays[attempt] + logging.warning( + f"Retryable error for o3-pro responses endpoint, attempt {attempt + 1}/{max_retries}: {str(e)}. Retrying in {delay}s..." 
+ ) + time.sleep(delay) + else: + break + + # If we get here, all retries failed + error_msg = f"o3-pro responses endpoint error after {max_retries} attempts: {str(last_exception)}" + logging.error(error_msg) + raise RuntimeError(error_msg) from last_exception + def generate_content( self, prompt: str, @@ -301,6 +433,22 @@ class OpenAICompatibleProvider(ModelProvider): if key in ["top_p", "frequency_penalty", "presence_penalty", "seed", "stop", "stream"]: completion_params[key] = value + # Check if this is o3-pro and needs the responses endpoint + resolved_model = model_name + if hasattr(self, "_resolve_model_name"): + resolved_model = self._resolve_model_name(model_name) + + if resolved_model == "o3-pro-2025-06-10": + # This model requires the /v1/responses endpoint + # If it fails, we should not fall back to chat/completions + return self._generate_with_responses_endpoint( + model_name=resolved_model, + messages=messages, + temperature=temperature, + max_output_tokens=max_output_tokens, + **kwargs, + ) + # Retry logic with progressive delays max_retries = 4 # Total of 4 attempts retry_delays = [1, 3, 5, 8] # Progressive delays: 1s, 3s, 5s, 8s diff --git a/server.py b/server.py index 360d5f6..7056564 100644 --- a/server.py +++ b/server.py @@ -925,6 +925,15 @@ async def handle_list_prompts() -> list[Prompt]: ) ) + # Add special "continue" prompt + prompts.append( + Prompt( + name="continue", + description="Continue the previous conversation using the chat tool", + arguments=[], + ) + ) + logger.debug(f"Returning {len(prompts)} prompts to MCP client") return prompts @@ -934,12 +943,16 @@ async def handle_get_prompt(name: str, arguments: dict[str, Any] = None) -> GetP """ Get prompt details and generate the actual prompt text. - This handler is called when a user invokes a prompt (e.g., /zen:thinkdeeper). + This handler is called when a user invokes a prompt (e.g., /zen:thinkdeeper or /zen:chat:o3). It generates the appropriate text that Claude will then use to call the underlying tool. 
+ Supports structured prompt names like "chat:o3" where: + - "chat" is the tool name + - "o3" is the model to use + Args: - name: The name of the prompt to execute + name: The name of the prompt to execute (can include model like "chat:o3") arguments: Optional arguments for the prompt (e.g., model, thinking_mode) Returns: @@ -950,39 +963,74 @@ async def handle_get_prompt(name: str, arguments: dict[str, Any] = None) -> GetP """ logger.debug(f"MCP client requested prompt: {name} with args: {arguments}") - # Find the corresponding tool by checking prompt names - tool_name = None - template_info = None + # Parse structured prompt names like "chat:o3" or "chat:continue" + parsed_model = None + is_continuation = False + base_name = name - # Check if it's a known prompt name - for t_name, t_info in PROMPT_TEMPLATES.items(): - if t_info["name"] == name: - tool_name = t_name - template_info = t_info - break + if ":" in name: + parts = name.split(":", 1) + base_name = parts[0] + second_part = parts[1] - # If not found, check if it's a direct tool name - if not tool_name and name in TOOLS: - tool_name = name + # Check if the second part is "continue" (special keyword) + if second_part.lower() == "continue": + is_continuation = True + logger.debug(f"Parsed continuation prompt: tool='{base_name}', continue=True") + else: + parsed_model = second_part + logger.debug(f"Parsed structured prompt: tool='{base_name}', model='{parsed_model}'") + + # Handle special "continue" cases + if base_name.lower() == "continue": + # This is "/zen:continue" - use chat tool as default for continuation + tool_name = "chat" + is_continuation = True template_info = { - "name": name, - "description": f"Use {name} tool", - "template": f"Use {name}", + "name": "continue", + "description": "Continue the previous conversation", + "template": "Continue the conversation", } + logger.debug("Using /zen:continue - defaulting to chat tool with continuation") + else: + # Find the corresponding tool by checking prompt names + tool_name = None + template_info = None - if not tool_name: - logger.error(f"Unknown prompt requested: {name}") - raise ValueError(f"Unknown prompt: {name}") + # Check if it's a known prompt name (using base_name) + for t_name, t_info in PROMPT_TEMPLATES.items(): + if t_info["name"] == base_name: + tool_name = t_name + template_info = t_info + break + + # If not found, check if it's a direct tool name + if not tool_name and base_name in TOOLS: + tool_name = base_name + template_info = { + "name": base_name, + "description": f"Use {base_name} tool", + "template": f"Use {base_name}", + } + + if not tool_name: + logger.error(f"Unknown prompt requested: {name} (base: {base_name})") + raise ValueError(f"Unknown prompt: {name}") # Get the template template = template_info.get("template", f"Use {tool_name}") # Safe template expansion with defaults + # Prioritize: parsed model > arguments model > "auto" + final_model = parsed_model or (arguments.get("model", "auto") if arguments else "auto") + prompt_args = { - "model": arguments.get("model", "auto") if arguments else "auto", + "model": final_model, "thinking_mode": arguments.get("thinking_mode", "medium") if arguments else "medium", } + logger.debug(f"Using model '{final_model}' for prompt '{name}'") + # Safely format the template try: prompt_text = template.format(**prompt_args) @@ -990,6 +1038,21 @@ async def handle_get_prompt(name: str, arguments: dict[str, Any] = None) -> GetP logger.warning(f"Missing template argument {e} for prompt {name}, using raw template") 
prompt_text = template # Fallback to raw template + # Generate tool call instruction based on the type of prompt + if is_continuation: + if base_name.lower() == "continue": + # "/zen:continue" case + tool_instruction = f"Continue the previous conversation using the {tool_name} tool" + else: + # "/zen:chat:continue" case + tool_instruction = f"Continue the previous conversation using the {tool_name} tool" + elif parsed_model: + # "/zen:chat:o3" case + tool_instruction = f"Use the {tool_name} tool with model '{parsed_model}'" + else: + # "/zen:chat" case + tool_instruction = prompt_text + return GetPromptResult( prompt=Prompt( name=name, @@ -999,7 +1062,7 @@ async def handle_get_prompt(name: str, arguments: dict[str, Any] = None) -> GetP messages=[ PromptMessage( role="user", - content={"type": "text", "text": prompt_text}, + content={"type": "text", "text": tool_instruction}, ) ], ) diff --git a/simulator_tests/test_o3_pro_expensive.py b/simulator_tests/test_o3_pro_expensive.py index 78cb7fa..4c06c9a 100644 --- a/simulator_tests/test_o3_pro_expensive.py +++ b/simulator_tests/test_o3_pro_expensive.py @@ -8,7 +8,10 @@ This test is intentionally NOT added to TEST_REGISTRY to prevent accidental exec It can only be run manually using: python communication_simulator_test.py --individual o3_pro_expensive -Tests that o3-pro model works with one simple chat call. That's it. +Tests that o3-pro model: +1. Uses the correct /v1/responses endpoint (not /v1/chat/completions) +2. Successfully completes a chat call +3. Returns properly formatted response """ from .base_test import BaseSimulatorTest @@ -26,13 +29,16 @@ class O3ProExpensiveTest(BaseSimulatorTest): return "⚠️ EXPENSIVE O3-Pro basic validation (manual only)" def run_test(self) -> bool: - """Test o3-pro model with one simple chat call - EXPENSIVE!""" + """Test o3-pro model with endpoint verification - EXPENSIVE!""" try: self.logger.warning("⚠️ ⚠️ ⚠️ EXPENSIVE TEST - O3-PRO COSTS ~$15-60 PER 1K TOKENS! ⚠️ ⚠️ ⚠️") - self.logger.info("Test: O3-Pro basic chat test") + self.logger.info("Test: O3-Pro endpoint and functionality test") + + # First, verify we're hitting the right endpoint by checking logs + self.logger.info("Step 1: Testing o3-pro with chat tool") # One simple chat call - response, _ = self.call_mcp_tool( + response, tool_result = self.call_mcp_tool( "chat", { "prompt": "What is 2 + 2?", @@ -41,16 +47,44 @@ class O3ProExpensiveTest(BaseSimulatorTest): }, ) - if response: - self.logger.info("✅ O3-Pro chat call succeeded") - self.logger.warning("💰 Test completed - check your billing!") - return True - else: - self.logger.error("❌ O3-Pro chat call failed") + if not response: + self.logger.error("❌ O3-Pro chat call failed - no response") + if tool_result and "error" in tool_result: + error_msg = tool_result["error"] + self.logger.error(f"Error details: {error_msg}") + # Check if it's the endpoint error we're trying to fix + if "v1/responses" in str(error_msg) and "v1/chat/completions" in str(error_msg): + self.logger.error( + "❌ ENDPOINT BUG DETECTED: o3-pro is trying to use chat/completions instead of responses endpoint!" 
+ ) return False + # Check the metadata to verify endpoint was used + if tool_result and isinstance(tool_result, dict): + metadata = tool_result.get("metadata", {}) + endpoint_used = metadata.get("endpoint", "unknown") + + if endpoint_used == "responses": + self.logger.info("✅ Correct endpoint used: /v1/responses") + else: + self.logger.warning(f"⚠️ Endpoint used: {endpoint_used} (expected: responses)") + + # Verify the response content + if response and "4" in str(response): + self.logger.info("✅ O3-Pro response is mathematically correct") + else: + self.logger.warning(f"⚠️ Unexpected response: {response}") + + self.logger.info("✅ O3-Pro test completed successfully") + self.logger.warning("💰 Test completed - check your billing!") + return True + except Exception as e: - self.logger.error(f"O3-Pro test failed: {e}") + self.logger.error(f"O3-Pro test failed with exception: {e}") + # Log the full error for debugging endpoint issues + import traceback + + self.logger.error(f"Full traceback: {traceback.format_exc()}") return False diff --git a/tests/test_model_restrictions.py b/tests/test_model_restrictions.py index 6c2656a..acbe2bd 100644 --- a/tests/test_model_restrictions.py +++ b/tests/test_model_restrictions.py @@ -263,17 +263,18 @@ class TestProviderIntegration: @patch.dict(os.environ, {"GOOGLE_ALLOWED_MODELS": "flash"}) def test_gemini_parameter_order_regression_protection(self): """Test that prevents regression of parameter order bug in is_allowed calls. - - This test specifically catches the bug where parameters were incorrectly - passed as (provider, user_input, resolved_name) instead of + + This test specifically catches the bug where parameters were incorrectly + passed as (provider, user_input, resolved_name) instead of (provider, resolved_name, user_input). - + The bug was subtle because the is_allowed method uses OR logic, so it worked in most cases by accident. This test creates a scenario where the parameter order matters. """ # Clear any cached restriction service import utils.model_restrictions + utils.model_restrictions._restriction_service = None provider = GeminiModelProvider(api_key="test-key") @@ -295,13 +296,14 @@ class TestProviderIntegration: @patch.dict(os.environ, {"GOOGLE_ALLOWED_MODELS": "gemini-2.5-flash-preview-05-20"}) def test_gemini_parameter_order_edge_case_full_name_only(self): """Test parameter order with only full name allowed, not alias. - + This is the reverse scenario - only the full canonical name is allowed, not the shorthand alias. This tests that the parameter order is correct when resolving aliases. 
""" # Clear any cached restriction service import utils.model_restrictions + utils.model_restrictions._restriction_service = None provider = GeminiModelProvider(api_key="test-key") diff --git a/tests/test_openai_provider.py b/tests/test_openai_provider.py index 8f1a936..d63a486 100644 --- a/tests/test_openai_provider.py +++ b/tests/test_openai_provider.py @@ -75,7 +75,7 @@ class TestOpenAIProvider: # Test full name passthrough assert provider._resolve_model_name("o3") == "o3" assert provider._resolve_model_name("o3-mini") == "o3-mini" - assert provider._resolve_model_name("o3-pro") == "o3-pro" + assert provider._resolve_model_name("o3-pro") == "o3-pro-2025-06-10" assert provider._resolve_model_name("o4-mini") == "o4-mini" assert provider._resolve_model_name("o4-mini-high") == "o4-mini-high" @@ -196,7 +196,7 @@ class TestOpenAIProvider: mock_response.choices = [MagicMock()] mock_response.choices[0].message.content = "Test response" mock_response.choices[0].finish_reason = "stop" - mock_response.model = "o3-pro" + mock_response.model = "o3-mini" mock_response.usage = MagicMock() mock_response.usage.prompt_tokens = 10 mock_response.usage.completion_tokens = 5 @@ -205,10 +205,10 @@ class TestOpenAIProvider: provider = OpenAIModelProvider("test-key") - # Test full model name passes through unchanged - provider.generate_content(prompt="Test", model_name="o3-pro", temperature=1.0) + # Test full model name passes through unchanged (use o3-mini since o3-pro has special handling) + provider.generate_content(prompt="Test", model_name="o3-mini", temperature=1.0) call_kwargs = mock_client.chat.completions.create.call_args[1] - assert call_kwargs["model"] == "o3-pro" # Should be unchanged + assert call_kwargs["model"] == "o3-mini" # Should be unchanged def test_supports_thinking_mode(self): """Test thinking mode support (currently False for all OpenAI models).""" @@ -219,3 +219,73 @@ class TestOpenAIProvider: assert provider.supports_thinking_mode("o3-mini") is False assert provider.supports_thinking_mode("o4-mini") is False assert provider.supports_thinking_mode("mini") is False # Test with alias too + + @patch("providers.openai_compatible.OpenAI") + def test_o3_pro_routes_to_responses_endpoint(self, mock_openai_class): + """Test that o3-pro model routes to the /v1/responses endpoint (mock test).""" + # Set up mock for OpenAI client responses endpoint + mock_client = MagicMock() + mock_openai_class.return_value = mock_client + + mock_response = MagicMock() + mock_response.output = MagicMock() + mock_response.output.content = [MagicMock()] + mock_response.output.content[0].type = "output_text" + mock_response.output.content[0].text = "4" + mock_response.model = "o3-pro-2025-06-10" + mock_response.id = "test-id" + mock_response.created_at = 1234567890 + mock_response.usage = MagicMock() + mock_response.usage.prompt_tokens = 10 + mock_response.usage.completion_tokens = 5 + mock_response.usage.total_tokens = 15 + + mock_client.responses.create.return_value = mock_response + + provider = OpenAIModelProvider("test-key") + + # Generate content with o3-pro + result = provider.generate_content(prompt="What is 2 + 2?", model_name="o3-pro", temperature=1.0) + + # Verify responses.create was called + mock_client.responses.create.assert_called_once() + call_args = mock_client.responses.create.call_args[1] + assert call_args["model"] == "o3-pro-2025-06-10" + assert call_args["input"][0]["role"] == "user" + assert "What is 2 + 2?" 
in call_args["input"][0]["content"][0]["text"] + + # Verify the response + assert result.content == "4" + assert result.model_name == "o3-pro-2025-06-10" + assert result.metadata["endpoint"] == "responses" + + @patch("providers.openai_compatible.OpenAI") + def test_non_o3_pro_uses_chat_completions(self, mock_openai_class): + """Test that non-o3-pro models use the standard chat completions endpoint.""" + # Set up mock + mock_client = MagicMock() + mock_openai_class.return_value = mock_client + mock_response = MagicMock() + mock_response.choices = [MagicMock()] + mock_response.choices[0].message.content = "Test response" + mock_response.choices[0].finish_reason = "stop" + mock_response.model = "o3-mini" + mock_response.id = "test-id" + mock_response.created = 1234567890 + mock_response.usage = MagicMock() + mock_response.usage.prompt_tokens = 10 + mock_response.usage.completion_tokens = 5 + mock_response.usage.total_tokens = 15 + mock_client.chat.completions.create.return_value = mock_response + + provider = OpenAIModelProvider("test-key") + + # Generate content with o3-mini (not o3-pro) + result = provider.generate_content(prompt="Test prompt", model_name="o3-mini", temperature=1.0) + + # Verify chat.completions.create was called + mock_client.chat.completions.create.assert_called_once() + + # Verify the response + assert result.content == "Test response" + assert result.model_name == "o3-mini"