From 9f3b70d6d704a06e5ffc7a4cc54c2e7ab961a210 Mon Sep 17 00:00:00 2001 From: Fahad Date: Thu, 19 Jun 2025 08:30:46 +0400 Subject: [PATCH] Added proper temperature constraints to the models; fixes: https://github.com/BeehiveInnovations/zen-mcp-server/issues/78 Prompt tweaks --- conf/custom_models.json | 16 ++ config.py | 2 +- providers/base.py | 35 ++++ providers/custom.py | 1 + providers/gemini.py | 1 + providers/openai_compatible.py | 32 +++- providers/openai_provider.py | 27 ++- providers/openrouter_registry.py | 22 ++- systemprompts/chat_prompt.py | 22 +-- systemprompts/precommit_prompt.py | 67 ++++--- systemprompts/thinkdeep_prompt.py | 32 ++-- tests/test_o3_temperature_fix_simple.py | 239 ++++++++++++++++++++++++ tests/test_openai_provider.py | 18 +- 13 files changed, 435 insertions(+), 79 deletions(-) create mode 100644 tests/test_o3_temperature_fix_simple.py diff --git a/conf/custom_models.json b/conf/custom_models.json index 1466451..8299a91 100644 --- a/conf/custom_models.json +++ b/conf/custom_models.json @@ -27,6 +27,8 @@ "supports_function_calling": "Whether the model supports function/tool calling", "supports_images": "Whether the model can process images/visual input", "max_image_size_mb": "Maximum total size in MB for all images combined (capped at 40MB max for custom models)", + "supports_temperature": "Whether the model accepts temperature parameter in API calls (set to false for O3/O4 reasoning models)", + "temperature_constraint": "Type of temperature constraint: 'fixed' (fixed value), 'range' (continuous range), 'discrete' (specific values), or omit for default range", "is_custom": "Set to true for models that should ONLY be used with custom API endpoints (Ollama, vLLM, etc.). False or omitted for OpenRouter/cloud models.", "description": "Human-readable description of the model" }, @@ -39,6 +41,8 @@ "supports_function_calling": true, "supports_images": true, "max_image_size_mb": 10.0, + "supports_temperature": true, + "temperature_constraint": "range", "is_custom": true, "description": "Example custom/local model for Ollama, vLLM, etc." 
} @@ -152,6 +156,8 @@ "supports_function_calling": true, "supports_images": true, "max_image_size_mb": 20.0, + "supports_temperature": false, + "temperature_constraint": "fixed", "description": "OpenAI's o3 model - well-rounded and powerful across domains with vision" }, { @@ -163,6 +169,8 @@ "supports_function_calling": true, "supports_images": true, "max_image_size_mb": 20.0, + "supports_temperature": false, + "temperature_constraint": "fixed", "description": "OpenAI's o3-mini model - balanced performance and speed with vision" }, { @@ -174,6 +182,8 @@ "supports_function_calling": true, "supports_images": true, "max_image_size_mb": 20.0, + "supports_temperature": false, + "temperature_constraint": "fixed", "description": "OpenAI's o3-mini with high reasoning effort - optimized for complex problems with vision" }, { @@ -185,6 +195,8 @@ "supports_function_calling": true, "supports_images": true, "max_image_size_mb": 20.0, + "supports_temperature": false, + "temperature_constraint": "fixed", "description": "OpenAI's o3-pro model - professional-grade reasoning and analysis with vision" }, { @@ -196,6 +208,8 @@ "supports_function_calling": true, "supports_images": true, "max_image_size_mb": 20.0, + "supports_temperature": false, + "temperature_constraint": "fixed", "description": "OpenAI's o4-mini model - optimized for shorter contexts with rapid reasoning and vision" }, { @@ -207,6 +221,8 @@ "supports_function_calling": true, "supports_images": true, "max_image_size_mb": 20.0, + "supports_temperature": false, + "temperature_constraint": "fixed", "description": "OpenAI's o4-mini with high reasoning effort - enhanced for complex tasks with vision" }, { diff --git a/config.py b/config.py index 38f310c..a873162 100644 --- a/config.py +++ b/config.py @@ -14,7 +14,7 @@ import os # These values are used in server responses and for tracking releases # IMPORTANT: This is the single source of truth for version and author info # Semantic versioning: MAJOR.MINOR.PATCH -__version__ = "5.1.3" +__version__ = "5.1.4" # Last update date in ISO format __updated__ = "2025-06-19" # Primary maintainer diff --git a/providers/base.py b/providers/base.py index 38bdab6..386e4f2 100644 --- a/providers/base.py +++ b/providers/base.py @@ -100,6 +100,26 @@ class DiscreteTemperatureConstraint(TemperatureConstraint): return self.default_temp +def create_temperature_constraint(constraint_type: str) -> TemperatureConstraint: + """Create temperature constraint object from configuration string. 
+ + Args: + constraint_type: Type of constraint ("fixed", "range", "discrete") + + Returns: + TemperatureConstraint object based on configuration + """ + if constraint_type == "fixed": + # Fixed temperature models (O3/O4) only support temperature=1.0 + return FixedTemperatureConstraint(1.0) + elif constraint_type == "discrete": + # For models with specific allowed values - using common OpenAI values as default + return DiscreteTemperatureConstraint([0.0, 0.3, 0.7, 1.0, 1.5, 2.0], 0.7) + else: + # Default range constraint (for "range" or None) + return RangeTemperatureConstraint(0.0, 2.0, 0.7) + + @dataclass class ModelCapabilities: """Capabilities and constraints for a specific model.""" @@ -114,6 +134,7 @@ class ModelCapabilities: supports_function_calling: bool = False supports_images: bool = False # Whether model can process images max_image_size_mb: float = 0.0 # Maximum total size for all images in MB + supports_temperature: bool = True # Whether model accepts temperature parameter in API calls # Temperature constraint object - preferred way to define temperature limits temperature_constraint: TemperatureConstraint = field( @@ -245,3 +266,17 @@ class ModelProvider(ABC): List of all model names and alias targets known by this provider """ pass + + def _resolve_model_name(self, model_name: str) -> str: + """Resolve model shorthand to full name. + + Base implementation returns the model name unchanged. + Subclasses should override to provide alias resolution. + + Args: + model_name: Model name that may be an alias + + Returns: + Resolved model name + """ + return model_name diff --git a/providers/custom.py b/providers/custom.py index 70a1d41..bad1062 100644 --- a/providers/custom.py +++ b/providers/custom.py @@ -162,6 +162,7 @@ class CustomProvider(OpenAICompatibleProvider): supports_system_prompts=True, supports_streaming=True, supports_function_calling=False, # Conservative default + supports_temperature=True, # Most custom models accept temperature parameter temperature_constraint=RangeTemperatureConstraint(0.0, 2.0, 0.7), ) diff --git a/providers/gemini.py b/providers/gemini.py index 4530481..6aec45a 100644 --- a/providers/gemini.py +++ b/providers/gemini.py @@ -94,6 +94,7 @@ class GeminiModelProvider(ModelProvider): supports_function_calling=True, supports_images=config.get("supports_images", False), max_image_size_mb=config.get("max_image_size_mb", 0.0), + supports_temperature=True, # Gemini models accept temperature parameter temperature_constraint=temp_constraint, ) diff --git a/providers/openai_compatible.py b/providers/openai_compatible.py index 90fc30c..f65df88 100644 --- a/providers/openai_compatible.py +++ b/providers/openai_compatible.py @@ -448,23 +448,41 @@ class OpenAICompatibleProvider(ModelProvider): completion_params = { "model": model_name, "messages": messages, - "temperature": temperature, } - # Add max tokens if specified - if max_output_tokens: + # Check model capabilities once to determine parameter support + resolved_model = self._resolve_model_name(model_name) + + # Get model capabilities once to avoid duplicate calls + try: + capabilities = self.get_capabilities(model_name) + # Defensive check for supports_temperature field (backward compatibility) + supports_temperature = getattr(capabilities, "supports_temperature", True) + except Exception as e: + # If capability check fails, fall back to conservative behavior + # Default to including temperature for most models (backward compatibility) + logging.debug(f"Failed to check temperature support for 
{model_name}: {e}") + supports_temperature = True + + # Add temperature parameter if supported + if supports_temperature: + completion_params["temperature"] = temperature + + # Add max tokens if specified and model supports it + # O3/O4 models that don't support temperature also don't support max_tokens + if max_output_tokens and supports_temperature: completion_params["max_tokens"] = max_output_tokens # Add any additional OpenAI-specific parameters + # Use capabilities to filter parameters for reasoning models for key, value in kwargs.items(): if key in ["top_p", "frequency_penalty", "presence_penalty", "seed", "stop", "stream"]: + # Reasoning models (those that don't support temperature) also don't support these parameters + if not supports_temperature and key in ["top_p", "frequency_penalty", "presence_penalty"]: + continue # Skip unsupported parameters for reasoning models completion_params[key] = value # Check if this is o3-pro and needs the responses endpoint - resolved_model = model_name - if hasattr(self, "_resolve_model_name"): - resolved_model = self._resolve_model_name(model_name) - if resolved_model == "o3-pro-2025-06-10": # This model requires the /v1/responses endpoint # If it fails, we should not fall back to chat/completions diff --git a/providers/openai_provider.py b/providers/openai_provider.py index 48a102b..ed31760 100644 --- a/providers/openai_provider.py +++ b/providers/openai_provider.py @@ -4,11 +4,10 @@ import logging from typing import Optional from .base import ( - FixedTemperatureConstraint, ModelCapabilities, ModelResponse, ProviderType, - RangeTemperatureConstraint, + create_temperature_constraint, ) from .openai_compatible import OpenAICompatibleProvider @@ -25,18 +24,24 @@ class OpenAIModelProvider(OpenAICompatibleProvider): "supports_extended_thinking": False, "supports_images": True, # O3 models support vision "max_image_size_mb": 20.0, # 20MB per OpenAI docs + "supports_temperature": False, # O3 models don't accept temperature parameter + "temperature_constraint": "fixed", # Fixed at 1.0 }, "o3-mini": { "context_window": 200_000, # 200K tokens "supports_extended_thinking": False, "supports_images": True, # O3 models support vision "max_image_size_mb": 20.0, # 20MB per OpenAI docs + "supports_temperature": False, # O3 models don't accept temperature parameter + "temperature_constraint": "fixed", # Fixed at 1.0 }, "o3-pro-2025-06-10": { "context_window": 200_000, # 200K tokens "supports_extended_thinking": False, "supports_images": True, # O3 models support vision "max_image_size_mb": 20.0, # 20MB per OpenAI docs + "supports_temperature": False, # O3 models don't accept temperature parameter + "temperature_constraint": "fixed", # Fixed at 1.0 }, # Aliases "o3-pro": "o3-pro-2025-06-10", @@ -45,18 +50,24 @@ class OpenAIModelProvider(OpenAICompatibleProvider): "supports_extended_thinking": False, "supports_images": True, # O4 models support vision "max_image_size_mb": 20.0, # 20MB per OpenAI docs + "supports_temperature": False, # O4 models don't accept temperature parameter + "temperature_constraint": "fixed", # Fixed at 1.0 }, "o4-mini-high": { "context_window": 200_000, # 200K tokens "supports_extended_thinking": False, "supports_images": True, # O4 models support vision "max_image_size_mb": 20.0, # 20MB per OpenAI docs + "supports_temperature": False, # O4 models don't accept temperature parameter + "temperature_constraint": "fixed", # Fixed at 1.0 }, "gpt-4.1-2025-04-14": { "context_window": 1_000_000, # 1M tokens "supports_extended_thinking": False, 
"supports_images": True, # GPT-4.1 supports vision "max_image_size_mb": 20.0, # 20MB per OpenAI docs + "supports_temperature": True, # Regular models accept temperature parameter + "temperature_constraint": "range", # 0.0-2.0 range }, # Shorthands "mini": "o4-mini", # Default 'mini' to latest mini model @@ -90,13 +101,10 @@ class OpenAIModelProvider(OpenAICompatibleProvider): config = self.SUPPORTED_MODELS[resolved_name] - # Define temperature constraints per model - if resolved_name in ["o3", "o3-mini", "o3-pro", "o3-pro-2025-06-10", "o4-mini", "o4-mini-high"]: - # O3 and O4 reasoning models only support temperature=1.0 - temp_constraint = FixedTemperatureConstraint(1.0) - else: - # Other OpenAI models (including GPT-4.1) support 0.0-2.0 range - temp_constraint = RangeTemperatureConstraint(0.0, 2.0, 0.7) + # Get temperature constraints and support from configuration + supports_temperature = config.get("supports_temperature", True) # Default to True for backward compatibility + temp_constraint_type = config.get("temperature_constraint", "range") # Default to range + temp_constraint = create_temperature_constraint(temp_constraint_type) return ModelCapabilities( provider=ProviderType.OPENAI, @@ -109,6 +117,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider): supports_function_calling=True, supports_images=config.get("supports_images", False), max_image_size_mb=config.get("max_image_size_mb", 0.0), + supports_temperature=supports_temperature, temperature_constraint=temp_constraint, ) diff --git a/providers/openrouter_registry.py b/providers/openrouter_registry.py index eb5c011..47258c8 100644 --- a/providers/openrouter_registry.py +++ b/providers/openrouter_registry.py @@ -8,7 +8,12 @@ from typing import Optional from utils.file_utils import read_json_file -from .base import ModelCapabilities, ProviderType, RangeTemperatureConstraint +from .base import ( + ModelCapabilities, + ProviderType, + TemperatureConstraint, + create_temperature_constraint, +) @dataclass @@ -25,9 +30,21 @@ class OpenRouterModelConfig: supports_json_mode: bool = False supports_images: bool = False # Whether model can process images max_image_size_mb: float = 0.0 # Maximum total size for all images in MB + supports_temperature: bool = True # Whether model accepts temperature parameter in API calls + temperature_constraint: Optional[str] = ( + None # Type of temperature constraint: "fixed", "range", "discrete", or None for default range + ) is_custom: bool = False # True for models that should only be used with custom endpoints description: str = "" + def _create_temperature_constraint(self) -> TemperatureConstraint: + """Create temperature constraint object from configuration. 
+ + Returns: + TemperatureConstraint object based on configuration + """ + return create_temperature_constraint(self.temperature_constraint or "range") + def to_capabilities(self) -> ModelCapabilities: """Convert to ModelCapabilities object.""" return ModelCapabilities( @@ -41,7 +58,8 @@ class OpenRouterModelConfig: supports_function_calling=self.supports_function_calling, supports_images=self.supports_images, max_image_size_mb=self.max_image_size_mb, - temperature_constraint=RangeTemperatureConstraint(0.0, 2.0, 1.0), + supports_temperature=self.supports_temperature, + temperature_constraint=self._create_temperature_constraint(), ) diff --git a/systemprompts/chat_prompt.py b/systemprompts/chat_prompt.py index fe2967f..73ba673 100644 --- a/systemprompts/chat_prompt.py +++ b/systemprompts/chat_prompt.py @@ -4,7 +4,7 @@ Chat tool system prompt CHAT_PROMPT = """ You are a senior engineering thought-partner collaborating with Claude. Your mission is to brainstorm, validate ideas, -and offer well-reasoned second opinions on technical decisions. +and offer well-reasoned second opinions on technical decisions when they are justified and practical. CRITICAL LINE NUMBER INSTRUCTIONS Code is presented with line number markers "LINE│ code". These markers are for reference ONLY and MUST NOT be @@ -26,27 +26,27 @@ provided unless for some reason its content is missing or incomplete: SCOPE & FOCUS • Ground every suggestion in the project's current tech stack, languages, frameworks, and constraints. -• Recommend new technologies or patterns ONLY with a clear, compelling benefit that aligns with stated goals. -• Keep proposals practical and implementable; avoid speculative or off-stack detours. +• Recommend new technologies or patterns ONLY when they provide clearly superior outcomes with minimal added complexity. +• Avoid speculative, over-engineered, or unnecessarily abstract designs that exceed current project goals or needs. +• Keep proposals practical and directly actionable within the existing architecture. COLLABORATION APPROACH -1. Engage deeply with Claude's input - extend, refine, and explore alternatives within the existing context. +1. Engage deeply with Claude's input – extend, refine, and explore alternatives ONLY WHEN they are well-justified and materially beneficial. 2. Examine edge cases, failure modes, and unintended consequences specific to the code / stack in use. 3. Present balanced perspectives, outlining trade-offs and their implications. 4. Challenge assumptions constructively while respecting current design choices and goals. -5. Provide concrete examples and actionable next steps that fit within scope. Direct, achievable next-steps where -needed. +5. Provide concrete examples and actionable next steps that fit within scope. Prioritize direct, achievable outcomes. BRAINSTORMING GUIDELINES -• Offer multiple viable strategies compatible with the current environment but keep it to the point. -• Suggest creative solutions and alternatives that work within the current project constraints, scope and limitations -• Surface pitfalls early, particularly those tied to the chosen frameworks, languages, design direction or choice +• Offer multiple viable strategies ONLY WHEN clearly beneficial within the current environment. +• Suggest creative solutions that operate within real-world constraints, and avoid proposing major shifts unless truly warranted. +• Surface pitfalls early, particularly those tied to the chosen frameworks, languages, design direction or choice. 
• Evaluate scalability, maintainability, and operational realities inside the existing architecture and current framework. -• Reference industry best practices relevant to the technologies in use +• Reference industry best practices relevant to the technologies in use. • Communicate concisely and technically, assuming an experienced engineering audience. REMEMBER -Act as a peer, not a lecturer. Aim for depth over breadth, stay within project boundaries, and help the team +Act as a peer, not a lecturer. Avoid overcomplicating. Aim for depth over breadth, stay within project boundaries, and help the team reach sound, actionable decisions. """ diff --git a/systemprompts/precommit_prompt.py b/systemprompts/precommit_prompt.py index b507a0c..17c8683 100644 --- a/systemprompts/precommit_prompt.py +++ b/systemprompts/precommit_prompt.py @@ -4,20 +4,31 @@ Precommit tool system prompt PRECOMMIT_PROMPT = """ ROLE -You are an expert pre-commit reviewer. Analyse git diffs as a senior developer giving a final sign-off to production. +You are an expert pre-commit reviewer and senior engineering partner performing final code validation before production. +Your responsibility goes beyond surface-level correctness — you are expected to think several steps ahead. Your review + must assess whether the changes: +- Introduce any patterns, structures, or decisions that may become future liabilities +- Create brittle dependencies or tight coupling that could make maintenance harder +- Omit critical safety, validation, or test scaffolding that may not fail now, but will cause issues down the line +- Interact with other known areas of fragility in the codebase even if not directly touched + +Your task is to detect potential future consequences or systemic risks, not just immediate issues. Think like an +engineer responsible for this code months later, debugging production incidents or onboarding a new developer. + +In addition to reviewing correctness, completeness, and quality of the change, apply long-term architectural thinking. +Your feedback helps ensure this code won't cause silent regressions, developer confusion, or downstream side effects later. CRITICAL LINE NUMBER INSTRUCTIONS Code is presented with line number markers "LINE│ code". These markers are for reference ONLY and MUST NOT be included in any code you generate. Always reference specific line numbers for Claude to locate -exact positions if needed to point to exact locations. Include a very short code excerpt alongside for clarity. +exact positions if needed. Include a very short code excerpt alongside for clarity. Include context_start_text and context_end_text as backup references. Never include "LINE│" markers in generated code snippets. IF MORE INFORMATION IS NEEDED -If you need additional context (e.g., related files not in the diff, test files, configuration) to provide thorough -analysis and without this context your review would be ineffective or biased, you MUST respond ONLY with this JSON -format (and nothing else). Do NOT ask for the same file you've been provided unless for some reason its content is -missing or incomplete: +If you need additional context (e.g., related files not in the diff, test files, configuration) to perform a proper +review—and without which your analysis would be incomplete or inaccurate—you MUST respond ONLY with this JSON format +(and nothing else). 
Do NOT request files you've already been provided unless their content is missing or incomplete: { "status": "files_required_to_continue", "mandatory_instructions": "", @@ -26,34 +37,36 @@ missing or incomplete: INPUTS PROVIDED 1. Git diff (staged or branch comparison) -2. Original request / acceptance criteria or some context around what changed +2. Original request / acceptance criteria or context around what changed 3. File names and related code SCOPE & FOCUS -• Review ONLY the changes in the diff and the given code -• From the diff, infer what got changed and why, determine if the changes make logical sense -• Ensure they correctly implement the request, are secure (where applicable), efficient, and maintainable and do not -cause potential regressions -• DO NOT propose broad refactors or off-scope improvements. Stick to the code and changes you have visibility into. +• Review ONLY the changes in the diff and the related code provided. +• From the diff, infer what changed and why. Determine if the changes make logical, structural, and functional sense. +• Ensure the changes correctly implement the request, are secure (where applicable), performant, and maintainable. +• DO NOT propose broad refactors or unrelated improvements. Stay strictly within the boundaries of the provided changes. REVIEW METHOD -1. Identify tech stack, frameworks, and patterns present in the diff. -2. Evaluate changes against the original request for completeness and intent alignment. -3. Detect issues, prioritising by severity (CRITICAL → HIGH → MEDIUM → LOW). -4. Highlight incomplete changes, or changes that would cause bugs, regressions, crashes or data loss or race conditions -5. Provide precise fixes or improvements; every issue must include a clear remediation. -6. Acknowledge good patterns to reinforce best practice. +1. Identify tech stack, frameworks, and patterns in the diff. +2. Evaluate changes against the original request for completeness and alignment. +3. Detect issues, prioritized by severity (CRITICAL → HIGH → MEDIUM → LOW). +4. Flag bugs, regressions, crash risks, data loss, or race conditions. +5. Recommend specific fixes for each issue raised; include code where helpful. +6. Acknowledge sound patterns to reinforce best practices. -CORE ANALYSIS (adapt to the diff and stack) -• Security – injection risks, auth/authz flaws, sensitive-data exposure, insecure dependencies, memory safety -• Bugs & Logic Errors – off-by-one, null refs, race conditions, incorrect branching -• Performance – inefficient algorithms, resource leaks, blocking operations -• Code Quality – DRY violations, complexity, SOLID adherence +CORE ANALYSIS (adapt to diff and stack) +• Security – injection risks, auth flaws, exposure of sensitive data, unsafe dependencies, memory safety +• Bugs & Logic Errors – off-by-one, null refs, incorrect logic, race conditions +• Performance – inefficient logic, blocking calls, leaks +• Code Quality – complexity, duplicated logic and DRY violations, SOLID violations -ADDITIONAL ANALYSIS (apply only when relevant) +ADDITIONAL ANALYSIS (only when relevant) • Language/runtime concerns – memory management, concurrency, exception handling • System/integration – config handling, external calls, operational impact • Testing – coverage gaps for new logic + • If no tests are found in the project, do not flag test coverage as an issue unless the change introduces logic + that is high-risk or complex. 
+ • In such cases, offer a low-severity suggestion encouraging basic tests, rather than marking it as a required fix. • Change-specific pitfalls – unused new functions, partial enum updates, scope creep, risky deletions • Determine if there are any new dependencies added but not declared, or new functionality added but not used • Determine unintended side effects: could changes in file_A break module_B even if module_B wasn't changed? @@ -84,9 +97,9 @@ that apply: [LOW] ... MAKE RECOMMENDATIONS: -Make a final, short, clear, to the point statement or list in a brief bullet point: -- Mention top priority fixes to be IMMEDIATELY made before commit -- Notable positives to keep +Make a final, short, and focused statement or bullet list: +- Top priority fixes that MUST IMMEDIATELY be addressed before commit +- Notable positives to retain Be thorough yet actionable. Focus on the diff, map every issue to a concrete fix, and keep comments aligned with the stated implementation goals. Your goal is to help flag anything that could potentially slip through diff --git a/systemprompts/thinkdeep_prompt.py b/systemprompts/thinkdeep_prompt.py index a2ee672..7b48975 100644 --- a/systemprompts/thinkdeep_prompt.py +++ b/systemprompts/thinkdeep_prompt.py @@ -4,8 +4,8 @@ ThinkDeep tool system prompt THINKDEEP_PROMPT = """ ROLE -You are a senior engineering collaborator working with Claude on complex software problems. Claude will send you -content—analysis, prompts, questions, ideas, or theories—to deepen, validate, and extend. +You are a senior engineering collaborator working alongside Claude on complex software problems. Claude will send you +content—analysis, prompts, questions, ideas, or theories—to deepen, validate, or extend with rigor and clarity. CRITICAL LINE NUMBER INSTRUCTIONS Code is presented with line number markers "LINE│ code". These markers are for reference ONLY and MUST NOT be @@ -26,25 +26,27 @@ been provided unless for some reason its content is missing or incomplete: GUIDELINES 1. Begin with context analysis: identify tech stack, languages, frameworks, and project constraints. -2. Stay on scope: avoid speculative or oversized ideas; keep suggestions practical and implementable. -3. Challenge and enrich: find gaps, question assumptions, surface hidden complexities. -4. Provide actionable next steps: concrete advice, trade-offs, and implementation tactics. -5. Use concise, direct, technical language; assume an experienced engineering audience. +2. Stay on scope: avoid speculative, over-engineered, or oversized ideas; keep suggestions practical and grounded. +3. Challenge and enrich: find gaps, question assumptions, and surface hidden complexities or risks. +4. Provide actionable next steps: offer specific advice, trade-offs, and implementation strategies. +5. Offer multiple viable strategies ONLY WHEN clearly beneficial within the current environment. +6. Suggest creative solutions that operate within real-world constraints, and avoid proposing major shifts unless truly warranted. +7. Use concise, technical language; assume an experienced engineering audience. 
KEY FOCUS AREAS (apply when relevant) -- Architecture & Design: modularity, patterns, API boundaries, dependencies -- Performance & Scalability: algorithm efficiency, concurrency, caching +- Architecture & Design: modularity, boundaries, abstraction layers, dependencies +- Performance & Scalability: algorithmic efficiency, concurrency, caching, bottlenecks - Security & Safety: validation, authentication/authorization, error handling, vulnerabilities - Quality & Maintainability: readability, testing, monitoring, refactoring -- Integration & Deployment: external systems, compatibility, operational concerns +- Integration & Deployment: ONLY IF APPLICABLE TO THE QUESTION - external systems, compatibility, configuration, operational concerns EVALUATION -Your response will be reviewed by Claude before any decision is made. Aim to enhance decision-making rather -than deliver final answers. +Your response will be reviewed by Claude before any decision is made. Your goal is to practically extend Claude's thinking, +surface blind spots, and refine options—not to deliver final answers in isolation. REMINDERS -- Ground all insights in the current project's scope and constraints. -- If additional information is necessary, such as code snippets, files, project details, use the clarification JSON -- Prefer depth over breadth; propose alternatives ONLY when they materially improve the current approach and add value -- Your goal is to be the perfect development partner that extends Claude's capabilities and thought process +- Ground all insights in the current project's architecture, limitations, and goals. +- If further context is needed, request it via the clarification JSON—nothing else. +- Prioritize depth over breadth; propose alternatives ONLY if they clearly add value and improve the current approach. +- Be the ideal development partner—rigorous, focused, and fluent in real-world software trade-offs. """ diff --git a/tests/test_o3_temperature_fix_simple.py b/tests/test_o3_temperature_fix_simple.py new file mode 100644 index 0000000..da0ea60 --- /dev/null +++ b/tests/test_o3_temperature_fix_simple.py @@ -0,0 +1,239 @@ +""" +Simple integration test for the O3 model temperature parameter fix. + +This test confirms that the fix properly excludes temperature parameters +for O3 models while maintaining them for regular models. 
+""" + +from unittest.mock import Mock, patch + +from providers.openai_provider import OpenAIModelProvider + + +class TestO3TemperatureParameterFixSimple: + """Simple test for O3 model parameter filtering.""" + + @patch("utils.model_restrictions.get_restriction_service") + @patch("providers.openai_compatible.OpenAI") + def test_o3_models_exclude_temperature_from_api_call(self, mock_openai_class, mock_restriction_service): + """Test that O3 models don't send temperature to the API.""" + # Mock restriction service to allow all models + mock_service = Mock() + mock_service.is_allowed.return_value = True + mock_restriction_service.return_value = mock_service + + # Setup mock client + mock_client = Mock() + mock_openai_class.return_value = mock_client + + # Setup mock response + mock_response = Mock() + mock_response.choices = [Mock()] + mock_response.choices[0].message.content = "Test response" + mock_response.choices[0].finish_reason = "stop" + mock_response.model = "o3-mini" + mock_response.id = "test-id" + mock_response.created = 1234567890 + mock_response.usage = Mock() + mock_response.usage.prompt_tokens = 10 + mock_response.usage.completion_tokens = 5 + mock_response.usage.total_tokens = 15 + + mock_client.chat.completions.create.return_value = mock_response + + # Create provider + provider = OpenAIModelProvider(api_key="test-key") + + # Override _resolve_model_name to return the resolved model name + provider._resolve_model_name = lambda name: name + # Override model validation to bypass restrictions + provider.validate_model_name = lambda name: True + + # Call generate_content with O3 model + provider.generate_content(prompt="Test prompt", model_name="o3-mini", temperature=0.5, max_output_tokens=100) + + # Verify the API call was made without temperature or max_tokens + mock_client.chat.completions.create.assert_called_once() + call_kwargs = mock_client.chat.completions.create.call_args[1] + + assert "temperature" not in call_kwargs, "O3 models should not include temperature parameter" + assert "max_tokens" not in call_kwargs, "O3 models should not include max_tokens parameter" + assert call_kwargs["model"] == "o3-mini" + assert "messages" in call_kwargs + + @patch("utils.model_restrictions.get_restriction_service") + @patch("providers.openai_compatible.OpenAI") + def test_regular_models_include_temperature_in_api_call(self, mock_openai_class, mock_restriction_service): + """Test that regular models still send temperature to the API.""" + # Mock restriction service to allow all models + mock_service = Mock() + mock_service.is_allowed.return_value = True + mock_restriction_service.return_value = mock_service + + # Setup mock client + mock_client = Mock() + mock_openai_class.return_value = mock_client + + # Setup mock response + mock_response = Mock() + mock_response.choices = [Mock()] + mock_response.choices[0].message.content = "Test response" + mock_response.choices[0].finish_reason = "stop" + mock_response.model = "gpt-4.1-2025-04-14" + mock_response.id = "test-id" + mock_response.created = 1234567890 + mock_response.usage = Mock() + mock_response.usage.prompt_tokens = 10 + mock_response.usage.completion_tokens = 5 + mock_response.usage.total_tokens = 15 + + mock_client.chat.completions.create.return_value = mock_response + + # Create provider + provider = OpenAIModelProvider(api_key="test-key") + + # Override _resolve_model_name to return the resolved model name + provider._resolve_model_name = lambda name: name + # Override model validation to bypass restrictions + 
provider.validate_model_name = lambda name: True + + # Call generate_content with regular model (use supported model) + provider.generate_content( + prompt="Test prompt", model_name="gpt-4.1-2025-04-14", temperature=0.5, max_output_tokens=100 + ) + + # Verify the API call was made WITH temperature and max_tokens + mock_client.chat.completions.create.assert_called_once() + call_kwargs = mock_client.chat.completions.create.call_args[1] + + assert call_kwargs["temperature"] == 0.5, "Regular models should include temperature parameter" + assert call_kwargs["max_tokens"] == 100, "Regular models should include max_tokens parameter" + assert call_kwargs["model"] == "gpt-4.1-2025-04-14" + + @patch("utils.model_restrictions.get_restriction_service") + @patch("providers.openai_compatible.OpenAI") + def test_o3_models_filter_unsupported_parameters(self, mock_openai_class, mock_restriction_service): + """Test that O3 models filter out top_p, frequency_penalty, etc.""" + # Mock restriction service to allow all models + mock_service = Mock() + mock_service.is_allowed.return_value = True + mock_restriction_service.return_value = mock_service + + # Setup mock client + mock_client = Mock() + mock_openai_class.return_value = mock_client + + # Setup mock response + mock_response = Mock() + mock_response.choices = [Mock()] + mock_response.choices[0].message.content = "Test response" + mock_response.choices[0].finish_reason = "stop" + mock_response.model = "o3" + mock_response.id = "test-id" + mock_response.created = 1234567890 + mock_response.usage = Mock() + mock_response.usage.prompt_tokens = 10 + mock_response.usage.completion_tokens = 5 + mock_response.usage.total_tokens = 15 + + mock_client.chat.completions.create.return_value = mock_response + + # Create provider + provider = OpenAIModelProvider(api_key="test-key") + + # Override _resolve_model_name to return the resolved model name + provider._resolve_model_name = lambda name: name + # Override model validation to bypass restrictions + provider.validate_model_name = lambda name: True + + # Call generate_content with O3 model and unsupported parameters + provider.generate_content( + prompt="Test prompt", + model_name="o3", + temperature=0.5, + top_p=0.9, + frequency_penalty=0.1, + presence_penalty=0.1, + seed=42, + stop=["END"], + ) + + # Verify the API call filters out unsupported parameters + mock_client.chat.completions.create.assert_called_once() + call_kwargs = mock_client.chat.completions.create.call_args[1] + + # Should be excluded for O3 models + assert "temperature" not in call_kwargs, "O3 should not include temperature" + assert "top_p" not in call_kwargs, "O3 should not include top_p" + assert "frequency_penalty" not in call_kwargs, "O3 should not include frequency_penalty" + assert "presence_penalty" not in call_kwargs, "O3 should not include presence_penalty" + + # Should be included (supported parameters) + assert call_kwargs["seed"] == 42, "O3 should include seed parameter" + assert call_kwargs["stop"] == ["END"], "O3 should include stop parameter" + + @patch("utils.model_restrictions.get_restriction_service") + def test_all_o3_models_have_correct_temperature_capability(self, mock_restriction_service): + """Test that all O3/O4 models have supports_temperature=False in their capabilities.""" + from providers.openai_provider import OpenAIModelProvider + + # Mock restriction service to allow all models + mock_service = Mock() + mock_service.is_allowed.return_value = True + mock_restriction_service.return_value = mock_service + + provider = 
OpenAIModelProvider(api_key="test-key") + + # Test O3/O4 models that should NOT support temperature parameter + o3_o4_models = ["o3", "o3-mini", "o3-pro", "o4-mini", "o4-mini-high"] + + for model in o3_o4_models: + capabilities = provider.get_capabilities(model) + assert hasattr( + capabilities, "supports_temperature" + ), f"Model {model} capabilities should have supports_temperature field" + assert capabilities.supports_temperature is False, f"Model {model} should have supports_temperature=False" + + # Test that regular models DO support temperature parameter + regular_models = ["gpt-4.1-2025-04-14"] + + for model in regular_models: + try: + capabilities = provider.get_capabilities(model) + assert hasattr( + capabilities, "supports_temperature" + ), f"Model {model} capabilities should have supports_temperature field" + assert capabilities.supports_temperature is True, f"Model {model} should have supports_temperature=True" + except ValueError: + # Skip if model not in SUPPORTED_MODELS (that's okay for this test) + pass + + @patch("utils.model_restrictions.get_restriction_service") + def test_openai_provider_temperature_constraints(self, mock_restriction_service): + """Test that OpenAI provider has correct temperature constraints for O3 models.""" + from providers.openai_provider import OpenAIModelProvider + + # Mock restriction service to allow all models + mock_service = Mock() + mock_service.is_allowed.return_value = True + mock_restriction_service.return_value = mock_service + + provider = OpenAIModelProvider(api_key="test-key") + + # Test O3 model constraints + o3_capabilities = provider.get_capabilities("o3-mini") + assert o3_capabilities.temperature_constraint is not None + + # O3 models should have fixed temperature constraint + temp_constraint = o3_capabilities.temperature_constraint + assert temp_constraint.validate(1.0) is True + assert temp_constraint.validate(0.5) is False + + # Test regular model constraints - use gpt-4.1 which is supported + gpt41_capabilities = provider.get_capabilities("gpt-4.1-2025-04-14") + assert gpt41_capabilities.temperature_constraint is not None + + # Regular models should allow a range + temp_constraint = gpt41_capabilities.temperature_constraint + assert temp_constraint.validate(0.5) is True + assert temp_constraint.validate(1.0) is True diff --git a/tests/test_openai_provider.py b/tests/test_openai_provider.py index 55548b7..e9e3ae8 100644 --- a/tests/test_openai_provider.py +++ b/tests/test_openai_provider.py @@ -122,7 +122,7 @@ class TestOpenAIProvider: mock_response.choices = [MagicMock()] mock_response.choices[0].message.content = "Test response" mock_response.choices[0].finish_reason = "stop" - mock_response.model = "o4-mini" # API returns the resolved model name + mock_response.model = "gpt-4.1-2025-04-14" # API returns the resolved model name mock_response.id = "test-id" mock_response.created = 1234567890 mock_response.usage = MagicMock() @@ -134,19 +134,23 @@ class TestOpenAIProvider: provider = OpenAIModelProvider("test-key") - # Call generate_content with alias 'mini' + # Call generate_content with alias 'gpt4.1' (resolves to gpt-4.1-2025-04-14, supports temperature) result = provider.generate_content( - prompt="Test prompt", model_name="mini", temperature=1.0 # This should be resolved to "o4-mini" + prompt="Test prompt", + model_name="gpt4.1", + temperature=1.0, # This should be resolved to "gpt-4.1-2025-04-14" ) # Verify the API was called with the RESOLVED model name mock_client.chat.completions.create.assert_called_once() call_kwargs 
= mock_client.chat.completions.create.call_args[1] - # CRITICAL ASSERTION: The API should receive "o4-mini", not "mini" - assert call_kwargs["model"] == "o4-mini", f"Expected 'o4-mini' but API received '{call_kwargs['model']}'" + # CRITICAL ASSERTION: The API should receive "gpt-4.1-2025-04-14", not "gpt4.1" + assert ( + call_kwargs["model"] == "gpt-4.1-2025-04-14" + ), f"Expected 'gpt-4.1-2025-04-14' but API received '{call_kwargs['model']}'" - # Verify other parameters + # Verify other parameters (gpt-4.1 supports temperature unlike O3/O4 models) assert call_kwargs["temperature"] == 1.0 assert len(call_kwargs["messages"]) == 1 assert call_kwargs["messages"][0]["role"] == "user" @@ -154,7 +158,7 @@ class TestOpenAIProvider: # Verify response assert result.content == "Test response" - assert result.model_name == "o4-mini" # Should be the resolved name + assert result.model_name == "gpt-4.1-2025-04-14" # Should be the resolved name @patch("providers.openai_compatible.OpenAI") def test_generate_content_other_aliases(self, mock_openai_class):
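To make the new conf/custom_models.json fields concrete, here is a minimal example entry for a hypothetical local reasoning model, rendered as a Python dict and limited to the fields the hunks above actually show; a real entry would also carry the model's name and context-window fields, which sit outside the quoted hunks.

```python
# Hypothetical conf/custom_models.json entry (as a Python dict), showing only
# the fields visible in the hunks above. A reasoning model that rejects the
# temperature parameter sets supports_temperature=False plus a "fixed" constraint.
entry = {
    "supports_function_calling": False,
    "supports_images": False,
    "max_image_size_mb": 0.0,
    "supports_temperature": False,       # never send temperature in API calls
    "temperature_constraint": "fixed",   # maps to FixedTemperatureConstraint(1.0)
    "is_custom": True,                   # only used with custom endpoints (Ollama, vLLM, etc.)
    "description": "Hypothetical local reasoning model",
}
```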
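The mapping from those configuration strings to constraint objects goes through the new create_temperature_constraint() factory in providers/base.py. A minimal sketch of its observable behavior, assuming validate() rejects out-of-range values exactly as the new tests exercise:

```python
from providers.base import create_temperature_constraint

fixed = create_temperature_constraint("fixed")        # O3/O4 reasoning models
assert fixed.validate(1.0) is True                    # only temperature=1.0 is accepted
assert fixed.validate(0.5) is False

ranged = create_temperature_constraint("range")       # default: 0.0-2.0, default temp 0.7
assert ranged.validate(0.5) is True

discrete = create_temperature_constraint("discrete")  # common OpenAI values, default 0.7
assert discrete.validate(0.7) is True
assert discrete.validate(0.42) is False               # not in the allowed set
```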
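The core behavioral change lives in OpenAICompatibleProvider.generate_content(): temperature, max_tokens, and the sampling penalties are only attached when the resolved model's capabilities allow them. Below is a standalone restatement of that filtering logic; build_completion_params is a hypothetical helper written for illustration, not a function in the patch.

```python
def build_completion_params(model_name, messages, temperature,
                            max_output_tokens=None, supports_temperature=True, **kwargs):
    """Mirror of the parameter filtering added to generate_content()."""
    params = {"model": model_name, "messages": messages}
    if supports_temperature:
        params["temperature"] = temperature
    # Models that reject temperature (O3/O4) also reject max_tokens
    if max_output_tokens and supports_temperature:
        params["max_tokens"] = max_output_tokens
    for key, value in kwargs.items():
        if key in ("top_p", "frequency_penalty", "presence_penalty", "seed", "stop", "stream"):
            # Reasoning models also reject the sampling knobs, but seed/stop/stream pass through
            if not supports_temperature and key in ("top_p", "frequency_penalty", "presence_penalty"):
                continue
            params[key] = value
    return params

# An O3-style call drops temperature, max_tokens, and top_p but keeps seed,
# matching the assertions in test_o3_temperature_fix_simple.py
params = build_completion_params("o3-mini", [{"role": "user", "content": "hi"}],
                                 temperature=0.5, max_output_tokens=100,
                                 supports_temperature=False, seed=42, top_p=0.9)
assert "temperature" not in params and "max_tokens" not in params
assert params["seed"] == 42 and "top_p" not in params
```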
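Finally, the new ModelProvider._resolve_model_name() hook is what lets the updated tests exercise aliases such as 'gpt4.1'. The base implementation returns the name unchanged; concrete providers override it with their alias tables. A self-contained sketch, where BaseProvider and AliasedProvider are illustrative stand-ins and the alias targets are taken from the patch:

```python
class BaseProvider:
    def _resolve_model_name(self, model_name: str) -> str:
        # Base behavior from providers/base.py: no aliasing
        return model_name

class AliasedProvider(BaseProvider):
    # Alias targets as they appear in OpenAIModelProvider.SUPPORTED_MODELS
    ALIASES = {"mini": "o4-mini", "o3-pro": "o3-pro-2025-06-10", "gpt4.1": "gpt-4.1-2025-04-14"}

    def _resolve_model_name(self, model_name: str) -> str:
        return self.ALIASES.get(model_name, model_name)

assert AliasedProvider()._resolve_model_name("mini") == "o4-mini"
assert AliasedProvider()._resolve_model_name("o3-mini") == "o3-mini"  # non-alias passes through
```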