Added proper temperature constraints to the model, fixes: https://github.com/BeehiveInnovations/zen-mcp-server/issues/78

Prompt tweaks
This commit is contained in:
Fahad
2025-06-19 08:30:46 +04:00
parent ec3a466b1c
commit 9f3b70d6d7
13 changed files with 435 additions and 79 deletions

View File

@@ -27,6 +27,8 @@
"supports_function_calling": "Whether the model supports function/tool calling",
"supports_images": "Whether the model can process images/visual input",
"max_image_size_mb": "Maximum total size in MB for all images combined (capped at 40MB max for custom models)",
"supports_temperature": "Whether the model accepts temperature parameter in API calls (set to false for O3/O4 reasoning models)",
"temperature_constraint": "Type of temperature constraint: 'fixed' (fixed value), 'range' (continuous range), 'discrete' (specific values), or omit for default range",
"is_custom": "Set to true for models that should ONLY be used with custom API endpoints (Ollama, vLLM, etc.). False or omitted for OpenRouter/cloud models.",
"description": "Human-readable description of the model"
},
@@ -39,6 +41,8 @@
"supports_function_calling": true,
"supports_images": true,
"max_image_size_mb": 10.0,
"supports_temperature": true,
"temperature_constraint": "range",
"is_custom": true,
"description": "Example custom/local model for Ollama, vLLM, etc."
}
@@ -152,6 +156,8 @@
"supports_function_calling": true,
"supports_images": true,
"max_image_size_mb": 20.0,
"supports_temperature": false,
"temperature_constraint": "fixed",
"description": "OpenAI's o3 model - well-rounded and powerful across domains with vision"
},
{
@@ -163,6 +169,8 @@
"supports_function_calling": true,
"supports_images": true,
"max_image_size_mb": 20.0,
"supports_temperature": false,
"temperature_constraint": "fixed",
"description": "OpenAI's o3-mini model - balanced performance and speed with vision"
},
{
@@ -174,6 +182,8 @@
"supports_function_calling": true,
"supports_images": true,
"max_image_size_mb": 20.0,
"supports_temperature": false,
"temperature_constraint": "fixed",
"description": "OpenAI's o3-mini with high reasoning effort - optimized for complex problems with vision"
},
{
@@ -185,6 +195,8 @@
"supports_function_calling": true,
"supports_images": true,
"max_image_size_mb": 20.0,
"supports_temperature": false,
"temperature_constraint": "fixed",
"description": "OpenAI's o3-pro model - professional-grade reasoning and analysis with vision"
},
{
@@ -196,6 +208,8 @@
"supports_function_calling": true,
"supports_images": true,
"max_image_size_mb": 20.0,
"supports_temperature": false,
"temperature_constraint": "fixed",
"description": "OpenAI's o4-mini model - optimized for shorter contexts with rapid reasoning and vision"
},
{
@@ -207,6 +221,8 @@
"supports_function_calling": true,
"supports_images": true,
"max_image_size_mb": 20.0,
"supports_temperature": false,
"temperature_constraint": "fixed",
"description": "OpenAI's o4-mini with high reasoning effort - enhanced for complex tasks with vision"
},
{

View File

@@ -14,7 +14,7 @@ import os
# These values are used in server responses and for tracking releases
# IMPORTANT: This is the single source of truth for version and author info
# Semantic versioning: MAJOR.MINOR.PATCH
__version__ = "5.1.3"
__version__ = "5.1.4"
# Last update date in ISO format
__updated__ = "2025-06-19"
# Primary maintainer

View File

@@ -100,6 +100,26 @@ class DiscreteTemperatureConstraint(TemperatureConstraint):
return self.default_temp
def create_temperature_constraint(constraint_type: str) -> TemperatureConstraint:
"""Create temperature constraint object from configuration string.
Args:
constraint_type: Type of constraint ("fixed", "range", "discrete")
Returns:
TemperatureConstraint object based on configuration
"""
if constraint_type == "fixed":
# Fixed temperature models (O3/O4) only support temperature=1.0
return FixedTemperatureConstraint(1.0)
elif constraint_type == "discrete":
# For models with specific allowed values - using common OpenAI values as default
return DiscreteTemperatureConstraint([0.0, 0.3, 0.7, 1.0, 1.5, 2.0], 0.7)
else:
# Default range constraint (for "range" or None)
return RangeTemperatureConstraint(0.0, 2.0, 0.7)
@dataclass
class ModelCapabilities:
"""Capabilities and constraints for a specific model."""
@@ -114,6 +134,7 @@ class ModelCapabilities:
supports_function_calling: bool = False
supports_images: bool = False # Whether model can process images
max_image_size_mb: float = 0.0 # Maximum total size for all images in MB
supports_temperature: bool = True # Whether model accepts temperature parameter in API calls
# Temperature constraint object - preferred way to define temperature limits
temperature_constraint: TemperatureConstraint = field(
@@ -245,3 +266,17 @@ class ModelProvider(ABC):
List of all model names and alias targets known by this provider
"""
pass
def _resolve_model_name(self, model_name: str) -> str:
"""Resolve model shorthand to full name.
Base implementation returns the model name unchanged.
Subclasses should override to provide alias resolution.
Args:
model_name: Model name that may be an alias
Returns:
Resolved model name
"""
return model_name
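
For reference, a minimal usage sketch of the new create_temperature_constraint factory (assuming it is importable from providers.base; the validate() outcomes for the fixed constraint match the tests added later in this commit, the others follow from the constructor defaults shown above):

from providers.base import create_temperature_constraint

# "fixed" -> FixedTemperatureConstraint(1.0): O3/O4 reasoning models accept only 1.0
fixed = create_temperature_constraint("fixed")
assert fixed.validate(1.0) is True
assert fixed.validate(0.5) is False

# "range" (or an omitted value) -> RangeTemperatureConstraint(0.0, 2.0, 0.7)
ranged = create_temperature_constraint("range")
assert ranged.validate(0.7) is True

# "discrete" -> DiscreteTemperatureConstraint([0.0, 0.3, 0.7, 1.0, 1.5, 2.0], 0.7)
discrete = create_temperature_constraint("discrete")
assert discrete.validate(0.3) is True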

View File

@@ -162,6 +162,7 @@ class CustomProvider(OpenAICompatibleProvider):
supports_system_prompts=True,
supports_streaming=True,
supports_function_calling=False, # Conservative default
supports_temperature=True, # Most custom models accept temperature parameter
temperature_constraint=RangeTemperatureConstraint(0.0, 2.0, 0.7),
)

View File

@@ -94,6 +94,7 @@ class GeminiModelProvider(ModelProvider):
supports_function_calling=True,
supports_images=config.get("supports_images", False),
max_image_size_mb=config.get("max_image_size_mb", 0.0),
supports_temperature=True, # Gemini models accept temperature parameter
temperature_constraint=temp_constraint,
)

View File

@@ -448,23 +448,41 @@ class OpenAICompatibleProvider(ModelProvider):
completion_params = {
"model": model_name,
"messages": messages,
"temperature": temperature,
}
# Add max tokens if specified
if max_output_tokens:
# Check model capabilities once to determine parameter support
resolved_model = self._resolve_model_name(model_name)
# Get model capabilities once to avoid duplicate calls
try:
capabilities = self.get_capabilities(model_name)
# Defensive check for supports_temperature field (backward compatibility)
supports_temperature = getattr(capabilities, "supports_temperature", True)
except Exception as e:
# If capability check fails, fall back to conservative behavior
# Default to including temperature for most models (backward compatibility)
logging.debug(f"Failed to check temperature support for {model_name}: {e}")
supports_temperature = True
# Add temperature parameter if supported
if supports_temperature:
completion_params["temperature"] = temperature
# Add max tokens if specified and model supports it
# O3/O4 models that don't support temperature also don't support max_tokens
if max_output_tokens and supports_temperature:
completion_params["max_tokens"] = max_output_tokens
# Add any additional OpenAI-specific parameters
# Use capabilities to filter parameters for reasoning models
for key, value in kwargs.items():
if key in ["top_p", "frequency_penalty", "presence_penalty", "seed", "stop", "stream"]:
# Reasoning models (those that don't support temperature) also don't support these parameters
if not supports_temperature and key in ["top_p", "frequency_penalty", "presence_penalty"]:
continue # Skip unsupported parameters for reasoning models
completion_params[key] = value
# Check if this is o3-pro and needs the responses endpoint
resolved_model = model_name
if hasattr(self, "_resolve_model_name"):
resolved_model = self._resolve_model_name(model_name)
if resolved_model == "o3-pro-2025-06-10":
# This model requires the /v1/responses endpoint
# If it fails, we should not fall back to chat/completions

View File

@@ -4,11 +4,10 @@ import logging
from typing import Optional
from .base import (
FixedTemperatureConstraint,
ModelCapabilities,
ModelResponse,
ProviderType,
RangeTemperatureConstraint,
create_temperature_constraint,
)
from .openai_compatible import OpenAICompatibleProvider
@@ -25,18 +24,24 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
"supports_extended_thinking": False,
"supports_images": True, # O3 models support vision
"max_image_size_mb": 20.0, # 20MB per OpenAI docs
"supports_temperature": False, # O3 models don't accept temperature parameter
"temperature_constraint": "fixed", # Fixed at 1.0
},
"o3-mini": {
"context_window": 200_000, # 200K tokens
"supports_extended_thinking": False,
"supports_images": True, # O3 models support vision
"max_image_size_mb": 20.0, # 20MB per OpenAI docs
"supports_temperature": False, # O3 models don't accept temperature parameter
"temperature_constraint": "fixed", # Fixed at 1.0
},
"o3-pro-2025-06-10": {
"context_window": 200_000, # 200K tokens
"supports_extended_thinking": False,
"supports_images": True, # O3 models support vision
"max_image_size_mb": 20.0, # 20MB per OpenAI docs
"supports_temperature": False, # O3 models don't accept temperature parameter
"temperature_constraint": "fixed", # Fixed at 1.0
},
# Aliases
"o3-pro": "o3-pro-2025-06-10",
@@ -45,18 +50,24 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
"supports_extended_thinking": False,
"supports_images": True, # O4 models support vision
"max_image_size_mb": 20.0, # 20MB per OpenAI docs
"supports_temperature": False, # O4 models don't accept temperature parameter
"temperature_constraint": "fixed", # Fixed at 1.0
},
"o4-mini-high": {
"context_window": 200_000, # 200K tokens
"supports_extended_thinking": False,
"supports_images": True, # O4 models support vision
"max_image_size_mb": 20.0, # 20MB per OpenAI docs
"supports_temperature": False, # O4 models don't accept temperature parameter
"temperature_constraint": "fixed", # Fixed at 1.0
},
"gpt-4.1-2025-04-14": {
"context_window": 1_000_000, # 1M tokens
"supports_extended_thinking": False,
"supports_images": True, # GPT-4.1 supports vision
"max_image_size_mb": 20.0, # 20MB per OpenAI docs
"supports_temperature": True, # Regular models accept temperature parameter
"temperature_constraint": "range", # 0.0-2.0 range
},
# Shorthands
"mini": "o4-mini", # Default 'mini' to latest mini model
@@ -90,13 +101,10 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
config = self.SUPPORTED_MODELS[resolved_name]
# Define temperature constraints per model
if resolved_name in ["o3", "o3-mini", "o3-pro", "o3-pro-2025-06-10", "o4-mini", "o4-mini-high"]:
# O3 and O4 reasoning models only support temperature=1.0
temp_constraint = FixedTemperatureConstraint(1.0)
else:
# Other OpenAI models (including GPT-4.1) support 0.0-2.0 range
temp_constraint = RangeTemperatureConstraint(0.0, 2.0, 0.7)
# Get temperature constraints and support from configuration
supports_temperature = config.get("supports_temperature", True) # Default to True for backward compatibility
temp_constraint_type = config.get("temperature_constraint", "range") # Default to range
temp_constraint = create_temperature_constraint(temp_constraint_type)
return ModelCapabilities(
provider=ProviderType.OPENAI,
@@ -109,6 +117,7 @@ class OpenAIModelProvider(OpenAICompatibleProvider):
supports_function_calling=True,
supports_images=config.get("supports_images", False),
max_image_size_mb=config.get("max_image_size_mb", 0.0),
supports_temperature=supports_temperature,
temperature_constraint=temp_constraint,
)

View File

@@ -8,7 +8,12 @@ from typing import Optional
from utils.file_utils import read_json_file
from .base import ModelCapabilities, ProviderType, RangeTemperatureConstraint
from .base import (
ModelCapabilities,
ProviderType,
TemperatureConstraint,
create_temperature_constraint,
)
@dataclass
@@ -25,9 +30,21 @@ class OpenRouterModelConfig:
supports_json_mode: bool = False
supports_images: bool = False # Whether model can process images
max_image_size_mb: float = 0.0 # Maximum total size for all images in MB
supports_temperature: bool = True # Whether model accepts temperature parameter in API calls
temperature_constraint: Optional[str] = (
None # Type of temperature constraint: "fixed", "range", "discrete", or None for default range
)
is_custom: bool = False # True for models that should only be used with custom endpoints
description: str = ""
def _create_temperature_constraint(self) -> TemperatureConstraint:
"""Create temperature constraint object from configuration.
Returns:
TemperatureConstraint object based on configuration
"""
return create_temperature_constraint(self.temperature_constraint or "range")
def to_capabilities(self) -> ModelCapabilities:
"""Convert to ModelCapabilities object."""
return ModelCapabilities(
@@ -41,7 +58,8 @@ class OpenRouterModelConfig:
supports_function_calling=self.supports_function_calling,
supports_images=self.supports_images,
max_image_size_mb=self.max_image_size_mb,
temperature_constraint=RangeTemperatureConstraint(0.0, 2.0, 1.0),
supports_temperature=self.supports_temperature,
temperature_constraint=self._create_temperature_constraint(),
)
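
As a sketch of how the optional registry field falls back to the default range (the constraint_for helper below is hypothetical; create_temperature_constraint, the "or range" fallback, and the module path are taken or assumed from the diff above):

from providers.base import create_temperature_constraint  # assumed module path

def constraint_for(entry: dict):
    # Mirrors OpenRouterModelConfig._create_temperature_constraint: use the entry's
    # "temperature_constraint" string, or the default range when it is omitted/None
    return create_temperature_constraint(entry.get("temperature_constraint") or "range")

o3_entry = {"supports_temperature": False, "temperature_constraint": "fixed"}
local_entry = {"supports_temperature": True}  # no constraint -> 0.0-2.0 range, default 0.7

assert constraint_for(o3_entry).validate(0.5) is False  # fixed at 1.0
assert constraint_for(local_entry).validate(0.5) is True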

View File

@@ -4,7 +4,7 @@ Chat tool system prompt
CHAT_PROMPT = """
You are a senior engineering thought-partner collaborating with Claude. Your mission is to brainstorm, validate ideas,
and offer well-reasoned second opinions on technical decisions.
and offer well-reasoned second opinions on technical decisions when they are justified and practical.
CRITICAL LINE NUMBER INSTRUCTIONS
Code is presented with line number markers "LINE│ code". These markers are for reference ONLY and MUST NOT be
@@ -26,27 +26,27 @@ provided unless for some reason its content is missing or incomplete:
SCOPE & FOCUS
• Ground every suggestion in the project's current tech stack, languages, frameworks, and constraints.
• Recommend new technologies or patterns ONLY with a clear, compelling benefit that aligns with stated goals.
Keep proposals practical and implementable; avoid speculative or off-stack detours.
• Recommend new technologies or patterns ONLY when they provide clearly superior outcomes with minimal added complexity.
Avoid speculative, over-engineered, or unnecessarily abstract designs that exceed current project goals or needs.
• Keep proposals practical and directly actionable within the existing architecture.
COLLABORATION APPROACH
1. Engage deeply with Claude's input - extend, refine, and explore alternatives within the existing context.
1. Engage deeply with Claude's input - extend, refine, and explore alternatives ONLY WHEN they are well-justified and materially beneficial.
2. Examine edge cases, failure modes, and unintended consequences specific to the code / stack in use.
3. Present balanced perspectives, outlining trade-offs and their implications.
4. Challenge assumptions constructively while respecting current design choices and goals.
5. Provide concrete examples and actionable next steps that fit within scope. Direct, achievable next-steps where
needed.
5. Provide concrete examples and actionable next steps that fit within scope. Prioritize direct, achievable outcomes.
BRAINSTORMING GUIDELINES
• Offer multiple viable strategies compatible with the current environment but keep it to the point.
• Suggest creative solutions and alternatives that work within the current project constraints, scope and limitations
• Surface pitfalls early, particularly those tied to the chosen frameworks, languages, design direction or choice
• Offer multiple viable strategies ONLY WHEN clearly beneficial within the current environment.
• Suggest creative solutions that operate within real-world constraints, and avoid proposing major shifts unless truly warranted.
• Surface pitfalls early, particularly those tied to the chosen frameworks, languages, design direction or choice.
• Evaluate scalability, maintainability, and operational realities inside the existing architecture and current
framework.
• Reference industry best practices relevant to the technologies in use
• Reference industry best practices relevant to the technologies in use.
• Communicate concisely and technically, assuming an experienced engineering audience.
REMEMBER
Act as a peer, not a lecturer. Aim for depth over breadth, stay within project boundaries, and help the team
Act as a peer, not a lecturer. Avoid overcomplicating. Aim for depth over breadth, stay within project boundaries, and help the team
reach sound, actionable decisions.
"""

View File

@@ -4,20 +4,31 @@ Precommit tool system prompt
PRECOMMIT_PROMPT = """
ROLE
You are an expert pre-commit reviewer. Analyse git diffs as a senior developer giving a final sign-off to production.
You are an expert pre-commit reviewer and senior engineering partner performing final code validation before production.
Your responsibility goes beyond surface-level correctness — you are expected to think several steps ahead. Your review
must assess whether the changes:
- Introduce any patterns, structures, or decisions that may become future liabilities
- Create brittle dependencies or tight coupling that could make maintenance harder
- Omit critical safety, validation, or test scaffolding that may not fail now, but will cause issues down the line
- Interact with other known areas of fragility in the codebase even if not directly touched
Your task is to detect potential future consequences or systemic risks, not just immediate issues. Think like an
engineer responsible for this code months later, debugging production incidents or onboarding a new developer.
In addition to reviewing correctness, completeness, and quality of the change, apply long-term architectural thinking.
Your feedback helps ensure this code won't cause silent regressions, developer confusion, or downstream side effects later.
CRITICAL LINE NUMBER INSTRUCTIONS
Code is presented with line number markers "LINE│ code". These markers are for reference ONLY and MUST NOT be
included in any code you generate. Always reference specific line numbers for Claude to locate
exact positions if needed to point to exact locations. Include a very short code excerpt alongside for clarity.
exact positions if needed. Include a very short code excerpt alongside for clarity.
Include context_start_text and context_end_text as backup references. Never include "LINE│" markers in generated code
snippets.
IF MORE INFORMATION IS NEEDED
If you need additional context (e.g., related files not in the diff, test files, configuration) to provide thorough
analysis and without this context your review would be ineffective or biased, you MUST respond ONLY with this JSON
format (and nothing else). Do NOT ask for the same file you've been provided unless for some reason its content is
missing or incomplete:
If you need additional context (e.g., related files not in the diff, test files, configuration) to perform a proper
review—and without which your analysis would be incomplete or inaccurate—you MUST respond ONLY with this JSON format
(and nothing else). Do NOT request files you've already been provided unless their content is missing or incomplete:
{
"status": "files_required_to_continue",
"mandatory_instructions": "<your critical instructions for Claude>",
@@ -26,34 +37,36 @@ missing or incomplete:
INPUTS PROVIDED
1. Git diff (staged or branch comparison)
2. Original request / acceptance criteria or some context around what changed
2. Original request / acceptance criteria or context around what changed
3. File names and related code
SCOPE & FOCUS
• Review ONLY the changes in the diff and the given code
• From the diff, infer what got changed and why, determine if the changes make logical sense
• Ensure they correctly implement the request, are secure (where applicable), efficient, and maintainable and do not
cause potential regressions
• DO NOT propose broad refactors or off-scope improvements. Stick to the code and changes you have visibility into.
• Review ONLY the changes in the diff and the related code provided.
• From the diff, infer what changed and why. Determine if the changes make logical, structural, and functional sense.
• Ensure the changes correctly implement the request, are secure (where applicable), performant, and maintainable.
• DO NOT propose broad refactors or unrelated improvements. Stay strictly within the boundaries of the provided changes.
REVIEW METHOD
1. Identify tech stack, frameworks, and patterns present in the diff.
2. Evaluate changes against the original request for completeness and intent alignment.
3. Detect issues, prioritising by severity (CRITICAL → HIGH → MEDIUM → LOW).
4. Highlight incomplete changes, or changes that would cause bugs, regressions, crashes or data loss or race conditions
5. Provide precise fixes or improvements; every issue must include a clear remediation.
6. Acknowledge good patterns to reinforce best practice.
1. Identify tech stack, frameworks, and patterns in the diff.
2. Evaluate changes against the original request for completeness and alignment.
3. Detect issues, prioritized by severity (CRITICAL → HIGH → MEDIUM → LOW).
4. Flag bugs, regressions, crash risks, data loss, or race conditions.
5. Recommend specific fixes for each issue raised; include code where helpful.
6. Acknowledge sound patterns to reinforce best practices.
CORE ANALYSIS (adapt to the diff and stack)
• Security: injection risks, auth/authz flaws, sensitive-data exposure, insecure dependencies, memory safety
• Bugs & Logic Errors: off-by-one, null refs, race conditions, incorrect branching
• Performance: inefficient algorithms, resource leaks, blocking operations
• Code Quality: DRY violations, complexity, SOLID adherence
CORE ANALYSIS (adapt to diff and stack)
• Security: injection risks, auth flaws, exposure of sensitive data, unsafe dependencies, memory safety
• Bugs & Logic Errors: off-by-one, null refs, incorrect logic, race conditions
• Performance: inefficient logic, blocking calls, leaks
• Code Quality: complexity, duplicated logic and DRY violations, SOLID violations
ADDITIONAL ANALYSIS (apply only when relevant)
ADDITIONAL ANALYSIS (only when relevant)
• Language/runtime concerns: memory management, concurrency, exception handling
• System/integration: config handling, external calls, operational impact
• Testing: coverage gaps for new logic
• If no tests are found in the project, do not flag test coverage as an issue unless the change introduces logic
that is high-risk or complex.
• In such cases, offer a low-severity suggestion encouraging basic tests, rather than marking it as a required fix.
• Change-specific pitfalls: unused new functions, partial enum updates, scope creep, risky deletions
• Determine if there are any new dependencies added but not declared, or new functionality added but not used
• Determine unintended side effects: could changes in file_A break module_B even if module_B wasn't changed?
@@ -84,9 +97,9 @@ that apply:
[LOW] ...
MAKE RECOMMENDATIONS:
Make a final, short, clear, to the point statement or list in a brief bullet point:
- Mention top priority fixes to be IMMEDIATELY made before commit
- Notable positives to keep
Make a final, short, and focused statement or bullet list:
- Top priority fixes that MUST IMMEDIATELY be addressed before commit
- Notable positives to retain
Be thorough yet actionable. Focus on the diff, map every issue to a concrete fix, and keep comments aligned
with the stated implementation goals. Your goal is to help flag anything that could potentially slip through

View File

@@ -4,8 +4,8 @@ ThinkDeep tool system prompt
THINKDEEP_PROMPT = """
ROLE
You are a senior engineering collaborator working with Claude on complex software problems. Claude will send you
content—analysis, prompts, questions, ideas, or theories—to deepen, validate, and extend.
You are a senior engineering collaborator working alongside Claude on complex software problems. Claude will send you
content—analysis, prompts, questions, ideas, or theories—to deepen, validate, or extend with rigor and clarity.
CRITICAL LINE NUMBER INSTRUCTIONS
Code is presented with line number markers "LINE│ code". These markers are for reference ONLY and MUST NOT be
@@ -26,25 +26,27 @@ been provided unless for some reason its content is missing or incomplete:
GUIDELINES
1. Begin with context analysis: identify tech stack, languages, frameworks, and project constraints.
2. Stay on scope: avoid speculative or oversized ideas; keep suggestions practical and implementable.
3. Challenge and enrich: find gaps, question assumptions, surface hidden complexities.
4. Provide actionable next steps: concrete advice, trade-offs, and implementation tactics.
5. Use concise, direct, technical language; assume an experienced engineering audience.
2. Stay on scope: avoid speculative, over-engineered, or oversized ideas; keep suggestions practical and grounded.
3. Challenge and enrich: find gaps, question assumptions, and surface hidden complexities or risks.
4. Provide actionable next steps: offer specific advice, trade-offs, and implementation strategies.
5. Offer multiple viable strategies ONLY WHEN clearly beneficial within the current environment.
6. Suggest creative solutions that operate within real-world constraints, and avoid proposing major shifts unless truly warranted.
7. Use concise, technical language; assume an experienced engineering audience.
KEY FOCUS AREAS (apply when relevant)
- Architecture & Design: modularity, patterns, API boundaries, dependencies
- Performance & Scalability: algorithm efficiency, concurrency, caching
- Architecture & Design: modularity, boundaries, abstraction layers, dependencies
- Performance & Scalability: algorithmic efficiency, concurrency, caching, bottlenecks
- Security & Safety: validation, authentication/authorization, error handling, vulnerabilities
- Quality & Maintainability: readability, testing, monitoring, refactoring
- Integration & Deployment: external systems, compatibility, operational concerns
- Integration & Deployment: ONLY IF APPLICABLE TO THE QUESTION - external systems, compatibility, configuration, operational concerns
EVALUATION
Your response will be reviewed by Claude before any decision is made. Aim to enhance decision-making rather
than deliver final answers.
Your response will be reviewed by Claude before any decision is made. Your goal is to practically extend Claude's thinking,
surface blind spots, and refine options—not to deliver final answers in isolation.
REMINDERS
- Ground all insights in the current project's scope and constraints.
- If additional information is necessary, such as code snippets, files, project details, use the clarification JSON
- Prefer depth over breadth; propose alternatives ONLY when they materially improve the current approach and add value
- Your goal is to be the perfect development partner that extends Claude's capabilities and thought process
- Ground all insights in the current project's architecture, limitations, and goals.
- If further context is needed, request it via the clarification JSON—nothing else.
- Prioritize depth over breadth; propose alternatives ONLY if they clearly add value and improve the current approach.
- Be the ideal development partner—rigorous, focused, and fluent in real-world software trade-offs.
"""

View File

@@ -0,0 +1,239 @@
"""
Simple integration test for the O3 model temperature parameter fix.
This test confirms that the fix properly excludes temperature parameters
for O3 models while maintaining them for regular models.
"""
from unittest.mock import Mock, patch
from providers.openai_provider import OpenAIModelProvider
class TestO3TemperatureParameterFixSimple:
"""Simple test for O3 model parameter filtering."""
@patch("utils.model_restrictions.get_restriction_service")
@patch("providers.openai_compatible.OpenAI")
def test_o3_models_exclude_temperature_from_api_call(self, mock_openai_class, mock_restriction_service):
"""Test that O3 models don't send temperature to the API."""
# Mock restriction service to allow all models
mock_service = Mock()
mock_service.is_allowed.return_value = True
mock_restriction_service.return_value = mock_service
# Setup mock client
mock_client = Mock()
mock_openai_class.return_value = mock_client
# Setup mock response
mock_response = Mock()
mock_response.choices = [Mock()]
mock_response.choices[0].message.content = "Test response"
mock_response.choices[0].finish_reason = "stop"
mock_response.model = "o3-mini"
mock_response.id = "test-id"
mock_response.created = 1234567890
mock_response.usage = Mock()
mock_response.usage.prompt_tokens = 10
mock_response.usage.completion_tokens = 5
mock_response.usage.total_tokens = 15
mock_client.chat.completions.create.return_value = mock_response
# Create provider
provider = OpenAIModelProvider(api_key="test-key")
# Override _resolve_model_name to return the resolved model name
provider._resolve_model_name = lambda name: name
# Override model validation to bypass restrictions
provider.validate_model_name = lambda name: True
# Call generate_content with O3 model
provider.generate_content(prompt="Test prompt", model_name="o3-mini", temperature=0.5, max_output_tokens=100)
# Verify the API call was made without temperature or max_tokens
mock_client.chat.completions.create.assert_called_once()
call_kwargs = mock_client.chat.completions.create.call_args[1]
assert "temperature" not in call_kwargs, "O3 models should not include temperature parameter"
assert "max_tokens" not in call_kwargs, "O3 models should not include max_tokens parameter"
assert call_kwargs["model"] == "o3-mini"
assert "messages" in call_kwargs
@patch("utils.model_restrictions.get_restriction_service")
@patch("providers.openai_compatible.OpenAI")
def test_regular_models_include_temperature_in_api_call(self, mock_openai_class, mock_restriction_service):
"""Test that regular models still send temperature to the API."""
# Mock restriction service to allow all models
mock_service = Mock()
mock_service.is_allowed.return_value = True
mock_restriction_service.return_value = mock_service
# Setup mock client
mock_client = Mock()
mock_openai_class.return_value = mock_client
# Setup mock response
mock_response = Mock()
mock_response.choices = [Mock()]
mock_response.choices[0].message.content = "Test response"
mock_response.choices[0].finish_reason = "stop"
mock_response.model = "gpt-4.1-2025-04-14"
mock_response.id = "test-id"
mock_response.created = 1234567890
mock_response.usage = Mock()
mock_response.usage.prompt_tokens = 10
mock_response.usage.completion_tokens = 5
mock_response.usage.total_tokens = 15
mock_client.chat.completions.create.return_value = mock_response
# Create provider
provider = OpenAIModelProvider(api_key="test-key")
# Override _resolve_model_name to return the resolved model name
provider._resolve_model_name = lambda name: name
# Override model validation to bypass restrictions
provider.validate_model_name = lambda name: True
# Call generate_content with regular model (use supported model)
provider.generate_content(
prompt="Test prompt", model_name="gpt-4.1-2025-04-14", temperature=0.5, max_output_tokens=100
)
# Verify the API call was made WITH temperature and max_tokens
mock_client.chat.completions.create.assert_called_once()
call_kwargs = mock_client.chat.completions.create.call_args[1]
assert call_kwargs["temperature"] == 0.5, "Regular models should include temperature parameter"
assert call_kwargs["max_tokens"] == 100, "Regular models should include max_tokens parameter"
assert call_kwargs["model"] == "gpt-4.1-2025-04-14"
@patch("utils.model_restrictions.get_restriction_service")
@patch("providers.openai_compatible.OpenAI")
def test_o3_models_filter_unsupported_parameters(self, mock_openai_class, mock_restriction_service):
"""Test that O3 models filter out top_p, frequency_penalty, etc."""
# Mock restriction service to allow all models
mock_service = Mock()
mock_service.is_allowed.return_value = True
mock_restriction_service.return_value = mock_service
# Setup mock client
mock_client = Mock()
mock_openai_class.return_value = mock_client
# Setup mock response
mock_response = Mock()
mock_response.choices = [Mock()]
mock_response.choices[0].message.content = "Test response"
mock_response.choices[0].finish_reason = "stop"
mock_response.model = "o3"
mock_response.id = "test-id"
mock_response.created = 1234567890
mock_response.usage = Mock()
mock_response.usage.prompt_tokens = 10
mock_response.usage.completion_tokens = 5
mock_response.usage.total_tokens = 15
mock_client.chat.completions.create.return_value = mock_response
# Create provider
provider = OpenAIModelProvider(api_key="test-key")
# Override _resolve_model_name to return the resolved model name
provider._resolve_model_name = lambda name: name
# Override model validation to bypass restrictions
provider.validate_model_name = lambda name: True
# Call generate_content with O3 model and unsupported parameters
provider.generate_content(
prompt="Test prompt",
model_name="o3",
temperature=0.5,
top_p=0.9,
frequency_penalty=0.1,
presence_penalty=0.1,
seed=42,
stop=["END"],
)
# Verify the API call filters out unsupported parameters
mock_client.chat.completions.create.assert_called_once()
call_kwargs = mock_client.chat.completions.create.call_args[1]
# Should be excluded for O3 models
assert "temperature" not in call_kwargs, "O3 should not include temperature"
assert "top_p" not in call_kwargs, "O3 should not include top_p"
assert "frequency_penalty" not in call_kwargs, "O3 should not include frequency_penalty"
assert "presence_penalty" not in call_kwargs, "O3 should not include presence_penalty"
# Should be included (supported parameters)
assert call_kwargs["seed"] == 42, "O3 should include seed parameter"
assert call_kwargs["stop"] == ["END"], "O3 should include stop parameter"
@patch("utils.model_restrictions.get_restriction_service")
def test_all_o3_models_have_correct_temperature_capability(self, mock_restriction_service):
"""Test that all O3/O4 models have supports_temperature=False in their capabilities."""
from providers.openai_provider import OpenAIModelProvider
# Mock restriction service to allow all models
mock_service = Mock()
mock_service.is_allowed.return_value = True
mock_restriction_service.return_value = mock_service
provider = OpenAIModelProvider(api_key="test-key")
# Test O3/O4 models that should NOT support temperature parameter
o3_o4_models = ["o3", "o3-mini", "o3-pro", "o4-mini", "o4-mini-high"]
for model in o3_o4_models:
capabilities = provider.get_capabilities(model)
assert hasattr(
capabilities, "supports_temperature"
), f"Model {model} capabilities should have supports_temperature field"
assert capabilities.supports_temperature is False, f"Model {model} should have supports_temperature=False"
# Test that regular models DO support temperature parameter
regular_models = ["gpt-4.1-2025-04-14"]
for model in regular_models:
try:
capabilities = provider.get_capabilities(model)
assert hasattr(
capabilities, "supports_temperature"
), f"Model {model} capabilities should have supports_temperature field"
assert capabilities.supports_temperature is True, f"Model {model} should have supports_temperature=True"
except ValueError:
# Skip if model not in SUPPORTED_MODELS (that's okay for this test)
pass
@patch("utils.model_restrictions.get_restriction_service")
def test_openai_provider_temperature_constraints(self, mock_restriction_service):
"""Test that OpenAI provider has correct temperature constraints for O3 models."""
from providers.openai_provider import OpenAIModelProvider
# Mock restriction service to allow all models
mock_service = Mock()
mock_service.is_allowed.return_value = True
mock_restriction_service.return_value = mock_service
provider = OpenAIModelProvider(api_key="test-key")
# Test O3 model constraints
o3_capabilities = provider.get_capabilities("o3-mini")
assert o3_capabilities.temperature_constraint is not None
# O3 models should have fixed temperature constraint
temp_constraint = o3_capabilities.temperature_constraint
assert temp_constraint.validate(1.0) is True
assert temp_constraint.validate(0.5) is False
# Test regular model constraints - use gpt-4.1 which is supported
gpt41_capabilities = provider.get_capabilities("gpt-4.1-2025-04-14")
assert gpt41_capabilities.temperature_constraint is not None
# Regular models should allow a range
temp_constraint = gpt41_capabilities.temperature_constraint
assert temp_constraint.validate(0.5) is True
assert temp_constraint.validate(1.0) is True

View File

@@ -122,7 +122,7 @@ class TestOpenAIProvider:
mock_response.choices = [MagicMock()]
mock_response.choices[0].message.content = "Test response"
mock_response.choices[0].finish_reason = "stop"
mock_response.model = "o4-mini" # API returns the resolved model name
mock_response.model = "gpt-4.1-2025-04-14" # API returns the resolved model name
mock_response.id = "test-id"
mock_response.created = 1234567890
mock_response.usage = MagicMock()
@@ -134,19 +134,23 @@ class TestOpenAIProvider:
provider = OpenAIModelProvider("test-key")
# Call generate_content with alias 'mini'
# Call generate_content with alias 'gpt4.1' (resolves to gpt-4.1-2025-04-14, supports temperature)
result = provider.generate_content(
prompt="Test prompt", model_name="mini", temperature=1.0 # This should be resolved to "o4-mini"
prompt="Test prompt",
model_name="gpt4.1",
temperature=1.0, # This should be resolved to "gpt-4.1-2025-04-14"
)
# Verify the API was called with the RESOLVED model name
mock_client.chat.completions.create.assert_called_once()
call_kwargs = mock_client.chat.completions.create.call_args[1]
# CRITICAL ASSERTION: The API should receive "o4-mini", not "mini"
assert call_kwargs["model"] == "o4-mini", f"Expected 'o4-mini' but API received '{call_kwargs['model']}'"
# CRITICAL ASSERTION: The API should receive "gpt-4.1-2025-04-14", not "gpt4.1"
assert (
call_kwargs["model"] == "gpt-4.1-2025-04-14"
), f"Expected 'gpt-4.1-2025-04-14' but API received '{call_kwargs['model']}'"
# Verify other parameters
# Verify other parameters (gpt-4.1 supports temperature unlike O3/O4 models)
assert call_kwargs["temperature"] == 1.0
assert len(call_kwargs["messages"]) == 1
assert call_kwargs["messages"][0]["role"] == "user"
@@ -154,7 +158,7 @@ class TestOpenAIProvider:
# Verify response
assert result.content == "Test response"
assert result.model_name == "o4-mini" # Should be the resolved name
assert result.model_name == "gpt-4.1-2025-04-14" # Should be the resolved name
@patch("providers.openai_compatible.OpenAI")
def test_generate_content_other_aliases(self, mock_openai_class):