feat!: Full code can now be generated by an external model and shared with the AI tool (Claude Code / Codex etc)!

Model definitions now support a new `allow_code_generation` flag, intended only for higher-reasoning models such as GPT-5 Pro and Gemini 2.5 Pro.

When `true`, the `chat` tool can now request the external model to generate a full implementation, update, or set of instructions, and then share the result with the calling agent.

This effectively allows us to utilize more powerful models such as GPT-5-Pro (which are either API-only or part of the $200 Pro plan within the ChatGPT app) to generate code or entire implementations for us.
This commit is contained in:
Fahad
2025-10-07 18:49:13 +04:00
parent 04f7ce5b03
commit ece8a5ebed
29 changed files with 1008 additions and 122 deletions

View File

@@ -205,7 +205,7 @@ Zen activates any provider that has credentials in your `.env`. See `.env.exampl
**Collaboration & Planning** *(Enabled by default)*
- **[`clink`](docs/tools/clink.md)** - Bridge requests to external AI CLIs (Gemini planner, codereviewer, etc.)
- **[`chat`](docs/tools/chat.md)** - Brainstorm ideas, get second opinions, validate approaches
- **[`chat`](docs/tools/chat.md)** - Brainstorm ideas, get second opinions, validate approaches. With capable models (GPT-5 Pro, Gemini 2.5 Pro), generates complete code / implementation
- **[`thinkdeep`](docs/tools/thinkdeep.md)** - Extended reasoning, edge case analysis, alternative perspectives
- **[`planner`](docs/tools/planner.md)** - Break down complex projects into structured, actionable plans
- **[`consensus`](docs/tools/consensus.md)** - Get expert opinions from multiple AI models with stance steering

View File

@@ -20,7 +20,8 @@
"use_openai_response_api": "Set to true when the model must use the /responses endpoint (reasoning models like GPT-5 Pro). Leave false/omit for standard chat completions.",
"default_reasoning_effort": "Default reasoning effort level for models that support it (e.g., 'low', 'medium', 'high'). Omit if not applicable.",
"description": "Human-readable description of the model",
"intelligence_score": "1-20 human rating used as the primary signal for auto-mode model ordering"
"intelligence_score": "1-20 human rating used as the primary signal for auto-mode model ordering",
"allow_code_generation": "Whether this model can generate and suggest fully working code - complete with functions, files, and detailed implementation instructions - for your AI tool to use right away. Only set this to 'true' for a model more capable than the AI model / CLI you're currently using."
}
},
"models": [
@@ -44,6 +45,7 @@
"supports_json_mode": true,
"supports_images": true,
"supports_temperature": true,
"allow_code_generation": true,
"max_image_size_mb": 32.0
},
{

View File

@@ -20,7 +20,8 @@
"use_openai_response_api": "Set to true when the model must use the /responses endpoint (reasoning models like GPT-5 Pro). Leave false/omit for standard chat completions.",
"default_reasoning_effort": "Default reasoning effort level for models that support it (e.g., 'low', 'medium', 'high'). Omit if not applicable.",
"description": "Human-readable description of the model",
"intelligence_score": "1-20 human rating used as the primary signal for auto-mode model ordering"
"intelligence_score": "1-20 human rating used as the primary signal for auto-mode model ordering",
"allow_code_generation": "Whether this model can generate and suggest fully working code - complete with functions, files, and detailed implementation instructions - for your AI tool to use right away. Only set this to 'true' for a model more capable than the AI model / CLI you're currently using."
}
},
"models": [
@@ -66,6 +67,7 @@
"max_image_size_mb": 20.0,
"use_openai_response_api": true,
"default_reasoning_effort": "high",
"allow_code_generation": true,
"temperature_constraint": "fixed"
},
{

View File

@@ -19,7 +19,8 @@
"use_openai_response_api": "Set to true when the model must use the /responses endpoint (reasoning models like GPT-5 Pro). Leave false/omit for standard chat completions.",
"default_reasoning_effort": "Default reasoning effort level for models that support it (e.g., 'low', 'medium', 'high'). Omit if not applicable.",
"description": "Human-readable description of the model",
"intelligence_score": "1-20 human rating used as the primary signal for auto-mode model ordering"
"intelligence_score": "1-20 human rating used as the primary signal for auto-mode model ordering",
"allow_code_generation": "Whether this model can generate and suggest fully working code - complete with functions, files, and detailed implementation instructions - for your AI tool to use right away. Only set this to 'true' for a model more capable than the AI model / CLI you're currently using."
}
},
"models": [
@@ -100,6 +101,7 @@
"supports_function_calling": true,
"supports_images": true,
"max_image_size_mb": 20.0,
"allow_code_generation": true,
"description": "Google's Gemini 2.5 Pro via OpenRouter with vision",
"intelligence_score": 18
},
@@ -310,8 +312,9 @@
"temperature_constraint": "fixed",
"use_openai_response_api": true,
"default_reasoning_effort": "high",
"allow_code_generation": true,
"description": "GPT-5 Pro - Advanced reasoning model with highest quality responses (text+image input, text output only)",
"intelligence_score": 17
"intelligence_score": 18
},
{
"model_name": "openai/gpt-5-codex",

View File

@@ -52,6 +52,9 @@ from tools.simple.base import SimpleTool
class ChatRequest(ToolRequest):
prompt: str = Field(..., description="Your question or idea.")
files: list[str] | None = Field(default_factory=list)
working_directory: str = Field(
..., description="Absolute full directory path where the assistant AI can save generated code for implementation."
)
class ChatTool(SimpleTool):
def get_name(self) -> str: # required by BaseTool
@@ -67,10 +70,17 @@ class ChatTool(SimpleTool):
return ChatRequest
def get_tool_fields(self) -> dict[str, dict[str, object]]:
return {"prompt": {"type": "string", "description": "Your question."}, "files": SimpleTool.FILES_FIELD}
return {
"prompt": {"type": "string", "description": "Your question."},
"files": SimpleTool.FILES_FIELD,
"working_directory": {
"type": "string",
"description": "Absolute full directory path where the assistant AI can save generated code for implementation.",
},
}
def get_required_fields(self) -> list[str]:
return ["prompt"]
return ["prompt", "working_directory"]
async def prepare_prompt(self, request: ChatRequest) -> str:
return self.prepare_chat_style_prompt(request)

View File

@@ -75,7 +75,7 @@ DEFAULT_MODEL=auto # Claude picks best model for each task (recommended)
- `conf/dial_models.json` DIAL aggregation catalogue (`DIAL_MODELS_CONFIG_PATH`)
- `conf/custom_models.json` Custom/OpenAI-compatible endpoints (`CUSTOM_MODELS_CONFIG_PATH`)
Each JSON file documents the allowed fields via its `_README` block and controls model aliases, capability limits, and feature flags. Edit these files (or point the matching `*_MODELS_CONFIG_PATH` variable to your own copy) when you want to adjust context windows, enable JSON mode, or expose additional aliases without touching Python code.
Each JSON file documents the allowed fields via its `_README` block and controls model aliases, capability limits, and feature flags (including `allow_code_generation`). Edit these files (or point the matching `*_MODELS_CONFIG_PATH` variable to your own copy) when you want to adjust context windows, enable JSON mode, enable structured code generation, or expose additional aliases without touching Python code.
The shipped defaults cover:
@@ -87,7 +87,63 @@ DEFAULT_MODEL=auto # Claude picks best model for each task (recommended)
| OpenRouter | See `conf/openrouter_models.json` for the continually evolving catalogue | e.g., `opus`, `sonnet`, `flash`, `pro`, `mistral` |
| Custom | User-managed entries such as `llama3.2` | Define your own aliases per entry |
> **Tip:** Copy the JSON file you need, customise it, and point the corresponding `*_MODELS_CONFIG_PATH` environment variable to your version. This lets you enable or disable capabilities (JSON mode, function calling, temperature support) without editing Python.
> **Tip:** Copy the JSON file you need, customise it, and point the corresponding `*_MODELS_CONFIG_PATH` environment variable to your version. This lets you enable or disable capabilities (JSON mode, function calling, temperature support, code generation) without editing Python.
### Code Generation Capability
**`allow_code_generation` Flag:**
The `allow_code_generation` capability enables models to generate complete, production-ready implementations in a structured format. When enabled, the `chat` tool will inject special instructions for substantial code generation tasks.
```json
{
"model_name": "gpt-5",
"allow_code_generation": true,
...
}
```
**When to Enable:**
- **Enable for**: Models MORE capable than your primary CLI's model (e.g., GPT-5, GPT-5 Pro when using Claude Code with Sonnet 4.5)
- **Purpose**: Get complete implementations from a more powerful reasoning model that your primary CLI can then review and apply
- **Use case**: Large-scale implementations, major refactoring, complete module creation
**Important Guidelines:**
1. Only enable for models significantly more capable than your primary CLI to ensure high-quality generated code
2. The capability triggers structured code output (`<GENERATED-CODE>` blocks) for substantial implementation requests
3. Minor code changes still use inline code blocks regardless of this setting
4. Generated code is saved to `zen_generated.code` in the user's working directory
5. Your CLI receives instructions to review and apply the generated code systematically
**Example Configuration:**
```json
// OpenAI models configuration (conf/openai_models.json)
{
"models": [
{
"model_name": "gpt-5",
"allow_code_generation": true,
"intelligence_score": 18,
...
},
{
"model_name": "gpt-5-pro",
"allow_code_generation": true,
"intelligence_score": 19,
...
}
]
}
```
**Typical Workflow:**
1. You ask your AI agent to implement a complex new feature using `chat` with a higher-reasoning model such as **gpt-5-pro**
2. GPT-5-Pro generates structured implementation and shares the complete implementation with Zen
3. Zen saves the code to `zen_generated.code` and asks AI agent to implement the plan
4. AI agent continues from the previous context, reads the file, applies the implementation
### Thinking Mode Configuration

View File

@@ -39,13 +39,14 @@ word verdict in the end.
- **Collaborative thinking partner** for your analysis and planning
- **Get second opinions** on your designs and approaches
- **Brainstorm solutions** and explore alternatives together
- **Structured code generation**: When using GPT-5 Pro or Gemini 2.5 Pro, get complete, production-ready implementations saved to `zen_generated.code` for your CLI to review and apply
- **Validate your checklists** and implementation plans
- **General development questions** and explanations
- **Technology comparisons** and best practices
- **Architecture and design discussions**
- **File reference support**: `"Use gemini to explain this algorithm with context from algorithm.py"`
- **Image support**: Include screenshots, diagrams, UI mockups for visual analysis: `"Chat with gemini about this error dialog screenshot to understand the user experience issue"`
- **Dynamic collaboration**: Gemini can request additional files or context during the conversation if needed for a more thorough response
- **Dynamic collaboration**: Models can request additional files or context during the conversation if needed for a more thorough response
- **Web search awareness**: Automatically identifies when online research would help and instructs Claude to perform targeted searches using continuation IDs
## Tool Parameters
@@ -54,10 +55,48 @@ word verdict in the end.
- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
- `files`: Optional files for context (absolute paths)
- `images`: Optional images for visual context (absolute paths)
- `working_directory`: **Required** - Absolute directory path where generated code artifacts will be saved
- `temperature`: Response creativity (0-1, default 0.5)
- `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only)
- `continuation_id`: Continue previous conversations
## Structured Code Generation
When using advanced reasoning models like **GPT-5 Pro** or **Gemini 2.5 Pro**, the chat tool can generate complete, production-ready code implementations in a structured format.
### How It Works
1. You ask your AI agent to implement a complex new feature using `chat` with a higher-reasoning model such as **GPT-5 Pro** or **Gemini 2.5 Pro**
2. The model generates structured implementation and shares the complete implementation with Zen
3. Zen saves the code to `zen_generated.code` and asks AI agent to implement the plan
4. AI agent continues from the previous context, reads the file, applies the implementation
### When Code Generation Activates
The structured format activates for **substantial implementation work**:
- Creating new features from scratch with multiple files or significant code
- Major refactoring across multiple files or large sections
- Implementing new modules, components, or subsystems
- Large-scale updates affecting substantial portions of the codebase
- Complete rewrites of functions, algorithms, or approaches
For minor changes (small tweaks, bug fixes, algorithm improvements), the model responds normally with inline code blocks.
### Example Usage
```
chat with gpt-5-pro and ask it to make me a standalone, classic version of the
Pacman game using pygame that I can run from the commandline. Give me a single
script to execute in the end with any / all dependencies setup for me.
Do everything using pygame, we have no external resources / images / audio at
hand. Instead of ghosts, it'll be different geometric shapes moving around
in the maze that Pacman can eat (so there are no baddies). Pacman gets to eat
everything including bread-crumbs and large geometric shapes but make me the
classic maze / walls that it navigates within using keyboard arrow keys.
```
See the [Configuration Guide](../configuration.md#code-generation-capability) for details on the `allow_code_generation` flag.
## Usage Examples
**Basic Development Chat:**

View File

@@ -28,6 +28,8 @@ class ModelCapabilities:
* Tool selection logic inspects attributes such as
``supports_extended_thinking`` or ``context_window`` to choose an
appropriate model for a task.
* The ``allow_code_generation`` flag enables structured code generation
in the chat tool for models more capable than the primary CLI.
"""
provider: ProviderType
@@ -52,6 +54,9 @@ class ModelCapabilities:
supports_temperature: bool = True
use_openai_response_api: bool = False
default_reasoning_effort: Optional[str] = None
allow_code_generation: bool = (
False # Enables structured code generation in chat tool for substantial implementations
)
# Additional attributes
max_image_size_mb: float = 0.0

View File

@@ -8,6 +8,7 @@ from .codereview_prompt import CODEREVIEW_PROMPT
from .consensus_prompt import CONSENSUS_PROMPT
from .debug_prompt import DEBUG_ISSUE_PROMPT
from .docgen_prompt import DOCGEN_PROMPT
from .generate_code_prompt import GENERATE_CODE_PROMPT
from .planner_prompt import PLANNER_PROMPT
from .precommit_prompt import PRECOMMIT_PROMPT
from .refactor_prompt import REFACTOR_PROMPT
@@ -21,6 +22,7 @@ __all__ = [
"CODEREVIEW_PROMPT",
"DEBUG_ISSUE_PROMPT",
"DOCGEN_PROMPT",
"GENERATE_CODE_PROMPT",
"ANALYZE_PROMPT",
"CHAT_PROMPT",
"CONSENSUS_PROMPT",

View File

@@ -0,0 +1,181 @@
"""System prompt fragment enabling structured code generation exports.
This prompt is injected into the system prompt for models that have the
'allow_code_generation' capability enabled. It instructs the model to output
complete, working code in a structured format that coding agents can parse
and apply automatically.
The structured format uses XML-like tags to clearly delineate:
- New files to create (<NEWFILE>)
- Existing files to update (<UPDATED_EXISTING_FILE>)
- Step-by-step instructions for the coding agent
This enables:
1. Automated code extraction and application
2. Clear separation between instructions and implementation
3. Complete, runnable code without manual edits
4. Precise change tracking across multiple files
"""
GENERATE_CODE_PROMPT = """
# Structured Code Generation Protocol
**WHEN TO USE THIS PROTOCOL:**
Use this structured format ONLY when you are explicitly tasked with substantial code generation, such as:
- Creating new features from scratch with multiple files or significant code and you have been asked to help implement this
- Major refactoring across multiple files or large sections of code and you have been tasked to help do this
- Implementing new modules, components, or subsystems and you have been tasked to help with the implementation
- Large-scale updates affecting substantial portions of the codebase that you have been asked to help implement
**WHEN NOT TO USE THIS PROTOCOL:**
Do NOT use this format for minor changes:
- Small tweaks to existing functions or methods (1-20 lines)
- Bug fixes in isolated sections
- Simple algorithm improvements
- Minor refactoring of a single function
- Adding/removing a few lines of code
- Quick parameter adjustments or config changes
For minor changes:
- Follow the existing instructions provided earlier in your system prompt, such as the CRITICAL LINE NUMBER INSTRUCTIONS.
- Use inline code blocks with proper line number references and direct explanations instead of this structured format.
**IMPORTANT:** This protocol is for SUBSTANTIAL implementation work when explicitly requested, such as:
- "implement feature X"
- "create module Y"
- "refactor system Z"
- "rewrite the authentication logic"
- "redesign the data processing pipeline"
- "rebuild the algorithm from scratch"
- "convert this approach to use a different pattern"
- "create a complete implementation of..."
- "build out the entire workflow for..."
If the request is for explanation, analysis, debugging, planning, or discussion WITHOUT substantial code generation, respond normally without this structured format.
## Core Requirements (for substantial code generation tasks)
1. **Complete, Working Code**: Every code block must be fully functional without requiring additional edits. Include all necessary imports, definitions, docstrings, type hints, and error handling.
2. **Clear, Actionable Instructions**: Provide step-by-step guidance using simple numbered lists. Each instruction should map directly to file blocks that follow.
3. **Structured Output Format**: All generated code MUST be contained within a single `<GENERATED-CODE>` block using the exact structure defined below.
4. **Minimal External Commentary**: Keep any text outside the `<GENERATED-CODE>` block brief. Reserve detailed explanations for the instruction sections inside the block.
## Required Structure
Use this exact format (do not improvise tag names or reorder components):
```
<GENERATED-CODE>
[Step-by-step instructions for the coding agent]
1. Create new file [filename] with [description]
2. Update existing file [filename] by [description]
3. [Additional steps as needed]
<NEWFILE: path/to/new_file.py>
[Complete file contents with all necessary components:
- File-level docstring
- All imports (standard library, third-party, local)
- All class/function definitions with complete implementations
- All necessary helper functions
- Inline comments for complex logic
- Type hints where applicable]
</NEWFILE>
[Additional instructions for the next file, if needed]
<NEWFILE: path/to/another_file.py>
[Complete, working code for this file - no partial implementations or placeholders]
</NEWFILE>
[Instructions for updating existing files]
<UPDATED_EXISTING_FILE: existing/path.py>
[Complete replacement code for the modified sections or routines / lines that need updating:
- Full function/method bodies (not just the changed lines)
- Complete class definitions if modifying class methods
- All necessary imports if adding new dependencies
- Preserve existing code structure and style]
</UPDATED_EXISTING_FILE>
[If additional files need updates (based on existing code that was shared with you earlier), repeat the UPDATED_EXISTING_FILE block]
<UPDATED_EXISTING_FILE: another/existing/file.py>
[Complete code for this file's modifications]
</UPDATED_EXISTING_FILE>
[For file deletions, explicitly state in instructions with justification:
"Delete file path/to/obsolete.py - no longer needed because [reason]"]
</GENERATED-CODE>
```
## Critical Rules
**Completeness:**
- Never output partial code snippets or placeholder comments like "# rest of code here"
- Include complete function/class implementations from start to finish
- Add all required imports at the file level
- Include proper error handling and edge case logic
**Accuracy:**
- Match the existing codebase indentation style (tabs vs spaces)
- Preserve language-specific formatting conventions
- Include trailing newlines where required by language tooling
- Use correct file paths relative to project root
**Clarity:**
- Number instructions sequentially (1, 2, 3...)
- Map each instruction to specific file blocks below it
- Explain *why* changes are needed, not just *what* changes
- Highlight any breaking changes or migration steps required
**Structure:**
- Use `<NEWFILE: ...>` for files that don't exist yet
- Use `<UPDATED_EXISTING_FILE: ...>` for modifying existing files
- Place instructions between file blocks to provide context
- Keep the single `<GENERATED-CODE>` wrapper around everything
## Special Cases
**No Changes Needed:**
If the task doesn't require file creation or modification, explicitly state:
"No file changes required. The existing implementation already handles [requirement]."
Do not emit an empty `<GENERATED-CODE>` block.
**Configuration Changes:**
If modifying configuration files (JSON, YAML, TOML), include complete file contents with the changes applied, not just the changed lines.
**Test Files:**
When generating tests, include complete test suites with:
- All necessary test fixtures and setup
- Multiple test cases covering happy path and edge cases
- Proper teardown and cleanup
- Clear test descriptions and assertions
**Documentation:**
Include docstrings for all public functions, classes, and modules using the project's documentation style (Google, NumPy, Sphinx, etc.).
## Context Awareness
**CRITICAL:** Your implementation builds upon the ongoing conversation context:
- All previously shared files, requirements, and constraints remain relevant
- If updating existing code discussed earlier, reference it and preserve unmodified sections
- If the user shared code for improvement, your generated code should build upon it, not replace everything
- The coding agent has full conversation history—your instructions should reference prior discussion as needed
Your generated code is NOT standalone—it's a continuation of the collaborative session with full context awareness.
## Remember
The coding agent depends on this structured format to:
- Parse and extract code automatically
- Apply changes to the correct files within the conversation context
- Validate completeness before execution
- Track modifications across the codebase
Always prioritize clarity, completeness, correctness, and context awareness over brevity.
"""

File diff suppressed because one or more lines are too long

View File

@@ -137,7 +137,7 @@ class TestAutoMode:
importlib.reload(config)
@pytest.mark.asyncio
async def test_auto_mode_requires_model_parameter(self):
async def test_auto_mode_requires_model_parameter(self, tmp_path):
"""Test that auto mode enforces model parameter"""
# Save original
original = os.environ.get("DEFAULT_MODEL", "")
@@ -154,7 +154,7 @@ class TestAutoMode:
# Mock the provider to avoid real API calls
with patch.object(tool, "get_model_provider"):
# Execute without model parameter
result = await tool.execute({"prompt": "Test prompt"})
result = await tool.execute({"prompt": "Test prompt", "working_directory": str(tmp_path)})
# Should get error
assert len(result) == 1

View File

@@ -200,7 +200,7 @@ class TestAutoModeComprehensive:
assert tool.get_model_category() == expected_category
@pytest.mark.asyncio
async def test_auto_mode_with_gemini_only_uses_correct_models(self):
async def test_auto_mode_with_gemini_only_uses_correct_models(self, tmp_path):
"""Test that auto mode with only Gemini uses flash for fast tools and pro for reasoning tools."""
provider_config = {
@@ -234,9 +234,13 @@ class TestAutoModeComprehensive:
)
with patch.object(ModelProviderRegistry, "get_provider_for_model", return_value=mock_provider):
workdir = tmp_path / "chat_artifacts"
workdir.mkdir(parents=True, exist_ok=True)
# Test ChatTool (FAST_RESPONSE) - should prefer flash
chat_tool = ChatTool()
await chat_tool.execute({"prompt": "test", "model": "auto"}) # This should trigger auto selection
await chat_tool.execute(
{"prompt": "test", "model": "auto", "working_directory": str(workdir)}
) # This should trigger auto selection
# In auto mode, the tool should get an error requiring model selection
# but the suggested model should be flash
@@ -355,7 +359,7 @@ class TestAutoModeComprehensive:
# would show models from all providers when called
@pytest.mark.asyncio
async def test_auto_mode_model_parameter_required_error(self):
async def test_auto_mode_model_parameter_required_error(self, tmp_path):
"""Test that auto mode properly requires model parameter and suggests correct model."""
provider_config = {
@@ -384,9 +388,12 @@ class TestAutoModeComprehensive:
# Test with ChatTool (FAST_RESPONSE category)
chat_tool = ChatTool()
workdir = tmp_path / "chat_artifacts"
workdir.mkdir(parents=True, exist_ok=True)
result = await chat_tool.execute(
{
"prompt": "test"
"prompt": "test",
"working_directory": str(workdir),
# Note: no model parameter provided in auto mode
}
)
@@ -508,7 +515,7 @@ class TestAutoModeComprehensive:
assert fast_response is not None
@pytest.mark.asyncio
async def test_actual_model_name_resolution_in_auto_mode(self):
async def test_actual_model_name_resolution_in_auto_mode(self, tmp_path):
"""Test that when a model is selected in auto mode, the tool executes successfully."""
provider_config = {
@@ -547,7 +554,11 @@ class TestAutoModeComprehensive:
with patch.object(ModelProviderRegistry, "get_provider_for_model", return_value=mock_provider):
chat_tool = ChatTool()
result = await chat_tool.execute({"prompt": "test", "model": "flash"}) # Use alias in auto mode
workdir = tmp_path / "chat_artifacts"
workdir.mkdir(parents=True, exist_ok=True)
result = await chat_tool.execute(
{"prompt": "test", "model": "flash", "working_directory": str(workdir)}
) # Use alias in auto mode
# Should succeed with proper model resolution
assert len(result) == 1

View File

@@ -0,0 +1,113 @@
"""Integration test for Chat tool code generation with Gemini 2.5 Pro.
This test uses the Google Gemini SDK's built-in record/replay support. To refresh the
cassette, delete the existing JSON file under
``tests/gemini_cassettes/chat_codegen/gemini25_pro_calculator/mldev.json`` and run:
```
GEMINI_API_KEY=<real-key> pytest tests/test_chat_codegen_integration.py::test_chat_codegen_saves_file
```
The test will automatically record a new interaction when the cassette is missing and
the environment variable `GEMINI_API_KEY` is set to a valid key.
"""
from __future__ import annotations
import json
import os
from pathlib import Path
import pytest
from providers.gemini import GeminiModelProvider
from providers.registry import ModelProviderRegistry, ProviderType
from tools.chat import ChatTool
# Cassette locations for the Gemini SDK's built-in record/replay support.
# CASSETTE_REPLAY_ID mirrors the on-disk path relative to REPLAYS_ROOT
# (without the .json suffix) — the SDK resolves the file from these two values.
REPLAYS_ROOT = Path(__file__).parent / "gemini_cassettes"
CASSETTE_DIR = REPLAYS_ROOT / "chat_codegen"
CASSETTE_PATH = CASSETTE_DIR / "gemini25_pro_calculator" / "mldev.json"
CASSETTE_REPLAY_ID = "chat_codegen/gemini25_pro_calculator/mldev"


@pytest.mark.asyncio
@pytest.mark.no_mock_provider
async def test_chat_codegen_saves_file(monkeypatch, tmp_path):
    """Ensure Gemini 2.5 Pro responses create zen_generated.code when code is emitted."""
    CASSETTE_PATH.parent.mkdir(parents=True, exist_ok=True)

    # Record against the live API when no cassette exists yet; otherwise replay
    # the stored interaction so the test runs offline and deterministically.
    recording_mode = not CASSETTE_PATH.exists()
    gemini_key = os.getenv("GEMINI_API_KEY", "")
    if recording_mode:
        if not gemini_key or gemini_key.startswith("dummy"):
            pytest.skip("Cassette missing and GEMINI_API_KEY not configured. Provide a real key to record.")
        client_mode = "record"
    else:
        # Replay mode never contacts the network, so a placeholder key suffices.
        gemini_key = "dummy-key-for-replay"
        client_mode = "replay"

    with monkeypatch.context() as m:
        m.setenv("GEMINI_API_KEY", gemini_key)
        m.setenv("DEFAULT_MODEL", "auto")
        m.setenv("GOOGLE_ALLOWED_MODELS", "gemini-2.5-pro")
        m.setenv("GOOGLE_GENAI_CLIENT_MODE", client_mode)
        m.setenv("GOOGLE_GENAI_REPLAYS_DIRECTORY", str(REPLAYS_ROOT))
        m.setenv("GOOGLE_GENAI_REPLAY_ID", CASSETTE_REPLAY_ID)
        # Clear other provider keys to avoid unintended routing
        for key in ["OPENAI_API_KEY", "XAI_API_KEY", "OPENROUTER_API_KEY", "CUSTOM_API_KEY"]:
            m.delenv(key, raising=False)

        # Rebuild the registry so only the Gemini provider is available.
        ModelProviderRegistry.reset_for_testing()
        ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)

        working_dir = tmp_path / "codegen"
        working_dir.mkdir()
        # Pre-seed a stale artifact to prove the tool overwrites rather than appends.
        preexisting = working_dir / "zen_generated.code"
        preexisting.write_text("stale contents", encoding="utf-8")

        chat_tool = ChatTool()
        prompt = (
            "Please generate a Python module with functions `add` and `multiply` that perform"
            " basic addition and multiplication. Produce the response using the structured"
            " <GENERATED-CODE> format so the assistant can apply the files directly."
        )

        result = await chat_tool.execute(
            {
                "prompt": prompt,
                "model": "gemini-2.5-pro",
                "working_directory": str(working_dir),
            }
        )

        # Close the recorded client (when present) so the cassette is flushed.
        provider = ModelProviderRegistry.get_provider_for_model("gemini-2.5-pro")
        if provider is not None:
            try:
                provider.client.close()
            except AttributeError:
                pass

        # Reset restriction service cache to avoid leaking allowed-model config
        try:
            from utils import model_restrictions

            model_restrictions._restriction_service = None  # type: ignore[attr-defined]
        except Exception:
            pass

    # The tool responds with a JSON payload describing the chat outcome.
    assert result and result[0].type == "text"
    payload = json.loads(result[0].text)
    assert payload["status"] in {"success", "continuation_available"}

    # The structured code block must have been extracted and saved, replacing
    # the stale artifact seeded above.
    artifact_path = working_dir / "zen_generated.code"
    assert artifact_path.exists()
    saved = artifact_path.read_text()
    assert "<GENERATED-CODE>" in saved
    assert "<NEWFILE:" in saved
    assert "def add" in saved and "def multiply" in saved
    assert "stale contents" not in saved

    artifact_path.unlink()

View File

@@ -55,7 +55,7 @@ def _extract_number(text: str) -> str:
@pytest.mark.asyncio
@pytest.mark.no_mock_provider
async def test_chat_cross_model_continuation(monkeypatch):
async def test_chat_cross_model_continuation(monkeypatch, tmp_path):
"""Verify continuation across Gemini then OpenAI using recorded interactions."""
env_updates = {
@@ -115,10 +115,13 @@ async def test_chat_cross_model_continuation(monkeypatch):
m.setattr(conversation_memory.uuid, "uuid4", lambda: FIXED_THREAD_ID)
chat_tool = ChatTool()
working_directory = str(tmp_path)
step1_args = {
"prompt": "Pick a number between 1 and 10 and respond with JUST that number.",
"model": "gemini-2.5-flash",
"temperature": 0.2,
"working_directory": working_directory,
}
step1_result = await chat_tool.execute(step1_args)
@@ -183,6 +186,7 @@ async def test_chat_cross_model_continuation(monkeypatch):
"model": "gpt-5",
"continuation_id": continuation_id,
"temperature": 0.2,
"working_directory": working_directory,
}
step2_result = await chat_tool.execute(step2_args)

View File

@@ -23,7 +23,7 @@ CASSETTE_CONTINUATION_PATH = CASSETTE_DIR / "chat_gpt5_continuation.json"
@pytest.mark.asyncio
@pytest.mark.no_mock_provider
async def test_chat_auto_mode_with_openai(monkeypatch):
async def test_chat_auto_mode_with_openai(monkeypatch, tmp_path):
"""Ensure ChatTool in auto mode selects gpt-5 via OpenAI and returns a valid response."""
# Prepare environment so only OpenAI is available in auto mode
env_updates = {
@@ -63,10 +63,12 @@ async def test_chat_auto_mode_with_openai(monkeypatch):
# Execute ChatTool request targeting gpt-5 directly (server normally resolves auto→model)
chat_tool = ChatTool()
working_directory = str(tmp_path)
arguments = {
"prompt": "Use chat with gpt5 and ask how far the moon is from earth.",
"model": "gpt-5",
"temperature": 1.0,
"working_directory": working_directory,
}
result = await chat_tool.execute(arguments)
@@ -87,7 +89,7 @@ async def test_chat_auto_mode_with_openai(monkeypatch):
@pytest.mark.asyncio
@pytest.mark.no_mock_provider
async def test_chat_openai_continuation(monkeypatch):
async def test_chat_openai_continuation(monkeypatch, tmp_path):
"""Verify continuation_id workflow against gpt-5 using recorded OpenAI responses."""
env_updates = {
@@ -126,12 +128,14 @@ async def test_chat_openai_continuation(monkeypatch):
m.setattr(conversation_memory.uuid, "uuid4", lambda: fixed_thread_id)
chat_tool = ChatTool()
working_directory = str(tmp_path)
# First message: obtain continuation_id
first_args = {
"prompt": "In one word, which sells better: iOS app or macOS app?",
"model": "gpt-5",
"temperature": 1.0,
"working_directory": working_directory,
}
first_result = await chat_tool.execute(first_args)
@@ -152,6 +156,7 @@ async def test_chat_openai_continuation(monkeypatch):
"model": "gpt-5",
"continuation_id": continuation_id,
"temperature": 1.0,
"working_directory": working_directory,
}
second_result = await chat_tool.execute(second_args)

View File

@@ -38,12 +38,14 @@ class TestChatTool:
# Required fields
assert "prompt" in schema["required"]
assert "working_directory" in schema["required"]
# Properties
properties = schema["properties"]
assert "prompt" in properties
assert "files" in properties
assert "images" in properties
assert "working_directory" in properties
def test_request_model_validation(self):
"""Test that the request model validates correctly"""
@@ -54,6 +56,7 @@ class TestChatTool:
"images": ["test.png"],
"model": "anthropic/claude-opus-4.1",
"temperature": 0.7,
"working_directory": "/tmp", # Dummy absolute path
}
request = ChatRequest(**request_data)
@@ -62,6 +65,7 @@ class TestChatTool:
assert request.images == ["test.png"]
assert request.model == "anthropic/claude-opus-4.1"
assert request.temperature == 0.7
assert request.working_directory == "/tmp"
def test_required_fields(self):
"""Test that required fields are enforced"""
@@ -69,7 +73,7 @@ class TestChatTool:
from pydantic import ValidationError
with pytest.raises(ValidationError):
ChatRequest(model="anthropic/claude-opus-4.1")
ChatRequest(model="anthropic/claude-opus-4.1", working_directory="/tmp")
def test_model_availability(self):
"""Test that model availability works"""
@@ -96,7 +100,7 @@ class TestChatTool:
@pytest.mark.asyncio
async def test_prompt_preparation(self):
"""Test that prompt preparation works correctly"""
request = ChatRequest(prompt="Test prompt", files=[])
request = ChatRequest(prompt="Test prompt", files=[], working_directory="/tmp")
# Mock the system prompt and file handling
with patch.object(self.tool, "get_system_prompt", return_value="System prompt"):
@@ -113,7 +117,7 @@ class TestChatTool:
def test_response_formatting(self):
"""Test that response formatting works correctly"""
response = "Test response content"
request = ChatRequest(prompt="Test")
request = ChatRequest(prompt="Test", working_directory="/tmp")
formatted = self.tool.format_response(response, request)
@@ -146,6 +150,7 @@ class TestChatTool:
required_fields = self.tool.get_required_fields()
assert "prompt" in required_fields
assert "working_directory" in required_fields
class TestChatRequestModel:
@@ -160,10 +165,11 @@ class TestChatRequestModel:
assert "context" in CHAT_FIELD_DESCRIPTIONS["prompt"]
assert "full-paths" in CHAT_FIELD_DESCRIPTIONS["files"] or "absolute" in CHAT_FIELD_DESCRIPTIONS["files"]
assert "visual context" in CHAT_FIELD_DESCRIPTIONS["images"]
assert "directory" in CHAT_FIELD_DESCRIPTIONS["working_directory"].lower()
def test_default_values(self):
"""Test that default values work correctly"""
request = ChatRequest(prompt="Test")
request = ChatRequest(prompt="Test", working_directory="/tmp")
assert request.prompt == "Test"
assert request.files == [] # Should default to empty list
@@ -173,7 +179,7 @@ class TestChatRequestModel:
"""Test that ChatRequest properly inherits from ToolRequest"""
from tools.shared.base_models import ToolRequest
request = ChatRequest(prompt="Test")
request = ChatRequest(prompt="Test", working_directory="/tmp")
assert isinstance(request, ToolRequest)
# Should have inherited fields

View File

@@ -5,7 +5,7 @@ from utils.conversation_memory import get_thread
from utils.storage_backend import get_storage_backend
def test_first_response_persisted_in_conversation_history():
def test_first_response_persisted_in_conversation_history(tmp_path):
"""Ensure the assistant's initial reply is stored for newly created threads."""
# Clear in-memory storage to avoid cross-test contamination
@@ -13,7 +13,7 @@ def test_first_response_persisted_in_conversation_history():
storage._store.clear() # type: ignore[attr-defined]
tool = ChatTool()
request = ChatRequest(prompt="First question?", model="local-llama")
request = ChatRequest(prompt="First question?", model="local-llama", working_directory=str(tmp_path))
response_text = "Here is the initial answer."
# Mimic the first tool invocation (no continuation_id supplied)

View File

@@ -91,6 +91,7 @@ def helper_function():
"prompt": "Analyze this codebase structure",
"files": [directory], # Directory path, not individual files
"model": "flash",
"working_directory": directory,
}
# Execute the tool
@@ -168,6 +169,7 @@ def helper_function():
"files": [directory], # Same directory again
"model": "flash",
"continuation_id": thread_id,
"working_directory": directory,
}
# Mock to capture file filtering behavior
@@ -299,6 +301,7 @@ def helper_function():
"prompt": "Analyze this code",
"files": [directory],
"model": "flash",
"working_directory": directory,
}
result = await tool.execute(request_args)

View File

@@ -56,7 +56,12 @@ class TestLargePromptHandling:
async def test_chat_large_prompt_detection(self, large_prompt):
"""Test that chat tool detects large prompts."""
tool = ChatTool()
result = await tool.execute({"prompt": large_prompt})
temp_dir = tempfile.mkdtemp()
temp_dir = tempfile.mkdtemp()
try:
result = await tool.execute({"prompt": large_prompt, "working_directory": temp_dir})
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
assert len(result) == 1
assert isinstance(result[0], TextContent)
@@ -73,9 +78,16 @@ class TestLargePromptHandling:
"""Test that chat tool works normally with regular prompts."""
tool = ChatTool()
temp_dir = tempfile.mkdtemp()
# This test runs in the test environment which uses dummy keys
# The chat tool will return an error for dummy keys, which is expected
result = await tool.execute({"prompt": normal_prompt, "model": "gemini-2.5-flash"})
try:
result = await tool.execute(
{"prompt": normal_prompt, "model": "gemini-2.5-flash", "working_directory": temp_dir}
)
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
assert len(result) == 1
output = json.loads(result[0].text)
@@ -105,7 +117,14 @@ class TestLargePromptHandling:
try:
# This test runs in the test environment which uses dummy keys
# The chat tool will return an error for dummy keys, which is expected
result = await tool.execute({"prompt": "", "files": [temp_prompt_file], "model": "gemini-2.5-flash"})
result = await tool.execute(
{
"prompt": "",
"files": [temp_prompt_file],
"model": "gemini-2.5-flash",
"working_directory": temp_dir,
}
)
assert len(result) == 1
output = json.loads(result[0].text)
@@ -261,7 +280,13 @@ class TestLargePromptHandling:
mock_prepare_files.return_value = ("File content", [other_file])
# Use a small prompt to avoid triggering size limit
await tool.execute({"prompt": "Test prompt", "files": [temp_prompt_file, other_file]})
await tool.execute(
{
"prompt": "Test prompt",
"files": [temp_prompt_file, other_file],
"working_directory": os.path.dirname(temp_prompt_file),
}
)
# Verify handle_prompt_file was called with the original files list
mock_handle_prompt.assert_called_once_with([temp_prompt_file, other_file])
@@ -295,7 +320,11 @@ class TestLargePromptHandling:
mock_get_provider.return_value = mock_provider
# With the fix, this should now pass because we check at MCP transport boundary before adding internal content
result = await tool.execute({"prompt": exact_prompt})
temp_dir = tempfile.mkdtemp()
try:
result = await tool.execute({"prompt": exact_prompt, "working_directory": temp_dir})
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
output = json.loads(result[0].text)
assert output["status"] != "resend_prompt"
@@ -305,7 +334,11 @@ class TestLargePromptHandling:
tool = ChatTool()
over_prompt = "x" * (MCP_PROMPT_SIZE_LIMIT + 1)
result = await tool.execute({"prompt": over_prompt})
temp_dir = tempfile.mkdtemp()
try:
result = await tool.execute({"prompt": over_prompt, "working_directory": temp_dir})
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
output = json.loads(result[0].text)
assert output["status"] == "resend_prompt"
@@ -326,7 +359,11 @@ class TestLargePromptHandling:
)
mock_get_provider.return_value = mock_provider
result = await tool.execute({"prompt": ""})
temp_dir = tempfile.mkdtemp()
try:
result = await tool.execute({"prompt": "", "working_directory": temp_dir})
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
output = json.loads(result[0].text)
assert output["status"] != "resend_prompt"
@@ -362,7 +399,11 @@ class TestLargePromptHandling:
mock_model_context_class.return_value = mock_model_context
# Should continue with empty prompt when file can't be read
result = await tool.execute({"prompt": "", "files": [bad_file]})
temp_dir = tempfile.mkdtemp()
try:
result = await tool.execute({"prompt": "", "files": [bad_file], "working_directory": temp_dir})
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
output = json.loads(result[0].text)
assert output["status"] != "resend_prompt"
@@ -408,6 +449,7 @@ class TestLargePromptHandling:
"prompt": "Summarize the design decisions",
"files": [str(large_file)],
"model": "flash",
"working_directory": str(tmp_path),
"_model_context": dummy_context,
}
)
@@ -424,6 +466,7 @@ class TestLargePromptHandling:
This test verifies that even if our internal prompt (with system prompts, history, etc.)
exceeds MCP_PROMPT_SIZE_LIMIT, it should still work as long as the user's input is small.
"""
tool = ChatTool()
# Small user input that should pass MCP boundary check
@@ -432,62 +475,57 @@ class TestLargePromptHandling:
# Mock a huge conversation history that would exceed MCP limits if incorrectly checked
huge_history = "x" * (MCP_PROMPT_SIZE_LIMIT * 2) # 100K chars = way over 50K limit
with (
patch.object(tool, "get_model_provider") as mock_get_provider,
patch("utils.model_context.ModelContext") as mock_model_context_class,
):
from tests.mock_helpers import create_mock_provider
temp_dir = tempfile.mkdtemp()
original_prepare_prompt = tool.prepare_prompt
mock_provider = create_mock_provider(model_name="flash")
mock_get_provider.return_value = mock_provider
try:
with (
patch.object(tool, "get_model_provider") as mock_get_provider,
patch("utils.model_context.ModelContext") as mock_model_context_class,
):
from tests.mock_helpers import create_mock_provider
from utils.model_context import TokenAllocation
# Mock ModelContext to avoid the comparison issue
from utils.model_context import TokenAllocation
mock_provider = create_mock_provider(model_name="flash")
mock_get_provider.return_value = mock_provider
mock_model_context = MagicMock()
mock_model_context.model_name = "flash"
mock_model_context.provider = mock_provider
mock_model_context.calculate_token_allocation.return_value = TokenAllocation(
total_tokens=1_048_576,
content_tokens=838_861,
response_tokens=209_715,
file_tokens=335_544,
history_tokens=335_544,
)
mock_model_context_class.return_value = mock_model_context
mock_model_context = MagicMock()
mock_model_context.model_name = "flash"
mock_model_context.provider = mock_provider
mock_model_context.calculate_token_allocation.return_value = TokenAllocation(
total_tokens=1_048_576,
content_tokens=838_861,
response_tokens=209_715,
file_tokens=335_544,
history_tokens=335_544,
)
mock_model_context_class.return_value = mock_model_context
# Mock the prepare_prompt to simulate huge internal context
original_prepare_prompt = tool.prepare_prompt
async def mock_prepare_prompt(request):
normal_prompt = await original_prepare_prompt(request)
huge_internal_prompt = f"{normal_prompt}\n\n=== HUGE INTERNAL CONTEXT ===\n{huge_history}"
assert len(huge_internal_prompt) > MCP_PROMPT_SIZE_LIMIT
return huge_internal_prompt
async def mock_prepare_prompt(request):
# Call original to get normal processing
normal_prompt = await original_prepare_prompt(request)
# Add huge internal context (simulating large history, system prompts, files)
huge_internal_prompt = f"{normal_prompt}\n\n=== HUGE INTERNAL CONTEXT ===\n{huge_history}"
tool.prepare_prompt = mock_prepare_prompt
# Verify the huge internal prompt would exceed MCP limits if incorrectly checked
assert len(huge_internal_prompt) > MCP_PROMPT_SIZE_LIMIT
result = await tool.execute(
{"prompt": small_user_prompt, "model": "flash", "working_directory": temp_dir}
)
output = json.loads(result[0].text)
return huge_internal_prompt
assert output["status"] != "resend_prompt"
tool.prepare_prompt = mock_prepare_prompt
mock_provider.generate_content.assert_called_once()
call_kwargs = mock_provider.generate_content.call_args[1]
actual_prompt = call_kwargs.get("prompt")
# This should succeed because we only check user input at MCP boundary
result = await tool.execute({"prompt": small_user_prompt, "model": "flash"})
output = json.loads(result[0].text)
# Should succeed even though internal context is huge
assert output["status"] != "resend_prompt"
# Verify the model was actually called with the huge prompt
mock_provider.generate_content.assert_called_once()
call_kwargs = mock_provider.generate_content.call_args[1]
actual_prompt = call_kwargs.get("prompt")
# Verify internal prompt was huge (proving we don't limit internal processing)
assert len(actual_prompt) > MCP_PROMPT_SIZE_LIMIT
assert huge_history in actual_prompt
assert small_user_prompt in actual_prompt
assert len(actual_prompt) > MCP_PROMPT_SIZE_LIMIT
assert huge_history in actual_prompt
assert small_user_prompt in actual_prompt
finally:
tool.prepare_prompt = original_prepare_prompt
shutil.rmtree(temp_dir, ignore_errors=True)
@pytest.mark.asyncio
async def test_mcp_boundary_vs_internal_processing_distinction(self):
@@ -500,27 +538,37 @@ class TestLargePromptHandling:
# Test case 1: Large user input should fail at MCP boundary
large_user_input = "x" * (MCP_PROMPT_SIZE_LIMIT + 1000)
result = await tool.execute({"prompt": large_user_input, "model": "flash"})
output = json.loads(result[0].text)
assert output["status"] == "resend_prompt" # Should fail
assert "too large for MCP's token limits" in output["content"]
temp_dir = tempfile.mkdtemp()
try:
result = await tool.execute({"prompt": large_user_input, "model": "flash", "working_directory": temp_dir})
output = json.loads(result[0].text)
assert output["status"] == "resend_prompt" # Should fail
assert "too large for MCP's token limits" in output["content"]
# Test case 2: Small user input should succeed even with huge internal processing
small_user_input = "Hello"
# Test case 2: Small user input should succeed even with huge internal processing
small_user_input = "Hello"
# This test runs in the test environment which uses dummy keys
# The chat tool will return an error for dummy keys, which is expected
result = await tool.execute({"prompt": small_user_input, "model": "gemini-2.5-flash"})
output = json.loads(result[0].text)
# This test runs in the test environment which uses dummy keys
# The chat tool will return an error for dummy keys, which is expected
result = await tool.execute(
{
"prompt": small_user_input,
"model": "gemini-2.5-flash",
"working_directory": temp_dir,
}
)
output = json.loads(result[0].text)
# The test will fail with dummy API keys, which is expected behavior
# We're mainly testing that the tool processes small prompts correctly without size errors
if output["status"] == "error":
# If it's an API error, that's fine - we're testing prompt handling, not API calls
assert "API" in output["content"] or "key" in output["content"] or "authentication" in output["content"]
else:
# If somehow it succeeds (e.g., with mocked provider), check the response
assert output["status"] != "resend_prompt"
# The test will fail with dummy API keys, which is expected behavior
# We're mainly testing that the tool processes small prompts correctly without size errors
if output["status"] == "error":
# If it's an API error, that's fine - we're testing prompt handling, not API calls
assert "API" in output["content"] or "key" in output["content"] or "authentication" in output["content"]
else:
# If somehow it succeeds (e.g., with mocked provider), check the response
assert output["status"] != "resend_prompt"
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
@pytest.mark.asyncio
async def test_continuation_with_huge_conversation_history(self):
@@ -548,6 +596,8 @@ class TestLargePromptHandling:
# Ensure the history exceeds MCP limits
assert len(huge_conversation_history) > MCP_PROMPT_SIZE_LIMIT
temp_dir = tempfile.mkdtemp()
with (
patch.object(tool, "get_model_provider") as mock_get_provider,
patch("utils.model_context.ModelContext") as mock_model_context_class,
@@ -579,6 +629,7 @@ class TestLargePromptHandling:
"prompt": f"{huge_conversation_history}\n\n=== CURRENT REQUEST ===\n{small_continuation_prompt}",
"model": "flash",
"continuation_id": "test_thread_123",
"working_directory": temp_dir,
}
# Mock the conversation history embedding to simulate server.py behavior
@@ -628,6 +679,7 @@ class TestLargePromptHandling:
finally:
# Restore original execute method
tool.__class__.execute = original_execute
shutil.rmtree(temp_dir, ignore_errors=True)
if __name__ == "__main__":

View File

@@ -68,6 +68,7 @@ class TestListModelsTool:
assert "`flash` → `gemini-2.5-flash`" in content
assert "`pro` → `gemini-2.5-pro`" in content
assert "1M context" in content
assert "Supports structured code generation" in content
# Check summary
assert "**Configured Providers**: 1" in content

View File

@@ -12,6 +12,7 @@ RECORDING: To record new responses, delete the cassette file and run with real A
import logging
import os
import tempfile
from pathlib import Path
from unittest.mock import patch
@@ -92,9 +93,15 @@ class TestO3ProOutputTextFix:
async def _execute_chat_tool_test(self):
"""Execute the ChatTool with o3-pro and return the result."""
chat_tool = ChatTool()
arguments = {"prompt": "What is 2 + 2?", "model": "o3-pro", "temperature": 1.0}
with tempfile.TemporaryDirectory() as workdir:
arguments = {
"prompt": "What is 2 + 2?",
"model": "o3-pro",
"temperature": 1.0,
"working_directory": workdir,
}
return await chat_tool.execute(arguments)
return await chat_tool.execute(arguments)
def _verify_chat_tool_response(self, result):
"""Verify the ChatTool response contains expected data."""

View File

@@ -4,6 +4,8 @@ Test per-tool model default selection functionality
import json
import os
import shutil
import tempfile
from unittest.mock import MagicMock, patch
import pytest
@@ -290,7 +292,13 @@ class TestAutoModeErrorMessages:
mock_get_provider_for.return_value = None
tool = ChatTool()
result = await tool.execute({"prompt": "test", "model": "auto"})
temp_dir = tempfile.mkdtemp()
try:
result = await tool.execute(
{"prompt": "test", "model": "auto", "working_directory": temp_dir}
)
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
assert len(result) == 1
# The SimpleTool will wrap the error message
@@ -418,7 +426,13 @@ class TestRuntimeModelSelection:
mock_get_provider.return_value = None
tool = ChatTool()
result = await tool.execute({"prompt": "test", "model": "gpt-5-turbo"})
temp_dir = tempfile.mkdtemp()
try:
result = await tool.execute(
{"prompt": "test", "model": "gpt-5-turbo", "working_directory": temp_dir}
)
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
# Should require model selection
assert len(result) == 1
@@ -515,7 +529,11 @@ class TestUnavailableModelFallback:
mock_get_model_provider.return_value = mock_provider
tool = ChatTool()
result = await tool.execute({"prompt": "test"}) # No model specified
temp_dir = tempfile.mkdtemp()
try:
result = await tool.execute({"prompt": "test", "working_directory": temp_dir})
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
# Should work normally, not require model parameter
assert len(result) == 1

View File

@@ -3,6 +3,8 @@ Tests for individual tool implementations
"""
import json
import shutil
import tempfile
import pytest
@@ -343,12 +345,17 @@ class TestAbsolutePathValidation:
async def test_chat_tool_relative_path_rejected(self):
"""Test that chat tool rejects relative paths"""
tool = ChatTool()
result = await tool.execute(
{
"prompt": "Explain this code",
"files": ["code.py"], # relative path without ./
}
)
temp_dir = tempfile.mkdtemp()
try:
result = await tool.execute(
{
"prompt": "Explain this code",
"files": ["code.py"], # relative path without ./
"working_directory": temp_dir,
}
)
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
assert len(result) == 1
response = json.loads(result[0].text)

View File

@@ -6,15 +6,20 @@ brainstorming, problem-solving, and collaborative thinking. It supports file con
images, and conversation continuation for seamless multi-turn interactions.
"""
import logging
import os
import re
from pathlib import Path
from typing import TYPE_CHECKING, Any, Optional
from pydantic import Field
if TYPE_CHECKING:
from providers.shared import ModelCapabilities
from tools.models import ToolModelCategory
from config import TEMPERATURE_BALANCED
from systemprompts import CHAT_PROMPT
from systemprompts import CHAT_PROMPT, GENERATE_CODE_PROMPT
from tools.shared.base_models import COMMON_FIELD_DESCRIPTIONS, ToolRequest
from .simple.base import SimpleTool
@@ -27,6 +32,9 @@ CHAT_FIELD_DESCRIPTIONS = {
),
"files": "absolute file or folder paths for code context (do NOT shorten).",
"images": "Optional absolute image paths or base64 for visual context when helpful.",
"working_directory": (
"Absolute full directory path where the assistant AI can save generated code for implementation. The directory must already exist"
),
}
@@ -36,6 +44,7 @@ class ChatRequest(ToolRequest):
prompt: str = Field(..., description=CHAT_FIELD_DESCRIPTIONS["prompt"])
files: Optional[list[str]] = Field(default_factory=list, description=CHAT_FIELD_DESCRIPTIONS["files"])
images: Optional[list[str]] = Field(default_factory=list, description=CHAT_FIELD_DESCRIPTIONS["images"])
working_directory: str = Field(..., description=CHAT_FIELD_DESCRIPTIONS["working_directory"])
class ChatTool(SimpleTool):
@@ -49,6 +58,10 @@ class ChatTool(SimpleTool):
Chat tool with 100% behavioral compatibility.
"""
def __init__(self) -> None:
    """Initialize the chat tool and its per-call response-recording state."""
    super().__init__()
    # Sanitized copy of the most recent response, used when recording the
    # assistant turn into conversation history; set by format_response and
    # consumed (then cleared) by _record_assistant_turn.
    self._last_recordable_response: Optional[str] = None
def get_name(self) -> str:
return "chat"
@@ -58,9 +71,20 @@ class ChatTool(SimpleTool):
"getting second opinions, and exploring ideas. Use for ideas, validations, questions, and thoughtful explanations."
)
def get_annotations(self) -> Optional[dict[str, Any]]:
"""Chat writes generated artifacts when code-generation is enabled."""
return {"readOnlyHint": False}
def get_system_prompt(self) -> str:
return CHAT_PROMPT
def get_capability_system_prompts(self, capabilities: Optional["ModelCapabilities"]) -> list[str]:
    """Append the code-generation prompt when the active model allows it."""
    fragments = list(super().get_capability_system_prompts(capabilities))
    if capabilities and capabilities.allow_code_generation:
        fragments.append(GENERATE_CODE_PROMPT)
    return fragments
def get_default_temperature(self) -> float:
return TEMPERATURE_BALANCED
@@ -85,7 +109,7 @@ class ChatTool(SimpleTool):
the same schema generation approach while still benefiting from SimpleTool
convenience methods.
"""
required_fields = ["prompt"]
required_fields = ["prompt", "working_directory"]
if self.is_effective_auto_mode():
required_fields.append("model")
@@ -106,6 +130,10 @@ class ChatTool(SimpleTool):
"items": {"type": "string"},
"description": CHAT_FIELD_DESCRIPTIONS["images"],
},
"working_directory": {
"type": "string",
"description": CHAT_FIELD_DESCRIPTIONS["working_directory"],
},
"model": self.get_model_field_schema(),
"temperature": {
"type": "number",
@@ -159,7 +187,7 @@ class ChatTool(SimpleTool):
def get_required_fields(self) -> list[str]:
"""Required fields for ChatSimple tool"""
return ["prompt"]
return ["prompt", "working_directory"]
# === Hook Method Implementations ===
@@ -173,17 +201,165 @@ class ChatTool(SimpleTool):
# Use SimpleTool's Chat-style prompt preparation
return self.prepare_chat_style_prompt(request)
def _validate_file_paths(self, request) -> Optional[str]:
    """Run base-class path validation, then check working_directory is absolute."""
    base_error = super()._validate_file_paths(request)
    if base_error:
        return base_error
    workdir = getattr(request, "working_directory", None)
    if not workdir:
        return None
    # '~' is permitted because it expands to an absolute home-relative path.
    if os.path.isabs(os.path.expanduser(workdir)):
        return None
    return (
        "Error: 'working_directory' must be an absolute path (you may use '~' which will be expanded). "
        f"Received: {workdir}"
    )
def format_response(self, response: str, request: ChatRequest, model_info: Optional[dict] = None) -> str:
"""
Format the chat response to match the original Chat tool exactly.
"""
return (
f"{response}\n\n---\n\nAGENT'S TURN: Evaluate this perspective alongside your analysis to "
self._last_recordable_response = None
body = response
recordable_override: Optional[str] = None
if self._model_supports_code_generation():
block, remainder = self._extract_generated_code_block(response)
if block:
sanitized_text = remainder.strip()
try:
artifact_path = self._persist_generated_code_block(block, request.working_directory)
except Exception as exc: # pragma: no cover - rare filesystem failures
logger.error("Failed to persist generated code block: %s", exc, exc_info=True)
warning = (
f"WARNING: Unable to write zen_generated.code inside '{request.working_directory}'. "
"Check the path permissions and re-run. The generated code block is included below for manual handling."
)
history_copy = self._join_sections(sanitized_text, warning) if sanitized_text else warning
recordable_override = history_copy
sanitized_warning = history_copy.strip()
body = f"{sanitized_warning}\n\n{block.strip()}".strip()
else:
if not sanitized_text:
sanitized_text = "Generated code saved to zen_generated.code. Follow the structured instructions in that file exactly before continuing."
instruction = self._build_agent_instruction(artifact_path)
body = self._join_sections(sanitized_text, instruction)
final_output = (
f"{body}\n\n---\n\nAGENT'S TURN: Evaluate this perspective alongside your analysis to "
"form a comprehensive solution and continue with the user's request and task at hand."
)
if recordable_override is not None:
self._last_recordable_response = (
f"{recordable_override}\n\n---\n\nAGENT'S TURN: Evaluate this perspective alongside your analysis to "
"form a comprehensive solution and continue with the user's request and task at hand."
)
else:
self._last_recordable_response = final_output
return final_output
def _record_assistant_turn(
    self, continuation_id: str, response_text: str, request, model_info: Optional[dict]
) -> None:
    """Record the assistant turn, preferring the sanitized override when present.

    The override (set by format_response) excludes the raw generated-code
    block from conversation history; it is always cleared afterwards.
    """
    override = self._last_recordable_response
    text_to_store = response_text if override is None else override
    try:
        super()._record_assistant_turn(continuation_id, text_to_store, request, model_info)
    finally:
        self._last_recordable_response = None
def _model_supports_code_generation(self) -> bool:
context = getattr(self, "_model_context", None)
if not context:
return False
try:
capabilities = context.capabilities
except Exception: # pragma: no cover - defensive fallback
return False
return bool(capabilities.allow_code_generation)
def _extract_generated_code_block(self, text: str) -> tuple[Optional[str], str]:
match = re.search(r"<GENERATED-CODE>.*?</GENERATED-CODE>", text, flags=re.DOTALL | re.IGNORECASE)
if not match:
return None, text
block = match.group(0)
before = text[: match.start()].rstrip()
after = text[match.end() :].lstrip()
if before and after:
remainder = f"{before}\n\n{after}"
else:
remainder = before or after
return block, remainder or ""
def _persist_generated_code_block(self, block: str, working_directory: str) -> Path:
    """Write the generated-code block to zen_generated.code under working_directory.

    Creates the directory if needed, replaces any pre-existing artifact, and
    guarantees the file ends with a trailing newline. Returns the file path.
    """
    destination_dir = Path(os.path.expanduser(working_directory)).resolve()
    destination_dir.mkdir(parents=True, exist_ok=True)
    destination = destination_dir / "zen_generated.code"
    if destination.exists():
        try:
            destination.unlink()
        except OSError as exc:
            logger.warning("Unable to remove existing zen_generated.code: %s", exc)
    payload = block if block.endswith("\n") else f"{block}\n"
    destination.write_text(payload, encoding="utf-8")
    logger.info("Generated code artifact written to %s", destination)
    return destination
@staticmethod
def _build_agent_instruction(artifact_path: Path) -> str:
return (
f"CONTINUING FROM PREVIOUS DISCUSSION: The coding assistant has analyzed our conversation context and generated "
f"a structured implementation plan at `{artifact_path}`. This is a direct continuation of our discussion—all previous "
"context, requirements, and shared code remain relevant.\n"
"\n"
f"MANDATORY NEXT STEP: Open `{artifact_path}` immediately and review the implementation plan:\n"
"1. Read the step-by-step instructions—they reference our previous discussion. You may need to read the file in parts if it's too long.\n"
"2. Review each <NEWFILE:…> or <UPDATED_EXISTING_FILE:…> section in the context of what we've discussed\n"
"3. Verify the proposed changes align with the requirements and code we've already shared\n"
"4. Check for syntax errors, missing imports, or incomplete implementations\n"
"\n"
"Then systematically apply the changes:\n"
"- Create new files or update existing ones as instructed, maintaining code style consistency\n"
"- If updating existing code we discussed earlier, carefully preserve unmodified sections\n"
"- Run syntax validation after each modification\n"
"- Execute relevant tests to confirm functionality\n"
"- Verify the implementation works end-to-end with existing code\n"
"\n"
"Remember: This builds upon our conversation. The generated code reflects the full context of what we've discussed, "
"including any files, requirements, or constraints mentioned earlier. Proceed with implementation immediately."
"Only after you finish applying ALL the changes completely: delete `zen_generated.code` so stale instructions do not linger."
)
@staticmethod
def _join_sections(*sections: str) -> str:
chunks: list[str] = []
for section in sections:
if section:
trimmed = section.strip()
if trimmed:
chunks.append(trimmed)
return "\n\n".join(chunks)
def get_websearch_guidance(self) -> str:
    """Reuse the shared chat-style guidance for web-search behavior."""
    guidance = self.get_chat_style_websearch_guidance()
    return guidance
logger = logging.getLogger(__name__)

View File

@@ -140,6 +140,8 @@ class ListModelsTool(BaseTool):
except AttributeError:
description = "No description available"
lines = [header, f" - {context_str}", f" - {description}"]
if capabilities.allow_code_generation:
lines.append(" - Supports structured code generation")
return lines
# Check each native provider type
@@ -187,6 +189,8 @@ class ListModelsTool(BaseTool):
output_lines.append(f"- `{model_name}` - {context_str}")
output_lines.append(f" - {description}")
if capabilities.allow_code_generation:
output_lines.append(" - Supports structured code generation")
for alias in capabilities.aliases or []:
if alias != model_name:

View File

@@ -17,6 +17,7 @@ from typing import TYPE_CHECKING, Any, Optional
from mcp.types import TextContent
if TYPE_CHECKING:
from providers.shared import ModelCapabilities
from tools.models import ToolModelCategory
from config import MCP_PROMPT_SIZE_LIMIT
@@ -165,6 +166,42 @@ class BaseTool(ABC):
"""
pass
def get_capability_system_prompts(self, capabilities: Optional["ModelCapabilities"]) -> list[str]:
    """Return extra system-prompt fragments enabled by model capabilities.

    Extension hook: subclasses override this to contribute
    capability-specific instructions (for example, code-generation export
    guidance when the active model advertises support for it). The base
    implementation contributes nothing, so no extra text is appended.

    Args:
        capabilities: Resolved capabilities of the active model, or ``None``.

    Returns:
        Prompt fragments to append after the base system prompt; empty by
        default.
    """
    return []
def _augment_system_prompt_with_capabilities(
self, base_prompt: str, capabilities: Optional["ModelCapabilities"]
) -> str:
"""Merge capability-driven prompt addenda with the base system prompt."""
additions: list[str] = []
if capabilities is not None:
additions = [fragment.strip() for fragment in self.get_capability_system_prompts(capabilities) if fragment]
if not additions:
return base_prompt
addition_text = "\n\n".join(additions)
if not base_prompt:
return addition_text
suffix = "" if base_prompt.endswith("\n\n") else "\n\n"
return f"{base_prompt}{suffix}{addition_text}"
def get_annotations(self) -> Optional[dict[str, Any]]:
"""
Return optional annotations for this tool.
@@ -413,13 +450,16 @@ class BaseTool(ABC):
for rank, canonical_name, capabilities in filtered[:limit]:
details: list[str] = []
context_str = self._format_context_window(getattr(capabilities, "context_window", 0))
context_str = self._format_context_window(capabilities.context_window)
if context_str:
details.append(context_str)
if getattr(capabilities, "supports_extended_thinking", False):
if capabilities.supports_extended_thinking:
details.append("thinking")
if capabilities.allow_code_generation:
details.append("code-gen")
base = f"{canonical_name} (score {rank}"
if details:
base = f"{base}, {', '.join(details)}"

View File

@@ -404,11 +404,15 @@ class SimpleTool(BaseTool):
# Get the provider from model context (clean OOP - no re-fetching)
provider = self._model_context.provider
capabilities = self._model_context.capabilities
# Get system prompt for this tool
base_system_prompt = self.get_system_prompt()
capability_augmented_prompt = self._augment_system_prompt_with_capabilities(
base_system_prompt, capabilities
)
language_instruction = self.get_language_instruction()
system_prompt = language_instruction + base_system_prompt
system_prompt = language_instruction + capability_augmented_prompt
# Generate AI response using the provider
logger.info(f"Sending request to {provider.get_provider_type().value} API for {self.get_name()}")
@@ -423,7 +427,6 @@ class SimpleTool(BaseTool):
logger.debug(f"Prompt length: {len(prompt)} characters (~{estimated_tokens:,} tokens)")
# Resolve model capabilities for feature gating
capabilities = self._model_context.capabilities
supports_thinking = capabilities.supports_extended_thinking
# Generate content with provider abstraction

View File

@@ -1480,8 +1480,11 @@ class BaseWorkflowMixin(ABC):
# Get system prompt for this tool with localization support
base_system_prompt = self.get_system_prompt()
capability_augmented_prompt = self._augment_system_prompt_with_capabilities(
base_system_prompt, getattr(self._model_context, "capabilities", None)
)
language_instruction = self.get_language_instruction()
system_prompt = language_instruction + base_system_prompt
system_prompt = language_instruction + capability_augmented_prompt
# Check if tool wants system prompt embedded in main prompt
if self.should_embed_system_prompt():