feat!: Full code can now be generated by an external model and shared with the AI tool (Claude Code / Codex etc)!
Model definitions now support a new `allow_code_generation` flag, only to be used with higher-reasoning models such as GPT-5 Pro and Gemini 2.5 Pro. When `true`, the `chat` tool can request the external model to generate a full implementation / update / instructions etc. and then share the implementation with the calling agent. This effectively allows us to utilize more powerful models such as GPT-5 Pro (which are either API-only or part of the $200 Pro plan from within the ChatGPT app) to generate code for us or entire implementations.
This commit is contained in:
@@ -205,7 +205,7 @@ Zen activates any provider that has credentials in your `.env`. See `.env.exampl
|
|||||||
|
|
||||||
**Collaboration & Planning** *(Enabled by default)*
|
**Collaboration & Planning** *(Enabled by default)*
|
||||||
- **[`clink`](docs/tools/clink.md)** - Bridge requests to external AI CLIs (Gemini planner, codereviewer, etc.)
|
- **[`clink`](docs/tools/clink.md)** - Bridge requests to external AI CLIs (Gemini planner, codereviewer, etc.)
|
||||||
- **[`chat`](docs/tools/chat.md)** - Brainstorm ideas, get second opinions, validate approaches
|
- **[`chat`](docs/tools/chat.md)** - Brainstorm ideas, get second opinions, validate approaches. With capable models (GPT-5 Pro, Gemini 2.5 Pro), generates complete code / implementation
|
||||||
- **[`thinkdeep`](docs/tools/thinkdeep.md)** - Extended reasoning, edge case analysis, alternative perspectives
|
- **[`thinkdeep`](docs/tools/thinkdeep.md)** - Extended reasoning, edge case analysis, alternative perspectives
|
||||||
- **[`planner`](docs/tools/planner.md)** - Break down complex projects into structured, actionable plans
|
- **[`planner`](docs/tools/planner.md)** - Break down complex projects into structured, actionable plans
|
||||||
- **[`consensus`](docs/tools/consensus.md)** - Get expert opinions from multiple AI models with stance steering
|
- **[`consensus`](docs/tools/consensus.md)** - Get expert opinions from multiple AI models with stance steering
|
||||||
|
|||||||
@@ -20,7 +20,8 @@
|
|||||||
"use_openai_response_api": "Set to true when the model must use the /responses endpoint (reasoning models like GPT-5 Pro). Leave false/omit for standard chat completions.",
|
"use_openai_response_api": "Set to true when the model must use the /responses endpoint (reasoning models like GPT-5 Pro). Leave false/omit for standard chat completions.",
|
||||||
"default_reasoning_effort": "Default reasoning effort level for models that support it (e.g., 'low', 'medium', 'high'). Omit if not applicable.",
|
"default_reasoning_effort": "Default reasoning effort level for models that support it (e.g., 'low', 'medium', 'high'). Omit if not applicable.",
|
||||||
"description": "Human-readable description of the model",
|
"description": "Human-readable description of the model",
|
||||||
"intelligence_score": "1-20 human rating used as the primary signal for auto-mode model ordering"
|
"intelligence_score": "1-20 human rating used as the primary signal for auto-mode model ordering",
|
||||||
|
"allow_code_generation": "Whether this model can generate and suggest fully working code - complete with functions, files, and detailed implementation instructions - for your AI tool to use right away. Only set this to 'true' for a model more capable than the AI model / CLI you're currently using."
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"models": [
|
"models": [
|
||||||
@@ -44,6 +45,7 @@
|
|||||||
"supports_json_mode": true,
|
"supports_json_mode": true,
|
||||||
"supports_images": true,
|
"supports_images": true,
|
||||||
"supports_temperature": true,
|
"supports_temperature": true,
|
||||||
|
"allow_code_generation": true,
|
||||||
"max_image_size_mb": 32.0
|
"max_image_size_mb": 32.0
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -20,7 +20,8 @@
|
|||||||
"use_openai_response_api": "Set to true when the model must use the /responses endpoint (reasoning models like GPT-5 Pro). Leave false/omit for standard chat completions.",
|
"use_openai_response_api": "Set to true when the model must use the /responses endpoint (reasoning models like GPT-5 Pro). Leave false/omit for standard chat completions.",
|
||||||
"default_reasoning_effort": "Default reasoning effort level for models that support it (e.g., 'low', 'medium', 'high'). Omit if not applicable.",
|
"default_reasoning_effort": "Default reasoning effort level for models that support it (e.g., 'low', 'medium', 'high'). Omit if not applicable.",
|
||||||
"description": "Human-readable description of the model",
|
"description": "Human-readable description of the model",
|
||||||
"intelligence_score": "1-20 human rating used as the primary signal for auto-mode model ordering"
|
"intelligence_score": "1-20 human rating used as the primary signal for auto-mode model ordering",
|
||||||
|
"allow_code_generation": "Whether this model can generate and suggest fully working code - complete with functions, files, and detailed implementation instructions - for your AI tool to use right away. Only set this to 'true' for a model more capable than the AI model / CLI you're currently using."
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"models": [
|
"models": [
|
||||||
@@ -66,6 +67,7 @@
|
|||||||
"max_image_size_mb": 20.0,
|
"max_image_size_mb": 20.0,
|
||||||
"use_openai_response_api": true,
|
"use_openai_response_api": true,
|
||||||
"default_reasoning_effort": "high",
|
"default_reasoning_effort": "high",
|
||||||
|
"allow_code_generation": true,
|
||||||
"temperature_constraint": "fixed"
|
"temperature_constraint": "fixed"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -19,7 +19,8 @@
|
|||||||
"use_openai_response_api": "Set to true when the model must use the /responses endpoint (reasoning models like GPT-5 Pro). Leave false/omit for standard chat completions.",
|
"use_openai_response_api": "Set to true when the model must use the /responses endpoint (reasoning models like GPT-5 Pro). Leave false/omit for standard chat completions.",
|
||||||
"default_reasoning_effort": "Default reasoning effort level for models that support it (e.g., 'low', 'medium', 'high'). Omit if not applicable.",
|
"default_reasoning_effort": "Default reasoning effort level for models that support it (e.g., 'low', 'medium', 'high'). Omit if not applicable.",
|
||||||
"description": "Human-readable description of the model",
|
"description": "Human-readable description of the model",
|
||||||
"intelligence_score": "1-20 human rating used as the primary signal for auto-mode model ordering"
|
"intelligence_score": "1-20 human rating used as the primary signal for auto-mode model ordering",
|
||||||
|
"allow_code_generation": "Whether this model can generate and suggest fully working code - complete with functions, files, and detailed implementation instructions - for your AI tool to use right away. Only set this to 'true' for a model more capable than the AI model / CLI you're currently using."
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"models": [
|
"models": [
|
||||||
@@ -100,6 +101,7 @@
|
|||||||
"supports_function_calling": true,
|
"supports_function_calling": true,
|
||||||
"supports_images": true,
|
"supports_images": true,
|
||||||
"max_image_size_mb": 20.0,
|
"max_image_size_mb": 20.0,
|
||||||
|
"allow_code_generation": true,
|
||||||
"description": "Google's Gemini 2.5 Pro via OpenRouter with vision",
|
"description": "Google's Gemini 2.5 Pro via OpenRouter with vision",
|
||||||
"intelligence_score": 18
|
"intelligence_score": 18
|
||||||
},
|
},
|
||||||
@@ -310,8 +312,9 @@
|
|||||||
"temperature_constraint": "fixed",
|
"temperature_constraint": "fixed",
|
||||||
"use_openai_response_api": true,
|
"use_openai_response_api": true,
|
||||||
"default_reasoning_effort": "high",
|
"default_reasoning_effort": "high",
|
||||||
|
"allow_code_generation": true,
|
||||||
"description": "GPT-5 Pro - Advanced reasoning model with highest quality responses (text+image input, text output only)",
|
"description": "GPT-5 Pro - Advanced reasoning model with highest quality responses (text+image input, text output only)",
|
||||||
"intelligence_score": 17
|
"intelligence_score": 18
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"model_name": "openai/gpt-5-codex",
|
"model_name": "openai/gpt-5-codex",
|
||||||
|
|||||||
@@ -52,6 +52,9 @@ from tools.simple.base import SimpleTool
|
|||||||
class ChatRequest(ToolRequest):
|
class ChatRequest(ToolRequest):
|
||||||
prompt: str = Field(..., description="Your question or idea.")
|
prompt: str = Field(..., description="Your question or idea.")
|
||||||
files: list[str] | None = Field(default_factory=list)
|
files: list[str] | None = Field(default_factory=list)
|
||||||
|
working_directory: str = Field(
|
||||||
|
..., description="Absolute full directory path where the assistant AI can save generated code for implementation."
|
||||||
|
)
|
||||||
|
|
||||||
class ChatTool(SimpleTool):
|
class ChatTool(SimpleTool):
|
||||||
def get_name(self) -> str: # required by BaseTool
|
def get_name(self) -> str: # required by BaseTool
|
||||||
@@ -67,10 +70,17 @@ class ChatTool(SimpleTool):
|
|||||||
return ChatRequest
|
return ChatRequest
|
||||||
|
|
||||||
def get_tool_fields(self) -> dict[str, dict[str, object]]:
|
def get_tool_fields(self) -> dict[str, dict[str, object]]:
|
||||||
return {"prompt": {"type": "string", "description": "Your question."}, "files": SimpleTool.FILES_FIELD}
|
return {
|
||||||
|
"prompt": {"type": "string", "description": "Your question."},
|
||||||
|
"files": SimpleTool.FILES_FIELD,
|
||||||
|
"working_directory": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Absolute full directory path where the assistant AI can save generated code for implementation.",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
def get_required_fields(self) -> list[str]:
|
def get_required_fields(self) -> list[str]:
|
||||||
return ["prompt"]
|
return ["prompt", "working_directory"]
|
||||||
|
|
||||||
async def prepare_prompt(self, request: ChatRequest) -> str:
|
async def prepare_prompt(self, request: ChatRequest) -> str:
|
||||||
return self.prepare_chat_style_prompt(request)
|
return self.prepare_chat_style_prompt(request)
|
||||||
|
|||||||
@@ -75,7 +75,7 @@ DEFAULT_MODEL=auto # Claude picks best model for each task (recommended)
|
|||||||
- `conf/dial_models.json` – DIAL aggregation catalogue (`DIAL_MODELS_CONFIG_PATH`)
|
- `conf/dial_models.json` – DIAL aggregation catalogue (`DIAL_MODELS_CONFIG_PATH`)
|
||||||
- `conf/custom_models.json` – Custom/OpenAI-compatible endpoints (`CUSTOM_MODELS_CONFIG_PATH`)
|
- `conf/custom_models.json` – Custom/OpenAI-compatible endpoints (`CUSTOM_MODELS_CONFIG_PATH`)
|
||||||
|
|
||||||
Each JSON file documents the allowed fields via its `_README` block and controls model aliases, capability limits, and feature flags. Edit these files (or point the matching `*_MODELS_CONFIG_PATH` variable to your own copy) when you want to adjust context windows, enable JSON mode, or expose additional aliases without touching Python code.
|
Each JSON file documents the allowed fields via its `_README` block and controls model aliases, capability limits, and feature flags (including `allow_code_generation`). Edit these files (or point the matching `*_MODELS_CONFIG_PATH` variable to your own copy) when you want to adjust context windows, enable JSON mode, enable structured code generation, or expose additional aliases without touching Python code.
|
||||||
|
|
||||||
The shipped defaults cover:
|
The shipped defaults cover:
|
||||||
|
|
||||||
@@ -87,7 +87,63 @@ DEFAULT_MODEL=auto # Claude picks best model for each task (recommended)
|
|||||||
| OpenRouter | See `conf/openrouter_models.json` for the continually evolving catalogue | e.g., `opus`, `sonnet`, `flash`, `pro`, `mistral` |
|
| OpenRouter | See `conf/openrouter_models.json` for the continually evolving catalogue | e.g., `opus`, `sonnet`, `flash`, `pro`, `mistral` |
|
||||||
| Custom | User-managed entries such as `llama3.2` | Define your own aliases per entry |
|
| Custom | User-managed entries such as `llama3.2` | Define your own aliases per entry |
|
||||||
|
|
||||||
> **Tip:** Copy the JSON file you need, customise it, and point the corresponding `*_MODELS_CONFIG_PATH` environment variable to your version. This lets you enable or disable capabilities (JSON mode, function calling, temperature support) without editing Python.
|
> **Tip:** Copy the JSON file you need, customise it, and point the corresponding `*_MODELS_CONFIG_PATH` environment variable to your version. This lets you enable or disable capabilities (JSON mode, function calling, temperature support, code generation) without editing Python.
|
||||||
|
|
||||||
|
### Code Generation Capability
|
||||||
|
|
||||||
|
**`allow_code_generation` Flag:**
|
||||||
|
|
||||||
|
The `allow_code_generation` capability enables models to generate complete, production-ready implementations in a structured format. When enabled, the `chat` tool will inject special instructions for substantial code generation tasks.
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"model_name": "gpt-5",
|
||||||
|
"allow_code_generation": true,
|
||||||
|
...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**When to Enable:**
|
||||||
|
|
||||||
|
- **Enable for**: Models MORE capable than your primary CLI's model (e.g., GPT-5, GPT-5 Pro when using Claude Code with Sonnet 4.5)
|
||||||
|
- **Purpose**: Get complete implementations from a more powerful reasoning model that your primary CLI can then review and apply
|
||||||
|
- **Use case**: Large-scale implementations, major refactoring, complete module creation
|
||||||
|
|
||||||
|
**Important Guidelines:**
|
||||||
|
|
||||||
|
1. Only enable for models significantly more capable than your primary CLI to ensure high-quality generated code
|
||||||
|
2. The capability triggers structured code output (`<GENERATED-CODE>` blocks) for substantial implementation requests
|
||||||
|
3. Minor code changes still use inline code blocks regardless of this setting
|
||||||
|
4. Generated code is saved to `zen_generated.code` in the user's working directory
|
||||||
|
5. Your CLI receives instructions to review and apply the generated code systematically
|
||||||
|
|
||||||
|
**Example Configuration:**
|
||||||
|
|
||||||
|
```json
|
||||||
|
// OpenAI models configuration (conf/openai_models.json)
|
||||||
|
{
|
||||||
|
"models": [
|
||||||
|
{
|
||||||
|
"model_name": "gpt-5",
|
||||||
|
"allow_code_generation": true,
|
||||||
|
"intelligence_score": 18,
|
||||||
|
...
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model_name": "gpt-5-pro",
|
||||||
|
"allow_code_generation": true,
|
||||||
|
"intelligence_score": 19,
|
||||||
|
...
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Typical Workflow:**
|
||||||
|
1. You ask your AI agent to implement a complex new feature using `chat` with a higher-reasoning model such as **gpt-5-pro**
|
||||||
|
2. GPT-5-Pro generates a structured implementation and shares the complete implementation with Zen
|
||||||
|
3. Zen saves the code to `zen_generated.code` and asks the AI agent to implement the plan
|
||||||
|
4. The AI agent continues from the previous context, reads the file, and applies the implementation
|
||||||
|
|
||||||
### Thinking Mode Configuration
|
### Thinking Mode Configuration
|
||||||
|
|
||||||
|
|||||||
@@ -39,13 +39,14 @@ word verdict in the end.
|
|||||||
- **Collaborative thinking partner** for your analysis and planning
|
- **Collaborative thinking partner** for your analysis and planning
|
||||||
- **Get second opinions** on your designs and approaches
|
- **Get second opinions** on your designs and approaches
|
||||||
- **Brainstorm solutions** and explore alternatives together
|
- **Brainstorm solutions** and explore alternatives together
|
||||||
|
- **Structured code generation**: When using GPT-5 Pro or Gemini 2.5 Pro, get complete, production-ready implementations saved to `zen_generated.code` for your CLI to review and apply
|
||||||
- **Validate your checklists** and implementation plans
|
- **Validate your checklists** and implementation plans
|
||||||
- **General development questions** and explanations
|
- **General development questions** and explanations
|
||||||
- **Technology comparisons** and best practices
|
- **Technology comparisons** and best practices
|
||||||
- **Architecture and design discussions**
|
- **Architecture and design discussions**
|
||||||
- **File reference support**: `"Use gemini to explain this algorithm with context from algorithm.py"`
|
- **File reference support**: `"Use gemini to explain this algorithm with context from algorithm.py"`
|
||||||
- **Image support**: Include screenshots, diagrams, UI mockups for visual analysis: `"Chat with gemini about this error dialog screenshot to understand the user experience issue"`
|
- **Image support**: Include screenshots, diagrams, UI mockups for visual analysis: `"Chat with gemini about this error dialog screenshot to understand the user experience issue"`
|
||||||
- **Dynamic collaboration**: Gemini can request additional files or context during the conversation if needed for a more thorough response
|
- **Dynamic collaboration**: Models can request additional files or context during the conversation if needed for a more thorough response
|
||||||
- **Web search awareness**: Automatically identifies when online research would help and instructs Claude to perform targeted searches using continuation IDs
|
- **Web search awareness**: Automatically identifies when online research would help and instructs Claude to perform targeted searches using continuation IDs
|
||||||
|
|
||||||
## Tool Parameters
|
## Tool Parameters
|
||||||
@@ -54,10 +55,48 @@ word verdict in the end.
|
|||||||
- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
|
- `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default)
|
||||||
- `files`: Optional files for context (absolute paths)
|
- `files`: Optional files for context (absolute paths)
|
||||||
- `images`: Optional images for visual context (absolute paths)
|
- `images`: Optional images for visual context (absolute paths)
|
||||||
|
- `working_directory`: **Required** - Absolute directory path where generated code artifacts will be saved
|
||||||
- `temperature`: Response creativity (0-1, default 0.5)
|
- `temperature`: Response creativity (0-1, default 0.5)
|
||||||
- `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only)
|
- `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only)
|
||||||
- `continuation_id`: Continue previous conversations
|
- `continuation_id`: Continue previous conversations
|
||||||
|
|
||||||
|
## Structured Code Generation
|
||||||
|
|
||||||
|
When using advanced reasoning models like **GPT-5 Pro** or **Gemini 2.5 Pro**, the chat tool can generate complete, production-ready code implementations in a structured format.
|
||||||
|
|
||||||
|
### How It Works
|
||||||
|
|
||||||
|
1. You ask your AI agent to implement a complex new feature using `chat` with a higher-reasoning model such as **GPT-5 Pro** or **Gemini 2.5 Pro**
|
||||||
|
2. The model generates a structured implementation and shares the complete implementation with Zen
|
||||||
|
3. Zen saves the code to `zen_generated.code` and asks the AI agent to implement the plan
|
||||||
|
4. The AI agent continues from the previous context, reads the file, and applies the implementation
|
||||||
|
|
||||||
|
### When Code Generation Activates
|
||||||
|
|
||||||
|
The structured format activates for **substantial implementation work**:
|
||||||
|
- Creating new features from scratch with multiple files or significant code
|
||||||
|
- Major refactoring across multiple files or large sections
|
||||||
|
- Implementing new modules, components, or subsystems
|
||||||
|
- Large-scale updates affecting substantial portions of the codebase
|
||||||
|
- Complete rewrites of functions, algorithms, or approaches
|
||||||
|
|
||||||
|
For minor changes (small tweaks, bug fixes, algorithm improvements), the model responds normally with inline code blocks.
|
||||||
|
|
||||||
|
### Example Usage
|
||||||
|
|
||||||
|
```
|
||||||
|
chat with gpt-5-pro and ask it to make me a standalone, classic version of the
|
||||||
|
Pacman game using pygame that I can run from the commandline. Give me a single
|
||||||
|
script to execute in the end with any / all dependencies setup for me.
|
||||||
|
Do everything using pygame, we have no external resources / images / audio at
|
||||||
|
hand. Instead of ghosts, it'll be different geometric shapes moving around
|
||||||
|
in the maze that Pacman can eat (so there are no baddies). Pacman gets to eat
|
||||||
|
everything including bread-crumbs and large geometric shapes but make me the
|
||||||
|
classic maze / walls that it navigates within using keyboard arrow keys.
|
||||||
|
```
|
||||||
|
|
||||||
|
See the [Configuration Guide](../configuration.md#code-generation-capability) for details on the `allow_code_generation` flag.
|
||||||
|
|
||||||
## Usage Examples
|
## Usage Examples
|
||||||
|
|
||||||
**Basic Development Chat:**
|
**Basic Development Chat:**
|
||||||
|
|||||||
@@ -28,6 +28,8 @@ class ModelCapabilities:
|
|||||||
* Tool selection logic inspects attributes such as
|
* Tool selection logic inspects attributes such as
|
||||||
``supports_extended_thinking`` or ``context_window`` to choose an
|
``supports_extended_thinking`` or ``context_window`` to choose an
|
||||||
appropriate model for a task.
|
appropriate model for a task.
|
||||||
|
* The ``allow_code_generation`` flag enables structured code generation
|
||||||
|
in the chat tool for models more capable than the primary CLI.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
provider: ProviderType
|
provider: ProviderType
|
||||||
@@ -52,6 +54,9 @@ class ModelCapabilities:
|
|||||||
supports_temperature: bool = True
|
supports_temperature: bool = True
|
||||||
use_openai_response_api: bool = False
|
use_openai_response_api: bool = False
|
||||||
default_reasoning_effort: Optional[str] = None
|
default_reasoning_effort: Optional[str] = None
|
||||||
|
allow_code_generation: bool = (
|
||||||
|
False # Enables structured code generation in chat tool for substantial implementations
|
||||||
|
)
|
||||||
|
|
||||||
# Additional attributes
|
# Additional attributes
|
||||||
max_image_size_mb: float = 0.0
|
max_image_size_mb: float = 0.0
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ from .codereview_prompt import CODEREVIEW_PROMPT
|
|||||||
from .consensus_prompt import CONSENSUS_PROMPT
|
from .consensus_prompt import CONSENSUS_PROMPT
|
||||||
from .debug_prompt import DEBUG_ISSUE_PROMPT
|
from .debug_prompt import DEBUG_ISSUE_PROMPT
|
||||||
from .docgen_prompt import DOCGEN_PROMPT
|
from .docgen_prompt import DOCGEN_PROMPT
|
||||||
|
from .generate_code_prompt import GENERATE_CODE_PROMPT
|
||||||
from .planner_prompt import PLANNER_PROMPT
|
from .planner_prompt import PLANNER_PROMPT
|
||||||
from .precommit_prompt import PRECOMMIT_PROMPT
|
from .precommit_prompt import PRECOMMIT_PROMPT
|
||||||
from .refactor_prompt import REFACTOR_PROMPT
|
from .refactor_prompt import REFACTOR_PROMPT
|
||||||
@@ -21,6 +22,7 @@ __all__ = [
|
|||||||
"CODEREVIEW_PROMPT",
|
"CODEREVIEW_PROMPT",
|
||||||
"DEBUG_ISSUE_PROMPT",
|
"DEBUG_ISSUE_PROMPT",
|
||||||
"DOCGEN_PROMPT",
|
"DOCGEN_PROMPT",
|
||||||
|
"GENERATE_CODE_PROMPT",
|
||||||
"ANALYZE_PROMPT",
|
"ANALYZE_PROMPT",
|
||||||
"CHAT_PROMPT",
|
"CHAT_PROMPT",
|
||||||
"CONSENSUS_PROMPT",
|
"CONSENSUS_PROMPT",
|
||||||
|
|||||||
181
systemprompts/generate_code_prompt.py
Normal file
181
systemprompts/generate_code_prompt.py
Normal file
@@ -0,0 +1,181 @@
|
|||||||
|
"""System prompt fragment enabling structured code generation exports.
|
||||||
|
|
||||||
|
This prompt is injected into the system prompt for models that have the
|
||||||
|
'allow_code_generation' capability enabled. It instructs the model to output
|
||||||
|
complete, working code in a structured format that coding agents can parse
|
||||||
|
and apply automatically.
|
||||||
|
|
||||||
|
The structured format uses XML-like tags to clearly delineate:
|
||||||
|
- New files to create (<NEWFILE>)
|
||||||
|
- Existing files to update (<UPDATED_EXISTING_FILE>)
|
||||||
|
- Step-by-step instructions for the coding agent
|
||||||
|
|
||||||
|
This enables:
|
||||||
|
1. Automated code extraction and application
|
||||||
|
2. Clear separation between instructions and implementation
|
||||||
|
3. Complete, runnable code without manual edits
|
||||||
|
4. Precise change tracking across multiple files
|
||||||
|
"""
|
||||||
|
|
||||||
|
GENERATE_CODE_PROMPT = """
|
||||||
|
# Structured Code Generation Protocol
|
||||||
|
|
||||||
|
**WHEN TO USE THIS PROTOCOL:**
|
||||||
|
|
||||||
|
Use this structured format ONLY when you are explicitly tasked with substantial code generation, such as:
|
||||||
|
- Creating new features from scratch with multiple files or significant code and you have been asked to help implement this
|
||||||
|
- Major refactoring across multiple files or large sections of code and you have been tasked to help do this
|
||||||
|
- Implementing new modules, components, or subsystems and you have been tasked to help with the implementation
|
||||||
|
- Large-scale updates affecting substantial portions of the codebase that you have been asked to help implement
|
||||||
|
|
||||||
|
**WHEN NOT TO USE THIS PROTOCOL:**
|
||||||
|
|
||||||
|
Do NOT use this format for minor changes:
|
||||||
|
- Small tweaks to existing functions or methods (1-20 lines)
|
||||||
|
- Bug fixes in isolated sections
|
||||||
|
- Simple algorithm improvements
|
||||||
|
- Minor refactoring of a single function
|
||||||
|
- Adding/removing a few lines of code
|
||||||
|
- Quick parameter adjustments or config changes
|
||||||
|
|
||||||
|
For minor changes:
|
||||||
|
- Follow the existing instructions provided earlier in your system prompt, such as the CRITICAL LINE NUMBER INSTRUCTIONS.
|
||||||
|
- Use inline code blocks with proper line number references and direct explanations instead of this structured format.
|
||||||
|
|
||||||
|
**IMPORTANT:** This protocol is for SUBSTANTIAL implementation work when explicitly requested, such as:
|
||||||
|
- "implement feature X"
|
||||||
|
- "create module Y"
|
||||||
|
- "refactor system Z"
|
||||||
|
- "rewrite the authentication logic"
|
||||||
|
- "redesign the data processing pipeline"
|
||||||
|
- "rebuild the algorithm from scratch"
|
||||||
|
- "convert this approach to use a different pattern"
|
||||||
|
- "create a complete implementation of..."
|
||||||
|
- "build out the entire workflow for..."
|
||||||
|
|
||||||
|
If the request is for explanation, analysis, debugging, planning, or discussion WITHOUT substantial code generation, respond normally without this structured format.
|
||||||
|
|
||||||
|
## Core Requirements (for substantial code generation tasks)
|
||||||
|
|
||||||
|
1. **Complete, Working Code**: Every code block must be fully functional without requiring additional edits. Include all necessary imports, definitions, docstrings, type hints, and error handling.
|
||||||
|
|
||||||
|
2. **Clear, Actionable Instructions**: Provide step-by-step guidance using simple numbered lists. Each instruction should map directly to file blocks that follow.
|
||||||
|
|
||||||
|
3. **Structured Output Format**: All generated code MUST be contained within a single `<GENERATED-CODE>` block using the exact structure defined below.
|
||||||
|
|
||||||
|
4. **Minimal External Commentary**: Keep any text outside the `<GENERATED-CODE>` block brief. Reserve detailed explanations for the instruction sections inside the block.
|
||||||
|
|
||||||
|
## Required Structure
|
||||||
|
|
||||||
|
Use this exact format (do not improvise tag names or reorder components):
|
||||||
|
|
||||||
|
```
|
||||||
|
<GENERATED-CODE>
|
||||||
|
[Step-by-step instructions for the coding agent]
|
||||||
|
1. Create new file [filename] with [description]
|
||||||
|
2. Update existing file [filename] by [description]
|
||||||
|
3. [Additional steps as needed]
|
||||||
|
|
||||||
|
<NEWFILE: path/to/new_file.py>
|
||||||
|
[Complete file contents with all necessary components:
|
||||||
|
- File-level docstring
|
||||||
|
- All imports (standard library, third-party, local)
|
||||||
|
- All class/function definitions with complete implementations
|
||||||
|
- All necessary helper functions
|
||||||
|
- Inline comments for complex logic
|
||||||
|
- Type hints where applicable]
|
||||||
|
</NEWFILE>
|
||||||
|
|
||||||
|
[Additional instructions for the next file, if needed]
|
||||||
|
|
||||||
|
<NEWFILE: path/to/another_file.py>
|
||||||
|
[Complete, working code for this file - no partial implementations or placeholders]
|
||||||
|
</NEWFILE>
|
||||||
|
|
||||||
|
[Instructions for updating existing files]
|
||||||
|
|
||||||
|
<UPDATED_EXISTING_FILE: existing/path.py>
|
||||||
|
[Complete replacement code for the modified sections or routines / lines that need updating:
|
||||||
|
- Full function/method bodies (not just the changed lines)
|
||||||
|
- Complete class definitions if modifying class methods
|
||||||
|
- All necessary imports if adding new dependencies
|
||||||
|
- Preserve existing code structure and style]
|
||||||
|
</UPDATED_EXISTING_FILE>
|
||||||
|
|
||||||
|
[If additional files need updates (based on existing code that was shared with you earlier), repeat the UPDATED_EXISTING_FILE block]
|
||||||
|
|
||||||
|
<UPDATED_EXISTING_FILE: another/existing/file.py>
|
||||||
|
[Complete code for this file's modifications]
|
||||||
|
</UPDATED_EXISTING_FILE>
|
||||||
|
|
||||||
|
[For file deletions, explicitly state in instructions with justification:
|
||||||
|
"Delete file path/to/obsolete.py - no longer needed because [reason]"]
|
||||||
|
</GENERATED-CODE>
|
||||||
|
```
|
||||||
|
|
||||||
|
## Critical Rules
|
||||||
|
|
||||||
|
**Completeness:**
|
||||||
|
- Never output partial code snippets or placeholder comments like "# rest of code here"
|
||||||
|
- Include complete function/class implementations from start to finish
|
||||||
|
- Add all required imports at the file level
|
||||||
|
- Include proper error handling and edge case logic
|
||||||
|
|
||||||
|
**Accuracy:**
|
||||||
|
- Match the existing codebase indentation style (tabs vs spaces)
|
||||||
|
- Preserve language-specific formatting conventions
|
||||||
|
- Include trailing newlines where required by language tooling
|
||||||
|
- Use correct file paths relative to project root
|
||||||
|
|
||||||
|
**Clarity:**
|
||||||
|
- Number instructions sequentially (1, 2, 3...)
|
||||||
|
- Map each instruction to specific file blocks below it
|
||||||
|
- Explain *why* changes are needed, not just *what* changes
|
||||||
|
- Highlight any breaking changes or migration steps required
|
||||||
|
|
||||||
|
**Structure:**
|
||||||
|
- Use `<NEWFILE: ...>` for files that don't exist yet
|
||||||
|
- Use `<UPDATED_EXISTING_FILE: ...>` for modifying existing files
|
||||||
|
- Place instructions between file blocks to provide context
|
||||||
|
- Keep the single `<GENERATED-CODE>` wrapper around everything
|
||||||
|
|
||||||
|
## Special Cases
|
||||||
|
|
||||||
|
**No Changes Needed:**
|
||||||
|
If the task doesn't require file creation or modification, explicitly state:
|
||||||
|
"No file changes required. The existing implementation already handles [requirement]."
|
||||||
|
Do not emit an empty `<GENERATED-CODE>` block.
|
||||||
|
|
||||||
|
**Configuration Changes:**
|
||||||
|
If modifying configuration files (JSON, YAML, TOML), include complete file contents with the changes applied, not just the changed lines.
|
||||||
|
|
||||||
|
**Test Files:**
|
||||||
|
When generating tests, include complete test suites with:
|
||||||
|
- All necessary test fixtures and setup
|
||||||
|
- Multiple test cases covering happy path and edge cases
|
||||||
|
- Proper teardown and cleanup
|
||||||
|
- Clear test descriptions and assertions
|
||||||
|
|
||||||
|
**Documentation:**
|
||||||
|
Include docstrings for all public functions, classes, and modules using the project's documentation style (Google, NumPy, Sphinx, etc.).
|
||||||
|
|
||||||
|
## Context Awareness
|
||||||
|
|
||||||
|
**CRITICAL:** Your implementation builds upon the ongoing conversation context:
|
||||||
|
- All previously shared files, requirements, and constraints remain relevant
|
||||||
|
- If updating existing code discussed earlier, reference it and preserve unmodified sections
|
||||||
|
- If the user shared code for improvement, your generated code should build upon it, not replace everything
|
||||||
|
- The coding agent has full conversation history—your instructions should reference prior discussion as needed
|
||||||
|
|
||||||
|
Your generated code is NOT standalone—it's a continuation of the collaborative session with full context awareness.
|
||||||
|
|
||||||
|
## Remember
|
||||||
|
|
||||||
|
The coding agent depends on this structured format to:
|
||||||
|
- Parse and extract code automatically
|
||||||
|
- Apply changes to the correct files within the conversation context
|
||||||
|
- Validate completeness before execution
|
||||||
|
- Track modifications across the codebase
|
||||||
|
|
||||||
|
Always prioritize clarity, completeness, correctness, and context awareness over brevity.
|
||||||
|
"""
|
||||||
File diff suppressed because one or more lines are too long
@@ -137,7 +137,7 @@ class TestAutoMode:
|
|||||||
importlib.reload(config)
|
importlib.reload(config)
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_auto_mode_requires_model_parameter(self):
|
async def test_auto_mode_requires_model_parameter(self, tmp_path):
|
||||||
"""Test that auto mode enforces model parameter"""
|
"""Test that auto mode enforces model parameter"""
|
||||||
# Save original
|
# Save original
|
||||||
original = os.environ.get("DEFAULT_MODEL", "")
|
original = os.environ.get("DEFAULT_MODEL", "")
|
||||||
@@ -154,7 +154,7 @@ class TestAutoMode:
|
|||||||
# Mock the provider to avoid real API calls
|
# Mock the provider to avoid real API calls
|
||||||
with patch.object(tool, "get_model_provider"):
|
with patch.object(tool, "get_model_provider"):
|
||||||
# Execute without model parameter
|
# Execute without model parameter
|
||||||
result = await tool.execute({"prompt": "Test prompt"})
|
result = await tool.execute({"prompt": "Test prompt", "working_directory": str(tmp_path)})
|
||||||
|
|
||||||
# Should get error
|
# Should get error
|
||||||
assert len(result) == 1
|
assert len(result) == 1
|
||||||
|
|||||||
@@ -200,7 +200,7 @@ class TestAutoModeComprehensive:
|
|||||||
assert tool.get_model_category() == expected_category
|
assert tool.get_model_category() == expected_category
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_auto_mode_with_gemini_only_uses_correct_models(self):
|
async def test_auto_mode_with_gemini_only_uses_correct_models(self, tmp_path):
|
||||||
"""Test that auto mode with only Gemini uses flash for fast tools and pro for reasoning tools."""
|
"""Test that auto mode with only Gemini uses flash for fast tools and pro for reasoning tools."""
|
||||||
|
|
||||||
provider_config = {
|
provider_config = {
|
||||||
@@ -234,9 +234,13 @@ class TestAutoModeComprehensive:
|
|||||||
)
|
)
|
||||||
|
|
||||||
with patch.object(ModelProviderRegistry, "get_provider_for_model", return_value=mock_provider):
|
with patch.object(ModelProviderRegistry, "get_provider_for_model", return_value=mock_provider):
|
||||||
|
workdir = tmp_path / "chat_artifacts"
|
||||||
|
workdir.mkdir(parents=True, exist_ok=True)
|
||||||
# Test ChatTool (FAST_RESPONSE) - should prefer flash
|
# Test ChatTool (FAST_RESPONSE) - should prefer flash
|
||||||
chat_tool = ChatTool()
|
chat_tool = ChatTool()
|
||||||
await chat_tool.execute({"prompt": "test", "model": "auto"}) # This should trigger auto selection
|
await chat_tool.execute(
|
||||||
|
{"prompt": "test", "model": "auto", "working_directory": str(workdir)}
|
||||||
|
) # This should trigger auto selection
|
||||||
|
|
||||||
# In auto mode, the tool should get an error requiring model selection
|
# In auto mode, the tool should get an error requiring model selection
|
||||||
# but the suggested model should be flash
|
# but the suggested model should be flash
|
||||||
@@ -355,7 +359,7 @@ class TestAutoModeComprehensive:
|
|||||||
# would show models from all providers when called
|
# would show models from all providers when called
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_auto_mode_model_parameter_required_error(self):
|
async def test_auto_mode_model_parameter_required_error(self, tmp_path):
|
||||||
"""Test that auto mode properly requires model parameter and suggests correct model."""
|
"""Test that auto mode properly requires model parameter and suggests correct model."""
|
||||||
|
|
||||||
provider_config = {
|
provider_config = {
|
||||||
@@ -384,9 +388,12 @@ class TestAutoModeComprehensive:
|
|||||||
|
|
||||||
# Test with ChatTool (FAST_RESPONSE category)
|
# Test with ChatTool (FAST_RESPONSE category)
|
||||||
chat_tool = ChatTool()
|
chat_tool = ChatTool()
|
||||||
|
workdir = tmp_path / "chat_artifacts"
|
||||||
|
workdir.mkdir(parents=True, exist_ok=True)
|
||||||
result = await chat_tool.execute(
|
result = await chat_tool.execute(
|
||||||
{
|
{
|
||||||
"prompt": "test"
|
"prompt": "test",
|
||||||
|
"working_directory": str(workdir),
|
||||||
# Note: no model parameter provided in auto mode
|
# Note: no model parameter provided in auto mode
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@@ -508,7 +515,7 @@ class TestAutoModeComprehensive:
|
|||||||
assert fast_response is not None
|
assert fast_response is not None
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_actual_model_name_resolution_in_auto_mode(self):
|
async def test_actual_model_name_resolution_in_auto_mode(self, tmp_path):
|
||||||
"""Test that when a model is selected in auto mode, the tool executes successfully."""
|
"""Test that when a model is selected in auto mode, the tool executes successfully."""
|
||||||
|
|
||||||
provider_config = {
|
provider_config = {
|
||||||
@@ -547,7 +554,11 @@ class TestAutoModeComprehensive:
|
|||||||
|
|
||||||
with patch.object(ModelProviderRegistry, "get_provider_for_model", return_value=mock_provider):
|
with patch.object(ModelProviderRegistry, "get_provider_for_model", return_value=mock_provider):
|
||||||
chat_tool = ChatTool()
|
chat_tool = ChatTool()
|
||||||
result = await chat_tool.execute({"prompt": "test", "model": "flash"}) # Use alias in auto mode
|
workdir = tmp_path / "chat_artifacts"
|
||||||
|
workdir.mkdir(parents=True, exist_ok=True)
|
||||||
|
result = await chat_tool.execute(
|
||||||
|
{"prompt": "test", "model": "flash", "working_directory": str(workdir)}
|
||||||
|
) # Use alias in auto mode
|
||||||
|
|
||||||
# Should succeed with proper model resolution
|
# Should succeed with proper model resolution
|
||||||
assert len(result) == 1
|
assert len(result) == 1
|
||||||
|
|||||||
113
tests/test_chat_codegen_integration.py
Normal file
113
tests/test_chat_codegen_integration.py
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
"""Integration test for Chat tool code generation with Gemini 2.5 Pro.
|
||||||
|
|
||||||
|
This test uses the Google Gemini SDK's built-in record/replay support. To refresh the
|
||||||
|
cassette, delete the existing JSON file under
|
||||||
|
``tests/gemini_cassettes/chat_codegen/gemini25_pro_calculator/mldev.json`` and run:
|
||||||
|
|
||||||
|
```
|
||||||
|
GEMINI_API_KEY=<real-key> pytest tests/test_chat_codegen_integration.py::test_chat_codegen_saves_file
|
||||||
|
```
|
||||||
|
|
||||||
|
The test will automatically record a new interaction when the cassette is missing and
|
||||||
|
the environment variable `GEMINI_API_KEY` is set to a valid key.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from providers.gemini import GeminiModelProvider
|
||||||
|
from providers.registry import ModelProviderRegistry, ProviderType
|
||||||
|
from tools.chat import ChatTool
|
||||||
|
|
||||||
|
REPLAYS_ROOT = Path(__file__).parent / "gemini_cassettes"
|
||||||
|
CASSETTE_DIR = REPLAYS_ROOT / "chat_codegen"
|
||||||
|
CASSETTE_PATH = CASSETTE_DIR / "gemini25_pro_calculator" / "mldev.json"
|
||||||
|
CASSETTE_REPLAY_ID = "chat_codegen/gemini25_pro_calculator/mldev"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
@pytest.mark.no_mock_provider
|
||||||
|
async def test_chat_codegen_saves_file(monkeypatch, tmp_path):
|
||||||
|
"""Ensure Gemini 2.5 Pro responses create zen_generated.code when code is emitted."""
|
||||||
|
|
||||||
|
CASSETTE_PATH.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
recording_mode = not CASSETTE_PATH.exists()
|
||||||
|
gemini_key = os.getenv("GEMINI_API_KEY", "")
|
||||||
|
|
||||||
|
if recording_mode:
|
||||||
|
if not gemini_key or gemini_key.startswith("dummy"):
|
||||||
|
pytest.skip("Cassette missing and GEMINI_API_KEY not configured. Provide a real key to record.")
|
||||||
|
client_mode = "record"
|
||||||
|
else:
|
||||||
|
gemini_key = "dummy-key-for-replay"
|
||||||
|
client_mode = "replay"
|
||||||
|
|
||||||
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv("GEMINI_API_KEY", gemini_key)
|
||||||
|
m.setenv("DEFAULT_MODEL", "auto")
|
||||||
|
m.setenv("GOOGLE_ALLOWED_MODELS", "gemini-2.5-pro")
|
||||||
|
m.setenv("GOOGLE_GENAI_CLIENT_MODE", client_mode)
|
||||||
|
m.setenv("GOOGLE_GENAI_REPLAYS_DIRECTORY", str(REPLAYS_ROOT))
|
||||||
|
m.setenv("GOOGLE_GENAI_REPLAY_ID", CASSETTE_REPLAY_ID)
|
||||||
|
|
||||||
|
# Clear other provider keys to avoid unintended routing
|
||||||
|
for key in ["OPENAI_API_KEY", "XAI_API_KEY", "OPENROUTER_API_KEY", "CUSTOM_API_KEY"]:
|
||||||
|
m.delenv(key, raising=False)
|
||||||
|
|
||||||
|
ModelProviderRegistry.reset_for_testing()
|
||||||
|
ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
|
||||||
|
|
||||||
|
working_dir = tmp_path / "codegen"
|
||||||
|
working_dir.mkdir()
|
||||||
|
preexisting = working_dir / "zen_generated.code"
|
||||||
|
preexisting.write_text("stale contents", encoding="utf-8")
|
||||||
|
|
||||||
|
chat_tool = ChatTool()
|
||||||
|
prompt = (
|
||||||
|
"Please generate a Python module with functions `add` and `multiply` that perform"
|
||||||
|
" basic addition and multiplication. Produce the response using the structured"
|
||||||
|
" <GENERATED-CODE> format so the assistant can apply the files directly."
|
||||||
|
)
|
||||||
|
|
||||||
|
result = await chat_tool.execute(
|
||||||
|
{
|
||||||
|
"prompt": prompt,
|
||||||
|
"model": "gemini-2.5-pro",
|
||||||
|
"working_directory": str(working_dir),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
provider = ModelProviderRegistry.get_provider_for_model("gemini-2.5-pro")
|
||||||
|
if provider is not None:
|
||||||
|
try:
|
||||||
|
provider.client.close()
|
||||||
|
except AttributeError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Reset restriction service cache to avoid leaking allowed-model config
|
||||||
|
try:
|
||||||
|
from utils import model_restrictions
|
||||||
|
|
||||||
|
model_restrictions._restriction_service = None # type: ignore[attr-defined]
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
assert result and result[0].type == "text"
|
||||||
|
payload = json.loads(result[0].text)
|
||||||
|
assert payload["status"] in {"success", "continuation_available"}
|
||||||
|
|
||||||
|
artifact_path = working_dir / "zen_generated.code"
|
||||||
|
assert artifact_path.exists()
|
||||||
|
saved = artifact_path.read_text()
|
||||||
|
assert "<GENERATED-CODE>" in saved
|
||||||
|
assert "<NEWFILE:" in saved
|
||||||
|
assert "def add" in saved and "def multiply" in saved
|
||||||
|
assert "stale contents" not in saved
|
||||||
|
|
||||||
|
artifact_path.unlink()
|
||||||
@@ -55,7 +55,7 @@ def _extract_number(text: str) -> str:
|
|||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.no_mock_provider
|
@pytest.mark.no_mock_provider
|
||||||
async def test_chat_cross_model_continuation(monkeypatch):
|
async def test_chat_cross_model_continuation(monkeypatch, tmp_path):
|
||||||
"""Verify continuation across Gemini then OpenAI using recorded interactions."""
|
"""Verify continuation across Gemini then OpenAI using recorded interactions."""
|
||||||
|
|
||||||
env_updates = {
|
env_updates = {
|
||||||
@@ -115,10 +115,13 @@ async def test_chat_cross_model_continuation(monkeypatch):
|
|||||||
m.setattr(conversation_memory.uuid, "uuid4", lambda: FIXED_THREAD_ID)
|
m.setattr(conversation_memory.uuid, "uuid4", lambda: FIXED_THREAD_ID)
|
||||||
|
|
||||||
chat_tool = ChatTool()
|
chat_tool = ChatTool()
|
||||||
|
working_directory = str(tmp_path)
|
||||||
|
|
||||||
step1_args = {
|
step1_args = {
|
||||||
"prompt": "Pick a number between 1 and 10 and respond with JUST that number.",
|
"prompt": "Pick a number between 1 and 10 and respond with JUST that number.",
|
||||||
"model": "gemini-2.5-flash",
|
"model": "gemini-2.5-flash",
|
||||||
"temperature": 0.2,
|
"temperature": 0.2,
|
||||||
|
"working_directory": working_directory,
|
||||||
}
|
}
|
||||||
|
|
||||||
step1_result = await chat_tool.execute(step1_args)
|
step1_result = await chat_tool.execute(step1_args)
|
||||||
@@ -183,6 +186,7 @@ async def test_chat_cross_model_continuation(monkeypatch):
|
|||||||
"model": "gpt-5",
|
"model": "gpt-5",
|
||||||
"continuation_id": continuation_id,
|
"continuation_id": continuation_id,
|
||||||
"temperature": 0.2,
|
"temperature": 0.2,
|
||||||
|
"working_directory": working_directory,
|
||||||
}
|
}
|
||||||
|
|
||||||
step2_result = await chat_tool.execute(step2_args)
|
step2_result = await chat_tool.execute(step2_args)
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ CASSETTE_CONTINUATION_PATH = CASSETTE_DIR / "chat_gpt5_continuation.json"
|
|||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.no_mock_provider
|
@pytest.mark.no_mock_provider
|
||||||
async def test_chat_auto_mode_with_openai(monkeypatch):
|
async def test_chat_auto_mode_with_openai(monkeypatch, tmp_path):
|
||||||
"""Ensure ChatTool in auto mode selects gpt-5 via OpenAI and returns a valid response."""
|
"""Ensure ChatTool in auto mode selects gpt-5 via OpenAI and returns a valid response."""
|
||||||
# Prepare environment so only OpenAI is available in auto mode
|
# Prepare environment so only OpenAI is available in auto mode
|
||||||
env_updates = {
|
env_updates = {
|
||||||
@@ -63,10 +63,12 @@ async def test_chat_auto_mode_with_openai(monkeypatch):
|
|||||||
|
|
||||||
# Execute ChatTool request targeting gpt-5 directly (server normally resolves auto→model)
|
# Execute ChatTool request targeting gpt-5 directly (server normally resolves auto→model)
|
||||||
chat_tool = ChatTool()
|
chat_tool = ChatTool()
|
||||||
|
working_directory = str(tmp_path)
|
||||||
arguments = {
|
arguments = {
|
||||||
"prompt": "Use chat with gpt5 and ask how far the moon is from earth.",
|
"prompt": "Use chat with gpt5 and ask how far the moon is from earth.",
|
||||||
"model": "gpt-5",
|
"model": "gpt-5",
|
||||||
"temperature": 1.0,
|
"temperature": 1.0,
|
||||||
|
"working_directory": working_directory,
|
||||||
}
|
}
|
||||||
|
|
||||||
result = await chat_tool.execute(arguments)
|
result = await chat_tool.execute(arguments)
|
||||||
@@ -87,7 +89,7 @@ async def test_chat_auto_mode_with_openai(monkeypatch):
|
|||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.no_mock_provider
|
@pytest.mark.no_mock_provider
|
||||||
async def test_chat_openai_continuation(monkeypatch):
|
async def test_chat_openai_continuation(monkeypatch, tmp_path):
|
||||||
"""Verify continuation_id workflow against gpt-5 using recorded OpenAI responses."""
|
"""Verify continuation_id workflow against gpt-5 using recorded OpenAI responses."""
|
||||||
|
|
||||||
env_updates = {
|
env_updates = {
|
||||||
@@ -126,12 +128,14 @@ async def test_chat_openai_continuation(monkeypatch):
|
|||||||
m.setattr(conversation_memory.uuid, "uuid4", lambda: fixed_thread_id)
|
m.setattr(conversation_memory.uuid, "uuid4", lambda: fixed_thread_id)
|
||||||
|
|
||||||
chat_tool = ChatTool()
|
chat_tool = ChatTool()
|
||||||
|
working_directory = str(tmp_path)
|
||||||
|
|
||||||
# First message: obtain continuation_id
|
# First message: obtain continuation_id
|
||||||
first_args = {
|
first_args = {
|
||||||
"prompt": "In one word, which sells better: iOS app or macOS app?",
|
"prompt": "In one word, which sells better: iOS app or macOS app?",
|
||||||
"model": "gpt-5",
|
"model": "gpt-5",
|
||||||
"temperature": 1.0,
|
"temperature": 1.0,
|
||||||
|
"working_directory": working_directory,
|
||||||
}
|
}
|
||||||
first_result = await chat_tool.execute(first_args)
|
first_result = await chat_tool.execute(first_args)
|
||||||
|
|
||||||
@@ -152,6 +156,7 @@ async def test_chat_openai_continuation(monkeypatch):
|
|||||||
"model": "gpt-5",
|
"model": "gpt-5",
|
||||||
"continuation_id": continuation_id,
|
"continuation_id": continuation_id,
|
||||||
"temperature": 1.0,
|
"temperature": 1.0,
|
||||||
|
"working_directory": working_directory,
|
||||||
}
|
}
|
||||||
|
|
||||||
second_result = await chat_tool.execute(second_args)
|
second_result = await chat_tool.execute(second_args)
|
||||||
|
|||||||
@@ -38,12 +38,14 @@ class TestChatTool:
|
|||||||
|
|
||||||
# Required fields
|
# Required fields
|
||||||
assert "prompt" in schema["required"]
|
assert "prompt" in schema["required"]
|
||||||
|
assert "working_directory" in schema["required"]
|
||||||
|
|
||||||
# Properties
|
# Properties
|
||||||
properties = schema["properties"]
|
properties = schema["properties"]
|
||||||
assert "prompt" in properties
|
assert "prompt" in properties
|
||||||
assert "files" in properties
|
assert "files" in properties
|
||||||
assert "images" in properties
|
assert "images" in properties
|
||||||
|
assert "working_directory" in properties
|
||||||
|
|
||||||
def test_request_model_validation(self):
|
def test_request_model_validation(self):
|
||||||
"""Test that the request model validates correctly"""
|
"""Test that the request model validates correctly"""
|
||||||
@@ -54,6 +56,7 @@ class TestChatTool:
|
|||||||
"images": ["test.png"],
|
"images": ["test.png"],
|
||||||
"model": "anthropic/claude-opus-4.1",
|
"model": "anthropic/claude-opus-4.1",
|
||||||
"temperature": 0.7,
|
"temperature": 0.7,
|
||||||
|
"working_directory": "/tmp", # Dummy absolute path
|
||||||
}
|
}
|
||||||
|
|
||||||
request = ChatRequest(**request_data)
|
request = ChatRequest(**request_data)
|
||||||
@@ -62,6 +65,7 @@ class TestChatTool:
|
|||||||
assert request.images == ["test.png"]
|
assert request.images == ["test.png"]
|
||||||
assert request.model == "anthropic/claude-opus-4.1"
|
assert request.model == "anthropic/claude-opus-4.1"
|
||||||
assert request.temperature == 0.7
|
assert request.temperature == 0.7
|
||||||
|
assert request.working_directory == "/tmp"
|
||||||
|
|
||||||
def test_required_fields(self):
|
def test_required_fields(self):
|
||||||
"""Test that required fields are enforced"""
|
"""Test that required fields are enforced"""
|
||||||
@@ -69,7 +73,7 @@ class TestChatTool:
|
|||||||
from pydantic import ValidationError
|
from pydantic import ValidationError
|
||||||
|
|
||||||
with pytest.raises(ValidationError):
|
with pytest.raises(ValidationError):
|
||||||
ChatRequest(model="anthropic/claude-opus-4.1")
|
ChatRequest(model="anthropic/claude-opus-4.1", working_directory="/tmp")
|
||||||
|
|
||||||
def test_model_availability(self):
|
def test_model_availability(self):
|
||||||
"""Test that model availability works"""
|
"""Test that model availability works"""
|
||||||
@@ -96,7 +100,7 @@ class TestChatTool:
|
|||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_prompt_preparation(self):
|
async def test_prompt_preparation(self):
|
||||||
"""Test that prompt preparation works correctly"""
|
"""Test that prompt preparation works correctly"""
|
||||||
request = ChatRequest(prompt="Test prompt", files=[])
|
request = ChatRequest(prompt="Test prompt", files=[], working_directory="/tmp")
|
||||||
|
|
||||||
# Mock the system prompt and file handling
|
# Mock the system prompt and file handling
|
||||||
with patch.object(self.tool, "get_system_prompt", return_value="System prompt"):
|
with patch.object(self.tool, "get_system_prompt", return_value="System prompt"):
|
||||||
@@ -113,7 +117,7 @@ class TestChatTool:
|
|||||||
def test_response_formatting(self):
|
def test_response_formatting(self):
|
||||||
"""Test that response formatting works correctly"""
|
"""Test that response formatting works correctly"""
|
||||||
response = "Test response content"
|
response = "Test response content"
|
||||||
request = ChatRequest(prompt="Test")
|
request = ChatRequest(prompt="Test", working_directory="/tmp")
|
||||||
|
|
||||||
formatted = self.tool.format_response(response, request)
|
formatted = self.tool.format_response(response, request)
|
||||||
|
|
||||||
@@ -146,6 +150,7 @@ class TestChatTool:
|
|||||||
|
|
||||||
required_fields = self.tool.get_required_fields()
|
required_fields = self.tool.get_required_fields()
|
||||||
assert "prompt" in required_fields
|
assert "prompt" in required_fields
|
||||||
|
assert "working_directory" in required_fields
|
||||||
|
|
||||||
|
|
||||||
class TestChatRequestModel:
|
class TestChatRequestModel:
|
||||||
@@ -160,10 +165,11 @@ class TestChatRequestModel:
|
|||||||
assert "context" in CHAT_FIELD_DESCRIPTIONS["prompt"]
|
assert "context" in CHAT_FIELD_DESCRIPTIONS["prompt"]
|
||||||
assert "full-paths" in CHAT_FIELD_DESCRIPTIONS["files"] or "absolute" in CHAT_FIELD_DESCRIPTIONS["files"]
|
assert "full-paths" in CHAT_FIELD_DESCRIPTIONS["files"] or "absolute" in CHAT_FIELD_DESCRIPTIONS["files"]
|
||||||
assert "visual context" in CHAT_FIELD_DESCRIPTIONS["images"]
|
assert "visual context" in CHAT_FIELD_DESCRIPTIONS["images"]
|
||||||
|
assert "directory" in CHAT_FIELD_DESCRIPTIONS["working_directory"].lower()
|
||||||
|
|
||||||
def test_default_values(self):
|
def test_default_values(self):
|
||||||
"""Test that default values work correctly"""
|
"""Test that default values work correctly"""
|
||||||
request = ChatRequest(prompt="Test")
|
request = ChatRequest(prompt="Test", working_directory="/tmp")
|
||||||
|
|
||||||
assert request.prompt == "Test"
|
assert request.prompt == "Test"
|
||||||
assert request.files == [] # Should default to empty list
|
assert request.files == [] # Should default to empty list
|
||||||
@@ -173,7 +179,7 @@ class TestChatRequestModel:
|
|||||||
"""Test that ChatRequest properly inherits from ToolRequest"""
|
"""Test that ChatRequest properly inherits from ToolRequest"""
|
||||||
from tools.shared.base_models import ToolRequest
|
from tools.shared.base_models import ToolRequest
|
||||||
|
|
||||||
request = ChatRequest(prompt="Test")
|
request = ChatRequest(prompt="Test", working_directory="/tmp")
|
||||||
assert isinstance(request, ToolRequest)
|
assert isinstance(request, ToolRequest)
|
||||||
|
|
||||||
# Should have inherited fields
|
# Should have inherited fields
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ from utils.conversation_memory import get_thread
|
|||||||
from utils.storage_backend import get_storage_backend
|
from utils.storage_backend import get_storage_backend
|
||||||
|
|
||||||
|
|
||||||
def test_first_response_persisted_in_conversation_history():
|
def test_first_response_persisted_in_conversation_history(tmp_path):
|
||||||
"""Ensure the assistant's initial reply is stored for newly created threads."""
|
"""Ensure the assistant's initial reply is stored for newly created threads."""
|
||||||
|
|
||||||
# Clear in-memory storage to avoid cross-test contamination
|
# Clear in-memory storage to avoid cross-test contamination
|
||||||
@@ -13,7 +13,7 @@ def test_first_response_persisted_in_conversation_history():
|
|||||||
storage._store.clear() # type: ignore[attr-defined]
|
storage._store.clear() # type: ignore[attr-defined]
|
||||||
|
|
||||||
tool = ChatTool()
|
tool = ChatTool()
|
||||||
request = ChatRequest(prompt="First question?", model="local-llama")
|
request = ChatRequest(prompt="First question?", model="local-llama", working_directory=str(tmp_path))
|
||||||
response_text = "Here is the initial answer."
|
response_text = "Here is the initial answer."
|
||||||
|
|
||||||
# Mimic the first tool invocation (no continuation_id supplied)
|
# Mimic the first tool invocation (no continuation_id supplied)
|
||||||
|
|||||||
@@ -91,6 +91,7 @@ def helper_function():
|
|||||||
"prompt": "Analyze this codebase structure",
|
"prompt": "Analyze this codebase structure",
|
||||||
"files": [directory], # Directory path, not individual files
|
"files": [directory], # Directory path, not individual files
|
||||||
"model": "flash",
|
"model": "flash",
|
||||||
|
"working_directory": directory,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Execute the tool
|
# Execute the tool
|
||||||
@@ -168,6 +169,7 @@ def helper_function():
|
|||||||
"files": [directory], # Same directory again
|
"files": [directory], # Same directory again
|
||||||
"model": "flash",
|
"model": "flash",
|
||||||
"continuation_id": thread_id,
|
"continuation_id": thread_id,
|
||||||
|
"working_directory": directory,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Mock to capture file filtering behavior
|
# Mock to capture file filtering behavior
|
||||||
@@ -299,6 +301,7 @@ def helper_function():
|
|||||||
"prompt": "Analyze this code",
|
"prompt": "Analyze this code",
|
||||||
"files": [directory],
|
"files": [directory],
|
||||||
"model": "flash",
|
"model": "flash",
|
||||||
|
"working_directory": directory,
|
||||||
}
|
}
|
||||||
|
|
||||||
result = await tool.execute(request_args)
|
result = await tool.execute(request_args)
|
||||||
|
|||||||
@@ -56,7 +56,12 @@ class TestLargePromptHandling:
|
|||||||
async def test_chat_large_prompt_detection(self, large_prompt):
|
async def test_chat_large_prompt_detection(self, large_prompt):
|
||||||
"""Test that chat tool detects large prompts."""
|
"""Test that chat tool detects large prompts."""
|
||||||
tool = ChatTool()
|
tool = ChatTool()
|
||||||
result = await tool.execute({"prompt": large_prompt})
|
temp_dir = tempfile.mkdtemp()
|
||||||
|
temp_dir = tempfile.mkdtemp()
|
||||||
|
try:
|
||||||
|
result = await tool.execute({"prompt": large_prompt, "working_directory": temp_dir})
|
||||||
|
finally:
|
||||||
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
|
|
||||||
assert len(result) == 1
|
assert len(result) == 1
|
||||||
assert isinstance(result[0], TextContent)
|
assert isinstance(result[0], TextContent)
|
||||||
@@ -73,9 +78,16 @@ class TestLargePromptHandling:
|
|||||||
"""Test that chat tool works normally with regular prompts."""
|
"""Test that chat tool works normally with regular prompts."""
|
||||||
tool = ChatTool()
|
tool = ChatTool()
|
||||||
|
|
||||||
|
temp_dir = tempfile.mkdtemp()
|
||||||
|
|
||||||
# This test runs in the test environment which uses dummy keys
|
# This test runs in the test environment which uses dummy keys
|
||||||
# The chat tool will return an error for dummy keys, which is expected
|
# The chat tool will return an error for dummy keys, which is expected
|
||||||
result = await tool.execute({"prompt": normal_prompt, "model": "gemini-2.5-flash"})
|
try:
|
||||||
|
result = await tool.execute(
|
||||||
|
{"prompt": normal_prompt, "model": "gemini-2.5-flash", "working_directory": temp_dir}
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
|
|
||||||
assert len(result) == 1
|
assert len(result) == 1
|
||||||
output = json.loads(result[0].text)
|
output = json.loads(result[0].text)
|
||||||
@@ -105,7 +117,14 @@ class TestLargePromptHandling:
|
|||||||
try:
|
try:
|
||||||
# This test runs in the test environment which uses dummy keys
|
# This test runs in the test environment which uses dummy keys
|
||||||
# The chat tool will return an error for dummy keys, which is expected
|
# The chat tool will return an error for dummy keys, which is expected
|
||||||
result = await tool.execute({"prompt": "", "files": [temp_prompt_file], "model": "gemini-2.5-flash"})
|
result = await tool.execute(
|
||||||
|
{
|
||||||
|
"prompt": "",
|
||||||
|
"files": [temp_prompt_file],
|
||||||
|
"model": "gemini-2.5-flash",
|
||||||
|
"working_directory": temp_dir,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
assert len(result) == 1
|
assert len(result) == 1
|
||||||
output = json.loads(result[0].text)
|
output = json.loads(result[0].text)
|
||||||
@@ -261,7 +280,13 @@ class TestLargePromptHandling:
|
|||||||
mock_prepare_files.return_value = ("File content", [other_file])
|
mock_prepare_files.return_value = ("File content", [other_file])
|
||||||
|
|
||||||
# Use a small prompt to avoid triggering size limit
|
# Use a small prompt to avoid triggering size limit
|
||||||
await tool.execute({"prompt": "Test prompt", "files": [temp_prompt_file, other_file]})
|
await tool.execute(
|
||||||
|
{
|
||||||
|
"prompt": "Test prompt",
|
||||||
|
"files": [temp_prompt_file, other_file],
|
||||||
|
"working_directory": os.path.dirname(temp_prompt_file),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# Verify handle_prompt_file was called with the original files list
|
# Verify handle_prompt_file was called with the original files list
|
||||||
mock_handle_prompt.assert_called_once_with([temp_prompt_file, other_file])
|
mock_handle_prompt.assert_called_once_with([temp_prompt_file, other_file])
|
||||||
@@ -295,7 +320,11 @@ class TestLargePromptHandling:
|
|||||||
mock_get_provider.return_value = mock_provider
|
mock_get_provider.return_value = mock_provider
|
||||||
|
|
||||||
# With the fix, this should now pass because we check at MCP transport boundary before adding internal content
|
# With the fix, this should now pass because we check at MCP transport boundary before adding internal content
|
||||||
result = await tool.execute({"prompt": exact_prompt})
|
temp_dir = tempfile.mkdtemp()
|
||||||
|
try:
|
||||||
|
result = await tool.execute({"prompt": exact_prompt, "working_directory": temp_dir})
|
||||||
|
finally:
|
||||||
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
output = json.loads(result[0].text)
|
output = json.loads(result[0].text)
|
||||||
assert output["status"] != "resend_prompt"
|
assert output["status"] != "resend_prompt"
|
||||||
|
|
||||||
@@ -305,7 +334,11 @@ class TestLargePromptHandling:
|
|||||||
tool = ChatTool()
|
tool = ChatTool()
|
||||||
over_prompt = "x" * (MCP_PROMPT_SIZE_LIMIT + 1)
|
over_prompt = "x" * (MCP_PROMPT_SIZE_LIMIT + 1)
|
||||||
|
|
||||||
result = await tool.execute({"prompt": over_prompt})
|
temp_dir = tempfile.mkdtemp()
|
||||||
|
try:
|
||||||
|
result = await tool.execute({"prompt": over_prompt, "working_directory": temp_dir})
|
||||||
|
finally:
|
||||||
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
output = json.loads(result[0].text)
|
output = json.loads(result[0].text)
|
||||||
assert output["status"] == "resend_prompt"
|
assert output["status"] == "resend_prompt"
|
||||||
|
|
||||||
@@ -326,7 +359,11 @@ class TestLargePromptHandling:
|
|||||||
)
|
)
|
||||||
mock_get_provider.return_value = mock_provider
|
mock_get_provider.return_value = mock_provider
|
||||||
|
|
||||||
result = await tool.execute({"prompt": ""})
|
temp_dir = tempfile.mkdtemp()
|
||||||
|
try:
|
||||||
|
result = await tool.execute({"prompt": "", "working_directory": temp_dir})
|
||||||
|
finally:
|
||||||
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
output = json.loads(result[0].text)
|
output = json.loads(result[0].text)
|
||||||
assert output["status"] != "resend_prompt"
|
assert output["status"] != "resend_prompt"
|
||||||
|
|
||||||
@@ -362,7 +399,11 @@ class TestLargePromptHandling:
|
|||||||
mock_model_context_class.return_value = mock_model_context
|
mock_model_context_class.return_value = mock_model_context
|
||||||
|
|
||||||
# Should continue with empty prompt when file can't be read
|
# Should continue with empty prompt when file can't be read
|
||||||
result = await tool.execute({"prompt": "", "files": [bad_file]})
|
temp_dir = tempfile.mkdtemp()
|
||||||
|
try:
|
||||||
|
result = await tool.execute({"prompt": "", "files": [bad_file], "working_directory": temp_dir})
|
||||||
|
finally:
|
||||||
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
output = json.loads(result[0].text)
|
output = json.loads(result[0].text)
|
||||||
assert output["status"] != "resend_prompt"
|
assert output["status"] != "resend_prompt"
|
||||||
|
|
||||||
@@ -408,6 +449,7 @@ class TestLargePromptHandling:
|
|||||||
"prompt": "Summarize the design decisions",
|
"prompt": "Summarize the design decisions",
|
||||||
"files": [str(large_file)],
|
"files": [str(large_file)],
|
||||||
"model": "flash",
|
"model": "flash",
|
||||||
|
"working_directory": str(tmp_path),
|
||||||
"_model_context": dummy_context,
|
"_model_context": dummy_context,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@@ -424,6 +466,7 @@ class TestLargePromptHandling:
|
|||||||
This test verifies that even if our internal prompt (with system prompts, history, etc.)
|
This test verifies that even if our internal prompt (with system prompts, history, etc.)
|
||||||
exceeds MCP_PROMPT_SIZE_LIMIT, it should still work as long as the user's input is small.
|
exceeds MCP_PROMPT_SIZE_LIMIT, it should still work as long as the user's input is small.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
tool = ChatTool()
|
tool = ChatTool()
|
||||||
|
|
||||||
# Small user input that should pass MCP boundary check
|
# Small user input that should pass MCP boundary check
|
||||||
@@ -432,62 +475,57 @@ class TestLargePromptHandling:
|
|||||||
# Mock a huge conversation history that would exceed MCP limits if incorrectly checked
|
# Mock a huge conversation history that would exceed MCP limits if incorrectly checked
|
||||||
huge_history = "x" * (MCP_PROMPT_SIZE_LIMIT * 2) # 100K chars = way over 50K limit
|
huge_history = "x" * (MCP_PROMPT_SIZE_LIMIT * 2) # 100K chars = way over 50K limit
|
||||||
|
|
||||||
with (
|
temp_dir = tempfile.mkdtemp()
|
||||||
patch.object(tool, "get_model_provider") as mock_get_provider,
|
original_prepare_prompt = tool.prepare_prompt
|
||||||
patch("utils.model_context.ModelContext") as mock_model_context_class,
|
|
||||||
):
|
|
||||||
from tests.mock_helpers import create_mock_provider
|
|
||||||
|
|
||||||
mock_provider = create_mock_provider(model_name="flash")
|
try:
|
||||||
mock_get_provider.return_value = mock_provider
|
with (
|
||||||
|
patch.object(tool, "get_model_provider") as mock_get_provider,
|
||||||
|
patch("utils.model_context.ModelContext") as mock_model_context_class,
|
||||||
|
):
|
||||||
|
from tests.mock_helpers import create_mock_provider
|
||||||
|
from utils.model_context import TokenAllocation
|
||||||
|
|
||||||
# Mock ModelContext to avoid the comparison issue
|
mock_provider = create_mock_provider(model_name="flash")
|
||||||
from utils.model_context import TokenAllocation
|
mock_get_provider.return_value = mock_provider
|
||||||
|
|
||||||
mock_model_context = MagicMock()
|
mock_model_context = MagicMock()
|
||||||
mock_model_context.model_name = "flash"
|
mock_model_context.model_name = "flash"
|
||||||
mock_model_context.provider = mock_provider
|
mock_model_context.provider = mock_provider
|
||||||
mock_model_context.calculate_token_allocation.return_value = TokenAllocation(
|
mock_model_context.calculate_token_allocation.return_value = TokenAllocation(
|
||||||
total_tokens=1_048_576,
|
total_tokens=1_048_576,
|
||||||
content_tokens=838_861,
|
content_tokens=838_861,
|
||||||
response_tokens=209_715,
|
response_tokens=209_715,
|
||||||
file_tokens=335_544,
|
file_tokens=335_544,
|
||||||
history_tokens=335_544,
|
history_tokens=335_544,
|
||||||
)
|
)
|
||||||
mock_model_context_class.return_value = mock_model_context
|
mock_model_context_class.return_value = mock_model_context
|
||||||
|
|
||||||
# Mock the prepare_prompt to simulate huge internal context
|
async def mock_prepare_prompt(request):
|
||||||
original_prepare_prompt = tool.prepare_prompt
|
normal_prompt = await original_prepare_prompt(request)
|
||||||
|
huge_internal_prompt = f"{normal_prompt}\n\n=== HUGE INTERNAL CONTEXT ===\n{huge_history}"
|
||||||
|
assert len(huge_internal_prompt) > MCP_PROMPT_SIZE_LIMIT
|
||||||
|
return huge_internal_prompt
|
||||||
|
|
||||||
async def mock_prepare_prompt(request):
|
tool.prepare_prompt = mock_prepare_prompt
|
||||||
# Call original to get normal processing
|
|
||||||
normal_prompt = await original_prepare_prompt(request)
|
|
||||||
# Add huge internal context (simulating large history, system prompts, files)
|
|
||||||
huge_internal_prompt = f"{normal_prompt}\n\n=== HUGE INTERNAL CONTEXT ===\n{huge_history}"
|
|
||||||
|
|
||||||
# Verify the huge internal prompt would exceed MCP limits if incorrectly checked
|
result = await tool.execute(
|
||||||
assert len(huge_internal_prompt) > MCP_PROMPT_SIZE_LIMIT
|
{"prompt": small_user_prompt, "model": "flash", "working_directory": temp_dir}
|
||||||
|
)
|
||||||
|
output = json.loads(result[0].text)
|
||||||
|
|
||||||
return huge_internal_prompt
|
assert output["status"] != "resend_prompt"
|
||||||
|
|
||||||
tool.prepare_prompt = mock_prepare_prompt
|
mock_provider.generate_content.assert_called_once()
|
||||||
|
call_kwargs = mock_provider.generate_content.call_args[1]
|
||||||
|
actual_prompt = call_kwargs.get("prompt")
|
||||||
|
|
||||||
# This should succeed because we only check user input at MCP boundary
|
assert len(actual_prompt) > MCP_PROMPT_SIZE_LIMIT
|
||||||
result = await tool.execute({"prompt": small_user_prompt, "model": "flash"})
|
assert huge_history in actual_prompt
|
||||||
output = json.loads(result[0].text)
|
assert small_user_prompt in actual_prompt
|
||||||
|
finally:
|
||||||
# Should succeed even though internal context is huge
|
tool.prepare_prompt = original_prepare_prompt
|
||||||
assert output["status"] != "resend_prompt"
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
|
|
||||||
# Verify the model was actually called with the huge prompt
|
|
||||||
mock_provider.generate_content.assert_called_once()
|
|
||||||
call_kwargs = mock_provider.generate_content.call_args[1]
|
|
||||||
actual_prompt = call_kwargs.get("prompt")
|
|
||||||
|
|
||||||
# Verify internal prompt was huge (proving we don't limit internal processing)
|
|
||||||
assert len(actual_prompt) > MCP_PROMPT_SIZE_LIMIT
|
|
||||||
assert huge_history in actual_prompt
|
|
||||||
assert small_user_prompt in actual_prompt
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_mcp_boundary_vs_internal_processing_distinction(self):
|
async def test_mcp_boundary_vs_internal_processing_distinction(self):
|
||||||
@@ -500,27 +538,37 @@ class TestLargePromptHandling:
|
|||||||
|
|
||||||
# Test case 1: Large user input should fail at MCP boundary
|
# Test case 1: Large user input should fail at MCP boundary
|
||||||
large_user_input = "x" * (MCP_PROMPT_SIZE_LIMIT + 1000)
|
large_user_input = "x" * (MCP_PROMPT_SIZE_LIMIT + 1000)
|
||||||
result = await tool.execute({"prompt": large_user_input, "model": "flash"})
|
temp_dir = tempfile.mkdtemp()
|
||||||
output = json.loads(result[0].text)
|
try:
|
||||||
assert output["status"] == "resend_prompt" # Should fail
|
result = await tool.execute({"prompt": large_user_input, "model": "flash", "working_directory": temp_dir})
|
||||||
assert "too large for MCP's token limits" in output["content"]
|
output = json.loads(result[0].text)
|
||||||
|
assert output["status"] == "resend_prompt" # Should fail
|
||||||
|
assert "too large for MCP's token limits" in output["content"]
|
||||||
|
|
||||||
# Test case 2: Small user input should succeed even with huge internal processing
|
# Test case 2: Small user input should succeed even with huge internal processing
|
||||||
small_user_input = "Hello"
|
small_user_input = "Hello"
|
||||||
|
|
||||||
# This test runs in the test environment which uses dummy keys
|
# This test runs in the test environment which uses dummy keys
|
||||||
# The chat tool will return an error for dummy keys, which is expected
|
# The chat tool will return an error for dummy keys, which is expected
|
||||||
result = await tool.execute({"prompt": small_user_input, "model": "gemini-2.5-flash"})
|
result = await tool.execute(
|
||||||
output = json.loads(result[0].text)
|
{
|
||||||
|
"prompt": small_user_input,
|
||||||
|
"model": "gemini-2.5-flash",
|
||||||
|
"working_directory": temp_dir,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
output = json.loads(result[0].text)
|
||||||
|
|
||||||
# The test will fail with dummy API keys, which is expected behavior
|
# The test will fail with dummy API keys, which is expected behavior
|
||||||
# We're mainly testing that the tool processes small prompts correctly without size errors
|
# We're mainly testing that the tool processes small prompts correctly without size errors
|
||||||
if output["status"] == "error":
|
if output["status"] == "error":
|
||||||
# If it's an API error, that's fine - we're testing prompt handling, not API calls
|
# If it's an API error, that's fine - we're testing prompt handling, not API calls
|
||||||
assert "API" in output["content"] or "key" in output["content"] or "authentication" in output["content"]
|
assert "API" in output["content"] or "key" in output["content"] or "authentication" in output["content"]
|
||||||
else:
|
else:
|
||||||
# If somehow it succeeds (e.g., with mocked provider), check the response
|
# If somehow it succeeds (e.g., with mocked provider), check the response
|
||||||
assert output["status"] != "resend_prompt"
|
assert output["status"] != "resend_prompt"
|
||||||
|
finally:
|
||||||
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_continuation_with_huge_conversation_history(self):
|
async def test_continuation_with_huge_conversation_history(self):
|
||||||
@@ -548,6 +596,8 @@ class TestLargePromptHandling:
|
|||||||
# Ensure the history exceeds MCP limits
|
# Ensure the history exceeds MCP limits
|
||||||
assert len(huge_conversation_history) > MCP_PROMPT_SIZE_LIMIT
|
assert len(huge_conversation_history) > MCP_PROMPT_SIZE_LIMIT
|
||||||
|
|
||||||
|
temp_dir = tempfile.mkdtemp()
|
||||||
|
|
||||||
with (
|
with (
|
||||||
patch.object(tool, "get_model_provider") as mock_get_provider,
|
patch.object(tool, "get_model_provider") as mock_get_provider,
|
||||||
patch("utils.model_context.ModelContext") as mock_model_context_class,
|
patch("utils.model_context.ModelContext") as mock_model_context_class,
|
||||||
@@ -579,6 +629,7 @@ class TestLargePromptHandling:
|
|||||||
"prompt": f"{huge_conversation_history}\n\n=== CURRENT REQUEST ===\n{small_continuation_prompt}",
|
"prompt": f"{huge_conversation_history}\n\n=== CURRENT REQUEST ===\n{small_continuation_prompt}",
|
||||||
"model": "flash",
|
"model": "flash",
|
||||||
"continuation_id": "test_thread_123",
|
"continuation_id": "test_thread_123",
|
||||||
|
"working_directory": temp_dir,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Mock the conversation history embedding to simulate server.py behavior
|
# Mock the conversation history embedding to simulate server.py behavior
|
||||||
@@ -628,6 +679,7 @@ class TestLargePromptHandling:
|
|||||||
finally:
|
finally:
|
||||||
# Restore original execute method
|
# Restore original execute method
|
||||||
tool.__class__.execute = original_execute
|
tool.__class__.execute = original_execute
|
||||||
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -68,6 +68,7 @@ class TestListModelsTool:
|
|||||||
assert "`flash` → `gemini-2.5-flash`" in content
|
assert "`flash` → `gemini-2.5-flash`" in content
|
||||||
assert "`pro` → `gemini-2.5-pro`" in content
|
assert "`pro` → `gemini-2.5-pro`" in content
|
||||||
assert "1M context" in content
|
assert "1M context" in content
|
||||||
|
assert "Supports structured code generation" in content
|
||||||
|
|
||||||
# Check summary
|
# Check summary
|
||||||
assert "**Configured Providers**: 1" in content
|
assert "**Configured Providers**: 1" in content
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ RECORDING: To record new responses, delete the cassette file and run with real A
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import tempfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
@@ -92,9 +93,15 @@ class TestO3ProOutputTextFix:
|
|||||||
async def _execute_chat_tool_test(self):
|
async def _execute_chat_tool_test(self):
|
||||||
"""Execute the ChatTool with o3-pro and return the result."""
|
"""Execute the ChatTool with o3-pro and return the result."""
|
||||||
chat_tool = ChatTool()
|
chat_tool = ChatTool()
|
||||||
arguments = {"prompt": "What is 2 + 2?", "model": "o3-pro", "temperature": 1.0}
|
with tempfile.TemporaryDirectory() as workdir:
|
||||||
|
arguments = {
|
||||||
|
"prompt": "What is 2 + 2?",
|
||||||
|
"model": "o3-pro",
|
||||||
|
"temperature": 1.0,
|
||||||
|
"working_directory": workdir,
|
||||||
|
}
|
||||||
|
|
||||||
return await chat_tool.execute(arguments)
|
return await chat_tool.execute(arguments)
|
||||||
|
|
||||||
def _verify_chat_tool_response(self, result):
|
def _verify_chat_tool_response(self, result):
|
||||||
"""Verify the ChatTool response contains expected data."""
|
"""Verify the ChatTool response contains expected data."""
|
||||||
|
|||||||
@@ -4,6 +4,8 @@ Test per-tool model default selection functionality
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
from unittest.mock import MagicMock, patch
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
@@ -290,7 +292,13 @@ class TestAutoModeErrorMessages:
|
|||||||
mock_get_provider_for.return_value = None
|
mock_get_provider_for.return_value = None
|
||||||
|
|
||||||
tool = ChatTool()
|
tool = ChatTool()
|
||||||
result = await tool.execute({"prompt": "test", "model": "auto"})
|
temp_dir = tempfile.mkdtemp()
|
||||||
|
try:
|
||||||
|
result = await tool.execute(
|
||||||
|
{"prompt": "test", "model": "auto", "working_directory": temp_dir}
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
|
|
||||||
assert len(result) == 1
|
assert len(result) == 1
|
||||||
# The SimpleTool will wrap the error message
|
# The SimpleTool will wrap the error message
|
||||||
@@ -418,7 +426,13 @@ class TestRuntimeModelSelection:
|
|||||||
mock_get_provider.return_value = None
|
mock_get_provider.return_value = None
|
||||||
|
|
||||||
tool = ChatTool()
|
tool = ChatTool()
|
||||||
result = await tool.execute({"prompt": "test", "model": "gpt-5-turbo"})
|
temp_dir = tempfile.mkdtemp()
|
||||||
|
try:
|
||||||
|
result = await tool.execute(
|
||||||
|
{"prompt": "test", "model": "gpt-5-turbo", "working_directory": temp_dir}
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
|
|
||||||
# Should require model selection
|
# Should require model selection
|
||||||
assert len(result) == 1
|
assert len(result) == 1
|
||||||
@@ -515,7 +529,11 @@ class TestUnavailableModelFallback:
|
|||||||
mock_get_model_provider.return_value = mock_provider
|
mock_get_model_provider.return_value = mock_provider
|
||||||
|
|
||||||
tool = ChatTool()
|
tool = ChatTool()
|
||||||
result = await tool.execute({"prompt": "test"}) # No model specified
|
temp_dir = tempfile.mkdtemp()
|
||||||
|
try:
|
||||||
|
result = await tool.execute({"prompt": "test", "working_directory": temp_dir})
|
||||||
|
finally:
|
||||||
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
|
|
||||||
# Should work normally, not require model parameter
|
# Should work normally, not require model parameter
|
||||||
assert len(result) == 1
|
assert len(result) == 1
|
||||||
|
|||||||
@@ -3,6 +3,8 @@ Tests for individual tool implementations
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
@@ -343,12 +345,17 @@ class TestAbsolutePathValidation:
|
|||||||
async def test_chat_tool_relative_path_rejected(self):
|
async def test_chat_tool_relative_path_rejected(self):
|
||||||
"""Test that chat tool rejects relative paths"""
|
"""Test that chat tool rejects relative paths"""
|
||||||
tool = ChatTool()
|
tool = ChatTool()
|
||||||
result = await tool.execute(
|
temp_dir = tempfile.mkdtemp()
|
||||||
{
|
try:
|
||||||
"prompt": "Explain this code",
|
result = await tool.execute(
|
||||||
"files": ["code.py"], # relative path without ./
|
{
|
||||||
}
|
"prompt": "Explain this code",
|
||||||
)
|
"files": ["code.py"], # relative path without ./
|
||||||
|
"working_directory": temp_dir,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
|
|
||||||
assert len(result) == 1
|
assert len(result) == 1
|
||||||
response = json.loads(result[0].text)
|
response = json.loads(result[0].text)
|
||||||
|
|||||||
186
tools/chat.py
186
tools/chat.py
@@ -6,15 +6,20 @@ brainstorming, problem-solving, and collaborative thinking. It supports file con
|
|||||||
images, and conversation continuation for seamless multi-turn interactions.
|
images, and conversation continuation for seamless multi-turn interactions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING, Any, Optional
|
from typing import TYPE_CHECKING, Any, Optional
|
||||||
|
|
||||||
from pydantic import Field
|
from pydantic import Field
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
from providers.shared import ModelCapabilities
|
||||||
from tools.models import ToolModelCategory
|
from tools.models import ToolModelCategory
|
||||||
|
|
||||||
from config import TEMPERATURE_BALANCED
|
from config import TEMPERATURE_BALANCED
|
||||||
from systemprompts import CHAT_PROMPT
|
from systemprompts import CHAT_PROMPT, GENERATE_CODE_PROMPT
|
||||||
from tools.shared.base_models import COMMON_FIELD_DESCRIPTIONS, ToolRequest
|
from tools.shared.base_models import COMMON_FIELD_DESCRIPTIONS, ToolRequest
|
||||||
|
|
||||||
from .simple.base import SimpleTool
|
from .simple.base import SimpleTool
|
||||||
@@ -27,6 +32,9 @@ CHAT_FIELD_DESCRIPTIONS = {
|
|||||||
),
|
),
|
||||||
"files": "absolute file or folder paths for code context (do NOT shorten).",
|
"files": "absolute file or folder paths for code context (do NOT shorten).",
|
||||||
"images": "Optional absolute image paths or base64 for visual context when helpful.",
|
"images": "Optional absolute image paths or base64 for visual context when helpful.",
|
||||||
|
"working_directory": (
|
||||||
|
"Absolute full directory path where the assistant AI can save generated code for implementation. The directory must already exist"
|
||||||
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -36,6 +44,7 @@ class ChatRequest(ToolRequest):
|
|||||||
prompt: str = Field(..., description=CHAT_FIELD_DESCRIPTIONS["prompt"])
|
prompt: str = Field(..., description=CHAT_FIELD_DESCRIPTIONS["prompt"])
|
||||||
files: Optional[list[str]] = Field(default_factory=list, description=CHAT_FIELD_DESCRIPTIONS["files"])
|
files: Optional[list[str]] = Field(default_factory=list, description=CHAT_FIELD_DESCRIPTIONS["files"])
|
||||||
images: Optional[list[str]] = Field(default_factory=list, description=CHAT_FIELD_DESCRIPTIONS["images"])
|
images: Optional[list[str]] = Field(default_factory=list, description=CHAT_FIELD_DESCRIPTIONS["images"])
|
||||||
|
working_directory: str = Field(..., description=CHAT_FIELD_DESCRIPTIONS["working_directory"])
|
||||||
|
|
||||||
|
|
||||||
class ChatTool(SimpleTool):
|
class ChatTool(SimpleTool):
|
||||||
@@ -49,6 +58,10 @@ class ChatTool(SimpleTool):
|
|||||||
Chat tool with 100% behavioral compatibility.
|
Chat tool with 100% behavioral compatibility.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
super().__init__()
|
||||||
|
self._last_recordable_response: Optional[str] = None
|
||||||
|
|
||||||
def get_name(self) -> str:
|
def get_name(self) -> str:
|
||||||
return "chat"
|
return "chat"
|
||||||
|
|
||||||
@@ -58,9 +71,20 @@ class ChatTool(SimpleTool):
|
|||||||
"getting second opinions, and exploring ideas. Use for ideas, validations, questions, and thoughtful explanations."
|
"getting second opinions, and exploring ideas. Use for ideas, validations, questions, and thoughtful explanations."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def get_annotations(self) -> Optional[dict[str, Any]]:
|
||||||
|
"""Chat writes generated artifacts when code-generation is enabled."""
|
||||||
|
|
||||||
|
return {"readOnlyHint": False}
|
||||||
|
|
||||||
def get_system_prompt(self) -> str:
|
def get_system_prompt(self) -> str:
|
||||||
return CHAT_PROMPT
|
return CHAT_PROMPT
|
||||||
|
|
||||||
|
def get_capability_system_prompts(self, capabilities: Optional["ModelCapabilities"]) -> list[str]:
|
||||||
|
prompts = list(super().get_capability_system_prompts(capabilities))
|
||||||
|
if capabilities and capabilities.allow_code_generation:
|
||||||
|
prompts.append(GENERATE_CODE_PROMPT)
|
||||||
|
return prompts
|
||||||
|
|
||||||
def get_default_temperature(self) -> float:
|
def get_default_temperature(self) -> float:
|
||||||
return TEMPERATURE_BALANCED
|
return TEMPERATURE_BALANCED
|
||||||
|
|
||||||
@@ -85,7 +109,7 @@ class ChatTool(SimpleTool):
|
|||||||
the same schema generation approach while still benefiting from SimpleTool
|
the same schema generation approach while still benefiting from SimpleTool
|
||||||
convenience methods.
|
convenience methods.
|
||||||
"""
|
"""
|
||||||
required_fields = ["prompt"]
|
required_fields = ["prompt", "working_directory"]
|
||||||
if self.is_effective_auto_mode():
|
if self.is_effective_auto_mode():
|
||||||
required_fields.append("model")
|
required_fields.append("model")
|
||||||
|
|
||||||
@@ -106,6 +130,10 @@ class ChatTool(SimpleTool):
|
|||||||
"items": {"type": "string"},
|
"items": {"type": "string"},
|
||||||
"description": CHAT_FIELD_DESCRIPTIONS["images"],
|
"description": CHAT_FIELD_DESCRIPTIONS["images"],
|
||||||
},
|
},
|
||||||
|
"working_directory": {
|
||||||
|
"type": "string",
|
||||||
|
"description": CHAT_FIELD_DESCRIPTIONS["working_directory"],
|
||||||
|
},
|
||||||
"model": self.get_model_field_schema(),
|
"model": self.get_model_field_schema(),
|
||||||
"temperature": {
|
"temperature": {
|
||||||
"type": "number",
|
"type": "number",
|
||||||
@@ -159,7 +187,7 @@ class ChatTool(SimpleTool):
|
|||||||
|
|
||||||
def get_required_fields(self) -> list[str]:
|
def get_required_fields(self) -> list[str]:
|
||||||
"""Required fields for ChatSimple tool"""
|
"""Required fields for ChatSimple tool"""
|
||||||
return ["prompt"]
|
return ["prompt", "working_directory"]
|
||||||
|
|
||||||
# === Hook Method Implementations ===
|
# === Hook Method Implementations ===
|
||||||
|
|
||||||
@@ -173,17 +201,165 @@ class ChatTool(SimpleTool):
|
|||||||
# Use SimpleTool's Chat-style prompt preparation
|
# Use SimpleTool's Chat-style prompt preparation
|
||||||
return self.prepare_chat_style_prompt(request)
|
return self.prepare_chat_style_prompt(request)
|
||||||
|
|
||||||
|
def _validate_file_paths(self, request) -> Optional[str]:
|
||||||
|
"""Extend validation to cover the working directory path."""
|
||||||
|
|
||||||
|
error = super()._validate_file_paths(request)
|
||||||
|
if error:
|
||||||
|
return error
|
||||||
|
|
||||||
|
working_directory = getattr(request, "working_directory", None)
|
||||||
|
if working_directory:
|
||||||
|
expanded = os.path.expanduser(working_directory)
|
||||||
|
if not os.path.isabs(expanded):
|
||||||
|
return (
|
||||||
|
"Error: 'working_directory' must be an absolute path (you may use '~' which will be expanded). "
|
||||||
|
f"Received: {working_directory}"
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
|
||||||
def format_response(self, response: str, request: ChatRequest, model_info: Optional[dict] = None) -> str:
|
def format_response(self, response: str, request: ChatRequest, model_info: Optional[dict] = None) -> str:
|
||||||
"""
|
"""
|
||||||
Format the chat response to match the original Chat tool exactly.
|
Format the chat response to match the original Chat tool exactly.
|
||||||
"""
|
"""
|
||||||
return (
|
self._last_recordable_response = None
|
||||||
f"{response}\n\n---\n\nAGENT'S TURN: Evaluate this perspective alongside your analysis to "
|
body = response
|
||||||
|
recordable_override: Optional[str] = None
|
||||||
|
|
||||||
|
if self._model_supports_code_generation():
|
||||||
|
block, remainder = self._extract_generated_code_block(response)
|
||||||
|
if block:
|
||||||
|
sanitized_text = remainder.strip()
|
||||||
|
try:
|
||||||
|
artifact_path = self._persist_generated_code_block(block, request.working_directory)
|
||||||
|
except Exception as exc: # pragma: no cover - rare filesystem failures
|
||||||
|
logger.error("Failed to persist generated code block: %s", exc, exc_info=True)
|
||||||
|
warning = (
|
||||||
|
f"WARNING: Unable to write zen_generated.code inside '{request.working_directory}'. "
|
||||||
|
"Check the path permissions and re-run. The generated code block is included below for manual handling."
|
||||||
|
)
|
||||||
|
|
||||||
|
history_copy = self._join_sections(sanitized_text, warning) if sanitized_text else warning
|
||||||
|
recordable_override = history_copy
|
||||||
|
|
||||||
|
sanitized_warning = history_copy.strip()
|
||||||
|
body = f"{sanitized_warning}\n\n{block.strip()}".strip()
|
||||||
|
else:
|
||||||
|
if not sanitized_text:
|
||||||
|
sanitized_text = "Generated code saved to zen_generated.code. Follow the structured instructions in that file exactly before continuing."
|
||||||
|
|
||||||
|
instruction = self._build_agent_instruction(artifact_path)
|
||||||
|
body = self._join_sections(sanitized_text, instruction)
|
||||||
|
|
||||||
|
final_output = (
|
||||||
|
f"{body}\n\n---\n\nAGENT'S TURN: Evaluate this perspective alongside your analysis to "
|
||||||
"form a comprehensive solution and continue with the user's request and task at hand."
|
"form a comprehensive solution and continue with the user's request and task at hand."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if recordable_override is not None:
|
||||||
|
self._last_recordable_response = (
|
||||||
|
f"{recordable_override}\n\n---\n\nAGENT'S TURN: Evaluate this perspective alongside your analysis to "
|
||||||
|
"form a comprehensive solution and continue with the user's request and task at hand."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self._last_recordable_response = final_output
|
||||||
|
|
||||||
|
return final_output
|
||||||
|
|
||||||
|
def _record_assistant_turn(
|
||||||
|
self, continuation_id: str, response_text: str, request, model_info: Optional[dict]
|
||||||
|
) -> None:
|
||||||
|
recordable = self._last_recordable_response if self._last_recordable_response is not None else response_text
|
||||||
|
try:
|
||||||
|
super()._record_assistant_turn(continuation_id, recordable, request, model_info)
|
||||||
|
finally:
|
||||||
|
self._last_recordable_response = None
|
||||||
|
|
||||||
|
def _model_supports_code_generation(self) -> bool:
|
||||||
|
context = getattr(self, "_model_context", None)
|
||||||
|
if not context:
|
||||||
|
return False
|
||||||
|
|
||||||
|
try:
|
||||||
|
capabilities = context.capabilities
|
||||||
|
except Exception: # pragma: no cover - defensive fallback
|
||||||
|
return False
|
||||||
|
|
||||||
|
return bool(capabilities.allow_code_generation)
|
||||||
|
|
||||||
|
def _extract_generated_code_block(self, text: str) -> tuple[Optional[str], str]:
|
||||||
|
match = re.search(r"<GENERATED-CODE>.*?</GENERATED-CODE>", text, flags=re.DOTALL | re.IGNORECASE)
|
||||||
|
if not match:
|
||||||
|
return None, text
|
||||||
|
|
||||||
|
block = match.group(0)
|
||||||
|
before = text[: match.start()].rstrip()
|
||||||
|
after = text[match.end() :].lstrip()
|
||||||
|
|
||||||
|
if before and after:
|
||||||
|
remainder = f"{before}\n\n{after}"
|
||||||
|
else:
|
||||||
|
remainder = before or after
|
||||||
|
|
||||||
|
return block, remainder or ""
|
||||||
|
|
||||||
|
def _persist_generated_code_block(self, block: str, working_directory: str) -> Path:
|
||||||
|
expanded = os.path.expanduser(working_directory)
|
||||||
|
target_dir = Path(expanded).resolve()
|
||||||
|
target_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
target_file = target_dir / "zen_generated.code"
|
||||||
|
if target_file.exists():
|
||||||
|
try:
|
||||||
|
target_file.unlink()
|
||||||
|
except OSError as exc:
|
||||||
|
logger.warning("Unable to remove existing zen_generated.code: %s", exc)
|
||||||
|
|
||||||
|
content = block if block.endswith("\n") else f"{block}\n"
|
||||||
|
target_file.write_text(content, encoding="utf-8")
|
||||||
|
logger.info("Generated code artifact written to %s", target_file)
|
||||||
|
return target_file
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _build_agent_instruction(artifact_path: Path) -> str:
|
||||||
|
return (
|
||||||
|
f"CONTINUING FROM PREVIOUS DISCUSSION: The coding assistant has analyzed our conversation context and generated "
|
||||||
|
f"a structured implementation plan at `{artifact_path}`. This is a direct continuation of our discussion—all previous "
|
||||||
|
"context, requirements, and shared code remain relevant.\n"
|
||||||
|
"\n"
|
||||||
|
f"MANDATORY NEXT STEP: Open `{artifact_path}` immediately and review the implementation plan:\n"
|
||||||
|
"1. Read the step-by-step instructions—they reference our previous discussion. You may need to read the file in parts if it's too long.\n"
|
||||||
|
"2. Review each <NEWFILE:…> or <UPDATED_EXISTING_FILE:…> section in the context of what we've discussed\n"
|
||||||
|
"3. Verify the proposed changes align with the requirements and code we've already shared\n"
|
||||||
|
"4. Check for syntax errors, missing imports, or incomplete implementations\n"
|
||||||
|
"\n"
|
||||||
|
"Then systematically apply the changes:\n"
|
||||||
|
"- Create new files or update existing ones as instructed, maintaining code style consistency\n"
|
||||||
|
"- If updating existing code we discussed earlier, carefully preserve unmodified sections\n"
|
||||||
|
"- Run syntax validation after each modification\n"
|
||||||
|
"- Execute relevant tests to confirm functionality\n"
|
||||||
|
"- Verify the implementation works end-to-end with existing code\n"
|
||||||
|
"\n"
|
||||||
|
"Remember: This builds upon our conversation. The generated code reflects the full context of what we've discussed, "
|
||||||
|
"including any files, requirements, or constraints mentioned earlier. Proceed with implementation immediately."
|
||||||
|
"Only after you finish applying ALL the changes completely: delete `zen_generated.code` so stale instructions do not linger."
|
||||||
|
)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _join_sections(*sections: str) -> str:
|
||||||
|
chunks: list[str] = []
|
||||||
|
for section in sections:
|
||||||
|
if section:
|
||||||
|
trimmed = section.strip()
|
||||||
|
if trimmed:
|
||||||
|
chunks.append(trimmed)
|
||||||
|
return "\n\n".join(chunks)
|
||||||
|
|
||||||
def get_websearch_guidance(self) -> str:
|
def get_websearch_guidance(self) -> str:
|
||||||
"""
|
"""
|
||||||
Return Chat tool-style web search guidance.
|
Return Chat tool-style web search guidance.
|
||||||
"""
|
"""
|
||||||
return self.get_chat_style_websearch_guidance()
|
return self.get_chat_style_websearch_guidance()
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|||||||
@@ -140,6 +140,8 @@ class ListModelsTool(BaseTool):
|
|||||||
except AttributeError:
|
except AttributeError:
|
||||||
description = "No description available"
|
description = "No description available"
|
||||||
lines = [header, f" - {context_str}", f" - {description}"]
|
lines = [header, f" - {context_str}", f" - {description}"]
|
||||||
|
if capabilities.allow_code_generation:
|
||||||
|
lines.append(" - Supports structured code generation")
|
||||||
return lines
|
return lines
|
||||||
|
|
||||||
# Check each native provider type
|
# Check each native provider type
|
||||||
@@ -187,6 +189,8 @@ class ListModelsTool(BaseTool):
|
|||||||
|
|
||||||
output_lines.append(f"- `{model_name}` - {context_str}")
|
output_lines.append(f"- `{model_name}` - {context_str}")
|
||||||
output_lines.append(f" - {description}")
|
output_lines.append(f" - {description}")
|
||||||
|
if capabilities.allow_code_generation:
|
||||||
|
output_lines.append(" - Supports structured code generation")
|
||||||
|
|
||||||
for alias in capabilities.aliases or []:
|
for alias in capabilities.aliases or []:
|
||||||
if alias != model_name:
|
if alias != model_name:
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ from typing import TYPE_CHECKING, Any, Optional
|
|||||||
from mcp.types import TextContent
|
from mcp.types import TextContent
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
from providers.shared import ModelCapabilities
|
||||||
from tools.models import ToolModelCategory
|
from tools.models import ToolModelCategory
|
||||||
|
|
||||||
from config import MCP_PROMPT_SIZE_LIMIT
|
from config import MCP_PROMPT_SIZE_LIMIT
|
||||||
@@ -165,6 +166,42 @@ class BaseTool(ABC):
|
|||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def get_capability_system_prompts(self, capabilities: Optional["ModelCapabilities"]) -> list[str]:
    """Hook for capability-gated system prompt additions.

    Subclasses override this to contribute extra prompt fragments when the
    active model advertises a capability (for example, enabling
    code-generation exports). This base implementation contributes nothing.

    Args:
        capabilities: The resolved capabilities for the active model.

    Returns:
        Prompt fragments to append after the base system prompt; empty by
        default.
    """
    return []
|
||||||
|
|
||||||
|
def _augment_system_prompt_with_capabilities(
    self, base_prompt: str, capabilities: Optional["ModelCapabilities"]
) -> str:
    """Merge capability-driven prompt addenda with the base system prompt.

    Collects fragments from ``get_capability_system_prompts`` (when model
    capabilities are available), joins them with blank-line separators, and
    appends them after ``base_prompt``.

    Args:
        base_prompt: The tool's base system prompt (may be empty).
        capabilities: Resolved capabilities of the active model, or ``None``
            when no model context is available.

    Returns:
        The base prompt, extended with any capability-specific instructions.
    """
    additions: list[str] = []
    if capabilities is not None:
        # Strip first, then filter: a non-empty but whitespace-only fragment
        # would otherwise survive as "" and inject trailing blank separators
        # into the joined addendum text.
        additions = [
            trimmed
            for fragment in self.get_capability_system_prompts(capabilities)
            if fragment and (trimmed := fragment.strip())
        ]

    if not additions:
        return base_prompt

    addition_text = "\n\n".join(additions)
    if not base_prompt:
        return addition_text

    # Avoid doubling the separator when the base prompt already ends with one.
    suffix = "" if base_prompt.endswith("\n\n") else "\n\n"
    return f"{base_prompt}{suffix}{addition_text}"
|
||||||
|
|
||||||
def get_annotations(self) -> Optional[dict[str, Any]]:
|
def get_annotations(self) -> Optional[dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
Return optional annotations for this tool.
|
Return optional annotations for this tool.
|
||||||
@@ -413,13 +450,16 @@ class BaseTool(ABC):
|
|||||||
for rank, canonical_name, capabilities in filtered[:limit]:
|
for rank, canonical_name, capabilities in filtered[:limit]:
|
||||||
details: list[str] = []
|
details: list[str] = []
|
||||||
|
|
||||||
context_str = self._format_context_window(getattr(capabilities, "context_window", 0))
|
context_str = self._format_context_window(capabilities.context_window)
|
||||||
if context_str:
|
if context_str:
|
||||||
details.append(context_str)
|
details.append(context_str)
|
||||||
|
|
||||||
if getattr(capabilities, "supports_extended_thinking", False):
|
if capabilities.supports_extended_thinking:
|
||||||
details.append("thinking")
|
details.append("thinking")
|
||||||
|
|
||||||
|
if capabilities.allow_code_generation:
|
||||||
|
details.append("code-gen")
|
||||||
|
|
||||||
base = f"{canonical_name} (score {rank}"
|
base = f"{canonical_name} (score {rank}"
|
||||||
if details:
|
if details:
|
||||||
base = f"{base}, {', '.join(details)}"
|
base = f"{base}, {', '.join(details)}"
|
||||||
|
|||||||
@@ -404,11 +404,15 @@ class SimpleTool(BaseTool):
|
|||||||
|
|
||||||
# Get the provider from model context (clean OOP - no re-fetching)
|
# Get the provider from model context (clean OOP - no re-fetching)
|
||||||
provider = self._model_context.provider
|
provider = self._model_context.provider
|
||||||
|
capabilities = self._model_context.capabilities
|
||||||
|
|
||||||
# Get system prompt for this tool
|
# Get system prompt for this tool
|
||||||
base_system_prompt = self.get_system_prompt()
|
base_system_prompt = self.get_system_prompt()
|
||||||
|
capability_augmented_prompt = self._augment_system_prompt_with_capabilities(
|
||||||
|
base_system_prompt, capabilities
|
||||||
|
)
|
||||||
language_instruction = self.get_language_instruction()
|
language_instruction = self.get_language_instruction()
|
||||||
system_prompt = language_instruction + base_system_prompt
|
system_prompt = language_instruction + capability_augmented_prompt
|
||||||
|
|
||||||
# Generate AI response using the provider
|
# Generate AI response using the provider
|
||||||
logger.info(f"Sending request to {provider.get_provider_type().value} API for {self.get_name()}")
|
logger.info(f"Sending request to {provider.get_provider_type().value} API for {self.get_name()}")
|
||||||
@@ -423,7 +427,6 @@ class SimpleTool(BaseTool):
|
|||||||
logger.debug(f"Prompt length: {len(prompt)} characters (~{estimated_tokens:,} tokens)")
|
logger.debug(f"Prompt length: {len(prompt)} characters (~{estimated_tokens:,} tokens)")
|
||||||
|
|
||||||
# Resolve model capabilities for feature gating
|
# Resolve model capabilities for feature gating
|
||||||
capabilities = self._model_context.capabilities
|
|
||||||
supports_thinking = capabilities.supports_extended_thinking
|
supports_thinking = capabilities.supports_extended_thinking
|
||||||
|
|
||||||
# Generate content with provider abstraction
|
# Generate content with provider abstraction
|
||||||
|
|||||||
@@ -1480,8 +1480,11 @@ class BaseWorkflowMixin(ABC):
|
|||||||
|
|
||||||
# Get system prompt for this tool with localization support
|
# Get system prompt for this tool with localization support
|
||||||
base_system_prompt = self.get_system_prompt()
|
base_system_prompt = self.get_system_prompt()
|
||||||
|
capability_augmented_prompt = self._augment_system_prompt_with_capabilities(
|
||||||
|
base_system_prompt, getattr(self._model_context, "capabilities", None)
|
||||||
|
)
|
||||||
language_instruction = self.get_language_instruction()
|
language_instruction = self.get_language_instruction()
|
||||||
system_prompt = language_instruction + base_system_prompt
|
system_prompt = language_instruction + capability_augmented_prompt
|
||||||
|
|
||||||
# Check if tool wants system prompt embedded in main prompt
|
# Check if tool wants system prompt embedded in main prompt
|
||||||
if self.should_embed_system_prompt():
|
if self.should_embed_system_prompt():
|
||||||
|
|||||||
Reference in New Issue
Block a user