From ece8a5ebedb70798ced4ed6f6aae9bfb93be1c21 Mon Sep 17 00:00:00 2001
From: Fahad
Date: Tue, 7 Oct 2025 18:49:13 +0400
Subject: [PATCH] feat!: Full code can now be generated by an external model and shared with the AI tool (Claude Code / Codex, etc.)!

Model definitions now support a new `allow_code_generation` flag, intended only for higher-reasoning models such as GPT-5 Pro and Gemini 2.5 Pro.

When `true`, the `chat` tool can request the external model to generate a full implementation, update, or set of instructions, and then share it with the calling agent. This effectively lets us use more powerful models such as GPT-5 Pro (available only via the API or the $200 Pro plan in the ChatGPT app) to generate code or entire implementations for us.
---
 README.md | 2 +-
 conf/gemini_models.json | 4 +-
 conf/openai_models.json | 4 +-
 conf/openrouter_models.json | 7 +-
 docs/adding_tools.md | 14 +-
 docs/configuration.md | 60 +++++-
 docs/tools/chat.md | 41 +++-
 providers/shared/model_capabilities.py | 5 +
 systemprompts/__init__.py | 2 +
 systemprompts/generate_code_prompt.py | 181 ++++++++++++++++
 .../gemini25_pro_calculator/mldev.json | 133 ++++++++++++
 tests/test_auto_mode.py | 4 +-
 tests/test_auto_mode_comprehensive.py | 23 +-
 tests/test_chat_codegen_integration.py | 113 ++++++++++
 tests/test_chat_cross_model_continuation.py | 6 +-
 tests/test_chat_openai_integration.py | 9 +-
 tests/test_chat_simple.py | 16 +-
 ...t_conversation_continuation_integration.py | 4 +-
 tests/test_directory_expansion_tracking.py | 3 +
 tests/test_large_prompt_handling.py | 198 +++++++++++-------
 tests/test_listmodels.py | 1 +
 tests/test_o3_pro_output_text_fix.py | 11 +-
 tests/test_per_tool_model_defaults.py | 24 ++-
 tests/test_tools.py | 19 +-
 tools/chat.py | 186 +++++++++++++++-
 tools/listmodels.py | 4 +
 tools/shared/base_tool.py | 44 +++-
 tools/simple/base.py | 7 +-
 tools/workflow/workflow_mixin.py | 5 +-
 29 files changed, 1008 insertions(+), 122 deletions(-)
 create mode 100644 systemprompts/generate_code_prompt.py
 create mode 100644 tests/gemini_cassettes/chat_codegen/gemini25_pro_calculator/mldev.json
 create mode 100644 tests/test_chat_codegen_integration.py

diff --git a/README.md b/README.md
index 17a6ada..6d34658 100644
--- a/README.md
+++ b/README.md
@@ -205,7 +205,7 @@ Zen activates any provider that has credentials in your `.env`. See `.env.exampl
 
 **Collaboration & Planning** *(Enabled by default)*
 - **[`clink`](docs/tools/clink.md)** - Bridge requests to external AI CLIs (Gemini planner, codereviewer, etc.)
-- **[`chat`](docs/tools/chat.md)** - Brainstorm ideas, get second opinions, validate approaches
+- **[`chat`](docs/tools/chat.md)** - Brainstorm ideas, get second opinions, validate approaches. With capable models (GPT-5 Pro, Gemini 2.5 Pro), can generate complete code and implementations
 - **[`thinkdeep`](docs/tools/thinkdeep.md)** - Extended reasoning, edge case analysis, alternative perspectives
 - **[`planner`](docs/tools/planner.md)** - Break down complex projects into structured, actionable plans
 - **[`consensus`](docs/tools/consensus.md)** - Get expert opinions from multiple AI models with stance steering
diff --git a/conf/gemini_models.json b/conf/gemini_models.json
index e8275e5..23dfb6c 100644
--- a/conf/gemini_models.json
+++ b/conf/gemini_models.json
@@ -20,7 +20,8 @@ "use_openai_response_api": "Set to true when the model must use the /responses endpoint (reasoning models like GPT-5 Pro). 
Leave false/omit for standard chat completions.", "default_reasoning_effort": "Default reasoning effort level for models that support it (e.g., 'low', 'medium', 'high'). Omit if not applicable.", "description": "Human-readable description of the model", - "intelligence_score": "1-20 human rating used as the primary signal for auto-mode model ordering" + "intelligence_score": "1-20 human rating used as the primary signal for auto-mode model ordering", + "allow_code_generation": "Whether this model can generate and suggest fully working code - complete with functions, files, and detailed implementation instructions - for your AI tool to use right away. Only set this to 'true' for a model more capable than the AI model / CLI you're currently using." } }, "models": [ @@ -44,6 +45,7 @@ "supports_json_mode": true, "supports_images": true, "supports_temperature": true, + "allow_code_generation": true, "max_image_size_mb": 32.0 }, { diff --git a/conf/openai_models.json b/conf/openai_models.json index a7e0674..848fb96 100644 --- a/conf/openai_models.json +++ b/conf/openai_models.json @@ -20,7 +20,8 @@ "use_openai_response_api": "Set to true when the model must use the /responses endpoint (reasoning models like GPT-5 Pro). Leave false/omit for standard chat completions.", "default_reasoning_effort": "Default reasoning effort level for models that support it (e.g., 'low', 'medium', 'high'). Omit if not applicable.", "description": "Human-readable description of the model", - "intelligence_score": "1-20 human rating used as the primary signal for auto-mode model ordering" + "intelligence_score": "1-20 human rating used as the primary signal for auto-mode model ordering", + "allow_code_generation": "Whether this model can generate and suggest fully working code - complete with functions, files, and detailed implementation instructions - for your AI tool to use right away. Only set this to 'true' for a model more capable than the AI model / CLI you're currently using." } }, "models": [ @@ -66,6 +67,7 @@ "max_image_size_mb": 20.0, "use_openai_response_api": true, "default_reasoning_effort": "high", + "allow_code_generation": true, "temperature_constraint": "fixed" }, { diff --git a/conf/openrouter_models.json b/conf/openrouter_models.json index 123c965..aaa1d66 100644 --- a/conf/openrouter_models.json +++ b/conf/openrouter_models.json @@ -19,7 +19,8 @@ "use_openai_response_api": "Set to true when the model must use the /responses endpoint (reasoning models like GPT-5 Pro). Leave false/omit for standard chat completions.", "default_reasoning_effort": "Default reasoning effort level for models that support it (e.g., 'low', 'medium', 'high'). Omit if not applicable.", "description": "Human-readable description of the model", - "intelligence_score": "1-20 human rating used as the primary signal for auto-mode model ordering" + "intelligence_score": "1-20 human rating used as the primary signal for auto-mode model ordering", + "allow_code_generation": "Whether this model can generate and suggest fully working code - complete with functions, files, and detailed implementation instructions - for your AI tool to use right away. Only set this to 'true' for a model more capable than the AI model / CLI you're currently using." 
} }, "models": [ @@ -100,6 +101,7 @@ "supports_function_calling": true, "supports_images": true, "max_image_size_mb": 20.0, + "allow_code_generation": true, "description": "Google's Gemini 2.5 Pro via OpenRouter with vision", "intelligence_score": 18 }, @@ -310,8 +312,9 @@ "temperature_constraint": "fixed", "use_openai_response_api": true, "default_reasoning_effort": "high", + "allow_code_generation": true, "description": "GPT-5 Pro - Advanced reasoning model with highest quality responses (text+image input, text output only)", - "intelligence_score": 17 + "intelligence_score": 18 }, { "model_name": "openai/gpt-5-codex", diff --git a/docs/adding_tools.md b/docs/adding_tools.md index e75be97..480a41e 100644 --- a/docs/adding_tools.md +++ b/docs/adding_tools.md @@ -52,6 +52,9 @@ from tools.simple.base import SimpleTool class ChatRequest(ToolRequest): prompt: str = Field(..., description="Your question or idea.") files: list[str] | None = Field(default_factory=list) + working_directory: str = Field( + ..., description="Absolute full directory path where the assistant AI can save generated code for implementation." + ) class ChatTool(SimpleTool): def get_name(self) -> str: # required by BaseTool @@ -67,10 +70,17 @@ class ChatTool(SimpleTool): return ChatRequest def get_tool_fields(self) -> dict[str, dict[str, object]]: - return {"prompt": {"type": "string", "description": "Your question."}, "files": SimpleTool.FILES_FIELD} + return { + "prompt": {"type": "string", "description": "Your question."}, + "files": SimpleTool.FILES_FIELD, + "working_directory": { + "type": "string", + "description": "Absolute full directory path where the assistant AI can save generated code for implementation.", + }, + } def get_required_fields(self) -> list[str]: - return ["prompt"] + return ["prompt", "working_directory"] async def prepare_prompt(self, request: ChatRequest) -> str: return self.prepare_chat_style_prompt(request) diff --git a/docs/configuration.md b/docs/configuration.md index a27ce52..95d54b0 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -75,7 +75,7 @@ DEFAULT_MODEL=auto # Claude picks best model for each task (recommended) - `conf/dial_models.json` – DIAL aggregation catalogue (`DIAL_MODELS_CONFIG_PATH`) - `conf/custom_models.json` – Custom/OpenAI-compatible endpoints (`CUSTOM_MODELS_CONFIG_PATH`) - Each JSON file documents the allowed fields via its `_README` block and controls model aliases, capability limits, and feature flags. Edit these files (or point the matching `*_MODELS_CONFIG_PATH` variable to your own copy) when you want to adjust context windows, enable JSON mode, or expose additional aliases without touching Python code. + Each JSON file documents the allowed fields via its `_README` block and controls model aliases, capability limits, and feature flags (including `allow_code_generation`). Edit these files (or point the matching `*_MODELS_CONFIG_PATH` variable to your own copy) when you want to adjust context windows, enable JSON mode, enable structured code generation, or expose additional aliases without touching Python code. 
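+
+   For example, to enable code generation for a single model from your own copy of a catalogue, copy the JSON file, point the matching `*_MODELS_CONFIG_PATH` variable at the copy, and set the flag on the entry you trust. A minimal sketch of the relevant fragment is shown below — keep all of the entry's other capability fields exactly as shipped; only the flag changes:
+
+   ```json
+   {
+     "models": [
+       {
+         "model_name": "gemini-2.5-pro",
+         "allow_code_generation": true
+       }
+     ]
+   }
+   ```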
The shipped defaults cover: @@ -87,7 +87,63 @@ DEFAULT_MODEL=auto # Claude picks best model for each task (recommended) | OpenRouter | See `conf/openrouter_models.json` for the continually evolving catalogue | e.g., `opus`, `sonnet`, `flash`, `pro`, `mistral` | | Custom | User-managed entries such as `llama3.2` | Define your own aliases per entry | - > **Tip:** Copy the JSON file you need, customise it, and point the corresponding `*_MODELS_CONFIG_PATH` environment variable to your version. This lets you enable or disable capabilities (JSON mode, function calling, temperature support) without editing Python. + > **Tip:** Copy the JSON file you need, customise it, and point the corresponding `*_MODELS_CONFIG_PATH` environment variable to your version. This lets you enable or disable capabilities (JSON mode, function calling, temperature support, code generation) without editing Python. + +### Code Generation Capability + +**`allow_code_generation` Flag:** + +The `allow_code_generation` capability enables models to generate complete, production-ready implementations in a structured format. When enabled, the `chat` tool will inject special instructions for substantial code generation tasks. + +```json +{ + "model_name": "gpt-5", + "allow_code_generation": true, + ... +} +``` + +**When to Enable:** + +- **Enable for**: Models MORE capable than your primary CLI's model (e.g., GPT-5, GPT-5 Pro when using Claude Code with Sonnet 4.5) +- **Purpose**: Get complete implementations from a more powerful reasoning model that your primary CLI can then review and apply +- **Use case**: Large-scale implementations, major refactoring, complete module creation + +**Important Guidelines:** + +1. Only enable for models significantly more capable than your primary CLI to ensure high-quality generated code +2. The capability triggers structured code output (`` blocks) for substantial implementation requests +3. Minor code changes still use inline code blocks regardless of this setting +4. Generated code is saved to `zen_generated.code` in the user's working directory +5. Your CLI receives instructions to review and apply the generated code systematically + +**Example Configuration:** + +```json +// OpenAI models configuration (conf/openai_models.json) +{ + "models": [ + { + "model_name": "gpt-5", + "allow_code_generation": true, + "intelligence_score": 18, + ... + }, + { + "model_name": "gpt-5-pro", + "allow_code_generation": true, + "intelligence_score": 19, + ... + } + ] +} +``` + +**Typical Workflow:** +1. You ask your AI agent to implement a complex new feature using `chat` with a higher-reasoning model such as **gpt-5-pro** +2. GPT-5-Pro generates structured implementation and shares the complete implementation with Zen +3. Zen saves the code to `zen_generated.code` and asks AI agent to implement the plan +4. AI agent continues from the previous context, reads the file, applies the implementation ### Thinking Mode Configuration diff --git a/docs/tools/chat.md b/docs/tools/chat.md index c94d55c..d063851 100644 --- a/docs/tools/chat.md +++ b/docs/tools/chat.md @@ -39,13 +39,14 @@ word verdict in the end. 
- **Collaborative thinking partner** for your analysis and planning - **Get second opinions** on your designs and approaches - **Brainstorm solutions** and explore alternatives together +- **Structured code generation**: When using GPT-5 Pro or Gemini 2.5 Pro, get complete, production-ready implementations saved to `zen_generated.code` for your CLI to review and apply - **Validate your checklists** and implementation plans - **General development questions** and explanations - **Technology comparisons** and best practices - **Architecture and design discussions** - **File reference support**: `"Use gemini to explain this algorithm with context from algorithm.py"` - **Image support**: Include screenshots, diagrams, UI mockups for visual analysis: `"Chat with gemini about this error dialog screenshot to understand the user experience issue"` -- **Dynamic collaboration**: Gemini can request additional files or context during the conversation if needed for a more thorough response +- **Dynamic collaboration**: Models can request additional files or context during the conversation if needed for a more thorough response - **Web search awareness**: Automatically identifies when online research would help and instructs Claude to perform targeted searches using continuation IDs ## Tool Parameters @@ -54,10 +55,48 @@ word verdict in the end. - `model`: auto|pro|flash|flash-2.0|flashlite|o3|o3-mini|o4-mini|gpt4.1|gpt5|gpt5-mini|gpt5-nano (default: server default) - `files`: Optional files for context (absolute paths) - `images`: Optional images for visual context (absolute paths) +- `working_directory`: **Required** - Absolute directory path where generated code artifacts will be saved - `temperature`: Response creativity (0-1, default 0.5) - `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only) - `continuation_id`: Continue previous conversations +## Structured Code Generation + +When using advanced reasoning models like **GPT-5 Pro** or **Gemini 2.5 Pro**, the chat tool can generate complete, production-ready code implementations in a structured format. + +### How It Works + +1. You ask your AI agent to implement a complex new feature using `chat` with a higher-reasoning model such as **GPT-5 Pro** or **Gemini 2.5 Pro** +2. The model generates structured implementation and shares the complete implementation with Zen +3. Zen saves the code to `zen_generated.code` and asks AI agent to implement the plan +4. AI agent continues from the previous context, reads the file, applies the implementation + +### When Code Generation Activates + +The structured format activates for **substantial implementation work**: +- Creating new features from scratch with multiple files or significant code +- Major refactoring across multiple files or large sections +- Implementing new modules, components, or subsystems +- Large-scale updates affecting substantial portions of the codebase +- Complete rewrites of functions, algorithms, or approaches + +For minor changes (small tweaks, bug fixes, algorithm improvements), the model responds normally with inline code blocks. + +### Example Usage + +``` +chat with gpt-5-pro and ask it to make me a standalone, classic version of the +Pacman game using pygame that I can run from the commandline. Give me a single +script to execute in the end with any / all dependencies setup for me. +Do everything using pygame, we have no external resources / images / audio at +hand. 
Instead of ghosts, it'll be different geometric shapes moving around +in the maze that Pacman can eat (so there are no baddies). Pacman gets to eat +everything including bread-crumbs and large geometric shapes but make me the +classic maze / walls that it navigates within using keyboard arrow keys. +``` + +See the [Configuration Guide](../configuration.md#code-generation-capability) for details on the `allow_code_generation` flag. + ## Usage Examples **Basic Development Chat:** diff --git a/providers/shared/model_capabilities.py b/providers/shared/model_capabilities.py index af01df1..bd862fa 100644 --- a/providers/shared/model_capabilities.py +++ b/providers/shared/model_capabilities.py @@ -28,6 +28,8 @@ class ModelCapabilities: * Tool selection logic inspects attributes such as ``supports_extended_thinking`` or ``context_window`` to choose an appropriate model for a task. + * The ``allow_code_generation`` flag enables structured code generation + in the chat tool for models more capable than the primary CLI. """ provider: ProviderType @@ -52,6 +54,9 @@ class ModelCapabilities: supports_temperature: bool = True use_openai_response_api: bool = False default_reasoning_effort: Optional[str] = None + allow_code_generation: bool = ( + False # Enables structured code generation in chat tool for substantial implementations + ) # Additional attributes max_image_size_mb: float = 0.0 diff --git a/systemprompts/__init__.py b/systemprompts/__init__.py index 1bfe0d7..55f0f02 100644 --- a/systemprompts/__init__.py +++ b/systemprompts/__init__.py @@ -8,6 +8,7 @@ from .codereview_prompt import CODEREVIEW_PROMPT from .consensus_prompt import CONSENSUS_PROMPT from .debug_prompt import DEBUG_ISSUE_PROMPT from .docgen_prompt import DOCGEN_PROMPT +from .generate_code_prompt import GENERATE_CODE_PROMPT from .planner_prompt import PLANNER_PROMPT from .precommit_prompt import PRECOMMIT_PROMPT from .refactor_prompt import REFACTOR_PROMPT @@ -21,6 +22,7 @@ __all__ = [ "CODEREVIEW_PROMPT", "DEBUG_ISSUE_PROMPT", "DOCGEN_PROMPT", + "GENERATE_CODE_PROMPT", "ANALYZE_PROMPT", "CHAT_PROMPT", "CONSENSUS_PROMPT", diff --git a/systemprompts/generate_code_prompt.py b/systemprompts/generate_code_prompt.py new file mode 100644 index 0000000..a34b185 --- /dev/null +++ b/systemprompts/generate_code_prompt.py @@ -0,0 +1,181 @@ +"""System prompt fragment enabling structured code generation exports. + +This prompt is injected into the system prompt for models that have the +'allow_code_generation' capability enabled. It instructs the model to output +complete, working code in a structured format that coding agents can parse +and apply automatically. + +The structured format uses XML-like tags to clearly delineate: +- New files to create () +- Existing files to update () +- Step-by-step instructions for the coding agent + +This enables: +1. Automated code extraction and application +2. Clear separation between instructions and implementation +3. Complete, runnable code without manual edits +4. 
Precise change tracking across multiple files +""" + +GENERATE_CODE_PROMPT = """ +# Structured Code Generation Protocol + +**WHEN TO USE THIS PROTOCOL:** + +Use this structured format ONLY when you are explicitly tasked with substantial code generation, such as: +- Creating new features from scratch with multiple files or significant code and you have been asked to help implement this +- Major refactoring across multiple files or large sections of code and you have been tasked to help do this +- Implementing new modules, components, or subsystems and you have been tasked to help with the implementation +- Large-scale updates affecting substantial portions of the codebase that you have been asked to help implement + +**WHEN NOT TO USE THIS PROTOCOL:** + +Do NOT use this format for minor changes: +- Small tweaks to existing functions or methods (1-20 lines) +- Bug fixes in isolated sections +- Simple algorithm improvements +- Minor refactoring of a single function +- Adding/removing a few lines of code +- Quick parameter adjustments or config changes + +For minor changes: +- Follow the existing instructions provided earlier in your system prompt, such as the CRITICAL LINE NUMBER INSTRUCTIONS. +- Use inline code blocks with proper line number references and direct explanations instead of this structured format. + +**IMPORTANT:** This protocol is for SUBSTANTIAL implementation work when explicitly requested, such as: +- "implement feature X" +- "create module Y" +- "refactor system Z" +- "rewrite the authentication logic" +- "redesign the data processing pipeline" +- "rebuild the algorithm from scratch" +- "convert this approach to use a different pattern" +- "create a complete implementation of..." +- "build out the entire workflow for..." + +If the request is for explanation, analysis, debugging, planning, or discussion WITHOUT substantial code generation, respond normally without this structured format. + +## Core Requirements (for substantial code generation tasks) + +1. **Complete, Working Code**: Every code block must be fully functional without requiring additional edits. Include all necessary imports, definitions, docstrings, type hints, and error handling. + +2. **Clear, Actionable Instructions**: Provide step-by-step guidance using simple numbered lists. Each instruction should map directly to file blocks that follow. + +3. **Structured Output Format**: All generated code MUST be contained within a single `` block using the exact structure defined below. + +4. **Minimal External Commentary**: Keep any text outside the `` block brief. Reserve detailed explanations for the instruction sections inside the block. + +## Required Structure + +Use this exact format (do not improvise tag names or reorder components): + +``` + +[Step-by-step instructions for the coding agent] +1. Create new file [filename] with [description] +2. Update existing file [filename] by [description] +3. 
[Additional steps as needed] + + +[Complete file contents with all necessary components: +- File-level docstring +- All imports (standard library, third-party, local) +- All class/function definitions with complete implementations +- All necessary helper functions +- Inline comments for complex logic +- Type hints where applicable] + + +[Additional instructions for the next file, if needed] + + +[Complete, working code for this file - no partial implementations or placeholders] + + +[Instructions for updating existing files] + + +[Complete replacement code for the modified sections or routines / lines that need updating: +- Full function/method bodies (not just the changed lines) +- Complete class definitions if modifying class methods +- All necessary imports if adding new dependencies +- Preserve existing code structure and style] + + +[If additional files need updates (based on existing code that was shared with you earlier), repeat the UPDATED_EXISTING_FILE block] + + +[Complete code for this file's modifications] + + +[For file deletions, explicitly state in instructions with justification: +"Delete file path/to/obsolete.py - no longer needed because [reason]"] + +``` + +## Critical Rules + +**Completeness:** +- Never output partial code snippets or placeholder comments like "# rest of code here" +- Include complete function/class implementations from start to finish +- Add all required imports at the file level +- Include proper error handling and edge case logic + +**Accuracy:** +- Match the existing codebase indentation style (tabs vs spaces) +- Preserve language-specific formatting conventions +- Include trailing newlines where required by language tooling +- Use correct file paths relative to project root + +**Clarity:** +- Number instructions sequentially (1, 2, 3...) +- Map each instruction to specific file blocks below it +- Explain *why* changes are needed, not just *what* changes +- Highlight any breaking changes or migration steps required + +**Structure:** +- Use `` for files that don't exist yet +- Use `` for modifying existing files +- Place instructions between file blocks to provide context +- Keep the single `` wrapper around everything + +## Special Cases + +**No Changes Needed:** +If the task doesn't require file creation or modification, explicitly state: +"No file changes required. The existing implementation already handles [requirement]." +Do not emit an empty `` block. + +**Configuration Changes:** +If modifying configuration files (JSON, YAML, TOML), include complete file contents with the changes applied, not just the changed lines. + +**Test Files:** +When generating tests, include complete test suites with: +- All necessary test fixtures and setup +- Multiple test cases covering happy path and edge cases +- Proper teardown and cleanup +- Clear test descriptions and assertions + +**Documentation:** +Include docstrings for all public functions, classes, and modules using the project's documentation style (Google, NumPy, Sphinx, etc.). 
+ +## Context Awareness + +**CRITICAL:** Your implementation builds upon the ongoing conversation context: +- All previously shared files, requirements, and constraints remain relevant +- If updating existing code discussed earlier, reference it and preserve unmodified sections +- If the user shared code for improvement, your generated code should build upon it, not replace everything +- The coding agent has full conversation history—your instructions should reference prior discussion as needed + +Your generated code is NOT standalone—it's a continuation of the collaborative session with full context awareness. + +## Remember + +The coding agent depends on this structured format to: +- Parse and extract code automatically +- Apply changes to the correct files within the conversation context +- Validate completeness before execution +- Track modifications across the codebase + +Always prioritize clarity, completeness, correctness, and context awareness over brevity. +""" diff --git a/tests/gemini_cassettes/chat_codegen/gemini25_pro_calculator/mldev.json b/tests/gemini_cassettes/chat_codegen/gemini25_pro_calculator/mldev.json new file mode 100644 index 0000000..774e82b --- /dev/null +++ b/tests/gemini_cassettes/chat_codegen/gemini25_pro_calculator/mldev.json @@ -0,0 +1,133 @@ +{ + "replay_id": "chat_codegen/gemini25_pro_calculator/mldev", + "interactions": [ + { + "request": { + "method": "post", + "url": "{MLDEV_URL_PREFIX}/models/gemini-2.5-pro:generateContent", + "headers": { + "Content-Type": "application/json", + "x-goog-api-key": "{REDACTED}", + "user-agent": "google-genai-sdk/{VERSION_NUMBER} {LANGUAGE_LABEL}/{VERSION_NUMBER}", + "x-goog-api-client": "google-genai-sdk/{VERSION_NUMBER} {LANGUAGE_LABEL}/{VERSION_NUMBER}" + }, + "body_segments": [ + { + "contents": [ + { + "parts": [ + { + "text": "\nYou are a senior engineering thought-partner collaborating with another AI agent. Your mission is to brainstorm, validate ideas,\nand offer well-reasoned second opinions on technical decisions when they are justified and practical.\n\nCRITICAL LINE NUMBER INSTRUCTIONS\nCode is presented with line number markers \"LINE│ code\". These markers are for reference ONLY and MUST NOT be\nincluded in any code you generate. Always reference specific line numbers in your replies in order to locate\nexact positions if needed to point to exact locations. Include a very short code excerpt alongside for clarity.\nInclude context_start_text and context_end_text as backup references. Never include \"LINE│\" markers in generated code\nsnippets.\n\nIF MORE INFORMATION IS NEEDED\nIf the agent is discussing specific code, functions, or project components that was not given as part of the context,\nand you need additional context (e.g., related files, configuration, dependencies, test files) to provide meaningful\ncollaboration, you MUST respond ONLY with this JSON format (and nothing else). 
Do NOT ask for the same file you've been\nprovided unless for some reason its content is missing or incomplete:\n{\n \"status\": \"files_required_to_continue\",\n \"mandatory_instructions\": \"\",\n \"files_needed\": [\"[file name here]\", \"[or some folder/]\"]\n}\n\nSCOPE & FOCUS\n• Ground every suggestion in the project's current tech stack, languages, frameworks, and constraints.\n• Recommend new technologies or patterns ONLY when they provide clearly superior outcomes with minimal added complexity.\n• Avoid speculative, over-engineered, or unnecessarily abstract designs that exceed current project goals or needs.\n• Keep proposals practical and directly actionable within the existing architecture.\n• Overengineering is an anti-pattern — avoid solutions that introduce unnecessary abstraction, indirection, or\n configuration in anticipation of complexity that does not yet exist, is not clearly justified by the current scope,\n and may not arise in the foreseeable future.\n\nCOLLABORATION APPROACH\n1. Treat the collaborating agent as an equally senior peer. Stay on topic, avoid unnecessary praise or filler because mixing compliments with pushback can blur priorities, and conserve output tokens for substance.\n2. Engage deeply with the agent's input – extend, refine, and explore alternatives ONLY WHEN they are well-justified and materially beneficial.\n3. Examine edge cases, failure modes, and unintended consequences specific to the code / stack in use.\n4. Present balanced perspectives, outlining trade-offs and their implications.\n5. Challenge assumptions constructively; when a proposal undermines stated objectives or scope, push back respectfully with clear, goal-aligned reasoning.\n6. Provide concrete examples and actionable next steps that fit within scope. Prioritize direct, achievable outcomes.\n7. Ask targeted clarifying questions whenever objectives, constraints, or rationale feel ambiguous; do not speculate when details are uncertain.\n\nBRAINSTORMING GUIDELINES\n• Offer multiple viable strategies ONLY WHEN clearly beneficial within the current environment.\n• Suggest creative solutions that operate within real-world constraints, and avoid proposing major shifts unless truly warranted.\n• Surface pitfalls early, particularly those tied to the chosen frameworks, languages, design direction or choice.\n• Evaluate scalability, maintainability, and operational realities inside the existing architecture and current\nframework.\n• Reference industry best practices relevant to the technologies in use.\n• Communicate concisely and technically, assuming an experienced engineering audience.\n\nREMEMBER\nAct as a peer, not a lecturer. Avoid overcomplicating. 
Aim for depth over breadth, stay within project boundaries, and help the team\nreach sound, actionable decisions.\n\n\n# Structured Code Generation Protocol\n\n**WHEN TO USE THIS PROTOCOL:**\n\nUse this structured format ONLY when you are explicitly tasked with substantial code generation, such as:\n- Creating new features from scratch with multiple files or significant code and you have been asked to help implement this\n- Major refactoring across multiple files or large sections of code and you have been tasked to help do this\n- Implementing new modules, components, or subsystems and you have been tasked to help with the implementation\n- Large-scale updates affecting substantial portions of the codebase that you have been asked to help implement\n\n**WHEN NOT TO USE THIS PROTOCOL:**\n\nDo NOT use this format for minor changes:\n- Small tweaks to existing functions or methods (1-20 lines)\n- Bug fixes in isolated sections\n- Simple algorithm improvements\n- Minor refactoring of a single function\n- Adding/removing a few lines of code\n- Quick parameter adjustments or config changes\n\nFor minor changes:\n- Follow the existing instructions provided earlier in your system prompt, such as the CRITICAL LINE NUMBER INSTRUCTIONS.\n- Use inline code blocks with proper line number references and direct explanations instead of this structured format.\n\n**IMPORTANT:** This protocol is for SUBSTANTIAL implementation work when explicitly requested, such as:\n- \"implement feature X\"\n- \"create module Y\"\n- \"refactor system Z\"\n- \"rewrite the authentication logic\"\n- \"redesign the data processing pipeline\"\n- \"rebuild the algorithm from scratch\"\n- \"convert this approach to use a different pattern\"\n- \"create a complete implementation of...\"\n- \"build out the entire workflow for...\"\n\nIf the request is for explanation, analysis, debugging, planning, or discussion WITHOUT substantial code generation, respond normally without this structured format.\n\n## Core Requirements (for substantial code generation tasks)\n\n1. **Complete, Working Code**: Every code block must be fully functional without requiring additional edits. Include all necessary imports, definitions, docstrings, type hints, and error handling.\n\n2. **Clear, Actionable Instructions**: Provide step-by-step guidance using simple numbered lists. Each instruction should map directly to file blocks that follow.\n\n3. **Structured Output Format**: All generated code MUST be contained within a single `` block using the exact structure defined below.\n\n4. **Minimal External Commentary**: Keep any text outside the `` block brief. Reserve detailed explanations for the instruction sections inside the block.\n\n## Required Structure\n\nUse this exact format (do not improvise tag names or reorder components):\n\n```\n\n[Step-by-step instructions for the coding agent]\n1. Create new file [filename] with [description]\n2. Update existing file [filename] by [description]\n3. 
[Additional steps as needed]\n\n\n[Complete file contents with all necessary components:\n- File-level docstring\n- All imports (standard library, third-party, local)\n- All class/function definitions with complete implementations\n- All necessary helper functions\n- Inline comments for complex logic\n- Type hints where applicable]\n\n\n[Additional instructions for the next file, if needed]\n\n\n[Complete, working code for this file - no partial implementations or placeholders]\n\n\n[Instructions for updating existing files]\n\n\n[Complete replacement code for the modified sections or routines / lines that need updating:\n- Full function/method bodies (not just the changed lines)\n- Complete class definitions if modifying class methods\n- All necessary imports if adding new dependencies\n- Preserve existing code structure and style]\n\n\n[If additional files need updates (based on existing code that was shared with you earlier), repeat the UPDATED_EXISTING_FILE block]\n\n\n[Complete code for this file's modifications]\n\n\n[For file deletions, explicitly state in instructions with justification:\n\"Delete file path/to/obsolete.py - no longer needed because [reason]\"]\n\n```\n\n## Critical Rules\n\n**Completeness:**\n- Never output partial code snippets or placeholder comments like \"# rest of code here\"\n- Include complete function/class implementations from start to finish\n- Add all required imports at the file level\n- Include proper error handling and edge case logic\n\n**Accuracy:**\n- Match the existing codebase indentation style (tabs vs spaces)\n- Preserve language-specific formatting conventions\n- Include trailing newlines where required by language tooling\n- Use correct file paths relative to project root\n\n**Clarity:**\n- Number instructions sequentially (1, 2, 3...)\n- Map each instruction to specific file blocks below it\n- Explain *why* changes are needed, not just *what* changes\n- Highlight any breaking changes or migration steps required\n\n**Structure:**\n- Use `` for files that don't exist yet\n- Use `` for modifying existing files\n- Place instructions between file blocks to provide context\n- Keep the single `` wrapper around everything\n\n## Special Cases\n\n**No Changes Needed:**\nIf the task doesn't require file creation or modification, explicitly state:\n\"No file changes required. 
The existing implementation already handles [requirement].\"\nDo not emit an empty `` block.\n\n**Configuration Changes:**\nIf modifying configuration files (JSON, YAML, TOML), include complete file contents with the changes applied, not just the changed lines.\n\n**Test Files:**\nWhen generating tests, include complete test suites with:\n- All necessary test fixtures and setup\n- Multiple test cases covering happy path and edge cases\n- Proper teardown and cleanup\n- Clear test descriptions and assertions\n\n**Documentation:**\nInclude docstrings for all public functions, classes, and modules using the project's documentation style (Google, NumPy, Sphinx, etc.).\n\n## Context Awareness\n\n**CRITICAL:** Your implementation builds upon the ongoing conversation context:\n- All previously shared files, requirements, and constraints remain relevant\n- If updating existing code discussed earlier, reference it and preserve unmodified sections\n- If the user shared code for improvement, your generated code should build upon it, not replace everything\n- The coding agent has full conversation history—your instructions should reference prior discussion as needed\n\nYour generated code is NOT standalone—it's a continuation of the collaborative session with full context awareness.\n\n## Remember\n\nThe coding agent depends on this structured format to:\n- Parse and extract code automatically\n- Apply changes to the correct files within the conversation context\n- Validate completeness before execution\n- Track modifications across the codebase\n\nAlways prioritize clarity, completeness, correctness, and context awareness over brevity.\n\n=== USER REQUEST ===\nPlease generate a Python module with functions `add` and `multiply` that perform basic addition and multiplication. Produce the response using the structured format so the assistant can apply the files directly.\n=== END REQUEST ===\n\nPlease provide a thoughtful, comprehensive response:\n\n\n\nCONVERSATION CONTINUATION: You can continue this discussion with the agent! (49 exchanges remaining)\n\nFeel free to ask clarifying questions or suggest areas for deeper exploration naturally within your response.\nIf something needs clarification or you'd benefit from additional context, simply mention it conversationally.\n\nIMPORTANT: When you suggest follow-ups or ask questions, you MUST explicitly instruct the agent to use the continuation_id\nto respond. Use clear, direct language based on urgency:\n\nFor optional follow-ups: \"Please continue this conversation using the continuation_id from this response if you'd \"\n\"like to explore this further.\"\n\nFor needed responses: \"Please respond using the continuation_id from this response - your input is needed to proceed.\"\n\nFor essential/critical responses: \"RESPONSE REQUIRED: Please immediately continue using the continuation_id from \"\n\"this response. Cannot proceed without your clarification/input.\"\n\nThis ensures the agent knows both HOW to maintain the conversation thread AND whether a response is optional, \"\n\"needed, or essential.\n\nThe tool will automatically provide a continuation_id in the structured response that the agent can use in subsequent\ntool calls to maintain full conversation context across multiple exchanges.\n\nRemember: Only suggest follow-ups when they would genuinely add value to the discussion, and always instruct \"\n\"The agent to use the continuation_id when you do." 
+ } + ] + } + ], + "generationConfig": { + "temperature": 0.5, + "candidateCount": 1, + "thinkingConfig": { + "thinkingBudget": 10813 + } + } + } + ] + }, + "response": { + "status_code": 200, + "headers": { + "content-type": "application/json; charset=UTF-8", + "vary": "Origin, X-Origin, Referer", + "content-encoding": "gzip", + "date": "Tue, 07 Oct 2025 14:34:26 GMT", + "server": "scaffolding on HTTPServer2", + "x-xss-protection": "0", + "x-frame-options": "SAMEORIGIN", + "x-content-type-options": "nosniff", + "server-timing": "gfet4t7; dur=12286", + "alt-svc": "h3=\":443\"; ma=2592000,h3-29=\":443\"; ma=2592000", + "transfer-encoding": "chunked" + }, + "body_segments": [ + { + "candidates": [ + { + "content": { + "parts": [ + { + "text": "\nHere is the implementation for the requested Python module. I've structured this as a proper Python package for better organization and scalability.\n\n1. Create a new file `calculator/__init__.py`. This file makes the `calculator` directory a Python package and exposes the `add` and `multiply` functions for convenient importing (e.g., `from calculator import add`).\n2. Create a new file `calculator/operations.py` to house the core arithmetic functions. This separation of concerns keeps the logic clean.\n\n\n\"\"\"A simple calculator package.\"\"\"\n\nfrom .operations import add, multiply\n\n__all__ = [\"add\", \"multiply\"]\n\n\n\n\"\"\"\nCore arithmetic operations.\n\nThis module provides basic functions for performing mathematical calculations.\n\"\"\"\n\nfrom typing import Union\n\n# Define a type alias for numeric types for clarity\nNumeric = Union[int, float]\n\n\ndef add(a: Numeric, b: Numeric) -> Numeric:\n \"\"\"\n Calculate the sum of two numbers.\n\n Args:\n a: The first number (integer or float).\n b: The second number (integer or float).\n\n Returns:\n The sum of a and b.\n \"\"\"\n return a + b\n\n\ndef multiply(a: Numeric, b: Numeric) -> Numeric:\n \"\"\"\n Calculate the product of two numbers.\n\n Args:\n a: The first number (integer or float).\n b: The second number (integer or float).\n\n Returns:\n The product of a and b.\n \"\"\"\n return a * b\n\n" + } + ], + "role": "model" + }, + "finishReason": "STOP", + "index": 0 + } + ], + "usageMetadata": { + "promptTokenCount": 2600, + "candidatesTokenCount": 379, + "totalTokenCount": 3879, + "promptTokensDetails": [ + { + "modality": "TEXT", + "tokenCount": 2600 + } + ], + "thoughtsTokenCount": 900 + }, + "modelVersion": "gemini-2.5-pro", + "responseId": "8iTlaM64EdCwxN8PwYfx0Qo" + } + ], + "byte_segments": [], + "sdk_response_segments": [ + { + "sdk_http_response": { + "headers": { + "content-type": "application/json; charset=UTF-8", + "vary": "Origin, X-Origin, Referer", + "content-encoding": "gzip", + "date": "Tue, 07 Oct 2025 14:34:26 GMT", + "server": "scaffolding on HTTPServer2", + "x-xss-protection": "0", + "x-frame-options": "SAMEORIGIN", + "x-content-type-options": "nosniff", + "server-timing": "gfet4t7; dur=12286", + "alt-svc": "h3=\":443\"; ma=2592000,h3-29=\":443\"; ma=2592000", + "transfer-encoding": "chunked" + } + }, + "candidates": [ + { + "content": { + "parts": [ + { + "text": "\nHere is the implementation for the requested Python module. I've structured this as a proper Python package for better organization and scalability.\n\n1. Create a new file `calculator/__init__.py`. This file makes the `calculator` directory a Python package and exposes the `add` and `multiply` functions for convenient importing (e.g., `from calculator import add`).\n2. 
Create a new file `calculator/operations.py` to house the core arithmetic functions. This separation of concerns keeps the logic clean.\n\n\n\"\"\"A simple calculator package.\"\"\"\n\nfrom .operations import add, multiply\n\n__all__ = [\"add\", \"multiply\"]\n\n\n\n\"\"\"\nCore arithmetic operations.\n\nThis module provides basic functions for performing mathematical calculations.\n\"\"\"\n\nfrom typing import Union\n\n# Define a type alias for numeric types for clarity\nNumeric = Union[int, float]\n\n\ndef add(a: Numeric, b: Numeric) -> Numeric:\n \"\"\"\n Calculate the sum of two numbers.\n\n Args:\n a: The first number (integer or float).\n b: The second number (integer or float).\n\n Returns:\n The sum of a and b.\n \"\"\"\n return a + b\n\n\ndef multiply(a: Numeric, b: Numeric) -> Numeric:\n \"\"\"\n Calculate the product of two numbers.\n\n Args:\n a: The first number (integer or float).\n b: The second number (integer or float).\n\n Returns:\n The product of a and b.\n \"\"\"\n return a * b\n\n" + } + ], + "role": "model" + }, + "finish_reason": "STOP", + "index": 0 + } + ], + "model_version": "gemini-2.5-pro", + "response_id": "8iTlaM64EdCwxN8PwYfx0Qo", + "usage_metadata": { + "candidates_token_count": 379, + "prompt_token_count": 2600, + "prompt_tokens_details": [ + { + "modality": "TEXT", + "token_count": 2600 + } + ], + "thoughts_token_count": 900, + "total_token_count": 3879 + } + } + ] + } + } + ] +} \ No newline at end of file diff --git a/tests/test_auto_mode.py b/tests/test_auto_mode.py index da104df..4434a4e 100644 --- a/tests/test_auto_mode.py +++ b/tests/test_auto_mode.py @@ -137,7 +137,7 @@ class TestAutoMode: importlib.reload(config) @pytest.mark.asyncio - async def test_auto_mode_requires_model_parameter(self): + async def test_auto_mode_requires_model_parameter(self, tmp_path): """Test that auto mode enforces model parameter""" # Save original original = os.environ.get("DEFAULT_MODEL", "") @@ -154,7 +154,7 @@ class TestAutoMode: # Mock the provider to avoid real API calls with patch.object(tool, "get_model_provider"): # Execute without model parameter - result = await tool.execute({"prompt": "Test prompt"}) + result = await tool.execute({"prompt": "Test prompt", "working_directory": str(tmp_path)}) # Should get error assert len(result) == 1 diff --git a/tests/test_auto_mode_comprehensive.py b/tests/test_auto_mode_comprehensive.py index 39895db..376fbf8 100644 --- a/tests/test_auto_mode_comprehensive.py +++ b/tests/test_auto_mode_comprehensive.py @@ -200,7 +200,7 @@ class TestAutoModeComprehensive: assert tool.get_model_category() == expected_category @pytest.mark.asyncio - async def test_auto_mode_with_gemini_only_uses_correct_models(self): + async def test_auto_mode_with_gemini_only_uses_correct_models(self, tmp_path): """Test that auto mode with only Gemini uses flash for fast tools and pro for reasoning tools.""" provider_config = { @@ -234,9 +234,13 @@ class TestAutoModeComprehensive: ) with patch.object(ModelProviderRegistry, "get_provider_for_model", return_value=mock_provider): + workdir = tmp_path / "chat_artifacts" + workdir.mkdir(parents=True, exist_ok=True) # Test ChatTool (FAST_RESPONSE) - should prefer flash chat_tool = ChatTool() - await chat_tool.execute({"prompt": "test", "model": "auto"}) # This should trigger auto selection + await chat_tool.execute( + {"prompt": "test", "model": "auto", "working_directory": str(workdir)} + ) # This should trigger auto selection # In auto mode, the tool should get an error requiring model selection # but the 
suggested model should be flash @@ -355,7 +359,7 @@ class TestAutoModeComprehensive: # would show models from all providers when called @pytest.mark.asyncio - async def test_auto_mode_model_parameter_required_error(self): + async def test_auto_mode_model_parameter_required_error(self, tmp_path): """Test that auto mode properly requires model parameter and suggests correct model.""" provider_config = { @@ -384,9 +388,12 @@ class TestAutoModeComprehensive: # Test with ChatTool (FAST_RESPONSE category) chat_tool = ChatTool() + workdir = tmp_path / "chat_artifacts" + workdir.mkdir(parents=True, exist_ok=True) result = await chat_tool.execute( { - "prompt": "test" + "prompt": "test", + "working_directory": str(workdir), # Note: no model parameter provided in auto mode } ) @@ -508,7 +515,7 @@ class TestAutoModeComprehensive: assert fast_response is not None @pytest.mark.asyncio - async def test_actual_model_name_resolution_in_auto_mode(self): + async def test_actual_model_name_resolution_in_auto_mode(self, tmp_path): """Test that when a model is selected in auto mode, the tool executes successfully.""" provider_config = { @@ -547,7 +554,11 @@ class TestAutoModeComprehensive: with patch.object(ModelProviderRegistry, "get_provider_for_model", return_value=mock_provider): chat_tool = ChatTool() - result = await chat_tool.execute({"prompt": "test", "model": "flash"}) # Use alias in auto mode + workdir = tmp_path / "chat_artifacts" + workdir.mkdir(parents=True, exist_ok=True) + result = await chat_tool.execute( + {"prompt": "test", "model": "flash", "working_directory": str(workdir)} + ) # Use alias in auto mode # Should succeed with proper model resolution assert len(result) == 1 diff --git a/tests/test_chat_codegen_integration.py b/tests/test_chat_codegen_integration.py new file mode 100644 index 0000000..07bb91c --- /dev/null +++ b/tests/test_chat_codegen_integration.py @@ -0,0 +1,113 @@ +"""Integration test for Chat tool code generation with Gemini 2.5 Pro. + +This test uses the Google Gemini SDK's built-in record/replay support. To refresh the +cassette, delete the existing JSON file under +``tests/gemini_cassettes/chat_codegen/gemini25_pro_calculator/mldev.json`` and run: + +``` +GEMINI_API_KEY= pytest tests/test_chat_codegen_integration.py::test_chat_codegen_saves_file +``` + +The test will automatically record a new interaction when the cassette is missing and +the environment variable `GEMINI_API_KEY` is set to a valid key. +""" + +from __future__ import annotations + +import json +import os +from pathlib import Path + +import pytest + +from providers.gemini import GeminiModelProvider +from providers.registry import ModelProviderRegistry, ProviderType +from tools.chat import ChatTool + +REPLAYS_ROOT = Path(__file__).parent / "gemini_cassettes" +CASSETTE_DIR = REPLAYS_ROOT / "chat_codegen" +CASSETTE_PATH = CASSETTE_DIR / "gemini25_pro_calculator" / "mldev.json" +CASSETTE_REPLAY_ID = "chat_codegen/gemini25_pro_calculator/mldev" + + +@pytest.mark.asyncio +@pytest.mark.no_mock_provider +async def test_chat_codegen_saves_file(monkeypatch, tmp_path): + """Ensure Gemini 2.5 Pro responses create zen_generated.code when code is emitted.""" + + CASSETTE_PATH.parent.mkdir(parents=True, exist_ok=True) + + recording_mode = not CASSETTE_PATH.exists() + gemini_key = os.getenv("GEMINI_API_KEY", "") + + if recording_mode: + if not gemini_key or gemini_key.startswith("dummy"): + pytest.skip("Cassette missing and GEMINI_API_KEY not configured. 
Provide a real key to record.") + client_mode = "record" + else: + gemini_key = "dummy-key-for-replay" + client_mode = "replay" + + with monkeypatch.context() as m: + m.setenv("GEMINI_API_KEY", gemini_key) + m.setenv("DEFAULT_MODEL", "auto") + m.setenv("GOOGLE_ALLOWED_MODELS", "gemini-2.5-pro") + m.setenv("GOOGLE_GENAI_CLIENT_MODE", client_mode) + m.setenv("GOOGLE_GENAI_REPLAYS_DIRECTORY", str(REPLAYS_ROOT)) + m.setenv("GOOGLE_GENAI_REPLAY_ID", CASSETTE_REPLAY_ID) + + # Clear other provider keys to avoid unintended routing + for key in ["OPENAI_API_KEY", "XAI_API_KEY", "OPENROUTER_API_KEY", "CUSTOM_API_KEY"]: + m.delenv(key, raising=False) + + ModelProviderRegistry.reset_for_testing() + ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider) + + working_dir = tmp_path / "codegen" + working_dir.mkdir() + preexisting = working_dir / "zen_generated.code" + preexisting.write_text("stale contents", encoding="utf-8") + + chat_tool = ChatTool() + prompt = ( + "Please generate a Python module with functions `add` and `multiply` that perform" + " basic addition and multiplication. Produce the response using the structured" + " format so the assistant can apply the files directly." + ) + + result = await chat_tool.execute( + { + "prompt": prompt, + "model": "gemini-2.5-pro", + "working_directory": str(working_dir), + } + ) + + provider = ModelProviderRegistry.get_provider_for_model("gemini-2.5-pro") + if provider is not None: + try: + provider.client.close() + except AttributeError: + pass + + # Reset restriction service cache to avoid leaking allowed-model config + try: + from utils import model_restrictions + + model_restrictions._restriction_service = None # type: ignore[attr-defined] + except Exception: + pass + + assert result and result[0].type == "text" + payload = json.loads(result[0].text) + assert payload["status"] in {"success", "continuation_available"} + + artifact_path = working_dir / "zen_generated.code" + assert artifact_path.exists() + saved = artifact_path.read_text() + assert "" in saved + assert " str: @pytest.mark.asyncio @pytest.mark.no_mock_provider -async def test_chat_cross_model_continuation(monkeypatch): +async def test_chat_cross_model_continuation(monkeypatch, tmp_path): """Verify continuation across Gemini then OpenAI using recorded interactions.""" env_updates = { @@ -115,10 +115,13 @@ async def test_chat_cross_model_continuation(monkeypatch): m.setattr(conversation_memory.uuid, "uuid4", lambda: FIXED_THREAD_ID) chat_tool = ChatTool() + working_directory = str(tmp_path) + step1_args = { "prompt": "Pick a number between 1 and 10 and respond with JUST that number.", "model": "gemini-2.5-flash", "temperature": 0.2, + "working_directory": working_directory, } step1_result = await chat_tool.execute(step1_args) @@ -183,6 +186,7 @@ async def test_chat_cross_model_continuation(monkeypatch): "model": "gpt-5", "continuation_id": continuation_id, "temperature": 0.2, + "working_directory": working_directory, } step2_result = await chat_tool.execute(step2_args) diff --git a/tests/test_chat_openai_integration.py b/tests/test_chat_openai_integration.py index a493702..89bc2e3 100644 --- a/tests/test_chat_openai_integration.py +++ b/tests/test_chat_openai_integration.py @@ -23,7 +23,7 @@ CASSETTE_CONTINUATION_PATH = CASSETTE_DIR / "chat_gpt5_continuation.json" @pytest.mark.asyncio @pytest.mark.no_mock_provider -async def test_chat_auto_mode_with_openai(monkeypatch): +async def test_chat_auto_mode_with_openai(monkeypatch, tmp_path): """Ensure ChatTool in auto 
mode selects gpt-5 via OpenAI and returns a valid response.""" # Prepare environment so only OpenAI is available in auto mode env_updates = { @@ -63,10 +63,12 @@ async def test_chat_auto_mode_with_openai(monkeypatch): # Execute ChatTool request targeting gpt-5 directly (server normally resolves auto→model) chat_tool = ChatTool() + working_directory = str(tmp_path) arguments = { "prompt": "Use chat with gpt5 and ask how far the moon is from earth.", "model": "gpt-5", "temperature": 1.0, + "working_directory": working_directory, } result = await chat_tool.execute(arguments) @@ -87,7 +89,7 @@ async def test_chat_auto_mode_with_openai(monkeypatch): @pytest.mark.asyncio @pytest.mark.no_mock_provider -async def test_chat_openai_continuation(monkeypatch): +async def test_chat_openai_continuation(monkeypatch, tmp_path): """Verify continuation_id workflow against gpt-5 using recorded OpenAI responses.""" env_updates = { @@ -126,12 +128,14 @@ async def test_chat_openai_continuation(monkeypatch): m.setattr(conversation_memory.uuid, "uuid4", lambda: fixed_thread_id) chat_tool = ChatTool() + working_directory = str(tmp_path) # First message: obtain continuation_id first_args = { "prompt": "In one word, which sells better: iOS app or macOS app?", "model": "gpt-5", "temperature": 1.0, + "working_directory": working_directory, } first_result = await chat_tool.execute(first_args) @@ -152,6 +156,7 @@ async def test_chat_openai_continuation(monkeypatch): "model": "gpt-5", "continuation_id": continuation_id, "temperature": 1.0, + "working_directory": working_directory, } second_result = await chat_tool.execute(second_args) diff --git a/tests/test_chat_simple.py b/tests/test_chat_simple.py index fd866a8..ad86a51 100644 --- a/tests/test_chat_simple.py +++ b/tests/test_chat_simple.py @@ -38,12 +38,14 @@ class TestChatTool: # Required fields assert "prompt" in schema["required"] + assert "working_directory" in schema["required"] # Properties properties = schema["properties"] assert "prompt" in properties assert "files" in properties assert "images" in properties + assert "working_directory" in properties def test_request_model_validation(self): """Test that the request model validates correctly""" @@ -54,6 +56,7 @@ class TestChatTool: "images": ["test.png"], "model": "anthropic/claude-opus-4.1", "temperature": 0.7, + "working_directory": "/tmp", # Dummy absolute path } request = ChatRequest(**request_data) @@ -62,6 +65,7 @@ class TestChatTool: assert request.images == ["test.png"] assert request.model == "anthropic/claude-opus-4.1" assert request.temperature == 0.7 + assert request.working_directory == "/tmp" def test_required_fields(self): """Test that required fields are enforced""" @@ -69,7 +73,7 @@ class TestChatTool: from pydantic import ValidationError with pytest.raises(ValidationError): - ChatRequest(model="anthropic/claude-opus-4.1") + ChatRequest(model="anthropic/claude-opus-4.1", working_directory="/tmp") def test_model_availability(self): """Test that model availability works""" @@ -96,7 +100,7 @@ class TestChatTool: @pytest.mark.asyncio async def test_prompt_preparation(self): """Test that prompt preparation works correctly""" - request = ChatRequest(prompt="Test prompt", files=[]) + request = ChatRequest(prompt="Test prompt", files=[], working_directory="/tmp") # Mock the system prompt and file handling with patch.object(self.tool, "get_system_prompt", return_value="System prompt"): @@ -113,7 +117,7 @@ class TestChatTool: def test_response_formatting(self): """Test that response formatting works 
correctly""" response = "Test response content" - request = ChatRequest(prompt="Test") + request = ChatRequest(prompt="Test", working_directory="/tmp") formatted = self.tool.format_response(response, request) @@ -146,6 +150,7 @@ class TestChatTool: required_fields = self.tool.get_required_fields() assert "prompt" in required_fields + assert "working_directory" in required_fields class TestChatRequestModel: @@ -160,10 +165,11 @@ class TestChatRequestModel: assert "context" in CHAT_FIELD_DESCRIPTIONS["prompt"] assert "full-paths" in CHAT_FIELD_DESCRIPTIONS["files"] or "absolute" in CHAT_FIELD_DESCRIPTIONS["files"] assert "visual context" in CHAT_FIELD_DESCRIPTIONS["images"] + assert "directory" in CHAT_FIELD_DESCRIPTIONS["working_directory"].lower() def test_default_values(self): """Test that default values work correctly""" - request = ChatRequest(prompt="Test") + request = ChatRequest(prompt="Test", working_directory="/tmp") assert request.prompt == "Test" assert request.files == [] # Should default to empty list @@ -173,7 +179,7 @@ class TestChatRequestModel: """Test that ChatRequest properly inherits from ToolRequest""" from tools.shared.base_models import ToolRequest - request = ChatRequest(prompt="Test") + request = ChatRequest(prompt="Test", working_directory="/tmp") assert isinstance(request, ToolRequest) # Should have inherited fields diff --git a/tests/test_conversation_continuation_integration.py b/tests/test_conversation_continuation_integration.py index 153bd16..23a623a 100644 --- a/tests/test_conversation_continuation_integration.py +++ b/tests/test_conversation_continuation_integration.py @@ -5,7 +5,7 @@ from utils.conversation_memory import get_thread from utils.storage_backend import get_storage_backend -def test_first_response_persisted_in_conversation_history(): +def test_first_response_persisted_in_conversation_history(tmp_path): """Ensure the assistant's initial reply is stored for newly created threads.""" # Clear in-memory storage to avoid cross-test contamination @@ -13,7 +13,7 @@ def test_first_response_persisted_in_conversation_history(): storage._store.clear() # type: ignore[attr-defined] tool = ChatTool() - request = ChatRequest(prompt="First question?", model="local-llama") + request = ChatRequest(prompt="First question?", model="local-llama", working_directory=str(tmp_path)) response_text = "Here is the initial answer." 
# Mimic the first tool invocation (no continuation_id supplied)
diff --git a/tests/test_directory_expansion_tracking.py b/tests/test_directory_expansion_tracking.py
index 87e72fe..30cd219 100644
--- a/tests/test_directory_expansion_tracking.py
+++ b/tests/test_directory_expansion_tracking.py
@@ -91,6 +91,7 @@ def helper_function():
             "prompt": "Analyze this codebase structure",
             "files": [directory],  # Directory path, not individual files
             "model": "flash",
+            "working_directory": directory,
         }
 
         # Execute the tool
@@ -168,6 +169,7 @@ def helper_function():
             "files": [directory],  # Same directory again
             "model": "flash",
             "continuation_id": thread_id,
+            "working_directory": directory,
         }
 
         # Mock to capture file filtering behavior
@@ -299,6 +301,7 @@ def helper_function():
             "prompt": "Analyze this code",
             "files": [directory],
             "model": "flash",
+            "working_directory": directory,
         }
 
         result = await tool.execute(request_args)
diff --git a/tests/test_large_prompt_handling.py b/tests/test_large_prompt_handling.py
index 8bee457..c256ee7 100644
--- a/tests/test_large_prompt_handling.py
+++ b/tests/test_large_prompt_handling.py
@@ -56,7 +56,11 @@ class TestLargePromptHandling:
     async def test_chat_large_prompt_detection(self, large_prompt):
         """Test that chat tool detects large prompts."""
         tool = ChatTool()
-        result = await tool.execute({"prompt": large_prompt})
+        temp_dir = tempfile.mkdtemp()
+        try:
+            result = await tool.execute({"prompt": large_prompt, "working_directory": temp_dir})
+        finally:
+            shutil.rmtree(temp_dir, ignore_errors=True)
 
         assert len(result) == 1
         assert isinstance(result[0], TextContent)
@@ -73,9 +78,16 @@ class TestLargePromptHandling:
         """Test that chat tool works normally with regular prompts."""
         tool = ChatTool()
 
+        temp_dir = tempfile.mkdtemp()
+
         # This test runs in the test environment which uses dummy keys
         # The chat tool will return an error for dummy keys, which is expected
-        result = await tool.execute({"prompt": normal_prompt, "model": "gemini-2.5-flash"})
+        try:
+            result = await tool.execute(
+                {"prompt": normal_prompt, "model": "gemini-2.5-flash", "working_directory": temp_dir}
+            )
+        finally:
+            shutil.rmtree(temp_dir, ignore_errors=True)
 
         assert len(result) == 1
         output = json.loads(result[0].text)
@@ -105,7 +117,14 @@ class TestLargePromptHandling:
         try:
             # This test runs in the test environment which uses dummy keys
             # The chat tool will return an error for dummy keys, which is expected
-            result = await tool.execute({"prompt": "", "files": [temp_prompt_file], "model": "gemini-2.5-flash"})
+            result = await tool.execute(
+                {
+                    "prompt": "",
+                    "files": [temp_prompt_file],
+                    "model": "gemini-2.5-flash",
+                    "working_directory": temp_dir,
+                }
+            )
 
             assert len(result) == 1
             output = json.loads(result[0].text)
@@ -261,7 +280,13 @@ class TestLargePromptHandling:
             mock_prepare_files.return_value = ("File content", [other_file])
 
             # Use a small prompt to avoid triggering size limit
-            await tool.execute({"prompt": "Test prompt", "files": [temp_prompt_file, other_file]})
+            await tool.execute(
+                {
+                    "prompt": "Test prompt",
+                    "files": [temp_prompt_file, other_file],
+                    "working_directory": os.path.dirname(temp_prompt_file),
+                }
+            )
 
             # Verify handle_prompt_file was called with the original files list
             mock_handle_prompt.assert_called_once_with([temp_prompt_file, other_file])
@@ -295,7 +320,11 @@ class TestLargePromptHandling:
             mock_get_provider.return_value = mock_provider
 
             # With the fix, this should now pass because we check at MCP transport boundary before adding internal content
-            
result = await tool.execute({"prompt": exact_prompt}) + temp_dir = tempfile.mkdtemp() + try: + result = await tool.execute({"prompt": exact_prompt, "working_directory": temp_dir}) + finally: + shutil.rmtree(temp_dir, ignore_errors=True) output = json.loads(result[0].text) assert output["status"] != "resend_prompt" @@ -305,7 +334,11 @@ class TestLargePromptHandling: tool = ChatTool() over_prompt = "x" * (MCP_PROMPT_SIZE_LIMIT + 1) - result = await tool.execute({"prompt": over_prompt}) + temp_dir = tempfile.mkdtemp() + try: + result = await tool.execute({"prompt": over_prompt, "working_directory": temp_dir}) + finally: + shutil.rmtree(temp_dir, ignore_errors=True) output = json.loads(result[0].text) assert output["status"] == "resend_prompt" @@ -326,7 +359,11 @@ class TestLargePromptHandling: ) mock_get_provider.return_value = mock_provider - result = await tool.execute({"prompt": ""}) + temp_dir = tempfile.mkdtemp() + try: + result = await tool.execute({"prompt": "", "working_directory": temp_dir}) + finally: + shutil.rmtree(temp_dir, ignore_errors=True) output = json.loads(result[0].text) assert output["status"] != "resend_prompt" @@ -362,7 +399,11 @@ class TestLargePromptHandling: mock_model_context_class.return_value = mock_model_context # Should continue with empty prompt when file can't be read - result = await tool.execute({"prompt": "", "files": [bad_file]}) + temp_dir = tempfile.mkdtemp() + try: + result = await tool.execute({"prompt": "", "files": [bad_file], "working_directory": temp_dir}) + finally: + shutil.rmtree(temp_dir, ignore_errors=True) output = json.loads(result[0].text) assert output["status"] != "resend_prompt" @@ -408,6 +449,7 @@ class TestLargePromptHandling: "prompt": "Summarize the design decisions", "files": [str(large_file)], "model": "flash", + "working_directory": str(tmp_path), "_model_context": dummy_context, } ) @@ -424,6 +466,7 @@ class TestLargePromptHandling: This test verifies that even if our internal prompt (with system prompts, history, etc.) exceeds MCP_PROMPT_SIZE_LIMIT, it should still work as long as the user's input is small. 
""" + tool = ChatTool() # Small user input that should pass MCP boundary check @@ -432,62 +475,57 @@ class TestLargePromptHandling: # Mock a huge conversation history that would exceed MCP limits if incorrectly checked huge_history = "x" * (MCP_PROMPT_SIZE_LIMIT * 2) # 100K chars = way over 50K limit - with ( - patch.object(tool, "get_model_provider") as mock_get_provider, - patch("utils.model_context.ModelContext") as mock_model_context_class, - ): - from tests.mock_helpers import create_mock_provider + temp_dir = tempfile.mkdtemp() + original_prepare_prompt = tool.prepare_prompt - mock_provider = create_mock_provider(model_name="flash") - mock_get_provider.return_value = mock_provider + try: + with ( + patch.object(tool, "get_model_provider") as mock_get_provider, + patch("utils.model_context.ModelContext") as mock_model_context_class, + ): + from tests.mock_helpers import create_mock_provider + from utils.model_context import TokenAllocation - # Mock ModelContext to avoid the comparison issue - from utils.model_context import TokenAllocation + mock_provider = create_mock_provider(model_name="flash") + mock_get_provider.return_value = mock_provider - mock_model_context = MagicMock() - mock_model_context.model_name = "flash" - mock_model_context.provider = mock_provider - mock_model_context.calculate_token_allocation.return_value = TokenAllocation( - total_tokens=1_048_576, - content_tokens=838_861, - response_tokens=209_715, - file_tokens=335_544, - history_tokens=335_544, - ) - mock_model_context_class.return_value = mock_model_context + mock_model_context = MagicMock() + mock_model_context.model_name = "flash" + mock_model_context.provider = mock_provider + mock_model_context.calculate_token_allocation.return_value = TokenAllocation( + total_tokens=1_048_576, + content_tokens=838_861, + response_tokens=209_715, + file_tokens=335_544, + history_tokens=335_544, + ) + mock_model_context_class.return_value = mock_model_context - # Mock the prepare_prompt to simulate huge internal context - original_prepare_prompt = tool.prepare_prompt + async def mock_prepare_prompt(request): + normal_prompt = await original_prepare_prompt(request) + huge_internal_prompt = f"{normal_prompt}\n\n=== HUGE INTERNAL CONTEXT ===\n{huge_history}" + assert len(huge_internal_prompt) > MCP_PROMPT_SIZE_LIMIT + return huge_internal_prompt - async def mock_prepare_prompt(request): - # Call original to get normal processing - normal_prompt = await original_prepare_prompt(request) - # Add huge internal context (simulating large history, system prompts, files) - huge_internal_prompt = f"{normal_prompt}\n\n=== HUGE INTERNAL CONTEXT ===\n{huge_history}" + tool.prepare_prompt = mock_prepare_prompt - # Verify the huge internal prompt would exceed MCP limits if incorrectly checked - assert len(huge_internal_prompt) > MCP_PROMPT_SIZE_LIMIT + result = await tool.execute( + {"prompt": small_user_prompt, "model": "flash", "working_directory": temp_dir} + ) + output = json.loads(result[0].text) - return huge_internal_prompt + assert output["status"] != "resend_prompt" - tool.prepare_prompt = mock_prepare_prompt + mock_provider.generate_content.assert_called_once() + call_kwargs = mock_provider.generate_content.call_args[1] + actual_prompt = call_kwargs.get("prompt") - # This should succeed because we only check user input at MCP boundary - result = await tool.execute({"prompt": small_user_prompt, "model": "flash"}) - output = json.loads(result[0].text) - - # Should succeed even though internal context is huge - assert 
output["status"] != "resend_prompt" - - # Verify the model was actually called with the huge prompt - mock_provider.generate_content.assert_called_once() - call_kwargs = mock_provider.generate_content.call_args[1] - actual_prompt = call_kwargs.get("prompt") - - # Verify internal prompt was huge (proving we don't limit internal processing) - assert len(actual_prompt) > MCP_PROMPT_SIZE_LIMIT - assert huge_history in actual_prompt - assert small_user_prompt in actual_prompt + assert len(actual_prompt) > MCP_PROMPT_SIZE_LIMIT + assert huge_history in actual_prompt + assert small_user_prompt in actual_prompt + finally: + tool.prepare_prompt = original_prepare_prompt + shutil.rmtree(temp_dir, ignore_errors=True) @pytest.mark.asyncio async def test_mcp_boundary_vs_internal_processing_distinction(self): @@ -500,27 +538,37 @@ class TestLargePromptHandling: # Test case 1: Large user input should fail at MCP boundary large_user_input = "x" * (MCP_PROMPT_SIZE_LIMIT + 1000) - result = await tool.execute({"prompt": large_user_input, "model": "flash"}) - output = json.loads(result[0].text) - assert output["status"] == "resend_prompt" # Should fail - assert "too large for MCP's token limits" in output["content"] + temp_dir = tempfile.mkdtemp() + try: + result = await tool.execute({"prompt": large_user_input, "model": "flash", "working_directory": temp_dir}) + output = json.loads(result[0].text) + assert output["status"] == "resend_prompt" # Should fail + assert "too large for MCP's token limits" in output["content"] - # Test case 2: Small user input should succeed even with huge internal processing - small_user_input = "Hello" + # Test case 2: Small user input should succeed even with huge internal processing + small_user_input = "Hello" - # This test runs in the test environment which uses dummy keys - # The chat tool will return an error for dummy keys, which is expected - result = await tool.execute({"prompt": small_user_input, "model": "gemini-2.5-flash"}) - output = json.loads(result[0].text) + # This test runs in the test environment which uses dummy keys + # The chat tool will return an error for dummy keys, which is expected + result = await tool.execute( + { + "prompt": small_user_input, + "model": "gemini-2.5-flash", + "working_directory": temp_dir, + } + ) + output = json.loads(result[0].text) - # The test will fail with dummy API keys, which is expected behavior - # We're mainly testing that the tool processes small prompts correctly without size errors - if output["status"] == "error": - # If it's an API error, that's fine - we're testing prompt handling, not API calls - assert "API" in output["content"] or "key" in output["content"] or "authentication" in output["content"] - else: - # If somehow it succeeds (e.g., with mocked provider), check the response - assert output["status"] != "resend_prompt" + # The test will fail with dummy API keys, which is expected behavior + # We're mainly testing that the tool processes small prompts correctly without size errors + if output["status"] == "error": + # If it's an API error, that's fine - we're testing prompt handling, not API calls + assert "API" in output["content"] or "key" in output["content"] or "authentication" in output["content"] + else: + # If somehow it succeeds (e.g., with mocked provider), check the response + assert output["status"] != "resend_prompt" + finally: + shutil.rmtree(temp_dir, ignore_errors=True) @pytest.mark.asyncio async def test_continuation_with_huge_conversation_history(self): @@ -548,6 +596,8 @@ class 
TestLargePromptHandling: # Ensure the history exceeds MCP limits assert len(huge_conversation_history) > MCP_PROMPT_SIZE_LIMIT + temp_dir = tempfile.mkdtemp() + with ( patch.object(tool, "get_model_provider") as mock_get_provider, patch("utils.model_context.ModelContext") as mock_model_context_class, @@ -579,6 +629,7 @@ class TestLargePromptHandling: "prompt": f"{huge_conversation_history}\n\n=== CURRENT REQUEST ===\n{small_continuation_prompt}", "model": "flash", "continuation_id": "test_thread_123", + "working_directory": temp_dir, } # Mock the conversation history embedding to simulate server.py behavior @@ -628,6 +679,7 @@ class TestLargePromptHandling: finally: # Restore original execute method tool.__class__.execute = original_execute + shutil.rmtree(temp_dir, ignore_errors=True) if __name__ == "__main__": diff --git a/tests/test_listmodels.py b/tests/test_listmodels.py index 89f5f42..e4d558b 100644 --- a/tests/test_listmodels.py +++ b/tests/test_listmodels.py @@ -68,6 +68,7 @@ class TestListModelsTool: assert "`flash` → `gemini-2.5-flash`" in content assert "`pro` → `gemini-2.5-pro`" in content assert "1M context" in content + assert "Supports structured code generation" in content # Check summary assert "**Configured Providers**: 1" in content diff --git a/tests/test_o3_pro_output_text_fix.py b/tests/test_o3_pro_output_text_fix.py index 1461d83..eeae5f1 100644 --- a/tests/test_o3_pro_output_text_fix.py +++ b/tests/test_o3_pro_output_text_fix.py @@ -12,6 +12,7 @@ RECORDING: To record new responses, delete the cassette file and run with real A import logging import os +import tempfile from pathlib import Path from unittest.mock import patch @@ -92,9 +93,15 @@ class TestO3ProOutputTextFix: async def _execute_chat_tool_test(self): """Execute the ChatTool with o3-pro and return the result.""" chat_tool = ChatTool() - arguments = {"prompt": "What is 2 + 2?", "model": "o3-pro", "temperature": 1.0} + with tempfile.TemporaryDirectory() as workdir: + arguments = { + "prompt": "What is 2 + 2?", + "model": "o3-pro", + "temperature": 1.0, + "working_directory": workdir, + } - return await chat_tool.execute(arguments) + return await chat_tool.execute(arguments) def _verify_chat_tool_response(self, result): """Verify the ChatTool response contains expected data.""" diff --git a/tests/test_per_tool_model_defaults.py b/tests/test_per_tool_model_defaults.py index 6978200..4f8c623 100644 --- a/tests/test_per_tool_model_defaults.py +++ b/tests/test_per_tool_model_defaults.py @@ -4,6 +4,8 @@ Test per-tool model default selection functionality import json import os +import shutil +import tempfile from unittest.mock import MagicMock, patch import pytest @@ -290,7 +292,13 @@ class TestAutoModeErrorMessages: mock_get_provider_for.return_value = None tool = ChatTool() - result = await tool.execute({"prompt": "test", "model": "auto"}) + temp_dir = tempfile.mkdtemp() + try: + result = await tool.execute( + {"prompt": "test", "model": "auto", "working_directory": temp_dir} + ) + finally: + shutil.rmtree(temp_dir, ignore_errors=True) assert len(result) == 1 # The SimpleTool will wrap the error message @@ -418,7 +426,13 @@ class TestRuntimeModelSelection: mock_get_provider.return_value = None tool = ChatTool() - result = await tool.execute({"prompt": "test", "model": "gpt-5-turbo"}) + temp_dir = tempfile.mkdtemp() + try: + result = await tool.execute( + {"prompt": "test", "model": "gpt-5-turbo", "working_directory": temp_dir} + ) + finally: + shutil.rmtree(temp_dir, ignore_errors=True) # Should require model 
selection assert len(result) == 1 @@ -515,7 +529,11 @@ class TestUnavailableModelFallback: mock_get_model_provider.return_value = mock_provider tool = ChatTool() - result = await tool.execute({"prompt": "test"}) # No model specified + temp_dir = tempfile.mkdtemp() + try: + result = await tool.execute({"prompt": "test", "working_directory": temp_dir}) + finally: + shutil.rmtree(temp_dir, ignore_errors=True) # Should work normally, not require model parameter assert len(result) == 1 diff --git a/tests/test_tools.py b/tests/test_tools.py index b2e6cdc..dbcf0c9 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -3,6 +3,8 @@ Tests for individual tool implementations """ import json +import shutil +import tempfile import pytest @@ -343,12 +345,17 @@ class TestAbsolutePathValidation: async def test_chat_tool_relative_path_rejected(self): """Test that chat tool rejects relative paths""" tool = ChatTool() - result = await tool.execute( - { - "prompt": "Explain this code", - "files": ["code.py"], # relative path without ./ - } - ) + temp_dir = tempfile.mkdtemp() + try: + result = await tool.execute( + { + "prompt": "Explain this code", + "files": ["code.py"], # relative path without ./ + "working_directory": temp_dir, + } + ) + finally: + shutil.rmtree(temp_dir, ignore_errors=True) assert len(result) == 1 response = json.loads(result[0].text) diff --git a/tools/chat.py b/tools/chat.py index 87a8603..ad94f2a 100644 --- a/tools/chat.py +++ b/tools/chat.py @@ -6,15 +6,20 @@ brainstorming, problem-solving, and collaborative thinking. It supports file con images, and conversation continuation for seamless multi-turn interactions. """ +import logging +import os +import re +from pathlib import Path from typing import TYPE_CHECKING, Any, Optional from pydantic import Field if TYPE_CHECKING: + from providers.shared import ModelCapabilities from tools.models import ToolModelCategory from config import TEMPERATURE_BALANCED -from systemprompts import CHAT_PROMPT +from systemprompts import CHAT_PROMPT, GENERATE_CODE_PROMPT from tools.shared.base_models import COMMON_FIELD_DESCRIPTIONS, ToolRequest from .simple.base import SimpleTool @@ -27,6 +32,9 @@ CHAT_FIELD_DESCRIPTIONS = { ), "files": "absolute file or folder paths for code context (do NOT shorten).", "images": "Optional absolute image paths or base64 for visual context when helpful.", + "working_directory": ( + "Absolute full directory path where the assistant AI can save generated code for implementation. The directory must already exist" + ), } @@ -36,6 +44,7 @@ class ChatRequest(ToolRequest): prompt: str = Field(..., description=CHAT_FIELD_DESCRIPTIONS["prompt"]) files: Optional[list[str]] = Field(default_factory=list, description=CHAT_FIELD_DESCRIPTIONS["files"]) images: Optional[list[str]] = Field(default_factory=list, description=CHAT_FIELD_DESCRIPTIONS["images"]) + working_directory: str = Field(..., description=CHAT_FIELD_DESCRIPTIONS["working_directory"]) class ChatTool(SimpleTool): @@ -49,6 +58,10 @@ class ChatTool(SimpleTool): Chat tool with 100% behavioral compatibility. """ + def __init__(self) -> None: + super().__init__() + self._last_recordable_response: Optional[str] = None + def get_name(self) -> str: return "chat" @@ -58,9 +71,20 @@ class ChatTool(SimpleTool): "getting second opinions, and exploring ideas. Use for ideas, validations, questions, and thoughtful explanations." 
) + def get_annotations(self) -> Optional[dict[str, Any]]: + """Chat writes generated artifacts when code-generation is enabled.""" + + return {"readOnlyHint": False} + def get_system_prompt(self) -> str: return CHAT_PROMPT + def get_capability_system_prompts(self, capabilities: Optional["ModelCapabilities"]) -> list[str]: + prompts = list(super().get_capability_system_prompts(capabilities)) + if capabilities and capabilities.allow_code_generation: + prompts.append(GENERATE_CODE_PROMPT) + return prompts + def get_default_temperature(self) -> float: return TEMPERATURE_BALANCED @@ -85,7 +109,7 @@ class ChatTool(SimpleTool): the same schema generation approach while still benefiting from SimpleTool convenience methods. """ - required_fields = ["prompt"] + required_fields = ["prompt", "working_directory"] if self.is_effective_auto_mode(): required_fields.append("model") @@ -106,6 +130,10 @@ class ChatTool(SimpleTool): "items": {"type": "string"}, "description": CHAT_FIELD_DESCRIPTIONS["images"], }, + "working_directory": { + "type": "string", + "description": CHAT_FIELD_DESCRIPTIONS["working_directory"], + }, "model": self.get_model_field_schema(), "temperature": { "type": "number", @@ -159,7 +187,7 @@ class ChatTool(SimpleTool): def get_required_fields(self) -> list[str]: """Required fields for ChatSimple tool""" - return ["prompt"] + return ["prompt", "working_directory"] # === Hook Method Implementations === @@ -173,17 +201,165 @@ class ChatTool(SimpleTool): # Use SimpleTool's Chat-style prompt preparation return self.prepare_chat_style_prompt(request) + def _validate_file_paths(self, request) -> Optional[str]: + """Extend validation to cover the working directory path.""" + + error = super()._validate_file_paths(request) + if error: + return error + + working_directory = getattr(request, "working_directory", None) + if working_directory: + expanded = os.path.expanduser(working_directory) + if not os.path.isabs(expanded): + return ( + "Error: 'working_directory' must be an absolute path (you may use '~' which will be expanded). " + f"Received: {working_directory}" + ) + return None + def format_response(self, response: str, request: ChatRequest, model_info: Optional[dict] = None) -> str: """ Format the chat response to match the original Chat tool exactly. """ - return ( - f"{response}\n\n---\n\nAGENT'S TURN: Evaluate this perspective alongside your analysis to " + self._last_recordable_response = None + body = response + recordable_override: Optional[str] = None + + if self._model_supports_code_generation(): + block, remainder = self._extract_generated_code_block(response) + if block: + sanitized_text = remainder.strip() + try: + artifact_path = self._persist_generated_code_block(block, request.working_directory) + except Exception as exc: # pragma: no cover - rare filesystem failures + logger.error("Failed to persist generated code block: %s", exc, exc_info=True) + warning = ( + f"WARNING: Unable to write zen_generated.code inside '{request.working_directory}'. " + "Check the path permissions and re-run. The generated code block is included below for manual handling." + ) + + history_copy = self._join_sections(sanitized_text, warning) if sanitized_text else warning + recordable_override = history_copy + + sanitized_warning = history_copy.strip() + body = f"{sanitized_warning}\n\n{block.strip()}".strip() + else: + if not sanitized_text: + sanitized_text = "Generated code saved to zen_generated.code. Follow the structured instructions in that file exactly before continuing." 
+
+                    instruction = self._build_agent_instruction(artifact_path)
+                    body = self._join_sections(sanitized_text, instruction)
+
+        final_output = (
+            f"{body}\n\n---\n\nAGENT'S TURN: Evaluate this perspective alongside your analysis to "
             "form a comprehensive solution and continue with the user's request and task at hand."
         )
 
+        if recordable_override is not None:
+            self._last_recordable_response = (
+                f"{recordable_override}\n\n---\n\nAGENT'S TURN: Evaluate this perspective alongside your analysis to "
+                "form a comprehensive solution and continue with the user's request and task at hand."
+            )
+        else:
+            self._last_recordable_response = final_output
+
+        return final_output
+
+    def _record_assistant_turn(
+        self, continuation_id: str, response_text: str, request, model_info: Optional[dict]
+    ) -> None:
+        recordable = self._last_recordable_response if self._last_recordable_response is not None else response_text
+        try:
+            super()._record_assistant_turn(continuation_id, recordable, request, model_info)
+        finally:
+            self._last_recordable_response = None
+
+    def _model_supports_code_generation(self) -> bool:
+        context = getattr(self, "_model_context", None)
+        if not context:
+            return False
+
+        try:
+            capabilities = context.capabilities
+        except Exception:  # pragma: no cover - defensive fallback
+            return False
+
+        return bool(capabilities.allow_code_generation)
+
+    def _extract_generated_code_block(self, text: str) -> tuple[Optional[str], str]:
+        match = re.search(r"<GENERATED-CODE>.*?</GENERATED-CODE>", text, flags=re.DOTALL | re.IGNORECASE)
+        if not match:
+            return None, text
+
+        block = match.group(0)
+        before = text[: match.start()].rstrip()
+        after = text[match.end() :].lstrip()
+
+        if before and after:
+            remainder = f"{before}\n\n{after}"
+        else:
+            remainder = before or after
+
+        return block, remainder or ""
+
+    def _persist_generated_code_block(self, block: str, working_directory: str) -> Path:
+        expanded = os.path.expanduser(working_directory)
+        target_dir = Path(expanded).resolve()
+        target_dir.mkdir(parents=True, exist_ok=True)
+
+        target_file = target_dir / "zen_generated.code"
+        if target_file.exists():
+            try:
+                target_file.unlink()
+            except OSError as exc:
+                logger.warning("Unable to remove existing zen_generated.code: %s", exc)
+
+        content = block if block.endswith("\n") else f"{block}\n"
+        target_file.write_text(content, encoding="utf-8")
+        logger.info("Generated code artifact written to %s", target_file)
+        return target_file
+
+    @staticmethod
+    def _build_agent_instruction(artifact_path: Path) -> str:
+        return (
+            f"CONTINUING FROM PREVIOUS DISCUSSION: The coding assistant has analyzed our conversation context and generated "
+            f"a structured implementation plan at `{artifact_path}`. This is a direct continuation of our discussion—all previous "
+            "context, requirements, and shared code remain relevant.\n"
+            "\n"
+            f"MANDATORY NEXT STEP: Open `{artifact_path}` immediately and review the implementation plan:\n"
+            "1. Read the step-by-step instructions—they reference our previous discussion. You may need to read the file in parts if it's too long.\n"
+            "2. Review each <NEWFILE> or <UPDATED_FILE> section in the context of what we've discussed\n"
+            "3. Verify the proposed changes align with the requirements and code we've already shared\n"
+            "4. Check for syntax errors, missing imports, or incomplete implementations\n"
+            "\n"
+            "Then systematically apply the changes:\n"
+            "- Create new files or update existing ones as instructed, maintaining code style consistency\n"
+            "- If updating existing code we discussed earlier, carefully preserve unmodified sections\n"
+            "- Run syntax validation after each modification\n"
+            "- Execute relevant tests to confirm functionality\n"
+            "- Verify the implementation works end-to-end with existing code\n"
+            "\n"
+            "Remember: This builds upon our conversation. The generated code reflects the full context of what we've discussed, "
+            "including any files, requirements, or constraints mentioned earlier. Proceed with implementation immediately. "
+            "Only after you finish applying ALL the changes completely: delete `zen_generated.code` so stale instructions do not linger."
+        )
+
+    @staticmethod
+    def _join_sections(*sections: str) -> str:
+        chunks: list[str] = []
+        for section in sections:
+            if section:
+                trimmed = section.strip()
+                if trimmed:
+                    chunks.append(trimmed)
+        return "\n\n".join(chunks)
+
     def get_websearch_guidance(self) -> str:
         """
         Return Chat tool-style web search guidance.
         """
         return self.get_chat_style_websearch_guidance()
+
+
+logger = logging.getLogger(__name__)
diff --git a/tools/listmodels.py b/tools/listmodels.py
index d60f651..120afc1 100644
--- a/tools/listmodels.py
+++ b/tools/listmodels.py
@@ -140,6 +140,8 @@ class ListModelsTool(BaseTool):
             except AttributeError:
                 description = "No description available"
             lines = [header, f" - {context_str}", f" - {description}"]
+            if capabilities.allow_code_generation:
+                lines.append(" - Supports structured code generation")
             return lines
 
         # Check each native provider type
@@ -187,6 +189,8 @@ class ListModelsTool(BaseTool):
 
                     output_lines.append(f"- `{model_name}` - {context_str}")
                     output_lines.append(f" - {description}")
+                    if capabilities.allow_code_generation:
+                        output_lines.append(" - Supports structured code generation")
 
                     for alias in capabilities.aliases or []:
                         if alias != model_name:
diff --git a/tools/shared/base_tool.py b/tools/shared/base_tool.py
index e5353b9..91ac01e 100644
--- a/tools/shared/base_tool.py
+++ b/tools/shared/base_tool.py
@@ -17,6 +17,7 @@ from typing import TYPE_CHECKING, Any, Optional
 from mcp.types import TextContent
 
 if TYPE_CHECKING:
+    from providers.shared import ModelCapabilities
     from tools.models import ToolModelCategory
 
 from config import MCP_PROMPT_SIZE_LIMIT
@@ -165,6 +166,42 @@ class BaseTool(ABC):
         """
         pass
 
+    def get_capability_system_prompts(self, capabilities: Optional["ModelCapabilities"]) -> list[str]:
+        """Return additional system prompt snippets gated on model capabilities.
+
+        Subclasses can override this hook to append capability-specific
+        instructions (for example, enabling code-generation exports when a
+        model advertises support). The default implementation returns an empty
+        list so no extra instructions are appended.
+
+        Args:
+            capabilities: The resolved capabilities for the active model.
+
+        Returns:
+            List of prompt fragments to append after the base system prompt.
+ """ + + return [] + + def _augment_system_prompt_with_capabilities( + self, base_prompt: str, capabilities: Optional["ModelCapabilities"] + ) -> str: + """Merge capability-driven prompt addenda with the base system prompt.""" + + additions: list[str] = [] + if capabilities is not None: + additions = [fragment.strip() for fragment in self.get_capability_system_prompts(capabilities) if fragment] + + if not additions: + return base_prompt + + addition_text = "\n\n".join(additions) + if not base_prompt: + return addition_text + + suffix = "" if base_prompt.endswith("\n\n") else "\n\n" + return f"{base_prompt}{suffix}{addition_text}" + def get_annotations(self) -> Optional[dict[str, Any]]: """ Return optional annotations for this tool. @@ -413,13 +450,16 @@ class BaseTool(ABC): for rank, canonical_name, capabilities in filtered[:limit]: details: list[str] = [] - context_str = self._format_context_window(getattr(capabilities, "context_window", 0)) + context_str = self._format_context_window(capabilities.context_window) if context_str: details.append(context_str) - if getattr(capabilities, "supports_extended_thinking", False): + if capabilities.supports_extended_thinking: details.append("thinking") + if capabilities.allow_code_generation: + details.append("code-gen") + base = f"{canonical_name} (score {rank}" if details: base = f"{base}, {', '.join(details)}" diff --git a/tools/simple/base.py b/tools/simple/base.py index 2b37f2c..4a2a1a3 100644 --- a/tools/simple/base.py +++ b/tools/simple/base.py @@ -404,11 +404,15 @@ class SimpleTool(BaseTool): # Get the provider from model context (clean OOP - no re-fetching) provider = self._model_context.provider + capabilities = self._model_context.capabilities # Get system prompt for this tool base_system_prompt = self.get_system_prompt() + capability_augmented_prompt = self._augment_system_prompt_with_capabilities( + base_system_prompt, capabilities + ) language_instruction = self.get_language_instruction() - system_prompt = language_instruction + base_system_prompt + system_prompt = language_instruction + capability_augmented_prompt # Generate AI response using the provider logger.info(f"Sending request to {provider.get_provider_type().value} API for {self.get_name()}") @@ -423,7 +427,6 @@ class SimpleTool(BaseTool): logger.debug(f"Prompt length: {len(prompt)} characters (~{estimated_tokens:,} tokens)") # Resolve model capabilities for feature gating - capabilities = self._model_context.capabilities supports_thinking = capabilities.supports_extended_thinking # Generate content with provider abstraction diff --git a/tools/workflow/workflow_mixin.py b/tools/workflow/workflow_mixin.py index 6107b77..21c5bb2 100644 --- a/tools/workflow/workflow_mixin.py +++ b/tools/workflow/workflow_mixin.py @@ -1480,8 +1480,11 @@ class BaseWorkflowMixin(ABC): # Get system prompt for this tool with localization support base_system_prompt = self.get_system_prompt() + capability_augmented_prompt = self._augment_system_prompt_with_capabilities( + base_system_prompt, getattr(self._model_context, "capabilities", None) + ) language_instruction = self.get_language_instruction() - system_prompt = language_instruction + base_system_prompt + system_prompt = language_instruction + capability_augmented_prompt # Check if tool wants system prompt embedded in main prompt if self.should_embed_system_prompt():