From 95e69a7cb234305dcd37dcdd2f22be715922e9a8 Mon Sep 17 00:00:00 2001
From: Fahad <fahad@2doapp.com>
Date: Fri, 17 Oct 2025 23:42:32 +0400
Subject: [PATCH] fix: improved error reporting; codex cli would at times fail
 to figure out how to handle plain-text / JSON errors fix: working directory
 should exist, raise error and not try and create one docs: improved API
 Lookup instructions * test added to confirm failures * chat schema more
 explicit about file paths

---
 server.py                                     |   5 +-
 simulator_tests/conversation_base_test.py     |  12 +-
 .../test_debug_certain_confidence.py          |   9 +-
 tests/test_auto_mode.py                       |  15 +-
 tests/test_auto_mode_comprehensive.py         |  61 ++----
 tests/test_auto_mode_model_listing.py         |  41 ++--
 tests/test_challenge.py                       |   7 +-
 tests/test_chat_simple.py                     |  57 ++++-
 tests/test_image_support_integration.py       |  43 ++--
 tests/test_large_prompt_handling.py           | 200 ++++++++++--------
 tests/test_mcp_error_handling.py              |  64 ++++++
 tests/test_per_tool_model_defaults.py         |  22 +-
 tests/test_planner.py                         |  11 +-
 tests/test_thinking_modes.py                  | 100 ++++++---
 tests/test_tools.py                           |  43 ++--
 tests/test_workflow_metadata.py               |   9 +-
 ..._workflow_prompt_size_validation_simple.py |  13 +-
 tools/apilookup.py                            |   5 +-
 tools/challenge.py                            |   5 +-
 tools/chat.py                                 |  88 ++++----
 tools/clink.py                                |  24 +--
 tools/shared/exceptions.py                    |  20 ++
 tools/simple/base.py                          |  29 ++-
 tools/workflow/workflow_mixin.py              |  23 +-
 24 files changed, 569 insertions(+), 337 deletions(-)
 create mode 100644 tests/test_mcp_error_handling.py
 create mode 100644 tools/shared/exceptions.py

diff --git a/server.py b/server.py
index 5e4e517..11a468b 100644
--- a/server.py
+++ b/server.py
@@ -68,6 +68,7 @@ from tools import (  # noqa: E402
     VersionTool,
 )
 from tools.models import ToolOutput  # noqa: E402
+from tools.shared.exceptions import ToolExecutionError  # noqa: E402
 from utils.env import env_override_enabled, get_env  # noqa: E402
 
 # Configure logging for server operations
@@ -837,7 +838,7 @@ async def handle_call_tool(name: str, arguments: dict[str, Any]) -> list[TextCon
                 content_type="text",
                 metadata={"tool_name": name, "requested_model": model_name},
             )
-            return [TextContent(type="text", text=error_output.model_dump_json())]
+            raise ToolExecutionError(error_output.model_dump_json())
 
         # Create model context with resolved model and option
         model_context = ModelContext(model_name, model_option)
@@ -856,7 +857,7 @@ async def handle_call_tool(name: str, arguments: dict[str, Any]) -> list[TextCon
             file_size_check = check_total_file_size(arguments["files"], model_name)
             if file_size_check:
                 logger.warning(f"File size check failed for {name} with model {model_name}")
-                return [TextContent(type="text", text=ToolOutput(**file_size_check).model_dump_json())]
+                raise ToolExecutionError(ToolOutput(**file_size_check).model_dump_json())
 
         # Execute tool with pre-resolved model context
         result = await tool.execute(arguments)
diff --git a/simulator_tests/conversation_base_test.py b/simulator_tests/conversation_base_test.py
index 54a13cc..f6d9388 100644
--- a/simulator_tests/conversation_base_test.py
+++ b/simulator_tests/conversation_base_test.py
@@ -38,6 +38,8 @@ import asyncio
 import json
 from typing import Optional
 
+from tools.shared.exceptions import ToolExecutionError
+
 from .base_test import BaseSimulatorTest
 
 
@@ -158,7 +160,15 @@ class ConversationBaseTest(BaseSimulatorTest):
             params["_resolved_model_name"] = model_name
 
             # Execute tool asynchronously
-            result = loop.run_until_complete(tool.execute(params))
+            try:
+                result = loop.run_until_complete(tool.execute(params))
+            except ToolExecutionError as exc:
+                response_text = exc.payload
+                continuation_id = self._extract_continuation_id_from_response(response_text)
+                self.logger.debug(f"Tool '{tool_name}' returned error payload in-process")
+                if self.verbose and response_text:
+                    self.logger.debug(f"Error response preview: {response_text[:500]}...")
+                return response_text, continuation_id
 
             if not result or len(result) == 0:
                 return None, None
diff --git a/simulator_tests/test_debug_certain_confidence.py b/simulator_tests/test_debug_certain_confidence.py
index c864189..f8a41b2 100644
--- a/simulator_tests/test_debug_certain_confidence.py
+++ b/simulator_tests/test_debug_certain_confidence.py
@@ -12,6 +12,8 @@ Tests the debug tool's 'certain' confidence feature in a realistic simulation:
 import json
 from typing import Optional
 
+from tools.shared.exceptions import ToolExecutionError
+
 from .conversation_base_test import ConversationBaseTest
 
 
@@ -482,7 +484,12 @@ This happens every time a user tries to log in. The error occurs in the password
             loop = self._get_event_loop()
 
             # Call the tool's execute method
-            result = loop.run_until_complete(tool.execute(params))
+            try:
+                result = loop.run_until_complete(tool.execute(params))
+            except ToolExecutionError as exc:
+                response_text = exc.payload
+                continuation_id = self._extract_debug_continuation_id(response_text)
+                return response_text, continuation_id
 
             if not result or len(result) == 0:
                 self.logger.error(f"Tool '{tool_name}' returned empty result")
diff --git a/tests/test_auto_mode.py b/tests/test_auto_mode.py
index 4434a4e..98341de 100644
--- a/tests/test_auto_mode.py
+++ b/tests/test_auto_mode.py
@@ -7,6 +7,7 @@ from unittest.mock import patch
 import pytest
 
 from tools.chat import ChatTool
+from tools.shared.exceptions import ToolExecutionError
 
 
 class TestAutoMode:
@@ -153,14 +154,14 @@ class TestAutoMode:
 
             # Mock the provider to avoid real API calls
             with patch.object(tool, "get_model_provider"):
-                # Execute without model parameter
-                result = await tool.execute({"prompt": "Test prompt", "working_directory": str(tmp_path)})
+                # Execute without model parameter and expect protocol error
+                with pytest.raises(ToolExecutionError) as exc_info:
+                    await tool.execute({"prompt": "Test prompt", "working_directory": str(tmp_path)})
 
-            # Should get error
-            assert len(result) == 1
-            response = result[0].text
-            assert "error" in response
-            assert "Model parameter is required" in response or "Model 'auto' is not available" in response
+            # Should get error payload mentioning model requirement
+            error_payload = getattr(exc_info.value, "payload", str(exc_info.value))
+            assert "Model" in error_payload
+            assert "auto" in error_payload
 
         finally:
             # Restore
diff --git a/tests/test_auto_mode_comprehensive.py b/tests/test_auto_mode_comprehensive.py
index 376fbf8..cb326e4 100644
--- a/tests/test_auto_mode_comprehensive.py
+++ b/tests/test_auto_mode_comprehensive.py
@@ -15,6 +15,7 @@ from tools.analyze import AnalyzeTool
 from tools.chat import ChatTool
 from tools.debug import DebugIssueTool
 from tools.models import ToolModelCategory
+from tools.shared.exceptions import ToolExecutionError
 from tools.thinkdeep import ThinkDeepTool
 
 
@@ -227,30 +228,15 @@ class TestAutoModeComprehensive:
             # Register only Gemini provider
             ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
 
-            # Mock provider to capture what model is requested
-            mock_provider = MagicMock()
-            mock_provider.generate_content.return_value = MagicMock(
-                content="test response", model_name="test-model", usage={"input_tokens": 10, "output_tokens": 5}
-            )
+            # Test ChatTool (FAST_RESPONSE) - auto mode should suggest flash variant
+            chat_tool = ChatTool()
+            chat_message = chat_tool._build_auto_mode_required_message()
+            assert "flash" in chat_message
 
-            with patch.object(ModelProviderRegistry, "get_provider_for_model", return_value=mock_provider):
-                workdir = tmp_path / "chat_artifacts"
-                workdir.mkdir(parents=True, exist_ok=True)
-                # Test ChatTool (FAST_RESPONSE) - should prefer flash
-                chat_tool = ChatTool()
-                await chat_tool.execute(
-                    {"prompt": "test", "model": "auto", "working_directory": str(workdir)}
-                )  # This should trigger auto selection
-
-                # In auto mode, the tool should get an error requiring model selection
-                # but the suggested model should be flash
-
-                # Reset mock for next test
-                ModelProviderRegistry.get_provider_for_model.reset_mock()
-
-                # Test DebugIssueTool (EXTENDED_REASONING) - should prefer pro
-                debug_tool = DebugIssueTool()
-                await debug_tool.execute({"prompt": "test error", "model": "auto"})
+            # Test DebugIssueTool (EXTENDED_REASONING) - auto mode should suggest pro variant
+            debug_tool = DebugIssueTool()
+            debug_message = debug_tool._build_auto_mode_required_message()
+            assert "pro" in debug_message
 
     def test_auto_mode_schema_includes_all_available_models(self):
         """Test that auto mode schema includes all available models for user convenience."""
@@ -390,30 +376,25 @@ class TestAutoModeComprehensive:
             chat_tool = ChatTool()
             workdir = tmp_path / "chat_artifacts"
             workdir.mkdir(parents=True, exist_ok=True)
-            result = await chat_tool.execute(
-                {
-                    "prompt": "test",
-                    "working_directory": str(workdir),
-                    # Note: no model parameter provided in auto mode
-                }
-            )
+            with pytest.raises(ToolExecutionError) as exc_info:
+                await chat_tool.execute(
+                    {
+                        "prompt": "test",
+                        "working_directory": str(workdir),
+                        # Note: no model parameter provided in auto mode
+                    }
+                )
 
-            # Should get error requiring model selection
-            assert len(result) == 1
-            response_text = result[0].text
-
-            # Parse JSON response to check error
+            # Should get error requiring model selection with fallback suggestion
             import json
 
-            response_data = json.loads(response_text)
+            response_data = json.loads(exc_info.value.payload)
 
             assert response_data["status"] == "error"
             assert (
-                "Model parameter is required" in response_data["content"]
-                or "Model 'auto' is not available" in response_data["content"]
+                "Model parameter is required" in response_data["content"] or "Model 'auto'" in response_data["content"]
             )
-            # Note: With the new SimpleTool-based Chat tool, the error format is simpler
-            # and doesn't include category-specific suggestions like the original tool did
+            assert "flash" in response_data["content"]
 
     def test_model_availability_with_restrictions(self):
         """Test that auto mode respects model restrictions when selecting fallback models."""
diff --git a/tests/test_auto_mode_model_listing.py b/tests/test_auto_mode_model_listing.py
index 2b78249..3c844ad 100644
--- a/tests/test_auto_mode_model_listing.py
+++ b/tests/test_auto_mode_model_listing.py
@@ -14,6 +14,7 @@ from providers.openrouter import OpenRouterProvider
 from providers.registry import ModelProviderRegistry
 from providers.shared import ProviderType
 from providers.xai import XAIModelProvider
+from tools.shared.exceptions import ToolExecutionError
 
 
 def _extract_available_models(message: str) -> list[str]:
@@ -123,18 +124,18 @@ def test_error_listing_respects_env_restrictions(monkeypatch, reset_registry):
     model_restrictions._restriction_service = None
     server.configure_providers()
 
-    result = asyncio.run(
-        server.handle_call_tool(
-            "chat",
-            {
-                "model": "gpt5mini",
-                "prompt": "Tell me about your strengths",
-            },
+    with pytest.raises(ToolExecutionError) as exc_info:
+        asyncio.run(
+            server.handle_call_tool(
+                "chat",
+                {
+                    "model": "gpt5mini",
+                    "prompt": "Tell me about your strengths",
+                },
+            )
         )
-    )
 
-    assert len(result) == 1
-    payload = json.loads(result[0].text)
+    payload = json.loads(exc_info.value.payload)
     assert payload["status"] == "error"
 
     available_models = _extract_available_models(payload["content"])
@@ -208,18 +209,18 @@ def test_error_listing_without_restrictions_shows_full_catalog(monkeypatch, rese
     model_restrictions._restriction_service = None
     server.configure_providers()
 
-    result = asyncio.run(
-        server.handle_call_tool(
-            "chat",
-            {
-                "model": "dummymodel",
-                "prompt": "Hi there",
-            },
+    with pytest.raises(ToolExecutionError) as exc_info:
+        asyncio.run(
+            server.handle_call_tool(
+                "chat",
+                {
+                    "model": "dummymodel",
+                    "prompt": "Hi there",
+                },
+            )
         )
-    )
 
-    assert len(result) == 1
-    payload = json.loads(result[0].text)
+    payload = json.loads(exc_info.value.payload)
     assert payload["status"] == "error"
 
     available_models = _extract_available_models(payload["content"])
diff --git a/tests/test_challenge.py b/tests/test_challenge.py
index c090f60..e9d30a5 100644
--- a/tests/test_challenge.py
+++ b/tests/test_challenge.py
@@ -12,6 +12,7 @@ from unittest.mock import patch
 import pytest
 
 from tools.challenge import ChallengeRequest, ChallengeTool
+from tools.shared.exceptions import ToolExecutionError
 
 
 class TestChallengeTool:
@@ -110,10 +111,10 @@ class TestChallengeTool:
         """Test error handling in execute method"""
         # Test with invalid arguments (non-dict)
         with patch.object(self.tool, "get_request_model", side_effect=Exception("Test error")):
-            result = await self.tool.execute({"prompt": "test"})
+            with pytest.raises(ToolExecutionError) as exc_info:
+                await self.tool.execute({"prompt": "test"})
 
-        assert len(result) == 1
-        response_data = json.loads(result[0].text)
+        response_data = json.loads(exc_info.value.payload)
         assert response_data["status"] == "error"
         assert "Test error" in response_data["error"]
 
diff --git a/tests/test_chat_simple.py b/tests/test_chat_simple.py
index ad86a51..af45f2f 100644
--- a/tests/test_chat_simple.py
+++ b/tests/test_chat_simple.py
@@ -5,11 +5,14 @@ This module contains unit tests to ensure that the Chat tool
 (now using SimpleTool architecture) maintains proper functionality.
 """
 
+import json
+from types import SimpleNamespace
 from unittest.mock import patch
 
 import pytest
 
 from tools.chat import ChatRequest, ChatTool
+from tools.shared.exceptions import ToolExecutionError
 
 
 class TestChatTool:
@@ -125,6 +128,30 @@ class TestChatTool:
         assert "AGENT'S TURN:" in formatted
         assert "Evaluate this perspective" in formatted
 
+    def test_format_response_multiple_generated_code_blocks(self, tmp_path):
+        """All generated-code blocks should be combined and saved to zen_generated.code."""
+        tool = ChatTool()
+        tool._model_context = SimpleNamespace(capabilities=SimpleNamespace(allow_code_generation=True))
+
+        response = (
+            "Intro text\n"
+            "<GENERATED-CODE>print('hello')</GENERATED-CODE>\n"
+            "Other text\n"
+            "<GENERATED-CODE>print('world')</GENERATED-CODE>"
+        )
+
+        request = ChatRequest(prompt="Test", working_directory=str(tmp_path))
+
+        formatted = tool.format_response(response, request)
+
+        saved_path = tmp_path / "zen_generated.code"
+        saved_content = saved_path.read_text(encoding="utf-8")
+
+        assert "print('hello')" in saved_content
+        assert "print('world')" in saved_content
+        assert saved_content.count("<GENERATED-CODE>") == 2
+        assert str(saved_path) in formatted
+
     def test_tool_name(self):
         """Test tool name is correct"""
         assert self.tool.get_name() == "chat"
@@ -163,10 +190,38 @@ class TestChatRequestModel:
         # Field descriptions should exist and be descriptive
         assert len(CHAT_FIELD_DESCRIPTIONS["prompt"]) > 50
         assert "context" in CHAT_FIELD_DESCRIPTIONS["prompt"]
-        assert "full-paths" in CHAT_FIELD_DESCRIPTIONS["files"] or "absolute" in CHAT_FIELD_DESCRIPTIONS["files"]
+        files_desc = CHAT_FIELD_DESCRIPTIONS["files"].lower()
+        assert "absolute" in files_desc
         assert "visual context" in CHAT_FIELD_DESCRIPTIONS["images"]
         assert "directory" in CHAT_FIELD_DESCRIPTIONS["working_directory"].lower()
 
+    def test_working_directory_description_matches_behavior(self):
+        """Working directory description should reflect automatic creation."""
+        from tools.chat import CHAT_FIELD_DESCRIPTIONS
+
+        description = CHAT_FIELD_DESCRIPTIONS["working_directory"].lower()
+        assert "must already exist" in description
+
+    @pytest.mark.asyncio
+    async def test_working_directory_must_exist(self, tmp_path):
+        """Chat tool should reject non-existent working directories."""
+        tool = ChatTool()
+        missing_dir = tmp_path / "nonexistent_subdir"
+
+        with pytest.raises(ToolExecutionError) as exc_info:
+            await tool.execute(
+                {
+                    "prompt": "test",
+                    "files": [],
+                    "images": [],
+                    "working_directory": str(missing_dir),
+                }
+            )
+
+        payload = json.loads(exc_info.value.payload)
+        assert payload["status"] == "error"
+        assert "existing directory" in payload["content"].lower()
+
     def test_default_values(self):
         """Test that default values work correctly"""
         request = ChatRequest(prompt="Test", working_directory="/tmp")
diff --git a/tests/test_image_support_integration.py b/tests/test_image_support_integration.py
index 219dce6..498de7c 100644
--- a/tests/test_image_support_integration.py
+++ b/tests/test_image_support_integration.py
@@ -8,7 +8,6 @@ Tests the complete image support pipeline:
 - Cross-tool image context preservation
 """
 
-import json
 import os
 import tempfile
 import uuid
@@ -18,6 +17,7 @@ import pytest
 
 from tools.chat import ChatTool
 from tools.debug import DebugIssueTool
+from tools.shared.exceptions import ToolExecutionError
 from utils.conversation_memory import (
     ConversationTurn,
     ThreadContext,
@@ -276,31 +276,28 @@ class TestImageSupportIntegration:
             tool = ChatTool()
 
             # Test with real provider resolution
-            try:
-                result = await tool.execute(
-                    {"prompt": "What do you see in this image?", "images": [temp_image_path], "model": "gpt-4o"}
-                )
+            with tempfile.TemporaryDirectory() as working_directory:
+                with pytest.raises(ToolExecutionError) as exc_info:
+                    await tool.execute(
+                        {
+                            "prompt": "What do you see in this image?",
+                            "images": [temp_image_path],
+                            "model": "gpt-4o",
+                            "working_directory": working_directory,
+                        }
+                    )
 
-                # If we get here, check the response format
-                assert len(result) == 1
-                # Should be a valid JSON response
-                output = json.loads(result[0].text)
-                assert "status" in output
-                # Test passed - provider accepted images parameter
+            error_msg = exc_info.value.payload if hasattr(exc_info.value, "payload") else str(exc_info.value)
 
-            except Exception as e:
-                # Expected: API call will fail with fake key
-                error_msg = str(e)
-                # Should NOT be a mock-related error
-                assert "MagicMock" not in error_msg
-                assert "'<' not supported between instances" not in error_msg
+            # Should NOT be a mock-related error
+            assert "MagicMock" not in error_msg
+            assert "'<' not supported between instances" not in error_msg
 
-                # Should be a real provider error (API key or network)
-                assert any(
-                    phrase in error_msg
-                    for phrase in ["API", "key", "authentication", "provider", "network", "connection", "401", "403"]
-                )
-                # Test passed - provider processed images parameter before failing on auth
+            # Should be a real provider error (API key or network)
+            assert any(
+                phrase in error_msg
+                for phrase in ["API", "key", "authentication", "provider", "network", "connection", "401", "403"]
+            )
 
         finally:
             # Clean up temp file
diff --git a/tests/test_large_prompt_handling.py b/tests/test_large_prompt_handling.py
index c256ee7..3425e13 100644
--- a/tests/test_large_prompt_handling.py
+++ b/tests/test_large_prompt_handling.py
@@ -13,11 +13,11 @@ import tempfile
 from unittest.mock import MagicMock, patch
 
 import pytest
-from mcp.types import TextContent
 
 from config import MCP_PROMPT_SIZE_LIMIT
 from tools.chat import ChatTool
 from tools.codereview import CodeReviewTool
+from tools.shared.exceptions import ToolExecutionError
 
 # from tools.debug import DebugIssueTool  # Commented out - debug tool refactored
 
@@ -59,14 +59,12 @@ class TestLargePromptHandling:
         temp_dir = tempfile.mkdtemp()
         temp_dir = tempfile.mkdtemp()
         try:
-            result = await tool.execute({"prompt": large_prompt, "working_directory": temp_dir})
+            with pytest.raises(ToolExecutionError) as exc_info:
+                await tool.execute({"prompt": large_prompt, "working_directory": temp_dir})
         finally:
             shutil.rmtree(temp_dir, ignore_errors=True)
 
-        assert len(result) == 1
-        assert isinstance(result[0], TextContent)
-
-        output = json.loads(result[0].text)
+        output = json.loads(exc_info.value.payload)
         assert output["status"] == "resend_prompt"
         assert f"{MCP_PROMPT_SIZE_LIMIT:,} characters" in output["content"]
         # The prompt size should match the user input since we check at MCP transport boundary before adding internal content
@@ -83,23 +81,20 @@ class TestLargePromptHandling:
         # This test runs in the test environment which uses dummy keys
         # The chat tool will return an error for dummy keys, which is expected
         try:
-            result = await tool.execute(
-                {"prompt": normal_prompt, "model": "gemini-2.5-flash", "working_directory": temp_dir}
-            )
+            try:
+                result = await tool.execute(
+                    {"prompt": normal_prompt, "model": "gemini-2.5-flash", "working_directory": temp_dir}
+                )
+            except ToolExecutionError as exc:
+                output = json.loads(exc.payload if hasattr(exc, "payload") else str(exc))
+            else:
+                assert len(result) == 1
+                output = json.loads(result[0].text)
         finally:
             shutil.rmtree(temp_dir, ignore_errors=True)
 
-        assert len(result) == 1
-        output = json.loads(result[0].text)
-
-        # The test will fail with dummy API keys, which is expected behavior
-        # We're mainly testing that the tool processes prompts correctly without size errors
-        if output["status"] == "error":
-            # Provider stubs surface generic errors when SDKs are unavailable.
-            # As long as we didn't trigger the MCP size guard, the behavior is acceptable.
-            assert output["status"] != "resend_prompt"
-        else:
-            assert output["status"] != "resend_prompt"
+        # Whether provider succeeds or fails, we should not hit the resend_prompt branch
+        assert output["status"] != "resend_prompt"
 
     @pytest.mark.asyncio
     async def test_chat_prompt_file_handling(self):
@@ -115,27 +110,24 @@ class TestLargePromptHandling:
             f.write(reasonable_prompt)
 
         try:
-            # This test runs in the test environment which uses dummy keys
-            # The chat tool will return an error for dummy keys, which is expected
-            result = await tool.execute(
-                {
-                    "prompt": "",
-                    "files": [temp_prompt_file],
-                    "model": "gemini-2.5-flash",
-                    "working_directory": temp_dir,
-                }
-            )
-
-            assert len(result) == 1
-            output = json.loads(result[0].text)
-
-            # The test will fail with dummy API keys, which is expected behavior
-            # We're mainly testing that the tool processes prompts correctly without size errors
-            if output["status"] == "error":
-                assert output["status"] != "resend_prompt"
+            try:
+                result = await tool.execute(
+                    {
+                        "prompt": "",
+                        "files": [temp_prompt_file],
+                        "model": "gemini-2.5-flash",
+                        "working_directory": temp_dir,
+                    }
+                )
+            except ToolExecutionError as exc:
+                output = json.loads(exc.payload if hasattr(exc, "payload") else str(exc))
             else:
-                assert output["status"] != "resend_prompt"
+                assert len(result) == 1
+                output = json.loads(result[0].text)
 
+            # The test may fail with dummy API keys, which is expected behavior.
+            # We're mainly testing that the tool processes prompt files correctly without size errors.
+            assert output["status"] != "resend_prompt"
         finally:
             # Cleanup
             shutil.rmtree(temp_dir)
@@ -173,39 +165,47 @@ class TestLargePromptHandling:
 
             # Test with real provider resolution
             try:
-                result = await tool.execute(
-                    {
-                        "files": ["/some/file.py"],
-                        "focus_on": large_prompt,
-                        "prompt": "Test code review for validation purposes",
-                        "model": "o3-mini",
-                    }
-                )
+                args = {
+                    "step": "initial review setup",
+                    "step_number": 1,
+                    "total_steps": 1,
+                    "next_step_required": False,
+                    "findings": "Initial testing",
+                    "relevant_files": ["/some/file.py"],
+                    "files_checked": ["/some/file.py"],
+                    "focus_on": large_prompt,
+                    "prompt": "Test code review for validation purposes",
+                    "model": "o3-mini",
+                }
 
-                # The large focus_on should be detected and handled properly
-                assert len(result) == 1
-                output = json.loads(result[0].text)
-                # Should detect large prompt and return resend_prompt status
-                assert output["status"] == "resend_prompt"
+                try:
+                    result = await tool.execute(args)
+                except ToolExecutionError as exc:
+                    output = json.loads(exc.payload if hasattr(exc, "payload") else str(exc))
+                else:
+                    assert len(result) == 1
+                    output = json.loads(result[0].text)
+
+                # The large focus_on may trigger the resend_prompt guard before provider access.
+                # When the guard does not trigger, auto-mode falls back to provider selection and
+                # returns an error about the unavailable model. Both behaviors are acceptable for this test.
+                if output.get("status") == "resend_prompt":
+                    assert output["metadata"]["prompt_size"] == len(large_prompt)
+                else:
+                    assert output.get("status") == "error"
+                    assert "Model" in output.get("content", "")
 
             except Exception as e:
-                # If we get an exception, check it's not a MagicMock error
+                # If we get an unexpected exception, ensure it's not a mock artifact
                 error_msg = str(e)
                 assert "MagicMock" not in error_msg
                 assert "'<' not supported between instances" not in error_msg
 
                 # Should be a real provider error (API, authentication, etc.)
-                # But the large prompt detection should happen BEFORE the API call
-                # So we might still get the resend_prompt response
-                if "resend_prompt" in error_msg:
-                    # This is actually the expected behavior - large prompt was detected
-                    assert True
-                else:
-                    # Should be a real provider error
-                    assert any(
-                        phrase in error_msg
-                        for phrase in ["API", "key", "authentication", "provider", "network", "connection"]
-                    )
+                assert any(
+                    phrase in error_msg
+                    for phrase in ["API", "key", "authentication", "provider", "network", "connection"]
+                )
 
         finally:
             # Restore environment
@@ -322,10 +322,14 @@ class TestLargePromptHandling:
             # With the fix, this should now pass because we check at MCP transport boundary before adding internal content
             temp_dir = tempfile.mkdtemp()
             try:
-                result = await tool.execute({"prompt": exact_prompt, "working_directory": temp_dir})
+                try:
+                    result = await tool.execute({"prompt": exact_prompt, "working_directory": temp_dir})
+                except ToolExecutionError as exc:
+                    output = json.loads(exc.payload if hasattr(exc, "payload") else str(exc))
+                else:
+                    output = json.loads(result[0].text)
             finally:
                 shutil.rmtree(temp_dir, ignore_errors=True)
-            output = json.loads(result[0].text)
             assert output["status"] != "resend_prompt"
 
     @pytest.mark.asyncio
@@ -336,10 +340,14 @@ class TestLargePromptHandling:
 
         temp_dir = tempfile.mkdtemp()
         try:
-            result = await tool.execute({"prompt": over_prompt, "working_directory": temp_dir})
+            try:
+                result = await tool.execute({"prompt": over_prompt, "working_directory": temp_dir})
+            except ToolExecutionError as exc:
+                output = json.loads(exc.payload if hasattr(exc, "payload") else str(exc))
+            else:
+                output = json.loads(result[0].text)
         finally:
             shutil.rmtree(temp_dir, ignore_errors=True)
-        output = json.loads(result[0].text)
         assert output["status"] == "resend_prompt"
 
     @pytest.mark.asyncio
@@ -361,10 +369,14 @@ class TestLargePromptHandling:
 
             temp_dir = tempfile.mkdtemp()
             try:
-                result = await tool.execute({"prompt": "", "working_directory": temp_dir})
+                try:
+                    result = await tool.execute({"prompt": "", "working_directory": temp_dir})
+                except ToolExecutionError as exc:
+                    output = json.loads(exc.payload if hasattr(exc, "payload") else str(exc))
+                else:
+                    output = json.loads(result[0].text)
             finally:
                 shutil.rmtree(temp_dir, ignore_errors=True)
-            output = json.loads(result[0].text)
             assert output["status"] != "resend_prompt"
 
     @pytest.mark.asyncio
@@ -401,10 +413,14 @@ class TestLargePromptHandling:
             # Should continue with empty prompt when file can't be read
             temp_dir = tempfile.mkdtemp()
             try:
-                result = await tool.execute({"prompt": "", "files": [bad_file], "working_directory": temp_dir})
+                try:
+                    result = await tool.execute({"prompt": "", "files": [bad_file], "working_directory": temp_dir})
+                except ToolExecutionError as exc:
+                    output = json.loads(exc.payload if hasattr(exc, "payload") else str(exc))
+                else:
+                    output = json.loads(result[0].text)
             finally:
                 shutil.rmtree(temp_dir, ignore_errors=True)
-            output = json.loads(result[0].text)
             assert output["status"] != "resend_prompt"
 
     @pytest.mark.asyncio
@@ -540,33 +556,37 @@ class TestLargePromptHandling:
         large_user_input = "x" * (MCP_PROMPT_SIZE_LIMIT + 1000)
         temp_dir = tempfile.mkdtemp()
         try:
-            result = await tool.execute({"prompt": large_user_input, "model": "flash", "working_directory": temp_dir})
-            output = json.loads(result[0].text)
+            try:
+                result = await tool.execute(
+                    {"prompt": large_user_input, "model": "flash", "working_directory": temp_dir}
+                )
+            except ToolExecutionError as exc:
+                output = json.loads(exc.payload if hasattr(exc, "payload") else str(exc))
+            else:
+                output = json.loads(result[0].text)
+
             assert output["status"] == "resend_prompt"  # Should fail
             assert "too large for MCP's token limits" in output["content"]
 
             # Test case 2: Small user input should succeed even with huge internal processing
             small_user_input = "Hello"
 
-            # This test runs in the test environment which uses dummy keys
-            # The chat tool will return an error for dummy keys, which is expected
-            result = await tool.execute(
-                {
-                    "prompt": small_user_input,
-                    "model": "gemini-2.5-flash",
-                    "working_directory": temp_dir,
-                }
-            )
-            output = json.loads(result[0].text)
+            try:
+                result = await tool.execute(
+                    {
+                        "prompt": small_user_input,
+                        "model": "gemini-2.5-flash",
+                        "working_directory": temp_dir,
+                    }
+                )
+            except ToolExecutionError as exc:
+                output = json.loads(exc.payload if hasattr(exc, "payload") else str(exc))
+            else:
+                output = json.loads(result[0].text)
 
             # The test will fail with dummy API keys, which is expected behavior
             # We're mainly testing that the tool processes small prompts correctly without size errors
-            if output["status"] == "error":
-                # If it's an API error, that's fine - we're testing prompt handling, not API calls
-                assert "API" in output["content"] or "key" in output["content"] or "authentication" in output["content"]
-            else:
-                # If somehow it succeeds (e.g., with mocked provider), check the response
-                assert output["status"] != "resend_prompt"
+            assert output["status"] != "resend_prompt"
         finally:
             shutil.rmtree(temp_dir, ignore_errors=True)
 
diff --git a/tests/test_mcp_error_handling.py b/tests/test_mcp_error_handling.py
new file mode 100644
index 0000000..e8267d6
--- /dev/null
+++ b/tests/test_mcp_error_handling.py
@@ -0,0 +1,64 @@
+import json
+from types import SimpleNamespace
+
+import pytest
+from mcp.types import CallToolRequest, CallToolRequestParams
+
+from providers.registry import ModelProviderRegistry
+from server import server as mcp_server
+
+
+def _install_dummy_provider(monkeypatch):
+    """Ensure preflight model checks succeed without real provider configuration."""
+
+    class DummyProvider:
+        def get_provider_type(self):
+            return SimpleNamespace(value="dummy")
+
+        def get_capabilities(self, model_name):
+            return SimpleNamespace(
+                supports_extended_thinking=False,
+                allow_code_generation=False,
+                supports_images=False,
+                context_window=1_000_000,
+                max_image_size_mb=10,
+            )
+
+    monkeypatch.setattr(
+        ModelProviderRegistry,
+        "get_provider_for_model",
+        classmethod(lambda cls, model_name: DummyProvider()),
+    )
+    monkeypatch.setattr(
+        ModelProviderRegistry,
+        "get_available_models",
+        classmethod(lambda cls, respect_restrictions=False: {"gemini-2.5-flash": None}),
+    )
+
+
+@pytest.mark.asyncio
+async def test_tool_execution_error_sets_is_error_flag_for_mcp_response(monkeypatch):
+    """Ensure ToolExecutionError surfaces as CallToolResult with isError=True."""
+
+    _install_dummy_provider(monkeypatch)
+
+    handler = mcp_server.request_handlers[CallToolRequest]
+
+    arguments = {
+        "prompt": "Trigger working_directory validation failure",
+        "working_directory": "relative/path",  # Not absolute -> ToolExecutionError from ChatTool
+        "files": [],
+        "model": "gemini-2.5-flash",
+    }
+
+    request = CallToolRequest(params=CallToolRequestParams(name="chat", arguments=arguments))
+
+    server_result = await handler(request)
+
+    assert server_result.root.isError is True
+    assert server_result.root.content, "Expected error response content"
+
+    payload = server_result.root.content[0].text
+    data = json.loads(payload)
+    assert data["status"] == "error"
+    assert "absolute" in data["content"].lower()
diff --git a/tests/test_per_tool_model_defaults.py b/tests/test_per_tool_model_defaults.py
index 4f8c623..747a93a 100644
--- a/tests/test_per_tool_model_defaults.py
+++ b/tests/test_per_tool_model_defaults.py
@@ -18,6 +18,7 @@ from tools.debug import DebugIssueTool
 from tools.models import ToolModelCategory
 from tools.precommit import PrecommitTool
 from tools.shared.base_tool import BaseTool
+from tools.shared.exceptions import ToolExecutionError
 from tools.thinkdeep import ThinkDeepTool
 
 
@@ -294,15 +295,12 @@ class TestAutoModeErrorMessages:
                         tool = ChatTool()
                         temp_dir = tempfile.mkdtemp()
                         try:
-                            result = await tool.execute(
-                                {"prompt": "test", "model": "auto", "working_directory": temp_dir}
-                            )
+                            with pytest.raises(ToolExecutionError) as exc_info:
+                                await tool.execute({"prompt": "test", "model": "auto", "working_directory": temp_dir})
                         finally:
                             shutil.rmtree(temp_dir, ignore_errors=True)
 
-                        assert len(result) == 1
-                        # The SimpleTool will wrap the error message
-                        error_output = json.loads(result[0].text)
+                        error_output = json.loads(exc_info.value.payload)
                         assert error_output["status"] == "error"
                         assert "Model 'auto' is not available" in error_output["content"]
 
@@ -412,7 +410,6 @@ class TestRuntimeModelSelection:
                     }
                 )
 
-                # Should require model selection even though DEFAULT_MODEL is valid
                 assert len(result) == 1
                 assert "Model 'auto' is not available" in result[0].text
 
@@ -428,16 +425,15 @@ class TestRuntimeModelSelection:
                     tool = ChatTool()
                     temp_dir = tempfile.mkdtemp()
                     try:
-                        result = await tool.execute(
-                            {"prompt": "test", "model": "gpt-5-turbo", "working_directory": temp_dir}
-                        )
+                        with pytest.raises(ToolExecutionError) as exc_info:
+                            await tool.execute(
+                                {"prompt": "test", "model": "gpt-5-turbo", "working_directory": temp_dir}
+                            )
                     finally:
                         shutil.rmtree(temp_dir, ignore_errors=True)
 
                     # Should require model selection
-                    assert len(result) == 1
-                    # When a specific model is requested but not available, error message is different
-                    error_output = json.loads(result[0].text)
+                    error_output = json.loads(exc_info.value.payload)
                     assert error_output["status"] == "error"
                     assert "gpt-5-turbo" in error_output["content"]
                     assert "is not available" in error_output["content"]
diff --git a/tests/test_planner.py b/tests/test_planner.py
index 75f358c..081e1d0 100644
--- a/tests/test_planner.py
+++ b/tests/test_planner.py
@@ -8,6 +8,7 @@ import pytest
 
 from tools.models import ToolModelCategory
 from tools.planner import PlannerRequest, PlannerTool
+from tools.shared.exceptions import ToolExecutionError
 
 
 class TestPlannerTool:
@@ -340,16 +341,12 @@ class TestPlannerTool:
             # Missing required fields: step_number, total_steps, next_step_required
         }
 
-        result = await tool.execute(arguments)
+        with pytest.raises(ToolExecutionError) as exc_info:
+            await tool.execute(arguments)
 
-        # Should return error response
-        assert len(result) == 1
-        response_text = result[0].text
-
-        # Parse the JSON response
         import json
 
-        parsed_response = json.loads(response_text)
+        parsed_response = json.loads(exc_info.value.payload)
 
         assert parsed_response["status"] == "planner_failed"
         assert "error" in parsed_response
diff --git a/tests/test_thinking_modes.py b/tests/test_thinking_modes.py
index a0bc839..294854c 100644
--- a/tests/test_thinking_modes.py
+++ b/tests/test_thinking_modes.py
@@ -87,16 +87,26 @@ class TestThinkingModes:
             except Exception as e:
                 # Expected: API call will fail with fake key, but we can check the error
                 # If we get a provider resolution error, that's what we're testing
-                error_msg = str(e)
+                error_msg = getattr(e, "payload", str(e))
                 # Should NOT be a mock-related error - should be a real API or key error
                 assert "MagicMock" not in error_msg
                 assert "'<' not supported between instances" not in error_msg
 
                 # Should be a real provider error (API key, network, etc.)
-                assert any(
-                    phrase in error_msg
-                    for phrase in ["API", "key", "authentication", "provider", "network", "connection"]
-                )
+                import json
+
+                try:
+                    parsed = json.loads(error_msg)
+                except Exception:
+                    parsed = None
+
+                if isinstance(parsed, dict) and parsed.get("status", "").endswith("_failed"):
+                    assert "validation errors" in parsed.get("error", "")
+                else:
+                    assert any(
+                        phrase in error_msg
+                        for phrase in ["API", "key", "authentication", "provider", "network", "connection", "Model"]
+                    )
 
         finally:
             # Restore environment
@@ -156,16 +166,26 @@ class TestThinkingModes:
 
             except Exception as e:
                 # Expected: API call will fail with fake key
-                error_msg = str(e)
+                error_msg = getattr(e, "payload", str(e))
                 # Should NOT be a mock-related error
                 assert "MagicMock" not in error_msg
                 assert "'<' not supported between instances" not in error_msg
 
                 # Should be a real provider error
-                assert any(
-                    phrase in error_msg
-                    for phrase in ["API", "key", "authentication", "provider", "network", "connection"]
-                )
+                import json
+
+                try:
+                    parsed = json.loads(error_msg)
+                except Exception:
+                    parsed = None
+
+                if isinstance(parsed, dict) and parsed.get("status", "").endswith("_failed"):
+                    assert "validation errors" in parsed.get("error", "")
+                else:
+                    assert any(
+                        phrase in error_msg
+                        for phrase in ["API", "key", "authentication", "provider", "network", "connection", "Model"]
+                    )
 
         finally:
             # Restore environment
@@ -226,16 +246,26 @@ class TestThinkingModes:
 
             except Exception as e:
                 # Expected: API call will fail with fake key
-                error_msg = str(e)
+                error_msg = getattr(e, "payload", str(e))
                 # Should NOT be a mock-related error
                 assert "MagicMock" not in error_msg
                 assert "'<' not supported between instances" not in error_msg
 
                 # Should be a real provider error
-                assert any(
-                    phrase in error_msg
-                    for phrase in ["API", "key", "authentication", "provider", "network", "connection"]
-                )
+                import json
+
+                try:
+                    parsed = json.loads(error_msg)
+                except Exception:
+                    parsed = None
+
+                if isinstance(parsed, dict) and parsed.get("status", "").endswith("_failed"):
+                    assert "validation errors" in parsed.get("error", "")
+                else:
+                    assert any(
+                        phrase in error_msg
+                        for phrase in ["API", "key", "authentication", "provider", "network", "connection", "Model"]
+                    )
 
         finally:
             # Restore environment
@@ -295,16 +325,26 @@ class TestThinkingModes:
 
             except Exception as e:
                 # Expected: API call will fail with fake key
-                error_msg = str(e)
+                error_msg = getattr(e, "payload", str(e))
                 # Should NOT be a mock-related error
                 assert "MagicMock" not in error_msg
                 assert "'<' not supported between instances" not in error_msg
 
                 # Should be a real provider error
-                assert any(
-                    phrase in error_msg
-                    for phrase in ["API", "key", "authentication", "provider", "network", "connection"]
-                )
+                import json
+
+                try:
+                    parsed = json.loads(error_msg)
+                except Exception:
+                    parsed = None
+
+                if isinstance(parsed, dict) and parsed.get("status", "").endswith("_failed"):
+                    assert "validation errors" in parsed.get("error", "")
+                else:
+                    assert any(
+                        phrase in error_msg
+                        for phrase in ["API", "key", "authentication", "provider", "network", "connection", "Model"]
+                    )
 
         finally:
             # Restore environment
@@ -367,16 +407,26 @@ class TestThinkingModes:
 
             except Exception as e:
                 # Expected: API call will fail with fake key
-                error_msg = str(e)
+                error_msg = getattr(e, "payload", str(e))
                 # Should NOT be a mock-related error
                 assert "MagicMock" not in error_msg
                 assert "'<' not supported between instances" not in error_msg
 
                 # Should be a real provider error
-                assert any(
-                    phrase in error_msg
-                    for phrase in ["API", "key", "authentication", "provider", "network", "connection"]
-                )
+                import json
+
+                try:
+                    parsed = json.loads(error_msg)
+                except Exception:
+                    parsed = None
+
+                if isinstance(parsed, dict) and parsed.get("status", "").endswith("_failed"):
+                    assert "validation errors" in parsed.get("error", "")
+                else:
+                    assert any(
+                        phrase in error_msg
+                        for phrase in ["API", "key", "authentication", "provider", "network", "connection", "Model"]
+                    )
 
         finally:
             # Restore environment
diff --git a/tests/test_tools.py b/tests/test_tools.py
index dbcf0c9..89245a7 100644
--- a/tests/test_tools.py
+++ b/tests/test_tools.py
@@ -9,6 +9,7 @@ import tempfile
 import pytest
 
 from tools import AnalyzeTool, ChatTool, CodeReviewTool, ThinkDeepTool
+from tools.shared.exceptions import ToolExecutionError
 
 
 class TestThinkDeepTool:
@@ -324,19 +325,19 @@ class TestAbsolutePathValidation:
     async def test_thinkdeep_tool_relative_path_rejected(self):
         """Test that thinkdeep tool rejects relative paths"""
         tool = ThinkDeepTool()
-        result = await tool.execute(
-            {
-                "step": "My analysis",
-                "step_number": 1,
-                "total_steps": 1,
-                "next_step_required": False,
-                "findings": "Initial analysis",
-                "files_checked": ["./local/file.py"],
-            }
-        )
+        with pytest.raises(ToolExecutionError) as exc_info:
+            await tool.execute(
+                {
+                    "step": "My analysis",
+                    "step_number": 1,
+                    "total_steps": 1,
+                    "next_step_required": False,
+                    "findings": "Initial analysis",
+                    "files_checked": ["./local/file.py"],
+                }
+            )
 
-        assert len(result) == 1
-        response = json.loads(result[0].text)
+        response = json.loads(exc_info.value.payload)
         assert response["status"] == "error"
         assert "must be FULL absolute paths" in response["content"]
         assert "./local/file.py" in response["content"]
@@ -347,18 +348,18 @@ class TestAbsolutePathValidation:
         tool = ChatTool()
         temp_dir = tempfile.mkdtemp()
         try:
-            result = await tool.execute(
-                {
-                    "prompt": "Explain this code",
-                    "files": ["code.py"],  # relative path without ./
-                    "working_directory": temp_dir,
-                }
-            )
+            with pytest.raises(ToolExecutionError) as exc_info:
+                await tool.execute(
+                    {
+                        "prompt": "Explain this code",
+                        "files": ["code.py"],  # relative path without ./
+                        "working_directory": temp_dir,
+                    }
+                )
         finally:
             shutil.rmtree(temp_dir, ignore_errors=True)
 
-        assert len(result) == 1
-        response = json.loads(result[0].text)
+        response = json.loads(exc_info.value.payload)
         assert response["status"] == "error"
         assert "must be FULL absolute paths" in response["content"]
         assert "code.py" in response["content"]
diff --git a/tests/test_workflow_metadata.py b/tests/test_workflow_metadata.py
index 0d0e870..80bb623 100644
--- a/tests/test_workflow_metadata.py
+++ b/tests/test_workflow_metadata.py
@@ -13,6 +13,7 @@ import pytest
 from providers.registry import ModelProviderRegistry
 from providers.shared import ProviderType
 from tools.debug import DebugIssueTool
+from tools.shared.exceptions import ToolExecutionError
 
 
 class TestWorkflowMetadata:
@@ -167,12 +168,10 @@ class TestWorkflowMetadata:
             # Execute the workflow tool - should fail gracefully
             import asyncio
 
-            result = asyncio.run(debug_tool.execute(arguments))
+            with pytest.raises(ToolExecutionError) as exc_info:
+                asyncio.run(debug_tool.execute(arguments))
 
-            # Parse the JSON response
-            assert len(result) == 1
-            response_text = result[0].text
-            response_data = json.loads(response_text)
+            response_data = json.loads(exc_info.value.payload)
 
             # Verify it's an error response with metadata
             assert "status" in response_data
diff --git a/tests/test_workflow_prompt_size_validation_simple.py b/tests/test_workflow_prompt_size_validation_simple.py
index 4fd84a7..30dd4f2 100644
--- a/tests/test_workflow_prompt_size_validation_simple.py
+++ b/tests/test_workflow_prompt_size_validation_simple.py
@@ -12,6 +12,7 @@ import pytest
 
 from config import MCP_PROMPT_SIZE_LIMIT
 from tools.debug import DebugIssueTool
+from tools.shared.exceptions import ToolExecutionError
 
 
 def build_debug_arguments(**overrides) -> dict[str, object]:
@@ -60,16 +61,10 @@ async def test_workflow_tool_rejects_oversized_step_with_guidance() -> None:
     tool = DebugIssueTool()
     arguments = build_debug_arguments(step=oversized_step)
 
-    responses = await tool.execute(arguments)
-    assert len(responses) == 1
+    with pytest.raises(ToolExecutionError) as exc_info:
+        await tool.execute(arguments)
 
-    payload = json.loads(responses[0].text)
-    assert payload["status"] == "debug_failed"
-    assert "error" in payload
-
-    # Extract the serialized ToolOutput from the MCP_SIZE_CHECK marker
-    error_details = payload["error"].split("MCP_SIZE_CHECK:", 1)[1]
-    output_payload = json.loads(error_details)
+    output_payload = json.loads(exc_info.value.payload)
 
     assert output_payload["status"] == "resend_prompt"
     assert output_payload["metadata"]["prompt_size"] > MCP_PROMPT_SIZE_LIMIT
diff --git a/tools/apilookup.py b/tools/apilookup.py
index 743b20d..f097fe7 100644
--- a/tools/apilookup.py
+++ b/tools/apilookup.py
@@ -28,8 +28,9 @@ LOOKUP_PROMPT = """
 MANDATORY: You MUST perform this research in a SEPARATE SUB-TASK using your web search tool.
 
 CRITICAL RULES - READ CAREFULLY:
-- NEVER call `apilookup` / `zen.apilookup` or any other zen tool again for this mission. Launch your environment's dedicated web search capability
-  (for example `websearch`, `web_search`, or another native web-search tool such as the one you use to perform a web search online) to gather sources.
+- Launch your environment's dedicated web search capability (for example `websearch`, `web_search`, or another native
+web-search tool such as the one you use to perform a web search online) to gather sources - do NOT call this `apilookup` tool again
+during the same lookup, this is ONLY an orchestration tool to guide you and has NO web search capability of its own.
 - ALWAYS run the search from a separate sub-task/sub-process so the research happens outside this tool invocation.
 - If the environment does not expose a web search tool, immediately report that limitation instead of invoking `apilookup` again.
 
diff --git a/tools/challenge.py b/tools/challenge.py
index a217924..a63463d 100644
--- a/tools/challenge.py
+++ b/tools/challenge.py
@@ -17,6 +17,7 @@ if TYPE_CHECKING:
 
 from config import TEMPERATURE_ANALYTICAL
 from tools.shared.base_models import ToolRequest
+from tools.shared.exceptions import ToolExecutionError
 
 from .simple.base import SimpleTool
 
@@ -138,6 +139,8 @@ class ChallengeTool(SimpleTool):
 
             return [TextContent(type="text", text=json.dumps(response_data, indent=2, ensure_ascii=False))]
 
+        except ToolExecutionError:
+            raise
         except Exception as e:
             import logging
 
@@ -150,7 +153,7 @@ class ChallengeTool(SimpleTool):
                 "content": f"Failed to create challenge prompt: {str(e)}",
             }
 
-            return [TextContent(type="text", text=json.dumps(error_data, ensure_ascii=False))]
+            raise ToolExecutionError(json.dumps(error_data, ensure_ascii=False)) from e
 
     def _wrap_prompt_for_challenge(self, prompt: str) -> str:
         """
diff --git a/tools/chat.py b/tools/chat.py
index f9e7beb..9d42a3d 100644
--- a/tools/chat.py
+++ b/tools/chat.py
@@ -30,10 +30,10 @@ CHAT_FIELD_DESCRIPTIONS = {
         "Your question or idea for collaborative thinking. Provide detailed context, including your goal, what you've tried, and any specific challenges. "
         "CRITICAL: To discuss code, use 'files' parameter instead of pasting code blocks here."
     ),
-    "files": "absolute file or folder paths for code context (do NOT shorten).",
-    "images": "Optional absolute image paths or base64 for visual context when helpful.",
+    "files": "Absolute file or folder paths for code context.",
+    "images": "Image paths (absolute) or base64 strings for optional visual context.",
     "working_directory": (
-        "Absolute full directory path where the assistant AI can save generated code for implementation. The directory must already exist"
+        "Absolute directory path where generated code artifacts are stored. The directory must already exist."
     ),
 }
 
@@ -98,17 +98,11 @@ class ChatTool(SimpleTool):
         """Return the Chat-specific request model"""
         return ChatRequest
 
-    # === Schema Generation ===
-    # For maximum compatibility, we override get_input_schema() to match the original Chat tool exactly
+    # === Schema Generation Utilities ===
 
     def get_input_schema(self) -> dict[str, Any]:
-        """
-        Generate input schema matching the original Chat tool exactly.
+        """Generate input schema matching the original Chat tool expectations."""
 
-        This maintains 100% compatibility with the original Chat tool by using
-        the same schema generation approach while still benefiting from SimpleTool
-        convenience methods.
-        """
         required_fields = ["prompt", "working_directory"]
         if self.is_effective_auto_mode():
             required_fields.append("model")
@@ -152,22 +146,14 @@ class ChatTool(SimpleTool):
                 },
             },
             "required": required_fields,
+            "additionalProperties": False,
         }
 
         return schema
 
-    # === Tool-specific field definitions (alternative approach for reference) ===
-    # These aren't used since we override get_input_schema(), but they show how
-    # the tool could be implemented using the automatic SimpleTool schema building
-
     def get_tool_fields(self) -> dict[str, dict[str, Any]]:
-        """
-        Tool-specific field definitions for ChatSimple.
+        """Tool-specific field definitions used by SimpleTool scaffolding."""
 
-        Note: This method isn't used since we override get_input_schema() for
-        exact compatibility, but it demonstrates how ChatSimple could be
-        implemented using automatic schema building.
-        """
         return {
             "prompt": {
                 "type": "string",
@@ -204,6 +190,19 @@ class ChatTool(SimpleTool):
     def _validate_file_paths(self, request) -> Optional[str]:
         """Extend validation to cover the working directory path."""
 
+        files = self.get_request_files(request)
+        if files:
+            expanded_files: list[str] = []
+            for file_path in files:
+                expanded = os.path.expanduser(file_path)
+                if not os.path.isabs(expanded):
+                    return (
+                        "Error: All file paths must be FULL absolute paths to real files / folders - DO NOT SHORTEN. "
+                        f"Received: {file_path}"
+                    )
+                expanded_files.append(expanded)
+            self.set_request_files(request, expanded_files)
+
         error = super()._validate_file_paths(request)
         if error:
             return error
@@ -216,6 +215,10 @@ class ChatTool(SimpleTool):
                     "Error: 'working_directory' must be an absolute path (you may use '~' which will be expanded). "
                     f"Received: {working_directory}"
                 )
+            if not os.path.isdir(expanded):
+                return (
+                    "Error: 'working_directory' must reference an existing directory. " f"Received: {working_directory}"
+                )
         return None
 
     def format_response(self, response: str, request: ChatRequest, model_info: Optional[dict] = None) -> str:
@@ -227,7 +230,7 @@ class ChatTool(SimpleTool):
         recordable_override: Optional[str] = None
 
         if self._model_supports_code_generation():
-            block, remainder = self._extract_generated_code_block(response)
+            block, remainder, _ = self._extract_generated_code_block(response)
             if block:
                 sanitized_text = remainder.strip()
                 try:
@@ -239,14 +242,15 @@ class ChatTool(SimpleTool):
                         "Check the path permissions and re-run. The generated code block is included below for manual handling."
                     )
 
-                    history_copy = self._join_sections(sanitized_text, warning) if sanitized_text else warning
+                    history_copy_base = sanitized_text
+                    history_copy = self._join_sections(history_copy_base, warning) if history_copy_base else warning
                     recordable_override = history_copy
 
                     sanitized_warning = history_copy.strip()
                     body = f"{sanitized_warning}\n\n{block.strip()}".strip()
                 else:
                     if not sanitized_text:
-                        sanitized_text = (
+                        base_message = (
                             "Generated code saved to zen_generated.code.\n"
                             "\n"
                             "CRITICAL: Contains mixed instructions + partial snippets - NOT complete code to copy as-is!\n"
@@ -260,6 +264,7 @@ class ChatTool(SimpleTool):
                             "\n"
                             "Treat as guidance to implement thoughtfully, not ready-to-paste code."
                         )
+                        sanitized_text = base_message
 
                     instruction = self._build_agent_instruction(artifact_path)
                     body = self._join_sections(sanitized_text, instruction)
@@ -300,26 +305,35 @@ class ChatTool(SimpleTool):
 
         return bool(capabilities.allow_code_generation)
 
-    def _extract_generated_code_block(self, text: str) -> tuple[Optional[str], str]:
-        match = re.search(r"<GENERATED-CODE>.*?</GENERATED-CODE>", text, flags=re.DOTALL | re.IGNORECASE)
-        if not match:
-            return None, text
+    def _extract_generated_code_block(self, text: str) -> tuple[Optional[str], str, int]:
+        matches = list(re.finditer(r"<GENERATED-CODE>.*?</GENERATED-CODE>", text, flags=re.DOTALL | re.IGNORECASE))
+        if not matches:
+            return None, text, 0
 
-        block = match.group(0)
-        before = text[: match.start()].rstrip()
-        after = text[match.end() :].lstrip()
+        blocks = [match.group(0).strip() for match in matches]
+        combined_block = "\n\n".join(blocks)
 
-        if before and after:
-            remainder = f"{before}\n\n{after}"
-        else:
-            remainder = before or after
+        remainder_parts: list[str] = []
+        last_end = 0
+        for match in matches:
+            start, end = match.span()
+            segment = text[last_end:start]
+            if segment:
+                remainder_parts.append(segment)
+            last_end = end
+        tail = text[last_end:]
+        if tail:
+            remainder_parts.append(tail)
 
-        return block, remainder or ""
+        remainder = self._join_sections(*remainder_parts)
+
+        return combined_block, remainder, len(blocks)
 
     def _persist_generated_code_block(self, block: str, working_directory: str) -> Path:
         expanded = os.path.expanduser(working_directory)
         target_dir = Path(expanded).resolve()
-        target_dir.mkdir(parents=True, exist_ok=True)
+        if not target_dir.is_dir():
+            raise FileNotFoundError(f"Working directory '{working_directory}' does not exist")
 
         target_file = target_dir / "zen_generated.code"
         if target_file.exists():
diff --git a/tools/clink.py b/tools/clink.py
index 4e91a52..148ab64 100644
--- a/tools/clink.py
+++ b/tools/clink.py
@@ -17,6 +17,7 @@ from clink.models import ResolvedCLIClient, ResolvedCLIRole
 from config import TEMPERATURE_BALANCED
 from tools.models import ToolModelCategory, ToolOutput
 from tools.shared.base_models import COMMON_FIELD_DESCRIPTIONS
+from tools.shared.exceptions import ToolExecutionError
 from tools.simple.base import SchemaBuilder, SimpleTool
 
 logger = logging.getLogger(__name__)
@@ -166,21 +167,21 @@ class CLinkTool(SimpleTool):
 
         path_error = self._validate_file_paths(request)
         if path_error:
-            return [self._error_response(path_error)]
+            self._raise_tool_error(path_error)
 
         selected_cli = request.cli_name or self._default_cli_name
         if not selected_cli:
-            return [self._error_response("No CLI clients are configured for clink.")]
+            self._raise_tool_error("No CLI clients are configured for clink.")
 
         try:
             client_config = self._registry.get_client(selected_cli)
         except KeyError as exc:
-            return [self._error_response(str(exc))]
+            self._raise_tool_error(str(exc))
 
         try:
             role_config = client_config.get_role(request.role)
         except KeyError as exc:
-            return [self._error_response(str(exc))]
+            self._raise_tool_error(str(exc))
 
         files = self.get_request_files(request)
         images = self.get_request_images(request)
@@ -200,7 +201,7 @@ class CLinkTool(SimpleTool):
             )
         except Exception as exc:
             logger.exception("Failed to prepare clink prompt")
-            return [self._error_response(f"Failed to prepare prompt: {exc}")]
+            self._raise_tool_error(f"Failed to prepare prompt: {exc}")
 
         agent = create_agent(client_config)
         try:
@@ -213,13 +214,10 @@ class CLinkTool(SimpleTool):
             )
         except CLIAgentError as exc:
             metadata = self._build_error_metadata(client_config, exc)
-            error_output = ToolOutput(
-                status="error",
-                content=f"CLI '{client_config.name}' execution failed: {exc}",
-                content_type="text",
+            self._raise_tool_error(
+                f"CLI '{client_config.name}' execution failed: {exc}",
                 metadata=metadata,
             )
-            return [TextContent(type="text", text=error_output.model_dump_json())]
 
         metadata = self._build_success_metadata(client_config, role_config, result)
         metadata = self._prune_metadata(metadata, client_config, reason="normal")
@@ -436,9 +434,9 @@ class CLinkTool(SimpleTool):
             metadata["stderr"] = exc.stderr.strip()
         return metadata
 
-    def _error_response(self, message: str) -> TextContent:
-        error_output = ToolOutput(status="error", content=message, content_type="text")
-        return TextContent(type="text", text=error_output.model_dump_json())
+    def _raise_tool_error(self, message: str, metadata: dict[str, Any] | None = None) -> None:
+        error_output = ToolOutput(status="error", content=message, content_type="text", metadata=metadata)
+        raise ToolExecutionError(error_output.model_dump_json())
 
     def _agent_capabilities_guidance(self) -> str:
         return (
diff --git a/tools/shared/exceptions.py b/tools/shared/exceptions.py
new file mode 100644
index 0000000..325b86f
--- /dev/null
+++ b/tools/shared/exceptions.py
@@ -0,0 +1,20 @@
+"""
+Custom exceptions for Zen MCP tools.
+
+These exceptions allow tools to signal protocol-level errors that should be surfaced
+to MCP clients using the `isError` flag on `CallToolResult`. Raising one of these
+exceptions ensures the low-level server adapter marks the result as an error while
+preserving the structured payload we pass through the exception message.
+"""
+
+
+class ToolExecutionError(RuntimeError):
+    """Raised to indicate a tool-level failure that must set `isError=True`."""
+
+    def __init__(self, payload: str):
+        """
+        Args:
+            payload: Serialized error payload (typically JSON) to return to the client.
+        """
+        super().__init__(payload)
+        self.payload = payload
diff --git a/tools/simple/base.py b/tools/simple/base.py
index 4a2a1a3..33a6697 100644
--- a/tools/simple/base.py
+++ b/tools/simple/base.py
@@ -17,6 +17,7 @@ from typing import Any, Optional
 
 from tools.shared.base_models import ToolRequest
 from tools.shared.base_tool import BaseTool
+from tools.shared.exceptions import ToolExecutionError
 from tools.shared.schema_builders import SchemaBuilder
 
 
@@ -269,7 +270,6 @@ class SimpleTool(BaseTool):
 
         This method replicates the proven execution pattern while using SimpleTool hooks.
         """
-        import json
         import logging
 
         from mcp.types import TextContent
@@ -298,7 +298,8 @@ class SimpleTool(BaseTool):
                     content=path_error,
                     content_type="text",
                 )
-                return [TextContent(type="text", text=error_output.model_dump_json())]
+                logger.error("Path validation failed for %s: %s", self.get_name(), path_error)
+                raise ToolExecutionError(error_output.model_dump_json())
 
             # Handle model resolution like old base.py
             model_name = self.get_request_model_name(request)
@@ -389,7 +390,15 @@ class SimpleTool(BaseTool):
                     images, model_context=self._model_context, continuation_id=continuation_id
                 )
                 if image_validation_error:
-                    return [TextContent(type="text", text=json.dumps(image_validation_error, ensure_ascii=False))]
+                    error_output = ToolOutput(
+                        status=image_validation_error.get("status", "error"),
+                        content=image_validation_error.get("content"),
+                        content_type=image_validation_error.get("content_type", "text"),
+                        metadata=image_validation_error.get("metadata"),
+                    )
+                    payload = error_output.model_dump_json()
+                    logger.error("Image validation failed for %s: %s", self.get_name(), payload)
+                    raise ToolExecutionError(payload)
 
             # Get and validate temperature against model constraints
             temperature, temp_warnings = self.get_validated_temperature(request, self._model_context)
@@ -552,15 +561,21 @@ class SimpleTool(BaseTool):
                             content_type="text",
                         )
 
-            # Return the tool output as TextContent
-            return [TextContent(type="text", text=tool_output.model_dump_json())]
+            # Return the tool output as TextContent, marking protocol errors appropriately
+            payload = tool_output.model_dump_json()
+            if tool_output.status == "error":
+                logger.error("%s reported error status - raising ToolExecutionError", self.get_name())
+                raise ToolExecutionError(payload)
+            return [TextContent(type="text", text=payload)]
 
+        except ToolExecutionError:
+            raise
         except Exception as e:
             # Special handling for MCP size check errors
             if str(e).startswith("MCP_SIZE_CHECK:"):
                 # Extract the JSON content after the prefix
                 json_content = str(e)[len("MCP_SIZE_CHECK:") :]
-                return [TextContent(type="text", text=json_content)]
+                raise ToolExecutionError(json_content)
 
             logger.error(f"Error in {self.get_name()}: {str(e)}")
             error_output = ToolOutput(
@@ -568,7 +583,7 @@ class SimpleTool(BaseTool):
                 content=f"Error in {self.get_name()}: {str(e)}",
                 content_type="text",
             )
-            return [TextContent(type="text", text=error_output.model_dump_json())]
+            raise ToolExecutionError(error_output.model_dump_json()) from e
 
     def _parse_response(self, raw_text: str, request, model_info: Optional[dict] = None):
         """
diff --git a/tools/workflow/workflow_mixin.py b/tools/workflow/workflow_mixin.py
index 21c5bb2..bd6cbf5 100644
--- a/tools/workflow/workflow_mixin.py
+++ b/tools/workflow/workflow_mixin.py
@@ -33,6 +33,7 @@ from config import MCP_PROMPT_SIZE_LIMIT
 from utils.conversation_memory import add_turn, create_thread
 
 from ..shared.base_models import ConsolidatedFindings
+from ..shared.exceptions import ToolExecutionError
 
 logger = logging.getLogger(__name__)
 
@@ -645,7 +646,8 @@ class BaseWorkflowMixin(ABC):
                         content=path_error,
                         content_type="text",
                     )
-                    return [TextContent(type="text", text=error_output.model_dump_json())]
+                    logger.error("Path validation failed for %s: %s", self.get_name(), path_error)
+                    raise ToolExecutionError(error_output.model_dump_json())
             except AttributeError:
                 # validate_file_paths method not available - skip validation
                 pass
@@ -738,7 +740,13 @@ class BaseWorkflowMixin(ABC):
 
             return [TextContent(type="text", text=json.dumps(response_data, indent=2, ensure_ascii=False))]
 
+        except ToolExecutionError:
+            raise
         except Exception as e:
+            if str(e).startswith("MCP_SIZE_CHECK:"):
+                payload = str(e)[len("MCP_SIZE_CHECK:") :]
+                raise ToolExecutionError(payload)
+
             logger.error(f"Error in {self.get_name()} work: {e}", exc_info=True)
             error_data = {
                 "status": f"{self.get_name()}_failed",
@@ -749,7 +757,7 @@ class BaseWorkflowMixin(ABC):
             # Add metadata to error responses too
             self._add_workflow_metadata(error_data, arguments)
 
-            return [TextContent(type="text", text=json.dumps(error_data, indent=2, ensure_ascii=False))]
+            raise ToolExecutionError(json.dumps(error_data, indent=2, ensure_ascii=False)) from e
 
     # Hook methods for tool customization
 
@@ -1577,11 +1585,13 @@ class BaseWorkflowMixin(ABC):
                 error_data = {"status": "error", "content": "No arguments provided"}
                 # Add basic metadata even for validation errors
                 error_data["metadata"] = {"tool_name": self.get_name()}
-                return [TextContent(type="text", text=json.dumps(error_data, ensure_ascii=False))]
+                raise ToolExecutionError(json.dumps(error_data, ensure_ascii=False))
 
             # Delegate to execute_workflow
             return await self.execute_workflow(arguments)
 
+        except ToolExecutionError:
+            raise
         except Exception as e:
             logger.error(f"Error in {self.get_name()} tool execution: {e}", exc_info=True)
             error_data = {
@@ -1589,12 +1599,7 @@ class BaseWorkflowMixin(ABC):
                 "content": f"Error in {self.get_name()}: {str(e)}",
             }  # Add metadata to error responses
             self._add_workflow_metadata(error_data, arguments)
-            return [
-                TextContent(
-                    type="text",
-                    text=json.dumps(error_data, ensure_ascii=False),
-                )
-            ]
+            raise ToolExecutionError(json.dumps(error_data, ensure_ascii=False)) from e
 
     # Default implementations for methods that workflow-based tools typically don't need