fix: improved error reporting; codex cli would at times fail to figure out how to handle plain-text / JSON errors

fix: working directory should exist, raise error and not try and create one docs: improved API Lookup instructions * test added to confirm failures * chat schema more explicit about file paths
2025-10-17 23:42:32 +04:00
parent 71796c0c70
commit 95e69a7cb2
24 changed files with 569 additions and 337 deletions
--- a/tests/test_auto_mode.py
+++ b/tests/test_auto_mode.py
@@ -7,6 +7,7 @@ from unittest.mock import patch
 import pytest

 from tools.chat import ChatTool
+from tools.shared.exceptions import ToolExecutionError


 class TestAutoMode:
@@ -153,14 +154,14 @@ class TestAutoMode:

            # Mock the provider to avoid real API calls
            with patch.object(tool, "get_model_provider"):
-                # Execute without model parameter
-                result = await tool.execute({"prompt": "Test prompt", "working_directory": str(tmp_path)})
+                # Execute without model parameter and expect protocol error
+                with pytest.raises(ToolExecutionError) as exc_info:
+                    await tool.execute({"prompt": "Test prompt", "working_directory": str(tmp_path)})

-            # Should get error
-            assert len(result) == 1
-            response = result[0].text
-            assert "error" in response
-            assert "Model parameter is required" in response or "Model 'auto' is not available" in response
+            # Should get error payload mentioning model requirement
+            error_payload = getattr(exc_info.value, "payload", str(exc_info.value))
+            assert "Model" in error_payload
+            assert "auto" in error_payload

        finally:
            # Restore
--- a/tests/test_auto_mode_comprehensive.py
+++ b/tests/test_auto_mode_comprehensive.py
@@ -15,6 +15,7 @@ from tools.analyze import AnalyzeTool
 from tools.chat import ChatTool
 from tools.debug import DebugIssueTool
 from tools.models import ToolModelCategory
+from tools.shared.exceptions import ToolExecutionError
 from tools.thinkdeep import ThinkDeepTool


@@ -227,30 +228,15 @@ class TestAutoModeComprehensive:
            # Register only Gemini provider
            ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)

-            # Mock provider to capture what model is requested
-            mock_provider = MagicMock()
-            mock_provider.generate_content.return_value = MagicMock(
-                content="test response", model_name="test-model", usage={"input_tokens": 10, "output_tokens": 5}
-            )
+            # Test ChatTool (FAST_RESPONSE) - auto mode should suggest flash variant
+            chat_tool = ChatTool()
+            chat_message = chat_tool._build_auto_mode_required_message()
+            assert "flash" in chat_message

-            with patch.object(ModelProviderRegistry, "get_provider_for_model", return_value=mock_provider):
-                workdir = tmp_path / "chat_artifacts"
-                workdir.mkdir(parents=True, exist_ok=True)
-                # Test ChatTool (FAST_RESPONSE) - should prefer flash
-                chat_tool = ChatTool()
-                await chat_tool.execute(
-                    {"prompt": "test", "model": "auto", "working_directory": str(workdir)}
-                )  # This should trigger auto selection
-
-                # In auto mode, the tool should get an error requiring model selection
-                # but the suggested model should be flash
-
-                # Reset mock for next test
-                ModelProviderRegistry.get_provider_for_model.reset_mock()
-
-                # Test DebugIssueTool (EXTENDED_REASONING) - should prefer pro
-                debug_tool = DebugIssueTool()
-                await debug_tool.execute({"prompt": "test error", "model": "auto"})
+            # Test DebugIssueTool (EXTENDED_REASONING) - auto mode should suggest pro variant
+            debug_tool = DebugIssueTool()
+            debug_message = debug_tool._build_auto_mode_required_message()
+            assert "pro" in debug_message

    def test_auto_mode_schema_includes_all_available_models(self):
        """Test that auto mode schema includes all available models for user convenience."""
@@ -390,30 +376,25 @@ class TestAutoModeComprehensive:
            chat_tool = ChatTool()
            workdir = tmp_path / "chat_artifacts"
            workdir.mkdir(parents=True, exist_ok=True)
-            result = await chat_tool.execute(
-                {
-                    "prompt": "test",
-                    "working_directory": str(workdir),
-                    # Note: no model parameter provided in auto mode
-                }
-            )
+            with pytest.raises(ToolExecutionError) as exc_info:
+                await chat_tool.execute(
+                    {
+                        "prompt": "test",
+                        "working_directory": str(workdir),
+                        # Note: no model parameter provided in auto mode
+                    }
+                )

-            # Should get error requiring model selection
-            assert len(result) == 1
-            response_text = result[0].text
-
-            # Parse JSON response to check error
+            # Should get error requiring model selection with fallback suggestion
            import json

-            response_data = json.loads(response_text)
+            response_data = json.loads(exc_info.value.payload)

            assert response_data["status"] == "error"
            assert (
-                "Model parameter is required" in response_data["content"]
-                or "Model 'auto' is not available" in response_data["content"]
+                "Model parameter is required" in response_data["content"] or "Model 'auto'" in response_data["content"]
            )
-            # Note: With the new SimpleTool-based Chat tool, the error format is simpler
-            # and doesn't include category-specific suggestions like the original tool did
+            assert "flash" in response_data["content"]

    def test_model_availability_with_restrictions(self):
        """Test that auto mode respects model restrictions when selecting fallback models."""
--- a/tests/test_auto_mode_model_listing.py
+++ b/tests/test_auto_mode_model_listing.py
@@ -14,6 +14,7 @@ from providers.openrouter import OpenRouterProvider
 from providers.registry import ModelProviderRegistry
 from providers.shared import ProviderType
 from providers.xai import XAIModelProvider
+from tools.shared.exceptions import ToolExecutionError


 def _extract_available_models(message: str) -> list[str]:
@@ -123,18 +124,18 @@ def test_error_listing_respects_env_restrictions(monkeypatch, reset_registry):
    model_restrictions._restriction_service = None
    server.configure_providers()

-    result = asyncio.run(
-        server.handle_call_tool(
-            "chat",
-            {
-                "model": "gpt5mini",
-                "prompt": "Tell me about your strengths",
-            },
+    with pytest.raises(ToolExecutionError) as exc_info:
+        asyncio.run(
+            server.handle_call_tool(
+                "chat",
+                {
+                    "model": "gpt5mini",
+                    "prompt": "Tell me about your strengths",
+                },
+            )
        )
-    )

-    assert len(result) == 1
-    payload = json.loads(result[0].text)
+    payload = json.loads(exc_info.value.payload)
    assert payload["status"] == "error"

    available_models = _extract_available_models(payload["content"])
@@ -208,18 +209,18 @@ def test_error_listing_without_restrictions_shows_full_catalog(monkeypatch, rese
    model_restrictions._restriction_service = None
    server.configure_providers()

-    result = asyncio.run(
-        server.handle_call_tool(
-            "chat",
-            {
-                "model": "dummymodel",
-                "prompt": "Hi there",
-            },
+    with pytest.raises(ToolExecutionError) as exc_info:
+        asyncio.run(
+            server.handle_call_tool(
+                "chat",
+                {
+                    "model": "dummymodel",
+                    "prompt": "Hi there",
+                },
+            )
        )
-    )

-    assert len(result) == 1
-    payload = json.loads(result[0].text)
+    payload = json.loads(exc_info.value.payload)
    assert payload["status"] == "error"

    available_models = _extract_available_models(payload["content"])
--- a/tests/test_challenge.py
+++ b/tests/test_challenge.py
@@ -12,6 +12,7 @@ from unittest.mock import patch
 import pytest

 from tools.challenge import ChallengeRequest, ChallengeTool
+from tools.shared.exceptions import ToolExecutionError


 class TestChallengeTool:
@@ -110,10 +111,10 @@ class TestChallengeTool:
        """Test error handling in execute method"""
        # Test with invalid arguments (non-dict)
        with patch.object(self.tool, "get_request_model", side_effect=Exception("Test error")):
-            result = await self.tool.execute({"prompt": "test"})
+            with pytest.raises(ToolExecutionError) as exc_info:
+                await self.tool.execute({"prompt": "test"})

-        assert len(result) == 1
-        response_data = json.loads(result[0].text)
+        response_data = json.loads(exc_info.value.payload)
        assert response_data["status"] == "error"
        assert "Test error" in response_data["error"]

--- a/tests/test_chat_simple.py
+++ b/tests/test_chat_simple.py
@@ -5,11 +5,14 @@ This module contains unit tests to ensure that the Chat tool
 (now using SimpleTool architecture) maintains proper functionality.
 """

+import json
+from types import SimpleNamespace
 from unittest.mock import patch

 import pytest

 from tools.chat import ChatRequest, ChatTool
+from tools.shared.exceptions import ToolExecutionError


 class TestChatTool:
@@ -125,6 +128,30 @@ class TestChatTool:
        assert "AGENT'S TURN:" in formatted
        assert "Evaluate this perspective" in formatted

+    def test_format_response_multiple_generated_code_blocks(self, tmp_path):
+        """All generated-code blocks should be combined and saved to zen_generated.code."""
+        tool = ChatTool()
+        tool._model_context = SimpleNamespace(capabilities=SimpleNamespace(allow_code_generation=True))
+
+        response = (
+            "Intro text\n"
+            "<GENERATED-CODE>print('hello')</GENERATED-CODE>\n"
+            "Other text\n"
+            "<GENERATED-CODE>print('world')</GENERATED-CODE>"
+        )
+
+        request = ChatRequest(prompt="Test", working_directory=str(tmp_path))
+
+        formatted = tool.format_response(response, request)
+
+        saved_path = tmp_path / "zen_generated.code"
+        saved_content = saved_path.read_text(encoding="utf-8")
+
+        assert "print('hello')" in saved_content
+        assert "print('world')" in saved_content
+        assert saved_content.count("<GENERATED-CODE>") == 2
+        assert str(saved_path) in formatted
+
    def test_tool_name(self):
        """Test tool name is correct"""
        assert self.tool.get_name() == "chat"
@@ -163,10 +190,38 @@ class TestChatRequestModel:
        # Field descriptions should exist and be descriptive
        assert len(CHAT_FIELD_DESCRIPTIONS["prompt"]) > 50
        assert "context" in CHAT_FIELD_DESCRIPTIONS["prompt"]
-        assert "full-paths" in CHAT_FIELD_DESCRIPTIONS["files"] or "absolute" in CHAT_FIELD_DESCRIPTIONS["files"]
+        files_desc = CHAT_FIELD_DESCRIPTIONS["files"].lower()
+        assert "absolute" in files_desc
        assert "visual context" in CHAT_FIELD_DESCRIPTIONS["images"]
        assert "directory" in CHAT_FIELD_DESCRIPTIONS["working_directory"].lower()

+    def test_working_directory_description_matches_behavior(self):
+        """Working directory description should reflect automatic creation."""
+        from tools.chat import CHAT_FIELD_DESCRIPTIONS
+
+        description = CHAT_FIELD_DESCRIPTIONS["working_directory"].lower()
+        assert "must already exist" in description
+
+    @pytest.mark.asyncio
+    async def test_working_directory_must_exist(self, tmp_path):
+        """Chat tool should reject non-existent working directories."""
+        tool = ChatTool()
+        missing_dir = tmp_path / "nonexistent_subdir"
+
+        with pytest.raises(ToolExecutionError) as exc_info:
+            await tool.execute(
+                {
+                    "prompt": "test",
+                    "files": [],
+                    "images": [],
+                    "working_directory": str(missing_dir),
+                }
+            )
+
+        payload = json.loads(exc_info.value.payload)
+        assert payload["status"] == "error"
+        assert "existing directory" in payload["content"].lower()
+
    def test_default_values(self):
        """Test that default values work correctly"""
        request = ChatRequest(prompt="Test", working_directory="/tmp")
--- a/tests/test_image_support_integration.py
+++ b/tests/test_image_support_integration.py
@@ -8,7 +8,6 @@ Tests the complete image support pipeline:
 - Cross-tool image context preservation
 """

-import json
 import os
 import tempfile
 import uuid
@@ -18,6 +17,7 @@ import pytest

 from tools.chat import ChatTool
 from tools.debug import DebugIssueTool
+from tools.shared.exceptions import ToolExecutionError
 from utils.conversation_memory import (
    ConversationTurn,
    ThreadContext,
@@ -276,31 +276,28 @@ class TestImageSupportIntegration:
            tool = ChatTool()

            # Test with real provider resolution
-            try:
-                result = await tool.execute(
-                    {"prompt": "What do you see in this image?", "images": [temp_image_path], "model": "gpt-4o"}
-                )
+            with tempfile.TemporaryDirectory() as working_directory:
+                with pytest.raises(ToolExecutionError) as exc_info:
+                    await tool.execute(
+                        {
+                            "prompt": "What do you see in this image?",
+                            "images": [temp_image_path],
+                            "model": "gpt-4o",
+                            "working_directory": working_directory,
+                        }
+                    )

-                # If we get here, check the response format
-                assert len(result) == 1
-                # Should be a valid JSON response
-                output = json.loads(result[0].text)
-                assert "status" in output
-                # Test passed - provider accepted images parameter
+            error_msg = exc_info.value.payload if hasattr(exc_info.value, "payload") else str(exc_info.value)

-            except Exception as e:
-                # Expected: API call will fail with fake key
-                error_msg = str(e)
-                # Should NOT be a mock-related error
-                assert "MagicMock" not in error_msg
-                assert "'<' not supported between instances" not in error_msg
+            # Should NOT be a mock-related error
+            assert "MagicMock" not in error_msg
+            assert "'<' not supported between instances" not in error_msg

-                # Should be a real provider error (API key or network)
-                assert any(
-                    phrase in error_msg
-                    for phrase in ["API", "key", "authentication", "provider", "network", "connection", "401", "403"]
-                )
-                # Test passed - provider processed images parameter before failing on auth
+            # Should be a real provider error (API key or network)
+            assert any(
+                phrase in error_msg
+                for phrase in ["API", "key", "authentication", "provider", "network", "connection", "401", "403"]
+            )

        finally:
            # Clean up temp file
--- a/tests/test_large_prompt_handling.py
+++ b/tests/test_large_prompt_handling.py
@@ -13,11 +13,11 @@ import tempfile
 from unittest.mock import MagicMock, patch

 import pytest
-from mcp.types import TextContent

 from config import MCP_PROMPT_SIZE_LIMIT
 from tools.chat import ChatTool
 from tools.codereview import CodeReviewTool
+from tools.shared.exceptions import ToolExecutionError

 # from tools.debug import DebugIssueTool  # Commented out - debug tool refactored

@@ -59,14 +59,12 @@ class TestLargePromptHandling:
        temp_dir = tempfile.mkdtemp()
        temp_dir = tempfile.mkdtemp()
        try:
-            result = await tool.execute({"prompt": large_prompt, "working_directory": temp_dir})
+            with pytest.raises(ToolExecutionError) as exc_info:
+                await tool.execute({"prompt": large_prompt, "working_directory": temp_dir})
        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)

-        assert len(result) == 1
-        assert isinstance(result[0], TextContent)
-
-        output = json.loads(result[0].text)
+        output = json.loads(exc_info.value.payload)
        assert output["status"] == "resend_prompt"
        assert f"{MCP_PROMPT_SIZE_LIMIT:,} characters" in output["content"]
        # The prompt size should match the user input since we check at MCP transport boundary before adding internal content
@@ -83,23 +81,20 @@ class TestLargePromptHandling:
        # This test runs in the test environment which uses dummy keys
        # The chat tool will return an error for dummy keys, which is expected
        try:
-            result = await tool.execute(
-                {"prompt": normal_prompt, "model": "gemini-2.5-flash", "working_directory": temp_dir}
-            )
+            try:
+                result = await tool.execute(
+                    {"prompt": normal_prompt, "model": "gemini-2.5-flash", "working_directory": temp_dir}
+                )
+            except ToolExecutionError as exc:
+                output = json.loads(exc.payload if hasattr(exc, "payload") else str(exc))
+            else:
+                assert len(result) == 1
+                output = json.loads(result[0].text)
        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)

-        assert len(result) == 1
-        output = json.loads(result[0].text)
-
-        # The test will fail with dummy API keys, which is expected behavior
-        # We're mainly testing that the tool processes prompts correctly without size errors
-        if output["status"] == "error":
-            # Provider stubs surface generic errors when SDKs are unavailable.
-            # As long as we didn't trigger the MCP size guard, the behavior is acceptable.
-            assert output["status"] != "resend_prompt"
-        else:
-            assert output["status"] != "resend_prompt"
+        # Whether provider succeeds or fails, we should not hit the resend_prompt branch
+        assert output["status"] != "resend_prompt"

    @pytest.mark.asyncio
    async def test_chat_prompt_file_handling(self):
@@ -115,27 +110,24 @@ class TestLargePromptHandling:
            f.write(reasonable_prompt)

        try:
-            # This test runs in the test environment which uses dummy keys
-            # The chat tool will return an error for dummy keys, which is expected
-            result = await tool.execute(
-                {
-                    "prompt": "",
-                    "files": [temp_prompt_file],
-                    "model": "gemini-2.5-flash",
-                    "working_directory": temp_dir,
-                }
-            )
-
-            assert len(result) == 1
-            output = json.loads(result[0].text)
-
-            # The test will fail with dummy API keys, which is expected behavior
-            # We're mainly testing that the tool processes prompts correctly without size errors
-            if output["status"] == "error":
-                assert output["status"] != "resend_prompt"
+            try:
+                result = await tool.execute(
+                    {
+                        "prompt": "",
+                        "files": [temp_prompt_file],
+                        "model": "gemini-2.5-flash",
+                        "working_directory": temp_dir,
+                    }
+                )
+            except ToolExecutionError as exc:
+                output = json.loads(exc.payload if hasattr(exc, "payload") else str(exc))
            else:
-                assert output["status"] != "resend_prompt"
+                assert len(result) == 1
+                output = json.loads(result[0].text)

+            # The test may fail with dummy API keys, which is expected behavior.
+            # We're mainly testing that the tool processes prompt files correctly without size errors.
+            assert output["status"] != "resend_prompt"
        finally:
            # Cleanup
            shutil.rmtree(temp_dir)
@@ -173,39 +165,47 @@ class TestLargePromptHandling:

            # Test with real provider resolution
            try:
-                result = await tool.execute(
-                    {
-                        "files": ["/some/file.py"],
-                        "focus_on": large_prompt,
-                        "prompt": "Test code review for validation purposes",
-                        "model": "o3-mini",
-                    }
-                )
+                args = {
+                    "step": "initial review setup",
+                    "step_number": 1,
+                    "total_steps": 1,
+                    "next_step_required": False,
+                    "findings": "Initial testing",
+                    "relevant_files": ["/some/file.py"],
+                    "files_checked": ["/some/file.py"],
+                    "focus_on": large_prompt,
+                    "prompt": "Test code review for validation purposes",
+                    "model": "o3-mini",
+                }

-                # The large focus_on should be detected and handled properly
-                assert len(result) == 1
-                output = json.loads(result[0].text)
-                # Should detect large prompt and return resend_prompt status
-                assert output["status"] == "resend_prompt"
+                try:
+                    result = await tool.execute(args)
+                except ToolExecutionError as exc:
+                    output = json.loads(exc.payload if hasattr(exc, "payload") else str(exc))
+                else:
+                    assert len(result) == 1
+                    output = json.loads(result[0].text)
+
+                # The large focus_on may trigger the resend_prompt guard before provider access.
+                # When the guard does not trigger, auto-mode falls back to provider selection and
+                # returns an error about the unavailable model. Both behaviors are acceptable for this test.
+                if output.get("status") == "resend_prompt":
+                    assert output["metadata"]["prompt_size"] == len(large_prompt)
+                else:
+                    assert output.get("status") == "error"
+                    assert "Model" in output.get("content", "")

            except Exception as e:
-                # If we get an exception, check it's not a MagicMock error
+                # If we get an unexpected exception, ensure it's not a mock artifact
                error_msg = str(e)
                assert "MagicMock" not in error_msg
                assert "'<' not supported between instances" not in error_msg

                # Should be a real provider error (API, authentication, etc.)
-                # But the large prompt detection should happen BEFORE the API call
-                # So we might still get the resend_prompt response
-                if "resend_prompt" in error_msg:
-                    # This is actually the expected behavior - large prompt was detected
-                    assert True
-                else:
-                    # Should be a real provider error
-                    assert any(
-                        phrase in error_msg
-                        for phrase in ["API", "key", "authentication", "provider", "network", "connection"]
-                    )
+                assert any(
+                    phrase in error_msg
+                    for phrase in ["API", "key", "authentication", "provider", "network", "connection"]
+                )

        finally:
            # Restore environment
@@ -322,10 +322,14 @@ class TestLargePromptHandling:
            # With the fix, this should now pass because we check at MCP transport boundary before adding internal content
            temp_dir = tempfile.mkdtemp()
            try:
-                result = await tool.execute({"prompt": exact_prompt, "working_directory": temp_dir})
+                try:
+                    result = await tool.execute({"prompt": exact_prompt, "working_directory": temp_dir})
+                except ToolExecutionError as exc:
+                    output = json.loads(exc.payload if hasattr(exc, "payload") else str(exc))
+                else:
+                    output = json.loads(result[0].text)
            finally:
                shutil.rmtree(temp_dir, ignore_errors=True)
-            output = json.loads(result[0].text)
            assert output["status"] != "resend_prompt"

    @pytest.mark.asyncio
@@ -336,10 +340,14 @@ class TestLargePromptHandling:

        temp_dir = tempfile.mkdtemp()
        try:
-            result = await tool.execute({"prompt": over_prompt, "working_directory": temp_dir})
+            try:
+                result = await tool.execute({"prompt": over_prompt, "working_directory": temp_dir})
+            except ToolExecutionError as exc:
+                output = json.loads(exc.payload if hasattr(exc, "payload") else str(exc))
+            else:
+                output = json.loads(result[0].text)
        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)
-        output = json.loads(result[0].text)
        assert output["status"] == "resend_prompt"

    @pytest.mark.asyncio
@@ -361,10 +369,14 @@ class TestLargePromptHandling:

            temp_dir = tempfile.mkdtemp()
            try:
-                result = await tool.execute({"prompt": "", "working_directory": temp_dir})
+                try:
+                    result = await tool.execute({"prompt": "", "working_directory": temp_dir})
+                except ToolExecutionError as exc:
+                    output = json.loads(exc.payload if hasattr(exc, "payload") else str(exc))
+                else:
+                    output = json.loads(result[0].text)
            finally:
                shutil.rmtree(temp_dir, ignore_errors=True)
-            output = json.loads(result[0].text)
            assert output["status"] != "resend_prompt"

    @pytest.mark.asyncio
@@ -401,10 +413,14 @@ class TestLargePromptHandling:
            # Should continue with empty prompt when file can't be read
            temp_dir = tempfile.mkdtemp()
            try:
-                result = await tool.execute({"prompt": "", "files": [bad_file], "working_directory": temp_dir})
+                try:
+                    result = await tool.execute({"prompt": "", "files": [bad_file], "working_directory": temp_dir})
+                except ToolExecutionError as exc:
+                    output = json.loads(exc.payload if hasattr(exc, "payload") else str(exc))
+                else:
+                    output = json.loads(result[0].text)
            finally:
                shutil.rmtree(temp_dir, ignore_errors=True)
-            output = json.loads(result[0].text)
            assert output["status"] != "resend_prompt"

    @pytest.mark.asyncio
@@ -540,33 +556,37 @@ class TestLargePromptHandling:
        large_user_input = "x" * (MCP_PROMPT_SIZE_LIMIT + 1000)
        temp_dir = tempfile.mkdtemp()
        try:
-            result = await tool.execute({"prompt": large_user_input, "model": "flash", "working_directory": temp_dir})
-            output = json.loads(result[0].text)
+            try:
+                result = await tool.execute(
+                    {"prompt": large_user_input, "model": "flash", "working_directory": temp_dir}
+                )
+            except ToolExecutionError as exc:
+                output = json.loads(exc.payload if hasattr(exc, "payload") else str(exc))
+            else:
+                output = json.loads(result[0].text)
+
            assert output["status"] == "resend_prompt"  # Should fail
            assert "too large for MCP's token limits" in output["content"]

            # Test case 2: Small user input should succeed even with huge internal processing
            small_user_input = "Hello"

-            # This test runs in the test environment which uses dummy keys
-            # The chat tool will return an error for dummy keys, which is expected
-            result = await tool.execute(
-                {
-                    "prompt": small_user_input,
-                    "model": "gemini-2.5-flash",
-                    "working_directory": temp_dir,
-                }
-            )
-            output = json.loads(result[0].text)
+            try:
+                result = await tool.execute(
+                    {
+                        "prompt": small_user_input,
+                        "model": "gemini-2.5-flash",
+                        "working_directory": temp_dir,
+                    }
+                )
+            except ToolExecutionError as exc:
+                output = json.loads(exc.payload if hasattr(exc, "payload") else str(exc))
+            else:
+                output = json.loads(result[0].text)

            # The test will fail with dummy API keys, which is expected behavior
            # We're mainly testing that the tool processes small prompts correctly without size errors
-            if output["status"] == "error":
-                # If it's an API error, that's fine - we're testing prompt handling, not API calls
-                assert "API" in output["content"] or "key" in output["content"] or "authentication" in output["content"]
-            else:
-                # If somehow it succeeds (e.g., with mocked provider), check the response
-                assert output["status"] != "resend_prompt"
+            assert output["status"] != "resend_prompt"
        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)

--- a/tests/test_mcp_error_handling.py
+++ b/tests/test_mcp_error_handling.py
@@ -0,0 +1,64 @@
+import json
+from types import SimpleNamespace
+
+import pytest
+from mcp.types import CallToolRequest, CallToolRequestParams
+
+from providers.registry import ModelProviderRegistry
+from server import server as mcp_server
+
+
+def _install_dummy_provider(monkeypatch):
+    """Ensure preflight model checks succeed without real provider configuration."""
+
+    class DummyProvider:
+        def get_provider_type(self):
+            return SimpleNamespace(value="dummy")
+
+        def get_capabilities(self, model_name):
+            return SimpleNamespace(
+                supports_extended_thinking=False,
+                allow_code_generation=False,
+                supports_images=False,
+                context_window=1_000_000,
+                max_image_size_mb=10,
+            )
+
+    monkeypatch.setattr(
+        ModelProviderRegistry,
+        "get_provider_for_model",
+        classmethod(lambda cls, model_name: DummyProvider()),
+    )
+    monkeypatch.setattr(
+        ModelProviderRegistry,
+        "get_available_models",
+        classmethod(lambda cls, respect_restrictions=False: {"gemini-2.5-flash": None}),
+    )
+
+
+@pytest.mark.asyncio
+async def test_tool_execution_error_sets_is_error_flag_for_mcp_response(monkeypatch):
+    """Ensure ToolExecutionError surfaces as CallToolResult with isError=True."""
+
+    _install_dummy_provider(monkeypatch)
+
+    handler = mcp_server.request_handlers[CallToolRequest]
+
+    arguments = {
+        "prompt": "Trigger working_directory validation failure",
+        "working_directory": "relative/path",  # Not absolute -> ToolExecutionError from ChatTool
+        "files": [],
+        "model": "gemini-2.5-flash",
+    }
+
+    request = CallToolRequest(params=CallToolRequestParams(name="chat", arguments=arguments))
+
+    server_result = await handler(request)
+
+    assert server_result.root.isError is True
+    assert server_result.root.content, "Expected error response content"
+
+    payload = server_result.root.content[0].text
+    data = json.loads(payload)
+    assert data["status"] == "error"
+    assert "absolute" in data["content"].lower()
--- a/tests/test_per_tool_model_defaults.py
+++ b/tests/test_per_tool_model_defaults.py
@@ -18,6 +18,7 @@ from tools.debug import DebugIssueTool
 from tools.models import ToolModelCategory
 from tools.precommit import PrecommitTool
 from tools.shared.base_tool import BaseTool
+from tools.shared.exceptions import ToolExecutionError
 from tools.thinkdeep import ThinkDeepTool


@@ -294,15 +295,12 @@ class TestAutoModeErrorMessages:
                        tool = ChatTool()
                        temp_dir = tempfile.mkdtemp()
                        try:
-                            result = await tool.execute(
-                                {"prompt": "test", "model": "auto", "working_directory": temp_dir}
-                            )
+                            with pytest.raises(ToolExecutionError) as exc_info:
+                                await tool.execute({"prompt": "test", "model": "auto", "working_directory": temp_dir})
                        finally:
                            shutil.rmtree(temp_dir, ignore_errors=True)

-                        assert len(result) == 1
-                        # The SimpleTool will wrap the error message
-                        error_output = json.loads(result[0].text)
+                        error_output = json.loads(exc_info.value.payload)
                        assert error_output["status"] == "error"
                        assert "Model 'auto' is not available" in error_output["content"]

@@ -412,7 +410,6 @@ class TestRuntimeModelSelection:
                    }
                )

-                # Should require model selection even though DEFAULT_MODEL is valid
                assert len(result) == 1
                assert "Model 'auto' is not available" in result[0].text

@@ -428,16 +425,15 @@ class TestRuntimeModelSelection:
                    tool = ChatTool()
                    temp_dir = tempfile.mkdtemp()
                    try:
-                        result = await tool.execute(
-                            {"prompt": "test", "model": "gpt-5-turbo", "working_directory": temp_dir}
-                        )
+                        with pytest.raises(ToolExecutionError) as exc_info:
+                            await tool.execute(
+                                {"prompt": "test", "model": "gpt-5-turbo", "working_directory": temp_dir}
+                            )
                    finally:
                        shutil.rmtree(temp_dir, ignore_errors=True)

                    # Should require model selection
-                    assert len(result) == 1
-                    # When a specific model is requested but not available, error message is different
-                    error_output = json.loads(result[0].text)
+                    error_output = json.loads(exc_info.value.payload)
                    assert error_output["status"] == "error"
                    assert "gpt-5-turbo" in error_output["content"]
                    assert "is not available" in error_output["content"]
--- a/tests/test_planner.py
+++ b/tests/test_planner.py
@@ -8,6 +8,7 @@ import pytest

 from tools.models import ToolModelCategory
 from tools.planner import PlannerRequest, PlannerTool
+from tools.shared.exceptions import ToolExecutionError


 class TestPlannerTool:
@@ -340,16 +341,12 @@ class TestPlannerTool:
            # Missing required fields: step_number, total_steps, next_step_required
        }

-        result = await tool.execute(arguments)
+        with pytest.raises(ToolExecutionError) as exc_info:
+            await tool.execute(arguments)

-        # Should return error response
-        assert len(result) == 1
-        response_text = result[0].text
-
-        # Parse the JSON response
        import json

-        parsed_response = json.loads(response_text)
+        parsed_response = json.loads(exc_info.value.payload)

        assert parsed_response["status"] == "planner_failed"
        assert "error" in parsed_response
--- a/tests/test_thinking_modes.py
+++ b/tests/test_thinking_modes.py
@@ -87,16 +87,26 @@ class TestThinkingModes:
            except Exception as e:
                # Expected: API call will fail with fake key, but we can check the error
                # If we get a provider resolution error, that's what we're testing
-                error_msg = str(e)
+                error_msg = getattr(e, "payload", str(e))
                # Should NOT be a mock-related error - should be a real API or key error
                assert "MagicMock" not in error_msg
                assert "'<' not supported between instances" not in error_msg

                # Should be a real provider error (API key, network, etc.)
-                assert any(
-                    phrase in error_msg
-                    for phrase in ["API", "key", "authentication", "provider", "network", "connection"]
-                )
+                import json
+
+                try:
+                    parsed = json.loads(error_msg)
+                except Exception:
+                    parsed = None
+
+                if isinstance(parsed, dict) and parsed.get("status", "").endswith("_failed"):
+                    assert "validation errors" in parsed.get("error", "")
+                else:
+                    assert any(
+                        phrase in error_msg
+                        for phrase in ["API", "key", "authentication", "provider", "network", "connection", "Model"]
+                    )

        finally:
            # Restore environment
@@ -156,16 +166,26 @@ class TestThinkingModes:

            except Exception as e:
                # Expected: API call will fail with fake key
-                error_msg = str(e)
+                error_msg = getattr(e, "payload", str(e))
                # Should NOT be a mock-related error
                assert "MagicMock" not in error_msg
                assert "'<' not supported between instances" not in error_msg

                # Should be a real provider error
-                assert any(
-                    phrase in error_msg
-                    for phrase in ["API", "key", "authentication", "provider", "network", "connection"]
-                )
+                import json
+
+                try:
+                    parsed = json.loads(error_msg)
+                except Exception:
+                    parsed = None
+
+                if isinstance(parsed, dict) and parsed.get("status", "").endswith("_failed"):
+                    assert "validation errors" in parsed.get("error", "")
+                else:
+                    assert any(
+                        phrase in error_msg
+                        for phrase in ["API", "key", "authentication", "provider", "network", "connection", "Model"]
+                    )

        finally:
            # Restore environment
@@ -226,16 +246,26 @@ class TestThinkingModes:

            except Exception as e:
                # Expected: API call will fail with fake key
-                error_msg = str(e)
+                error_msg = getattr(e, "payload", str(e))
                # Should NOT be a mock-related error
                assert "MagicMock" not in error_msg
                assert "'<' not supported between instances" not in error_msg

                # Should be a real provider error
-                assert any(
-                    phrase in error_msg
-                    for phrase in ["API", "key", "authentication", "provider", "network", "connection"]
-                )
+                import json
+
+                try:
+                    parsed = json.loads(error_msg)
+                except Exception:
+                    parsed = None
+
+                if isinstance(parsed, dict) and parsed.get("status", "").endswith("_failed"):
+                    assert "validation errors" in parsed.get("error", "")
+                else:
+                    assert any(
+                        phrase in error_msg
+                        for phrase in ["API", "key", "authentication", "provider", "network", "connection", "Model"]
+                    )

        finally:
            # Restore environment
@@ -295,16 +325,26 @@ class TestThinkingModes:

            except Exception as e:
                # Expected: API call will fail with fake key
-                error_msg = str(e)
+                error_msg = getattr(e, "payload", str(e))
                # Should NOT be a mock-related error
                assert "MagicMock" not in error_msg
                assert "'<' not supported between instances" not in error_msg

                # Should be a real provider error
-                assert any(
-                    phrase in error_msg
-                    for phrase in ["API", "key", "authentication", "provider", "network", "connection"]
-                )
+                import json
+
+                try:
+                    parsed = json.loads(error_msg)
+                except Exception:
+                    parsed = None
+
+                if isinstance(parsed, dict) and parsed.get("status", "").endswith("_failed"):
+                    assert "validation errors" in parsed.get("error", "")
+                else:
+                    assert any(
+                        phrase in error_msg
+                        for phrase in ["API", "key", "authentication", "provider", "network", "connection", "Model"]
+                    )

        finally:
            # Restore environment
@@ -367,16 +407,26 @@ class TestThinkingModes:

            except Exception as e:
                # Expected: API call will fail with fake key
-                error_msg = str(e)
+                error_msg = getattr(e, "payload", str(e))
                # Should NOT be a mock-related error
                assert "MagicMock" not in error_msg
                assert "'<' not supported between instances" not in error_msg

                # Should be a real provider error
-                assert any(
-                    phrase in error_msg
-                    for phrase in ["API", "key", "authentication", "provider", "network", "connection"]
-                )
+                import json
+
+                try:
+                    parsed = json.loads(error_msg)
+                except Exception:
+                    parsed = None
+
+                if isinstance(parsed, dict) and parsed.get("status", "").endswith("_failed"):
+                    assert "validation errors" in parsed.get("error", "")
+                else:
+                    assert any(
+                        phrase in error_msg
+                        for phrase in ["API", "key", "authentication", "provider", "network", "connection", "Model"]
+                    )

        finally:
            # Restore environment
--- a/tests/test_tools.py
+++ b/tests/test_tools.py
@@ -9,6 +9,7 @@ import tempfile
 import pytest

 from tools import AnalyzeTool, ChatTool, CodeReviewTool, ThinkDeepTool
+from tools.shared.exceptions import ToolExecutionError


 class TestThinkDeepTool:
@@ -324,19 +325,19 @@ class TestAbsolutePathValidation:
    async def test_thinkdeep_tool_relative_path_rejected(self):
        """Test that thinkdeep tool rejects relative paths"""
        tool = ThinkDeepTool()
-        result = await tool.execute(
-            {
-                "step": "My analysis",
-                "step_number": 1,
-                "total_steps": 1,
-                "next_step_required": False,
-                "findings": "Initial analysis",
-                "files_checked": ["./local/file.py"],
-            }
-        )
+        with pytest.raises(ToolExecutionError) as exc_info:
+            await tool.execute(
+                {
+                    "step": "My analysis",
+                    "step_number": 1,
+                    "total_steps": 1,
+                    "next_step_required": False,
+                    "findings": "Initial analysis",
+                    "files_checked": ["./local/file.py"],
+                }
+            )

-        assert len(result) == 1
-        response = json.loads(result[0].text)
+        response = json.loads(exc_info.value.payload)
        assert response["status"] == "error"
        assert "must be FULL absolute paths" in response["content"]
        assert "./local/file.py" in response["content"]
@@ -347,18 +348,18 @@ class TestAbsolutePathValidation:
        tool = ChatTool()
        temp_dir = tempfile.mkdtemp()
        try:
-            result = await tool.execute(
-                {
-                    "prompt": "Explain this code",
-                    "files": ["code.py"],  # relative path without ./
-                    "working_directory": temp_dir,
-                }
-            )
+            with pytest.raises(ToolExecutionError) as exc_info:
+                await tool.execute(
+                    {
+                        "prompt": "Explain this code",
+                        "files": ["code.py"],  # relative path without ./
+                        "working_directory": temp_dir,
+                    }
+                )
        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)

-        assert len(result) == 1
-        response = json.loads(result[0].text)
+        response = json.loads(exc_info.value.payload)
        assert response["status"] == "error"
        assert "must be FULL absolute paths" in response["content"]
        assert "code.py" in response["content"]
--- a/tests/test_workflow_metadata.py
+++ b/tests/test_workflow_metadata.py
@@ -13,6 +13,7 @@ import pytest
 from providers.registry import ModelProviderRegistry
 from providers.shared import ProviderType
 from tools.debug import DebugIssueTool
+from tools.shared.exceptions import ToolExecutionError


 class TestWorkflowMetadata:
@@ -167,12 +168,10 @@ class TestWorkflowMetadata:
            # Execute the workflow tool - should fail gracefully
            import asyncio

-            result = asyncio.run(debug_tool.execute(arguments))
+            with pytest.raises(ToolExecutionError) as exc_info:
+                asyncio.run(debug_tool.execute(arguments))

-            # Parse the JSON response
-            assert len(result) == 1
-            response_text = result[0].text
-            response_data = json.loads(response_text)
+            response_data = json.loads(exc_info.value.payload)

            # Verify it's an error response with metadata
            assert "status" in response_data
--- a/tests/test_workflow_prompt_size_validation_simple.py
+++ b/tests/test_workflow_prompt_size_validation_simple.py
@@ -12,6 +12,7 @@ import pytest

 from config import MCP_PROMPT_SIZE_LIMIT
 from tools.debug import DebugIssueTool
+from tools.shared.exceptions import ToolExecutionError


 def build_debug_arguments(**overrides) -> dict[str, object]:
@@ -60,16 +61,10 @@ async def test_workflow_tool_rejects_oversized_step_with_guidance() -> None:
    tool = DebugIssueTool()
    arguments = build_debug_arguments(step=oversized_step)

-    responses = await tool.execute(arguments)
-    assert len(responses) == 1
+    with pytest.raises(ToolExecutionError) as exc_info:
+        await tool.execute(arguments)

-    payload = json.loads(responses[0].text)
-    assert payload["status"] == "debug_failed"
-    assert "error" in payload
-
-    # Extract the serialized ToolOutput from the MCP_SIZE_CHECK marker
-    error_details = payload["error"].split("MCP_SIZE_CHECK:", 1)[1]
-    output_payload = json.loads(error_details)
+    output_payload = json.loads(exc_info.value.payload)

    assert output_payload["status"] == "resend_prompt"
    assert output_payload["metadata"]["prompt_size"] > MCP_PROMPT_SIZE_LIMIT