Re-imagined and re-written debug tool. Instead of prompting Claude to perform an initial analysis (and hoping it did), the tool now works through the debugging process as an 'investigation': it encourages Claude to gather its 'findings' and 'hypotheses' step by step, back-track when needed, record the files it has gone through, and keep track of the files relevant to the issue at hand. This structured investigation is then passed to the other model with far greater insight than the original debug tool could ever provide.
Improved prompts; guard against over-engineering and flag it as an anti-pattern.
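For illustration, a single investigation step looks roughly like the calls exercised in the tests below (argument names are taken from DebugInvestigationRequest; this is a sketch, and running it outside the test harness may require additional provider/configuration setup):

import asyncio

from tools.debug import DebugIssueTool


async def main():
    tool = DebugIssueTool()
    # One investigation step: Claude records what it examined, what it found,
    # and its current hypothesis. With next_step_required=True the tool only
    # tracks state; the expert model is called once the final step arrives
    # (next_step_required=False). backtrack_from_step lets Claude step back
    # and revise an earlier, wrong direction.
    result = await tool.execute(
        {
            "step": "Investigating intermittent session validation failures",
            "step_number": 1,
            "total_steps": 3,
            "next_step_required": True,
            "findings": "Sessions are invalidated randomly under high traffic",
            "files_checked": ["/api/session_manager.py"],
            "relevant_files": ["/api/session_manager.py"],
            "hypothesis": "Session cache is modified while it is being iterated",
            "confidence": "low",
        }
    )
    print(result[0].text)  # JSON response with status "investigation_in_progress"


asyncio.run(main())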
@@ -198,13 +198,20 @@ class TestAutoModelPlannerFix:
|
||||
Verify that other tools still properly require model resolution.
|
||||
|
||||
This ensures our fix doesn't break existing functionality.
|
||||
Note: Debug tool now manages its own model calls like planner.
|
||||
"""
|
||||
from tools.analyze import AnalyzeTool
|
||||
from tools.chat import ChatTool
|
||||
from tools.debug import DebugIssueTool
|
||||
|
||||
# Test various tools still require models
|
||||
tools_requiring_models = [ChatTool(), DebugIssueTool(), AnalyzeTool()]
|
||||
tools_requiring_models = [ChatTool(), AnalyzeTool()]
|
||||
|
||||
for tool in tools_requiring_models:
|
||||
assert tool.requires_model() is True, f"{tool.get_name()} should require model resolution"
|
||||
|
||||
# Test tools that manage their own model calls
|
||||
tools_managing_own_models = [DebugIssueTool()]
|
||||
|
||||
for tool in tools_managing_own_models:
|
||||
assert tool.requires_model() is False, f"{tool.get_name()} should manage its own model calls"
|
||||
|
||||
@@ -70,35 +70,35 @@ class TestDynamicContextRequests:
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch("tools.base.BaseTool.get_model_provider")
|
||||
async def test_normal_response_not_parsed_as_clarification(self, mock_get_provider, debug_tool):
|
||||
"""Test that normal responses are not mistaken for clarification requests"""
|
||||
normal_response = """
|
||||
## Summary
|
||||
The error is caused by a missing import statement.
|
||||
|
||||
## Hypotheses (Ranked by Likelihood)
|
||||
|
||||
### 1. Missing Import (Confidence: High)
|
||||
**Root Cause:** The module 'utils' is not imported
|
||||
"""
|
||||
|
||||
mock_provider = create_mock_provider()
|
||||
mock_provider.get_provider_type.return_value = Mock(value="google")
|
||||
mock_provider.supports_thinking_mode.return_value = False
|
||||
mock_provider.generate_content.return_value = Mock(
|
||||
content=normal_response, usage={}, model_name="gemini-2.5-flash", metadata={}
|
||||
@patch("utils.conversation_memory.create_thread", return_value="debug-test-uuid")
|
||||
@patch("utils.conversation_memory.add_turn")
|
||||
async def test_normal_response_not_parsed_as_clarification(
|
||||
self, mock_add_turn, mock_create_thread, mock_get_provider, debug_tool
|
||||
):
|
||||
"""Test that normal investigation responses work correctly with new debug tool"""
|
||||
# The new debug tool uses self-investigation pattern
|
||||
result = await debug_tool.execute(
|
||||
{
|
||||
"step": "Investigating NameError: name 'utils' is not defined",
|
||||
"step_number": 1,
|
||||
"total_steps": 3,
|
||||
"next_step_required": True,
|
||||
"findings": "The error indicates 'utils' module is not imported or defined",
|
||||
"files_checked": ["/code/main.py"],
|
||||
"relevant_files": ["/code/main.py"],
|
||||
"hypothesis": "Missing import statement for utils module",
|
||||
"confidence": "high",
|
||||
}
|
||||
)
|
||||
mock_get_provider.return_value = mock_provider
|
||||
|
||||
result = await debug_tool.execute({"prompt": "NameError: name 'utils' is not defined"})
|
||||
|
||||
assert len(result) == 1
|
||||
|
||||
# Parse the response
|
||||
# Parse the response - new debug tool returns structured JSON
|
||||
response_data = json.loads(result[0].text)
|
||||
assert response_data["status"] == "success"
|
||||
assert response_data["content_type"] in ["text", "markdown"]
|
||||
assert "Summary" in response_data["content"]
|
||||
assert response_data["status"] == "investigation_in_progress"
|
||||
assert response_data["step_number"] == 1
|
||||
assert response_data["next_step_required"] is True
|
||||
assert response_data["investigation_status"]["current_confidence"] == "high"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch("tools.base.BaseTool.get_model_provider")
|
||||
@@ -125,17 +125,17 @@ class TestDynamicContextRequests:
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch("tools.base.BaseTool.get_model_provider")
|
||||
async def test_clarification_with_suggested_action(self, mock_get_provider, debug_tool):
|
||||
async def test_clarification_with_suggested_action(self, mock_get_provider, analyze_tool):
|
||||
"""Test clarification request with suggested next action"""
|
||||
clarification_json = json.dumps(
|
||||
{
|
||||
"status": "files_required_to_continue",
|
||||
"mandatory_instructions": "I need to see the database configuration to diagnose the connection error",
|
||||
"mandatory_instructions": "I need to see the database configuration to analyze the connection error",
|
||||
"files_needed": ["config/database.yml", "src/db.py"],
|
||||
"suggested_next_action": {
|
||||
"tool": "debug",
|
||||
"tool": "analyze",
|
||||
"args": {
|
||||
"prompt": "Connection timeout to database",
|
||||
"prompt": "Analyze database connection timeout issue",
|
||||
"files": [
|
||||
"/config/database.yml",
|
||||
"/src/db.py",
|
||||
@@ -154,9 +154,9 @@ class TestDynamicContextRequests:
|
||||
)
|
||||
mock_get_provider.return_value = mock_provider
|
||||
|
||||
result = await debug_tool.execute(
|
||||
result = await analyze_tool.execute(
|
||||
{
|
||||
"prompt": "Connection timeout to database",
|
||||
"prompt": "Analyze database connection timeout issue",
|
||||
"files": ["/absolute/logs/error.log"],
|
||||
}
|
||||
)
|
||||
@@ -168,7 +168,7 @@ class TestDynamicContextRequests:
|
||||
|
||||
clarification = json.loads(response_data["content"])
|
||||
assert "suggested_next_action" in clarification
|
||||
assert clarification["suggested_next_action"]["tool"] == "debug"
|
||||
assert clarification["suggested_next_action"]["tool"] == "analyze"
|
||||
|
||||
def test_tool_output_model_serialization(self):
|
||||
"""Test ToolOutput model serialization"""
|
||||
@@ -298,7 +298,7 @@ class TestCollaborationWorkflow:
|
||||
@patch("tools.base.BaseTool.get_model_provider")
|
||||
async def test_multi_step_collaboration(self, mock_get_provider):
|
||||
"""Test a multi-step collaboration workflow"""
|
||||
tool = DebugIssueTool()
|
||||
tool = AnalyzeTool()
|
||||
|
||||
# Step 1: Initial request returns clarification needed
|
||||
clarification_json = json.dumps(
|
||||
@@ -319,8 +319,8 @@ class TestCollaborationWorkflow:
|
||||
|
||||
result1 = await tool.execute(
|
||||
{
|
||||
"prompt": "Database connection timeout",
|
||||
"error_context": "Timeout after 30s",
|
||||
"prompt": "Analyze database connection timeout issue",
|
||||
"files": ["/logs/error.log"],
|
||||
}
|
||||
)
|
||||
|
||||
@@ -345,9 +345,8 @@ class TestCollaborationWorkflow:
|
||||
|
||||
result2 = await tool.execute(
|
||||
{
|
||||
"prompt": "Database connection timeout",
|
||||
"error_context": "Timeout after 30s",
|
||||
"files": ["/absolute/path/config.py"], # Additional context provided
|
||||
"prompt": "Analyze database connection timeout issue with config file",
|
||||
"files": ["/absolute/path/config.py", "/logs/error.log"], # Additional context provided
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@@ -157,10 +157,10 @@ async def test_unknown_tool_defaults_to_prompt():
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tool_parameter_standardization():
|
||||
"""Test that all tools use standardized 'prompt' parameter"""
|
||||
"""Test that most tools use standardized 'prompt' parameter (debug uses investigation pattern)"""
|
||||
from tools.analyze import AnalyzeRequest
|
||||
from tools.codereview import CodeReviewRequest
|
||||
from tools.debug import DebugIssueRequest
|
||||
from tools.debug import DebugInvestigationRequest
|
||||
from tools.precommit import PrecommitRequest
|
||||
from tools.thinkdeep import ThinkDeepRequest
|
||||
|
||||
@@ -168,9 +168,16 @@ async def test_tool_parameter_standardization():
|
||||
analyze = AnalyzeRequest(files=["/test.py"], prompt="What does this do?")
|
||||
assert analyze.prompt == "What does this do?"
|
||||
|
||||
# Test debug tool uses prompt
|
||||
debug = DebugIssueRequest(prompt="Error occurred")
|
||||
assert debug.prompt == "Error occurred"
|
||||
# Debug tool now uses self-investigation pattern with different fields
|
||||
debug = DebugInvestigationRequest(
|
||||
step="Investigating error",
|
||||
step_number=1,
|
||||
total_steps=3,
|
||||
next_step_required=True,
|
||||
findings="Initial error analysis",
|
||||
)
|
||||
assert debug.step == "Investigating error"
|
||||
assert debug.findings == "Initial error analysis"
|
||||
|
||||
# Test codereview tool uses prompt
|
||||
review = CodeReviewRequest(files=["/test.py"], prompt="Review this")
|
||||
|
||||
tests/test_debug.py (new file, 514 lines)
@@ -0,0 +1,514 @@
|
||||
"""
|
||||
Tests for the debug tool.
|
||||
"""
|
||||
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from tools.debug import DebugInvestigationRequest, DebugIssueTool
|
||||
from tools.models import ToolModelCategory
|
||||
|
||||
|
||||
class TestDebugTool:
|
||||
"""Test suite for DebugIssueTool."""
|
||||
|
||||
def test_tool_metadata(self):
|
||||
"""Test basic tool metadata and configuration."""
|
||||
tool = DebugIssueTool()
|
||||
|
||||
assert tool.get_name() == "debug"
|
||||
assert "DEBUG & ROOT CAUSE ANALYSIS" in tool.get_description()
|
||||
assert tool.get_default_temperature() == 0.2 # TEMPERATURE_ANALYTICAL
|
||||
assert tool.get_model_category() == ToolModelCategory.EXTENDED_REASONING
|
||||
assert tool.requires_model() is False # Since it manages its own model calls
|
||||
|
||||
def test_request_validation(self):
|
||||
"""Test Pydantic request model validation."""
|
||||
# Valid investigation step request
|
||||
step_request = DebugInvestigationRequest(
|
||||
step="Investigating null pointer exception in UserService",
|
||||
step_number=1,
|
||||
total_steps=5,
|
||||
next_step_required=True,
|
||||
findings="Found that UserService.getUser() is called with null ID",
|
||||
)
|
||||
assert step_request.step == "Investigating null pointer exception in UserService"
|
||||
assert step_request.step_number == 1
|
||||
assert step_request.next_step_required is True
|
||||
assert step_request.confidence == "low" # default
|
||||
|
||||
# Request with optional fields
|
||||
detailed_request = DebugInvestigationRequest(
|
||||
step="Deep dive into getUser method implementation",
|
||||
step_number=2,
|
||||
total_steps=5,
|
||||
next_step_required=True,
|
||||
findings="Method doesn't validate input parameters",
|
||||
files_checked=["/src/UserService.java", "/src/UserController.java"],
|
||||
relevant_files=["/src/UserService.java"],
|
||||
relevant_methods=["UserService.getUser", "UserController.handleRequest"],
|
||||
hypothesis="Null ID passed from controller without validation",
|
||||
confidence="medium",
|
||||
)
|
||||
assert len(detailed_request.files_checked) == 2
|
||||
assert len(detailed_request.relevant_files) == 1
|
||||
assert detailed_request.confidence == "medium"
|
||||
|
||||
# Missing required fields should fail
|
||||
with pytest.raises(ValueError):
|
||||
DebugInvestigationRequest() # Missing all required fields
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
DebugInvestigationRequest(step="test") # Missing other required fields
|
||||
|
||||
def test_input_schema_generation(self):
|
||||
"""Test JSON schema generation for MCP client."""
|
||||
tool = DebugIssueTool()
|
||||
schema = tool.get_input_schema()
|
||||
|
||||
assert schema["type"] == "object"
|
||||
# Investigation fields
|
||||
assert "step" in schema["properties"]
|
||||
assert "step_number" in schema["properties"]
|
||||
assert "total_steps" in schema["properties"]
|
||||
assert "next_step_required" in schema["properties"]
|
||||
assert "findings" in schema["properties"]
|
||||
assert "files_checked" in schema["properties"]
|
||||
assert "relevant_files" in schema["properties"]
|
||||
assert "relevant_methods" in schema["properties"]
|
||||
assert "hypothesis" in schema["properties"]
|
||||
assert "confidence" in schema["properties"]
|
||||
assert "backtrack_from_step" in schema["properties"]
|
||||
assert "continuation_id" in schema["properties"]
|
||||
assert "images" in schema["properties"] # Now supported for visual debugging
|
||||
|
||||
# Check excluded fields are NOT present
|
||||
assert "model" not in schema["properties"]
|
||||
assert "temperature" not in schema["properties"]
|
||||
assert "thinking_mode" not in schema["properties"]
|
||||
assert "use_websearch" not in schema["properties"]
|
||||
|
||||
# Check required fields
|
||||
assert "step" in schema["required"]
|
||||
assert "step_number" in schema["required"]
|
||||
assert "total_steps" in schema["required"]
|
||||
assert "next_step_required" in schema["required"]
|
||||
assert "findings" in schema["required"]
|
||||
|
||||
def test_model_category_for_debugging(self):
|
||||
"""Test that debug uses extended reasoning category."""
|
||||
tool = DebugIssueTool()
|
||||
category = tool.get_model_category()
|
||||
|
||||
# Debugging needs deep thinking
|
||||
assert category == ToolModelCategory.EXTENDED_REASONING
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_execute_first_investigation_step(self):
|
||||
"""Test execute method for first investigation step."""
|
||||
tool = DebugIssueTool()
|
||||
arguments = {
|
||||
"step": "Investigating intermittent session validation failures in production",
|
||||
"step_number": 1,
|
||||
"total_steps": 5,
|
||||
"next_step_required": True,
|
||||
"findings": "Users report random session invalidation, occurs more during high traffic",
|
||||
"files_checked": ["/api/session_manager.py"],
|
||||
"relevant_files": ["/api/session_manager.py"],
|
||||
}
|
||||
|
||||
# Mock conversation memory functions
|
||||
with patch("utils.conversation_memory.create_thread", return_value="debug-uuid-123"):
|
||||
with patch("utils.conversation_memory.add_turn"):
|
||||
result = await tool.execute(arguments)
|
||||
|
||||
# Should return a list with TextContent
|
||||
assert len(result) == 1
|
||||
assert result[0].type == "text"
|
||||
|
||||
# Parse the JSON response
|
||||
import json
|
||||
|
||||
parsed_response = json.loads(result[0].text)
|
||||
|
||||
assert parsed_response["status"] == "investigation_in_progress"
|
||||
assert parsed_response["step_number"] == 1
|
||||
assert parsed_response["total_steps"] == 5
|
||||
assert parsed_response["next_step_required"] is True
|
||||
assert parsed_response["continuation_id"] == "debug-uuid-123"
|
||||
assert parsed_response["investigation_status"]["files_checked"] == 1
|
||||
assert parsed_response["investigation_status"]["relevant_files"] == 1
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_execute_subsequent_investigation_step(self):
|
||||
"""Test execute method for subsequent investigation step."""
|
||||
tool = DebugIssueTool()
|
||||
|
||||
# Set up initial state
|
||||
tool.initial_issue = "Session validation failures"
|
||||
tool.consolidated_findings["files_checked"].add("/api/session_manager.py")
|
||||
|
||||
arguments = {
|
||||
"step": "Examining session cleanup method for concurrent modification issues",
|
||||
"step_number": 2,
|
||||
"total_steps": 5,
|
||||
"next_step_required": True,
|
||||
"findings": "Found dictionary modification during iteration in cleanup_expired_sessions",
|
||||
"files_checked": ["/api/session_manager.py", "/api/utils.py"],
|
||||
"relevant_files": ["/api/session_manager.py"],
|
||||
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
|
||||
"hypothesis": "Dictionary modified during iteration causing RuntimeError",
|
||||
"confidence": "high",
|
||||
"continuation_id": "debug-uuid-123",
|
||||
}
|
||||
|
||||
# Mock conversation memory functions
|
||||
with patch("utils.conversation_memory.add_turn"):
|
||||
result = await tool.execute(arguments)
|
||||
|
||||
# Should return a list with TextContent
|
||||
assert len(result) == 1
|
||||
assert result[0].type == "text"
|
||||
|
||||
# Parse the JSON response
|
||||
import json
|
||||
|
||||
parsed_response = json.loads(result[0].text)
|
||||
|
||||
assert parsed_response["step_number"] == 2
|
||||
assert parsed_response["next_step_required"] is True
|
||||
assert parsed_response["continuation_id"] == "debug-uuid-123"
|
||||
assert parsed_response["investigation_status"]["files_checked"] == 2 # Cumulative
|
||||
assert parsed_response["investigation_status"]["relevant_methods"] == 1
|
||||
assert parsed_response["investigation_status"]["current_confidence"] == "high"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_execute_final_investigation_step(self):
|
||||
"""Test execute method for final investigation step with expert analysis."""
|
||||
tool = DebugIssueTool()
|
||||
|
||||
# Set up investigation history
|
||||
tool.initial_issue = "Session validation failures"
|
||||
tool.investigation_history = [
|
||||
{
|
||||
"step_number": 1,
|
||||
"step": "Initial investigation of session validation failures",
|
||||
"findings": "Initial investigation",
|
||||
"files_checked": ["/api/utils.py"],
|
||||
},
|
||||
{
|
||||
"step_number": 2,
|
||||
"step": "Deeper analysis of session manager",
|
||||
"findings": "Found dictionary issue",
|
||||
"files_checked": ["/api/session_manager.py"],
|
||||
},
|
||||
]
|
||||
tool.consolidated_findings = {
|
||||
"files_checked": {"/api/session_manager.py", "/api/utils.py"},
|
||||
"relevant_files": {"/api/session_manager.py"},
|
||||
"relevant_methods": {"SessionManager.cleanup_expired_sessions"},
|
||||
"findings": ["Step 1: Initial investigation", "Step 2: Found dictionary issue"],
|
||||
"hypotheses": [{"step": 2, "hypothesis": "Dictionary modified during iteration", "confidence": "high"}],
|
||||
"images": [],
|
||||
}
|
||||
|
||||
arguments = {
|
||||
"step": "Confirmed the root cause and identified fix",
|
||||
"step_number": 3,
|
||||
"total_steps": 3,
|
||||
"next_step_required": False, # Final step
|
||||
"findings": "Root cause confirmed: dictionary modification during iteration in cleanup method",
|
||||
"files_checked": ["/api/session_manager.py"],
|
||||
"relevant_files": ["/api/session_manager.py"],
|
||||
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
|
||||
"hypothesis": "Dictionary modification during iteration causes intermittent RuntimeError",
|
||||
"confidence": "high",
|
||||
"continuation_id": "debug-uuid-123",
|
||||
}
|
||||
|
||||
# Mock the expert analysis call
|
||||
mock_expert_response = {
|
||||
"status": "analysis_complete",
|
||||
"summary": "Dictionary modification during iteration bug identified",
|
||||
"hypotheses": [
|
||||
{
|
||||
"name": "CONCURRENT_MODIFICATION",
|
||||
"confidence": "High",
|
||||
"root_cause": "Modifying dictionary while iterating",
|
||||
"minimal_fix": "Create list of keys to delete first",
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
# Mock conversation memory and file reading
|
||||
with patch("utils.conversation_memory.add_turn"):
|
||||
with patch.object(tool, "_call_expert_analysis", return_value=mock_expert_response):
|
||||
with patch.object(tool, "_prepare_file_content_for_prompt", return_value=("file content", 100)):
|
||||
result = await tool.execute(arguments)
|
||||
|
||||
# Should return a list with TextContent
|
||||
assert len(result) == 1
|
||||
response_text = result[0].text
|
||||
|
||||
# Parse the JSON response
|
||||
import json
|
||||
|
||||
parsed_response = json.loads(response_text)
|
||||
|
||||
# Check final step structure
|
||||
assert parsed_response["status"] == "calling_expert_analysis"
|
||||
assert parsed_response["investigation_complete"] is True
|
||||
assert parsed_response["expert_analysis"]["status"] == "analysis_complete"
|
||||
assert "complete_investigation" in parsed_response
|
||||
assert parsed_response["complete_investigation"]["steps_taken"] == 3 # All steps including current
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_execute_with_backtracking(self):
|
||||
"""Test execute method with backtracking to revise findings."""
|
||||
tool = DebugIssueTool()
|
||||
|
||||
# Set up some investigation history with all required fields
|
||||
tool.investigation_history = [
|
||||
{
|
||||
"step": "Initial investigation",
|
||||
"step_number": 1,
|
||||
"findings": "Initial findings",
|
||||
"files_checked": ["file1.py"],
|
||||
"relevant_files": [],
|
||||
"relevant_methods": [],
|
||||
"hypothesis": None,
|
||||
"confidence": "low",
|
||||
},
|
||||
{
|
||||
"step": "Wrong direction",
|
||||
"step_number": 2,
|
||||
"findings": "Wrong path",
|
||||
"files_checked": ["file2.py"],
|
||||
"relevant_files": [],
|
||||
"relevant_methods": [],
|
||||
"hypothesis": None,
|
||||
"confidence": "low",
|
||||
},
|
||||
]
|
||||
tool.consolidated_findings = {
|
||||
"files_checked": {"file1.py", "file2.py"},
|
||||
"relevant_files": set(),
|
||||
"relevant_methods": set(),
|
||||
"findings": ["Step 1: Initial findings", "Step 2: Wrong path"],
|
||||
"hypotheses": [],
|
||||
"images": [],
|
||||
}
|
||||
|
||||
arguments = {
|
||||
"step": "Backtracking to revise approach",
|
||||
"step_number": 3,
|
||||
"total_steps": 5,
|
||||
"next_step_required": True,
|
||||
"findings": "Taking a different investigation approach",
|
||||
"files_checked": ["file3.py"],
|
||||
"backtrack_from_step": 2, # Backtrack from step 2
|
||||
"continuation_id": "debug-uuid-123",
|
||||
}
|
||||
|
||||
# Mock conversation memory functions
|
||||
with patch("utils.conversation_memory.add_turn"):
|
||||
result = await tool.execute(arguments)
|
||||
|
||||
# Should return a list with TextContent
|
||||
assert len(result) == 1
|
||||
response_text = result[0].text
|
||||
|
||||
# Parse the JSON response
|
||||
import json
|
||||
|
||||
parsed_response = json.loads(response_text)
|
||||
|
||||
assert parsed_response["status"] == "investigation_in_progress"
|
||||
# After backtracking from step 2, history should have step 1 plus the new step
|
||||
assert len(tool.investigation_history) == 2 # Step 1 + new step 3
|
||||
assert tool.investigation_history[0]["step_number"] == 1
|
||||
assert tool.investigation_history[1]["step_number"] == 3 # The new step that triggered backtrack
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_execute_adjusts_total_steps(self):
|
||||
"""Test execute method adjusts total steps when current step exceeds estimate."""
|
||||
tool = DebugIssueTool()
|
||||
arguments = {
|
||||
"step": "Additional investigation needed",
|
||||
"step_number": 8,
|
||||
"total_steps": 5, # Current step exceeds total
|
||||
"next_step_required": True,
|
||||
"findings": "More complexity discovered",
|
||||
"continuation_id": "debug-uuid-123",
|
||||
}
|
||||
|
||||
# Mock conversation memory functions
|
||||
with patch("utils.conversation_memory.add_turn"):
|
||||
result = await tool.execute(arguments)
|
||||
|
||||
# Should return a list with TextContent
|
||||
assert len(result) == 1
|
||||
response_text = result[0].text
|
||||
|
||||
# Parse the JSON response
|
||||
import json
|
||||
|
||||
parsed_response = json.loads(response_text)
|
||||
|
||||
# Total steps should be adjusted to match current step
|
||||
assert parsed_response["total_steps"] == 8
|
||||
assert parsed_response["step_number"] == 8
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_execute_error_handling(self):
|
||||
"""Test execute method error handling."""
|
||||
tool = DebugIssueTool()
|
||||
# Invalid arguments - missing required fields
|
||||
arguments = {
|
||||
"step": "Invalid request"
|
||||
# Missing required fields
|
||||
}
|
||||
|
||||
result = await tool.execute(arguments)
|
||||
|
||||
# Should return error response
|
||||
assert len(result) == 1
|
||||
response_text = result[0].text
|
||||
|
||||
# Parse the JSON response
|
||||
import json
|
||||
|
||||
parsed_response = json.loads(response_text)
|
||||
|
||||
assert parsed_response["status"] == "investigation_failed"
|
||||
assert "error" in parsed_response
|
||||
|
||||
def test_prepare_investigation_summary(self):
|
||||
"""Test investigation summary preparation."""
|
||||
tool = DebugIssueTool()
|
||||
tool.consolidated_findings = {
|
||||
"files_checked": {"file1.py", "file2.py", "file3.py"},
|
||||
"relevant_files": {"file1.py", "file2.py"},
|
||||
"relevant_methods": {"Class1.method1", "Class2.method2"},
|
||||
"findings": [
|
||||
"Step 1: Initial investigation findings",
|
||||
"Step 2: Discovered potential issue",
|
||||
"Step 3: Confirmed root cause",
|
||||
],
|
||||
"hypotheses": [
|
||||
{"step": 1, "hypothesis": "Initial hypothesis", "confidence": "low"},
|
||||
{"step": 2, "hypothesis": "Refined hypothesis", "confidence": "medium"},
|
||||
{"step": 3, "hypothesis": "Final hypothesis", "confidence": "high"},
|
||||
],
|
||||
"images": [],
|
||||
}
|
||||
|
||||
summary = tool._prepare_investigation_summary()
|
||||
|
||||
assert "SYSTEMATIC INVESTIGATION SUMMARY" in summary
|
||||
assert "Files examined: 3" in summary
|
||||
assert "Relevant files identified: 2" in summary
|
||||
assert "Methods/functions involved: 2" in summary
|
||||
assert "INVESTIGATION PROGRESSION" in summary
|
||||
assert "Step 1:" in summary
|
||||
assert "Step 2:" in summary
|
||||
assert "Step 3:" in summary
|
||||
assert "HYPOTHESIS EVOLUTION" in summary
|
||||
assert "low confidence" in summary
|
||||
assert "medium confidence" in summary
|
||||
assert "high confidence" in summary
|
||||
|
||||
def test_extract_error_context(self):
|
||||
"""Test error context extraction from findings."""
|
||||
tool = DebugIssueTool()
|
||||
tool.consolidated_findings = {
|
||||
"findings": [
|
||||
"Step 1: Found no issues initially",
|
||||
"Step 2: Discovered ERROR: Dictionary size changed during iteration",
|
||||
"Step 3: Stack trace shows RuntimeError in cleanup method",
|
||||
"Step 4: Exception occurs intermittently",
|
||||
],
|
||||
}
|
||||
|
||||
error_context = tool._extract_error_context()
|
||||
|
||||
assert error_context is not None
|
||||
assert "ERROR: Dictionary size changed" in error_context
|
||||
assert "Stack trace shows RuntimeError" in error_context
|
||||
assert "Exception occurs intermittently" in error_context
|
||||
assert "Found no issues initially" not in error_context # Should not include non-error findings
|
||||
|
||||
def test_reprocess_consolidated_findings(self):
|
||||
"""Test reprocessing of consolidated findings after backtracking."""
|
||||
tool = DebugIssueTool()
|
||||
tool.investigation_history = [
|
||||
{
|
||||
"step_number": 1,
|
||||
"findings": "Initial findings",
|
||||
"files_checked": ["file1.py"],
|
||||
"relevant_files": ["file1.py"],
|
||||
"relevant_methods": ["method1"],
|
||||
"hypothesis": "Initial hypothesis",
|
||||
"confidence": "low",
|
||||
},
|
||||
{
|
||||
"step_number": 2,
|
||||
"findings": "Second findings",
|
||||
"files_checked": ["file2.py"],
|
||||
"relevant_files": [],
|
||||
"relevant_methods": ["method2"],
|
||||
},
|
||||
]
|
||||
|
||||
tool._reprocess_consolidated_findings()
|
||||
|
||||
assert tool.consolidated_findings["files_checked"] == {"file1.py", "file2.py"}
|
||||
assert tool.consolidated_findings["relevant_files"] == {"file1.py"}
|
||||
assert tool.consolidated_findings["relevant_methods"] == {"method1", "method2"}
|
||||
assert len(tool.consolidated_findings["findings"]) == 2
|
||||
assert len(tool.consolidated_findings["hypotheses"]) == 1
|
||||
assert tool.consolidated_findings["hypotheses"][0]["hypothesis"] == "Initial hypothesis"
|
||||
|
||||
|
||||
# Integration test
|
||||
class TestDebugToolIntegration:
|
||||
"""Integration tests for debug tool."""
|
||||
|
||||
def setup_method(self):
|
||||
"""Set up model context for integration tests."""
|
||||
from utils.model_context import ModelContext
|
||||
|
||||
self.tool = DebugIssueTool()
|
||||
self.tool._model_context = ModelContext("flash") # Test model
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_complete_investigation_flow(self):
|
||||
"""Test complete investigation flow from start to expert analysis."""
|
||||
# Step 1: Initial investigation
|
||||
arguments = {
|
||||
"step": "Investigating memory leak in data processing pipeline",
|
||||
"step_number": 1,
|
||||
"total_steps": 3,
|
||||
"next_step_required": True,
|
||||
"findings": "High memory usage observed during batch processing",
|
||||
"files_checked": ["/processor/main.py"],
|
||||
}
|
||||
|
||||
# Mock conversation memory and expert analysis
|
||||
with patch("utils.conversation_memory.create_thread", return_value="debug-flow-uuid"):
|
||||
with patch("utils.conversation_memory.add_turn"):
|
||||
result = await self.tool.execute(arguments)
|
||||
|
||||
# Verify response structure
|
||||
assert len(result) == 1
|
||||
response_text = result[0].text
|
||||
|
||||
# Parse the JSON response
|
||||
import json
|
||||
|
||||
parsed_response = json.loads(response_text)
|
||||
|
||||
assert parsed_response["status"] == "investigation_in_progress"
|
||||
assert parsed_response["step_number"] == 1
|
||||
assert parsed_response["continuation_id"] == "debug-flow-uuid"
|
||||
tests/test_debug_comprehensive_workflow.py (new file, 363 lines)
@@ -0,0 +1,363 @@
|
||||
"""
|
||||
Comprehensive test demonstrating debug tool's self-investigation pattern
|
||||
and continuation ID functionality working together end-to-end.
|
||||
"""
|
||||
|
||||
import json
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from tools.debug import DebugIssueTool
|
||||
from utils.conversation_memory import (
|
||||
ConversationTurn,
|
||||
ThreadContext,
|
||||
build_conversation_history,
|
||||
get_conversation_file_list,
|
||||
)
|
||||
|
||||
|
||||
class TestDebugComprehensiveWorkflow:
|
||||
"""Test the complete debug workflow from investigation to expert analysis to continuation."""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_full_debug_workflow_with_continuation(self):
|
||||
"""Test complete debug workflow: investigation → expert analysis → continuation to another tool."""
|
||||
tool = DebugIssueTool()
|
||||
|
||||
# Step 1: Initial investigation
|
||||
with patch("utils.conversation_memory.create_thread", return_value="debug-workflow-uuid"):
|
||||
with patch("utils.conversation_memory.add_turn") as mock_add_turn:
|
||||
result1 = await tool.execute(
|
||||
{
|
||||
"step": "Investigating memory leak in user session handler",
|
||||
"step_number": 1,
|
||||
"total_steps": 3,
|
||||
"next_step_required": True,
|
||||
"findings": "High memory usage detected in session handler",
|
||||
"files_checked": ["/api/sessions.py"],
|
||||
"images": ["/screenshots/memory_profile.png"],
|
||||
}
|
||||
)
|
||||
|
||||
# Verify step 1 response
|
||||
assert len(result1) == 1
|
||||
response1 = json.loads(result1[0].text)
|
||||
assert response1["status"] == "investigation_in_progress"
|
||||
assert response1["step_number"] == 1
|
||||
assert response1["continuation_id"] == "debug-workflow-uuid"
|
||||
|
||||
# Verify conversation turn was added
|
||||
assert mock_add_turn.called
|
||||
call_args = mock_add_turn.call_args
|
||||
if call_args:
|
||||
# Check if args were passed positionally or as keywords
|
||||
args = call_args.args if hasattr(call_args, "args") else call_args[0]
|
||||
if args and len(args) >= 3:
|
||||
assert args[0] == "debug-workflow-uuid"
|
||||
assert args[1] == "assistant"
|
||||
assert json.loads(args[2])["status"] == "investigation_in_progress"
|
||||
|
||||
# Step 2: Continue investigation with findings
|
||||
with patch("utils.conversation_memory.add_turn") as mock_add_turn:
|
||||
result2 = await tool.execute(
|
||||
{
|
||||
"step": "Found circular references in session cache preventing garbage collection",
|
||||
"step_number": 2,
|
||||
"total_steps": 3,
|
||||
"next_step_required": True,
|
||||
"findings": "Session objects hold references to themselves through event handlers",
|
||||
"files_checked": ["/api/sessions.py", "/api/cache.py"],
|
||||
"relevant_files": ["/api/sessions.py"],
|
||||
"relevant_methods": ["SessionHandler.__init__", "SessionHandler.add_event_listener"],
|
||||
"hypothesis": "Circular references preventing garbage collection",
|
||||
"confidence": "high",
|
||||
"continuation_id": "debug-workflow-uuid",
|
||||
}
|
||||
)
|
||||
|
||||
# Verify step 2 response
|
||||
response2 = json.loads(result2[0].text)
|
||||
assert response2["status"] == "investigation_in_progress"
|
||||
assert response2["step_number"] == 2
|
||||
assert response2["investigation_status"]["files_checked"] == 2
|
||||
assert response2["investigation_status"]["relevant_methods"] == 2
|
||||
assert response2["investigation_status"]["current_confidence"] == "high"
|
||||
|
||||
# Step 3: Final investigation with expert analysis
|
||||
# Mock the expert analysis response
|
||||
mock_expert_response = {
|
||||
"status": "analysis_complete",
|
||||
"summary": "Memory leak caused by circular references in session event handlers",
|
||||
"hypotheses": [
|
||||
{
|
||||
"name": "CIRCULAR_REFERENCE_LEAK",
|
||||
"confidence": "High (95%)",
|
||||
"evidence": ["Event handlers hold strong references", "No weak references used"],
|
||||
"root_cause": "SessionHandler stores callbacks that reference the handler itself",
|
||||
"potential_fixes": [
|
||||
{
|
||||
"description": "Use weakref for event handler callbacks",
|
||||
"files_to_modify": ["/api/sessions.py"],
|
||||
"complexity": "Low",
|
||||
}
|
||||
],
|
||||
"minimal_fix": "Replace self references in callbacks with weakref.ref(self)",
|
||||
}
|
||||
],
|
||||
"investigation_summary": {
|
||||
"pattern": "Classic circular reference memory leak",
|
||||
"severity": "High - causes unbounded memory growth",
|
||||
"recommended_action": "Implement weakref solution immediately",
|
||||
},
|
||||
}
|
||||
|
||||
with patch("utils.conversation_memory.add_turn") as mock_add_turn:
|
||||
with patch.object(tool, "_call_expert_analysis", return_value=mock_expert_response):
|
||||
result3 = await tool.execute(
|
||||
{
|
||||
"step": "Investigation complete - confirmed circular reference memory leak pattern",
|
||||
"step_number": 3,
|
||||
"total_steps": 3,
|
||||
"next_step_required": False, # Triggers expert analysis
|
||||
"findings": "Circular references between SessionHandler and event callbacks prevent GC",
|
||||
"files_checked": ["/api/sessions.py", "/api/cache.py"],
|
||||
"relevant_files": ["/api/sessions.py"],
|
||||
"relevant_methods": ["SessionHandler.__init__", "SessionHandler.add_event_listener"],
|
||||
"hypothesis": "Circular references in event handler callbacks causing memory leak",
|
||||
"confidence": "high",
|
||||
"continuation_id": "debug-workflow-uuid",
|
||||
"model": "flash",
|
||||
}
|
||||
)
|
||||
|
||||
# Verify final response with expert analysis
|
||||
response3 = json.loads(result3[0].text)
|
||||
assert response3["status"] == "calling_expert_analysis"
|
||||
assert response3["investigation_complete"] is True
|
||||
assert "expert_analysis" in response3
|
||||
|
||||
expert = response3["expert_analysis"]
|
||||
assert expert["status"] == "analysis_complete"
|
||||
assert "CIRCULAR_REFERENCE_LEAK" in expert["hypotheses"][0]["name"]
|
||||
assert "weakref" in expert["hypotheses"][0]["minimal_fix"]
|
||||
|
||||
# Verify complete investigation summary
|
||||
assert "complete_investigation" in response3
|
||||
complete = response3["complete_investigation"]
|
||||
assert complete["steps_taken"] == 3
|
||||
assert "/api/sessions.py" in complete["files_examined"]
|
||||
assert "SessionHandler.add_event_listener" in complete["relevant_methods"]
|
||||
|
||||
# Step 4: Test continuation to another tool (e.g., analyze)
|
||||
# Create a mock thread context representing the debug conversation
|
||||
debug_context = ThreadContext(
|
||||
thread_id="debug-workflow-uuid",
|
||||
created_at="2025-01-01T00:00:00Z",
|
||||
last_updated_at="2025-01-01T00:10:00Z",
|
||||
tool_name="debug",
|
||||
turns=[
|
||||
ConversationTurn(
|
||||
role="user",
|
||||
content="Step 1: Investigating memory leak",
|
||||
timestamp="2025-01-01T00:01:00Z",
|
||||
tool_name="debug",
|
||||
files=["/api/sessions.py"],
|
||||
images=["/screenshots/memory_profile.png"],
|
||||
),
|
||||
ConversationTurn(
|
||||
role="assistant",
|
||||
content=json.dumps(response1),
|
||||
timestamp="2025-01-01T00:02:00Z",
|
||||
tool_name="debug",
|
||||
),
|
||||
ConversationTurn(
|
||||
role="user",
|
||||
content="Step 2: Found circular references",
|
||||
timestamp="2025-01-01T00:03:00Z",
|
||||
tool_name="debug",
|
||||
),
|
||||
ConversationTurn(
|
||||
role="assistant",
|
||||
content=json.dumps(response2),
|
||||
timestamp="2025-01-01T00:04:00Z",
|
||||
tool_name="debug",
|
||||
),
|
||||
ConversationTurn(
|
||||
role="user",
|
||||
content="Step 3: Investigation complete",
|
||||
timestamp="2025-01-01T00:05:00Z",
|
||||
tool_name="debug",
|
||||
),
|
||||
ConversationTurn(
|
||||
role="assistant",
|
||||
content=json.dumps(response3),
|
||||
timestamp="2025-01-01T00:06:00Z",
|
||||
tool_name="debug",
|
||||
),
|
||||
],
|
||||
initial_context={},
|
||||
)
|
||||
|
||||
# Test that another tool can use the continuation
|
||||
with patch("utils.conversation_memory.get_thread", return_value=debug_context):
|
||||
# Mock file reading
|
||||
def mock_read_file(file_path):
|
||||
if file_path == "/api/sessions.py":
|
||||
return "# SessionHandler with circular refs\nclass SessionHandler:\n pass", 20
|
||||
elif file_path == "/screenshots/memory_profile.png":
|
||||
# Images return empty string for content but 0 tokens
|
||||
return "", 0
|
||||
elif file_path == "/api/cache.py":
|
||||
return "# Cache module", 5
|
||||
return "", 0
|
||||
|
||||
# Build conversation history for another tool
|
||||
from utils.model_context import ModelContext
|
||||
|
||||
model_context = ModelContext("flash")
|
||||
history, tokens = build_conversation_history(debug_context, model_context, read_files_func=mock_read_file)
|
||||
|
||||
# Verify history contains all debug information
|
||||
assert "=== CONVERSATION HISTORY (CONTINUATION) ===" in history
|
||||
assert "Thread: debug-workflow-uuid" in history
|
||||
assert "Tool: debug" in history
|
||||
|
||||
# Check investigation progression
|
||||
assert "Step 1: Investigating memory leak" in history
|
||||
assert "Step 2: Found circular references" in history
|
||||
assert "Step 3: Investigation complete" in history
|
||||
|
||||
# Check expert analysis is included
|
||||
assert "CIRCULAR_REFERENCE_LEAK" in history
|
||||
assert "weakref" in history
|
||||
assert "memory leak" in history
|
||||
|
||||
# Check files are referenced in conversation history
|
||||
assert "/api/sessions.py" in history
|
||||
|
||||
# File content would be in referenced files section if the files were readable
|
||||
# In our test they're not real files so they won't be embedded
|
||||
# But the expert analysis content should be there
|
||||
assert "Memory leak caused by circular references" in history
|
||||
|
||||
# Verify file list includes all files from investigation
|
||||
file_list = get_conversation_file_list(debug_context)
|
||||
assert "/api/sessions.py" in file_list
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_debug_investigation_state_machine(self):
|
||||
"""Test the debug tool's investigation state machine behavior."""
|
||||
tool = DebugIssueTool()
|
||||
|
||||
# Test state transitions
|
||||
states = []
|
||||
|
||||
# Initial state
|
||||
with patch("utils.conversation_memory.create_thread", return_value="state-test-uuid"):
|
||||
with patch("utils.conversation_memory.add_turn"):
|
||||
result = await tool.execute(
|
||||
{
|
||||
"step": "Starting investigation",
|
||||
"step_number": 1,
|
||||
"total_steps": 2,
|
||||
"next_step_required": True,
|
||||
"findings": "Initial findings",
|
||||
}
|
||||
)
|
||||
states.append(json.loads(result[0].text))
|
||||
|
||||
# Verify initial state
|
||||
assert states[0]["status"] == "investigation_in_progress"
|
||||
assert states[0]["step_number"] == 1
|
||||
assert states[0]["next_step_required"] is True
|
||||
|
||||
# Final state (triggers expert analysis)
|
||||
mock_expert_response = {"status": "analysis_complete", "summary": "Test complete"}
|
||||
|
||||
with patch("utils.conversation_memory.add_turn"):
|
||||
with patch.object(tool, "_call_expert_analysis", return_value=mock_expert_response):
|
||||
result = await tool.execute(
|
||||
{
|
||||
"step": "Final findings",
|
||||
"step_number": 2,
|
||||
"total_steps": 2,
|
||||
"next_step_required": False,
|
||||
"findings": "Complete findings",
|
||||
"continuation_id": "state-test-uuid",
|
||||
"model": "flash",
|
||||
}
|
||||
)
|
||||
states.append(json.loads(result[0].text))
|
||||
|
||||
# Verify final state
|
||||
assert states[1]["status"] == "calling_expert_analysis"
|
||||
assert states[1]["investigation_complete"] is True
|
||||
assert "expert_analysis" in states[1]
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_debug_backtracking_preserves_continuation(self):
|
||||
"""Test that backtracking preserves continuation ID and investigation state."""
|
||||
tool = DebugIssueTool()
|
||||
|
||||
# Start investigation
|
||||
with patch("utils.conversation_memory.create_thread", return_value="backtrack-test-uuid"):
|
||||
with patch("utils.conversation_memory.add_turn"):
|
||||
result1 = await tool.execute(
|
||||
{
|
||||
"step": "Initial hypothesis",
|
||||
"step_number": 1,
|
||||
"total_steps": 3,
|
||||
"next_step_required": True,
|
||||
"findings": "Initial findings",
|
||||
}
|
||||
)
|
||||
|
||||
response1 = json.loads(result1[0].text)
|
||||
continuation_id = response1["continuation_id"]
|
||||
|
||||
# Step 2 - wrong direction
|
||||
with patch("utils.conversation_memory.add_turn"):
|
||||
await tool.execute(
|
||||
{
|
||||
"step": "Wrong hypothesis",
|
||||
"step_number": 2,
|
||||
"total_steps": 3,
|
||||
"next_step_required": True,
|
||||
"findings": "Dead end",
|
||||
"hypothesis": "Wrong initial hypothesis",
|
||||
"confidence": "low",
|
||||
"continuation_id": continuation_id,
|
||||
}
|
||||
)
|
||||
|
||||
# Backtrack from step 2
|
||||
with patch("utils.conversation_memory.add_turn"):
|
||||
result3 = await tool.execute(
|
||||
{
|
||||
"step": "Backtracking - new hypothesis",
|
||||
"step_number": 3,
|
||||
"total_steps": 4, # Adjusted total
|
||||
"next_step_required": True,
|
||||
"findings": "New direction",
|
||||
"hypothesis": "New hypothesis after backtracking",
|
||||
"confidence": "medium",
|
||||
"backtrack_from_step": 2,
|
||||
"continuation_id": continuation_id,
|
||||
}
|
||||
)
|
||||
|
||||
response3 = json.loads(result3[0].text)
|
||||
|
||||
# Verify continuation preserved through backtracking
|
||||
assert response3["continuation_id"] == continuation_id
|
||||
assert response3["step_number"] == 3
|
||||
assert response3["total_steps"] == 4
|
||||
|
||||
# Verify investigation status after backtracking
|
||||
# When we backtrack, investigation continues
|
||||
assert response3["investigation_status"]["files_checked"] == 0 # Reset after backtrack
|
||||
assert response3["investigation_status"]["current_confidence"] == "medium"
|
||||
|
||||
# The key thing is the continuation ID is preserved
|
||||
# and we've adjusted our approach (total_steps increased)
|
||||
tests/test_debug_continuation.py (new file, 336 lines)
@@ -0,0 +1,336 @@
|
||||
"""
|
||||
Test debug tool continuation ID functionality and conversation history formatting.
|
||||
"""
|
||||
|
||||
import json
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from tools.debug import DebugIssueTool
|
||||
from utils.conversation_memory import (
|
||||
ConversationTurn,
|
||||
ThreadContext,
|
||||
build_conversation_history,
|
||||
get_conversation_file_list,
|
||||
)
|
||||
|
||||
|
||||
class TestDebugContinuation:
|
||||
"""Test debug tool continuation ID and conversation history integration."""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_debug_creates_continuation_id(self):
|
||||
"""Test that debug tool creates continuation ID on first step."""
|
||||
tool = DebugIssueTool()
|
||||
|
||||
with patch("utils.conversation_memory.create_thread", return_value="debug-test-uuid-123"):
|
||||
with patch("utils.conversation_memory.add_turn"):
|
||||
result = await tool.execute(
|
||||
{
|
||||
"step": "Investigating null pointer exception",
|
||||
"step_number": 1,
|
||||
"total_steps": 3,
|
||||
"next_step_required": True,
|
||||
"findings": "Initial investigation shows null reference in UserService",
|
||||
"files_checked": ["/api/UserService.java"],
|
||||
}
|
||||
)
|
||||
|
||||
assert len(result) == 1
|
||||
response = json.loads(result[0].text)
|
||||
assert response["status"] == "investigation_in_progress"
|
||||
assert response["continuation_id"] == "debug-test-uuid-123"
|
||||
|
||||
def test_debug_conversation_formatting(self):
|
||||
"""Test that debug tool's structured output is properly formatted in conversation history."""
|
||||
# Create a mock conversation with debug tool output
|
||||
debug_output = {
|
||||
"status": "investigation_in_progress",
|
||||
"step_number": 2,
|
||||
"total_steps": 3,
|
||||
"next_step_required": True,
|
||||
"investigation_status": {
|
||||
"files_checked": 3,
|
||||
"relevant_files": 2,
|
||||
"relevant_methods": 1,
|
||||
"hypotheses_formed": 1,
|
||||
"images_collected": 0,
|
||||
"current_confidence": "medium",
|
||||
},
|
||||
"output": {"instructions": "Continue systematic investigation.", "format": "systematic_investigation"},
|
||||
"continuation_id": "debug-test-uuid-123",
|
||||
"next_steps": "Continue investigation with step 3.",
|
||||
}
|
||||
|
||||
context = ThreadContext(
|
||||
thread_id="debug-test-uuid-123",
|
||||
created_at="2025-01-01T00:00:00Z",
|
||||
last_updated_at="2025-01-01T00:05:00Z",
|
||||
tool_name="debug",
|
||||
turns=[
|
||||
ConversationTurn(
|
||||
role="user",
|
||||
content="Step 1: Investigating null pointer exception",
|
||||
timestamp="2025-01-01T00:01:00Z",
|
||||
tool_name="debug",
|
||||
files=["/api/UserService.java"],
|
||||
),
|
||||
ConversationTurn(
|
||||
role="assistant",
|
||||
content=json.dumps(debug_output, indent=2),
|
||||
timestamp="2025-01-01T00:02:00Z",
|
||||
tool_name="debug",
|
||||
files=["/api/UserService.java", "/api/UserController.java"],
|
||||
),
|
||||
],
|
||||
initial_context={
|
||||
"step": "Investigating null pointer exception",
|
||||
"step_number": 1,
|
||||
"total_steps": 3,
|
||||
"next_step_required": True,
|
||||
"findings": "Initial investigation",
|
||||
},
|
||||
)
|
||||
|
||||
# Mock file reading to avoid actual file I/O
|
||||
def mock_read_file(file_path):
|
||||
if file_path == "/api/UserService.java":
|
||||
return "// UserService.java\npublic class UserService {\n // code...\n}", 10
|
||||
elif file_path == "/api/UserController.java":
|
||||
return "// UserController.java\npublic class UserController {\n // code...\n}", 10
|
||||
return "", 0
|
||||
|
||||
# Build conversation history
|
||||
from utils.model_context import ModelContext
|
||||
|
||||
model_context = ModelContext("flash")
|
||||
history, tokens = build_conversation_history(context, model_context, read_files_func=mock_read_file)
|
||||
|
||||
# Verify the history contains debug-specific content
|
||||
assert "=== CONVERSATION HISTORY (CONTINUATION) ===" in history
|
||||
assert "Thread: debug-test-uuid-123" in history
|
||||
assert "Tool: debug" in history
|
||||
|
||||
# Check that files are included
|
||||
assert "UserService.java" in history
|
||||
assert "UserController.java" in history
|
||||
|
||||
# Check that debug output is included
|
||||
assert "investigation_in_progress" in history
|
||||
assert '"step_number": 2' in history
|
||||
assert '"files_checked": 3' in history
|
||||
assert '"current_confidence": "medium"' in history
|
||||
|
||||
def test_debug_continuation_preserves_investigation_state(self):
|
||||
"""Test that continuation preserves investigation state across tools."""
|
||||
# Create a debug investigation context
|
||||
context = ThreadContext(
|
||||
thread_id="debug-test-uuid-123",
|
||||
created_at="2025-01-01T00:00:00Z",
|
||||
last_updated_at="2025-01-01T00:10:00Z",
|
||||
tool_name="debug",
|
||||
turns=[
|
||||
ConversationTurn(
|
||||
role="user",
|
||||
content="Step 1: Initial investigation",
|
||||
timestamp="2025-01-01T00:01:00Z",
|
||||
tool_name="debug",
|
||||
files=["/api/SessionManager.java"],
|
||||
),
|
||||
ConversationTurn(
|
||||
role="assistant",
|
||||
content=json.dumps(
|
||||
{
|
||||
"status": "investigation_in_progress",
|
||||
"step_number": 1,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"investigation_status": {"files_checked": 1, "relevant_files": 1},
|
||||
"continuation_id": "debug-test-uuid-123",
|
||||
}
|
||||
),
|
||||
timestamp="2025-01-01T00:02:00Z",
|
||||
tool_name="debug",
|
||||
),
|
||||
ConversationTurn(
|
||||
role="user",
|
||||
content="Step 2: Found dictionary modification issue",
|
||||
timestamp="2025-01-01T00:03:00Z",
|
||||
tool_name="debug",
|
||||
files=["/api/SessionManager.java", "/api/utils.py"],
|
||||
),
|
||||
ConversationTurn(
|
||||
role="assistant",
|
||||
content=json.dumps(
|
||||
{
|
||||
"status": "investigation_in_progress",
|
||||
"step_number": 2,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"investigation_status": {
|
||||
"files_checked": 2,
|
||||
"relevant_files": 1,
|
||||
"relevant_methods": 1,
|
||||
"hypotheses_formed": 1,
|
||||
"current_confidence": "high",
|
||||
},
|
||||
"continuation_id": "debug-test-uuid-123",
|
||||
}
|
||||
),
|
||||
timestamp="2025-01-01T00:04:00Z",
|
||||
tool_name="debug",
|
||||
),
|
||||
],
|
||||
initial_context={},
|
||||
)
|
||||
|
||||
# Get file list to verify prioritization
|
||||
file_list = get_conversation_file_list(context)
|
||||
assert file_list == ["/api/SessionManager.java", "/api/utils.py"]
|
||||
|
||||
# Mock file reading
|
||||
def mock_read_file(file_path):
|
||||
return f"// {file_path}\n// Mock content", 5
|
||||
|
||||
# Build history
|
||||
from utils.model_context import ModelContext
|
||||
|
||||
model_context = ModelContext("flash")
|
||||
history, tokens = build_conversation_history(context, model_context, read_files_func=mock_read_file)
|
||||
|
||||
# Verify investigation progression is preserved
|
||||
assert "Step 1: Initial investigation" in history
|
||||
assert "Step 2: Found dictionary modification issue" in history
|
||||
assert '"step_number": 1' in history
|
||||
assert '"step_number": 2' in history
|
||||
assert '"current_confidence": "high"' in history
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_debug_to_analyze_continuation(self):
|
||||
"""Test continuation from debug tool to analyze tool."""
|
||||
# Simulate debug tool creating initial investigation
|
||||
debug_context = ThreadContext(
|
||||
thread_id="debug-analyze-uuid-123",
|
||||
created_at="2025-01-01T00:00:00Z",
|
||||
last_updated_at="2025-01-01T00:10:00Z",
|
||||
tool_name="debug",
|
||||
turns=[
|
||||
ConversationTurn(
|
||||
role="user",
|
||||
content="Final investigation step",
|
||||
timestamp="2025-01-01T00:01:00Z",
|
||||
tool_name="debug",
|
||||
files=["/api/SessionManager.java"],
|
||||
),
|
||||
ConversationTurn(
|
||||
role="assistant",
|
||||
content=json.dumps(
|
||||
{
|
||||
"status": "calling_expert_analysis",
|
||||
"investigation_complete": True,
|
||||
"expert_analysis": {
|
||||
"status": "analysis_complete",
|
||||
"summary": "Dictionary modification during iteration bug",
|
||||
"hypotheses": [
|
||||
{
|
||||
"name": "CONCURRENT_MODIFICATION",
|
||||
"confidence": "High",
|
||||
"root_cause": "Modifying dict while iterating",
|
||||
"minimal_fix": "Create list of keys first",
|
||||
}
|
||||
],
|
||||
},
|
||||
"complete_investigation": {
|
||||
"initial_issue": "Session validation failures",
|
||||
"steps_taken": 3,
|
||||
"files_examined": ["/api/SessionManager.java"],
|
||||
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
|
||||
},
|
||||
}
|
||||
),
|
||||
timestamp="2025-01-01T00:02:00Z",
|
||||
tool_name="debug",
|
||||
),
|
||||
],
|
||||
initial_context={},
|
||||
)
|
||||
|
||||
# Mock getting the thread
|
||||
with patch("utils.conversation_memory.get_thread", return_value=debug_context):
|
||||
# Mock file reading
|
||||
def mock_read_file(file_path):
|
||||
return "// SessionManager.java\n// cleanup_expired_sessions method", 10
|
||||
|
||||
# Build history for analyze tool
|
||||
from utils.model_context import ModelContext
|
||||
|
||||
model_context = ModelContext("flash")
|
||||
history, tokens = build_conversation_history(debug_context, model_context, read_files_func=mock_read_file)
|
||||
|
||||
# Verify analyze tool can see debug investigation
|
||||
assert "calling_expert_analysis" in history
|
||||
assert "CONCURRENT_MODIFICATION" in history
|
||||
assert "Dictionary modification during iteration bug" in history
|
||||
assert "SessionManager.cleanup_expired_sessions" in history
|
||||
|
||||
# Verify the continuation context is clear
|
||||
assert "Thread: debug-analyze-uuid-123" in history
|
||||
assert "Tool: debug" in history # Shows original tool
|
||||
|
||||
def test_debug_planner_style_formatting(self):
|
||||
"""Test that debug tool uses similar formatting to planner for structured responses."""
|
||||
# Create debug investigation with multiple steps
|
||||
context = ThreadContext(
|
||||
thread_id="debug-format-uuid-123",
|
||||
created_at="2025-01-01T00:00:00Z",
|
||||
last_updated_at="2025-01-01T00:15:00Z",
|
||||
tool_name="debug",
|
||||
turns=[
|
||||
ConversationTurn(
|
||||
role="user",
|
||||
content="Step 1: Initial error analysis",
|
||||
timestamp="2025-01-01T00:01:00Z",
|
||||
tool_name="debug",
|
||||
),
|
||||
ConversationTurn(
|
||||
role="assistant",
|
||||
content=json.dumps(
|
||||
{
|
||||
"status": "investigation_in_progress",
|
||||
"step_number": 1,
|
||||
"total_steps": 3,
|
||||
"next_step_required": True,
|
||||
"output": {
|
||||
"instructions": "Continue systematic investigation.",
|
||||
"format": "systematic_investigation",
|
||||
},
|
||||
"continuation_id": "debug-format-uuid-123",
|
||||
},
|
||||
indent=2,
|
||||
),
|
||||
timestamp="2025-01-01T00:02:00Z",
|
||||
tool_name="debug",
|
||||
),
|
||||
],
|
||||
initial_context={},
|
||||
)
|
||||
|
||||
# Build history
|
||||
from utils.model_context import ModelContext
|
||||
|
||||
model_context = ModelContext("flash")
|
||||
history, _ = build_conversation_history(context, model_context, read_files_func=lambda x: ("", 0))
|
||||
|
||||
# Verify structured format is preserved
|
||||
assert '"status": "investigation_in_progress"' in history
|
||||
assert '"format": "systematic_investigation"' in history
|
||||
assert "--- Turn 1 (Claude using debug) ---" in history
|
||||
assert "--- Turn 2 (Gemini using debug" in history
|
||||
|
||||
# The JSON structure should be preserved for tools to parse
|
||||
# This allows other tools to understand the investigation state
|
||||
turn_2_start = history.find("--- Turn 2 (Gemini using debug")
|
||||
turn_2_content = history[turn_2_start:]
|
||||
assert "{\n" in turn_2_content # JSON formatting preserved
|
||||
assert '"continuation_id"' in turn_2_content
|
@@ -19,7 +19,8 @@ from config import MCP_PROMPT_SIZE_LIMIT
from tools.analyze import AnalyzeTool
from tools.chat import ChatTool
from tools.codereview import CodeReviewTool
from tools.debug import DebugIssueTool

# from tools.debug import DebugIssueTool  # Commented out - debug tool refactored
from tools.precommit import Precommit
from tools.thinkdeep import ThinkDeepTool

@@ -250,25 +251,30 @@ class TestLargePromptHandling:
        # The core fix ensures large prompts are detected at the right time
        assert output["status"] in ["success", "files_required_to_continue", "resend_prompt"]

    @pytest.mark.asyncio
    async def test_debug_large_error_description(self, large_prompt):
        """Test that debug tool detects large error_description."""
        tool = DebugIssueTool()
        result = await tool.execute({"prompt": large_prompt})
    # NOTE: Debug tool tests have been commented out because the debug tool has been
    # refactored to use a self-investigation pattern instead of accepting a prompt field.
    # The new debug tool requires fields like: step, step_number, total_steps, next_step_required, findings
    # and doesn't have the "resend_prompt" functionality for large prompts.

        assert len(result) == 1
        output = json.loads(result[0].text)
        assert output["status"] == "resend_prompt"
    # @pytest.mark.asyncio
    # async def test_debug_large_error_description(self, large_prompt):
    #     """Test that debug tool detects large error_description."""
    #     tool = DebugIssueTool()
    #     result = await tool.execute({"prompt": large_prompt})
    #
    #     assert len(result) == 1
    #     output = json.loads(result[0].text)
    #     assert output["status"] == "resend_prompt"
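For comparison, a hypothetical replacement for the commented-out test, written against the investigation fields listed in the NOTE above, might look like the sketch below. The field names come from that NOTE; the exact status reported for an oversized payload is an assumption, not documented behaviour, and this test is not part of the commit.

    # Hypothetical sketch only, not part of this diff: exercises the new investigation schema.
    @pytest.mark.asyncio
    async def test_debug_large_findings_sketch(self, large_prompt):
        tool = DebugIssueTool()
        result = await tool.execute(
            {
                "step": "Investigate the reported failure",
                "step_number": 1,
                "total_steps": 1,
                "next_step_required": False,
                "findings": large_prompt,  # oversized payload under test
            }
        )

        assert len(result) == 1
        output = json.loads(result[0].text)
        assert "status" in output  # the exact value depends on the new tool's contract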

    @pytest.mark.asyncio
    async def test_debug_large_error_context(self, large_prompt, normal_prompt):
        """Test that debug tool detects large error_context."""
        tool = DebugIssueTool()
        result = await tool.execute({"prompt": normal_prompt, "error_context": large_prompt})

        assert len(result) == 1
        output = json.loads(result[0].text)
        assert output["status"] == "resend_prompt"
    # @pytest.mark.asyncio
    # async def test_debug_large_error_context(self, large_prompt, normal_prompt):
    #     """Test that debug tool detects large error_context."""
    #     tool = DebugIssueTool()
    #     result = await tool.execute({"prompt": normal_prompt, "error_context": large_prompt})
    #
    #     assert len(result) == 1
    #     output = json.loads(result[0].text)
    #     assert output["status"] == "resend_prompt"

    @pytest.mark.asyncio
    async def test_analyze_large_question(self, large_prompt):

@@ -13,7 +13,8 @@ import pytest
from tools.analyze import AnalyzeTool
from tools.chat import ChatTool
from tools.codereview import CodeReviewTool
from tools.debug import DebugIssueTool

# from tools.debug import DebugIssueTool  # Commented out - debug tool refactored
from tools.precommit import Precommit
from tools.thinkdeep import ThinkDeepTool

@@ -182,33 +183,37 @@ class TestPromptRegression:
        output = json.loads(result[0].text)
        assert output["status"] == "success"

    @pytest.mark.asyncio
    async def test_debug_normal_error(self, mock_model_response):
        """Test debug tool with normal error description."""
        tool = DebugIssueTool()
    # NOTE: Debug tool test has been commented out because the debug tool has been
    # refactored to use a self-investigation pattern instead of accepting prompt/error_context fields.
    # The new debug tool requires fields like: step, step_number, total_steps, next_step_required, findings

        with patch.object(tool, "get_model_provider") as mock_get_provider:
            mock_provider = MagicMock()
            mock_provider.get_provider_type.return_value = MagicMock(value="google")
            mock_provider.supports_thinking_mode.return_value = False
            mock_provider.generate_content.return_value = mock_model_response(
                "Root cause: The variable is undefined. Fix: Initialize it..."
            )
            mock_get_provider.return_value = mock_provider

            result = await tool.execute(
                {
                    "prompt": "TypeError: Cannot read property 'name' of undefined",
                    "error_context": "at line 42 in user.js\n    console.log(user.name)",
                    "runtime_info": "Node.js v16.14.0",
                }
            )

            assert len(result) == 1
            output = json.loads(result[0].text)
            assert output["status"] == "success"
            assert "Next Steps:" in output["content"]
            assert "Root cause" in output["content"]
    # @pytest.mark.asyncio
    # async def test_debug_normal_error(self, mock_model_response):
    #     """Test debug tool with normal error description."""
    #     tool = DebugIssueTool()
    #
    #     with patch.object(tool, "get_model_provider") as mock_get_provider:
    #         mock_provider = MagicMock()
    #         mock_provider.get_provider_type.return_value = MagicMock(value="google")
    #         mock_provider.supports_thinking_mode.return_value = False
    #         mock_provider.generate_content.return_value = mock_model_response(
    #             "Root cause: The variable is undefined. Fix: Initialize it..."
    #         )
    #         mock_get_provider.return_value = mock_provider
    #
    #         result = await tool.execute(
    #             {
    #                 "prompt": "TypeError: Cannot read property 'name' of undefined",
    #                 "error_context": "at line 42 in user.js\n    console.log(user.name)",
    #                 "runtime_info": "Node.js v16.14.0",
    #             }
    #         )
    #
    #         assert len(result) == 1
    #         output = json.loads(result[0].text)
    #         assert output["status"] == "success"
    #         assert "Next Steps:" in output["content"]
    #         assert "Root cause" in output["content"]

    @pytest.mark.asyncio
    async def test_analyze_normal_question(self, mock_model_response):

@@ -6,7 +6,7 @@ import json

import pytest

from tools import AnalyzeTool, ChatTool, CodeReviewTool, DebugIssueTool, ThinkDeepTool
from tools import AnalyzeTool, ChatTool, CodeReviewTool, ThinkDeepTool


class TestThinkDeepTool:
@@ -183,94 +183,6 @@ class TestCodeReviewTool:
        ModelProviderRegistry._instance = None


class TestDebugIssueTool:
    """Test the debug tool"""

    @pytest.fixture
    def tool(self):
        return DebugIssueTool()

    def test_tool_metadata(self, tool):
        """Test tool metadata"""
        assert tool.get_name() == "debug"
        assert "DEBUG & ROOT CAUSE ANALYSIS" in tool.get_description()
        assert tool.get_default_temperature() == 0.2

        schema = tool.get_input_schema()
        assert "prompt" in schema["properties"]
        assert schema["required"] == ["prompt"]

    @pytest.mark.asyncio
    async def test_execute_with_context(self, tool):
        """Test execution with error context using real integration testing"""
        import importlib
        import os

        # Save original environment
        original_env = {
            "OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY"),
            "DEFAULT_MODEL": os.environ.get("DEFAULT_MODEL"),
        }

        try:
            # Set up environment for real provider resolution
            os.environ["OPENAI_API_KEY"] = "sk-test-key-debug-context-test-not-real"
            os.environ["DEFAULT_MODEL"] = "o3-mini"

            # Clear other provider keys to isolate to OpenAI
            for key in ["GEMINI_API_KEY", "XAI_API_KEY", "OPENROUTER_API_KEY"]:
                os.environ.pop(key, None)

            # Reload config and clear registry
            import config

            importlib.reload(config)
            from providers.registry import ModelProviderRegistry

            ModelProviderRegistry._instance = None

            # Test with real provider resolution
            try:
                result = await tool.execute(
                    {
                        "prompt": "Test fails intermittently",
                        "error_context": "AssertionError in test_async",
                        "previous_attempts": "Added sleep, still fails",
                        "model": "o3-mini",
                    }
                )

                # If we get here, check the response format
                assert len(result) == 1
                # Should contain debug analysis
                assert result[0].text is not None

            except Exception as e:
                # Expected: API call will fail with fake key
                error_msg = str(e)
                # Should NOT be a mock-related error
                assert "MagicMock" not in error_msg
                assert "'<' not supported between instances" not in error_msg

                # Should be a real provider error
                assert any(
                    phrase in error_msg
                    for phrase in ["API", "key", "authentication", "provider", "network", "connection"]
                )

        finally:
            # Restore environment
            for key, value in original_env.items():
                if value is not None:
                    os.environ[key] = value
                else:
                    os.environ.pop(key, None)

            # Reload config and clear registry
            importlib.reload(config)
            ModelProviderRegistry._instance = None


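The entire TestDebugIssueTool class above is removed by this commit without a direct replacement in this file. A hypothetical successor, checking only the step-tracking fields named in the NOTE comments added elsewhere in this commit, might look roughly like the sketch below; the class name and any asserted schema detail beyond those field names are assumptions, and the import path mirrors the one used in the other test files in this diff.

# Hypothetical sketch only, not part of this diff.
from tools.debug import DebugIssueTool


class TestDebugIssueToolInvestigation:
    """Sketch of metadata checks for the refactored, investigation-style debug tool."""

    def test_tool_metadata(self):
        tool = DebugIssueTool()
        assert tool.get_name() == "debug"

        # Assumes the new input schema exposes the step-tracking fields
        # listed in the NOTE comments added elsewhere in this commit.
        schema = tool.get_input_schema()
        for field in ["step", "step_number", "total_steps", "next_step_required", "findings"]:
            assert field in schema["properties"]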
class TestAnalyzeTool:
    """Test the analyze tool"""

@@ -400,23 +312,6 @@ class TestAbsolutePathValidation:
        assert "must be FULL absolute paths" in response["content"]
        assert "../parent/file.py" in response["content"]

    @pytest.mark.asyncio
    async def test_debug_tool_relative_path_rejected(self):
        """Test that debug tool rejects relative paths"""
        tool = DebugIssueTool()
        result = await tool.execute(
            {
                "prompt": "Something broke",
                "files": ["src/main.py"],  # relative path
            }
        )

        assert len(result) == 1
        response = json.loads(result[0].text)
        assert response["status"] == "error"
        assert "must be FULL absolute paths" in response["content"]
        assert "src/main.py" in response["content"]

    @pytest.mark.asyncio
    async def test_thinkdeep_tool_relative_path_rejected(self):
        """Test that thinkdeep tool rejects relative paths"""
