Re-imagined and re-written Debug tool. Instead of prompting Claude to perform initial analysis (and hoping it did), the tool now works through the debug process as an 'investigation', encouraging Claude to gather its 'findings' / 'hypothesis', stepping back as needed, collecting files it's gone through and keeping track of files relevant to the issue at hand. This structured investigation is then passed to the other model with far greater insight than the original debug tool ever could.
Improved prompts, guard against overengineering and flag that as an antipattern
This commit is contained in:
@@ -1,21 +1,23 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Debug Tool Validation Test
|
||||
Debug Tool Self-Investigation Validation Test
|
||||
|
||||
Tests the debug tool with real bugs to validate:
|
||||
- Proper execution with flash model
|
||||
- Actual bug identification and analysis
|
||||
- Hypothesis generation for root causes
|
||||
- Log validation for tool execution
|
||||
Tests the debug tool's systematic self-investigation capabilities including:
|
||||
- Step-by-step investigation with proper JSON responses
|
||||
- Progressive tracking of findings, files, and methods
|
||||
- Hypothesis formation and confidence tracking
|
||||
- Backtracking and revision capabilities
|
||||
- Final expert analysis after investigation completion
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
|
||||
|
||||
class DebugValidationTest(BaseSimulatorTest):
|
||||
"""Test debug tool with actual bug scenarios"""
|
||||
"""Test debug tool's self-investigation and expert analysis features"""
|
||||
|
||||
@property
|
||||
def test_name(self) -> str:
|
||||
@@ -23,23 +25,48 @@ class DebugValidationTest(BaseSimulatorTest):
|
||||
|
||||
@property
|
||||
def test_description(self) -> str:
|
||||
return "Debug tool validation with actual bugs"
|
||||
return "Debug tool self-investigation pattern validation"
|
||||
|
||||
def run_test(self) -> bool:
|
||||
"""Test debug tool with real bugs"""
|
||||
"""Test debug tool self-investigation capabilities"""
|
||||
try:
|
||||
self.logger.info("Test: Debug tool validation")
|
||||
self.logger.info("Test: Debug tool self-investigation validation")
|
||||
|
||||
# Setup test files directory first
|
||||
self.setup_test_files()
|
||||
|
||||
# Create a Python file with a subtle but realistic bug
|
||||
buggy_code = """#!/usr/bin/env python3
|
||||
self._create_buggy_code()
|
||||
|
||||
# Test 1: Single investigation session with multiple steps
|
||||
if not self._test_single_investigation_session():
|
||||
return False
|
||||
|
||||
# Test 2: Investigation with backtracking
|
||||
if not self._test_investigation_with_backtracking():
|
||||
return False
|
||||
|
||||
# Test 3: Complete investigation with expert analysis
|
||||
if not self._test_complete_investigation_with_analysis():
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ All debug validation tests passed")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Debug validation test failed: {e}")
|
||||
return False
|
||||
finally:
|
||||
self.cleanup_test_files()
|
||||
|
||||
def _create_buggy_code(self):
|
||||
"""Create test files with a subtle bug for debugging"""
|
||||
# Create a Python file with dictionary iteration bug
|
||||
buggy_code = """#!/usr/bin/env python3
|
||||
import json
|
||||
import requests
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
class UserSessionManager:
|
||||
class SessionManager:
|
||||
def __init__(self):
|
||||
self.active_sessions = {}
|
||||
self.session_timeout = 30 * 60 # 30 minutes in seconds
|
||||
@@ -52,7 +79,6 @@ class UserSessionManager:
|
||||
'user_id': user_id,
|
||||
'user_data': user_data,
|
||||
'created_at': datetime.now(),
|
||||
'last_activity': datetime.now(),
|
||||
'expires_at': datetime.now() + timedelta(seconds=self.session_timeout)
|
||||
}
|
||||
|
||||
@@ -72,322 +98,356 @@ class UserSessionManager:
|
||||
del self.active_sessions[session_id]
|
||||
return False
|
||||
|
||||
# Update last activity
|
||||
session['last_activity'] = current_time
|
||||
return True
|
||||
|
||||
def cleanup_expired_sessions(self):
|
||||
\"\"\"Remove expired sessions from memory\"\"\"
|
||||
current_time = datetime.now()
|
||||
expired_sessions = []
|
||||
expired_count = 0
|
||||
|
||||
# BUG: Modifying dictionary while iterating over it
|
||||
for session_id, session in self.active_sessions.items():
|
||||
if current_time > session['expires_at']:
|
||||
expired_sessions.append(session_id)
|
||||
del self.active_sessions[session_id] # This causes RuntimeError
|
||||
expired_count += 1
|
||||
|
||||
for session_id in expired_sessions:
|
||||
del self.active_sessions[session_id]
|
||||
|
||||
return len(expired_sessions)
|
||||
|
||||
class APIHandler:
|
||||
def __init__(self):
|
||||
self.session_manager = UserSessionManager()
|
||||
self.request_count = 0
|
||||
|
||||
def authenticate_user(self, username, password):
|
||||
\"\"\"Authenticate user and create session\"\"\"
|
||||
# Simulate API call to auth service
|
||||
auth_response = self._call_auth_service(username, password)
|
||||
|
||||
if auth_response.get('success'):
|
||||
user_data = auth_response.get('user_data', {})
|
||||
session_id = self.session_manager.create_session(
|
||||
user_data['id'], user_data
|
||||
)
|
||||
return {'success': True, 'session_id': session_id}
|
||||
|
||||
return {'success': False, 'error': 'Authentication failed'}
|
||||
|
||||
def process_request(self, session_id, request_data):
|
||||
\"\"\"Process an API request with session validation\"\"\"
|
||||
self.request_count += 1
|
||||
|
||||
# Validate session before processing
|
||||
if not self.session_manager.validate_session(session_id):
|
||||
return {'error': 'Invalid or expired session', 'code': 401}
|
||||
|
||||
# Simulate request processing
|
||||
try:
|
||||
result = self._process_business_logic(request_data)
|
||||
return {'success': True, 'data': result}
|
||||
except Exception as e:
|
||||
return {'error': str(e), 'code': 500}
|
||||
|
||||
def _call_auth_service(self, username, password):
|
||||
\"\"\"Simulate external authentication service call\"\"\"
|
||||
# Simulate network delay and response
|
||||
import time
|
||||
time.sleep(0.1)
|
||||
|
||||
# Mock successful authentication
|
||||
if username and password:
|
||||
return {
|
||||
'success': True,
|
||||
'user_data': {
|
||||
'id': hash(username) % 10000,
|
||||
'username': username,
|
||||
'roles': ['user']
|
||||
}
|
||||
}
|
||||
return {'success': False}
|
||||
|
||||
def _process_business_logic(self, request_data):
|
||||
\"\"\"Simulate business logic processing\"\"\"
|
||||
if not request_data:
|
||||
raise ValueError("Invalid request data")
|
||||
|
||||
# Simulate some processing
|
||||
return {
|
||||
'processed_at': datetime.now().isoformat(),
|
||||
'request_id': self.request_count,
|
||||
'status': 'completed'
|
||||
}
|
||||
|
||||
# Global API handler instance
|
||||
api_handler = APIHandler()
|
||||
|
||||
def handle_api_request(session_id, request_data):
|
||||
\"\"\"Main API request handler\"\"\"
|
||||
return api_handler.process_request(session_id, request_data)
|
||||
return expired_count
|
||||
"""
|
||||
|
||||
# Create test file with subtle bug
|
||||
test_file = self.create_additional_test_file("session_manager.py", buggy_code)
|
||||
self.logger.info(f" ✅ Created test file with subtle bug: {test_file}")
|
||||
# Create test file with subtle bug
|
||||
self.buggy_file = self.create_additional_test_file("session_manager.py", buggy_code)
|
||||
self.logger.info(f" ✅ Created test file with subtle bug: {self.buggy_file}")
|
||||
|
||||
# Create a realistic problem description with subtle symptoms
|
||||
error_description = """ISSUE DESCRIPTION:
|
||||
Our API service is experiencing intermittent session validation failures in production.
|
||||
# Create error description
|
||||
error_description = """ISSUE DESCRIPTION:
|
||||
Our session management system is experiencing intermittent failures during cleanup operations.
|
||||
|
||||
SYMPTOMS OBSERVED:
|
||||
- Users randomly get "Invalid or expired session" errors even with valid sessions
|
||||
- The issue happens more frequently during high-traffic periods
|
||||
- Sessions that should still be valid (created < 30 minutes ago) are being rejected
|
||||
- The problem occurs maybe 2-3% of requests but is hard to reproduce consistently
|
||||
- Server logs show session validation failing but no clear pattern
|
||||
SYMPTOMS:
|
||||
- Random RuntimeError: dictionary changed size during iteration
|
||||
- Occurs during high load when many sessions expire simultaneously
|
||||
- Error happens in cleanup_expired_sessions method
|
||||
- Affects about 5% of cleanup operations
|
||||
|
||||
ENVIRONMENT:
|
||||
- Python 3.13 API service
|
||||
- Running in production with multiple concurrent users
|
||||
- Redis not used for session storage (in-memory only)
|
||||
- Load balancer distributes requests across multiple instances
|
||||
ERROR LOG:
|
||||
RuntimeError: dictionary changed size during iteration
|
||||
File "session_manager.py", line 44, in cleanup_expired_sessions
|
||||
for session_id, session in self.active_sessions.items():
|
||||
"""
|
||||
|
||||
RECENT CHANGES:
|
||||
- Increased session timeout from 15 to 30 minutes last week
|
||||
- Added cleanup routine to remove expired sessions
|
||||
- No major code changes to session management
|
||||
self.error_file = self.create_additional_test_file("error_description.txt", error_description)
|
||||
self.logger.info(f" ✅ Created error description file: {self.error_file}")
|
||||
|
||||
USER IMPACT:
|
||||
- Users have to re-authenticate randomly
|
||||
- Affects user experience and causes complaints
|
||||
- Seems to happen more on busy days
|
||||
def _test_single_investigation_session(self) -> bool:
|
||||
"""Test a complete investigation session with multiple steps"""
|
||||
try:
|
||||
self.logger.info(" 1.1: Testing single investigation session")
|
||||
|
||||
The code looks correct to me, but something is causing valid sessions to be treated as expired or invalid. I'm not sure what's causing this intermittent behavior."""
|
||||
|
||||
error_file = self.create_additional_test_file("error_description.txt", error_description)
|
||||
self.logger.info(f" ✅ Created error description file: {error_file}")
|
||||
|
||||
# Call debug tool with flash model and realistic problem description
|
||||
self.logger.info(" 🔍 Calling debug tool to investigate session validation issues...")
|
||||
response, continuation_id = self.call_mcp_tool(
|
||||
# Step 1: Start investigation
|
||||
self.logger.info(" 1.1.1: Step 1 - Initial investigation")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"prompt": "Investigate why our API is experiencing intermittent session validation failures in production",
|
||||
"files": [test_file, error_file],
|
||||
"findings": "Users getting 'Invalid or expired session' errors randomly, occurs more during high traffic, sessions should still be valid",
|
||||
"error_context": "Sessions created < 30 minutes ago being rejected, happens ~2-3% of requests, load balanced environment",
|
||||
"systematic_investigation": True,
|
||||
"model": "flash",
|
||||
"thinking_mode": "medium",
|
||||
"step": "I need to investigate intermittent RuntimeError during session cleanup. Let me start by examining the error description and understanding the symptoms.",
|
||||
"step_number": 1,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "RuntimeError occurs during dictionary iteration in cleanup_expired_sessions method. Error happens intermittently during high load.",
|
||||
"files_checked": [self.error_file],
|
||||
"relevant_files": [self.error_file],
|
||||
},
|
||||
)
|
||||
|
||||
if not response:
|
||||
self.logger.error("Failed to get debug response")
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to get initial investigation response")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Got debug response")
|
||||
# Parse and validate JSON response
|
||||
response1_data = self._parse_debug_response(response1)
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Parse response to validate bug identification
|
||||
try:
|
||||
response_data = json.loads(response)
|
||||
self.logger.debug(f"Response keys: {list(response_data.keys())}")
|
||||
# Validate step 1 response structure
|
||||
if not self._validate_step_response(response1_data, 1, 4, True, "investigation_in_progress"):
|
||||
return False
|
||||
|
||||
# Extract the actual content if it's wrapped
|
||||
if "content" in response_data:
|
||||
content = response_data["content"]
|
||||
# Handle markdown JSON blocks
|
||||
if content.startswith("```json"):
|
||||
content = content[7:]
|
||||
if content.endswith("```"):
|
||||
content = content[:-3]
|
||||
content = content.strip()
|
||||
self.logger.info(f" ✅ Step 1 successful, continuation_id: {continuation_id}")
|
||||
|
||||
# Parse the inner JSON
|
||||
inner_data = json.loads(content)
|
||||
self.logger.debug(f"Inner data keys: {list(inner_data.keys())}")
|
||||
else:
|
||||
inner_data = response_data
|
||||
# Step 2: Examine the code
|
||||
self.logger.info(" 1.1.2: Step 2 - Code examination")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "Now examining the session_manager.py file to understand the cleanup_expired_sessions implementation and identify the root cause.",
|
||||
"step_number": 2,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Found the issue: cleanup_expired_sessions modifies self.active_sessions dictionary while iterating over it with .items(). This causes RuntimeError when del is called during iteration.",
|
||||
"files_checked": [self.error_file, self.buggy_file],
|
||||
"relevant_files": [self.buggy_file],
|
||||
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
|
||||
"hypothesis": "Dictionary is being modified during iteration causing RuntimeError",
|
||||
"confidence": "high",
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
# Check for structured debug analysis (should have analysis_complete status)
|
||||
if inner_data.get("status") == "analysis_complete":
|
||||
self.logger.info(" ✅ Got structured debug analysis")
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue investigation to step 2")
|
||||
return False
|
||||
|
||||
# Validate hypothesis generation
|
||||
hypotheses = inner_data.get("hypotheses", [])
|
||||
if not hypotheses:
|
||||
self.logger.error("No hypotheses found in debug analysis")
|
||||
return False
|
||||
response2_data = self._parse_debug_response(response2)
|
||||
if not self._validate_step_response(response2_data, 2, 4, True, "investigation_in_progress"):
|
||||
return False
|
||||
|
||||
self.logger.info(f" 🧠 Found {len(hypotheses)} hypotheses")
|
||||
# Check investigation status tracking
|
||||
investigation_status = response2_data.get("investigation_status", {})
|
||||
if investigation_status.get("files_checked", 0) < 2:
|
||||
self.logger.error("Files checked count not properly tracked")
|
||||
return False
|
||||
|
||||
# Check if the model identified the real bug: dictionary modification during iteration
|
||||
analysis_text = json.dumps(inner_data).lower()
|
||||
if investigation_status.get("relevant_methods", 0) != 1:
|
||||
self.logger.error("Relevant methods not properly tracked")
|
||||
return False
|
||||
|
||||
# Look for the actual bug - modifying dictionary while iterating
|
||||
bug_indicators = [
|
||||
"dictionary",
|
||||
"iteration",
|
||||
"modify",
|
||||
"concurrent",
|
||||
"runtime error",
|
||||
"dictionary changed size during iteration",
|
||||
"cleanup_expired_sessions",
|
||||
"active_sessions",
|
||||
"del",
|
||||
"removing while iterating",
|
||||
]
|
||||
if investigation_status.get("current_confidence") != "high":
|
||||
self.logger.error("Confidence level not properly tracked")
|
||||
return False
|
||||
|
||||
found_indicators = [indicator for indicator in bug_indicators if indicator in analysis_text]
|
||||
self.logger.info(" ✅ Step 2 successful with proper tracking")
|
||||
|
||||
# Check for specific mentions of the problematic pattern
|
||||
dictionary_bug_patterns = [
|
||||
"modifying dictionary while iterating",
|
||||
"dictionary changed size",
|
||||
"concurrent modification",
|
||||
"iterating over dictionary",
|
||||
"del.*active_sessions",
|
||||
"cleanup.*iteration",
|
||||
]
|
||||
# Step 3: Validate hypothesis
|
||||
self.logger.info(" 1.1.3: Step 3 - Hypothesis validation")
|
||||
response3, _ = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "Confirming the bug pattern: the for loop iterates over self.active_sessions.items() while del self.active_sessions[session_id] modifies the dictionary inside the loop.",
|
||||
"step_number": 3,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Confirmed: Line 44-47 shows classic dictionary modification during iteration bug. The fix would be to collect expired session IDs first, then delete them after iteration completes.",
|
||||
"files_checked": [self.buggy_file],
|
||||
"relevant_files": [self.buggy_file],
|
||||
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
|
||||
"hypothesis": "Dictionary modification during iteration in cleanup_expired_sessions causes RuntimeError",
|
||||
"confidence": "high",
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
import re
|
||||
if not response3:
|
||||
self.logger.error("Failed to continue investigation to step 3")
|
||||
return False
|
||||
|
||||
pattern_matches = []
|
||||
for pattern in dictionary_bug_patterns:
|
||||
if re.search(pattern, analysis_text):
|
||||
pattern_matches.append(pattern)
|
||||
response3_data = self._parse_debug_response(response3)
|
||||
if not self._validate_step_response(response3_data, 3, 4, True, "investigation_in_progress"):
|
||||
return False
|
||||
|
||||
if len(found_indicators) >= 3 or len(pattern_matches) >= 1:
|
||||
self.logger.info(" ✅ Flash identified the dictionary iteration bug")
|
||||
self.logger.info(f" Found indicators: {found_indicators[:3]}")
|
||||
if pattern_matches:
|
||||
self.logger.info(f" Pattern matches: {pattern_matches}")
|
||||
else:
|
||||
self.logger.error(" ❌ Flash missed the dictionary iteration bug")
|
||||
self.logger.error(f" Found only: {found_indicators}")
|
||||
return False
|
||||
self.logger.info(" ✅ Investigation session progressing successfully")
|
||||
|
||||
# Validate hypothesis quality (should have confidence levels and reasoning)
|
||||
valid_hypotheses = 0
|
||||
for i, hypothesis in enumerate(hypotheses[:3]): # Check top 3
|
||||
confidence = hypothesis.get("confidence", "").lower()
|
||||
reasoning = hypothesis.get("reasoning", "")
|
||||
# Store continuation_id for next test
|
||||
self.investigation_continuation_id = continuation_id
|
||||
return True
|
||||
|
||||
if confidence in ["high", "medium", "low"] and len(reasoning) > 20:
|
||||
valid_hypotheses += 1
|
||||
self.logger.debug(f" Hypothesis {i+1}: {confidence} confidence, good reasoning")
|
||||
else:
|
||||
self.logger.debug(f" Hypothesis {i+1}: weak ({confidence}, {len(reasoning)} chars)")
|
||||
except Exception as e:
|
||||
self.logger.error(f"Single investigation session test failed: {e}")
|
||||
return False
|
||||
|
||||
if valid_hypotheses >= 2:
|
||||
self.logger.info(f" ✅ Found {valid_hypotheses} well-structured hypotheses")
|
||||
else:
|
||||
self.logger.error(f" ❌ Only {valid_hypotheses} well-structured hypotheses")
|
||||
return False
|
||||
def _test_investigation_with_backtracking(self) -> bool:
|
||||
"""Test investigation with backtracking to revise findings"""
|
||||
try:
|
||||
self.logger.info(" 1.2: Testing investigation with backtracking")
|
||||
|
||||
# Check for line-specific references
|
||||
if "line" in analysis_text or "lines" in analysis_text:
|
||||
self.logger.info(" 📍 Analysis includes line-specific references")
|
||||
else:
|
||||
self.logger.warning(" ⚠️ No line-specific references found")
|
||||
# Start a new investigation for testing backtracking
|
||||
self.logger.info(" 1.2.1: Start investigation for backtracking test")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "Investigating performance degradation in data processing pipeline",
|
||||
"step_number": 1,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Initial analysis shows slow database queries",
|
||||
"files_checked": ["/db/queries.py"],
|
||||
"relevant_files": ["/db/queries.py"],
|
||||
},
|
||||
)
|
||||
|
||||
else:
|
||||
# Non-structured response - check for dictionary iteration bug identification
|
||||
self.logger.info(" 📝 Got general debug response")
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start backtracking test investigation")
|
||||
return False
|
||||
|
||||
response_text = response.lower()
|
||||
# Step 2: Wrong direction
|
||||
self.logger.info(" 1.2.2: Step 2 - Wrong investigation path")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "Focusing on database optimization strategies",
|
||||
"step_number": 2,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Database queries seem optimized, might be looking in wrong place",
|
||||
"files_checked": ["/db/queries.py", "/db/indexes.py"],
|
||||
"relevant_files": [],
|
||||
"hypothesis": "Database performance issues",
|
||||
"confidence": "low",
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
# Check for the specific bug in general response
|
||||
bug_indicators = [
|
||||
"dictionary",
|
||||
"iteration",
|
||||
"modify",
|
||||
"concurrent",
|
||||
"active_sessions",
|
||||
"cleanup",
|
||||
"del ",
|
||||
"removing",
|
||||
"changed size",
|
||||
]
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue to step 2")
|
||||
return False
|
||||
|
||||
found_indicators = [indicator for indicator in bug_indicators if indicator in response_text]
|
||||
# Step 3: Backtrack from step 2
|
||||
self.logger.info(" 1.2.3: Step 3 - Backtrack and revise approach")
|
||||
response3, _ = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "Backtracking - the issue might not be database related. Let me investigate the data processing algorithm instead.",
|
||||
"step_number": 3,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Found inefficient nested loops in data processor causing O(n²) complexity",
|
||||
"files_checked": ["/processor/algorithm.py"],
|
||||
"relevant_files": ["/processor/algorithm.py"],
|
||||
"relevant_methods": ["DataProcessor.process_batch"],
|
||||
"hypothesis": "Inefficient algorithm causing performance issues",
|
||||
"confidence": "medium",
|
||||
"backtrack_from_step": 2, # Backtrack from step 2
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
if len(found_indicators) >= 3:
|
||||
self.logger.info(f" ✅ Found {len(found_indicators)} relevant indicators in response")
|
||||
self.logger.info(f" Found: {found_indicators}")
|
||||
else:
|
||||
self.logger.error(f" ❌ Only found {len(found_indicators)} relevant indicators")
|
||||
self.logger.error(f" Found: {found_indicators}")
|
||||
return False
|
||||
if not response3:
|
||||
self.logger.error("Failed to backtrack")
|
||||
return False
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
self.logger.error(f"Failed to parse debug response as JSON: {e}")
|
||||
# For non-JSON responses, check for dictionary iteration bug
|
||||
response_text = response.lower()
|
||||
response3_data = self._parse_debug_response(response3)
|
||||
if not self._validate_step_response(response3_data, 3, 4, True, "investigation_in_progress"):
|
||||
return False
|
||||
|
||||
bug_indicators = [
|
||||
"dictionary",
|
||||
"iteration",
|
||||
"modify",
|
||||
"concurrent",
|
||||
"active_sessions",
|
||||
"cleanup",
|
||||
"del ",
|
||||
"removing",
|
||||
]
|
||||
self.logger.info(" ✅ Backtracking working correctly")
|
||||
return True
|
||||
|
||||
found_indicators = [indicator for indicator in bug_indicators if indicator in response_text]
|
||||
except Exception as e:
|
||||
self.logger.error(f"Backtracking test failed: {e}")
|
||||
return False
|
||||
|
||||
if len(found_indicators) >= 3:
|
||||
self.logger.info(f" ✅ Text response found {len(found_indicators)} relevant indicators")
|
||||
else:
|
||||
self.logger.error(f" ❌ Text response only found {len(found_indicators)} relevant indicators")
|
||||
def _test_complete_investigation_with_analysis(self) -> bool:
|
||||
"""Test complete investigation ending with expert analysis"""
|
||||
try:
|
||||
self.logger.info(" 1.3: Testing complete investigation with expert analysis")
|
||||
|
||||
# Use the continuation from first test
|
||||
continuation_id = getattr(self, "investigation_continuation_id", None)
|
||||
if not continuation_id:
|
||||
# Start fresh if no continuation available
|
||||
self.logger.info(" 1.3.0: Starting fresh investigation")
|
||||
response0, continuation_id = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "Investigating the dictionary iteration bug in session cleanup",
|
||||
"step_number": 1,
|
||||
"total_steps": 2,
|
||||
"next_step_required": True,
|
||||
"findings": "Found dictionary modification during iteration",
|
||||
"files_checked": [self.buggy_file],
|
||||
"relevant_files": [self.buggy_file],
|
||||
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
|
||||
},
|
||||
)
|
||||
if not response0 or not continuation_id:
|
||||
self.logger.error("Failed to start fresh investigation")
|
||||
return False
|
||||
|
||||
# Final step - trigger expert analysis
|
||||
self.logger.info(" 1.3.1: Final step - complete investigation")
|
||||
response_final, _ = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "Investigation complete. The root cause is confirmed: cleanup_expired_sessions modifies the dictionary while iterating, causing RuntimeError.",
|
||||
"step_number": 2,
|
||||
"total_steps": 2,
|
||||
"next_step_required": False, # Final step - triggers expert analysis
|
||||
"findings": "Root cause identified: del self.active_sessions[session_id] on line 46 modifies dictionary during iteration starting at line 44. Fix: collect expired IDs first, then delete.",
|
||||
"files_checked": [self.buggy_file],
|
||||
"relevant_files": [self.buggy_file],
|
||||
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
|
||||
"hypothesis": "Dictionary modification during iteration causes RuntimeError in cleanup_expired_sessions",
|
||||
"confidence": "high",
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash", # Use flash for expert analysis
|
||||
},
|
||||
)
|
||||
|
||||
if not response_final:
|
||||
self.logger.error("Failed to complete investigation")
|
||||
return False
|
||||
|
||||
response_final_data = self._parse_debug_response(response_final)
|
||||
if not response_final_data:
|
||||
return False
|
||||
|
||||
# Validate final response structure
|
||||
if response_final_data.get("status") != "calling_expert_analysis":
|
||||
self.logger.error(
|
||||
f"Expected status 'calling_expert_analysis', got '{response_final_data.get('status')}'"
|
||||
)
|
||||
return False
|
||||
|
||||
if not response_final_data.get("investigation_complete"):
|
||||
self.logger.error("Expected investigation_complete=true for final step")
|
||||
return False
|
||||
|
||||
# Check for expert analysis
|
||||
if "expert_analysis" not in response_final_data:
|
||||
self.logger.error("Missing expert_analysis in final response")
|
||||
return False
|
||||
|
||||
expert_analysis = response_final_data.get("expert_analysis", {})
|
||||
|
||||
# Check for expected analysis content (checking common patterns)
|
||||
analysis_text = json.dumps(expert_analysis).lower()
|
||||
|
||||
# Look for bug identification
|
||||
bug_indicators = ["dictionary", "iteration", "modify", "runtime", "error", "del"]
|
||||
found_indicators = sum(1 for indicator in bug_indicators if indicator in analysis_text)
|
||||
|
||||
if found_indicators >= 3:
|
||||
self.logger.info(" ✅ Expert analysis identified the bug correctly")
|
||||
else:
|
||||
self.logger.warning(
|
||||
f" ⚠️ Expert analysis may not have fully identified the bug (found {found_indicators}/6 indicators)"
|
||||
)
|
||||
|
||||
# Check complete investigation summary
|
||||
if "complete_investigation" not in response_final_data:
|
||||
self.logger.error("Missing complete_investigation in final response")
|
||||
return False
|
||||
|
||||
complete_investigation = response_final_data["complete_investigation"]
|
||||
if not complete_investigation.get("relevant_methods"):
|
||||
self.logger.error("Missing relevant methods in complete investigation")
|
||||
return False
|
||||
|
||||
if "SessionManager.cleanup_expired_sessions" not in complete_investigation["relevant_methods"]:
|
||||
self.logger.error("Expected method not found in investigation summary")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Complete investigation with expert analysis successful")
|
||||
|
||||
# Validate logs
|
||||
self.logger.info(" 📋 Validating execution logs...")
|
||||
|
||||
# Get server logs using inherited method
|
||||
# Get server logs
|
||||
logs = self.get_recent_server_logs(500)
|
||||
|
||||
# Look for debug tool execution patterns
|
||||
debug_patterns = [
|
||||
"debug tool",
|
||||
"[DEBUG]",
|
||||
"systematic investigation",
|
||||
"Token budget",
|
||||
"Essential files for debugging",
|
||||
"investigation",
|
||||
"Expert analysis",
|
||||
"calling_expert_analysis",
|
||||
]
|
||||
|
||||
patterns_found = 0
|
||||
@@ -396,34 +456,101 @@ The code looks correct to me, but something is causing valid sessions to be trea
|
||||
patterns_found += 1
|
||||
self.logger.debug(f" ✅ Found log pattern: {pattern}")
|
||||
|
||||
if patterns_found >= 3:
|
||||
if patterns_found >= 2:
|
||||
self.logger.info(f" ✅ Log validation passed ({patterns_found}/{len(debug_patterns)} patterns)")
|
||||
else:
|
||||
self.logger.warning(f" ⚠️ Only found {patterns_found}/{len(debug_patterns)} log patterns")
|
||||
|
||||
# Test continuation if available
|
||||
if continuation_id:
|
||||
self.logger.info(" 🔄 Testing debug continuation...")
|
||||
|
||||
follow_up_response, _ = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"prompt": "Based on your analysis, which bug should we fix first and how?",
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if follow_up_response:
|
||||
self.logger.info(" ✅ Debug continuation worked")
|
||||
else:
|
||||
self.logger.warning(" ⚠️ Debug continuation failed")
|
||||
|
||||
self.logger.info(" ✅ Debug tool validation completed successfully")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Debug validation test failed: {e}")
|
||||
self.logger.error(f"Complete investigation test failed: {e}")
|
||||
return False
|
||||
|
||||
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
|
||||
"""Call an MCP tool via standalone server - override for debug-specific response handling"""
|
||||
# Use parent implementation to get the raw response
|
||||
response_text, _ = super().call_mcp_tool(tool_name, params)
|
||||
|
||||
if not response_text:
|
||||
return None, None
|
||||
|
||||
# Extract continuation_id from debug response specifically
|
||||
continuation_id = self._extract_debug_continuation_id(response_text)
|
||||
|
||||
return response_text, continuation_id
|
||||
|
||||
def _extract_debug_continuation_id(self, response_text: str) -> Optional[str]:
    """Pull the continuation_id out of a debug tool JSON response.

    Returns None when the payload is not valid JSON or carries no
    continuation_id field.
    """
    try:
        payload = json.loads(response_text)
    except json.JSONDecodeError as e:
        # Non-JSON output is expected in some flows; log quietly and move on.
        self.logger.debug(f"Failed to parse response for debug continuation_id: {e}")
        return None
    return payload.get("continuation_id")
|
||||
|
||||
def _parse_debug_response(self, response_text: str) -> dict:
    """Decode the debug tool's JSON response.

    On malformed JSON, logs the parse error plus a truncated preview of the
    raw text and falls back to an empty dict so callers can continue.
    """
    try:
        decoded = json.loads(response_text)
    except json.JSONDecodeError as e:
        self.logger.error(f"Failed to parse debug response as JSON: {e}")
        # Only the first 500 chars: enough context without flooding the log.
        self.logger.error(f"Response text: {response_text[:500]}...")
        return {}
    return decoded
|
||||
|
||||
def _validate_step_response(
    self,
    response_data: dict,
    expected_step: int,
    expected_total: int,
    expected_next_required: bool,
    expected_status: str,
) -> bool:
    """Check that an investigation step response has the expected shape.

    Verifies the status/step counters against the expected values and then
    confirms the structural fields every step response must carry. Logs the
    first problem found and returns False; returns True only when every
    check passes.
    """
    try:
        # Exact-value expectations, checked in order; first mismatch wins.
        # The third element is the value format used in the log message
        # (status is quoted, numeric/boolean fields are not).
        value_checks = (
            ("status", expected_status, "'{}'"),
            ("step_number", expected_step, "{}"),
            ("total_steps", expected_total, "{}"),
            ("next_step_required", expected_next_required, "{}"),
        )
        for field, expected, fmt in value_checks:
            actual = response_data.get(field)
            if actual != expected:
                self.logger.error(f"Expected {field} {fmt.format(expected)}, got {fmt.format(actual)}")
                return False

        # Structural fields that must be present on every step response.
        if "investigation_status" not in response_data:
            self.logger.error("Missing investigation_status in response")
            return False

        if "output" not in response_data:
            self.logger.error("Missing output guidance in response")
            return False

        if not response_data.get("next_steps"):
            self.logger.error("Missing next_steps guidance in response")
            return False

        return True

    except Exception as e:
        self.logger.error(f"Error validating step response: {e}")
        return False
|
||||
finally:
|
||||
self.cleanup_test_files()
|
||||
|
||||
@@ -39,6 +39,10 @@ SCOPE & FOCUS
|
||||
• Identify strengths, risks, and strategic improvement areas that affect future development
|
||||
• Avoid line-by-line bug hunts or minor style critiques—those are covered by CodeReview
|
||||
• Recommend practical, proportional changes; no "rip-and-replace" proposals unless the architecture is untenable
|
||||
• Identify and flag overengineered solutions — excessive abstraction, unnecessary configuration layers, or generic
|
||||
frameworks introduced without a clear, current need. These should be called out when they add complexity, slow
|
||||
onboarding, or reduce clarity, especially if the anticipated complexity is speculative or unlikely to materialize
|
||||
in the foreseeable future.
|
||||
|
||||
ANALYSIS STRATEGY
|
||||
1. Map the tech stack, frameworks, deployment model, and constraints
|
||||
|
||||
@@ -29,6 +29,9 @@ SCOPE & FOCUS
|
||||
• Recommend new technologies or patterns ONLY when they provide clearly superior outcomes with minimal added complexity.
|
||||
• Avoid speculative, over-engineered, or unnecessarily abstract designs that exceed current project goals or needs.
|
||||
• Keep proposals practical and directly actionable within the existing architecture.
|
||||
• Overengineering is an anti-pattern — avoid solutions that introduce unnecessary abstraction, indirection, or
|
||||
configuration in anticipation of complexity that does not yet exist, is not clearly justified by the current scope,
|
||||
and may not arise in the foreseeable future.
|
||||
|
||||
COLLABORATION APPROACH
|
||||
1. Engage deeply with Claude's input – extend, refine, and explore alternatives ONLY WHEN they are well-justified and materially beneficial.
|
||||
|
||||
@@ -55,6 +55,9 @@ Your review approach:
|
||||
- Ways to reduce the overall complexity while maintaining and retaining functionality without introducing regression
|
||||
8. Where further investigation and analysis is required, be direct and suggest which code or related file needs to be
|
||||
reviewed
|
||||
9. Remember: Overengineering is an anti-pattern — avoid suggesting solutions that introduce unnecessary abstraction,
|
||||
indirection, or configuration in anticipation of complexity that does not yet exist, is not clearly justified by the
|
||||
current scope, and may not arise in the foreseeable future.
|
||||
|
||||
SEVERITY DEFINITIONS
|
||||
🔴 CRITICAL: Security flaws or defects that cause crashes, data loss, or undefined behavior
|
||||
|
||||
@@ -53,6 +53,9 @@ REVIEW METHOD
|
||||
4. Flag bugs, regressions, crash risks, data loss, or race conditions.
|
||||
5. Recommend specific fixes for each issue raised; include code where helpful.
|
||||
6. Acknowledge sound patterns to reinforce best practices.
|
||||
7. Remember: Overengineering is an anti-pattern — avoid suggesting solutions that introduce unnecessary abstraction,
|
||||
indirection, or configuration in anticipation of complexity that does not yet exist, is not clearly justified by the
|
||||
current scope, and may not arise in the foreseeable future.
|
||||
|
||||
CORE ANALYSIS (adapt to diff and stack)
|
||||
• Security – injection risks, auth flaws, exposure of sensitive data, unsafe dependencies, memory safety
|
||||
@@ -62,6 +65,11 @@ CORE ANALYSIS (adapt to diff and stack)
|
||||
|
||||
ADDITIONAL ANALYSIS (only when relevant)
|
||||
• Language/runtime concerns – memory management, concurrency, exception handling
|
||||
• Carefully assess the code's context and purpose before raising concurrency-related concerns. Confirm the presence
|
||||
of shared state, race conditions, or unsafe access patterns before flagging any issues to avoid false positives.
|
||||
• Also carefully evaluate concurrency and parallelism risks only after confirming that the code runs in an environment
|
||||
where such concerns are applicable. Avoid flagging issues unless shared state, asynchronous execution, or multi-threaded
|
||||
access are clearly possible based on context.
|
||||
• System/integration – config handling, external calls, operational impact
|
||||
• Testing – coverage gaps for new logic
|
||||
• If no tests are found in the project, do not flag test coverage as an issue unless the change introduces logic
|
||||
|
||||
@@ -32,6 +32,9 @@ GUIDELINES
|
||||
5. Offer multiple viable strategies ONLY WHEN clearly beneficial within the current environment.
|
||||
6. Suggest creative solutions that operate within real-world constraints, and avoid proposing major shifts unless truly warranted.
|
||||
7. Use concise, technical language; assume an experienced engineering audience.
|
||||
8. Remember: Overengineering is an anti-pattern — avoid suggesting solutions that introduce unnecessary abstraction,
|
||||
indirection, or configuration in anticipation of complexity that does not yet exist, is not clearly justified by the
|
||||
current scope, and may not arise in the foreseeable future.
|
||||
|
||||
KEY FOCUS AREAS (apply when relevant)
|
||||
- Architecture & Design: modularity, boundaries, abstraction layers, dependencies
|
||||
|
||||
@@ -198,13 +198,20 @@ class TestAutoModelPlannerFix:
|
||||
Verify that other tools still properly require model resolution.
|
||||
|
||||
This ensures our fix doesn't break existing functionality.
|
||||
Note: Debug tool now manages its own model calls like planner.
|
||||
"""
|
||||
from tools.analyze import AnalyzeTool
|
||||
from tools.chat import ChatTool
|
||||
from tools.debug import DebugIssueTool
|
||||
|
||||
# Test various tools still require models
|
||||
tools_requiring_models = [ChatTool(), DebugIssueTool(), AnalyzeTool()]
|
||||
tools_requiring_models = [ChatTool(), AnalyzeTool()]
|
||||
|
||||
for tool in tools_requiring_models:
|
||||
assert tool.requires_model() is True, f"{tool.get_name()} should require model resolution"
|
||||
|
||||
# Test tools that manage their own model calls
|
||||
tools_managing_own_models = [DebugIssueTool()]
|
||||
|
||||
for tool in tools_managing_own_models:
|
||||
assert tool.requires_model() is False, f"{tool.get_name()} should manage its own model calls"
|
||||
|
||||
@@ -70,35 +70,35 @@ class TestDynamicContextRequests:
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch("tools.base.BaseTool.get_model_provider")
|
||||
async def test_normal_response_not_parsed_as_clarification(self, mock_get_provider, debug_tool):
|
||||
"""Test that normal responses are not mistaken for clarification requests"""
|
||||
normal_response = """
|
||||
## Summary
|
||||
The error is caused by a missing import statement.
|
||||
|
||||
## Hypotheses (Ranked by Likelihood)
|
||||
|
||||
### 1. Missing Import (Confidence: High)
|
||||
**Root Cause:** The module 'utils' is not imported
|
||||
"""
|
||||
|
||||
mock_provider = create_mock_provider()
|
||||
mock_provider.get_provider_type.return_value = Mock(value="google")
|
||||
mock_provider.supports_thinking_mode.return_value = False
|
||||
mock_provider.generate_content.return_value = Mock(
|
||||
content=normal_response, usage={}, model_name="gemini-2.5-flash", metadata={}
|
||||
@patch("utils.conversation_memory.create_thread", return_value="debug-test-uuid")
|
||||
@patch("utils.conversation_memory.add_turn")
|
||||
async def test_normal_response_not_parsed_as_clarification(
|
||||
self, mock_add_turn, mock_create_thread, mock_get_provider, debug_tool
|
||||
):
|
||||
"""Test that normal investigation responses work correctly with new debug tool"""
|
||||
# The new debug tool uses self-investigation pattern
|
||||
result = await debug_tool.execute(
|
||||
{
|
||||
"step": "Investigating NameError: name 'utils' is not defined",
|
||||
"step_number": 1,
|
||||
"total_steps": 3,
|
||||
"next_step_required": True,
|
||||
"findings": "The error indicates 'utils' module is not imported or defined",
|
||||
"files_checked": ["/code/main.py"],
|
||||
"relevant_files": ["/code/main.py"],
|
||||
"hypothesis": "Missing import statement for utils module",
|
||||
"confidence": "high",
|
||||
}
|
||||
)
|
||||
mock_get_provider.return_value = mock_provider
|
||||
|
||||
result = await debug_tool.execute({"prompt": "NameError: name 'utils' is not defined"})
|
||||
|
||||
assert len(result) == 1
|
||||
|
||||
# Parse the response
|
||||
# Parse the response - new debug tool returns structured JSON
|
||||
response_data = json.loads(result[0].text)
|
||||
assert response_data["status"] == "success"
|
||||
assert response_data["content_type"] in ["text", "markdown"]
|
||||
assert "Summary" in response_data["content"]
|
||||
assert response_data["status"] == "investigation_in_progress"
|
||||
assert response_data["step_number"] == 1
|
||||
assert response_data["next_step_required"] is True
|
||||
assert response_data["investigation_status"]["current_confidence"] == "high"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch("tools.base.BaseTool.get_model_provider")
|
||||
@@ -125,17 +125,17 @@ class TestDynamicContextRequests:
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch("tools.base.BaseTool.get_model_provider")
|
||||
async def test_clarification_with_suggested_action(self, mock_get_provider, debug_tool):
|
||||
async def test_clarification_with_suggested_action(self, mock_get_provider, analyze_tool):
|
||||
"""Test clarification request with suggested next action"""
|
||||
clarification_json = json.dumps(
|
||||
{
|
||||
"status": "files_required_to_continue",
|
||||
"mandatory_instructions": "I need to see the database configuration to diagnose the connection error",
|
||||
"mandatory_instructions": "I need to see the database configuration to analyze the connection error",
|
||||
"files_needed": ["config/database.yml", "src/db.py"],
|
||||
"suggested_next_action": {
|
||||
"tool": "debug",
|
||||
"tool": "analyze",
|
||||
"args": {
|
||||
"prompt": "Connection timeout to database",
|
||||
"prompt": "Analyze database connection timeout issue",
|
||||
"files": [
|
||||
"/config/database.yml",
|
||||
"/src/db.py",
|
||||
@@ -154,9 +154,9 @@ class TestDynamicContextRequests:
|
||||
)
|
||||
mock_get_provider.return_value = mock_provider
|
||||
|
||||
result = await debug_tool.execute(
|
||||
result = await analyze_tool.execute(
|
||||
{
|
||||
"prompt": "Connection timeout to database",
|
||||
"prompt": "Analyze database connection timeout issue",
|
||||
"files": ["/absolute/logs/error.log"],
|
||||
}
|
||||
)
|
||||
@@ -168,7 +168,7 @@ class TestDynamicContextRequests:
|
||||
|
||||
clarification = json.loads(response_data["content"])
|
||||
assert "suggested_next_action" in clarification
|
||||
assert clarification["suggested_next_action"]["tool"] == "debug"
|
||||
assert clarification["suggested_next_action"]["tool"] == "analyze"
|
||||
|
||||
def test_tool_output_model_serialization(self):
|
||||
"""Test ToolOutput model serialization"""
|
||||
@@ -298,7 +298,7 @@ class TestCollaborationWorkflow:
|
||||
@patch("tools.base.BaseTool.get_model_provider")
|
||||
async def test_multi_step_collaboration(self, mock_get_provider):
|
||||
"""Test a multi-step collaboration workflow"""
|
||||
tool = DebugIssueTool()
|
||||
tool = AnalyzeTool()
|
||||
|
||||
# Step 1: Initial request returns clarification needed
|
||||
clarification_json = json.dumps(
|
||||
@@ -319,8 +319,8 @@ class TestCollaborationWorkflow:
|
||||
|
||||
result1 = await tool.execute(
|
||||
{
|
||||
"prompt": "Database connection timeout",
|
||||
"error_context": "Timeout after 30s",
|
||||
"prompt": "Analyze database connection timeout issue",
|
||||
"files": ["/logs/error.log"],
|
||||
}
|
||||
)
|
||||
|
||||
@@ -345,9 +345,8 @@ class TestCollaborationWorkflow:
|
||||
|
||||
result2 = await tool.execute(
|
||||
{
|
||||
"prompt": "Database connection timeout",
|
||||
"error_context": "Timeout after 30s",
|
||||
"files": ["/absolute/path/config.py"], # Additional context provided
|
||||
"prompt": "Analyze database connection timeout issue with config file",
|
||||
"files": ["/absolute/path/config.py", "/logs/error.log"], # Additional context provided
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@@ -157,10 +157,10 @@ async def test_unknown_tool_defaults_to_prompt():
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tool_parameter_standardization():
|
||||
"""Test that all tools use standardized 'prompt' parameter"""
|
||||
"""Test that most tools use standardized 'prompt' parameter (debug uses investigation pattern)"""
|
||||
from tools.analyze import AnalyzeRequest
|
||||
from tools.codereview import CodeReviewRequest
|
||||
from tools.debug import DebugIssueRequest
|
||||
from tools.debug import DebugInvestigationRequest
|
||||
from tools.precommit import PrecommitRequest
|
||||
from tools.thinkdeep import ThinkDeepRequest
|
||||
|
||||
@@ -168,9 +168,16 @@ async def test_tool_parameter_standardization():
|
||||
analyze = AnalyzeRequest(files=["/test.py"], prompt="What does this do?")
|
||||
assert analyze.prompt == "What does this do?"
|
||||
|
||||
# Test debug tool uses prompt
|
||||
debug = DebugIssueRequest(prompt="Error occurred")
|
||||
assert debug.prompt == "Error occurred"
|
||||
# Debug tool now uses self-investigation pattern with different fields
|
||||
debug = DebugInvestigationRequest(
|
||||
step="Investigating error",
|
||||
step_number=1,
|
||||
total_steps=3,
|
||||
next_step_required=True,
|
||||
findings="Initial error analysis",
|
||||
)
|
||||
assert debug.step == "Investigating error"
|
||||
assert debug.findings == "Initial error analysis"
|
||||
|
||||
# Test codereview tool uses prompt
|
||||
review = CodeReviewRequest(files=["/test.py"], prompt="Review this")
|
||||
|
||||
514
tests/test_debug.py
Normal file
514
tests/test_debug.py
Normal file
@@ -0,0 +1,514 @@
|
||||
"""
|
||||
Tests for the debug tool.
|
||||
"""
|
||||
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from tools.debug import DebugInvestigationRequest, DebugIssueTool
|
||||
from tools.models import ToolModelCategory
|
||||
|
||||
|
||||
class TestDebugTool:
|
||||
"""Test suite for DebugIssueTool."""
|
||||
|
||||
def test_tool_metadata(self):
    """Sanity-check the debug tool's name, description, and configuration."""
    debug_tool = DebugIssueTool()

    # Identity and advertised purpose
    assert debug_tool.get_name() == "debug"
    assert "DEBUG & ROOT CAUSE ANALYSIS" in debug_tool.get_description()

    # Analytical temperature (TEMPERATURE_ANALYTICAL) and reasoning category
    assert debug_tool.get_default_temperature() == 0.2
    assert debug_tool.get_model_category() == ToolModelCategory.EXTENDED_REASONING

    # The tool drives its own model calls, so no upfront model resolution
    assert debug_tool.requires_model() is False
||||
|
||||
def test_request_validation(self):
    """Exercise Pydantic validation for DebugInvestigationRequest."""
    # Minimal valid step: only the required investigation fields supplied.
    minimal = DebugInvestigationRequest(
        step="Investigating null pointer exception in UserService",
        step_number=1,
        total_steps=5,
        next_step_required=True,
        findings="Found that UserService.getUser() is called with null ID",
    )
    assert minimal.step == "Investigating null pointer exception in UserService"
    assert minimal.step_number == 1
    assert minimal.next_step_required is True
    # Confidence defaults to "low" when not supplied.
    assert minimal.confidence == "low"

    # Fully populated step including the optional tracking fields.
    detailed = DebugInvestigationRequest(
        step="Deep dive into getUser method implementation",
        step_number=2,
        total_steps=5,
        next_step_required=True,
        findings="Method doesn't validate input parameters",
        files_checked=["/src/UserService.java", "/src/UserController.java"],
        relevant_files=["/src/UserService.java"],
        relevant_methods=["UserService.getUser", "UserController.handleRequest"],
        hypothesis="Null ID passed from controller without validation",
        confidence="medium",
    )
    assert len(detailed.files_checked) == 2
    assert len(detailed.relevant_files) == 1
    assert detailed.confidence == "medium"

    # Omitting required fields must raise a validation error.
    with pytest.raises(ValueError):
        DebugInvestigationRequest()

    with pytest.raises(ValueError):
        DebugInvestigationRequest(step="test")
|
||||
|
||||
def test_input_schema_generation(self):
    """Verify the MCP input schema exposes investigation fields and hides model knobs."""
    schema = DebugIssueTool().get_input_schema()
    properties = schema["properties"]

    assert schema["type"] == "object"

    # Every investigation-tracking field must be advertised to the client.
    for field in (
        "step",
        "step_number",
        "total_steps",
        "next_step_required",
        "findings",
        "files_checked",
        "relevant_files",
        "relevant_methods",
        "hypothesis",
        "confidence",
        "backtrack_from_step",
        "continuation_id",
        "images",  # supported for visual debugging
    ):
        assert field in properties, f"schema missing property: {field}"

    # Model-selection knobs are managed internally and must stay hidden.
    for excluded in ("model", "temperature", "thinking_mode", "use_websearch"):
        assert excluded not in properties, f"schema should not expose: {excluded}"

    # Core step fields are mandatory.
    for required in ("step", "step_number", "total_steps", "next_step_required", "findings"):
        assert required in schema["required"], f"schema should require: {required}"
|
||||
|
||||
def test_model_category_for_debugging(self):
    """Debugging demands deep thinking, so the tool must use EXTENDED_REASONING."""
    assert DebugIssueTool().get_model_category() == ToolModelCategory.EXTENDED_REASONING
|
||||
|
||||
@pytest.mark.asyncio
async def test_execute_first_investigation_step(self):
    """First investigation step should open a thread and report initial status."""
    import json

    debug_tool = DebugIssueTool()
    step_one = {
        "step": "Investigating intermittent session validation failures in production",
        "step_number": 1,
        "total_steps": 5,
        "next_step_required": True,
        "findings": "Users report random session invalidation, occurs more during high traffic",
        "files_checked": ["/api/session_manager.py"],
        "relevant_files": ["/api/session_manager.py"],
    }

    # Conversation memory is mocked so no real thread storage is touched.
    with patch("utils.conversation_memory.create_thread", return_value="debug-uuid-123"), patch(
        "utils.conversation_memory.add_turn"
    ):
        result = await debug_tool.execute(step_one)

    # A single TextContent item carrying structured JSON is expected.
    assert len(result) == 1
    assert result[0].type == "text"
    payload = json.loads(result[0].text)

    assert payload["status"] == "investigation_in_progress"
    assert payload["step_number"] == 1
    assert payload["total_steps"] == 5
    assert payload["next_step_required"] is True
    assert payload["continuation_id"] == "debug-uuid-123"
    assert payload["investigation_status"]["files_checked"] == 1
    assert payload["investigation_status"]["relevant_files"] == 1
|
||||
|
||||
@pytest.mark.asyncio
async def test_execute_subsequent_investigation_step(self):
    """A follow-up step should accumulate findings on top of earlier state."""
    import json

    debug_tool = DebugIssueTool()

    # Seed state as if step 1 already ran.
    debug_tool.initial_issue = "Session validation failures"
    debug_tool.consolidated_findings["files_checked"].add("/api/session_manager.py")

    step_two = {
        "step": "Examining session cleanup method for concurrent modification issues",
        "step_number": 2,
        "total_steps": 5,
        "next_step_required": True,
        "findings": "Found dictionary modification during iteration in cleanup_expired_sessions",
        "files_checked": ["/api/session_manager.py", "/api/utils.py"],
        "relevant_files": ["/api/session_manager.py"],
        "relevant_methods": ["SessionManager.cleanup_expired_sessions"],
        "hypothesis": "Dictionary modified during iteration causing RuntimeError",
        "confidence": "high",
        "continuation_id": "debug-uuid-123",
    }

    with patch("utils.conversation_memory.add_turn"):
        result = await debug_tool.execute(step_two)

    assert len(result) == 1
    assert result[0].type == "text"
    payload = json.loads(result[0].text)

    assert payload["step_number"] == 2
    assert payload["next_step_required"] is True
    assert payload["continuation_id"] == "debug-uuid-123"
    # files_checked is cumulative: utils.py joins the seeded session_manager.py.
    assert payload["investigation_status"]["files_checked"] == 2
    assert payload["investigation_status"]["relevant_methods"] == 1
    assert payload["investigation_status"]["current_confidence"] == "high"
|
||||
|
||||
@pytest.mark.asyncio
async def test_execute_final_investigation_step(self):
    """Final step (next_step_required=False) should trigger expert analysis."""
    import json

    debug_tool = DebugIssueTool()

    # Replay of a two-step investigation already on record.
    debug_tool.initial_issue = "Session validation failures"
    debug_tool.investigation_history = [
        {
            "step_number": 1,
            "step": "Initial investigation of session validation failures",
            "findings": "Initial investigation",
            "files_checked": ["/api/utils.py"],
        },
        {
            "step_number": 2,
            "step": "Deeper analysis of session manager",
            "findings": "Found dictionary issue",
            "files_checked": ["/api/session_manager.py"],
        },
    ]
    debug_tool.consolidated_findings = {
        "files_checked": {"/api/session_manager.py", "/api/utils.py"},
        "relevant_files": {"/api/session_manager.py"},
        "relevant_methods": {"SessionManager.cleanup_expired_sessions"},
        "findings": ["Step 1: Initial investigation", "Step 2: Found dictionary issue"],
        "hypotheses": [{"step": 2, "hypothesis": "Dictionary modified during iteration", "confidence": "high"}],
        "images": [],
    }

    closing_step = {
        "step": "Confirmed the root cause and identified fix",
        "step_number": 3,
        "total_steps": 3,
        "next_step_required": False,  # terminates the investigation
        "findings": "Root cause confirmed: dictionary modification during iteration in cleanup method",
        "files_checked": ["/api/session_manager.py"],
        "relevant_files": ["/api/session_manager.py"],
        "relevant_methods": ["SessionManager.cleanup_expired_sessions"],
        "hypothesis": "Dictionary modification during iteration causes intermittent RuntimeError",
        "confidence": "high",
        "continuation_id": "debug-uuid-123",
    }

    # Canned expert-model verdict so no real provider call is made.
    expert_verdict = {
        "status": "analysis_complete",
        "summary": "Dictionary modification during iteration bug identified",
        "hypotheses": [
            {
                "name": "CONCURRENT_MODIFICATION",
                "confidence": "High",
                "root_cause": "Modifying dictionary while iterating",
                "minimal_fix": "Create list of keys to delete first",
            }
        ],
    }

    # Mock conversation memory, the expert call, and file embedding.
    with patch("utils.conversation_memory.add_turn"), patch.object(
        debug_tool, "_call_expert_analysis", return_value=expert_verdict
    ), patch.object(debug_tool, "_prepare_file_content_for_prompt", return_value=("file content", 100)):
        result = await debug_tool.execute(closing_step)

    assert len(result) == 1
    payload = json.loads(result[0].text)

    # Completion payload embeds the expert analysis and the full trail.
    assert payload["status"] == "calling_expert_analysis"
    assert payload["investigation_complete"] is True
    assert payload["expert_analysis"]["status"] == "analysis_complete"
    assert "complete_investigation" in payload
    assert payload["complete_investigation"]["steps_taken"] == 3  # includes the closing step
|
||||
|
||||
@pytest.mark.asyncio
async def test_execute_with_backtracking(self):
    """Backtracking should discard history at and after the given step number."""
    import json

    debug_tool = DebugIssueTool()

    # Two recorded steps; step 2 turned out to be a dead end.
    debug_tool.investigation_history = [
        {
            "step": "Initial investigation",
            "step_number": 1,
            "findings": "Initial findings",
            "files_checked": ["file1.py"],
            "relevant_files": [],
            "relevant_methods": [],
            "hypothesis": None,
            "confidence": "low",
        },
        {
            "step": "Wrong direction",
            "step_number": 2,
            "findings": "Wrong path",
            "files_checked": ["file2.py"],
            "relevant_files": [],
            "relevant_methods": [],
            "hypothesis": None,
            "confidence": "low",
        },
    ]
    debug_tool.consolidated_findings = {
        "files_checked": {"file1.py", "file2.py"},
        "relevant_files": set(),
        "relevant_methods": set(),
        "findings": ["Step 1: Initial findings", "Step 2: Wrong path"],
        "hypotheses": [],
        "images": [],
    }

    revision_step = {
        "step": "Backtracking to revise approach",
        "step_number": 3,
        "total_steps": 5,
        "next_step_required": True,
        "findings": "Taking a different investigation approach",
        "files_checked": ["file3.py"],
        "backtrack_from_step": 2,  # throw away step 2 onward
        "continuation_id": "debug-uuid-123",
    }

    with patch("utils.conversation_memory.add_turn"):
        result = await debug_tool.execute(revision_step)

    assert len(result) == 1
    payload = json.loads(result[0].text)

    assert payload["status"] == "investigation_in_progress"
    # History keeps step 1 plus the revising step; step 2 is gone.
    assert len(debug_tool.investigation_history) == 2
    assert debug_tool.investigation_history[0]["step_number"] == 1
    assert debug_tool.investigation_history[1]["step_number"] == 3
|
||||
|
||||
@pytest.mark.asyncio
async def test_execute_adjusts_total_steps(self):
    """If step_number overtakes total_steps, the estimate is raised to match."""
    import json

    debug_tool = DebugIssueTool()
    overflow_step = {
        "step": "Additional investigation needed",
        "step_number": 8,
        "total_steps": 5,  # deliberately smaller than step_number
        "next_step_required": True,
        "findings": "More complexity discovered",
        "continuation_id": "debug-uuid-123",
    }

    with patch("utils.conversation_memory.add_turn"):
        result = await debug_tool.execute(overflow_step)

    assert len(result) == 1
    payload = json.loads(result[0].text)

    # The tool bumps total_steps up to the current step number.
    assert payload["total_steps"] == 8
    assert payload["step_number"] == 8
|
||||
|
||||
@pytest.mark.asyncio
async def test_execute_error_handling(self):
    """Invalid arguments should surface as a structured failure response."""
    import json

    debug_tool = DebugIssueTool()

    # Only "step" is supplied; the other required fields are missing.
    result = await debug_tool.execute({"step": "Invalid request"})

    assert len(result) == 1
    payload = json.loads(result[0].text)

    assert payload["status"] == "investigation_failed"
    assert "error" in payload
|
||||
|
||||
def test_prepare_investigation_summary(self):
    """The summary must report counts, progression, and hypothesis evolution."""
    tool = DebugIssueTool()
    tool.consolidated_findings = {
        "files_checked": {"file1.py", "file2.py", "file3.py"},
        "relevant_files": {"file1.py", "file2.py"},
        "relevant_methods": {"Class1.method1", "Class2.method2"},
        "findings": [
            "Step 1: Initial investigation findings",
            "Step 2: Discovered potential issue",
            "Step 3: Confirmed root cause",
        ],
        "hypotheses": [
            {"step": 1, "hypothesis": "Initial hypothesis", "confidence": "low"},
            {"step": 2, "hypothesis": "Refined hypothesis", "confidence": "medium"},
            {"step": 3, "hypothesis": "Final hypothesis", "confidence": "high"},
        ],
        "images": [],
    }

    summary = tool._prepare_investigation_summary()

    # Every section and count derived from the findings above must be present.
    expected_fragments = (
        "SYSTEMATIC INVESTIGATION SUMMARY",
        "Files examined: 3",
        "Relevant files identified: 2",
        "Methods/functions involved: 2",
        "INVESTIGATION PROGRESSION",
        "Step 1:",
        "Step 2:",
        "Step 3:",
        "HYPOTHESIS EVOLUTION",
        "low confidence",
        "medium confidence",
        "high confidence",
    )
    for fragment in expected_fragments:
        assert fragment in summary
|
||||
def test_extract_error_context(self):
    """Error-bearing findings are extracted; benign findings are filtered out."""
    tool = DebugIssueTool()
    tool.consolidated_findings = {
        "findings": [
            "Step 1: Found no issues initially",
            "Step 2: Discovered ERROR: Dictionary size changed during iteration",
            "Step 3: Stack trace shows RuntimeError in cleanup method",
            "Step 4: Exception occurs intermittently",
        ],
    }

    error_context = tool._extract_error_context()

    assert error_context is not None
    # Each error-related finding must make it into the extracted context.
    for expected in (
        "ERROR: Dictionary size changed",
        "Stack trace shows RuntimeError",
        "Exception occurs intermittently",
    ):
        assert expected in error_context
    # Non-error findings must not be included.
    assert "Found no issues initially" not in error_context
|
||||
def test_reprocess_consolidated_findings(self):
    """Rebuilding consolidated findings from history must merge sets and keep hypotheses."""
    tool = DebugIssueTool()
    tool.investigation_history = [
        {
            "step_number": 1,
            "findings": "Initial findings",
            "files_checked": ["file1.py"],
            "relevant_files": ["file1.py"],
            "relevant_methods": ["method1"],
            "hypothesis": "Initial hypothesis",
            "confidence": "low",
        },
        {
            # Second step carries no hypothesis and no relevant files.
            "step_number": 2,
            "findings": "Second findings",
            "files_checked": ["file2.py"],
            "relevant_files": [],
            "relevant_methods": ["method2"],
        },
    ]

    tool._reprocess_consolidated_findings()

    merged = tool.consolidated_findings
    assert merged["files_checked"] == {"file1.py", "file2.py"}
    assert merged["relevant_files"] == {"file1.py"}
    assert merged["relevant_methods"] == {"method1", "method2"}
    assert len(merged["findings"]) == 2
    # Only the step that actually stated a hypothesis contributes one.
    assert len(merged["hypotheses"]) == 1
    assert merged["hypotheses"][0]["hypothesis"] == "Initial hypothesis"
|
||||
|
||||
# Integration test
|
||||
class TestDebugToolIntegration:
    """Integration tests for debug tool."""

    def setup_method(self):
        """Attach a test model context to a fresh tool before each test."""
        from utils.model_context import ModelContext

        self.tool = DebugIssueTool()
        self.tool._model_context = ModelContext("flash")  # Test model

    @pytest.mark.asyncio
    async def test_complete_investigation_flow(self):
        """A first investigation step produces an in-progress response with a thread ID."""
        import json

        request = {
            "step": "Investigating memory leak in data processing pipeline",
            "step_number": 1,
            "total_steps": 3,
            "next_step_required": True,
            "findings": "High memory usage observed during batch processing",
            "files_checked": ["/processor/main.py"],
        }

        # Conversation memory is fully mocked out for the integration run.
        with patch("utils.conversation_memory.create_thread", return_value="debug-flow-uuid"), \
                patch("utils.conversation_memory.add_turn"):
            output = await self.tool.execute(request)

        # Single TextContent item wrapping a JSON payload.
        assert len(output) == 1
        payload = json.loads(output[0].text)

        assert payload["status"] == "investigation_in_progress"
        assert payload["step_number"] == 1
        assert payload["continuation_id"] == "debug-flow-uuid"
363
tests/test_debug_comprehensive_workflow.py
Normal file
363
tests/test_debug_comprehensive_workflow.py
Normal file
@@ -0,0 +1,363 @@
|
||||
"""
|
||||
Comprehensive test demonstrating debug tool's self-investigation pattern
|
||||
and continuation ID functionality working together end-to-end.
|
||||
"""
|
||||
|
||||
import json
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from tools.debug import DebugIssueTool
|
||||
from utils.conversation_memory import (
|
||||
ConversationTurn,
|
||||
ThreadContext,
|
||||
build_conversation_history,
|
||||
get_conversation_file_list,
|
||||
)
|
||||
|
||||
|
||||
class TestDebugComprehensiveWorkflow:
    """End-to-end checks: investigation -> expert analysis -> cross-tool continuation."""

    @pytest.mark.asyncio
    async def test_full_debug_workflow_with_continuation(self):
        """Run a three-step investigation, trigger expert analysis, then hand off to another tool."""
        tool = DebugIssueTool()

        # --- Step 1: open the investigation thread ---
        with patch("utils.conversation_memory.create_thread", return_value="debug-workflow-uuid"), \
                patch("utils.conversation_memory.add_turn") as mock_add_turn:
            result1 = await tool.execute(
                {
                    "step": "Investigating memory leak in user session handler",
                    "step_number": 1,
                    "total_steps": 3,
                    "next_step_required": True,
                    "findings": "High memory usage detected in session handler",
                    "files_checked": ["/api/sessions.py"],
                    "images": ["/screenshots/memory_profile.png"],
                }
            )

        assert len(result1) == 1
        response1 = json.loads(result1[0].text)
        assert response1["status"] == "investigation_in_progress"
        assert response1["step_number"] == 1
        assert response1["continuation_id"] == "debug-workflow-uuid"

        # A conversation turn must have been recorded for this step.
        assert mock_add_turn.called
        call_args = mock_add_turn.call_args
        if call_args:
            # Arguments may arrive positionally or as keywords depending on the mock API.
            positional = call_args.args if hasattr(call_args, "args") else call_args[0]
            if positional and len(positional) >= 3:
                assert positional[0] == "debug-workflow-uuid"
                assert positional[1] == "assistant"
                assert json.loads(positional[2])["status"] == "investigation_in_progress"

        # --- Step 2: continue with concrete findings and a high-confidence hypothesis ---
        with patch("utils.conversation_memory.add_turn"):
            result2 = await tool.execute(
                {
                    "step": "Found circular references in session cache preventing garbage collection",
                    "step_number": 2,
                    "total_steps": 3,
                    "next_step_required": True,
                    "findings": "Session objects hold references to themselves through event handlers",
                    "files_checked": ["/api/sessions.py", "/api/cache.py"],
                    "relevant_files": ["/api/sessions.py"],
                    "relevant_methods": ["SessionHandler.__init__", "SessionHandler.add_event_listener"],
                    "hypothesis": "Circular references preventing garbage collection",
                    "confidence": "high",
                    "continuation_id": "debug-workflow-uuid",
                }
            )

        response2 = json.loads(result2[0].text)
        assert response2["status"] == "investigation_in_progress"
        assert response2["step_number"] == 2
        progress = response2["investigation_status"]
        assert progress["files_checked"] == 2
        assert progress["relevant_methods"] == 2
        assert progress["current_confidence"] == "high"

        # --- Step 3: final step triggers the (mocked) expert analysis ---
        mock_expert_response = {
            "status": "analysis_complete",
            "summary": "Memory leak caused by circular references in session event handlers",
            "hypotheses": [
                {
                    "name": "CIRCULAR_REFERENCE_LEAK",
                    "confidence": "High (95%)",
                    "evidence": ["Event handlers hold strong references", "No weak references used"],
                    "root_cause": "SessionHandler stores callbacks that reference the handler itself",
                    "potential_fixes": [
                        {
                            "description": "Use weakref for event handler callbacks",
                            "files_to_modify": ["/api/sessions.py"],
                            "complexity": "Low",
                        }
                    ],
                    "minimal_fix": "Replace self references in callbacks with weakref.ref(self)",
                }
            ],
            "investigation_summary": {
                "pattern": "Classic circular reference memory leak",
                "severity": "High - causes unbounded memory growth",
                "recommended_action": "Implement weakref solution immediately",
            },
        }

        with patch("utils.conversation_memory.add_turn"), \
                patch.object(tool, "_call_expert_analysis", return_value=mock_expert_response):
            result3 = await tool.execute(
                {
                    "step": "Investigation complete - confirmed circular reference memory leak pattern",
                    "step_number": 3,
                    "total_steps": 3,
                    "next_step_required": False,  # Triggers expert analysis
                    "findings": "Circular references between SessionHandler and event callbacks prevent GC",
                    "files_checked": ["/api/sessions.py", "/api/cache.py"],
                    "relevant_files": ["/api/sessions.py"],
                    "relevant_methods": ["SessionHandler.__init__", "SessionHandler.add_event_listener"],
                    "hypothesis": "Circular references in event handler callbacks causing memory leak",
                    "confidence": "high",
                    "continuation_id": "debug-workflow-uuid",
                    "model": "flash",
                }
            )

        response3 = json.loads(result3[0].text)
        assert response3["status"] == "calling_expert_analysis"
        assert response3["investigation_complete"] is True
        assert "expert_analysis" in response3

        expert = response3["expert_analysis"]
        assert expert["status"] == "analysis_complete"
        assert "CIRCULAR_REFERENCE_LEAK" in expert["hypotheses"][0]["name"]
        assert "weakref" in expert["hypotheses"][0]["minimal_fix"]

        assert "complete_investigation" in response3
        complete = response3["complete_investigation"]
        assert complete["steps_taken"] == 3
        assert "/api/sessions.py" in complete["files_examined"]
        assert "SessionHandler.add_event_listener" in complete["relevant_methods"]

        # --- Step 4: hand the finished thread to another tool via continuation ---
        debug_context = ThreadContext(
            thread_id="debug-workflow-uuid",
            created_at="2025-01-01T00:00:00Z",
            last_updated_at="2025-01-01T00:10:00Z",
            tool_name="debug",
            turns=[
                ConversationTurn(
                    role="user",
                    content="Step 1: Investigating memory leak",
                    timestamp="2025-01-01T00:01:00Z",
                    tool_name="debug",
                    files=["/api/sessions.py"],
                    images=["/screenshots/memory_profile.png"],
                ),
                ConversationTurn(
                    role="assistant",
                    content=json.dumps(response1),
                    timestamp="2025-01-01T00:02:00Z",
                    tool_name="debug",
                ),
                ConversationTurn(
                    role="user",
                    content="Step 2: Found circular references",
                    timestamp="2025-01-01T00:03:00Z",
                    tool_name="debug",
                ),
                ConversationTurn(
                    role="assistant",
                    content=json.dumps(response2),
                    timestamp="2025-01-01T00:04:00Z",
                    tool_name="debug",
                ),
                ConversationTurn(
                    role="user",
                    content="Step 3: Investigation complete",
                    timestamp="2025-01-01T00:05:00Z",
                    tool_name="debug",
                ),
                ConversationTurn(
                    role="assistant",
                    content=json.dumps(response3),
                    timestamp="2025-01-01T00:06:00Z",
                    tool_name="debug",
                ),
            ],
            initial_context={},
        )

        with patch("utils.conversation_memory.get_thread", return_value=debug_context):
            def fake_read(file_path):
                # Stand-in for file I/O: canned (content, token_count) per path.
                if file_path == "/api/sessions.py":
                    return "# SessionHandler with circular refs\nclass SessionHandler:\n pass", 20
                if file_path == "/screenshots/memory_profile.png":
                    # Images return empty string for content but 0 tokens
                    return "", 0
                if file_path == "/api/cache.py":
                    return "# Cache module", 5
                return "", 0

            from utils.model_context import ModelContext

            model_context = ModelContext("flash")
            history, _tokens = build_conversation_history(debug_context, model_context, read_files_func=fake_read)

            # Provenance header.
            assert "=== CONVERSATION HISTORY (CONTINUATION) ===" in history
            assert "Thread: debug-workflow-uuid" in history
            assert "Tool: debug" in history

            # Investigation progression survives the handoff.
            assert "Step 1: Investigating memory leak" in history
            assert "Step 2: Found circular references" in history
            assert "Step 3: Investigation complete" in history

            # Expert analysis content is carried along verbatim.
            assert "CIRCULAR_REFERENCE_LEAK" in history
            assert "weakref" in history
            assert "memory leak" in history

            # Files are referenced in the conversation history.
            assert "/api/sessions.py" in history

            # The paths are not real files so their content is not embedded,
            # but the expert analysis text itself must appear.
            assert "Memory leak caused by circular references" in history

            # The aggregated file list includes everything touched during investigation.
            file_list = get_conversation_file_list(debug_context)
            assert "/api/sessions.py" in file_list

    @pytest.mark.asyncio
    async def test_debug_investigation_state_machine(self):
        """Exercise the investigation state transitions from first step to expert analysis."""
        tool = DebugIssueTool()
        states = []

        # Opening state.
        with patch("utils.conversation_memory.create_thread", return_value="state-test-uuid"), \
                patch("utils.conversation_memory.add_turn"):
            result = await tool.execute(
                {
                    "step": "Starting investigation",
                    "step_number": 1,
                    "total_steps": 2,
                    "next_step_required": True,
                    "findings": "Initial findings",
                }
            )
            states.append(json.loads(result[0].text))

        assert states[0]["status"] == "investigation_in_progress"
        assert states[0]["step_number"] == 1
        assert states[0]["next_step_required"] is True

        # Closing state triggers the (mocked) expert analysis.
        mock_expert_response = {"status": "analysis_complete", "summary": "Test complete"}

        with patch("utils.conversation_memory.add_turn"), \
                patch.object(tool, "_call_expert_analysis", return_value=mock_expert_response):
            result = await tool.execute(
                {
                    "step": "Final findings",
                    "step_number": 2,
                    "total_steps": 2,
                    "next_step_required": False,
                    "findings": "Complete findings",
                    "continuation_id": "state-test-uuid",
                    "model": "flash",
                }
            )
            states.append(json.loads(result[0].text))

        assert states[1]["status"] == "calling_expert_analysis"
        assert states[1]["investigation_complete"] is True
        assert "expert_analysis" in states[1]

    @pytest.mark.asyncio
    async def test_debug_backtracking_preserves_continuation(self):
        """Backtracking must keep the continuation ID and the adjusted plan intact."""
        tool = DebugIssueTool()

        # Start the investigation.
        with patch("utils.conversation_memory.create_thread", return_value="backtrack-test-uuid"), \
                patch("utils.conversation_memory.add_turn"):
            result1 = await tool.execute(
                {
                    "step": "Initial hypothesis",
                    "step_number": 1,
                    "total_steps": 3,
                    "next_step_required": True,
                    "findings": "Initial findings",
                }
            )

        response1 = json.loads(result1[0].text)
        continuation_id = response1["continuation_id"]

        # Step 2 heads down a dead end.
        with patch("utils.conversation_memory.add_turn"):
            await tool.execute(
                {
                    "step": "Wrong hypothesis",
                    "step_number": 2,
                    "total_steps": 3,
                    "next_step_required": True,
                    "findings": "Dead end",
                    "hypothesis": "Wrong initial hypothesis",
                    "confidence": "low",
                    "continuation_id": continuation_id,
                }
            )

        # Step 3 backtracks from step 2 and extends the plan by one step.
        with patch("utils.conversation_memory.add_turn"):
            result3 = await tool.execute(
                {
                    "step": "Backtracking - new hypothesis",
                    "step_number": 3,
                    "total_steps": 4,  # Adjusted total
                    "next_step_required": True,
                    "findings": "New direction",
                    "hypothesis": "New hypothesis after backtracking",
                    "confidence": "medium",
                    "backtrack_from_step": 2,
                    "continuation_id": continuation_id,
                }
            )

        response3 = json.loads(result3[0].text)

        # Continuation survives the backtrack; the plan grew by one step.
        assert response3["continuation_id"] == continuation_id
        assert response3["step_number"] == 3
        assert response3["total_steps"] == 4

        # Backtracking resets the file tally while keeping the new confidence.
        assert response3["investigation_status"]["files_checked"] == 0  # Reset after backtrack
        assert response3["investigation_status"]["current_confidence"] == "medium"
|
||||
336
tests/test_debug_continuation.py
Normal file
336
tests/test_debug_continuation.py
Normal file
@@ -0,0 +1,336 @@
|
||||
"""
|
||||
Test debug tool continuation ID functionality and conversation history formatting.
|
||||
"""
|
||||
|
||||
import json
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from tools.debug import DebugIssueTool
|
||||
from utils.conversation_memory import (
|
||||
ConversationTurn,
|
||||
ThreadContext,
|
||||
build_conversation_history,
|
||||
get_conversation_file_list,
|
||||
)
|
||||
|
||||
|
||||
class TestDebugContinuation:
    """Continuation-ID handling and conversation-history formatting for the debug tool."""

    @pytest.mark.asyncio
    async def test_debug_creates_continuation_id(self):
        """A first investigation step must mint a continuation ID."""
        tool = DebugIssueTool()

        with patch("utils.conversation_memory.create_thread", return_value="debug-test-uuid-123"), \
                patch("utils.conversation_memory.add_turn"):
            result = await tool.execute(
                {
                    "step": "Investigating null pointer exception",
                    "step_number": 1,
                    "total_steps": 3,
                    "next_step_required": True,
                    "findings": "Initial investigation shows null reference in UserService",
                    "files_checked": ["/api/UserService.java"],
                }
            )

        assert len(result) == 1
        response = json.loads(result[0].text)
        assert response["status"] == "investigation_in_progress"
        assert response["continuation_id"] == "debug-test-uuid-123"

    def test_debug_conversation_formatting(self):
        """Structured debug output must be preserved verbatim in conversation history."""
        # A representative mid-investigation payload from the debug tool.
        debug_output = {
            "status": "investigation_in_progress",
            "step_number": 2,
            "total_steps": 3,
            "next_step_required": True,
            "investigation_status": {
                "files_checked": 3,
                "relevant_files": 2,
                "relevant_methods": 1,
                "hypotheses_formed": 1,
                "images_collected": 0,
                "current_confidence": "medium",
            },
            "output": {"instructions": "Continue systematic investigation.", "format": "systematic_investigation"},
            "continuation_id": "debug-test-uuid-123",
            "next_steps": "Continue investigation with step 3.",
        }

        context = ThreadContext(
            thread_id="debug-test-uuid-123",
            created_at="2025-01-01T00:00:00Z",
            last_updated_at="2025-01-01T00:05:00Z",
            tool_name="debug",
            turns=[
                ConversationTurn(
                    role="user",
                    content="Step 1: Investigating null pointer exception",
                    timestamp="2025-01-01T00:01:00Z",
                    tool_name="debug",
                    files=["/api/UserService.java"],
                ),
                ConversationTurn(
                    role="assistant",
                    content=json.dumps(debug_output, indent=2),
                    timestamp="2025-01-01T00:02:00Z",
                    tool_name="debug",
                    files=["/api/UserService.java", "/api/UserController.java"],
                ),
            ],
            initial_context={
                "step": "Investigating null pointer exception",
                "step_number": 1,
                "total_steps": 3,
                "next_step_required": True,
                "findings": "Initial investigation",
            },
        )

        def fake_read(file_path):
            # Avoid real file I/O: serve canned (content, tokens) per path.
            if file_path == "/api/UserService.java":
                return "// UserService.java\npublic class UserService {\n // code...\n}", 10
            if file_path == "/api/UserController.java":
                return "// UserController.java\npublic class UserController {\n // code...\n}", 10
            return "", 0

        from utils.model_context import ModelContext

        model_context = ModelContext("flash")
        history, _tokens = build_conversation_history(context, model_context, read_files_func=fake_read)

        # Provenance header.
        assert "=== CONVERSATION HISTORY (CONTINUATION) ===" in history
        assert "Thread: debug-test-uuid-123" in history
        assert "Tool: debug" in history

        # Both files referenced by the turns appear.
        assert "UserService.java" in history
        assert "UserController.java" in history

        # The structured debug payload is embedded as-is.
        assert "investigation_in_progress" in history
        assert '"step_number": 2' in history
        assert '"files_checked": 3' in history
        assert '"current_confidence": "medium"' in history

    def test_debug_continuation_preserves_investigation_state(self):
        """Investigation state must survive continuation into other tools."""
        context = ThreadContext(
            thread_id="debug-test-uuid-123",
            created_at="2025-01-01T00:00:00Z",
            last_updated_at="2025-01-01T00:10:00Z",
            tool_name="debug",
            turns=[
                ConversationTurn(
                    role="user",
                    content="Step 1: Initial investigation",
                    timestamp="2025-01-01T00:01:00Z",
                    tool_name="debug",
                    files=["/api/SessionManager.java"],
                ),
                ConversationTurn(
                    role="assistant",
                    content=json.dumps(
                        {
                            "status": "investigation_in_progress",
                            "step_number": 1,
                            "total_steps": 4,
                            "next_step_required": True,
                            "investigation_status": {"files_checked": 1, "relevant_files": 1},
                            "continuation_id": "debug-test-uuid-123",
                        }
                    ),
                    timestamp="2025-01-01T00:02:00Z",
                    tool_name="debug",
                ),
                ConversationTurn(
                    role="user",
                    content="Step 2: Found dictionary modification issue",
                    timestamp="2025-01-01T00:03:00Z",
                    tool_name="debug",
                    files=["/api/SessionManager.java", "/api/utils.py"],
                ),
                ConversationTurn(
                    role="assistant",
                    content=json.dumps(
                        {
                            "status": "investigation_in_progress",
                            "step_number": 2,
                            "total_steps": 4,
                            "next_step_required": True,
                            "investigation_status": {
                                "files_checked": 2,
                                "relevant_files": 1,
                                "relevant_methods": 1,
                                "hypotheses_formed": 1,
                                "current_confidence": "high",
                            },
                            "continuation_id": "debug-test-uuid-123",
                        }
                    ),
                    timestamp="2025-01-01T00:04:00Z",
                    tool_name="debug",
                ),
            ],
            initial_context={},
        )

        # File ordering reflects first-mention prioritization.
        file_list = get_conversation_file_list(context)
        assert file_list == ["/api/SessionManager.java", "/api/utils.py"]

        def fake_read(file_path):
            return f"// {file_path}\n// Mock content", 5

        from utils.model_context import ModelContext

        model_context = ModelContext("flash")
        history, _tokens = build_conversation_history(context, model_context, read_files_func=fake_read)

        # Progression and confidence markers survive the rebuild.
        assert "Step 1: Initial investigation" in history
        assert "Step 2: Found dictionary modification issue" in history
        assert '"step_number": 1' in history
        assert '"step_number": 2' in history
        assert '"current_confidence": "high"' in history

    @pytest.mark.asyncio
    async def test_debug_to_analyze_continuation(self):
        """A finished debug thread must be fully readable by a follow-up analyze call."""
        debug_context = ThreadContext(
            thread_id="debug-analyze-uuid-123",
            created_at="2025-01-01T00:00:00Z",
            last_updated_at="2025-01-01T00:10:00Z",
            tool_name="debug",
            turns=[
                ConversationTurn(
                    role="user",
                    content="Final investigation step",
                    timestamp="2025-01-01T00:01:00Z",
                    tool_name="debug",
                    files=["/api/SessionManager.java"],
                ),
                ConversationTurn(
                    role="assistant",
                    content=json.dumps(
                        {
                            "status": "calling_expert_analysis",
                            "investigation_complete": True,
                            "expert_analysis": {
                                "status": "analysis_complete",
                                "summary": "Dictionary modification during iteration bug",
                                "hypotheses": [
                                    {
                                        "name": "CONCURRENT_MODIFICATION",
                                        "confidence": "High",
                                        "root_cause": "Modifying dict while iterating",
                                        "minimal_fix": "Create list of keys first",
                                    }
                                ],
                            },
                            "complete_investigation": {
                                "initial_issue": "Session validation failures",
                                "steps_taken": 3,
                                "files_examined": ["/api/SessionManager.java"],
                                "relevant_methods": ["SessionManager.cleanup_expired_sessions"],
                            },
                        }
                    ),
                    timestamp="2025-01-01T00:02:00Z",
                    tool_name="debug",
                ),
            ],
            initial_context={},
        )

        with patch("utils.conversation_memory.get_thread", return_value=debug_context):
            def fake_read(file_path):
                return "// SessionManager.java\n// cleanup_expired_sessions method", 10

            from utils.model_context import ModelContext

            model_context = ModelContext("flash")
            history, _tokens = build_conversation_history(debug_context, model_context, read_files_func=fake_read)

            # The analyze tool sees the expert findings verbatim.
            assert "calling_expert_analysis" in history
            assert "CONCURRENT_MODIFICATION" in history
            assert "Dictionary modification during iteration bug" in history
            assert "SessionManager.cleanup_expired_sessions" in history

            # Provenance is explicit about the originating tool and thread.
            assert "Thread: debug-analyze-uuid-123" in history
            assert "Tool: debug" in history  # Shows original tool

    def test_debug_planner_style_formatting(self):
        """Debug output should mirror planner-style structured formatting in history."""
        context = ThreadContext(
            thread_id="debug-format-uuid-123",
            created_at="2025-01-01T00:00:00Z",
            last_updated_at="2025-01-01T00:15:00Z",
            tool_name="debug",
            turns=[
                ConversationTurn(
                    role="user",
                    content="Step 1: Initial error analysis",
                    timestamp="2025-01-01T00:01:00Z",
                    tool_name="debug",
                ),
                ConversationTurn(
                    role="assistant",
                    content=json.dumps(
                        {
                            "status": "investigation_in_progress",
                            "step_number": 1,
                            "total_steps": 3,
                            "next_step_required": True,
                            "output": {
                                "instructions": "Continue systematic investigation.",
                                "format": "systematic_investigation",
                            },
                            "continuation_id": "debug-format-uuid-123",
                        },
                        indent=2,
                    ),
                    timestamp="2025-01-01T00:02:00Z",
                    tool_name="debug",
                ),
            ],
            initial_context={},
        )

        from utils.model_context import ModelContext

        model_context = ModelContext("flash")
        history, _ = build_conversation_history(context, model_context, read_files_func=lambda x: ("", 0))

        # Structured JSON and turn markers are preserved.
        assert '"status": "investigation_in_progress"' in history
        assert '"format": "systematic_investigation"' in history
        assert "--- Turn 1 (Claude using debug) ---" in history
        assert "--- Turn 2 (Gemini using debug" in history

        # The JSON block for turn 2 remains parseable by downstream tools,
        # so other tools can understand the investigation state.
        turn_2_start = history.find("--- Turn 2 (Gemini using debug")
        turn_2_content = history[turn_2_start:]
        assert "{\n" in turn_2_content  # JSON formatting preserved
        assert '"continuation_id"' in turn_2_content
|
||||
@@ -19,7 +19,8 @@ from config import MCP_PROMPT_SIZE_LIMIT
|
||||
from tools.analyze import AnalyzeTool
|
||||
from tools.chat import ChatTool
|
||||
from tools.codereview import CodeReviewTool
|
||||
from tools.debug import DebugIssueTool
|
||||
|
||||
# from tools.debug import DebugIssueTool # Commented out - debug tool refactored
|
||||
from tools.precommit import Precommit
|
||||
from tools.thinkdeep import ThinkDeepTool
|
||||
|
||||
@@ -250,25 +251,30 @@ class TestLargePromptHandling:
|
||||
# The core fix ensures large prompts are detected at the right time
|
||||
assert output["status"] in ["success", "files_required_to_continue", "resend_prompt"]
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_debug_large_error_description(self, large_prompt):
|
||||
"""Test that debug tool detects large error_description."""
|
||||
tool = DebugIssueTool()
|
||||
result = await tool.execute({"prompt": large_prompt})
|
||||
# NOTE: Debug tool tests have been commented out because the debug tool has been
|
||||
# refactored to use a self-investigation pattern instead of accepting a prompt field.
|
||||
# The new debug tool requires fields like: step, step_number, total_steps, next_step_required, findings
|
||||
# and doesn't have the "resend_prompt" functionality for large prompts.
|
||||
|
||||
assert len(result) == 1
|
||||
output = json.loads(result[0].text)
|
||||
assert output["status"] == "resend_prompt"
|
||||
# @pytest.mark.asyncio
|
||||
# async def test_debug_large_error_description(self, large_prompt):
|
||||
# """Test that debug tool detects large error_description."""
|
||||
# tool = DebugIssueTool()
|
||||
# result = await tool.execute({"prompt": large_prompt})
|
||||
#
|
||||
# assert len(result) == 1
|
||||
# output = json.loads(result[0].text)
|
||||
# assert output["status"] == "resend_prompt"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_debug_large_error_context(self, large_prompt, normal_prompt):
|
||||
"""Test that debug tool detects large error_context."""
|
||||
tool = DebugIssueTool()
|
||||
result = await tool.execute({"prompt": normal_prompt, "error_context": large_prompt})
|
||||
|
||||
assert len(result) == 1
|
||||
output = json.loads(result[0].text)
|
||||
assert output["status"] == "resend_prompt"
|
||||
# @pytest.mark.asyncio
|
||||
# async def test_debug_large_error_context(self, large_prompt, normal_prompt):
|
||||
# """Test that debug tool detects large error_context."""
|
||||
# tool = DebugIssueTool()
|
||||
# result = await tool.execute({"prompt": normal_prompt, "error_context": large_prompt})
|
||||
#
|
||||
# assert len(result) == 1
|
||||
# output = json.loads(result[0].text)
|
||||
# assert output["status"] == "resend_prompt"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_analyze_large_question(self, large_prompt):
|
||||
|
||||
@@ -13,7 +13,8 @@ import pytest
|
||||
from tools.analyze import AnalyzeTool
|
||||
from tools.chat import ChatTool
|
||||
from tools.codereview import CodeReviewTool
|
||||
from tools.debug import DebugIssueTool
|
||||
|
||||
# from tools.debug import DebugIssueTool # Commented out - debug tool refactored
|
||||
from tools.precommit import Precommit
|
||||
from tools.thinkdeep import ThinkDeepTool
|
||||
|
||||
@@ -182,33 +183,37 @@ class TestPromptRegression:
|
||||
output = json.loads(result[0].text)
|
||||
assert output["status"] == "success"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_debug_normal_error(self, mock_model_response):
|
||||
"""Test debug tool with normal error description."""
|
||||
tool = DebugIssueTool()
|
||||
# NOTE: Debug tool test has been commented out because the debug tool has been
|
||||
# refactored to use a self-investigation pattern instead of accepting prompt/error_context fields.
|
||||
# The new debug tool requires fields like: step, step_number, total_steps, next_step_required, findings
|
||||
|
||||
with patch.object(tool, "get_model_provider") as mock_get_provider:
|
||||
mock_provider = MagicMock()
|
||||
mock_provider.get_provider_type.return_value = MagicMock(value="google")
|
||||
mock_provider.supports_thinking_mode.return_value = False
|
||||
mock_provider.generate_content.return_value = mock_model_response(
|
||||
"Root cause: The variable is undefined. Fix: Initialize it..."
|
||||
)
|
||||
mock_get_provider.return_value = mock_provider
|
||||
|
||||
result = await tool.execute(
|
||||
{
|
||||
"prompt": "TypeError: Cannot read property 'name' of undefined",
|
||||
"error_context": "at line 42 in user.js\n console.log(user.name)",
|
||||
"runtime_info": "Node.js v16.14.0",
|
||||
}
|
||||
)
|
||||
|
||||
assert len(result) == 1
|
||||
output = json.loads(result[0].text)
|
||||
assert output["status"] == "success"
|
||||
assert "Next Steps:" in output["content"]
|
||||
assert "Root cause" in output["content"]
|
||||
# @pytest.mark.asyncio
|
||||
# async def test_debug_normal_error(self, mock_model_response):
|
||||
# """Test debug tool with normal error description."""
|
||||
# tool = DebugIssueTool()
|
||||
#
|
||||
# with patch.object(tool, "get_model_provider") as mock_get_provider:
|
||||
# mock_provider = MagicMock()
|
||||
# mock_provider.get_provider_type.return_value = MagicMock(value="google")
|
||||
# mock_provider.supports_thinking_mode.return_value = False
|
||||
# mock_provider.generate_content.return_value = mock_model_response(
|
||||
# "Root cause: The variable is undefined. Fix: Initialize it..."
|
||||
# )
|
||||
# mock_get_provider.return_value = mock_provider
|
||||
#
|
||||
# result = await tool.execute(
|
||||
# {
|
||||
# "prompt": "TypeError: Cannot read property 'name' of undefined",
|
||||
# "error_context": "at line 42 in user.js\n console.log(user.name)",
|
||||
# "runtime_info": "Node.js v16.14.0",
|
||||
# }
|
||||
# )
|
||||
#
|
||||
# assert len(result) == 1
|
||||
# output = json.loads(result[0].text)
|
||||
# assert output["status"] == "success"
|
||||
# assert "Next Steps:" in output["content"]
|
||||
# assert "Root cause" in output["content"]
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_analyze_normal_question(self, mock_model_response):
|
||||
|
||||
@@ -6,7 +6,7 @@ import json
|
||||
|
||||
import pytest
|
||||
|
||||
from tools import AnalyzeTool, ChatTool, CodeReviewTool, DebugIssueTool, ThinkDeepTool
|
||||
from tools import AnalyzeTool, ChatTool, CodeReviewTool, ThinkDeepTool
|
||||
|
||||
|
||||
class TestThinkDeepTool:
|
||||
@@ -183,94 +183,6 @@ class TestCodeReviewTool:
|
||||
ModelProviderRegistry._instance = None
|
||||
|
||||
|
||||
class TestDebugIssueTool:
|
||||
"""Test the debug tool"""
|
||||
|
||||
@pytest.fixture
|
||||
def tool(self):
|
||||
return DebugIssueTool()
|
||||
|
||||
def test_tool_metadata(self, tool):
|
||||
"""Test tool metadata"""
|
||||
assert tool.get_name() == "debug"
|
||||
assert "DEBUG & ROOT CAUSE ANALYSIS" in tool.get_description()
|
||||
assert tool.get_default_temperature() == 0.2
|
||||
|
||||
schema = tool.get_input_schema()
|
||||
assert "prompt" in schema["properties"]
|
||||
assert schema["required"] == ["prompt"]
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_execute_with_context(self, tool):
|
||||
"""Test execution with error context using real integration testing"""
|
||||
import importlib
|
||||
import os
|
||||
|
||||
# Save original environment
|
||||
original_env = {
|
||||
"OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY"),
|
||||
"DEFAULT_MODEL": os.environ.get("DEFAULT_MODEL"),
|
||||
}
|
||||
|
||||
try:
|
||||
# Set up environment for real provider resolution
|
||||
os.environ["OPENAI_API_KEY"] = "sk-test-key-debug-context-test-not-real"
|
||||
os.environ["DEFAULT_MODEL"] = "o3-mini"
|
||||
|
||||
# Clear other provider keys to isolate to OpenAI
|
||||
for key in ["GEMINI_API_KEY", "XAI_API_KEY", "OPENROUTER_API_KEY"]:
|
||||
os.environ.pop(key, None)
|
||||
|
||||
# Reload config and clear registry
|
||||
import config
|
||||
|
||||
importlib.reload(config)
|
||||
from providers.registry import ModelProviderRegistry
|
||||
|
||||
ModelProviderRegistry._instance = None
|
||||
|
||||
# Test with real provider resolution
|
||||
try:
|
||||
result = await tool.execute(
|
||||
{
|
||||
"prompt": "Test fails intermittently",
|
||||
"error_context": "AssertionError in test_async",
|
||||
"previous_attempts": "Added sleep, still fails",
|
||||
"model": "o3-mini",
|
||||
}
|
||||
)
|
||||
|
||||
# If we get here, check the response format
|
||||
assert len(result) == 1
|
||||
# Should contain debug analysis
|
||||
assert result[0].text is not None
|
||||
|
||||
except Exception as e:
|
||||
# Expected: API call will fail with fake key
|
||||
error_msg = str(e)
|
||||
# Should NOT be a mock-related error
|
||||
assert "MagicMock" not in error_msg
|
||||
assert "'<' not supported between instances" not in error_msg
|
||||
|
||||
# Should be a real provider error
|
||||
assert any(
|
||||
phrase in error_msg
|
||||
for phrase in ["API", "key", "authentication", "provider", "network", "connection"]
|
||||
)
|
||||
|
||||
finally:
|
||||
# Restore environment
|
||||
for key, value in original_env.items():
|
||||
if value is not None:
|
||||
os.environ[key] = value
|
||||
else:
|
||||
os.environ.pop(key, None)
|
||||
|
||||
# Reload config and clear registry
|
||||
importlib.reload(config)
|
||||
ModelProviderRegistry._instance = None
|
||||
|
||||
|
||||
class TestAnalyzeTool:
|
||||
"""Test the analyze tool"""
|
||||
|
||||
@@ -400,23 +312,6 @@ class TestAbsolutePathValidation:
|
||||
assert "must be FULL absolute paths" in response["content"]
|
||||
assert "../parent/file.py" in response["content"]
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_debug_tool_relative_path_rejected(self):
|
||||
"""Test that debug tool rejects relative paths"""
|
||||
tool = DebugIssueTool()
|
||||
result = await tool.execute(
|
||||
{
|
||||
"prompt": "Something broke",
|
||||
"files": ["src/main.py"], # relative path
|
||||
}
|
||||
)
|
||||
|
||||
assert len(result) == 1
|
||||
response = json.loads(result[0].text)
|
||||
assert response["status"] == "error"
|
||||
assert "must be FULL absolute paths" in response["content"]
|
||||
assert "src/main.py" in response["content"]
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_thinkdeep_tool_relative_path_rejected(self):
|
||||
"""Test that thinkdeep tool rejects relative paths"""
|
||||
|
||||
696
tools/debug.py
696
tools/debug.py
@@ -1,7 +1,9 @@
|
||||
"""
|
||||
Debug Issue tool - Root cause analysis and debugging assistance
|
||||
Debug Issue tool - Root cause analysis and debugging assistance with systematic investigation
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
|
||||
from pydantic import Field
|
||||
@@ -14,155 +16,207 @@ from systemprompts import DEBUG_ISSUE_PROMPT
|
||||
|
||||
from .base import BaseTool, ToolRequest
|
||||
|
||||
# Field descriptions to avoid duplication between Pydantic and JSON schema
|
||||
DEBUG_FIELD_DESCRIPTIONS = {
|
||||
"prompt": (
|
||||
"MANDATORY: You MUST first think deep about the issue, what it is, why it might be happening, what code might be involved, "
|
||||
"is it an error stemming out of the code directly or is it a side-effect of some part of the existing code. If it's an error "
|
||||
"message, could it be coming from an external resource and NOT directly from the project? What part of the code seems most likely"
|
||||
"the culprit. MUST try and ZERO IN on the issue and surrounding code. Include all the details into the prompt that you can provide: "
|
||||
"error messages, symptoms, when it occurs, steps to reproduce, environment details, "
|
||||
"recent changes, and any other relevant information. Mention any previous attempts at fixing this issue, "
|
||||
"including any past fix that was in place but has now regressed. "
|
||||
"The more context available, the better the analysis. "
|
||||
"PERFORM SYSTEMATIC INVESTIGATION: You MUST begin by thinking hard and performing a thorough investigation using a systematic approach. "
|
||||
"First understand the issue, find the code that may be causing it or code that is breaking, as well as any related code that could have caused this as a side effect. "
|
||||
"You MUST maintain detailed investigation notes in a DEBUGGING_{issue_description}.md file within the project folder, "
|
||||
"updating it as it performs step-by-step analysis of the code, trying to determine the actual root cause and understanding how a minimal, appropriate fix can be found. "
|
||||
"This file MUST contain functions, methods, files visited OR determined to be part of the problem. You MUST update this and remove any references that it finds to be irrelevant during its investigation. "
|
||||
"CRITICAL: If after thorough investigation You has very high confidence that NO BUG EXISTS that correlates to the reported symptoms, "
|
||||
"You should consider the possibility that the reported issue may not actually be present, may be a misunderstanding, or may be conflated with something else entirely. "
|
||||
"In such cases, you should gather more information from the user through targeted questioning rather than continue hunting for non-existent bugs. "
|
||||
"Once complete, you MUST provide also pass in this file into the files parameter of this tool. "
|
||||
"It is ESSENTIAL that this detailed work is performed by you before sharing all the relevant details with its development assistant. This will greatly help in zeroing in on the root cause."
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Field descriptions for the investigation steps
|
||||
DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS = {
|
||||
"step": (
|
||||
"Your current investigation step. For the first step, describe the issue/error to investigate. "
|
||||
"For subsequent steps, describe what you're investigating, what code you're examining, "
|
||||
"what patterns you're looking for, or what hypothesis you're testing."
|
||||
),
|
||||
"step_number": "Current step number in the investigation sequence (starts at 1)",
|
||||
"total_steps": "Current estimate of total investigation steps needed (can be adjusted as investigation progresses)",
|
||||
"next_step_required": "Whether another investigation step is required",
|
||||
"findings": (
|
||||
"You MUST first perform its own investigation, gather its findings and analysis. Include: steps taken to analyze the issue, "
|
||||
"code patterns discovered, initial hypotheses formed, any relevant classes/functions/methods examined, "
|
||||
"and any preliminary conclusions. If investigation yields no concrete evidence of a bug correlating to the reported symptoms, "
|
||||
"You should clearly state this finding and consider that the issue may not exist as described. "
|
||||
"This provides context for the assistant model's analysis."
|
||||
"Current findings from this investigation step. Include code patterns discovered, "
|
||||
"potential causes identified, hypotheses formed, or evidence gathered."
|
||||
),
|
||||
"files": (
|
||||
"Essential files for debugging - ONLY include files that are directly related to the issue, "
|
||||
"contain the problematic code, or are necessary for understanding the root cause. "
|
||||
"This can include any relevant log files, error description documents, investigation documents, "
|
||||
"Your own findings as a document, related code that may help with analysis."
|
||||
"DO NOT include every file scanned during investigation (must be FULL absolute paths - DO NOT SHORTEN)."
|
||||
"files_checked": (
|
||||
"List of files you've examined so far in the investigation (cumulative list). "
|
||||
"Include all files you've looked at, even if they turned out to be irrelevant."
|
||||
),
|
||||
"error_context": "Stack trace, snippet from logs, or additional error context. For very large text you MUST instead"
|
||||
"save the context as a temporary file within the project folder and share it as a FULL absolute file path - DO NOT SHORTEN"
|
||||
"reference to the files parameter.",
|
||||
"images": "Optional images showing error screens, UI issues, logs displays, or visual debugging information",
|
||||
"relevant_files": (
|
||||
"List of files that are definitely related to the issue (subset of files_checked). "
|
||||
"Only include files that contain code directly related to the problem."
|
||||
),
|
||||
"relevant_methods": (
|
||||
"List of specific methods/functions that are involved in the issue. "
|
||||
"Format: 'ClassName.methodName' or 'functionName'"
|
||||
),
|
||||
"hypothesis": (
|
||||
"Your current working hypothesis about the root cause. This can be updated/revised "
|
||||
"as the investigation progresses."
|
||||
),
|
||||
"confidence": "Your confidence level in the current hypothesis: 'low', 'medium', or 'high'",
|
||||
"backtrack_from_step": "If you need to revise a previous finding, which step number to backtrack from",
|
||||
"continuation_id": "Thread continuation ID for multi-turn investigation sessions",
|
||||
"images": (
|
||||
"Optional images showing error screens, UI issues, logs displays, or visual debugging information "
|
||||
"that help understand the issue (must be FULL absolute paths - DO NOT SHORTEN)"
|
||||
),
|
||||
}
|
||||
|
||||
# Field descriptions for the final debug request
|
||||
DEBUG_FIELD_DESCRIPTIONS = {
|
||||
"initial_issue": "The original issue description that started the investigation",
|
||||
"investigation_summary": "Complete summary of the systematic investigation performed",
|
||||
"findings": "Consolidated findings from all investigation steps",
|
||||
"files": "Essential files identified during investigation (must be FULL absolute paths - DO NOT SHORTEN)",
|
||||
"error_context": "Stack trace, logs, or error context discovered during investigation",
|
||||
"relevant_methods": "List of methods/functions identified as involved in the issue",
|
||||
"hypothesis": "Final hypothesis about the root cause after investigation",
|
||||
"images": "Optional images showing error screens, UI issues, or visual debugging information",
|
||||
}
|
||||
|
||||
|
||||
class DebugIssueRequest(ToolRequest):
|
||||
"""Request model for debug tool"""
|
||||
class DebugInvestigationRequest(ToolRequest):
|
||||
"""Request model for debug investigation steps"""
|
||||
|
||||
prompt: str = Field(..., description=DEBUG_FIELD_DESCRIPTIONS["prompt"])
|
||||
findings: Optional[str] = Field(None, description=DEBUG_FIELD_DESCRIPTIONS["findings"])
|
||||
files: Optional[list[str]] = Field(None, description=DEBUG_FIELD_DESCRIPTIONS["files"])
|
||||
error_context: Optional[str] = Field(None, description=DEBUG_FIELD_DESCRIPTIONS["error_context"])
|
||||
images: Optional[list[str]] = Field(None, description=DEBUG_FIELD_DESCRIPTIONS["images"])
|
||||
# Required fields for each investigation step
|
||||
step: str = Field(..., description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["step"])
|
||||
step_number: int = Field(..., description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["step_number"])
|
||||
total_steps: int = Field(..., description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["total_steps"])
|
||||
next_step_required: bool = Field(..., description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["next_step_required"])
|
||||
|
||||
# Investigation tracking fields
|
||||
findings: str = Field(..., description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["findings"])
|
||||
files_checked: list[str] = Field(
|
||||
default_factory=list, description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["files_checked"]
|
||||
)
|
||||
relevant_files: list[str] = Field(
|
||||
default_factory=list, description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["relevant_files"]
|
||||
)
|
||||
relevant_methods: list[str] = Field(
|
||||
default_factory=list, description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["relevant_methods"]
|
||||
)
|
||||
hypothesis: Optional[str] = Field(None, description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["hypothesis"])
|
||||
confidence: Optional[str] = Field("low", description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["confidence"])
|
||||
|
||||
# Optional backtracking field
|
||||
backtrack_from_step: Optional[int] = Field(
|
||||
None, description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["backtrack_from_step"]
|
||||
)
|
||||
|
||||
# Optional continuation field
|
||||
continuation_id: Optional[str] = Field(None, description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["continuation_id"])
|
||||
|
||||
# Optional images for visual debugging
|
||||
images: Optional[list[str]] = Field(default=None, description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["images"])
|
||||
|
||||
# Override inherited fields to exclude them
|
||||
model: Optional[str] = Field(default=None, exclude=True)
|
||||
temperature: Optional[float] = Field(default=None, exclude=True)
|
||||
thinking_mode: Optional[str] = Field(default=None, exclude=True)
|
||||
use_websearch: Optional[bool] = Field(default=None, exclude=True)
|
||||
|
||||
|
||||
class DebugIssueTool(BaseTool):
|
||||
"""Advanced debugging and root cause analysis tool"""
|
||||
"""Advanced debugging tool with systematic self-investigation"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.investigation_history = []
|
||||
self.consolidated_findings = {
|
||||
"files_checked": set(),
|
||||
"relevant_files": set(),
|
||||
"relevant_methods": set(),
|
||||
"findings": [],
|
||||
"hypotheses": [],
|
||||
"images": [],
|
||||
}
|
||||
|
||||
def get_name(self) -> str:
|
||||
return "debug"
|
||||
|
||||
def get_description(self) -> str:
|
||||
return (
|
||||
"DEBUG & ROOT CAUSE ANALYSIS - Expert debugging for complex issues with systematic investigation support. "
|
||||
"Use this when you need to debug code, find out why something is failing, identify root causes, "
|
||||
"trace errors, or diagnose issues. "
|
||||
"MANDATORY: Claud you MUST first think deep and follow these instructions when using this tool"
|
||||
"SYSTEMATIC INVESTIGATION WORKFLOW: "
|
||||
"You MUST begin by thinking hard and performing a thorough investigation using a systematic approach. "
|
||||
"First understand the issue, find the code that may be causing it or code that is breaking, as well as any related code that could have caused this as a side effect. "
|
||||
"You MUST maintain detailed investigation notes while it performs its analysis, "
|
||||
"updating it as it performs step-by-step analysis of the code, trying to determine the actual root cause and understanding how a minimal, appropriate fix can be found. "
|
||||
"This file MUST contain functions, methods, files visited OR determined to be part of the problem. You MUST update this and remove any references that it finds to be irrelevant during its investigation. "
|
||||
"Once complete, You MUST provide Zen's debug tool with this file passed into the files parameter. "
|
||||
"1. INVESTIGATE SYSTEMATICALLY: You MUST think and use a methodical approach to trace through error reports, "
|
||||
"examine code, and gather evidence step by step "
|
||||
"2. DOCUMENT FINDINGS: Maintain detailed investigation notes to "
|
||||
"keep the user informed during its initial investigation. This investigation MUST be shared with this tool for the assistant "
|
||||
"to be able to help more effectively. "
|
||||
"3. USE TRACER TOOL: For complex method calls, class references, or side effects use Zen's tracer tool and include its output as part of the "
|
||||
"prompt or additional context "
|
||||
"4. COLLECT EVIDENCE: Document important discoveries and validation attempts "
|
||||
"5. PROVIDE COMPREHENSIVE FINDINGS: Pass complete findings to this tool for expert analysis "
|
||||
"INVESTIGATION METHODOLOGY: "
|
||||
"- Start with error messages/symptoms and work backwards to root cause "
|
||||
"- Examine code flow and identify potential failure points "
|
||||
"- Use tracer tool for complex method interactions and dependencies if and as needed but continue with the investigation after using it "
|
||||
"- Test hypotheses against actual code and logs and confirm the idea holds "
|
||||
"- Document everything systematically "
|
||||
"- CRITICAL: If investigation yields no concrete evidence of a bug, consider that the reported issue may not exist as described and gather more information through questioning "
|
||||
"ESSENTIAL FILES ONLY: Include only files (documents, code etc) directly related to the issue. "
|
||||
"Focus on quality over quantity for assistant model analysis. "
|
||||
"STRUCTURED OUTPUT: Assistant models return JSON responses with hypothesis "
|
||||
"ranking, evidence correlation, and actionable fixes. "
|
||||
"Choose thinking_mode based on issue complexity: 'low' for simple errors, "
|
||||
"'medium' for standard debugging (default), 'high' for complex system issues, "
|
||||
"'max' for extremely challenging bugs requiring deepest analysis. "
|
||||
"Note: If you're not currently using a top-tier model such as Opus 4 or above, these tools can provide enhanced capabilities."
|
||||
"DEBUG & ROOT CAUSE ANALYSIS - Systematic self-investigation followed by expert analysis. "
|
||||
"This tool guides you through a step-by-step investigation process where you:\n\n"
|
||||
"1. Start with step 1: describe the issue to investigate\n"
|
||||
"2. Continue with investigation steps: examine code, trace errors, test hypotheses\n"
|
||||
"3. Track findings, relevant files, and methods throughout\n"
|
||||
"4. Update hypotheses as understanding evolves\n"
|
||||
"5. Backtrack and revise findings when needed\n"
|
||||
"6. Once investigation is complete, receive expert analysis\n\n"
|
||||
"The tool enforces systematic investigation methodology:\n"
|
||||
"- Methodical code examination and evidence collection\n"
|
||||
"- Hypothesis formation and validation\n"
|
||||
"- File and method tracking for context\n"
|
||||
"- Confidence assessment and revision capabilities\n\n"
|
||||
"Perfect for: complex bugs, mysterious errors, performance issues, "
|
||||
"race conditions, memory leaks, integration problems."
|
||||
)
|
||||
|
||||
def get_input_schema(self) -> dict[str, Any]:
|
||||
schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"prompt": {
|
||||
# Investigation step fields
|
||||
"step": {
|
||||
"type": "string",
|
||||
"description": DEBUG_FIELD_DESCRIPTIONS["prompt"],
|
||||
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["step"],
|
||||
},
|
||||
"step_number": {
|
||||
"type": "integer",
|
||||
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["step_number"],
|
||||
"minimum": 1,
|
||||
},
|
||||
"total_steps": {
|
||||
"type": "integer",
|
||||
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["total_steps"],
|
||||
"minimum": 1,
|
||||
},
|
||||
"next_step_required": {
|
||||
"type": "boolean",
|
||||
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["next_step_required"],
|
||||
},
|
||||
"model": self.get_model_field_schema(),
|
||||
"findings": {
|
||||
"type": "string",
|
||||
"description": DEBUG_FIELD_DESCRIPTIONS["findings"],
|
||||
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["findings"],
|
||||
},
|
||||
"files": {
|
||||
"files_checked": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"description": DEBUG_FIELD_DESCRIPTIONS["files"],
|
||||
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["files_checked"],
|
||||
},
|
||||
"error_context": {
|
||||
"relevant_files": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["relevant_files"],
|
||||
},
|
||||
"relevant_methods": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["relevant_methods"],
|
||||
},
|
||||
"hypothesis": {
|
||||
"type": "string",
|
||||
"description": DEBUG_FIELD_DESCRIPTIONS["error_context"],
|
||||
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["hypothesis"],
|
||||
},
|
||||
"confidence": {
|
||||
"type": "string",
|
||||
"enum": ["low", "medium", "high"],
|
||||
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["confidence"],
|
||||
},
|
||||
"backtrack_from_step": {
|
||||
"type": "integer",
|
||||
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["backtrack_from_step"],
|
||||
"minimum": 1,
|
||||
},
|
||||
"continuation_id": {
|
||||
"type": "string",
|
||||
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["continuation_id"],
|
||||
},
|
||||
"images": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"description": DEBUG_FIELD_DESCRIPTIONS["images"],
|
||||
},
|
||||
"temperature": {
|
||||
"type": "number",
|
||||
"description": "Temperature (0-1, default 0.2 for accuracy)",
|
||||
"minimum": 0,
|
||||
"maximum": 1,
|
||||
},
|
||||
"thinking_mode": {
|
||||
"type": "string",
|
||||
"enum": ["minimal", "low", "medium", "high", "max"],
|
||||
"description": "Thinking depth: minimal (0.5% of model max), low (8%), medium (33%), high (67%), max (100% of model max)",
|
||||
},
|
||||
"use_websearch": {
|
||||
"type": "boolean",
|
||||
"description": "Enable web search for documentation, best practices, and current information. Particularly useful for: brainstorming sessions, architectural design discussions, exploring industry best practices, working with specific frameworks/technologies, researching solutions to complex problems, or when current documentation and community insights would enhance the analysis.",
|
||||
"default": True,
|
||||
},
|
||||
"continuation_id": {
|
||||
"type": "string",
|
||||
"description": "Thread continuation ID for multi-turn conversations. Can be used to continue conversations across different tools. Only provide this if continuing a previous conversation thread.",
|
||||
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["images"],
|
||||
},
|
||||
},
|
||||
"required": ["prompt"] + (["model"] if self.is_effective_auto_mode() else []),
|
||||
# Required fields for investigation
|
||||
"required": ["step", "step_number", "total_steps", "next_step_required", "findings"],
|
||||
}
|
||||
|
||||
return schema
|
||||
|
||||
def get_system_prompt(self) -> str:
|
||||
@@ -171,8 +225,6 @@ class DebugIssueTool(BaseTool):
|
||||
def get_default_temperature(self) -> float:
|
||||
return TEMPERATURE_ANALYTICAL
|
||||
|
||||
# Line numbers are enabled by default from base class for precise error location
|
||||
|
||||
def get_model_category(self) -> "ToolModelCategory":
|
||||
"""Debug requires deep analysis and reasoning"""
|
||||
from tools.models import ToolModelCategory
|
||||
@@ -180,138 +232,342 @@ class DebugIssueTool(BaseTool):
|
||||
return ToolModelCategory.EXTENDED_REASONING
|
||||
|
||||
def get_request_model(self):
|
||||
return DebugIssueRequest
|
||||
return DebugInvestigationRequest
|
||||
|
||||
async def prepare_prompt(self, request: DebugIssueRequest) -> str:
|
||||
"""Prepare the debugging prompt"""
|
||||
# Check for prompt.txt in files
|
||||
prompt_content, updated_files = self.handle_prompt_file(request.files)
|
||||
def requires_model(self) -> bool:
|
||||
"""
|
||||
Debug tool manages its own model interactions.
|
||||
It doesn't need model during investigation steps, only for final analysis.
|
||||
"""
|
||||
return False
|
||||
|
||||
# If prompt.txt was found, use it as prompt or error_context
|
||||
if prompt_content:
|
||||
if not request.prompt or request.prompt == "":
|
||||
request.prompt = prompt_content
|
||||
async def execute(self, arguments: dict[str, Any]) -> list:
    """
    Override execute to implement self-investigation pattern.

    Investigation Flow:
    1. Claude calls debug with investigation steps
    2. Tool tracks findings, files, methods progressively
    3. Once investigation is complete, tool calls AI model for expert analysis
    4. Returns structured response combining investigation + expert analysis

    Returns:
        A single-element list holding a TextContent whose text is a JSON
        payload: investigation progress while steps remain, or the combined
        investigation + expert analysis on the final step. Errors are
        reported as a JSON payload with status "investigation_failed"
        rather than raised.
    """
    from mcp.types import TextContent

    from utils.conversation_memory import add_turn, create_thread

    try:
        # Validate request against the investigation schema; raises on bad input
        request = DebugInvestigationRequest(**arguments)

        # Adjust total steps if needed so step_number never exceeds the plan
        if request.step_number > request.total_steps:
            request.total_steps = request.step_number

        # Handle continuation of an existing investigation thread
        continuation_id = request.continuation_id

        # Create thread for first step
        if not continuation_id and request.step_number == 1:
            continuation_id = create_thread("debug", arguments)
            # Store initial issue description for the final expert-analysis call.
            # NOTE(review): assumed to run only on step 1 of a new thread —
            # continued threads fall back to getattr(self, "initial_issue", ...)
            self.initial_issue = request.step

        # Handle backtracking first if requested
        if request.backtrack_from_step:
            # Remove findings at and after the backtrack point
            self.investigation_history = [
                s for s in self.investigation_history if s["step_number"] < request.backtrack_from_step
            ]
            # Reprocess consolidated findings to match truncated history
            self._reprocess_consolidated_findings()

            # Log if step number needs correction
            expected_step_number = len(self.investigation_history) + 1
            if request.step_number != expected_step_number:
                logger.debug(
                    f"Step number adjusted from {request.step_number} to {expected_step_number} after backtracking"
                )

        # Process investigation step: snapshot everything Claude reported this turn
        step_data = {
            "step": request.step,
            "step_number": request.step_number,
            "findings": request.findings,
            "files_checked": request.files_checked,
            "relevant_files": request.relevant_files,
            "relevant_methods": request.relevant_methods,
            "hypothesis": request.hypothesis,
            "confidence": request.confidence,
            "images": request.images,
        }

        # Store in history
        self.investigation_history.append(step_data)

        # Update consolidated findings (sets deduplicate entries across steps)
        self.consolidated_findings["files_checked"].update(request.files_checked)
        self.consolidated_findings["relevant_files"].update(request.relevant_files)
        self.consolidated_findings["relevant_methods"].update(request.relevant_methods)
        self.consolidated_findings["findings"].append(f"Step {request.step_number}: {request.findings}")
        if request.hypothesis:
            self.consolidated_findings["hypotheses"].append(
                {"step": request.step_number, "hypothesis": request.hypothesis, "confidence": request.confidence}
            )
        if request.images:
            self.consolidated_findings["images"].extend(request.images)

        # Build response summarizing investigation progress so far
        response_data = {
            "status": "investigation_in_progress",
            "step_number": request.step_number,
            "total_steps": request.total_steps,
            "next_step_required": request.next_step_required,
            "investigation_status": {
                "files_checked": len(self.consolidated_findings["files_checked"]),
                "relevant_files": len(self.consolidated_findings["relevant_files"]),
                "relevant_methods": len(self.consolidated_findings["relevant_methods"]),
                "hypotheses_formed": len(self.consolidated_findings["hypotheses"]),
                "images_collected": len(set(self.consolidated_findings["images"])),
                "current_confidence": request.confidence,
            },
            "output": {
                "instructions": "Continue systematic investigation. Present findings clearly and proceed to next step if required.",
                "format": "systematic_investigation",
            },
        }

        if continuation_id:
            response_data["continuation_id"] = continuation_id

        # If investigation is complete, call the AI model for expert analysis
        if not request.next_step_required:
            response_data["status"] = "calling_expert_analysis"
            response_data["investigation_complete"] = True

            # Prepare consolidated investigation summary
            investigation_summary = self._prepare_investigation_summary()

            # Call the AI model with full context
            expert_analysis = await self._call_expert_analysis(
                initial_issue=getattr(self, "initial_issue", request.step),
                investigation_summary=investigation_summary,
                relevant_files=list(self.consolidated_findings["relevant_files"]),
                relevant_methods=list(self.consolidated_findings["relevant_methods"]),
                final_hypothesis=request.hypothesis,
                error_context=self._extract_error_context(),
                images=list(set(self.consolidated_findings["images"])),  # Unique images
                model_info=arguments.get("_model_context"),
                model_override=arguments.get("model"),  # Pass model selection from final step
            )

            # Combine investigation and expert analysis
            response_data["expert_analysis"] = expert_analysis
            response_data["complete_investigation"] = {
                "initial_issue": getattr(self, "initial_issue", request.step),
                "steps_taken": len(self.investigation_history),
                "files_examined": list(self.consolidated_findings["files_checked"]),
                "relevant_files": list(self.consolidated_findings["relevant_files"]),
                "relevant_methods": list(self.consolidated_findings["relevant_methods"]),
                "investigation_summary": investigation_summary,
            }
            response_data["next_steps"] = (
                "Investigation complete with expert analysis. Present the findings, hypotheses, "
                "and recommended fixes to the user. Focus on the most likely root cause and "
                "provide actionable implementation guidance."
            )
        else:
            response_data["next_steps"] = (
                f"Continue investigation with step {request.step_number + 1}. "
                f"Focus on: examining relevant code, testing hypotheses, gathering evidence."
            )

        # Store in conversation memory so the thread can be resumed later
        if continuation_id:
            add_turn(
                thread_id=continuation_id,
                role="assistant",
                content=json.dumps(response_data, indent=2),
                tool_name="debug",
                files=list(self.consolidated_findings["relevant_files"]),
                images=request.images,
            )

        return [TextContent(type="text", text=json.dumps(response_data, indent=2))]

    except Exception as e:
        # Surface failures as a structured JSON error instead of raising to the caller
        logger.error(f"Error in debug investigation: {e}", exc_info=True)
        error_data = {
            "status": "investigation_failed",
            "error": str(e),
            "step_number": arguments.get("step_number", 0),
        }
        return [TextContent(type="text", text=json.dumps(error_data, indent=2))]
|
||||
def _reprocess_consolidated_findings(self):
|
||||
"""Reprocess consolidated findings after backtracking"""
|
||||
self.consolidated_findings = {
|
||||
"files_checked": set(),
|
||||
"relevant_files": set(),
|
||||
"relevant_methods": set(),
|
||||
"findings": [],
|
||||
"hypotheses": [],
|
||||
"images": [],
|
||||
}
|
||||
|
||||
# Update request files list
|
||||
if updated_files is not None:
|
||||
request.files = updated_files
|
||||
for step in self.investigation_history:
|
||||
self.consolidated_findings["files_checked"].update(step.get("files_checked", []))
|
||||
self.consolidated_findings["relevant_files"].update(step.get("relevant_files", []))
|
||||
self.consolidated_findings["relevant_methods"].update(step.get("relevant_methods", []))
|
||||
self.consolidated_findings["findings"].append(f"Step {step['step_number']}: {step['findings']}")
|
||||
if step.get("hypothesis"):
|
||||
self.consolidated_findings["hypotheses"].append(
|
||||
{
|
||||
"step": step["step_number"],
|
||||
"hypothesis": step["hypothesis"],
|
||||
"confidence": step.get("confidence", "low"),
|
||||
}
|
||||
)
|
||||
if step.get("images"):
|
||||
self.consolidated_findings["images"].extend(step["images"])
|
||||
|
||||
# File size validation happens at MCP boundary in server.py
|
||||
def _prepare_investigation_summary(self) -> str:
|
||||
"""Prepare a comprehensive summary of the investigation"""
|
||||
summary_parts = [
|
||||
"=== SYSTEMATIC INVESTIGATION SUMMARY ===",
|
||||
f"Total steps: {len(self.investigation_history)}",
|
||||
f"Files examined: {len(self.consolidated_findings['files_checked'])}",
|
||||
f"Relevant files identified: {len(self.consolidated_findings['relevant_files'])}",
|
||||
f"Methods/functions involved: {len(self.consolidated_findings['relevant_methods'])}",
|
||||
"",
|
||||
"=== INVESTIGATION PROGRESSION ===",
|
||||
]
|
||||
|
||||
# Build context sections
|
||||
context_parts = [f"=== ISSUE DESCRIPTION ===\n{request.prompt}\n=== END DESCRIPTION ==="]
|
||||
for finding in self.consolidated_findings["findings"]:
|
||||
summary_parts.append(finding)
|
||||
|
||||
if request.findings:
|
||||
context_parts.append(f"\n=== CLAUDE'S INVESTIGATION FINDINGS ===\n{request.findings}\n=== END FINDINGS ===")
|
||||
|
||||
if request.error_context:
|
||||
context_parts.append(f"\n=== ERROR CONTEXT/STACK TRACE ===\n{request.error_context}\n=== END CONTEXT ===")
|
||||
|
||||
# Add relevant files if provided
|
||||
if request.files:
|
||||
# Use centralized file processing logic
|
||||
continuation_id = getattr(request, "continuation_id", None)
|
||||
file_content, processed_files = self._prepare_file_content_for_prompt(
|
||||
request.files, continuation_id, "Code"
|
||||
if self.consolidated_findings["hypotheses"]:
|
||||
summary_parts.extend(
|
||||
[
|
||||
"",
|
||||
"=== HYPOTHESIS EVOLUTION ===",
|
||||
]
|
||||
)
|
||||
self._actually_processed_files = processed_files
|
||||
for hyp in self.consolidated_findings["hypotheses"]:
|
||||
summary_parts.append(f"Step {hyp['step']} ({hyp['confidence']} confidence): {hyp['hypothesis']}")
|
||||
|
||||
return "\n".join(summary_parts)
|
||||
|
||||
def _extract_error_context(self) -> Optional[str]:
|
||||
"""Extract error context from investigation findings"""
|
||||
error_patterns = ["error", "exception", "stack trace", "traceback", "failure"]
|
||||
error_context_parts = []
|
||||
|
||||
for finding in self.consolidated_findings["findings"]:
|
||||
if any(pattern in finding.lower() for pattern in error_patterns):
|
||||
error_context_parts.append(finding)
|
||||
|
||||
return "\n".join(error_context_parts) if error_context_parts else None
|
||||
|
||||
async def _call_expert_analysis(
    self,
    initial_issue: str,
    investigation_summary: str,
    relevant_files: list[str],
    relevant_methods: list[str],
    final_hypothesis: Optional[str],
    error_context: Optional[str],
    images: list[str],
    model_info: Optional[Any] = None,
    model_override: Optional[str] = None,
) -> dict:
    """Call AI model for expert analysis of the investigation.

    Assembles the investigation context into a delimited prompt, resolves a
    provider for the chosen model, and returns the model's analysis. The
    model's reply is parsed as JSON when possible; otherwise it is wrapped
    as raw text. All failures are returned as a dict with "error" and
    "status" keys rather than raised.

    NOTE(review): model_info is accepted but never read in this body —
    confirm whether callers still need to pass it.
    """
    # Prepare the debug prompt with all investigation context
    prompt_parts = [
        f"=== ISSUE DESCRIPTION ===\n{initial_issue}\n=== END DESCRIPTION ===",
        f"\n=== CLAUDE'S INVESTIGATION FINDINGS ===\n{investigation_summary}\n=== END FINDINGS ===",
    ]

    if error_context:
        prompt_parts.append(f"\n=== ERROR CONTEXT/STACK TRACE ===\n{error_context}\n=== END CONTEXT ===")

    if relevant_methods:
        prompt_parts.append(
            "\n=== RELEVANT METHODS/FUNCTIONS ===\n"
            + "\n".join(f"- {method}" for method in relevant_methods)
            + "\n=== END METHODS ==="
        )

    if final_hypothesis:
        prompt_parts.append(f"\n=== FINAL HYPOTHESIS ===\n{final_hypothesis}\n=== END HYPOTHESIS ===")

    if images:
        prompt_parts.append(
            "\n=== VISUAL DEBUGGING INFORMATION ===\n"
            + "\n".join(f"- {img}" for img in images)
            + "\n=== END VISUAL INFORMATION ==="
        )

    # Add file content if we have relevant files
    if relevant_files:
        file_content, _ = self._prepare_file_content_for_prompt(relevant_files, None, "Essential debugging files")
        if file_content:
            prompt_parts.append(
                f"\n=== ESSENTIAL FILES FOR DEBUGGING ===\n{file_content}\n=== END ESSENTIAL FILES ==="
            )

    full_prompt = "\n".join(prompt_parts)

    # Get appropriate model and provider
    from config import DEFAULT_MODEL
    from providers.registry import ModelProviderRegistry

    model_name = model_override or DEFAULT_MODEL  # Use override if provided
    provider = ModelProviderRegistry.get_provider_for_model(model_name)

    if not provider:
        return {"error": f"No provider available for model {model_name}", "status": "provider_error"}

    # Generate AI response
    try:
        full_analysis_prompt = f"{self.get_system_prompt()}\n\n{full_prompt}\n\nPlease debug this issue following the structured format in the system prompt."

        # Prepare generation kwargs
        generation_kwargs = {
            "prompt": full_analysis_prompt,
            "model_name": model_name,
            "system_prompt": "",  # Already included in prompt
            "temperature": self.get_default_temperature(),
            "thinking_mode": "high",  # High thinking for debug analysis
        }

        # Add images if available
        if images:
            generation_kwargs["images"] = images

        model_response = provider.generate_content(**generation_kwargs)

        if model_response.content:
            # Try to parse as JSON
            try:
                analysis_result = json.loads(model_response.content.strip())
                return analysis_result
            except json.JSONDecodeError:
                # Return as text if not valid JSON
                return {
                    "status": "analysis_complete",
                    "raw_analysis": model_response.content,
                    "parse_error": "Response was not valid JSON",
                }
        else:
            return {"error": "No response from model", "status": "empty_response"}

    except Exception as e:
        # Never propagate provider failures; report them in the result payload
        logger.error(f"Error calling expert analysis: {e}", exc_info=True)
        return {"error": str(e), "status": "analysis_error"}
|
||||
# Stub implementations for base class requirements
async def prepare_prompt(self, request) -> str:
    """Unused stub: execute() is overridden and assembles prompts itself."""
    return ""
|
||||
|
||||
def format_response(self, response: str, request, model_info: Optional[dict] = None) -> str:
    """Return the model response unchanged.

    Unused stub: execute() is overridden and formats its own JSON output.
    Fixed the implicit-Optional annotation (dict = None -> Optional[dict] = None).
    """
    return response
|
||||
|
||||
Reference in New Issue
Block a user