Re-imagined and re-written Debug tool. Instead of prompting Claude to perform initial analysis (and hoping it did), the tool now works through the debug process as an 'investigation', encouraging Claude to record its 'findings' and 'hypothesis' step by step, backtrack when needed, and keep track of both the files it has gone through and the files relevant to the issue at hand. This structured investigation is then passed to the other model with far greater insight than the original debug tool could ever provide.

Improved prompts: guard against overengineering and flag it as an anti-pattern
Fahad
2025-06-19 10:22:30 +04:00
parent 2641c78f8d
commit fccfb0d999
16 changed files with 2243 additions and 707 deletions
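
To illustrate the workflow described above, here is a minimal, hypothetical sketch of a single investigation step. The field names are taken from the tests added in this commit; the concrete values are made up for illustration.

# Hypothetical arguments for one step of the structured investigation
# (field names match the new DebugInvestigationRequest; values are illustrative).
step_two_arguments = {
    "step": "Examine cleanup_expired_sessions for the reported RuntimeError",
    "step_number": 2,
    "total_steps": 4,
    "next_step_required": True,  # False on the final step triggers expert analysis
    "findings": "Dictionary is modified while being iterated in cleanup_expired_sessions",
    "files_checked": ["/api/session_manager.py"],
    "relevant_files": ["/api/session_manager.py"],
    "relevant_methods": ["SessionManager.cleanup_expired_sessions"],
    "hypothesis": "del inside .items() iteration raises RuntimeError",
    "confidence": "high",
    "continuation_id": "debug-uuid-123",  # ties this step to the ongoing investigation
}

Each step updates the tool's consolidated findings; once next_step_required is False, the accumulated investigation is handed to the expert model for analysis, as the tests below exercise.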


@@ -1,21 +1,23 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
Debug Tool Validation Test Debug Tool Self-Investigation Validation Test
Tests the debug tool with real bugs to validate: Tests the debug tool's systematic self-investigation capabilities including:
- Proper execution with flash model - Step-by-step investigation with proper JSON responses
- Actual bug identification and analysis - Progressive tracking of findings, files, and methods
- Hypothesis generation for root causes - Hypothesis formation and confidence tracking
- Log validation for tool execution - Backtracking and revision capabilities
- Final expert analysis after investigation completion
""" """
import json import json
from typing import Optional
from .base_test import BaseSimulatorTest from .base_test import BaseSimulatorTest
class DebugValidationTest(BaseSimulatorTest): class DebugValidationTest(BaseSimulatorTest):
"""Test debug tool with actual bug scenarios""" """Test debug tool's self-investigation and expert analysis features"""
@property @property
def test_name(self) -> str: def test_name(self) -> str:
@@ -23,23 +25,48 @@ class DebugValidationTest(BaseSimulatorTest):
@property @property
def test_description(self) -> str: def test_description(self) -> str:
return "Debug tool validation with actual bugs" return "Debug tool self-investigation pattern validation"
def run_test(self) -> bool: def run_test(self) -> bool:
"""Test debug tool with real bugs""" """Test debug tool self-investigation capabilities"""
try: try:
self.logger.info("Test: Debug tool validation") self.logger.info("Test: Debug tool self-investigation validation")
# Setup test files directory first # Setup test files directory first
self.setup_test_files() self.setup_test_files()
# Create a Python file with a subtle but realistic bug # Create a Python file with a subtle but realistic bug
buggy_code = """#!/usr/bin/env python3 self._create_buggy_code()
# Test 1: Single investigation session with multiple steps
if not self._test_single_investigation_session():
return False
# Test 2: Investigation with backtracking
if not self._test_investigation_with_backtracking():
return False
# Test 3: Complete investigation with expert analysis
if not self._test_complete_investigation_with_analysis():
return False
self.logger.info(" ✅ All debug validation tests passed")
return True
except Exception as e:
self.logger.error(f"Debug validation test failed: {e}")
return False
finally:
self.cleanup_test_files()
def _create_buggy_code(self):
"""Create test files with a subtle bug for debugging"""
# Create a Python file with dictionary iteration bug
buggy_code = """#!/usr/bin/env python3
import json import json
import requests
from datetime import datetime, timedelta from datetime import datetime, timedelta
class UserSessionManager: class SessionManager:
def __init__(self): def __init__(self):
self.active_sessions = {} self.active_sessions = {}
self.session_timeout = 30 * 60 # 30 minutes in seconds self.session_timeout = 30 * 60 # 30 minutes in seconds
@@ -52,7 +79,6 @@ class UserSessionManager:
'user_id': user_id, 'user_id': user_id,
'user_data': user_data, 'user_data': user_data,
'created_at': datetime.now(), 'created_at': datetime.now(),
'last_activity': datetime.now(),
'expires_at': datetime.now() + timedelta(seconds=self.session_timeout) 'expires_at': datetime.now() + timedelta(seconds=self.session_timeout)
} }
@@ -72,322 +98,356 @@ class UserSessionManager:
del self.active_sessions[session_id] del self.active_sessions[session_id]
return False return False
# Update last activity
session['last_activity'] = current_time
return True return True
def cleanup_expired_sessions(self): def cleanup_expired_sessions(self):
\"\"\"Remove expired sessions from memory\"\"\" \"\"\"Remove expired sessions from memory\"\"\"
current_time = datetime.now() current_time = datetime.now()
expired_sessions = [] expired_count = 0
# BUG: Modifying dictionary while iterating over it
for session_id, session in self.active_sessions.items(): for session_id, session in self.active_sessions.items():
if current_time > session['expires_at']: if current_time > session['expires_at']:
expired_sessions.append(session_id) del self.active_sessions[session_id] # This causes RuntimeError
expired_count += 1
for session_id in expired_sessions: return expired_count
del self.active_sessions[session_id]
return len(expired_sessions)
class APIHandler:
def __init__(self):
self.session_manager = UserSessionManager()
self.request_count = 0
def authenticate_user(self, username, password):
\"\"\"Authenticate user and create session\"\"\"
# Simulate API call to auth service
auth_response = self._call_auth_service(username, password)
if auth_response.get('success'):
user_data = auth_response.get('user_data', {})
session_id = self.session_manager.create_session(
user_data['id'], user_data
)
return {'success': True, 'session_id': session_id}
return {'success': False, 'error': 'Authentication failed'}
def process_request(self, session_id, request_data):
\"\"\"Process an API request with session validation\"\"\"
self.request_count += 1
# Validate session before processing
if not self.session_manager.validate_session(session_id):
return {'error': 'Invalid or expired session', 'code': 401}
# Simulate request processing
try:
result = self._process_business_logic(request_data)
return {'success': True, 'data': result}
except Exception as e:
return {'error': str(e), 'code': 500}
def _call_auth_service(self, username, password):
\"\"\"Simulate external authentication service call\"\"\"
# Simulate network delay and response
import time
time.sleep(0.1)
# Mock successful authentication
if username and password:
return {
'success': True,
'user_data': {
'id': hash(username) % 10000,
'username': username,
'roles': ['user']
}
}
return {'success': False}
def _process_business_logic(self, request_data):
\"\"\"Simulate business logic processing\"\"\"
if not request_data:
raise ValueError("Invalid request data")
# Simulate some processing
return {
'processed_at': datetime.now().isoformat(),
'request_id': self.request_count,
'status': 'completed'
}
# Global API handler instance
api_handler = APIHandler()
def handle_api_request(session_id, request_data):
\"\"\"Main API request handler\"\"\"
return api_handler.process_request(session_id, request_data)
""" """
# Create test file with subtle bug # Create test file with subtle bug
test_file = self.create_additional_test_file("session_manager.py", buggy_code) self.buggy_file = self.create_additional_test_file("session_manager.py", buggy_code)
self.logger.info(f" ✅ Created test file with subtle bug: {test_file}") self.logger.info(f" ✅ Created test file with subtle bug: {self.buggy_file}")
# Create a realistic problem description with subtle symptoms # Create error description
error_description = """ISSUE DESCRIPTION: error_description = """ISSUE DESCRIPTION:
Our API service is experiencing intermittent session validation failures in production. Our session management system is experiencing intermittent failures during cleanup operations.
SYMPTOMS OBSERVED: SYMPTOMS:
- Users randomly get "Invalid or expired session" errors even with valid sessions - Random RuntimeError: dictionary changed size during iteration
- The issue happens more frequently during high-traffic periods - Occurs during high load when many sessions expire simultaneously
- Sessions that should still be valid (created < 30 minutes ago) are being rejected - Error happens in cleanup_expired_sessions method
- The problem occurs maybe 2-3% of requests but is hard to reproduce consistently - Affects about 5% of cleanup operations
- Server logs show session validation failing but no clear pattern
ENVIRONMENT: ERROR LOG:
- Python 3.13 API service RuntimeError: dictionary changed size during iteration
- Running in production with multiple concurrent users File "session_manager.py", line 44, in cleanup_expired_sessions
- Redis not used for session storage (in-memory only) for session_id, session in self.active_sessions.items():
- Load balancer distributes requests across multiple instances """
RECENT CHANGES: self.error_file = self.create_additional_test_file("error_description.txt", error_description)
- Increased session timeout from 15 to 30 minutes last week self.logger.info(f" ✅ Created error description file: {self.error_file}")
- Added cleanup routine to remove expired sessions
- No major code changes to session management
USER IMPACT: def _test_single_investigation_session(self) -> bool:
- Users have to re-authenticate randomly """Test a complete investigation session with multiple steps"""
- Affects user experience and causes complaints try:
- Seems to happen more on busy days self.logger.info(" 1.1: Testing single investigation session")
The code looks correct to me, but something is causing valid sessions to be treated as expired or invalid. I'm not sure what's causing this intermittent behavior.""" # Step 1: Start investigation
self.logger.info(" 1.1.1: Step 1 - Initial investigation")
error_file = self.create_additional_test_file("error_description.txt", error_description) response1, continuation_id = self.call_mcp_tool(
self.logger.info(f" ✅ Created error description file: {error_file}")
# Call debug tool with flash model and realistic problem description
self.logger.info(" 🔍 Calling debug tool to investigate session validation issues...")
response, continuation_id = self.call_mcp_tool(
"debug", "debug",
{ {
"prompt": "Investigate why our API is experiencing intermittent session validation failures in production", "step": "I need to investigate intermittent RuntimeError during session cleanup. Let me start by examining the error description and understanding the symptoms.",
"files": [test_file, error_file], "step_number": 1,
"findings": "Users getting 'Invalid or expired session' errors randomly, occurs more during high traffic, sessions should still be valid", "total_steps": 4,
"error_context": "Sessions created < 30 minutes ago being rejected, happens ~2-3% of requests, load balanced environment", "next_step_required": True,
"systematic_investigation": True, "findings": "RuntimeError occurs during dictionary iteration in cleanup_expired_sessions method. Error happens intermittently during high load.",
"model": "flash", "files_checked": [self.error_file],
"thinking_mode": "medium", "relevant_files": [self.error_file],
}, },
) )
if not response: if not response1 or not continuation_id:
self.logger.error("Failed to get debug response") self.logger.error("Failed to get initial investigation response")
return False return False
self.logger.info(" ✅ Got debug response") # Parse and validate JSON response
response1_data = self._parse_debug_response(response1)
if not response1_data:
return False
# Parse response to validate bug identification # Validate step 1 response structure
try: if not self._validate_step_response(response1_data, 1, 4, True, "investigation_in_progress"):
response_data = json.loads(response) return False
self.logger.debug(f"Response keys: {list(response_data.keys())}")
# Extract the actual content if it's wrapped self.logger.info(f" ✅ Step 1 successful, continuation_id: {continuation_id}")
if "content" in response_data:
content = response_data["content"]
# Handle markdown JSON blocks
if content.startswith("```json"):
content = content[7:]
if content.endswith("```"):
content = content[:-3]
content = content.strip()
# Parse the inner JSON # Step 2: Examine the code
inner_data = json.loads(content) self.logger.info(" 1.1.2: Step 2 - Code examination")
self.logger.debug(f"Inner data keys: {list(inner_data.keys())}") response2, _ = self.call_mcp_tool(
else: "debug",
inner_data = response_data {
"step": "Now examining the session_manager.py file to understand the cleanup_expired_sessions implementation and identify the root cause.",
"step_number": 2,
"total_steps": 4,
"next_step_required": True,
"findings": "Found the issue: cleanup_expired_sessions modifies self.active_sessions dictionary while iterating over it with .items(). This causes RuntimeError when del is called during iteration.",
"files_checked": [self.error_file, self.buggy_file],
"relevant_files": [self.buggy_file],
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
"hypothesis": "Dictionary is being modified during iteration causing RuntimeError",
"confidence": "high",
"continuation_id": continuation_id,
},
)
# Check for structured debug analysis (should have analysis_complete status) if not response2:
if inner_data.get("status") == "analysis_complete": self.logger.error("Failed to continue investigation to step 2")
self.logger.info(" ✅ Got structured debug analysis") return False
# Validate hypothesis generation response2_data = self._parse_debug_response(response2)
hypotheses = inner_data.get("hypotheses", []) if not self._validate_step_response(response2_data, 2, 4, True, "investigation_in_progress"):
if not hypotheses: return False
self.logger.error("No hypotheses found in debug analysis")
return False
self.logger.info(f" 🧠 Found {len(hypotheses)} hypotheses") # Check investigation status tracking
investigation_status = response2_data.get("investigation_status", {})
if investigation_status.get("files_checked", 0) < 2:
self.logger.error("Files checked count not properly tracked")
return False
# Check if the model identified the real bug: dictionary modification during iteration if investigation_status.get("relevant_methods", 0) != 1:
analysis_text = json.dumps(inner_data).lower() self.logger.error("Relevant methods not properly tracked")
return False
# Look for the actual bug - modifying dictionary while iterating if investigation_status.get("current_confidence") != "high":
bug_indicators = [ self.logger.error("Confidence level not properly tracked")
"dictionary", return False
"iteration",
"modify",
"concurrent",
"runtime error",
"dictionary changed size during iteration",
"cleanup_expired_sessions",
"active_sessions",
"del",
"removing while iterating",
]
found_indicators = [indicator for indicator in bug_indicators if indicator in analysis_text] self.logger.info(" ✅ Step 2 successful with proper tracking")
# Check for specific mentions of the problematic pattern # Step 3: Validate hypothesis
dictionary_bug_patterns = [ self.logger.info(" 1.1.3: Step 3 - Hypothesis validation")
"modifying dictionary while iterating", response3, _ = self.call_mcp_tool(
"dictionary changed size", "debug",
"concurrent modification", {
"iterating over dictionary", "step": "Confirming the bug pattern: the for loop iterates over self.active_sessions.items() while del self.active_sessions[session_id] modifies the dictionary inside the loop.",
"del.*active_sessions", "step_number": 3,
"cleanup.*iteration", "total_steps": 4,
] "next_step_required": True,
"findings": "Confirmed: Line 44-47 shows classic dictionary modification during iteration bug. The fix would be to collect expired session IDs first, then delete them after iteration completes.",
"files_checked": [self.buggy_file],
"relevant_files": [self.buggy_file],
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
"hypothesis": "Dictionary modification during iteration in cleanup_expired_sessions causes RuntimeError",
"confidence": "high",
"continuation_id": continuation_id,
},
)
import re if not response3:
self.logger.error("Failed to continue investigation to step 3")
return False
pattern_matches = [] response3_data = self._parse_debug_response(response3)
for pattern in dictionary_bug_patterns: if not self._validate_step_response(response3_data, 3, 4, True, "investigation_in_progress"):
if re.search(pattern, analysis_text): return False
pattern_matches.append(pattern)
if len(found_indicators) >= 3 or len(pattern_matches) >= 1: self.logger.info(" ✅ Investigation session progressing successfully")
self.logger.info(" ✅ Flash identified the dictionary iteration bug")
self.logger.info(f" Found indicators: {found_indicators[:3]}")
if pattern_matches:
self.logger.info(f" Pattern matches: {pattern_matches}")
else:
self.logger.error(" ❌ Flash missed the dictionary iteration bug")
self.logger.error(f" Found only: {found_indicators}")
return False
# Validate hypothesis quality (should have confidence levels and reasoning) # Store continuation_id for next test
valid_hypotheses = 0 self.investigation_continuation_id = continuation_id
for i, hypothesis in enumerate(hypotheses[:3]): # Check top 3 return True
confidence = hypothesis.get("confidence", "").lower()
reasoning = hypothesis.get("reasoning", "")
if confidence in ["high", "medium", "low"] and len(reasoning) > 20: except Exception as e:
valid_hypotheses += 1 self.logger.error(f"Single investigation session test failed: {e}")
self.logger.debug(f" Hypothesis {i+1}: {confidence} confidence, good reasoning") return False
else:
self.logger.debug(f" Hypothesis {i+1}: weak ({confidence}, {len(reasoning)} chars)")
if valid_hypotheses >= 2: def _test_investigation_with_backtracking(self) -> bool:
self.logger.info(f" ✅ Found {valid_hypotheses} well-structured hypotheses") """Test investigation with backtracking to revise findings"""
else: try:
self.logger.error(f" ❌ Only {valid_hypotheses} well-structured hypotheses") self.logger.info(" 1.2: Testing investigation with backtracking")
return False
# Check for line-specific references # Start a new investigation for testing backtracking
if "line" in analysis_text or "lines" in analysis_text: self.logger.info(" 1.2.1: Start investigation for backtracking test")
self.logger.info(" 📍 Analysis includes line-specific references") response1, continuation_id = self.call_mcp_tool(
else: "debug",
self.logger.warning(" ⚠️ No line-specific references found") {
"step": "Investigating performance degradation in data processing pipeline",
"step_number": 1,
"total_steps": 4,
"next_step_required": True,
"findings": "Initial analysis shows slow database queries",
"files_checked": ["/db/queries.py"],
"relevant_files": ["/db/queries.py"],
},
)
else: if not response1 or not continuation_id:
# Non-structured response - check for dictionary iteration bug identification self.logger.error("Failed to start backtracking test investigation")
self.logger.info(" 📝 Got general debug response") return False
response_text = response.lower() # Step 2: Wrong direction
self.logger.info(" 1.2.2: Step 2 - Wrong investigation path")
response2, _ = self.call_mcp_tool(
"debug",
{
"step": "Focusing on database optimization strategies",
"step_number": 2,
"total_steps": 4,
"next_step_required": True,
"findings": "Database queries seem optimized, might be looking in wrong place",
"files_checked": ["/db/queries.py", "/db/indexes.py"],
"relevant_files": [],
"hypothesis": "Database performance issues",
"confidence": "low",
"continuation_id": continuation_id,
},
)
# Check for the specific bug in general response if not response2:
bug_indicators = [ self.logger.error("Failed to continue to step 2")
"dictionary", return False
"iteration",
"modify",
"concurrent",
"active_sessions",
"cleanup",
"del ",
"removing",
"changed size",
]
found_indicators = [indicator for indicator in bug_indicators if indicator in response_text] # Step 3: Backtrack from step 2
self.logger.info(" 1.2.3: Step 3 - Backtrack and revise approach")
response3, _ = self.call_mcp_tool(
"debug",
{
"step": "Backtracking - the issue might not be database related. Let me investigate the data processing algorithm instead.",
"step_number": 3,
"total_steps": 4,
"next_step_required": True,
"findings": "Found inefficient nested loops in data processor causing O(n²) complexity",
"files_checked": ["/processor/algorithm.py"],
"relevant_files": ["/processor/algorithm.py"],
"relevant_methods": ["DataProcessor.process_batch"],
"hypothesis": "Inefficient algorithm causing performance issues",
"confidence": "medium",
"backtrack_from_step": 2, # Backtrack from step 2
"continuation_id": continuation_id,
},
)
if len(found_indicators) >= 3: if not response3:
self.logger.info(f" ✅ Found {len(found_indicators)} relevant indicators in response") self.logger.error("Failed to backtrack")
self.logger.info(f" Found: {found_indicators}") return False
else:
self.logger.error(f" ❌ Only found {len(found_indicators)} relevant indicators")
self.logger.error(f" Found: {found_indicators}")
return False
except json.JSONDecodeError as e: response3_data = self._parse_debug_response(response3)
self.logger.error(f"Failed to parse debug response as JSON: {e}") if not self._validate_step_response(response3_data, 3, 4, True, "investigation_in_progress"):
# For non-JSON responses, check for dictionary iteration bug return False
response_text = response.lower()
bug_indicators = [ self.logger.info(" ✅ Backtracking working correctly")
"dictionary", return True
"iteration",
"modify",
"concurrent",
"active_sessions",
"cleanup",
"del ",
"removing",
]
found_indicators = [indicator for indicator in bug_indicators if indicator in response_text] except Exception as e:
self.logger.error(f"Backtracking test failed: {e}")
return False
if len(found_indicators) >= 3: def _test_complete_investigation_with_analysis(self) -> bool:
self.logger.info(f"Text response found {len(found_indicators)} relevant indicators") """Test complete investigation ending with expert analysis"""
else: try:
self.logger.error(f" ❌ Text response only found {len(found_indicators)} relevant indicators") self.logger.info(" 1.3: Testing complete investigation with expert analysis")
# Use the continuation from first test
continuation_id = getattr(self, "investigation_continuation_id", None)
if not continuation_id:
# Start fresh if no continuation available
self.logger.info(" 1.3.0: Starting fresh investigation")
response0, continuation_id = self.call_mcp_tool(
"debug",
{
"step": "Investigating the dictionary iteration bug in session cleanup",
"step_number": 1,
"total_steps": 2,
"next_step_required": True,
"findings": "Found dictionary modification during iteration",
"files_checked": [self.buggy_file],
"relevant_files": [self.buggy_file],
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
},
)
if not response0 or not continuation_id:
self.logger.error("Failed to start fresh investigation")
return False return False
# Final step - trigger expert analysis
self.logger.info(" 1.3.1: Final step - complete investigation")
response_final, _ = self.call_mcp_tool(
"debug",
{
"step": "Investigation complete. The root cause is confirmed: cleanup_expired_sessions modifies the dictionary while iterating, causing RuntimeError.",
"step_number": 2,
"total_steps": 2,
"next_step_required": False, # Final step - triggers expert analysis
"findings": "Root cause identified: del self.active_sessions[session_id] on line 46 modifies dictionary during iteration starting at line 44. Fix: collect expired IDs first, then delete.",
"files_checked": [self.buggy_file],
"relevant_files": [self.buggy_file],
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
"hypothesis": "Dictionary modification during iteration causes RuntimeError in cleanup_expired_sessions",
"confidence": "high",
"continuation_id": continuation_id,
"model": "flash", # Use flash for expert analysis
},
)
if not response_final:
self.logger.error("Failed to complete investigation")
return False
response_final_data = self._parse_debug_response(response_final)
if not response_final_data:
return False
# Validate final response structure
if response_final_data.get("status") != "calling_expert_analysis":
self.logger.error(
f"Expected status 'calling_expert_analysis', got '{response_final_data.get('status')}'"
)
return False
if not response_final_data.get("investigation_complete"):
self.logger.error("Expected investigation_complete=true for final step")
return False
# Check for expert analysis
if "expert_analysis" not in response_final_data:
self.logger.error("Missing expert_analysis in final response")
return False
expert_analysis = response_final_data.get("expert_analysis", {})
# Check for expected analysis content (checking common patterns)
analysis_text = json.dumps(expert_analysis).lower()
# Look for bug identification
bug_indicators = ["dictionary", "iteration", "modify", "runtime", "error", "del"]
found_indicators = sum(1 for indicator in bug_indicators if indicator in analysis_text)
if found_indicators >= 3:
self.logger.info(" ✅ Expert analysis identified the bug correctly")
else:
self.logger.warning(
f" ⚠️ Expert analysis may not have fully identified the bug (found {found_indicators}/6 indicators)"
)
# Check complete investigation summary
if "complete_investigation" not in response_final_data:
self.logger.error("Missing complete_investigation in final response")
return False
complete_investigation = response_final_data["complete_investigation"]
if not complete_investigation.get("relevant_methods"):
self.logger.error("Missing relevant methods in complete investigation")
return False
if "SessionManager.cleanup_expired_sessions" not in complete_investigation["relevant_methods"]:
self.logger.error("Expected method not found in investigation summary")
return False
self.logger.info(" ✅ Complete investigation with expert analysis successful")
# Validate logs # Validate logs
self.logger.info(" 📋 Validating execution logs...") self.logger.info(" 📋 Validating execution logs...")
# Get server logs using inherited method # Get server logs
logs = self.get_recent_server_logs(500) logs = self.get_recent_server_logs(500)
# Look for debug tool execution patterns # Look for debug tool execution patterns
debug_patterns = [ debug_patterns = [
"debug tool", "debug tool",
"[DEBUG]", "investigation",
"systematic investigation", "Expert analysis",
"Token budget", "calling_expert_analysis",
"Essential files for debugging",
] ]
patterns_found = 0 patterns_found = 0
@@ -396,34 +456,101 @@ The code looks correct to me, but something is causing valid sessions to be trea
patterns_found += 1 patterns_found += 1
self.logger.debug(f" ✅ Found log pattern: {pattern}") self.logger.debug(f" ✅ Found log pattern: {pattern}")
if patterns_found >= 3: if patterns_found >= 2:
self.logger.info(f" ✅ Log validation passed ({patterns_found}/{len(debug_patterns)} patterns)") self.logger.info(f" ✅ Log validation passed ({patterns_found}/{len(debug_patterns)} patterns)")
else: else:
self.logger.warning(f" ⚠️ Only found {patterns_found}/{len(debug_patterns)} log patterns") self.logger.warning(f" ⚠️ Only found {patterns_found}/{len(debug_patterns)} log patterns")
# Test continuation if available
if continuation_id:
self.logger.info(" 🔄 Testing debug continuation...")
follow_up_response, _ = self.call_mcp_tool(
"debug",
{
"prompt": "Based on your analysis, which bug should we fix first and how?",
"continuation_id": continuation_id,
"model": "flash",
},
)
if follow_up_response:
self.logger.info(" ✅ Debug continuation worked")
else:
self.logger.warning(" ⚠️ Debug continuation failed")
self.logger.info(" ✅ Debug tool validation completed successfully")
return True return True
except Exception as e: except Exception as e:
self.logger.error(f"Debug validation test failed: {e}") self.logger.error(f"Complete investigation test failed: {e}")
return False
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
"""Call an MCP tool via standalone server - override for debug-specific response handling"""
# Use parent implementation to get the raw response
response_text, _ = super().call_mcp_tool(tool_name, params)
if not response_text:
return None, None
# Extract continuation_id from debug response specifically
continuation_id = self._extract_debug_continuation_id(response_text)
return response_text, continuation_id
def _extract_debug_continuation_id(self, response_text: str) -> Optional[str]:
"""Extract continuation_id from debug response"""
try:
# Parse the response
response_data = json.loads(response_text)
return response_data.get("continuation_id")
except json.JSONDecodeError as e:
self.logger.debug(f"Failed to parse response for debug continuation_id: {e}")
return None
def _parse_debug_response(self, response_text: str) -> dict:
"""Parse debug tool JSON response"""
try:
# Parse the response - it should be direct JSON
return json.loads(response_text)
except json.JSONDecodeError as e:
self.logger.error(f"Failed to parse debug response as JSON: {e}")
self.logger.error(f"Response text: {response_text[:500]}...")
return {}
def _validate_step_response(
self,
response_data: dict,
expected_step: int,
expected_total: int,
expected_next_required: bool,
expected_status: str,
) -> bool:
"""Validate a debug investigation step response structure"""
try:
# Check status
if response_data.get("status") != expected_status:
self.logger.error(f"Expected status '{expected_status}', got '{response_data.get('status')}'")
return False
# Check step number
if response_data.get("step_number") != expected_step:
self.logger.error(f"Expected step_number {expected_step}, got {response_data.get('step_number')}")
return False
# Check total steps
if response_data.get("total_steps") != expected_total:
self.logger.error(f"Expected total_steps {expected_total}, got {response_data.get('total_steps')}")
return False
# Check next_step_required
if response_data.get("next_step_required") != expected_next_required:
self.logger.error(
f"Expected next_step_required {expected_next_required}, got {response_data.get('next_step_required')}"
)
return False
# Check investigation_status exists
if "investigation_status" not in response_data:
self.logger.error("Missing investigation_status in response")
return False
# Check output guidance exists
if "output" not in response_data:
self.logger.error("Missing output guidance in response")
return False
# Check next_steps guidance
if not response_data.get("next_steps"):
self.logger.error("Missing next_steps guidance in response")
return False
return True
except Exception as e:
self.logger.error(f"Error validating step response: {e}")
return False return False
finally:
self.cleanup_test_files()
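
For reference, the fix that the investigation repeatedly points to (collect expired session IDs first, then delete after iteration completes) would look roughly like the following sketch; it is illustrative only and not part of this commit.

# Illustrative fix for the seeded bug: avoid mutating the dict during iteration.
# Assumes it replaces cleanup_expired_sessions inside the test fixture's SessionManager.
def cleanup_expired_sessions(self):
    """Remove expired sessions from memory"""
    current_time = datetime.now()
    # Collect expired IDs first so the dictionary is not modified while iterating
    expired_sessions = [
        session_id
        for session_id, session in self.active_sessions.items()
        if current_time > session["expires_at"]
    ]
    for session_id in expired_sessions:
        del self.active_sessions[session_id]
    return len(expired_sessions)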


@@ -39,6 +39,10 @@ SCOPE & FOCUS
• Identify strengths, risks, and strategic improvement areas that affect future development
• Avoid line-by-line bug hunts or minor style critiques—those are covered by CodeReview
• Recommend practical, proportional changes; no "rip-and-replace" proposals unless the architecture is untenable
• Identify and flag overengineered solutions — excessive abstraction, unnecessary configuration layers, or generic
frameworks introduced without a clear, current need. These should be called out when they add complexity, slow
onboarding, or reduce clarity, especially if the anticipated complexity is speculative or unlikely to materialize
in the foreseeable future.
ANALYSIS STRATEGY
1. Map the tech stack, frameworks, deployment model, and constraints


@@ -29,6 +29,9 @@ SCOPE & FOCUS
• Recommend new technologies or patterns ONLY when they provide clearly superior outcomes with minimal added complexity.
• Avoid speculative, over-engineered, or unnecessarily abstract designs that exceed current project goals or needs.
• Keep proposals practical and directly actionable within the existing architecture.
• Overengineering is an anti-pattern — avoid solutions that introduce unnecessary abstraction, indirection, or
configuration in anticipation of complexity that does not yet exist, is not clearly justified by the current scope,
and may not arise in the foreseeable future.
COLLABORATION APPROACH
1. Engage deeply with Claude's input extend, refine, and explore alternatives ONLY WHEN they are well-justified and materially beneficial.


@@ -55,6 +55,9 @@ Your review approach:
- Ways to reduce the overall complexity while maintaining and retaining functionality without introducing regression
8. Where further investigation and analysis is required, be direct and suggest which code or related file needs to be
reviewed
9. Remember: Overengineering is an anti-pattern — avoid suggesting solutions that introduce unnecessary abstraction,
indirection, or configuration in anticipation of complexity that does not yet exist, is not clearly justified by the
current scope, and may not arise in the foreseeable future.
SEVERITY DEFINITIONS
🔴 CRITICAL: Security flaws or defects that cause crashes, data loss, or undefined behavior


@@ -53,6 +53,9 @@ REVIEW METHOD
4. Flag bugs, regressions, crash risks, data loss, or race conditions.
5. Recommend specific fixes for each issue raised; include code where helpful.
6. Acknowledge sound patterns to reinforce best practices.
7. Remember: Overengineering is an anti-pattern — avoid suggesting solutions that introduce unnecessary abstraction,
indirection, or configuration in anticipation of complexity that does not yet exist, is not clearly justified by the
current scope, and may not arise in the foreseeable future.
CORE ANALYSIS (adapt to diff and stack)
• Security injection risks, auth flaws, exposure of sensitive data, unsafe dependencies, memory safety
@@ -62,6 +65,11 @@ CORE ANALYSIS (adapt to diff and stack)
ADDITIONAL ANALYSIS (only when relevant)
• Language/runtime concerns memory management, concurrency, exception handling
• Carefully assess the code's context and purpose before raising concurrency-related concerns. Confirm the presence
of shared state, race conditions, or unsafe access patterns before flagging any issues to avoid false positives.
• Also carefully evaluate concurrency and parallelism risks only after confirming that the code runs in an environment
where such concerns are applicable. Avoid flagging issues unless shared state, asynchronous execution, or multi-threaded
access are clearly possible based on context.
• System/integration config handling, external calls, operational impact
• Testing coverage gaps for new logic
• If no tests are found in the project, do not flag test coverage as an issue unless the change introduces logic


@@ -32,6 +32,9 @@ GUIDELINES
5. Offer multiple viable strategies ONLY WHEN clearly beneficial within the current environment.
6. Suggest creative solutions that operate within real-world constraints, and avoid proposing major shifts unless truly warranted.
7. Use concise, technical language; assume an experienced engineering audience.
8. Remember: Overengineering is an anti-pattern — avoid suggesting solutions that introduce unnecessary abstraction,
indirection, or configuration in anticipation of complexity that does not yet exist, is not clearly justified by the
current scope, and may not arise in the foreseeable future.
KEY FOCUS AREAS (apply when relevant)
- Architecture & Design: modularity, boundaries, abstraction layers, dependencies


@@ -198,13 +198,20 @@ class TestAutoModelPlannerFix:
Verify that other tools still properly require model resolution. Verify that other tools still properly require model resolution.
This ensures our fix doesn't break existing functionality. This ensures our fix doesn't break existing functionality.
Note: Debug tool now manages its own model calls like planner.
""" """
from tools.analyze import AnalyzeTool from tools.analyze import AnalyzeTool
from tools.chat import ChatTool from tools.chat import ChatTool
from tools.debug import DebugIssueTool from tools.debug import DebugIssueTool
# Test various tools still require models # Test various tools still require models
tools_requiring_models = [ChatTool(), DebugIssueTool(), AnalyzeTool()] tools_requiring_models = [ChatTool(), AnalyzeTool()]
for tool in tools_requiring_models: for tool in tools_requiring_models:
assert tool.requires_model() is True, f"{tool.get_name()} should require model resolution" assert tool.requires_model() is True, f"{tool.get_name()} should require model resolution"
# Test tools that manage their own model calls
tools_managing_own_models = [DebugIssueTool()]
for tool in tools_managing_own_models:
assert tool.requires_model() is False, f"{tool.get_name()} should manage its own model calls"


@@ -70,35 +70,35 @@ class TestDynamicContextRequests:
@pytest.mark.asyncio @pytest.mark.asyncio
@patch("tools.base.BaseTool.get_model_provider") @patch("tools.base.BaseTool.get_model_provider")
async def test_normal_response_not_parsed_as_clarification(self, mock_get_provider, debug_tool): @patch("utils.conversation_memory.create_thread", return_value="debug-test-uuid")
"""Test that normal responses are not mistaken for clarification requests""" @patch("utils.conversation_memory.add_turn")
normal_response = """ async def test_normal_response_not_parsed_as_clarification(
## Summary self, mock_add_turn, mock_create_thread, mock_get_provider, debug_tool
The error is caused by a missing import statement. ):
"""Test that normal investigation responses work correctly with new debug tool"""
## Hypotheses (Ranked by Likelihood) # The new debug tool uses self-investigation pattern
result = await debug_tool.execute(
### 1. Missing Import (Confidence: High) {
**Root Cause:** The module 'utils' is not imported "step": "Investigating NameError: name 'utils' is not defined",
""" "step_number": 1,
"total_steps": 3,
mock_provider = create_mock_provider() "next_step_required": True,
mock_provider.get_provider_type.return_value = Mock(value="google") "findings": "The error indicates 'utils' module is not imported or defined",
mock_provider.supports_thinking_mode.return_value = False "files_checked": ["/code/main.py"],
mock_provider.generate_content.return_value = Mock( "relevant_files": ["/code/main.py"],
content=normal_response, usage={}, model_name="gemini-2.5-flash", metadata={} "hypothesis": "Missing import statement for utils module",
"confidence": "high",
}
) )
mock_get_provider.return_value = mock_provider
result = await debug_tool.execute({"prompt": "NameError: name 'utils' is not defined"})
assert len(result) == 1 assert len(result) == 1
# Parse the response # Parse the response - new debug tool returns structured JSON
response_data = json.loads(result[0].text) response_data = json.loads(result[0].text)
assert response_data["status"] == "success" assert response_data["status"] == "investigation_in_progress"
assert response_data["content_type"] in ["text", "markdown"] assert response_data["step_number"] == 1
assert "Summary" in response_data["content"] assert response_data["next_step_required"] is True
assert response_data["investigation_status"]["current_confidence"] == "high"
@pytest.mark.asyncio @pytest.mark.asyncio
@patch("tools.base.BaseTool.get_model_provider") @patch("tools.base.BaseTool.get_model_provider")
@@ -125,17 +125,17 @@ class TestDynamicContextRequests:
@pytest.mark.asyncio @pytest.mark.asyncio
@patch("tools.base.BaseTool.get_model_provider") @patch("tools.base.BaseTool.get_model_provider")
async def test_clarification_with_suggested_action(self, mock_get_provider, debug_tool): async def test_clarification_with_suggested_action(self, mock_get_provider, analyze_tool):
"""Test clarification request with suggested next action""" """Test clarification request with suggested next action"""
clarification_json = json.dumps( clarification_json = json.dumps(
{ {
"status": "files_required_to_continue", "status": "files_required_to_continue",
"mandatory_instructions": "I need to see the database configuration to diagnose the connection error", "mandatory_instructions": "I need to see the database configuration to analyze the connection error",
"files_needed": ["config/database.yml", "src/db.py"], "files_needed": ["config/database.yml", "src/db.py"],
"suggested_next_action": { "suggested_next_action": {
"tool": "debug", "tool": "analyze",
"args": { "args": {
"prompt": "Connection timeout to database", "prompt": "Analyze database connection timeout issue",
"files": [ "files": [
"/config/database.yml", "/config/database.yml",
"/src/db.py", "/src/db.py",
@@ -154,9 +154,9 @@ class TestDynamicContextRequests:
) )
mock_get_provider.return_value = mock_provider mock_get_provider.return_value = mock_provider
result = await debug_tool.execute( result = await analyze_tool.execute(
{ {
"prompt": "Connection timeout to database", "prompt": "Analyze database connection timeout issue",
"files": ["/absolute/logs/error.log"], "files": ["/absolute/logs/error.log"],
} }
) )
@@ -168,7 +168,7 @@ class TestDynamicContextRequests:
clarification = json.loads(response_data["content"]) clarification = json.loads(response_data["content"])
assert "suggested_next_action" in clarification assert "suggested_next_action" in clarification
assert clarification["suggested_next_action"]["tool"] == "debug" assert clarification["suggested_next_action"]["tool"] == "analyze"
def test_tool_output_model_serialization(self): def test_tool_output_model_serialization(self):
"""Test ToolOutput model serialization""" """Test ToolOutput model serialization"""
@@ -298,7 +298,7 @@ class TestCollaborationWorkflow:
@patch("tools.base.BaseTool.get_model_provider") @patch("tools.base.BaseTool.get_model_provider")
async def test_multi_step_collaboration(self, mock_get_provider): async def test_multi_step_collaboration(self, mock_get_provider):
"""Test a multi-step collaboration workflow""" """Test a multi-step collaboration workflow"""
tool = DebugIssueTool() tool = AnalyzeTool()
# Step 1: Initial request returns clarification needed # Step 1: Initial request returns clarification needed
clarification_json = json.dumps( clarification_json = json.dumps(
@@ -319,8 +319,8 @@ class TestCollaborationWorkflow:
result1 = await tool.execute( result1 = await tool.execute(
{ {
"prompt": "Database connection timeout", "prompt": "Analyze database connection timeout issue",
"error_context": "Timeout after 30s", "files": ["/logs/error.log"],
} }
) )
@@ -345,9 +345,8 @@ class TestCollaborationWorkflow:
result2 = await tool.execute( result2 = await tool.execute(
{ {
"prompt": "Database connection timeout", "prompt": "Analyze database connection timeout issue with config file",
"error_context": "Timeout after 30s", "files": ["/absolute/path/config.py", "/logs/error.log"], # Additional context provided
"files": ["/absolute/path/config.py"], # Additional context provided
} }
) )


@@ -157,10 +157,10 @@ async def test_unknown_tool_defaults_to_prompt():
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_tool_parameter_standardization(): async def test_tool_parameter_standardization():
"""Test that all tools use standardized 'prompt' parameter""" """Test that most tools use standardized 'prompt' parameter (debug uses investigation pattern)"""
from tools.analyze import AnalyzeRequest from tools.analyze import AnalyzeRequest
from tools.codereview import CodeReviewRequest from tools.codereview import CodeReviewRequest
from tools.debug import DebugIssueRequest from tools.debug import DebugInvestigationRequest
from tools.precommit import PrecommitRequest from tools.precommit import PrecommitRequest
from tools.thinkdeep import ThinkDeepRequest from tools.thinkdeep import ThinkDeepRequest
@@ -168,9 +168,16 @@ async def test_tool_parameter_standardization():
analyze = AnalyzeRequest(files=["/test.py"], prompt="What does this do?") analyze = AnalyzeRequest(files=["/test.py"], prompt="What does this do?")
assert analyze.prompt == "What does this do?" assert analyze.prompt == "What does this do?"
# Test debug tool uses prompt # Debug tool now uses self-investigation pattern with different fields
debug = DebugIssueRequest(prompt="Error occurred") debug = DebugInvestigationRequest(
assert debug.prompt == "Error occurred" step="Investigating error",
step_number=1,
total_steps=3,
next_step_required=True,
findings="Initial error analysis",
)
assert debug.step == "Investigating error"
assert debug.findings == "Initial error analysis"
# Test codereview tool uses prompt # Test codereview tool uses prompt
review = CodeReviewRequest(files=["/test.py"], prompt="Review this") review = CodeReviewRequest(files=["/test.py"], prompt="Review this")

tests/test_debug.py (new file, 514 lines)

@@ -0,0 +1,514 @@
"""
Tests for the debug tool.
"""
from unittest.mock import patch
import pytest
from tools.debug import DebugInvestigationRequest, DebugIssueTool
from tools.models import ToolModelCategory
class TestDebugTool:
"""Test suite for DebugIssueTool."""
def test_tool_metadata(self):
"""Test basic tool metadata and configuration."""
tool = DebugIssueTool()
assert tool.get_name() == "debug"
assert "DEBUG & ROOT CAUSE ANALYSIS" in tool.get_description()
assert tool.get_default_temperature() == 0.2 # TEMPERATURE_ANALYTICAL
assert tool.get_model_category() == ToolModelCategory.EXTENDED_REASONING
assert tool.requires_model() is False # Since it manages its own model calls
def test_request_validation(self):
"""Test Pydantic request model validation."""
# Valid investigation step request
step_request = DebugInvestigationRequest(
step="Investigating null pointer exception in UserService",
step_number=1,
total_steps=5,
next_step_required=True,
findings="Found that UserService.getUser() is called with null ID",
)
assert step_request.step == "Investigating null pointer exception in UserService"
assert step_request.step_number == 1
assert step_request.next_step_required is True
assert step_request.confidence == "low" # default
# Request with optional fields
detailed_request = DebugInvestigationRequest(
step="Deep dive into getUser method implementation",
step_number=2,
total_steps=5,
next_step_required=True,
findings="Method doesn't validate input parameters",
files_checked=["/src/UserService.java", "/src/UserController.java"],
relevant_files=["/src/UserService.java"],
relevant_methods=["UserService.getUser", "UserController.handleRequest"],
hypothesis="Null ID passed from controller without validation",
confidence="medium",
)
assert len(detailed_request.files_checked) == 2
assert len(detailed_request.relevant_files) == 1
assert detailed_request.confidence == "medium"
# Missing required fields should fail
with pytest.raises(ValueError):
DebugInvestigationRequest() # Missing all required fields
with pytest.raises(ValueError):
DebugInvestigationRequest(step="test") # Missing other required fields
def test_input_schema_generation(self):
"""Test JSON schema generation for MCP client."""
tool = DebugIssueTool()
schema = tool.get_input_schema()
assert schema["type"] == "object"
# Investigation fields
assert "step" in schema["properties"]
assert "step_number" in schema["properties"]
assert "total_steps" in schema["properties"]
assert "next_step_required" in schema["properties"]
assert "findings" in schema["properties"]
assert "files_checked" in schema["properties"]
assert "relevant_files" in schema["properties"]
assert "relevant_methods" in schema["properties"]
assert "hypothesis" in schema["properties"]
assert "confidence" in schema["properties"]
assert "backtrack_from_step" in schema["properties"]
assert "continuation_id" in schema["properties"]
assert "images" in schema["properties"] # Now supported for visual debugging
# Check excluded fields are NOT present
assert "model" not in schema["properties"]
assert "temperature" not in schema["properties"]
assert "thinking_mode" not in schema["properties"]
assert "use_websearch" not in schema["properties"]
# Check required fields
assert "step" in schema["required"]
assert "step_number" in schema["required"]
assert "total_steps" in schema["required"]
assert "next_step_required" in schema["required"]
assert "findings" in schema["required"]
def test_model_category_for_debugging(self):
"""Test that debug uses extended reasoning category."""
tool = DebugIssueTool()
category = tool.get_model_category()
# Debugging needs deep thinking
assert category == ToolModelCategory.EXTENDED_REASONING
@pytest.mark.asyncio
async def test_execute_first_investigation_step(self):
"""Test execute method for first investigation step."""
tool = DebugIssueTool()
arguments = {
"step": "Investigating intermittent session validation failures in production",
"step_number": 1,
"total_steps": 5,
"next_step_required": True,
"findings": "Users report random session invalidation, occurs more during high traffic",
"files_checked": ["/api/session_manager.py"],
"relevant_files": ["/api/session_manager.py"],
}
# Mock conversation memory functions
with patch("utils.conversation_memory.create_thread", return_value="debug-uuid-123"):
with patch("utils.conversation_memory.add_turn"):
result = await tool.execute(arguments)
# Should return a list with TextContent
assert len(result) == 1
assert result[0].type == "text"
# Parse the JSON response
import json
parsed_response = json.loads(result[0].text)
assert parsed_response["status"] == "investigation_in_progress"
assert parsed_response["step_number"] == 1
assert parsed_response["total_steps"] == 5
assert parsed_response["next_step_required"] is True
assert parsed_response["continuation_id"] == "debug-uuid-123"
assert parsed_response["investigation_status"]["files_checked"] == 1
assert parsed_response["investigation_status"]["relevant_files"] == 1
@pytest.mark.asyncio
async def test_execute_subsequent_investigation_step(self):
"""Test execute method for subsequent investigation step."""
tool = DebugIssueTool()
# Set up initial state
tool.initial_issue = "Session validation failures"
tool.consolidated_findings["files_checked"].add("/api/session_manager.py")
arguments = {
"step": "Examining session cleanup method for concurrent modification issues",
"step_number": 2,
"total_steps": 5,
"next_step_required": True,
"findings": "Found dictionary modification during iteration in cleanup_expired_sessions",
"files_checked": ["/api/session_manager.py", "/api/utils.py"],
"relevant_files": ["/api/session_manager.py"],
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
"hypothesis": "Dictionary modified during iteration causing RuntimeError",
"confidence": "high",
"continuation_id": "debug-uuid-123",
}
# Mock conversation memory functions
with patch("utils.conversation_memory.add_turn"):
result = await tool.execute(arguments)
# Should return a list with TextContent
assert len(result) == 1
assert result[0].type == "text"
# Parse the JSON response
import json
parsed_response = json.loads(result[0].text)
assert parsed_response["step_number"] == 2
assert parsed_response["next_step_required"] is True
assert parsed_response["continuation_id"] == "debug-uuid-123"
assert parsed_response["investigation_status"]["files_checked"] == 2 # Cumulative
assert parsed_response["investigation_status"]["relevant_methods"] == 1
assert parsed_response["investigation_status"]["current_confidence"] == "high"
@pytest.mark.asyncio
async def test_execute_final_investigation_step(self):
"""Test execute method for final investigation step with expert analysis."""
tool = DebugIssueTool()
# Set up investigation history
tool.initial_issue = "Session validation failures"
tool.investigation_history = [
{
"step_number": 1,
"step": "Initial investigation of session validation failures",
"findings": "Initial investigation",
"files_checked": ["/api/utils.py"],
},
{
"step_number": 2,
"step": "Deeper analysis of session manager",
"findings": "Found dictionary issue",
"files_checked": ["/api/session_manager.py"],
},
]
tool.consolidated_findings = {
"files_checked": {"/api/session_manager.py", "/api/utils.py"},
"relevant_files": {"/api/session_manager.py"},
"relevant_methods": {"SessionManager.cleanup_expired_sessions"},
"findings": ["Step 1: Initial investigation", "Step 2: Found dictionary issue"],
"hypotheses": [{"step": 2, "hypothesis": "Dictionary modified during iteration", "confidence": "high"}],
"images": [],
}
arguments = {
"step": "Confirmed the root cause and identified fix",
"step_number": 3,
"total_steps": 3,
"next_step_required": False, # Final step
"findings": "Root cause confirmed: dictionary modification during iteration in cleanup method",
"files_checked": ["/api/session_manager.py"],
"relevant_files": ["/api/session_manager.py"],
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
"hypothesis": "Dictionary modification during iteration causes intermittent RuntimeError",
"confidence": "high",
"continuation_id": "debug-uuid-123",
}
# Mock the expert analysis call
mock_expert_response = {
"status": "analysis_complete",
"summary": "Dictionary modification during iteration bug identified",
"hypotheses": [
{
"name": "CONCURRENT_MODIFICATION",
"confidence": "High",
"root_cause": "Modifying dictionary while iterating",
"minimal_fix": "Create list of keys to delete first",
}
],
}
# Mock conversation memory and file reading
with patch("utils.conversation_memory.add_turn"):
with patch.object(tool, "_call_expert_analysis", return_value=mock_expert_response):
with patch.object(tool, "_prepare_file_content_for_prompt", return_value=("file content", 100)):
result = await tool.execute(arguments)
# Should return a list with TextContent
assert len(result) == 1
response_text = result[0].text
# Parse the JSON response
import json
parsed_response = json.loads(response_text)
# Check final step structure
assert parsed_response["status"] == "calling_expert_analysis"
assert parsed_response["investigation_complete"] is True
assert parsed_response["expert_analysis"]["status"] == "analysis_complete"
assert "complete_investigation" in parsed_response
assert parsed_response["complete_investigation"]["steps_taken"] == 3 # All steps including current
@pytest.mark.asyncio
async def test_execute_with_backtracking(self):
"""Test execute method with backtracking to revise findings."""
tool = DebugIssueTool()
# Set up some investigation history with all required fields
tool.investigation_history = [
{
"step": "Initial investigation",
"step_number": 1,
"findings": "Initial findings",
"files_checked": ["file1.py"],
"relevant_files": [],
"relevant_methods": [],
"hypothesis": None,
"confidence": "low",
},
{
"step": "Wrong direction",
"step_number": 2,
"findings": "Wrong path",
"files_checked": ["file2.py"],
"relevant_files": [],
"relevant_methods": [],
"hypothesis": None,
"confidence": "low",
},
]
tool.consolidated_findings = {
"files_checked": {"file1.py", "file2.py"},
"relevant_files": set(),
"relevant_methods": set(),
"findings": ["Step 1: Initial findings", "Step 2: Wrong path"],
"hypotheses": [],
"images": [],
}
arguments = {
"step": "Backtracking to revise approach",
"step_number": 3,
"total_steps": 5,
"next_step_required": True,
"findings": "Taking a different investigation approach",
"files_checked": ["file3.py"],
"backtrack_from_step": 2, # Backtrack from step 2
"continuation_id": "debug-uuid-123",
}
# Mock conversation memory functions
with patch("utils.conversation_memory.add_turn"):
result = await tool.execute(arguments)
# Should return a list with TextContent
assert len(result) == 1
response_text = result[0].text
# Parse the JSON response
import json
parsed_response = json.loads(response_text)
assert parsed_response["status"] == "investigation_in_progress"
# After backtracking from step 2, history should have step 1 plus the new step
assert len(tool.investigation_history) == 2 # Step 1 + new step 3
assert tool.investigation_history[0]["step_number"] == 1
assert tool.investigation_history[1]["step_number"] == 3 # The new step that triggered backtrack
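# In other words, backtrack_from_step=N discards every recorded step with step_number >= N
# before appending the current step, which is why the history here ends up as [step 1, step 3].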
@pytest.mark.asyncio
async def test_execute_adjusts_total_steps(self):
"""Test execute method adjusts total steps when current step exceeds estimate."""
tool = DebugIssueTool()
arguments = {
"step": "Additional investigation needed",
"step_number": 8,
"total_steps": 5, # Current step exceeds total
"next_step_required": True,
"findings": "More complexity discovered",
"continuation_id": "debug-uuid-123",
}
# Mock conversation memory functions
with patch("utils.conversation_memory.add_turn"):
result = await tool.execute(arguments)
# Should return a list with TextContent
assert len(result) == 1
response_text = result[0].text
# Parse the JSON response
import json
parsed_response = json.loads(response_text)
# Total steps should be adjusted to match current step
assert parsed_response["total_steps"] == 8
assert parsed_response["step_number"] == 8
@pytest.mark.asyncio
async def test_execute_error_handling(self):
"""Test execute method error handling."""
tool = DebugIssueTool()
# Invalid arguments - missing required fields
arguments = {
"step": "Invalid request"
# Missing required fields
}
result = await tool.execute(arguments)
# Should return error response
assert len(result) == 1
response_text = result[0].text
# Parse the JSON response
import json
parsed_response = json.loads(response_text)
assert parsed_response["status"] == "investigation_failed"
assert "error" in parsed_response
def test_prepare_investigation_summary(self):
"""Test investigation summary preparation."""
tool = DebugIssueTool()
tool.consolidated_findings = {
"files_checked": {"file1.py", "file2.py", "file3.py"},
"relevant_files": {"file1.py", "file2.py"},
"relevant_methods": {"Class1.method1", "Class2.method2"},
"findings": [
"Step 1: Initial investigation findings",
"Step 2: Discovered potential issue",
"Step 3: Confirmed root cause",
],
"hypotheses": [
{"step": 1, "hypothesis": "Initial hypothesis", "confidence": "low"},
{"step": 2, "hypothesis": "Refined hypothesis", "confidence": "medium"},
{"step": 3, "hypothesis": "Final hypothesis", "confidence": "high"},
],
"images": [],
}
summary = tool._prepare_investigation_summary()
assert "SYSTEMATIC INVESTIGATION SUMMARY" in summary
assert "Files examined: 3" in summary
assert "Relevant files identified: 2" in summary
assert "Methods/functions involved: 2" in summary
assert "INVESTIGATION PROGRESSION" in summary
assert "Step 1:" in summary
assert "Step 2:" in summary
assert "Step 3:" in summary
assert "HYPOTHESIS EVOLUTION" in summary
assert "low confidence" in summary
assert "medium confidence" in summary
assert "high confidence" in summary
def test_extract_error_context(self):
"""Test error context extraction from findings."""
tool = DebugIssueTool()
tool.consolidated_findings = {
"findings": [
"Step 1: Found no issues initially",
"Step 2: Discovered ERROR: Dictionary size changed during iteration",
"Step 3: Stack trace shows RuntimeError in cleanup method",
"Step 4: Exception occurs intermittently",
],
}
error_context = tool._extract_error_context()
assert error_context is not None
assert "ERROR: Dictionary size changed" in error_context
assert "Stack trace shows RuntimeError" in error_context
assert "Exception occurs intermittently" in error_context
assert "Found no issues initially" not in error_context # Should not include non-error findings
def test_reprocess_consolidated_findings(self):
"""Test reprocessing of consolidated findings after backtracking."""
tool = DebugIssueTool()
tool.investigation_history = [
{
"step_number": 1,
"findings": "Initial findings",
"files_checked": ["file1.py"],
"relevant_files": ["file1.py"],
"relevant_methods": ["method1"],
"hypothesis": "Initial hypothesis",
"confidence": "low",
},
{
"step_number": 2,
"findings": "Second findings",
"files_checked": ["file2.py"],
"relevant_files": [],
"relevant_methods": ["method2"],
},
]
tool._reprocess_consolidated_findings()
assert tool.consolidated_findings["files_checked"] == {"file1.py", "file2.py"}
assert tool.consolidated_findings["relevant_files"] == {"file1.py"}
assert tool.consolidated_findings["relevant_methods"] == {"method1", "method2"}
assert len(tool.consolidated_findings["findings"]) == 2
assert len(tool.consolidated_findings["hypotheses"]) == 1
assert tool.consolidated_findings["hypotheses"][0]["hypothesis"] == "Initial hypothesis"
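# Note that step 2 above provides no "hypothesis"/"confidence" keys, which is why reprocessing
# yields a single hypothesis entry (from step 1) despite two recorded steps.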
# Integration test
class TestDebugToolIntegration:
"""Integration tests for debug tool."""
def setup_method(self):
"""Set up model context for integration tests."""
from utils.model_context import ModelContext
self.tool = DebugIssueTool()
self.tool._model_context = ModelContext("flash") # Test model
@pytest.mark.asyncio
async def test_complete_investigation_flow(self):
"""Test complete investigation flow from start to expert analysis."""
# Step 1: Initial investigation
arguments = {
"step": "Investigating memory leak in data processing pipeline",
"step_number": 1,
"total_steps": 3,
"next_step_required": True,
"findings": "High memory usage observed during batch processing",
"files_checked": ["/processor/main.py"],
}
# Mock conversation memory and expert analysis
with patch("utils.conversation_memory.create_thread", return_value="debug-flow-uuid"):
with patch("utils.conversation_memory.add_turn"):
result = await self.tool.execute(arguments)
# Verify response structure
assert len(result) == 1
response_text = result[0].text
# Parse the JSON response
import json
parsed_response = json.loads(response_text)
assert parsed_response["status"] == "investigation_in_progress"
assert parsed_response["step_number"] == 1
assert parsed_response["continuation_id"] == "debug-flow-uuid"


@@ -0,0 +1,363 @@
"""
Comprehensive test demonstrating debug tool's self-investigation pattern
and continuation ID functionality working together end-to-end.
"""
import json
from unittest.mock import patch
import pytest
from tools.debug import DebugIssueTool
from utils.conversation_memory import (
ConversationTurn,
ThreadContext,
build_conversation_history,
get_conversation_file_list,
)
class TestDebugComprehensiveWorkflow:
"""Test the complete debug workflow from investigation to expert analysis to continuation."""
@pytest.mark.asyncio
async def test_full_debug_workflow_with_continuation(self):
"""Test complete debug workflow: investigation → expert analysis → continuation to another tool."""
tool = DebugIssueTool()
# Step 1: Initial investigation
with patch("utils.conversation_memory.create_thread", return_value="debug-workflow-uuid"):
with patch("utils.conversation_memory.add_turn") as mock_add_turn:
result1 = await tool.execute(
{
"step": "Investigating memory leak in user session handler",
"step_number": 1,
"total_steps": 3,
"next_step_required": True,
"findings": "High memory usage detected in session handler",
"files_checked": ["/api/sessions.py"],
"images": ["/screenshots/memory_profile.png"],
}
)
# Verify step 1 response
assert len(result1) == 1
response1 = json.loads(result1[0].text)
assert response1["status"] == "investigation_in_progress"
assert response1["step_number"] == 1
assert response1["continuation_id"] == "debug-workflow-uuid"
# Verify conversation turn was added
assert mock_add_turn.called
call_args = mock_add_turn.call_args
if call_args:
# Prefer call_args.args when available (mock on Python 3.8+), otherwise index the call tuple
args = call_args.args if hasattr(call_args, "args") else call_args[0]
if args and len(args) >= 3:
assert args[0] == "debug-workflow-uuid"
assert args[1] == "assistant"
assert json.loads(args[2])["status"] == "investigation_in_progress"
# Step 2: Continue investigation with findings
with patch("utils.conversation_memory.add_turn") as mock_add_turn:
result2 = await tool.execute(
{
"step": "Found circular references in session cache preventing garbage collection",
"step_number": 2,
"total_steps": 3,
"next_step_required": True,
"findings": "Session objects hold references to themselves through event handlers",
"files_checked": ["/api/sessions.py", "/api/cache.py"],
"relevant_files": ["/api/sessions.py"],
"relevant_methods": ["SessionHandler.__init__", "SessionHandler.add_event_listener"],
"hypothesis": "Circular references preventing garbage collection",
"confidence": "high",
"continuation_id": "debug-workflow-uuid",
}
)
# Verify step 2 response
response2 = json.loads(result2[0].text)
assert response2["status"] == "investigation_in_progress"
assert response2["step_number"] == 2
assert response2["investigation_status"]["files_checked"] == 2
assert response2["investigation_status"]["relevant_methods"] == 2
assert response2["investigation_status"]["current_confidence"] == "high"
# Step 3: Final investigation with expert analysis
# Mock the expert analysis response
mock_expert_response = {
"status": "analysis_complete",
"summary": "Memory leak caused by circular references in session event handlers",
"hypotheses": [
{
"name": "CIRCULAR_REFERENCE_LEAK",
"confidence": "High (95%)",
"evidence": ["Event handlers hold strong references", "No weak references used"],
"root_cause": "SessionHandler stores callbacks that reference the handler itself",
"potential_fixes": [
{
"description": "Use weakref for event handler callbacks",
"files_to_modify": ["/api/sessions.py"],
"complexity": "Low",
}
],
"minimal_fix": "Replace self references in callbacks with weakref.ref(self)",
}
],
"investigation_summary": {
"pattern": "Classic circular reference memory leak",
"severity": "High - causes unbounded memory growth",
"recommended_action": "Implement weakref solution immediately",
},
}
with patch("utils.conversation_memory.add_turn") as mock_add_turn:
with patch.object(tool, "_call_expert_analysis", return_value=mock_expert_response):
result3 = await tool.execute(
{
"step": "Investigation complete - confirmed circular reference memory leak pattern",
"step_number": 3,
"total_steps": 3,
"next_step_required": False, # Triggers expert analysis
"findings": "Circular references between SessionHandler and event callbacks prevent GC",
"files_checked": ["/api/sessions.py", "/api/cache.py"],
"relevant_files": ["/api/sessions.py"],
"relevant_methods": ["SessionHandler.__init__", "SessionHandler.add_event_listener"],
"hypothesis": "Circular references in event handler callbacks causing memory leak",
"confidence": "high",
"continuation_id": "debug-workflow-uuid",
"model": "flash",
}
)
# Verify final response with expert analysis
response3 = json.loads(result3[0].text)
assert response3["status"] == "calling_expert_analysis"
assert response3["investigation_complete"] is True
assert "expert_analysis" in response3
expert = response3["expert_analysis"]
assert expert["status"] == "analysis_complete"
assert "CIRCULAR_REFERENCE_LEAK" in expert["hypotheses"][0]["name"]
assert "weakref" in expert["hypotheses"][0]["minimal_fix"]
# Verify complete investigation summary
assert "complete_investigation" in response3
complete = response3["complete_investigation"]
assert complete["steps_taken"] == 3
assert "/api/sessions.py" in complete["files_examined"]
assert "SessionHandler.add_event_listener" in complete["relevant_methods"]
# Step 4: Test continuation to another tool (e.g., analyze)
# Create a mock thread context representing the debug conversation
debug_context = ThreadContext(
thread_id="debug-workflow-uuid",
created_at="2025-01-01T00:00:00Z",
last_updated_at="2025-01-01T00:10:00Z",
tool_name="debug",
turns=[
ConversationTurn(
role="user",
content="Step 1: Investigating memory leak",
timestamp="2025-01-01T00:01:00Z",
tool_name="debug",
files=["/api/sessions.py"],
images=["/screenshots/memory_profile.png"],
),
ConversationTurn(
role="assistant",
content=json.dumps(response1),
timestamp="2025-01-01T00:02:00Z",
tool_name="debug",
),
ConversationTurn(
role="user",
content="Step 2: Found circular references",
timestamp="2025-01-01T00:03:00Z",
tool_name="debug",
),
ConversationTurn(
role="assistant",
content=json.dumps(response2),
timestamp="2025-01-01T00:04:00Z",
tool_name="debug",
),
ConversationTurn(
role="user",
content="Step 3: Investigation complete",
timestamp="2025-01-01T00:05:00Z",
tool_name="debug",
),
ConversationTurn(
role="assistant",
content=json.dumps(response3),
timestamp="2025-01-01T00:06:00Z",
tool_name="debug",
),
],
initial_context={},
)
# Test that another tool can use the continuation
with patch("utils.conversation_memory.get_thread", return_value=debug_context):
# Mock file reading
def mock_read_file(file_path):
if file_path == "/api/sessions.py":
return "# SessionHandler with circular refs\nclass SessionHandler:\n pass", 20
elif file_path == "/screenshots/memory_profile.png":
# Images return an empty string for content and count as 0 tokens
return "", 0
elif file_path == "/api/cache.py":
return "# Cache module", 5
return "", 0
# Build conversation history for another tool
from utils.model_context import ModelContext
model_context = ModelContext("flash")
history, tokens = build_conversation_history(debug_context, model_context, read_files_func=mock_read_file)
# Verify history contains all debug information
assert "=== CONVERSATION HISTORY (CONTINUATION) ===" in history
assert "Thread: debug-workflow-uuid" in history
assert "Tool: debug" in history
# Check investigation progression
assert "Step 1: Investigating memory leak" in history
assert "Step 2: Found circular references" in history
assert "Step 3: Investigation complete" in history
# Check expert analysis is included
assert "CIRCULAR_REFERENCE_LEAK" in history
assert "weakref" in history
assert "memory leak" in history
# Check files are referenced in conversation history
assert "/api/sessions.py" in history
# File content would be in referenced files section if the files were readable
# In our test they're not real files so they won't be embedded
# But the expert analysis content should be there
assert "Memory leak caused by circular references" in history
# Verify file list includes all files from investigation
file_list = get_conversation_file_list(debug_context)
assert "/api/sessions.py" in file_list
@pytest.mark.asyncio
async def test_debug_investigation_state_machine(self):
"""Test the debug tool's investigation state machine behavior."""
tool = DebugIssueTool()
# Test state transitions
states = []
# Initial state
with patch("utils.conversation_memory.create_thread", return_value="state-test-uuid"):
with patch("utils.conversation_memory.add_turn"):
result = await tool.execute(
{
"step": "Starting investigation",
"step_number": 1,
"total_steps": 2,
"next_step_required": True,
"findings": "Initial findings",
}
)
states.append(json.loads(result[0].text))
# Verify initial state
assert states[0]["status"] == "investigation_in_progress"
assert states[0]["step_number"] == 1
assert states[0]["next_step_required"] is True
# Final state (triggers expert analysis)
mock_expert_response = {"status": "analysis_complete", "summary": "Test complete"}
with patch("utils.conversation_memory.add_turn"):
with patch.object(tool, "_call_expert_analysis", return_value=mock_expert_response):
result = await tool.execute(
{
"step": "Final findings",
"step_number": 2,
"total_steps": 2,
"next_step_required": False,
"findings": "Complete findings",
"continuation_id": "state-test-uuid",
"model": "flash",
}
)
states.append(json.loads(result[0].text))
# Verify final state
assert states[1]["status"] == "calling_expert_analysis"
assert states[1]["investigation_complete"] is True
assert "expert_analysis" in states[1]
@pytest.mark.asyncio
async def test_debug_backtracking_preserves_continuation(self):
"""Test that backtracking preserves continuation ID and investigation state."""
tool = DebugIssueTool()
# Start investigation
with patch("utils.conversation_memory.create_thread", return_value="backtrack-test-uuid"):
with patch("utils.conversation_memory.add_turn"):
result1 = await tool.execute(
{
"step": "Initial hypothesis",
"step_number": 1,
"total_steps": 3,
"next_step_required": True,
"findings": "Initial findings",
}
)
response1 = json.loads(result1[0].text)
continuation_id = response1["continuation_id"]
# Step 2 - wrong direction
with patch("utils.conversation_memory.add_turn"):
await tool.execute(
{
"step": "Wrong hypothesis",
"step_number": 2,
"total_steps": 3,
"next_step_required": True,
"findings": "Dead end",
"hypothesis": "Wrong initial hypothesis",
"confidence": "low",
"continuation_id": continuation_id,
}
)
# Backtrack from step 2
with patch("utils.conversation_memory.add_turn"):
result3 = await tool.execute(
{
"step": "Backtracking - new hypothesis",
"step_number": 3,
"total_steps": 4, # Adjusted total
"next_step_required": True,
"findings": "New direction",
"hypothesis": "New hypothesis after backtracking",
"confidence": "medium",
"backtrack_from_step": 2,
"continuation_id": continuation_id,
}
)
response3 = json.loads(result3[0].text)
# Verify continuation preserved through backtracking
assert response3["continuation_id"] == continuation_id
assert response3["step_number"] == 3
assert response3["total_steps"] == 4
# Verify investigation status after backtracking
# When we backtrack, investigation continues
assert response3["investigation_status"]["files_checked"] == 0 # Reset after backtrack
assert response3["investigation_status"]["current_confidence"] == "medium"
# The key thing is the continuation ID is preserved
# and we've adjusted our approach (total_steps increased)


@@ -0,0 +1,336 @@
"""
Test debug tool continuation ID functionality and conversation history formatting.
"""
import json
from unittest.mock import patch
import pytest
from tools.debug import DebugIssueTool
from utils.conversation_memory import (
ConversationTurn,
ThreadContext,
build_conversation_history,
get_conversation_file_list,
)
class TestDebugContinuation:
"""Test debug tool continuation ID and conversation history integration."""
@pytest.mark.asyncio
async def test_debug_creates_continuation_id(self):
"""Test that debug tool creates continuation ID on first step."""
tool = DebugIssueTool()
with patch("utils.conversation_memory.create_thread", return_value="debug-test-uuid-123"):
with patch("utils.conversation_memory.add_turn"):
result = await tool.execute(
{
"step": "Investigating null pointer exception",
"step_number": 1,
"total_steps": 3,
"next_step_required": True,
"findings": "Initial investigation shows null reference in UserService",
"files_checked": ["/api/UserService.java"],
}
)
assert len(result) == 1
response = json.loads(result[0].text)
assert response["status"] == "investigation_in_progress"
assert response["continuation_id"] == "debug-test-uuid-123"
def test_debug_conversation_formatting(self):
"""Test that debug tool's structured output is properly formatted in conversation history."""
# Create a mock conversation with debug tool output
debug_output = {
"status": "investigation_in_progress",
"step_number": 2,
"total_steps": 3,
"next_step_required": True,
"investigation_status": {
"files_checked": 3,
"relevant_files": 2,
"relevant_methods": 1,
"hypotheses_formed": 1,
"images_collected": 0,
"current_confidence": "medium",
},
"output": {"instructions": "Continue systematic investigation.", "format": "systematic_investigation"},
"continuation_id": "debug-test-uuid-123",
"next_steps": "Continue investigation with step 3.",
}
context = ThreadContext(
thread_id="debug-test-uuid-123",
created_at="2025-01-01T00:00:00Z",
last_updated_at="2025-01-01T00:05:00Z",
tool_name="debug",
turns=[
ConversationTurn(
role="user",
content="Step 1: Investigating null pointer exception",
timestamp="2025-01-01T00:01:00Z",
tool_name="debug",
files=["/api/UserService.java"],
),
ConversationTurn(
role="assistant",
content=json.dumps(debug_output, indent=2),
timestamp="2025-01-01T00:02:00Z",
tool_name="debug",
files=["/api/UserService.java", "/api/UserController.java"],
),
],
initial_context={
"step": "Investigating null pointer exception",
"step_number": 1,
"total_steps": 3,
"next_step_required": True,
"findings": "Initial investigation",
},
)
# Mock file reading to avoid actual file I/O
def mock_read_file(file_path):
if file_path == "/api/UserService.java":
return "// UserService.java\npublic class UserService {\n // code...\n}", 10
elif file_path == "/api/UserController.java":
return "// UserController.java\npublic class UserController {\n // code...\n}", 10
return "", 0
# Build conversation history
from utils.model_context import ModelContext
model_context = ModelContext("flash")
history, tokens = build_conversation_history(context, model_context, read_files_func=mock_read_file)
# Verify the history contains debug-specific content
assert "=== CONVERSATION HISTORY (CONTINUATION) ===" in history
assert "Thread: debug-test-uuid-123" in history
assert "Tool: debug" in history
# Check that files are included
assert "UserService.java" in history
assert "UserController.java" in history
# Check that debug output is included
assert "investigation_in_progress" in history
assert '"step_number": 2' in history
assert '"files_checked": 3' in history
assert '"current_confidence": "medium"' in history
def test_debug_continuation_preserves_investigation_state(self):
"""Test that continuation preserves investigation state across tools."""
# Create a debug investigation context
context = ThreadContext(
thread_id="debug-test-uuid-123",
created_at="2025-01-01T00:00:00Z",
last_updated_at="2025-01-01T00:10:00Z",
tool_name="debug",
turns=[
ConversationTurn(
role="user",
content="Step 1: Initial investigation",
timestamp="2025-01-01T00:01:00Z",
tool_name="debug",
files=["/api/SessionManager.java"],
),
ConversationTurn(
role="assistant",
content=json.dumps(
{
"status": "investigation_in_progress",
"step_number": 1,
"total_steps": 4,
"next_step_required": True,
"investigation_status": {"files_checked": 1, "relevant_files": 1},
"continuation_id": "debug-test-uuid-123",
}
),
timestamp="2025-01-01T00:02:00Z",
tool_name="debug",
),
ConversationTurn(
role="user",
content="Step 2: Found dictionary modification issue",
timestamp="2025-01-01T00:03:00Z",
tool_name="debug",
files=["/api/SessionManager.java", "/api/utils.py"],
),
ConversationTurn(
role="assistant",
content=json.dumps(
{
"status": "investigation_in_progress",
"step_number": 2,
"total_steps": 4,
"next_step_required": True,
"investigation_status": {
"files_checked": 2,
"relevant_files": 1,
"relevant_methods": 1,
"hypotheses_formed": 1,
"current_confidence": "high",
},
"continuation_id": "debug-test-uuid-123",
}
),
timestamp="2025-01-01T00:04:00Z",
tool_name="debug",
),
],
initial_context={},
)
# Get file list to verify prioritization
file_list = get_conversation_file_list(context)
assert file_list == ["/api/SessionManager.java", "/api/utils.py"]
# Mock file reading
def mock_read_file(file_path):
return f"// {file_path}\n// Mock content", 5
# Build history
from utils.model_context import ModelContext
model_context = ModelContext("flash")
history, tokens = build_conversation_history(context, model_context, read_files_func=mock_read_file)
# Verify investigation progression is preserved
assert "Step 1: Initial investigation" in history
assert "Step 2: Found dictionary modification issue" in history
assert '"step_number": 1' in history
assert '"step_number": 2' in history
assert '"current_confidence": "high"' in history
@pytest.mark.asyncio
async def test_debug_to_analyze_continuation(self):
"""Test continuation from debug tool to analyze tool."""
# Simulate debug tool creating initial investigation
debug_context = ThreadContext(
thread_id="debug-analyze-uuid-123",
created_at="2025-01-01T00:00:00Z",
last_updated_at="2025-01-01T00:10:00Z",
tool_name="debug",
turns=[
ConversationTurn(
role="user",
content="Final investigation step",
timestamp="2025-01-01T00:01:00Z",
tool_name="debug",
files=["/api/SessionManager.java"],
),
ConversationTurn(
role="assistant",
content=json.dumps(
{
"status": "calling_expert_analysis",
"investigation_complete": True,
"expert_analysis": {
"status": "analysis_complete",
"summary": "Dictionary modification during iteration bug",
"hypotheses": [
{
"name": "CONCURRENT_MODIFICATION",
"confidence": "High",
"root_cause": "Modifying dict while iterating",
"minimal_fix": "Create list of keys first",
}
],
},
"complete_investigation": {
"initial_issue": "Session validation failures",
"steps_taken": 3,
"files_examined": ["/api/SessionManager.java"],
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
},
}
),
timestamp="2025-01-01T00:02:00Z",
tool_name="debug",
),
],
initial_context={},
)
# Mock getting the thread
with patch("utils.conversation_memory.get_thread", return_value=debug_context):
# Mock file reading
def mock_read_file(file_path):
return "// SessionManager.java\n// cleanup_expired_sessions method", 10
# Build history for analyze tool
from utils.model_context import ModelContext
model_context = ModelContext("flash")
history, tokens = build_conversation_history(debug_context, model_context, read_files_func=mock_read_file)
# Verify analyze tool can see debug investigation
assert "calling_expert_analysis" in history
assert "CONCURRENT_MODIFICATION" in history
assert "Dictionary modification during iteration bug" in history
assert "SessionManager.cleanup_expired_sessions" in history
# Verify the continuation context is clear
assert "Thread: debug-analyze-uuid-123" in history
assert "Tool: debug" in history # Shows original tool
def test_debug_planner_style_formatting(self):
"""Test that debug tool uses similar formatting to planner for structured responses."""
# Create debug investigation with multiple steps
context = ThreadContext(
thread_id="debug-format-uuid-123",
created_at="2025-01-01T00:00:00Z",
last_updated_at="2025-01-01T00:15:00Z",
tool_name="debug",
turns=[
ConversationTurn(
role="user",
content="Step 1: Initial error analysis",
timestamp="2025-01-01T00:01:00Z",
tool_name="debug",
),
ConversationTurn(
role="assistant",
content=json.dumps(
{
"status": "investigation_in_progress",
"step_number": 1,
"total_steps": 3,
"next_step_required": True,
"output": {
"instructions": "Continue systematic investigation.",
"format": "systematic_investigation",
},
"continuation_id": "debug-format-uuid-123",
},
indent=2,
),
timestamp="2025-01-01T00:02:00Z",
tool_name="debug",
),
],
initial_context={},
)
# Build history
from utils.model_context import ModelContext
model_context = ModelContext("flash")
history, _ = build_conversation_history(context, model_context, read_files_func=lambda x: ("", 0))
# Verify structured format is preserved
assert '"status": "investigation_in_progress"' in history
assert '"format": "systematic_investigation"' in history
assert "--- Turn 1 (Claude using debug) ---" in history
assert "--- Turn 2 (Gemini using debug" in history
# The JSON structure should be preserved for tools to parse
# This allows other tools to understand the investigation state
turn_2_start = history.find("--- Turn 2 (Gemini using debug")
turn_2_content = history[turn_2_start:]
assert "{\n" in turn_2_content # JSON formatting preserved
assert '"continuation_id"' in turn_2_content


@@ -19,7 +19,8 @@ from config import MCP_PROMPT_SIZE_LIMIT
from tools.analyze import AnalyzeTool
from tools.chat import ChatTool
from tools.codereview import CodeReviewTool
-from tools.debug import DebugIssueTool
+# from tools.debug import DebugIssueTool # Commented out - debug tool refactored
from tools.precommit import Precommit
from tools.thinkdeep import ThinkDeepTool

@@ -250,25 +251,30 @@ class TestLargePromptHandling:
# The core fix ensures large prompts are detected at the right time
assert output["status"] in ["success", "files_required_to_continue", "resend_prompt"]

-@pytest.mark.asyncio
-async def test_debug_large_error_description(self, large_prompt):
-"""Test that debug tool detects large error_description."""
-tool = DebugIssueTool()
-result = await tool.execute({"prompt": large_prompt})
-assert len(result) == 1
-output = json.loads(result[0].text)
-assert output["status"] == "resend_prompt"
-@pytest.mark.asyncio
-async def test_debug_large_error_context(self, large_prompt, normal_prompt):
-"""Test that debug tool detects large error_context."""
-tool = DebugIssueTool()
-result = await tool.execute({"prompt": normal_prompt, "error_context": large_prompt})
-assert len(result) == 1
-output = json.loads(result[0].text)
-assert output["status"] == "resend_prompt"

+# NOTE: Debug tool tests have been commented out because the debug tool has been
+# refactored to use a self-investigation pattern instead of accepting a prompt field.
+# The new debug tool requires fields like: step, step_number, total_steps, next_step_required, findings
+# and doesn't have the "resend_prompt" functionality for large prompts.
+# @pytest.mark.asyncio
+# async def test_debug_large_error_description(self, large_prompt):
+# """Test that debug tool detects large error_description."""
+# tool = DebugIssueTool()
+# result = await tool.execute({"prompt": large_prompt})
+#
+# assert len(result) == 1
+# output = json.loads(result[0].text)
+# assert output["status"] == "resend_prompt"
+# @pytest.mark.asyncio
+# async def test_debug_large_error_context(self, large_prompt, normal_prompt):
+# """Test that debug tool detects large error_context."""
+# tool = DebugIssueTool()
+# result = await tool.execute({"prompt": normal_prompt, "error_context": large_prompt})
+#
+# assert len(result) == 1
+# output = json.loads(result[0].text)
+# assert output["status"] == "resend_prompt"

@pytest.mark.asyncio
async def test_analyze_large_question(self, large_prompt):


@@ -13,7 +13,8 @@ import pytest
from tools.analyze import AnalyzeTool
from tools.chat import ChatTool
from tools.codereview import CodeReviewTool
-from tools.debug import DebugIssueTool
+# from tools.debug import DebugIssueTool # Commented out - debug tool refactored
from tools.precommit import Precommit
from tools.thinkdeep import ThinkDeepTool

@@ -182,33 +183,37 @@ class TestPromptRegression:
output = json.loads(result[0].text)
assert output["status"] == "success"

-@pytest.mark.asyncio
-async def test_debug_normal_error(self, mock_model_response):
-"""Test debug tool with normal error description."""
-tool = DebugIssueTool()
-with patch.object(tool, "get_model_provider") as mock_get_provider:
-mock_provider = MagicMock()
-mock_provider.get_provider_type.return_value = MagicMock(value="google")
-mock_provider.supports_thinking_mode.return_value = False
-mock_provider.generate_content.return_value = mock_model_response(
-"Root cause: The variable is undefined. Fix: Initialize it..."
-)
-mock_get_provider.return_value = mock_provider
-result = await tool.execute(
-{
-"prompt": "TypeError: Cannot read property 'name' of undefined",
-"error_context": "at line 42 in user.js\n console.log(user.name)",
-"runtime_info": "Node.js v16.14.0",
-}
-)
-assert len(result) == 1
-output = json.loads(result[0].text)
-assert output["status"] == "success"
-assert "Next Steps:" in output["content"]
-assert "Root cause" in output["content"]

+# NOTE: Debug tool test has been commented out because the debug tool has been
+# refactored to use a self-investigation pattern instead of accepting prompt/error_context fields.
+# The new debug tool requires fields like: step, step_number, total_steps, next_step_required, findings
+# @pytest.mark.asyncio
+# async def test_debug_normal_error(self, mock_model_response):
+# """Test debug tool with normal error description."""
+# tool = DebugIssueTool()
+#
+# with patch.object(tool, "get_model_provider") as mock_get_provider:
+# mock_provider = MagicMock()
+# mock_provider.get_provider_type.return_value = MagicMock(value="google")
+# mock_provider.supports_thinking_mode.return_value = False
+# mock_provider.generate_content.return_value = mock_model_response(
+# "Root cause: The variable is undefined. Fix: Initialize it..."
+# )
+# mock_get_provider.return_value = mock_provider
+#
+# result = await tool.execute(
+# {
+# "prompt": "TypeError: Cannot read property 'name' of undefined",
+# "error_context": "at line 42 in user.js\n console.log(user.name)",
+# "runtime_info": "Node.js v16.14.0",
+# }
+# )
+#
+# assert len(result) == 1
+# output = json.loads(result[0].text)
+# assert output["status"] == "success"
+# assert "Next Steps:" in output["content"]
+# assert "Root cause" in output["content"]

@pytest.mark.asyncio
async def test_analyze_normal_question(self, mock_model_response):


@@ -6,7 +6,7 @@ import json
import pytest

-from tools import AnalyzeTool, ChatTool, CodeReviewTool, DebugIssueTool, ThinkDeepTool
+from tools import AnalyzeTool, ChatTool, CodeReviewTool, ThinkDeepTool

class TestThinkDeepTool:

@@ -183,94 +183,6 @@ class TestCodeReviewTool:
ModelProviderRegistry._instance = None
class TestDebugIssueTool:
"""Test the debug tool"""
@pytest.fixture
def tool(self):
return DebugIssueTool()
def test_tool_metadata(self, tool):
"""Test tool metadata"""
assert tool.get_name() == "debug"
assert "DEBUG & ROOT CAUSE ANALYSIS" in tool.get_description()
assert tool.get_default_temperature() == 0.2
schema = tool.get_input_schema()
assert "prompt" in schema["properties"]
assert schema["required"] == ["prompt"]
@pytest.mark.asyncio
async def test_execute_with_context(self, tool):
"""Test execution with error context using real integration testing"""
import importlib
import os
# Save original environment
original_env = {
"OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY"),
"DEFAULT_MODEL": os.environ.get("DEFAULT_MODEL"),
}
try:
# Set up environment for real provider resolution
os.environ["OPENAI_API_KEY"] = "sk-test-key-debug-context-test-not-real"
os.environ["DEFAULT_MODEL"] = "o3-mini"
# Clear other provider keys to isolate to OpenAI
for key in ["GEMINI_API_KEY", "XAI_API_KEY", "OPENROUTER_API_KEY"]:
os.environ.pop(key, None)
# Reload config and clear registry
import config
importlib.reload(config)
from providers.registry import ModelProviderRegistry
ModelProviderRegistry._instance = None
# Test with real provider resolution
try:
result = await tool.execute(
{
"prompt": "Test fails intermittently",
"error_context": "AssertionError in test_async",
"previous_attempts": "Added sleep, still fails",
"model": "o3-mini",
}
)
# If we get here, check the response format
assert len(result) == 1
# Should contain debug analysis
assert result[0].text is not None
except Exception as e:
# Expected: API call will fail with fake key
error_msg = str(e)
# Should NOT be a mock-related error
assert "MagicMock" not in error_msg
assert "'<' not supported between instances" not in error_msg
# Should be a real provider error
assert any(
phrase in error_msg
for phrase in ["API", "key", "authentication", "provider", "network", "connection"]
)
finally:
# Restore environment
for key, value in original_env.items():
if value is not None:
os.environ[key] = value
else:
os.environ.pop(key, None)
# Reload config and clear registry
importlib.reload(config)
ModelProviderRegistry._instance = None
class TestAnalyzeTool:
"""Test the analyze tool"""

@@ -400,23 +312,6 @@ class TestAbsolutePathValidation:
assert "must be FULL absolute paths" in response["content"]
assert "../parent/file.py" in response["content"]
@pytest.mark.asyncio
async def test_debug_tool_relative_path_rejected(self):
"""Test that debug tool rejects relative paths"""
tool = DebugIssueTool()
result = await tool.execute(
{
"prompt": "Something broke",
"files": ["src/main.py"], # relative path
}
)
assert len(result) == 1
response = json.loads(result[0].text)
assert response["status"] == "error"
assert "must be FULL absolute paths" in response["content"]
assert "src/main.py" in response["content"]
@pytest.mark.asyncio
async def test_thinkdeep_tool_relative_path_rejected(self):
"""Test that thinkdeep tool rejects relative paths"""


@@ -1,7 +1,9 @@
""" """
Debug Issue tool - Root cause analysis and debugging assistance Debug Issue tool - Root cause analysis and debugging assistance with systematic investigation
""" """
import json
import logging
from typing import TYPE_CHECKING, Any, Optional from typing import TYPE_CHECKING, Any, Optional
from pydantic import Field from pydantic import Field
@@ -14,155 +16,207 @@ from systemprompts import DEBUG_ISSUE_PROMPT
from .base import BaseTool, ToolRequest from .base import BaseTool, ToolRequest
# Field descriptions to avoid duplication between Pydantic and JSON schema logger = logging.getLogger(__name__)
DEBUG_FIELD_DESCRIPTIONS = {
"prompt": ( # Field descriptions for the investigation steps
"MANDATORY: You MUST first think deep about the issue, what it is, why it might be happening, what code might be involved, " DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS = {
"is it an error stemming out of the code directly or is it a side-effect of some part of the existing code. If it's an error " "step": (
"message, could it be coming from an external resource and NOT directly from the project? What part of the code seems most likely" "Your current investigation step. For the first step, describe the issue/error to investigate. "
"the culprit. MUST try and ZERO IN on the issue and surrounding code. Include all the details into the prompt that you can provide: " "For subsequent steps, describe what you're investigating, what code you're examining, "
"error messages, symptoms, when it occurs, steps to reproduce, environment details, " "what patterns you're looking for, or what hypothesis you're testing."
"recent changes, and any other relevant information. Mention any previous attempts at fixing this issue, "
"including any past fix that was in place but has now regressed. "
"The more context available, the better the analysis. "
"PERFORM SYSTEMATIC INVESTIGATION: You MUST begin by thinking hard and performing a thorough investigation using a systematic approach. "
"First understand the issue, find the code that may be causing it or code that is breaking, as well as any related code that could have caused this as a side effect. "
"You MUST maintain detailed investigation notes in a DEBUGGING_{issue_description}.md file within the project folder, "
"updating it as it performs step-by-step analysis of the code, trying to determine the actual root cause and understanding how a minimal, appropriate fix can be found. "
"This file MUST contain functions, methods, files visited OR determined to be part of the problem. You MUST update this and remove any references that it finds to be irrelevant during its investigation. "
"CRITICAL: If after thorough investigation You has very high confidence that NO BUG EXISTS that correlates to the reported symptoms, "
"You should consider the possibility that the reported issue may not actually be present, may be a misunderstanding, or may be conflated with something else entirely. "
"In such cases, you should gather more information from the user through targeted questioning rather than continue hunting for non-existent bugs. "
"Once complete, you MUST provide also pass in this file into the files parameter of this tool. "
"It is ESSENTIAL that this detailed work is performed by you before sharing all the relevant details with its development assistant. This will greatly help in zeroing in on the root cause."
), ),
"step_number": "Current step number in the investigation sequence (starts at 1)",
"total_steps": "Current estimate of total investigation steps needed (can be adjusted as investigation progresses)",
"next_step_required": "Whether another investigation step is required",
"findings": ( "findings": (
"You MUST first perform its own investigation, gather its findings and analysis. Include: steps taken to analyze the issue, " "Current findings from this investigation step. Include code patterns discovered, "
"code patterns discovered, initial hypotheses formed, any relevant classes/functions/methods examined, " "potential causes identified, hypotheses formed, or evidence gathered."
"and any preliminary conclusions. If investigation yields no concrete evidence of a bug correlating to the reported symptoms, "
"You should clearly state this finding and consider that the issue may not exist as described. "
"This provides context for the assistant model's analysis."
), ),
"files": ( "files_checked": (
"Essential files for debugging - ONLY include files that are directly related to the issue, " "List of files you've examined so far in the investigation (cumulative list). "
"contain the problematic code, or are necessary for understanding the root cause. " "Include all files you've looked at, even if they turned out to be irrelevant."
"This can include any relevant log files, error description documents, investigation documents, "
"Your own findings as a document, related code that may help with analysis."
"DO NOT include every file scanned during investigation (must be FULL absolute paths - DO NOT SHORTEN)."
), ),
"error_context": "Stack trace, snippet from logs, or additional error context. For very large text you MUST instead" "relevant_files": (
"save the context as a temporary file within the project folder and share it as a FULL absolute file path - DO NOT SHORTEN" "List of files that are definitely related to the issue (subset of files_checked). "
"reference to the files parameter.", "Only include files that contain code directly related to the problem."
"images": "Optional images showing error screens, UI issues, logs displays, or visual debugging information", ),
"relevant_methods": (
"List of specific methods/functions that are involved in the issue. "
"Format: 'ClassName.methodName' or 'functionName'"
),
"hypothesis": (
"Your current working hypothesis about the root cause. This can be updated/revised "
"as the investigation progresses."
),
"confidence": "Your confidence level in the current hypothesis: 'low', 'medium', or 'high'",
"backtrack_from_step": "If you need to revise a previous finding, which step number to backtrack from",
"continuation_id": "Thread continuation ID for multi-turn investigation sessions",
"images": (
"Optional images showing error screens, UI issues, logs displays, or visual debugging information "
"that help understand the issue (must be FULL absolute paths - DO NOT SHORTEN)"
),
}
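# Illustrative only (not part of this change): a minimal first investigation step using the
# required fields above might look like:
#
#     {
#         "step": "Investigate intermittent RuntimeError raised during session cleanup",
#         "step_number": 1,
#         "total_steps": 3,
#         "next_step_required": True,
#         "findings": "Error appears while iterating over the sessions dict",
#     }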
# Field descriptions for the final debug request
DEBUG_FIELD_DESCRIPTIONS = {
"initial_issue": "The original issue description that started the investigation",
"investigation_summary": "Complete summary of the systematic investigation performed",
"findings": "Consolidated findings from all investigation steps",
"files": "Essential files identified during investigation (must be FULL absolute paths - DO NOT SHORTEN)",
"error_context": "Stack trace, logs, or error context discovered during investigation",
"relevant_methods": "List of methods/functions identified as involved in the issue",
"hypothesis": "Final hypothesis about the root cause after investigation",
"images": "Optional images showing error screens, UI issues, or visual debugging information",
} }
class DebugIssueRequest(ToolRequest): class DebugInvestigationRequest(ToolRequest):
"""Request model for debug tool""" """Request model for debug investigation steps"""
prompt: str = Field(..., description=DEBUG_FIELD_DESCRIPTIONS["prompt"]) # Required fields for each investigation step
findings: Optional[str] = Field(None, description=DEBUG_FIELD_DESCRIPTIONS["findings"]) step: str = Field(..., description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["step"])
files: Optional[list[str]] = Field(None, description=DEBUG_FIELD_DESCRIPTIONS["files"]) step_number: int = Field(..., description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["step_number"])
error_context: Optional[str] = Field(None, description=DEBUG_FIELD_DESCRIPTIONS["error_context"]) total_steps: int = Field(..., description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["total_steps"])
images: Optional[list[str]] = Field(None, description=DEBUG_FIELD_DESCRIPTIONS["images"]) next_step_required: bool = Field(..., description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["next_step_required"])
# Investigation tracking fields
findings: str = Field(..., description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["findings"])
files_checked: list[str] = Field(
default_factory=list, description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["files_checked"]
)
relevant_files: list[str] = Field(
default_factory=list, description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["relevant_files"]
)
relevant_methods: list[str] = Field(
default_factory=list, description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["relevant_methods"]
)
hypothesis: Optional[str] = Field(None, description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["hypothesis"])
confidence: Optional[str] = Field("low", description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["confidence"])
# Optional backtracking field
backtrack_from_step: Optional[int] = Field(
None, description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["backtrack_from_step"]
)
# Optional continuation field
continuation_id: Optional[str] = Field(None, description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["continuation_id"])
# Optional images for visual debugging
images: Optional[list[str]] = Field(default=None, description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["images"])
# Override inherited fields to exclude them
model: Optional[str] = Field(default=None, exclude=True)
temperature: Optional[float] = Field(default=None, exclude=True)
thinking_mode: Optional[str] = Field(default=None, exclude=True)
use_websearch: Optional[bool] = Field(default=None, exclude=True)
class DebugIssueTool(BaseTool): class DebugIssueTool(BaseTool):
"""Advanced debugging and root cause analysis tool""" """Advanced debugging tool with systematic self-investigation"""
def __init__(self):
super().__init__()
self.investigation_history = []
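# Aggregate view across all investigation steps; rebuilt from investigation_history after backtracking (see _reprocess_consolidated_findings)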
self.consolidated_findings = {
"files_checked": set(),
"relevant_files": set(),
"relevant_methods": set(),
"findings": [],
"hypotheses": [],
"images": [],
}
def get_name(self) -> str: def get_name(self) -> str:
return "debug" return "debug"
def get_description(self) -> str: def get_description(self) -> str:
return ( return (
"DEBUG & ROOT CAUSE ANALYSIS - Expert debugging for complex issues with systematic investigation support. " "DEBUG & ROOT CAUSE ANALYSIS - Systematic self-investigation followed by expert analysis. "
"Use this when you need to debug code, find out why something is failing, identify root causes, " "This tool guides you through a step-by-step investigation process where you:\n\n"
"trace errors, or diagnose issues. " "1. Start with step 1: describe the issue to investigate\n"
"MANDATORY: Claud you MUST first think deep and follow these instructions when using this tool" "2. Continue with investigation steps: examine code, trace errors, test hypotheses\n"
"SYSTEMATIC INVESTIGATION WORKFLOW: " "3. Track findings, relevant files, and methods throughout\n"
"You MUST begin by thinking hard and performing a thorough investigation using a systematic approach. " "4. Update hypotheses as understanding evolves\n"
"First understand the issue, find the code that may be causing it or code that is breaking, as well as any related code that could have caused this as a side effect. " "5. Backtrack and revise findings when needed\n"
"You MUST maintain detailed investigation notes while it performs its analysis, " "6. Once investigation is complete, receive expert analysis\n\n"
"updating it as it performs step-by-step analysis of the code, trying to determine the actual root cause and understanding how a minimal, appropriate fix can be found. " "The tool enforces systematic investigation methodology:\n"
"This file MUST contain functions, methods, files visited OR determined to be part of the problem. You MUST update this and remove any references that it finds to be irrelevant during its investigation. " "- Methodical code examination and evidence collection\n"
"Once complete, You MUST provide Zen's debug tool with this file passed into the files parameter. " "- Hypothesis formation and validation\n"
"1. INVESTIGATE SYSTEMATICALLY: You MUST think and use a methodical approach to trace through error reports, " "- File and method tracking for context\n"
"examine code, and gather evidence step by step " "- Confidence assessment and revision capabilities\n\n"
"2. DOCUMENT FINDINGS: Maintain detailed investigation notes to " "Perfect for: complex bugs, mysterious errors, performance issues, "
"keep the user informed during its initial investigation. This investigation MUST be shared with this tool for the assistant " "race conditions, memory leaks, integration problems."
"to be able to help more effectively. "
"3. USE TRACER TOOL: For complex method calls, class references, or side effects use Zen's tracer tool and include its output as part of the "
"prompt or additional context "
"4. COLLECT EVIDENCE: Document important discoveries and validation attempts "
"5. PROVIDE COMPREHENSIVE FINDINGS: Pass complete findings to this tool for expert analysis "
"INVESTIGATION METHODOLOGY: "
"- Start with error messages/symptoms and work backwards to root cause "
"- Examine code flow and identify potential failure points "
"- Use tracer tool for complex method interactions and dependencies if and as needed but continue with the investigation after using it "
"- Test hypotheses against actual code and logs and confirm the idea holds "
"- Document everything systematically "
"- CRITICAL: If investigation yields no concrete evidence of a bug, consider that the reported issue may not exist as described and gather more information through questioning "
"ESSENTIAL FILES ONLY: Include only files (documents, code etc) directly related to the issue. "
"Focus on quality over quantity for assistant model analysis. "
"STRUCTURED OUTPUT: Assistant models return JSON responses with hypothesis "
"ranking, evidence correlation, and actionable fixes. "
"Choose thinking_mode based on issue complexity: 'low' for simple errors, "
"'medium' for standard debugging (default), 'high' for complex system issues, "
"'max' for extremely challenging bugs requiring deepest analysis. "
"Note: If you're not currently using a top-tier model such as Opus 4 or above, these tools can provide enhanced capabilities."
) )
def get_input_schema(self) -> dict[str, Any]: def get_input_schema(self) -> dict[str, Any]:
schema = { schema = {
"type": "object", "type": "object",
"properties": { "properties": {
"prompt": { # Investigation step fields
"step": {
"type": "string", "type": "string",
"description": DEBUG_FIELD_DESCRIPTIONS["prompt"], "description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["step"],
},
"step_number": {
"type": "integer",
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["step_number"],
"minimum": 1,
},
"total_steps": {
"type": "integer",
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["total_steps"],
"minimum": 1,
},
"next_step_required": {
"type": "boolean",
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["next_step_required"],
}, },
"model": self.get_model_field_schema(),
"findings": { "findings": {
"type": "string", "type": "string",
"description": DEBUG_FIELD_DESCRIPTIONS["findings"], "description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["findings"],
}, },
"files": { "files_checked": {
"type": "array", "type": "array",
"items": {"type": "string"}, "items": {"type": "string"},
"description": DEBUG_FIELD_DESCRIPTIONS["files"], "description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["files_checked"],
}, },
"error_context": { "relevant_files": {
"type": "array",
"items": {"type": "string"},
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["relevant_files"],
},
"relevant_methods": {
"type": "array",
"items": {"type": "string"},
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["relevant_methods"],
},
"hypothesis": {
"type": "string", "type": "string",
"description": DEBUG_FIELD_DESCRIPTIONS["error_context"], "description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["hypothesis"],
},
"confidence": {
"type": "string",
"enum": ["low", "medium", "high"],
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["confidence"],
},
"backtrack_from_step": {
"type": "integer",
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["backtrack_from_step"],
"minimum": 1,
},
"continuation_id": {
"type": "string",
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["continuation_id"],
}, },
"images": { "images": {
"type": "array", "type": "array",
"items": {"type": "string"}, "items": {"type": "string"},
"description": DEBUG_FIELD_DESCRIPTIONS["images"], "description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["images"],
},
"temperature": {
"type": "number",
"description": "Temperature (0-1, default 0.2 for accuracy)",
"minimum": 0,
"maximum": 1,
},
"thinking_mode": {
"type": "string",
"enum": ["minimal", "low", "medium", "high", "max"],
"description": "Thinking depth: minimal (0.5% of model max), low (8%), medium (33%), high (67%), max (100% of model max)",
},
"use_websearch": {
"type": "boolean",
"description": "Enable web search for documentation, best practices, and current information. Particularly useful for: brainstorming sessions, architectural design discussions, exploring industry best practices, working with specific frameworks/technologies, researching solutions to complex problems, or when current documentation and community insights would enhance the analysis.",
"default": True,
},
"continuation_id": {
"type": "string",
"description": "Thread continuation ID for multi-turn conversations. Can be used to continue conversations across different tools. Only provide this if continuing a previous conversation thread.",
}, },
}, },
"required": ["prompt"] + (["model"] if self.is_effective_auto_mode() else []), # Required fields for investigation
"required": ["step", "step_number", "total_steps", "next_step_required", "findings"],
} }
return schema return schema
def get_system_prompt(self) -> str: def get_system_prompt(self) -> str:
@@ -171,8 +225,6 @@ class DebugIssueTool(BaseTool):
def get_default_temperature(self) -> float: def get_default_temperature(self) -> float:
return TEMPERATURE_ANALYTICAL return TEMPERATURE_ANALYTICAL
# Line numbers are enabled by default from base class for precise error location
def get_model_category(self) -> "ToolModelCategory": def get_model_category(self) -> "ToolModelCategory":
"""Debug requires deep analysis and reasoning""" """Debug requires deep analysis and reasoning"""
from tools.models import ToolModelCategory from tools.models import ToolModelCategory
@@ -180,138 +232,342 @@ class DebugIssueTool(BaseTool):
return ToolModelCategory.EXTENDED_REASONING return ToolModelCategory.EXTENDED_REASONING
def get_request_model(self): def get_request_model(self):
return DebugIssueRequest return DebugInvestigationRequest
def requires_model(self) -> bool:
"""
Debug tool manages its own model interactions.
It doesn't need model during investigation steps, only for final analysis.
"""
return False
async def execute(self, arguments: dict[str, Any]) -> list:
"""
Override execute to implement self-investigation pattern.
Investigation Flow:
1. Claude calls debug with investigation steps
2. Tool tracks findings, files, methods progressively
3. Once investigation is complete, tool calls AI model for expert analysis
4. Returns structured response combining investigation + expert analysis
"""
from mcp.types import TextContent
from utils.conversation_memory import add_turn, create_thread
try:
# Validate request
request = DebugInvestigationRequest(**arguments)
# Adjust total steps if needed
if request.step_number > request.total_steps:
request.total_steps = request.step_number
# Handle continuation
continuation_id = request.continuation_id
# Create thread for first step
if not continuation_id and request.step_number == 1:
continuation_id = create_thread("debug", arguments)
# Store initial issue description
self.initial_issue = request.step
# Handle backtracking first if requested
if request.backtrack_from_step:
# Remove findings after the backtrack point
self.investigation_history = [
s for s in self.investigation_history if s["step_number"] < request.backtrack_from_step
]
# Reprocess consolidated findings to match truncated history
self._reprocess_consolidated_findings()
# Log if step number needs correction
expected_step_number = len(self.investigation_history) + 1
if request.step_number != expected_step_number:
logger.debug(
f"Step number adjusted from {request.step_number} to {expected_step_number} after backtracking"
)
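# Worked example of the backtracking rule above (step numbers are hypothetical):
# with steps 1-4 recorded and backtrack_from_step=3, steps 3 and 4 are discarded,
# consolidated findings are rebuilt from steps 1-2, and the incoming request is
# treated as the new step 3.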
# Process investigation step
step_data = {
"step": request.step,
"step_number": request.step_number,
"findings": request.findings,
"files_checked": request.files_checked,
"relevant_files": request.relevant_files,
"relevant_methods": request.relevant_methods,
"hypothesis": request.hypothesis,
"confidence": request.confidence,
"images": request.images,
}
# Store in history
self.investigation_history.append(step_data)
# Update consolidated findings
self.consolidated_findings["files_checked"].update(request.files_checked)
self.consolidated_findings["relevant_files"].update(request.relevant_files)
self.consolidated_findings["relevant_methods"].update(request.relevant_methods)
self.consolidated_findings["findings"].append(f"Step {request.step_number}: {request.findings}")
if request.hypothesis:
self.consolidated_findings["hypotheses"].append(
{"step": request.step_number, "hypothesis": request.hypothesis, "confidence": request.confidence}
)
if request.images:
self.consolidated_findings["images"].extend(request.images)
# Build response
response_data = {
"status": "investigation_in_progress",
"step_number": request.step_number,
"total_steps": request.total_steps,
"next_step_required": request.next_step_required,
"investigation_status": {
"files_checked": len(self.consolidated_findings["files_checked"]),
"relevant_files": len(self.consolidated_findings["relevant_files"]),
"relevant_methods": len(self.consolidated_findings["relevant_methods"]),
"hypotheses_formed": len(self.consolidated_findings["hypotheses"]),
"images_collected": len(set(self.consolidated_findings["images"])),
"current_confidence": request.confidence,
},
"output": {
"instructions": "Continue systematic investigation. Present findings clearly and proceed to next step if required.",
"format": "systematic_investigation",
},
}
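# Sketch of the intermediate response returned while next_step_required is True
# (counts below are placeholders, not real output):
#   {
#       "status": "investigation_in_progress",
#       "step_number": 1,
#       "total_steps": 3,
#       "next_step_required": True,
#       "investigation_status": {"files_checked": 1, "relevant_files": 1, "hypotheses_formed": 0, ...},
#       "output": {"format": "systematic_investigation", ...}
#   }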
if continuation_id:
response_data["continuation_id"] = continuation_id
# If investigation is complete, call the AI model for expert analysis
if not request.next_step_required:
response_data["status"] = "calling_expert_analysis"
response_data["investigation_complete"] = True
# Prepare consolidated investigation summary
investigation_summary = self._prepare_investigation_summary()
# Call the AI model with full context
expert_analysis = await self._call_expert_analysis(
initial_issue=getattr(self, "initial_issue", request.step),
investigation_summary=investigation_summary,
relevant_files=list(self.consolidated_findings["relevant_files"]),
relevant_methods=list(self.consolidated_findings["relevant_methods"]),
final_hypothesis=request.hypothesis,
error_context=self._extract_error_context(),
images=list(set(self.consolidated_findings["images"])), # Unique images
model_info=arguments.get("_model_context"),
model_override=arguments.get("model"), # Pass model selection from final step
)
# Combine investigation and expert analysis
response_data["expert_analysis"] = expert_analysis
response_data["complete_investigation"] = {
"initial_issue": getattr(self, "initial_issue", request.step),
"steps_taken": len(self.investigation_history),
"files_examined": list(self.consolidated_findings["files_checked"]),
"relevant_files": list(self.consolidated_findings["relevant_files"]),
"relevant_methods": list(self.consolidated_findings["relevant_methods"]),
"investigation_summary": investigation_summary,
}
response_data["next_steps"] = (
"Investigation complete with expert analysis. Present the findings, hypotheses, "
"and recommended fixes to the user. Focus on the most likely root cause and "
"provide actionable implementation guidance."
)
else:
response_data["next_steps"] = (
f"Continue investigation with step {request.step_number + 1}. "
f"Focus on: examining relevant code, testing hypotheses, gathering evidence."
)
# Store in conversation memory
if continuation_id:
add_turn(
thread_id=continuation_id,
role="assistant",
content=json.dumps(response_data, indent=2),
tool_name="debug",
files=list(self.consolidated_findings["relevant_files"]),
images=request.images,
)
raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**size_check).model_dump_json()}") return [TextContent(type="text", text=json.dumps(response_data, indent=2))]
if request.error_context: except Exception as e:
size_check = self.check_prompt_size(request.error_context) logger.error(f"Error in debug investigation: {e}", exc_info=True)
if size_check: error_data = {
from tools.models import ToolOutput "status": "investigation_failed",
"error": str(e),
"step_number": arguments.get("step_number", 0),
}
return [TextContent(type="text", text=json.dumps(error_data, indent=2))]
raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**size_check).model_dump_json()}") def _reprocess_consolidated_findings(self):
"""Reprocess consolidated findings after backtracking"""
self.consolidated_findings = {
"files_checked": set(),
"relevant_files": set(),
"relevant_methods": set(),
"findings": [],
"hypotheses": [],
"images": [],
}
for step in self.investigation_history:
self.consolidated_findings["files_checked"].update(step.get("files_checked", []))
self.consolidated_findings["relevant_files"].update(step.get("relevant_files", []))
self.consolidated_findings["relevant_methods"].update(step.get("relevant_methods", []))
self.consolidated_findings["findings"].append(f"Step {step['step_number']}: {step['findings']}")
if step.get("hypothesis"):
self.consolidated_findings["hypotheses"].append(
{
"step": step["step_number"],
"hypothesis": step["hypothesis"],
"confidence": step.get("confidence", "low"),
}
)
if step.get("images"):
self.consolidated_findings["images"].extend(step["images"])
def _prepare_investigation_summary(self) -> str:
"""Prepare a comprehensive summary of the investigation"""
summary_parts = [
"=== SYSTEMATIC INVESTIGATION SUMMARY ===",
f"Total steps: {len(self.investigation_history)}",
f"Files examined: {len(self.consolidated_findings['files_checked'])}",
f"Relevant files identified: {len(self.consolidated_findings['relevant_files'])}",
f"Methods/functions involved: {len(self.consolidated_findings['relevant_methods'])}",
"",
"=== INVESTIGATION PROGRESSION ===",
]
for finding in self.consolidated_findings["findings"]:
summary_parts.append(finding)
if self.consolidated_findings["hypotheses"]:
summary_parts.extend(
[
if request.error_context: "",
context_parts.append(f"\n=== ERROR CONTEXT/STACK TRACE ===\n{request.error_context}\n=== END CONTEXT ===") "=== HYPOTHESIS EVOLUTION ===",
]
)
for hyp in self.consolidated_findings["hypotheses"]:
summary_parts.append(f"Step {hyp['step']} ({hyp['confidence']} confidence): {hyp['hypothesis']}")
return "\n".join(summary_parts)
def _extract_error_context(self) -> Optional[str]:
"""Extract error context from investigation findings"""
error_patterns = ["error", "exception", "stack trace", "traceback", "failure"]
error_context_parts = []
for finding in self.consolidated_findings["findings"]:
if any(pattern in finding.lower() for pattern in error_patterns):
error_context_parts.append(finding)
return "\n".join(error_context_parts) if error_context_parts else None
async def _call_expert_analysis(
self,
initial_issue: str,
investigation_summary: str,
relevant_files: list[str],
relevant_methods: list[str],
final_hypothesis: Optional[str],
error_context: Optional[str],
images: list[str],
model_info: Optional[Any] = None,
model_override: Optional[str] = None,
) -> dict:
"""Call AI model for expert analysis of the investigation"""
# Prepare the debug prompt with all investigation context
prompt_parts = [
f"=== ISSUE DESCRIPTION ===\n{initial_issue}\n=== END DESCRIPTION ===",
f"\n=== CLAUDE'S INVESTIGATION FINDINGS ===\n{investigation_summary}\n=== END FINDINGS ===",
]
if error_context:
prompt_parts.append(f"\n=== ERROR CONTEXT/STACK TRACE ===\n{error_context}\n=== END CONTEXT ===")
if relevant_methods:
prompt_parts.append(
"\n=== RELEVANT METHODS/FUNCTIONS ===\n"
+ "\n".join(f"- {method}" for method in relevant_methods)
+ "\n=== END METHODS ==="
)
if final_hypothesis:
prompt_parts.append(f"\n=== FINAL HYPOTHESIS ===\n{final_hypothesis}\n=== END HYPOTHESIS ===")
if images:
prompt_parts.append(
"\n=== VISUAL DEBUGGING INFORMATION ===\n"
+ "\n".join(f"- {img}" for img in images)
+ "\n=== END VISUAL INFORMATION ==="
)
# Add file content if we have relevant files
if relevant_files:
file_content, _ = self._prepare_file_content_for_prompt(relevant_files, None, "Essential debugging files")
if file_content:
prompt_parts.append(
f"\n=== ESSENTIAL FILES FOR DEBUGGING ===\n{file_content}\n=== END ESSENTIAL FILES ==="
)
full_prompt = "\n".join(prompt_parts)
# Get appropriate model and provider
from config import DEFAULT_MODEL
from providers.registry import ModelProviderRegistry
model_name = model_override or DEFAULT_MODEL # Use override if provided
provider = ModelProviderRegistry.get_provider_for_model(model_name)
if not provider:
return {"error": f"No provider available for model {model_name}", "status": "provider_error"}
# Generate AI response
try:
full_analysis_prompt = f"{self.get_system_prompt()}\n\n{full_prompt}\n\nPlease debug this issue following the structured format in the system prompt."
# Prepare generation kwargs
generation_kwargs = {
"prompt": full_analysis_prompt,
"model_name": model_name,
"system_prompt": "", # Already included in prompt
"temperature": self.get_default_temperature(),
"thinking_mode": "high", # High thinking for debug analysis
}
# Add images if available
if images:
generation_kwargs["images"] = images
model_response = provider.generate_content(**generation_kwargs)
if model_response.content:
# Try to parse as JSON
try:
analysis_result = json.loads(model_response.content.strip())
return analysis_result
except json.JSONDecodeError:
# Return as text if not valid JSON
return {
"status": "analysis_complete",
"raw_analysis": model_response.content,
"parse_error": "Response was not valid JSON",
}
else:
return {"error": "No response from model", "status": "empty_response"}
except Exception as e:
logger.error(f"Error calling expert analysis: {e}", exc_info=True)
return {"error": str(e), "status": "analysis_error"}
# Stub implementations for base class requirements
async def prepare_prompt(self, request) -> str:
return "" # Not used - execute() is overridden
def format_response(self, response: str, request, model_info: dict = None) -> str:
return response # Not used - execute() is overridden