Re-imagined and re-written Debug tool. Instead of prompting Claude to perform initial analysis (and hoping it did), the tool now works through the debug process as an 'investigation', encouraging Claude to record its 'findings' and 'hypothesis' step by step, backtrack when needed, and keep track of both the files it has gone through and the files relevant to the issue at hand. This structured investigation is then passed to the other model with far greater insight than the original debug tool could ever provide.

Improved prompts: guard against overengineering and flag it as an anti-pattern
Fahad
2025-06-19 10:22:30 +04:00
parent 2641c78f8d
commit fccfb0d999
16 changed files with 2243 additions and 707 deletions
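
To illustrate the workflow described above, here is a minimal, hypothetical sketch of a single investigation step. The field names are taken from the tests added in this commit; the concrete values are made up for illustration.

# Hypothetical arguments for one step of the structured investigation
# (field names match the new DebugInvestigationRequest; values are illustrative).
step_two_arguments = {
    "step": "Examine cleanup_expired_sessions for the reported RuntimeError",
    "step_number": 2,
    "total_steps": 4,
    "next_step_required": True,  # False on the final step triggers expert analysis
    "findings": "Dictionary is modified while being iterated in cleanup_expired_sessions",
    "files_checked": ["/api/session_manager.py"],
    "relevant_files": ["/api/session_manager.py"],
    "relevant_methods": ["SessionManager.cleanup_expired_sessions"],
    "hypothesis": "del inside .items() iteration raises RuntimeError",
    "confidence": "high",
    "continuation_id": "debug-uuid-123",  # ties this step to the ongoing investigation
}

Each step updates the tool's consolidated findings; once next_step_required is False, the accumulated investigation is handed to the expert model for analysis, as the tests below exercise.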


@@ -1,21 +1,23 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
Debug Tool Validation Test Debug Tool Self-Investigation Validation Test
Tests the debug tool with real bugs to validate: Tests the debug tool's systematic self-investigation capabilities including:
- Proper execution with flash model - Step-by-step investigation with proper JSON responses
- Actual bug identification and analysis - Progressive tracking of findings, files, and methods
- Hypothesis generation for root causes - Hypothesis formation and confidence tracking
- Log validation for tool execution - Backtracking and revision capabilities
- Final expert analysis after investigation completion
""" """
import json import json
from typing import Optional
from .base_test import BaseSimulatorTest from .base_test import BaseSimulatorTest
class DebugValidationTest(BaseSimulatorTest): class DebugValidationTest(BaseSimulatorTest):
"""Test debug tool with actual bug scenarios""" """Test debug tool's self-investigation and expert analysis features"""
@property @property
def test_name(self) -> str: def test_name(self) -> str:
@@ -23,23 +25,48 @@ class DebugValidationTest(BaseSimulatorTest):
@property @property
def test_description(self) -> str: def test_description(self) -> str:
return "Debug tool validation with actual bugs" return "Debug tool self-investigation pattern validation"
def run_test(self) -> bool: def run_test(self) -> bool:
"""Test debug tool with real bugs""" """Test debug tool self-investigation capabilities"""
try: try:
self.logger.info("Test: Debug tool validation") self.logger.info("Test: Debug tool self-investigation validation")
# Setup test files directory first # Setup test files directory first
self.setup_test_files() self.setup_test_files()
# Create a Python file with a subtle but realistic bug # Create a Python file with a subtle but realistic bug
buggy_code = """#!/usr/bin/env python3 self._create_buggy_code()
# Test 1: Single investigation session with multiple steps
if not self._test_single_investigation_session():
return False
# Test 2: Investigation with backtracking
if not self._test_investigation_with_backtracking():
return False
# Test 3: Complete investigation with expert analysis
if not self._test_complete_investigation_with_analysis():
return False
self.logger.info(" ✅ All debug validation tests passed")
return True
except Exception as e:
self.logger.error(f"Debug validation test failed: {e}")
return False
finally:
self.cleanup_test_files()
def _create_buggy_code(self):
"""Create test files with a subtle bug for debugging"""
# Create a Python file with dictionary iteration bug
buggy_code = """#!/usr/bin/env python3
import json import json
import requests
from datetime import datetime, timedelta from datetime import datetime, timedelta
class UserSessionManager: class SessionManager:
def __init__(self): def __init__(self):
self.active_sessions = {} self.active_sessions = {}
self.session_timeout = 30 * 60 # 30 minutes in seconds self.session_timeout = 30 * 60 # 30 minutes in seconds
@@ -52,7 +79,6 @@ class UserSessionManager:
'user_id': user_id, 'user_id': user_id,
'user_data': user_data, 'user_data': user_data,
'created_at': datetime.now(), 'created_at': datetime.now(),
'last_activity': datetime.now(),
'expires_at': datetime.now() + timedelta(seconds=self.session_timeout) 'expires_at': datetime.now() + timedelta(seconds=self.session_timeout)
} }
@@ -72,322 +98,356 @@ class UserSessionManager:
del self.active_sessions[session_id] del self.active_sessions[session_id]
return False return False
# Update last activity
session['last_activity'] = current_time
return True return True
def cleanup_expired_sessions(self): def cleanup_expired_sessions(self):
\"\"\"Remove expired sessions from memory\"\"\" \"\"\"Remove expired sessions from memory\"\"\"
current_time = datetime.now() current_time = datetime.now()
expired_sessions = [] expired_count = 0
# BUG: Modifying dictionary while iterating over it
for session_id, session in self.active_sessions.items(): for session_id, session in self.active_sessions.items():
if current_time > session['expires_at']: if current_time > session['expires_at']:
expired_sessions.append(session_id) del self.active_sessions[session_id] # This causes RuntimeError
expired_count += 1
for session_id in expired_sessions: return expired_count
del self.active_sessions[session_id]
return len(expired_sessions)
class APIHandler:
def __init__(self):
self.session_manager = UserSessionManager()
self.request_count = 0
def authenticate_user(self, username, password):
\"\"\"Authenticate user and create session\"\"\"
# Simulate API call to auth service
auth_response = self._call_auth_service(username, password)
if auth_response.get('success'):
user_data = auth_response.get('user_data', {})
session_id = self.session_manager.create_session(
user_data['id'], user_data
)
return {'success': True, 'session_id': session_id}
return {'success': False, 'error': 'Authentication failed'}
def process_request(self, session_id, request_data):
\"\"\"Process an API request with session validation\"\"\"
self.request_count += 1
# Validate session before processing
if not self.session_manager.validate_session(session_id):
return {'error': 'Invalid or expired session', 'code': 401}
# Simulate request processing
try:
result = self._process_business_logic(request_data)
return {'success': True, 'data': result}
except Exception as e:
return {'error': str(e), 'code': 500}
def _call_auth_service(self, username, password):
\"\"\"Simulate external authentication service call\"\"\"
# Simulate network delay and response
import time
time.sleep(0.1)
# Mock successful authentication
if username and password:
return {
'success': True,
'user_data': {
'id': hash(username) % 10000,
'username': username,
'roles': ['user']
}
}
return {'success': False}
def _process_business_logic(self, request_data):
\"\"\"Simulate business logic processing\"\"\"
if not request_data:
raise ValueError("Invalid request data")
# Simulate some processing
return {
'processed_at': datetime.now().isoformat(),
'request_id': self.request_count,
'status': 'completed'
}
# Global API handler instance
api_handler = APIHandler()
def handle_api_request(session_id, request_data):
\"\"\"Main API request handler\"\"\"
return api_handler.process_request(session_id, request_data)
""" """
# Create test file with subtle bug # Create test file with subtle bug
test_file = self.create_additional_test_file("session_manager.py", buggy_code) self.buggy_file = self.create_additional_test_file("session_manager.py", buggy_code)
self.logger.info(f" ✅ Created test file with subtle bug: {test_file}") self.logger.info(f" ✅ Created test file with subtle bug: {self.buggy_file}")
# Create a realistic problem description with subtle symptoms # Create error description
error_description = """ISSUE DESCRIPTION: error_description = """ISSUE DESCRIPTION:
Our API service is experiencing intermittent session validation failures in production. Our session management system is experiencing intermittent failures during cleanup operations.
SYMPTOMS OBSERVED: SYMPTOMS:
- Users randomly get "Invalid or expired session" errors even with valid sessions - Random RuntimeError: dictionary changed size during iteration
- The issue happens more frequently during high-traffic periods - Occurs during high load when many sessions expire simultaneously
- Sessions that should still be valid (created < 30 minutes ago) are being rejected - Error happens in cleanup_expired_sessions method
- The problem occurs maybe 2-3% of requests but is hard to reproduce consistently - Affects about 5% of cleanup operations
- Server logs show session validation failing but no clear pattern
ENVIRONMENT: ERROR LOG:
- Python 3.13 API service RuntimeError: dictionary changed size during iteration
- Running in production with multiple concurrent users File "session_manager.py", line 44, in cleanup_expired_sessions
- Redis not used for session storage (in-memory only) for session_id, session in self.active_sessions.items():
- Load balancer distributes requests across multiple instances """
RECENT CHANGES: self.error_file = self.create_additional_test_file("error_description.txt", error_description)
- Increased session timeout from 15 to 30 minutes last week self.logger.info(f" ✅ Created error description file: {self.error_file}")
- Added cleanup routine to remove expired sessions
- No major code changes to session management
USER IMPACT: def _test_single_investigation_session(self) -> bool:
- Users have to re-authenticate randomly """Test a complete investigation session with multiple steps"""
- Affects user experience and causes complaints try:
- Seems to happen more on busy days self.logger.info(" 1.1: Testing single investigation session")
The code looks correct to me, but something is causing valid sessions to be treated as expired or invalid. I'm not sure what's causing this intermittent behavior.""" # Step 1: Start investigation
self.logger.info(" 1.1.1: Step 1 - Initial investigation")
error_file = self.create_additional_test_file("error_description.txt", error_description) response1, continuation_id = self.call_mcp_tool(
self.logger.info(f" ✅ Created error description file: {error_file}")
# Call debug tool with flash model and realistic problem description
self.logger.info(" 🔍 Calling debug tool to investigate session validation issues...")
response, continuation_id = self.call_mcp_tool(
"debug", "debug",
{ {
"prompt": "Investigate why our API is experiencing intermittent session validation failures in production", "step": "I need to investigate intermittent RuntimeError during session cleanup. Let me start by examining the error description and understanding the symptoms.",
"files": [test_file, error_file], "step_number": 1,
"findings": "Users getting 'Invalid or expired session' errors randomly, occurs more during high traffic, sessions should still be valid", "total_steps": 4,
"error_context": "Sessions created < 30 minutes ago being rejected, happens ~2-3% of requests, load balanced environment", "next_step_required": True,
"systematic_investigation": True, "findings": "RuntimeError occurs during dictionary iteration in cleanup_expired_sessions method. Error happens intermittently during high load.",
"model": "flash", "files_checked": [self.error_file],
"thinking_mode": "medium", "relevant_files": [self.error_file],
}, },
) )
if not response: if not response1 or not continuation_id:
self.logger.error("Failed to get debug response") self.logger.error("Failed to get initial investigation response")
return False return False
self.logger.info(" ✅ Got debug response") # Parse and validate JSON response
response1_data = self._parse_debug_response(response1)
if not response1_data:
return False
# Parse response to validate bug identification # Validate step 1 response structure
try: if not self._validate_step_response(response1_data, 1, 4, True, "investigation_in_progress"):
response_data = json.loads(response) return False
self.logger.debug(f"Response keys: {list(response_data.keys())}")
# Extract the actual content if it's wrapped self.logger.info(f" ✅ Step 1 successful, continuation_id: {continuation_id}")
if "content" in response_data:
content = response_data["content"]
# Handle markdown JSON blocks
if content.startswith("```json"):
content = content[7:]
if content.endswith("```"):
content = content[:-3]
content = content.strip()
# Parse the inner JSON # Step 2: Examine the code
inner_data = json.loads(content) self.logger.info(" 1.1.2: Step 2 - Code examination")
self.logger.debug(f"Inner data keys: {list(inner_data.keys())}") response2, _ = self.call_mcp_tool(
else: "debug",
inner_data = response_data {
"step": "Now examining the session_manager.py file to understand the cleanup_expired_sessions implementation and identify the root cause.",
"step_number": 2,
"total_steps": 4,
"next_step_required": True,
"findings": "Found the issue: cleanup_expired_sessions modifies self.active_sessions dictionary while iterating over it with .items(). This causes RuntimeError when del is called during iteration.",
"files_checked": [self.error_file, self.buggy_file],
"relevant_files": [self.buggy_file],
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
"hypothesis": "Dictionary is being modified during iteration causing RuntimeError",
"confidence": "high",
"continuation_id": continuation_id,
},
)
# Check for structured debug analysis (should have analysis_complete status) if not response2:
if inner_data.get("status") == "analysis_complete": self.logger.error("Failed to continue investigation to step 2")
self.logger.info(" ✅ Got structured debug analysis") return False
# Validate hypothesis generation response2_data = self._parse_debug_response(response2)
hypotheses = inner_data.get("hypotheses", []) if not self._validate_step_response(response2_data, 2, 4, True, "investigation_in_progress"):
if not hypotheses: return False
self.logger.error("No hypotheses found in debug analysis")
return False
self.logger.info(f" 🧠 Found {len(hypotheses)} hypotheses") # Check investigation status tracking
investigation_status = response2_data.get("investigation_status", {})
if investigation_status.get("files_checked", 0) < 2:
self.logger.error("Files checked count not properly tracked")
return False
# Check if the model identified the real bug: dictionary modification during iteration if investigation_status.get("relevant_methods", 0) != 1:
analysis_text = json.dumps(inner_data).lower() self.logger.error("Relevant methods not properly tracked")
return False
# Look for the actual bug - modifying dictionary while iterating if investigation_status.get("current_confidence") != "high":
bug_indicators = [ self.logger.error("Confidence level not properly tracked")
"dictionary", return False
"iteration",
"modify",
"concurrent",
"runtime error",
"dictionary changed size during iteration",
"cleanup_expired_sessions",
"active_sessions",
"del",
"removing while iterating",
]
found_indicators = [indicator for indicator in bug_indicators if indicator in analysis_text] self.logger.info(" ✅ Step 2 successful with proper tracking")
# Check for specific mentions of the problematic pattern # Step 3: Validate hypothesis
dictionary_bug_patterns = [ self.logger.info(" 1.1.3: Step 3 - Hypothesis validation")
"modifying dictionary while iterating", response3, _ = self.call_mcp_tool(
"dictionary changed size", "debug",
"concurrent modification", {
"iterating over dictionary", "step": "Confirming the bug pattern: the for loop iterates over self.active_sessions.items() while del self.active_sessions[session_id] modifies the dictionary inside the loop.",
"del.*active_sessions", "step_number": 3,
"cleanup.*iteration", "total_steps": 4,
] "next_step_required": True,
"findings": "Confirmed: Line 44-47 shows classic dictionary modification during iteration bug. The fix would be to collect expired session IDs first, then delete them after iteration completes.",
"files_checked": [self.buggy_file],
"relevant_files": [self.buggy_file],
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
"hypothesis": "Dictionary modification during iteration in cleanup_expired_sessions causes RuntimeError",
"confidence": "high",
"continuation_id": continuation_id,
},
)
import re if not response3:
self.logger.error("Failed to continue investigation to step 3")
return False
pattern_matches = [] response3_data = self._parse_debug_response(response3)
for pattern in dictionary_bug_patterns: if not self._validate_step_response(response3_data, 3, 4, True, "investigation_in_progress"):
if re.search(pattern, analysis_text): return False
pattern_matches.append(pattern)
if len(found_indicators) >= 3 or len(pattern_matches) >= 1: self.logger.info(" ✅ Investigation session progressing successfully")
self.logger.info(" ✅ Flash identified the dictionary iteration bug")
self.logger.info(f" Found indicators: {found_indicators[:3]}")
if pattern_matches:
self.logger.info(f" Pattern matches: {pattern_matches}")
else:
self.logger.error(" ❌ Flash missed the dictionary iteration bug")
self.logger.error(f" Found only: {found_indicators}")
return False
# Validate hypothesis quality (should have confidence levels and reasoning) # Store continuation_id for next test
valid_hypotheses = 0 self.investigation_continuation_id = continuation_id
for i, hypothesis in enumerate(hypotheses[:3]): # Check top 3 return True
confidence = hypothesis.get("confidence", "").lower()
reasoning = hypothesis.get("reasoning", "")
if confidence in ["high", "medium", "low"] and len(reasoning) > 20: except Exception as e:
valid_hypotheses += 1 self.logger.error(f"Single investigation session test failed: {e}")
self.logger.debug(f" Hypothesis {i+1}: {confidence} confidence, good reasoning") return False
else:
self.logger.debug(f" Hypothesis {i+1}: weak ({confidence}, {len(reasoning)} chars)")
if valid_hypotheses >= 2: def _test_investigation_with_backtracking(self) -> bool:
self.logger.info(f" ✅ Found {valid_hypotheses} well-structured hypotheses") """Test investigation with backtracking to revise findings"""
else: try:
self.logger.error(f" ❌ Only {valid_hypotheses} well-structured hypotheses") self.logger.info(" 1.2: Testing investigation with backtracking")
return False
# Check for line-specific references # Start a new investigation for testing backtracking
if "line" in analysis_text or "lines" in analysis_text: self.logger.info(" 1.2.1: Start investigation for backtracking test")
self.logger.info(" 📍 Analysis includes line-specific references") response1, continuation_id = self.call_mcp_tool(
else: "debug",
self.logger.warning(" ⚠️ No line-specific references found") {
"step": "Investigating performance degradation in data processing pipeline",
"step_number": 1,
"total_steps": 4,
"next_step_required": True,
"findings": "Initial analysis shows slow database queries",
"files_checked": ["/db/queries.py"],
"relevant_files": ["/db/queries.py"],
},
)
else: if not response1 or not continuation_id:
# Non-structured response - check for dictionary iteration bug identification self.logger.error("Failed to start backtracking test investigation")
self.logger.info(" 📝 Got general debug response") return False
response_text = response.lower() # Step 2: Wrong direction
self.logger.info(" 1.2.2: Step 2 - Wrong investigation path")
response2, _ = self.call_mcp_tool(
"debug",
{
"step": "Focusing on database optimization strategies",
"step_number": 2,
"total_steps": 4,
"next_step_required": True,
"findings": "Database queries seem optimized, might be looking in wrong place",
"files_checked": ["/db/queries.py", "/db/indexes.py"],
"relevant_files": [],
"hypothesis": "Database performance issues",
"confidence": "low",
"continuation_id": continuation_id,
},
)
# Check for the specific bug in general response if not response2:
bug_indicators = [ self.logger.error("Failed to continue to step 2")
"dictionary", return False
"iteration",
"modify",
"concurrent",
"active_sessions",
"cleanup",
"del ",
"removing",
"changed size",
]
found_indicators = [indicator for indicator in bug_indicators if indicator in response_text] # Step 3: Backtrack from step 2
self.logger.info(" 1.2.3: Step 3 - Backtrack and revise approach")
response3, _ = self.call_mcp_tool(
"debug",
{
"step": "Backtracking - the issue might not be database related. Let me investigate the data processing algorithm instead.",
"step_number": 3,
"total_steps": 4,
"next_step_required": True,
"findings": "Found inefficient nested loops in data processor causing O(n²) complexity",
"files_checked": ["/processor/algorithm.py"],
"relevant_files": ["/processor/algorithm.py"],
"relevant_methods": ["DataProcessor.process_batch"],
"hypothesis": "Inefficient algorithm causing performance issues",
"confidence": "medium",
"backtrack_from_step": 2, # Backtrack from step 2
"continuation_id": continuation_id,
},
)
if len(found_indicators) >= 3: if not response3:
self.logger.info(f" ✅ Found {len(found_indicators)} relevant indicators in response") self.logger.error("Failed to backtrack")
self.logger.info(f" Found: {found_indicators}") return False
else:
self.logger.error(f" ❌ Only found {len(found_indicators)} relevant indicators")
self.logger.error(f" Found: {found_indicators}")
return False
except json.JSONDecodeError as e: response3_data = self._parse_debug_response(response3)
self.logger.error(f"Failed to parse debug response as JSON: {e}") if not self._validate_step_response(response3_data, 3, 4, True, "investigation_in_progress"):
# For non-JSON responses, check for dictionary iteration bug return False
response_text = response.lower()
bug_indicators = [ self.logger.info(" ✅ Backtracking working correctly")
"dictionary", return True
"iteration",
"modify",
"concurrent",
"active_sessions",
"cleanup",
"del ",
"removing",
]
found_indicators = [indicator for indicator in bug_indicators if indicator in response_text] except Exception as e:
self.logger.error(f"Backtracking test failed: {e}")
return False
if len(found_indicators) >= 3: def _test_complete_investigation_with_analysis(self) -> bool:
self.logger.info(f"Text response found {len(found_indicators)} relevant indicators") """Test complete investigation ending with expert analysis"""
else: try:
self.logger.error(f" ❌ Text response only found {len(found_indicators)} relevant indicators") self.logger.info(" 1.3: Testing complete investigation with expert analysis")
# Use the continuation from first test
continuation_id = getattr(self, "investigation_continuation_id", None)
if not continuation_id:
# Start fresh if no continuation available
self.logger.info(" 1.3.0: Starting fresh investigation")
response0, continuation_id = self.call_mcp_tool(
"debug",
{
"step": "Investigating the dictionary iteration bug in session cleanup",
"step_number": 1,
"total_steps": 2,
"next_step_required": True,
"findings": "Found dictionary modification during iteration",
"files_checked": [self.buggy_file],
"relevant_files": [self.buggy_file],
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
},
)
if not response0 or not continuation_id:
self.logger.error("Failed to start fresh investigation")
return False return False
# Final step - trigger expert analysis
self.logger.info(" 1.3.1: Final step - complete investigation")
response_final, _ = self.call_mcp_tool(
"debug",
{
"step": "Investigation complete. The root cause is confirmed: cleanup_expired_sessions modifies the dictionary while iterating, causing RuntimeError.",
"step_number": 2,
"total_steps": 2,
"next_step_required": False, # Final step - triggers expert analysis
"findings": "Root cause identified: del self.active_sessions[session_id] on line 46 modifies dictionary during iteration starting at line 44. Fix: collect expired IDs first, then delete.",
"files_checked": [self.buggy_file],
"relevant_files": [self.buggy_file],
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
"hypothesis": "Dictionary modification during iteration causes RuntimeError in cleanup_expired_sessions",
"confidence": "high",
"continuation_id": continuation_id,
"model": "flash", # Use flash for expert analysis
},
)
if not response_final:
self.logger.error("Failed to complete investigation")
return False
response_final_data = self._parse_debug_response(response_final)
if not response_final_data:
return False
# Validate final response structure
if response_final_data.get("status") != "calling_expert_analysis":
self.logger.error(
f"Expected status 'calling_expert_analysis', got '{response_final_data.get('status')}'"
)
return False
if not response_final_data.get("investigation_complete"):
self.logger.error("Expected investigation_complete=true for final step")
return False
# Check for expert analysis
if "expert_analysis" not in response_final_data:
self.logger.error("Missing expert_analysis in final response")
return False
expert_analysis = response_final_data.get("expert_analysis", {})
# Check for expected analysis content (checking common patterns)
analysis_text = json.dumps(expert_analysis).lower()
# Look for bug identification
bug_indicators = ["dictionary", "iteration", "modify", "runtime", "error", "del"]
found_indicators = sum(1 for indicator in bug_indicators if indicator in analysis_text)
if found_indicators >= 3:
self.logger.info(" ✅ Expert analysis identified the bug correctly")
else:
self.logger.warning(
f" ⚠️ Expert analysis may not have fully identified the bug (found {found_indicators}/6 indicators)"
)
# Check complete investigation summary
if "complete_investigation" not in response_final_data:
self.logger.error("Missing complete_investigation in final response")
return False
complete_investigation = response_final_data["complete_investigation"]
if not complete_investigation.get("relevant_methods"):
self.logger.error("Missing relevant methods in complete investigation")
return False
if "SessionManager.cleanup_expired_sessions" not in complete_investigation["relevant_methods"]:
self.logger.error("Expected method not found in investigation summary")
return False
self.logger.info(" ✅ Complete investigation with expert analysis successful")
# Validate logs # Validate logs
self.logger.info(" 📋 Validating execution logs...") self.logger.info(" 📋 Validating execution logs...")
# Get server logs using inherited method # Get server logs
logs = self.get_recent_server_logs(500) logs = self.get_recent_server_logs(500)
# Look for debug tool execution patterns # Look for debug tool execution patterns
debug_patterns = [ debug_patterns = [
"debug tool", "debug tool",
"[DEBUG]", "investigation",
"systematic investigation", "Expert analysis",
"Token budget", "calling_expert_analysis",
"Essential files for debugging",
] ]
patterns_found = 0 patterns_found = 0
@@ -396,34 +456,101 @@ The code looks correct to me, but something is causing valid sessions to be trea
patterns_found += 1 patterns_found += 1
self.logger.debug(f" ✅ Found log pattern: {pattern}") self.logger.debug(f" ✅ Found log pattern: {pattern}")
if patterns_found >= 3: if patterns_found >= 2:
self.logger.info(f" ✅ Log validation passed ({patterns_found}/{len(debug_patterns)} patterns)") self.logger.info(f" ✅ Log validation passed ({patterns_found}/{len(debug_patterns)} patterns)")
else: else:
self.logger.warning(f" ⚠️ Only found {patterns_found}/{len(debug_patterns)} log patterns") self.logger.warning(f" ⚠️ Only found {patterns_found}/{len(debug_patterns)} log patterns")
# Test continuation if available
if continuation_id:
self.logger.info(" 🔄 Testing debug continuation...")
follow_up_response, _ = self.call_mcp_tool(
"debug",
{
"prompt": "Based on your analysis, which bug should we fix first and how?",
"continuation_id": continuation_id,
"model": "flash",
},
)
if follow_up_response:
self.logger.info(" ✅ Debug continuation worked")
else:
self.logger.warning(" ⚠️ Debug continuation failed")
self.logger.info(" ✅ Debug tool validation completed successfully")
return True return True
except Exception as e: except Exception as e:
self.logger.error(f"Debug validation test failed: {e}") self.logger.error(f"Complete investigation test failed: {e}")
return False
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
"""Call an MCP tool via standalone server - override for debug-specific response handling"""
# Use parent implementation to get the raw response
response_text, _ = super().call_mcp_tool(tool_name, params)
if not response_text:
return None, None
# Extract continuation_id from debug response specifically
continuation_id = self._extract_debug_continuation_id(response_text)
return response_text, continuation_id
def _extract_debug_continuation_id(self, response_text: str) -> Optional[str]:
"""Extract continuation_id from debug response"""
try:
# Parse the response
response_data = json.loads(response_text)
return response_data.get("continuation_id")
except json.JSONDecodeError as e:
self.logger.debug(f"Failed to parse response for debug continuation_id: {e}")
return None
def _parse_debug_response(self, response_text: str) -> dict:
"""Parse debug tool JSON response"""
try:
# Parse the response - it should be direct JSON
return json.loads(response_text)
except json.JSONDecodeError as e:
self.logger.error(f"Failed to parse debug response as JSON: {e}")
self.logger.error(f"Response text: {response_text[:500]}...")
return {}
def _validate_step_response(
self,
response_data: dict,
expected_step: int,
expected_total: int,
expected_next_required: bool,
expected_status: str,
) -> bool:
"""Validate a debug investigation step response structure"""
try:
# Check status
if response_data.get("status") != expected_status:
self.logger.error(f"Expected status '{expected_status}', got '{response_data.get('status')}'")
return False
# Check step number
if response_data.get("step_number") != expected_step:
self.logger.error(f"Expected step_number {expected_step}, got {response_data.get('step_number')}")
return False
# Check total steps
if response_data.get("total_steps") != expected_total:
self.logger.error(f"Expected total_steps {expected_total}, got {response_data.get('total_steps')}")
return False
# Check next_step_required
if response_data.get("next_step_required") != expected_next_required:
self.logger.error(
f"Expected next_step_required {expected_next_required}, got {response_data.get('next_step_required')}"
)
return False
# Check investigation_status exists
if "investigation_status" not in response_data:
self.logger.error("Missing investigation_status in response")
return False
# Check output guidance exists
if "output" not in response_data:
self.logger.error("Missing output guidance in response")
return False
# Check next_steps guidance
if not response_data.get("next_steps"):
self.logger.error("Missing next_steps guidance in response")
return False
return True
except Exception as e:
self.logger.error(f"Error validating step response: {e}")
return False return False
finally:
self.cleanup_test_files()
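
For reference, the fix that the investigation repeatedly points to (collect expired session IDs first, then delete after iteration completes) would look roughly like the following sketch; it is illustrative only and not part of this commit.

# Illustrative fix for the seeded bug: avoid mutating the dict during iteration.
# Assumes it replaces cleanup_expired_sessions inside the test fixture's SessionManager.
def cleanup_expired_sessions(self):
    """Remove expired sessions from memory"""
    current_time = datetime.now()
    # Collect expired IDs first so the dictionary is not modified while iterating
    expired_sessions = [
        session_id
        for session_id, session in self.active_sessions.items()
        if current_time > session["expires_at"]
    ]
    for session_id in expired_sessions:
        del self.active_sessions[session_id]
    return len(expired_sessions)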


@@ -39,6 +39,10 @@ SCOPE & FOCUS
• Identify strengths, risks, and strategic improvement areas that affect future development
• Avoid line-by-line bug hunts or minor style critiques—those are covered by CodeReview
• Recommend practical, proportional changes; no "rip-and-replace" proposals unless the architecture is untenable
• Identify and flag overengineered solutions — excessive abstraction, unnecessary configuration layers, or generic
frameworks introduced without a clear, current need. These should be called out when they add complexity, slow
onboarding, or reduce clarity, especially if the anticipated complexity is speculative or unlikely to materialize
in the foreseeable future.
ANALYSIS STRATEGY
1. Map the tech stack, frameworks, deployment model, and constraints


@@ -29,6 +29,9 @@ SCOPE & FOCUS
• Recommend new technologies or patterns ONLY when they provide clearly superior outcomes with minimal added complexity.
• Avoid speculative, over-engineered, or unnecessarily abstract designs that exceed current project goals or needs.
• Keep proposals practical and directly actionable within the existing architecture.
• Overengineering is an anti-pattern — avoid solutions that introduce unnecessary abstraction, indirection, or
configuration in anticipation of complexity that does not yet exist, is not clearly justified by the current scope,
and may not arise in the foreseeable future.
COLLABORATION APPROACH
1. Engage deeply with Claude's input extend, refine, and explore alternatives ONLY WHEN they are well-justified and materially beneficial.


@@ -55,6 +55,9 @@ Your review approach:
- Ways to reduce the overall complexity while maintaining and retaining functionality without introducing regression
8. Where further investigation and analysis is required, be direct and suggest which code or related file needs to be
reviewed
9. Remember: Overengineering is an anti-pattern — avoid suggesting solutions that introduce unnecessary abstraction,
indirection, or configuration in anticipation of complexity that does not yet exist, is not clearly justified by the
current scope, and may not arise in the foreseeable future.
SEVERITY DEFINITIONS
🔴 CRITICAL: Security flaws or defects that cause crashes, data loss, or undefined behavior


@@ -53,6 +53,9 @@ REVIEW METHOD
4. Flag bugs, regressions, crash risks, data loss, or race conditions.
5. Recommend specific fixes for each issue raised; include code where helpful.
6. Acknowledge sound patterns to reinforce best practices.
7. Remember: Overengineering is an anti-pattern — avoid suggesting solutions that introduce unnecessary abstraction,
indirection, or configuration in anticipation of complexity that does not yet exist, is not clearly justified by the
current scope, and may not arise in the foreseeable future.
CORE ANALYSIS (adapt to diff and stack)
• Security injection risks, auth flaws, exposure of sensitive data, unsafe dependencies, memory safety
@@ -62,6 +65,11 @@ CORE ANALYSIS (adapt to diff and stack)
ADDITIONAL ANALYSIS (only when relevant)
• Language/runtime concerns memory management, concurrency, exception handling
• Carefully assess the code's context and purpose before raising concurrency-related concerns. Confirm the presence
of shared state, race conditions, or unsafe access patterns before flagging any issues to avoid false positives.
• Also carefully evaluate concurrency and parallelism risks only after confirming that the code runs in an environment
where such concerns are applicable. Avoid flagging issues unless shared state, asynchronous execution, or multi-threaded
access are clearly possible based on context.
• System/integration config handling, external calls, operational impact
• Testing coverage gaps for new logic
• If no tests are found in the project, do not flag test coverage as an issue unless the change introduces logic


@@ -32,6 +32,9 @@ GUIDELINES
5. Offer multiple viable strategies ONLY WHEN clearly beneficial within the current environment.
6. Suggest creative solutions that operate within real-world constraints, and avoid proposing major shifts unless truly warranted.
7. Use concise, technical language; assume an experienced engineering audience.
8. Remember: Overengineering is an anti-pattern — avoid suggesting solutions that introduce unnecessary abstraction,
indirection, or configuration in anticipation of complexity that does not yet exist, is not clearly justified by the
current scope, and may not arise in the foreseeable future.
KEY FOCUS AREAS (apply when relevant)
- Architecture & Design: modularity, boundaries, abstraction layers, dependencies


@@ -198,13 +198,20 @@ class TestAutoModelPlannerFix:
Verify that other tools still properly require model resolution. Verify that other tools still properly require model resolution.
This ensures our fix doesn't break existing functionality. This ensures our fix doesn't break existing functionality.
Note: Debug tool now manages its own model calls like planner.
""" """
from tools.analyze import AnalyzeTool from tools.analyze import AnalyzeTool
from tools.chat import ChatTool from tools.chat import ChatTool
from tools.debug import DebugIssueTool from tools.debug import DebugIssueTool
# Test various tools still require models # Test various tools still require models
tools_requiring_models = [ChatTool(), DebugIssueTool(), AnalyzeTool()] tools_requiring_models = [ChatTool(), AnalyzeTool()]
for tool in tools_requiring_models: for tool in tools_requiring_models:
assert tool.requires_model() is True, f"{tool.get_name()} should require model resolution" assert tool.requires_model() is True, f"{tool.get_name()} should require model resolution"
# Test tools that manage their own model calls
tools_managing_own_models = [DebugIssueTool()]
for tool in tools_managing_own_models:
assert tool.requires_model() is False, f"{tool.get_name()} should manage its own model calls"


@@ -70,35 +70,35 @@ class TestDynamicContextRequests:
@pytest.mark.asyncio @pytest.mark.asyncio
@patch("tools.base.BaseTool.get_model_provider") @patch("tools.base.BaseTool.get_model_provider")
async def test_normal_response_not_parsed_as_clarification(self, mock_get_provider, debug_tool): @patch("utils.conversation_memory.create_thread", return_value="debug-test-uuid")
"""Test that normal responses are not mistaken for clarification requests""" @patch("utils.conversation_memory.add_turn")
normal_response = """ async def test_normal_response_not_parsed_as_clarification(
## Summary self, mock_add_turn, mock_create_thread, mock_get_provider, debug_tool
The error is caused by a missing import statement. ):
"""Test that normal investigation responses work correctly with new debug tool"""
## Hypotheses (Ranked by Likelihood) # The new debug tool uses self-investigation pattern
result = await debug_tool.execute(
### 1. Missing Import (Confidence: High) {
**Root Cause:** The module 'utils' is not imported "step": "Investigating NameError: name 'utils' is not defined",
""" "step_number": 1,
"total_steps": 3,
mock_provider = create_mock_provider() "next_step_required": True,
mock_provider.get_provider_type.return_value = Mock(value="google") "findings": "The error indicates 'utils' module is not imported or defined",
mock_provider.supports_thinking_mode.return_value = False "files_checked": ["/code/main.py"],
mock_provider.generate_content.return_value = Mock( "relevant_files": ["/code/main.py"],
content=normal_response, usage={}, model_name="gemini-2.5-flash", metadata={} "hypothesis": "Missing import statement for utils module",
"confidence": "high",
}
) )
mock_get_provider.return_value = mock_provider
result = await debug_tool.execute({"prompt": "NameError: name 'utils' is not defined"})
assert len(result) == 1 assert len(result) == 1
# Parse the response # Parse the response - new debug tool returns structured JSON
response_data = json.loads(result[0].text) response_data = json.loads(result[0].text)
assert response_data["status"] == "success" assert response_data["status"] == "investigation_in_progress"
assert response_data["content_type"] in ["text", "markdown"] assert response_data["step_number"] == 1
assert "Summary" in response_data["content"] assert response_data["next_step_required"] is True
assert response_data["investigation_status"]["current_confidence"] == "high"
@pytest.mark.asyncio @pytest.mark.asyncio
@patch("tools.base.BaseTool.get_model_provider") @patch("tools.base.BaseTool.get_model_provider")
@@ -125,17 +125,17 @@ class TestDynamicContextRequests:
@pytest.mark.asyncio @pytest.mark.asyncio
@patch("tools.base.BaseTool.get_model_provider") @patch("tools.base.BaseTool.get_model_provider")
async def test_clarification_with_suggested_action(self, mock_get_provider, debug_tool): async def test_clarification_with_suggested_action(self, mock_get_provider, analyze_tool):
"""Test clarification request with suggested next action""" """Test clarification request with suggested next action"""
clarification_json = json.dumps( clarification_json = json.dumps(
{ {
"status": "files_required_to_continue", "status": "files_required_to_continue",
"mandatory_instructions": "I need to see the database configuration to diagnose the connection error", "mandatory_instructions": "I need to see the database configuration to analyze the connection error",
"files_needed": ["config/database.yml", "src/db.py"], "files_needed": ["config/database.yml", "src/db.py"],
"suggested_next_action": { "suggested_next_action": {
"tool": "debug", "tool": "analyze",
"args": { "args": {
"prompt": "Connection timeout to database", "prompt": "Analyze database connection timeout issue",
"files": [ "files": [
"/config/database.yml", "/config/database.yml",
"/src/db.py", "/src/db.py",
@@ -154,9 +154,9 @@ class TestDynamicContextRequests:
) )
mock_get_provider.return_value = mock_provider mock_get_provider.return_value = mock_provider
result = await debug_tool.execute( result = await analyze_tool.execute(
{ {
"prompt": "Connection timeout to database", "prompt": "Analyze database connection timeout issue",
"files": ["/absolute/logs/error.log"], "files": ["/absolute/logs/error.log"],
} }
) )
@@ -168,7 +168,7 @@ class TestDynamicContextRequests:
clarification = json.loads(response_data["content"]) clarification = json.loads(response_data["content"])
assert "suggested_next_action" in clarification assert "suggested_next_action" in clarification
assert clarification["suggested_next_action"]["tool"] == "debug" assert clarification["suggested_next_action"]["tool"] == "analyze"
def test_tool_output_model_serialization(self): def test_tool_output_model_serialization(self):
"""Test ToolOutput model serialization""" """Test ToolOutput model serialization"""
@@ -298,7 +298,7 @@ class TestCollaborationWorkflow:
@patch("tools.base.BaseTool.get_model_provider") @patch("tools.base.BaseTool.get_model_provider")
async def test_multi_step_collaboration(self, mock_get_provider): async def test_multi_step_collaboration(self, mock_get_provider):
"""Test a multi-step collaboration workflow""" """Test a multi-step collaboration workflow"""
tool = DebugIssueTool() tool = AnalyzeTool()
# Step 1: Initial request returns clarification needed # Step 1: Initial request returns clarification needed
clarification_json = json.dumps( clarification_json = json.dumps(
@@ -319,8 +319,8 @@ class TestCollaborationWorkflow:
result1 = await tool.execute( result1 = await tool.execute(
{ {
"prompt": "Database connection timeout", "prompt": "Analyze database connection timeout issue",
"error_context": "Timeout after 30s", "files": ["/logs/error.log"],
} }
) )
@@ -345,9 +345,8 @@ class TestCollaborationWorkflow:
result2 = await tool.execute( result2 = await tool.execute(
{ {
"prompt": "Database connection timeout", "prompt": "Analyze database connection timeout issue with config file",
"error_context": "Timeout after 30s", "files": ["/absolute/path/config.py", "/logs/error.log"], # Additional context provided
"files": ["/absolute/path/config.py"], # Additional context provided
} }
) )


@@ -157,10 +157,10 @@ async def test_unknown_tool_defaults_to_prompt():
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_tool_parameter_standardization(): async def test_tool_parameter_standardization():
"""Test that all tools use standardized 'prompt' parameter""" """Test that most tools use standardized 'prompt' parameter (debug uses investigation pattern)"""
from tools.analyze import AnalyzeRequest from tools.analyze import AnalyzeRequest
from tools.codereview import CodeReviewRequest from tools.codereview import CodeReviewRequest
from tools.debug import DebugIssueRequest from tools.debug import DebugInvestigationRequest
from tools.precommit import PrecommitRequest from tools.precommit import PrecommitRequest
from tools.thinkdeep import ThinkDeepRequest from tools.thinkdeep import ThinkDeepRequest
@@ -168,9 +168,16 @@ async def test_tool_parameter_standardization():
analyze = AnalyzeRequest(files=["/test.py"], prompt="What does this do?") analyze = AnalyzeRequest(files=["/test.py"], prompt="What does this do?")
assert analyze.prompt == "What does this do?" assert analyze.prompt == "What does this do?"
# Test debug tool uses prompt # Debug tool now uses self-investigation pattern with different fields
debug = DebugIssueRequest(prompt="Error occurred") debug = DebugInvestigationRequest(
assert debug.prompt == "Error occurred" step="Investigating error",
step_number=1,
total_steps=3,
next_step_required=True,
findings="Initial error analysis",
)
assert debug.step == "Investigating error"
assert debug.findings == "Initial error analysis"
# Test codereview tool uses prompt # Test codereview tool uses prompt
review = CodeReviewRequest(files=["/test.py"], prompt="Review this") review = CodeReviewRequest(files=["/test.py"], prompt="Review this")

tests/test_debug.py (new file, 514 lines)

@@ -0,0 +1,514 @@
"""
Tests for the debug tool.
"""
from unittest.mock import patch
import pytest
from tools.debug import DebugInvestigationRequest, DebugIssueTool
from tools.models import ToolModelCategory
class TestDebugTool:
"""Test suite for DebugIssueTool."""
def test_tool_metadata(self):
"""Test basic tool metadata and configuration."""
tool = DebugIssueTool()
assert tool.get_name() == "debug"
assert "DEBUG & ROOT CAUSE ANALYSIS" in tool.get_description()
assert tool.get_default_temperature() == 0.2 # TEMPERATURE_ANALYTICAL
assert tool.get_model_category() == ToolModelCategory.EXTENDED_REASONING
assert tool.requires_model() is False # Since it manages its own model calls
def test_request_validation(self):
"""Test Pydantic request model validation."""
# Valid investigation step request
step_request = DebugInvestigationRequest(
step="Investigating null pointer exception in UserService",
step_number=1,
total_steps=5,
next_step_required=True,
findings="Found that UserService.getUser() is called with null ID",
)
assert step_request.step == "Investigating null pointer exception in UserService"
assert step_request.step_number == 1
assert step_request.next_step_required is True
assert step_request.confidence == "low" # default
# Request with optional fields
detailed_request = DebugInvestigationRequest(
step="Deep dive into getUser method implementation",
step_number=2,
total_steps=5,
next_step_required=True,
findings="Method doesn't validate input parameters",
files_checked=["/src/UserService.java", "/src/UserController.java"],
relevant_files=["/src/UserService.java"],
relevant_methods=["UserService.getUser", "UserController.handleRequest"],
hypothesis="Null ID passed from controller without validation",
confidence="medium",
)
assert len(detailed_request.files_checked) == 2
assert len(detailed_request.relevant_files) == 1
assert detailed_request.confidence == "medium"
# Missing required fields should fail
with pytest.raises(ValueError):
DebugInvestigationRequest() # Missing all required fields
with pytest.raises(ValueError):
DebugInvestigationRequest(step="test") # Missing other required fields
def test_input_schema_generation(self):
"""Test JSON schema generation for MCP client."""
tool = DebugIssueTool()
schema = tool.get_input_schema()
assert schema["type"] == "object"
# Investigation fields
assert "step" in schema["properties"]
assert "step_number" in schema["properties"]
assert "total_steps" in schema["properties"]
assert "next_step_required" in schema["properties"]
assert "findings" in schema["properties"]
assert "files_checked" in schema["properties"]
assert "relevant_files" in schema["properties"]
assert "relevant_methods" in schema["properties"]
assert "hypothesis" in schema["properties"]
assert "confidence" in schema["properties"]
assert "backtrack_from_step" in schema["properties"]
assert "continuation_id" in schema["properties"]
assert "images" in schema["properties"] # Now supported for visual debugging
# Check excluded fields are NOT present
assert "model" not in schema["properties"]
assert "temperature" not in schema["properties"]
assert "thinking_mode" not in schema["properties"]
assert "use_websearch" not in schema["properties"]
# Check required fields
assert "step" in schema["required"]
assert "step_number" in schema["required"]
assert "total_steps" in schema["required"]
assert "next_step_required" in schema["required"]
assert "findings" in schema["required"]
def test_model_category_for_debugging(self):
"""Test that debug uses extended reasoning category."""
tool = DebugIssueTool()
category = tool.get_model_category()
# Debugging needs deep thinking
assert category == ToolModelCategory.EXTENDED_REASONING
@pytest.mark.asyncio
async def test_execute_first_investigation_step(self):
"""Test execute method for first investigation step."""
tool = DebugIssueTool()
arguments = {
"step": "Investigating intermittent session validation failures in production",
"step_number": 1,
"total_steps": 5,
"next_step_required": True,
"findings": "Users report random session invalidation, occurs more during high traffic",
"files_checked": ["/api/session_manager.py"],
"relevant_files": ["/api/session_manager.py"],
}
# Mock conversation memory functions
with patch("utils.conversation_memory.create_thread", return_value="debug-uuid-123"):
with patch("utils.conversation_memory.add_turn"):
result = await tool.execute(arguments)
# Should return a list with TextContent
assert len(result) == 1
assert result[0].type == "text"
# Parse the JSON response
import json
parsed_response = json.loads(result[0].text)
assert parsed_response["status"] == "investigation_in_progress"
assert parsed_response["step_number"] == 1
assert parsed_response["total_steps"] == 5
assert parsed_response["next_step_required"] is True
assert parsed_response["continuation_id"] == "debug-uuid-123"
assert parsed_response["investigation_status"]["files_checked"] == 1
assert parsed_response["investigation_status"]["relevant_files"] == 1
@pytest.mark.asyncio
async def test_execute_subsequent_investigation_step(self):
"""Test execute method for subsequent investigation step."""
tool = DebugIssueTool()
# Set up initial state
tool.initial_issue = "Session validation failures"
tool.consolidated_findings["files_checked"].add("/api/session_manager.py")
arguments = {
"step": "Examining session cleanup method for concurrent modification issues",
"step_number": 2,
"total_steps": 5,
"next_step_required": True,
"findings": "Found dictionary modification during iteration in cleanup_expired_sessions",
"files_checked": ["/api/session_manager.py", "/api/utils.py"],
"relevant_files": ["/api/session_manager.py"],
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
"hypothesis": "Dictionary modified during iteration causing RuntimeError",
"confidence": "high",
"continuation_id": "debug-uuid-123",
}
# Mock conversation memory functions
with patch("utils.conversation_memory.add_turn"):
result = await tool.execute(arguments)
# Should return a list with TextContent
assert len(result) == 1
assert result[0].type == "text"
# Parse the JSON response
import json
parsed_response = json.loads(result[0].text)
assert parsed_response["step_number"] == 2
assert parsed_response["next_step_required"] is True
assert parsed_response["continuation_id"] == "debug-uuid-123"
assert parsed_response["investigation_status"]["files_checked"] == 2 # Cumulative
assert parsed_response["investigation_status"]["relevant_methods"] == 1
assert parsed_response["investigation_status"]["current_confidence"] == "high"
@pytest.mark.asyncio
async def test_execute_final_investigation_step(self):
"""Test execute method for final investigation step with expert analysis."""
tool = DebugIssueTool()
# Set up investigation history
tool.initial_issue = "Session validation failures"
tool.investigation_history = [
{
"step_number": 1,
"step": "Initial investigation of session validation failures",
"findings": "Initial investigation",
"files_checked": ["/api/utils.py"],
},
{
"step_number": 2,
"step": "Deeper analysis of session manager",
"findings": "Found dictionary issue",
"files_checked": ["/api/session_manager.py"],
},
]
tool.consolidated_findings = {
"files_checked": {"/api/session_manager.py", "/api/utils.py"},
"relevant_files": {"/api/session_manager.py"},
"relevant_methods": {"SessionManager.cleanup_expired_sessions"},
"findings": ["Step 1: Initial investigation", "Step 2: Found dictionary issue"],
"hypotheses": [{"step": 2, "hypothesis": "Dictionary modified during iteration", "confidence": "high"}],
"images": [],
}
arguments = {
"step": "Confirmed the root cause and identified fix",
"step_number": 3,
"total_steps": 3,
"next_step_required": False, # Final step
"findings": "Root cause confirmed: dictionary modification during iteration in cleanup method",
"files_checked": ["/api/session_manager.py"],
"relevant_files": ["/api/session_manager.py"],
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
"hypothesis": "Dictionary modification during iteration causes intermittent RuntimeError",
"confidence": "high",
"continuation_id": "debug-uuid-123",
}
# Mock the expert analysis call
mock_expert_response = {
"status": "analysis_complete",
"summary": "Dictionary modification during iteration bug identified",
"hypotheses": [
{
"name": "CONCURRENT_MODIFICATION",
"confidence": "High",
"root_cause": "Modifying dictionary while iterating",
"minimal_fix": "Create list of keys to delete first",
}
],
}
# Mock conversation memory and file reading
with patch("utils.conversation_memory.add_turn"):
with patch.object(tool, "_call_expert_analysis", return_value=mock_expert_response):
with patch.object(tool, "_prepare_file_content_for_prompt", return_value=("file content", 100)):
result = await tool.execute(arguments)
# Should return a list with TextContent
assert len(result) == 1
response_text = result[0].text
# Parse the JSON response
import json
parsed_response = json.loads(response_text)
# Check final step structure
assert parsed_response["status"] == "calling_expert_analysis"
assert parsed_response["investigation_complete"] is True
assert parsed_response["expert_analysis"]["status"] == "analysis_complete"
assert "complete_investigation" in parsed_response
assert parsed_response["complete_investigation"]["steps_taken"] == 3 # All steps including current
@pytest.mark.asyncio
async def test_execute_with_backtracking(self):
"""Test execute method with backtracking to revise findings."""
tool = DebugIssueTool()
# Set up some investigation history with all required fields
tool.investigation_history = [
{
"step": "Initial investigation",
"step_number": 1,
"findings": "Initial findings",
"files_checked": ["file1.py"],
"relevant_files": [],
"relevant_methods": [],
"hypothesis": None,
"confidence": "low",
},
{
"step": "Wrong direction",
"step_number": 2,
"findings": "Wrong path",
"files_checked": ["file2.py"],
"relevant_files": [],
"relevant_methods": [],
"hypothesis": None,
"confidence": "low",
},
]
tool.consolidated_findings = {
"files_checked": {"file1.py", "file2.py"},
"relevant_files": set(),
"relevant_methods": set(),
"findings": ["Step 1: Initial findings", "Step 2: Wrong path"],
"hypotheses": [],
"images": [],
}
arguments = {
"step": "Backtracking to revise approach",
"step_number": 3,
"total_steps": 5,
"next_step_required": True,
"findings": "Taking a different investigation approach",
"files_checked": ["file3.py"],
"backtrack_from_step": 2, # Backtrack from step 2
"continuation_id": "debug-uuid-123",
}
# Mock conversation memory functions
with patch("utils.conversation_memory.add_turn"):
result = await tool.execute(arguments)
# Should return a list with TextContent
assert len(result) == 1
response_text = result[0].text
# Parse the JSON response
import json
parsed_response = json.loads(response_text)
assert parsed_response["status"] == "investigation_in_progress"
# After backtracking from step 2, history should have step 1 plus the new step
assert len(tool.investigation_history) == 2 # Step 1 + new step 3
assert tool.investigation_history[0]["step_number"] == 1
assert tool.investigation_history[1]["step_number"] == 3 # The new step that triggered backtrack
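# In other words, backtrack_from_step=N discards every recorded step with step_number >= N
# before appending the current step, which is why the history here ends up as [step 1, step 3].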
@pytest.mark.asyncio
async def test_execute_adjusts_total_steps(self):
"""Test execute method adjusts total steps when current step exceeds estimate."""
tool = DebugIssueTool()
arguments = {
"step": "Additional investigation needed",
"step_number": 8,
"total_steps": 5, # Current step exceeds total
"next_step_required": True,
"findings": "More complexity discovered",
"continuation_id": "debug-uuid-123",
}
# Mock conversation memory functions
with patch("utils.conversation_memory.add_turn"):
result = await tool.execute(arguments)
# Should return a list with TextContent
assert len(result) == 1
response_text = result[0].text
# Parse the JSON response
import json
parsed_response = json.loads(response_text)
# Total steps should be adjusted to match current step
assert parsed_response["total_steps"] == 8
assert parsed_response["step_number"] == 8
@pytest.mark.asyncio
async def test_execute_error_handling(self):
"""Test execute method error handling."""
tool = DebugIssueTool()
# Invalid arguments - missing required fields
arguments = {
"step": "Invalid request"
# Missing required fields
}
result = await tool.execute(arguments)
# Should return error response
assert len(result) == 1
response_text = result[0].text
# Parse the JSON response
import json
parsed_response = json.loads(response_text)
assert parsed_response["status"] == "investigation_failed"
assert "error" in parsed_response
def test_prepare_investigation_summary(self):
"""Test investigation summary preparation."""
tool = DebugIssueTool()
tool.consolidated_findings = {
"files_checked": {"file1.py", "file2.py", "file3.py"},
"relevant_files": {"file1.py", "file2.py"},
"relevant_methods": {"Class1.method1", "Class2.method2"},
"findings": [
"Step 1: Initial investigation findings",
"Step 2: Discovered potential issue",
"Step 3: Confirmed root cause",
],
"hypotheses": [
{"step": 1, "hypothesis": "Initial hypothesis", "confidence": "low"},
{"step": 2, "hypothesis": "Refined hypothesis", "confidence": "medium"},
{"step": 3, "hypothesis": "Final hypothesis", "confidence": "high"},
],
"images": [],
}
summary = tool._prepare_investigation_summary()
assert "SYSTEMATIC INVESTIGATION SUMMARY" in summary
assert "Files examined: 3" in summary
assert "Relevant files identified: 2" in summary
assert "Methods/functions involved: 2" in summary
assert "INVESTIGATION PROGRESSION" in summary
assert "Step 1:" in summary
assert "Step 2:" in summary
assert "Step 3:" in summary
assert "HYPOTHESIS EVOLUTION" in summary
assert "low confidence" in summary
assert "medium confidence" in summary
assert "high confidence" in summary
def test_extract_error_context(self):
"""Test error context extraction from findings."""
tool = DebugIssueTool()
tool.consolidated_findings = {
"findings": [
"Step 1: Found no issues initially",
"Step 2: Discovered ERROR: Dictionary size changed during iteration",
"Step 3: Stack trace shows RuntimeError in cleanup method",
"Step 4: Exception occurs intermittently",
],
}
error_context = tool._extract_error_context()
assert error_context is not None
assert "ERROR: Dictionary size changed" in error_context
assert "Stack trace shows RuntimeError" in error_context
assert "Exception occurs intermittently" in error_context
assert "Found no issues initially" not in error_context # Should not include non-error findings
def test_reprocess_consolidated_findings(self):
"""Test reprocessing of consolidated findings after backtracking."""
tool = DebugIssueTool()
tool.investigation_history = [
{
"step_number": 1,
"findings": "Initial findings",
"files_checked": ["file1.py"],
"relevant_files": ["file1.py"],
"relevant_methods": ["method1"],
"hypothesis": "Initial hypothesis",
"confidence": "low",
},
{
"step_number": 2,
"findings": "Second findings",
"files_checked": ["file2.py"],
"relevant_files": [],
"relevant_methods": ["method2"],
},
]
tool._reprocess_consolidated_findings()
assert tool.consolidated_findings["files_checked"] == {"file1.py", "file2.py"}
assert tool.consolidated_findings["relevant_files"] == {"file1.py"}
assert tool.consolidated_findings["relevant_methods"] == {"method1", "method2"}
assert len(tool.consolidated_findings["findings"]) == 2
assert len(tool.consolidated_findings["hypotheses"]) == 1
assert tool.consolidated_findings["hypotheses"][0]["hypothesis"] == "Initial hypothesis"
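# Note that step 2 above provides no "hypothesis"/"confidence" keys, which is why reprocessing
# yields a single hypothesis entry (from step 1) despite two recorded steps.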
# Integration test
class TestDebugToolIntegration:
"""Integration tests for debug tool."""
def setup_method(self):
"""Set up model context for integration tests."""
from utils.model_context import ModelContext
self.tool = DebugIssueTool()
self.tool._model_context = ModelContext("flash") # Test model
@pytest.mark.asyncio
async def test_complete_investigation_flow(self):
"""Test complete investigation flow from start to expert analysis."""
# Step 1: Initial investigation
arguments = {
"step": "Investigating memory leak in data processing pipeline",
"step_number": 1,
"total_steps": 3,
"next_step_required": True,
"findings": "High memory usage observed during batch processing",
"files_checked": ["/processor/main.py"],
}
# Mock conversation memory and expert analysis
with patch("utils.conversation_memory.create_thread", return_value="debug-flow-uuid"):
with patch("utils.conversation_memory.add_turn"):
result = await self.tool.execute(arguments)
# Verify response structure
assert len(result) == 1
response_text = result[0].text
# Parse the JSON response
import json
parsed_response = json.loads(response_text)
assert parsed_response["status"] == "investigation_in_progress"
assert parsed_response["step_number"] == 1
assert parsed_response["continuation_id"] == "debug-flow-uuid"


@@ -0,0 +1,363 @@
"""
Comprehensive test demonstrating debug tool's self-investigation pattern
and continuation ID functionality working together end-to-end.
"""
import json
from unittest.mock import patch
import pytest
from tools.debug import DebugIssueTool
from utils.conversation_memory import (
ConversationTurn,
ThreadContext,
build_conversation_history,
get_conversation_file_list,
)
class TestDebugComprehensiveWorkflow:
"""Test the complete debug workflow from investigation to expert analysis to continuation."""
@pytest.mark.asyncio
async def test_full_debug_workflow_with_continuation(self):
"""Test complete debug workflow: investigation → expert analysis → continuation to another tool."""
tool = DebugIssueTool()
# Step 1: Initial investigation
with patch("utils.conversation_memory.create_thread", return_value="debug-workflow-uuid"):
with patch("utils.conversation_memory.add_turn") as mock_add_turn:
result1 = await tool.execute(
{
"step": "Investigating memory leak in user session handler",
"step_number": 1,
"total_steps": 3,
"next_step_required": True,
"findings": "High memory usage detected in session handler",
"files_checked": ["/api/sessions.py"],
"images": ["/screenshots/memory_profile.png"],
}
)
# Verify step 1 response
assert len(result1) == 1
response1 = json.loads(result1[0].text)
assert response1["status"] == "investigation_in_progress"
assert response1["step_number"] == 1
assert response1["continuation_id"] == "debug-workflow-uuid"
# Verify conversation turn was added
assert mock_add_turn.called
call_args = mock_add_turn.call_args
if call_args:
# Prefer call_args.args when available (mock on Python 3.8+), otherwise index the call tuple
args = call_args.args if hasattr(call_args, "args") else call_args[0]
if args and len(args) >= 3:
assert args[0] == "debug-workflow-uuid"
assert args[1] == "assistant"
assert json.loads(args[2])["status"] == "investigation_in_progress"
# Step 2: Continue investigation with findings
with patch("utils.conversation_memory.add_turn") as mock_add_turn:
result2 = await tool.execute(
{
"step": "Found circular references in session cache preventing garbage collection",
"step_number": 2,
"total_steps": 3,
"next_step_required": True,
"findings": "Session objects hold references to themselves through event handlers",
"files_checked": ["/api/sessions.py", "/api/cache.py"],
"relevant_files": ["/api/sessions.py"],
"relevant_methods": ["SessionHandler.__init__", "SessionHandler.add_event_listener"],
"hypothesis": "Circular references preventing garbage collection",
"confidence": "high",
"continuation_id": "debug-workflow-uuid",
}
)
# Verify step 2 response
response2 = json.loads(result2[0].text)
assert response2["status"] == "investigation_in_progress"
assert response2["step_number"] == 2
assert response2["investigation_status"]["files_checked"] == 2
assert response2["investigation_status"]["relevant_methods"] == 2
assert response2["investigation_status"]["current_confidence"] == "high"
# Step 3: Final investigation with expert analysis
# Mock the expert analysis response
mock_expert_response = {
"status": "analysis_complete",
"summary": "Memory leak caused by circular references in session event handlers",
"hypotheses": [
{
"name": "CIRCULAR_REFERENCE_LEAK",
"confidence": "High (95%)",
"evidence": ["Event handlers hold strong references", "No weak references used"],
"root_cause": "SessionHandler stores callbacks that reference the handler itself",
"potential_fixes": [
{
"description": "Use weakref for event handler callbacks",
"files_to_modify": ["/api/sessions.py"],
"complexity": "Low",
}
],
"minimal_fix": "Replace self references in callbacks with weakref.ref(self)",
}
],
"investigation_summary": {
"pattern": "Classic circular reference memory leak",
"severity": "High - causes unbounded memory growth",
"recommended_action": "Implement weakref solution immediately",
},
}
with patch("utils.conversation_memory.add_turn") as mock_add_turn:
with patch.object(tool, "_call_expert_analysis", return_value=mock_expert_response):
result3 = await tool.execute(
{
"step": "Investigation complete - confirmed circular reference memory leak pattern",
"step_number": 3,
"total_steps": 3,
"next_step_required": False, # Triggers expert analysis
"findings": "Circular references between SessionHandler and event callbacks prevent GC",
"files_checked": ["/api/sessions.py", "/api/cache.py"],
"relevant_files": ["/api/sessions.py"],
"relevant_methods": ["SessionHandler.__init__", "SessionHandler.add_event_listener"],
"hypothesis": "Circular references in event handler callbacks causing memory leak",
"confidence": "high",
"continuation_id": "debug-workflow-uuid",
"model": "flash",
}
)
# Verify final response with expert analysis
response3 = json.loads(result3[0].text)
assert response3["status"] == "calling_expert_analysis"
assert response3["investigation_complete"] is True
assert "expert_analysis" in response3
expert = response3["expert_analysis"]
assert expert["status"] == "analysis_complete"
assert "CIRCULAR_REFERENCE_LEAK" in expert["hypotheses"][0]["name"]
assert "weakref" in expert["hypotheses"][0]["minimal_fix"]
# Verify complete investigation summary
assert "complete_investigation" in response3
complete = response3["complete_investigation"]
assert complete["steps_taken"] == 3
assert "/api/sessions.py" in complete["files_examined"]
assert "SessionHandler.add_event_listener" in complete["relevant_methods"]
# Step 4: Test continuation to another tool (e.g., analyze)
# Create a mock thread context representing the debug conversation
debug_context = ThreadContext(
thread_id="debug-workflow-uuid",
created_at="2025-01-01T00:00:00Z",
last_updated_at="2025-01-01T00:10:00Z",
tool_name="debug",
turns=[
ConversationTurn(
role="user",
content="Step 1: Investigating memory leak",
timestamp="2025-01-01T00:01:00Z",
tool_name="debug",
files=["/api/sessions.py"],
images=["/screenshots/memory_profile.png"],
),
ConversationTurn(
role="assistant",
content=json.dumps(response1),
timestamp="2025-01-01T00:02:00Z",
tool_name="debug",
),
ConversationTurn(
role="user",
content="Step 2: Found circular references",
timestamp="2025-01-01T00:03:00Z",
tool_name="debug",
),
ConversationTurn(
role="assistant",
content=json.dumps(response2),
timestamp="2025-01-01T00:04:00Z",
tool_name="debug",
),
ConversationTurn(
role="user",
content="Step 3: Investigation complete",
timestamp="2025-01-01T00:05:00Z",
tool_name="debug",
),
ConversationTurn(
role="assistant",
content=json.dumps(response3),
timestamp="2025-01-01T00:06:00Z",
tool_name="debug",
),
],
initial_context={},
)
# Test that another tool can use the continuation
with patch("utils.conversation_memory.get_thread", return_value=debug_context):
# Mock file reading
def mock_read_file(file_path):
if file_path == "/api/sessions.py":
return "# SessionHandler with circular refs\nclass SessionHandler:\n pass", 20
elif file_path == "/screenshots/memory_profile.png":
# Images return an empty string for content and count as 0 tokens
return "", 0
elif file_path == "/api/cache.py":
return "# Cache module", 5
return "", 0
# Build conversation history for another tool
from utils.model_context import ModelContext
model_context = ModelContext("flash")
history, tokens = build_conversation_history(debug_context, model_context, read_files_func=mock_read_file)
# Verify history contains all debug information
assert "=== CONVERSATION HISTORY (CONTINUATION) ===" in history
assert "Thread: debug-workflow-uuid" in history
assert "Tool: debug" in history
# Check investigation progression
assert "Step 1: Investigating memory leak" in history
assert "Step 2: Found circular references" in history
assert "Step 3: Investigation complete" in history
# Check expert analysis is included
assert "CIRCULAR_REFERENCE_LEAK" in history
assert "weakref" in history
assert "memory leak" in history
# Check files are referenced in conversation history
assert "/api/sessions.py" in history
# File content would be in referenced files section if the files were readable
# In our test they're not real files so they won't be embedded
# But the expert analysis content should be there
assert "Memory leak caused by circular references" in history
# Verify file list includes all files from investigation
file_list = get_conversation_file_list(debug_context)
assert "/api/sessions.py" in file_list
@pytest.mark.asyncio
async def test_debug_investigation_state_machine(self):
"""Test the debug tool's investigation state machine behavior."""
tool = DebugIssueTool()
# Test state transitions
states = []
# Initial state
with patch("utils.conversation_memory.create_thread", return_value="state-test-uuid"):
with patch("utils.conversation_memory.add_turn"):
result = await tool.execute(
{
"step": "Starting investigation",
"step_number": 1,
"total_steps": 2,
"next_step_required": True,
"findings": "Initial findings",
}
)
states.append(json.loads(result[0].text))
# Verify initial state
assert states[0]["status"] == "investigation_in_progress"
assert states[0]["step_number"] == 1
assert states[0]["next_step_required"] is True
# Final state (triggers expert analysis)
mock_expert_response = {"status": "analysis_complete", "summary": "Test complete"}
with patch("utils.conversation_memory.add_turn"):
with patch.object(tool, "_call_expert_analysis", return_value=mock_expert_response):
result = await tool.execute(
{
"step": "Final findings",
"step_number": 2,
"total_steps": 2,
"next_step_required": False,
"findings": "Complete findings",
"continuation_id": "state-test-uuid",
"model": "flash",
}
)
states.append(json.loads(result[0].text))
# Verify final state
assert states[1]["status"] == "calling_expert_analysis"
assert states[1]["investigation_complete"] is True
assert "expert_analysis" in states[1]
@pytest.mark.asyncio
async def test_debug_backtracking_preserves_continuation(self):
"""Test that backtracking preserves continuation ID and investigation state."""
tool = DebugIssueTool()
# Start investigation
with patch("utils.conversation_memory.create_thread", return_value="backtrack-test-uuid"):
with patch("utils.conversation_memory.add_turn"):
result1 = await tool.execute(
{
"step": "Initial hypothesis",
"step_number": 1,
"total_steps": 3,
"next_step_required": True,
"findings": "Initial findings",
}
)
response1 = json.loads(result1[0].text)
continuation_id = response1["continuation_id"]
# Step 2 - wrong direction
with patch("utils.conversation_memory.add_turn"):
await tool.execute(
{
"step": "Wrong hypothesis",
"step_number": 2,
"total_steps": 3,
"next_step_required": True,
"findings": "Dead end",
"hypothesis": "Wrong initial hypothesis",
"confidence": "low",
"continuation_id": continuation_id,
}
)
# Backtrack from step 2
with patch("utils.conversation_memory.add_turn"):
result3 = await tool.execute(
{
"step": "Backtracking - new hypothesis",
"step_number": 3,
"total_steps": 4, # Adjusted total
"next_step_required": True,
"findings": "New direction",
"hypothesis": "New hypothesis after backtracking",
"confidence": "medium",
"backtrack_from_step": 2,
"continuation_id": continuation_id,
}
)
response3 = json.loads(result3[0].text)
# Verify continuation preserved through backtracking
assert response3["continuation_id"] == continuation_id
assert response3["step_number"] == 3
assert response3["total_steps"] == 4
# Verify investigation status after backtracking
# When we backtrack, investigation continues
assert response3["investigation_status"]["files_checked"] == 0 # Reset after backtrack
assert response3["investigation_status"]["current_confidence"] == "medium"
# The key thing is the continuation ID is preserved
# and we've adjusted our approach (total_steps increased)


@@ -0,0 +1,336 @@
"""
Test debug tool continuation ID functionality and conversation history formatting.
"""
import json
from unittest.mock import patch
import pytest
from tools.debug import DebugIssueTool
from utils.conversation_memory import (
ConversationTurn,
ThreadContext,
build_conversation_history,
get_conversation_file_list,
)
class TestDebugContinuation:
"""Test debug tool continuation ID and conversation history integration."""
@pytest.mark.asyncio
async def test_debug_creates_continuation_id(self):
"""Test that debug tool creates continuation ID on first step."""
tool = DebugIssueTool()
with patch("utils.conversation_memory.create_thread", return_value="debug-test-uuid-123"):
with patch("utils.conversation_memory.add_turn"):
result = await tool.execute(
{
"step": "Investigating null pointer exception",
"step_number": 1,
"total_steps": 3,
"next_step_required": True,
"findings": "Initial investigation shows null reference in UserService",
"files_checked": ["/api/UserService.java"],
}
)
assert len(result) == 1
response = json.loads(result[0].text)
assert response["status"] == "investigation_in_progress"
assert response["continuation_id"] == "debug-test-uuid-123"
def test_debug_conversation_formatting(self):
"""Test that debug tool's structured output is properly formatted in conversation history."""
# Create a mock conversation with debug tool output
debug_output = {
"status": "investigation_in_progress",
"step_number": 2,
"total_steps": 3,
"next_step_required": True,
"investigation_status": {
"files_checked": 3,
"relevant_files": 2,
"relevant_methods": 1,
"hypotheses_formed": 1,
"images_collected": 0,
"current_confidence": "medium",
},
"output": {"instructions": "Continue systematic investigation.", "format": "systematic_investigation"},
"continuation_id": "debug-test-uuid-123",
"next_steps": "Continue investigation with step 3.",
}
context = ThreadContext(
thread_id="debug-test-uuid-123",
created_at="2025-01-01T00:00:00Z",
last_updated_at="2025-01-01T00:05:00Z",
tool_name="debug",
turns=[
ConversationTurn(
role="user",
content="Step 1: Investigating null pointer exception",
timestamp="2025-01-01T00:01:00Z",
tool_name="debug",
files=["/api/UserService.java"],
),
ConversationTurn(
role="assistant",
content=json.dumps(debug_output, indent=2),
timestamp="2025-01-01T00:02:00Z",
tool_name="debug",
files=["/api/UserService.java", "/api/UserController.java"],
),
],
initial_context={
"step": "Investigating null pointer exception",
"step_number": 1,
"total_steps": 3,
"next_step_required": True,
"findings": "Initial investigation",
},
)
# Mock file reading to avoid actual file I/O
def mock_read_file(file_path):
if file_path == "/api/UserService.java":
return "// UserService.java\npublic class UserService {\n // code...\n}", 10
elif file_path == "/api/UserController.java":
return "// UserController.java\npublic class UserController {\n // code...\n}", 10
return "", 0
# Build conversation history
from utils.model_context import ModelContext
model_context = ModelContext("flash")
history, tokens = build_conversation_history(context, model_context, read_files_func=mock_read_file)
# Verify the history contains debug-specific content
assert "=== CONVERSATION HISTORY (CONTINUATION) ===" in history
assert "Thread: debug-test-uuid-123" in history
assert "Tool: debug" in history
# Check that files are included
assert "UserService.java" in history
assert "UserController.java" in history
# Check that debug output is included
assert "investigation_in_progress" in history
assert '"step_number": 2' in history
assert '"files_checked": 3' in history
assert '"current_confidence": "medium"' in history
def test_debug_continuation_preserves_investigation_state(self):
"""Test that continuation preserves investigation state across tools."""
# Create a debug investigation context
context = ThreadContext(
thread_id="debug-test-uuid-123",
created_at="2025-01-01T00:00:00Z",
last_updated_at="2025-01-01T00:10:00Z",
tool_name="debug",
turns=[
ConversationTurn(
role="user",
content="Step 1: Initial investigation",
timestamp="2025-01-01T00:01:00Z",
tool_name="debug",
files=["/api/SessionManager.java"],
),
ConversationTurn(
role="assistant",
content=json.dumps(
{
"status": "investigation_in_progress",
"step_number": 1,
"total_steps": 4,
"next_step_required": True,
"investigation_status": {"files_checked": 1, "relevant_files": 1},
"continuation_id": "debug-test-uuid-123",
}
),
timestamp="2025-01-01T00:02:00Z",
tool_name="debug",
),
ConversationTurn(
role="user",
content="Step 2: Found dictionary modification issue",
timestamp="2025-01-01T00:03:00Z",
tool_name="debug",
files=["/api/SessionManager.java", "/api/utils.py"],
),
ConversationTurn(
role="assistant",
content=json.dumps(
{
"status": "investigation_in_progress",
"step_number": 2,
"total_steps": 4,
"next_step_required": True,
"investigation_status": {
"files_checked": 2,
"relevant_files": 1,
"relevant_methods": 1,
"hypotheses_formed": 1,
"current_confidence": "high",
},
"continuation_id": "debug-test-uuid-123",
}
),
timestamp="2025-01-01T00:04:00Z",
tool_name="debug",
),
],
initial_context={},
)
# Get file list to verify prioritization
file_list = get_conversation_file_list(context)
assert file_list == ["/api/SessionManager.java", "/api/utils.py"]
# Mock file reading
def mock_read_file(file_path):
return f"// {file_path}\n// Mock content", 5
# Build history
from utils.model_context import ModelContext
model_context = ModelContext("flash")
history, tokens = build_conversation_history(context, model_context, read_files_func=mock_read_file)
# Verify investigation progression is preserved
assert "Step 1: Initial investigation" in history
assert "Step 2: Found dictionary modification issue" in history
assert '"step_number": 1' in history
assert '"step_number": 2' in history
assert '"current_confidence": "high"' in history
@pytest.mark.asyncio
async def test_debug_to_analyze_continuation(self):
"""Test continuation from debug tool to analyze tool."""
# Simulate debug tool creating initial investigation
debug_context = ThreadContext(
thread_id="debug-analyze-uuid-123",
created_at="2025-01-01T00:00:00Z",
last_updated_at="2025-01-01T00:10:00Z",
tool_name="debug",
turns=[
ConversationTurn(
role="user",
content="Final investigation step",
timestamp="2025-01-01T00:01:00Z",
tool_name="debug",
files=["/api/SessionManager.java"],
),
ConversationTurn(
role="assistant",
content=json.dumps(
{
"status": "calling_expert_analysis",
"investigation_complete": True,
"expert_analysis": {
"status": "analysis_complete",
"summary": "Dictionary modification during iteration bug",
"hypotheses": [
{
"name": "CONCURRENT_MODIFICATION",
"confidence": "High",
"root_cause": "Modifying dict while iterating",
"minimal_fix": "Create list of keys first",
}
],
},
"complete_investigation": {
"initial_issue": "Session validation failures",
"steps_taken": 3,
"files_examined": ["/api/SessionManager.java"],
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
},
}
),
timestamp="2025-01-01T00:02:00Z",
tool_name="debug",
),
],
initial_context={},
)
# Mock getting the thread
with patch("utils.conversation_memory.get_thread", return_value=debug_context):
# Mock file reading
def mock_read_file(file_path):
return "// SessionManager.java\n// cleanup_expired_sessions method", 10
# Build history for analyze tool
from utils.model_context import ModelContext
model_context = ModelContext("flash")
history, tokens = build_conversation_history(debug_context, model_context, read_files_func=mock_read_file)
# Verify analyze tool can see debug investigation
assert "calling_expert_analysis" in history
assert "CONCURRENT_MODIFICATION" in history
assert "Dictionary modification during iteration bug" in history
assert "SessionManager.cleanup_expired_sessions" in history
# Verify the continuation context is clear
assert "Thread: debug-analyze-uuid-123" in history
assert "Tool: debug" in history # Shows original tool
def test_debug_planner_style_formatting(self):
"""Test that debug tool uses similar formatting to planner for structured responses."""
# Create debug investigation with multiple steps
context = ThreadContext(
thread_id="debug-format-uuid-123",
created_at="2025-01-01T00:00:00Z",
last_updated_at="2025-01-01T00:15:00Z",
tool_name="debug",
turns=[
ConversationTurn(
role="user",
content="Step 1: Initial error analysis",
timestamp="2025-01-01T00:01:00Z",
tool_name="debug",
),
ConversationTurn(
role="assistant",
content=json.dumps(
{
"status": "investigation_in_progress",
"step_number": 1,
"total_steps": 3,
"next_step_required": True,
"output": {
"instructions": "Continue systematic investigation.",
"format": "systematic_investigation",
},
"continuation_id": "debug-format-uuid-123",
},
indent=2,
),
timestamp="2025-01-01T00:02:00Z",
tool_name="debug",
),
],
initial_context={},
)
# Build history
from utils.model_context import ModelContext
model_context = ModelContext("flash")
history, _ = build_conversation_history(context, model_context, read_files_func=lambda x: ("", 0))
# Verify structured format is preserved
assert '"status": "investigation_in_progress"' in history
assert '"format": "systematic_investigation"' in history
assert "--- Turn 1 (Claude using debug) ---" in history
assert "--- Turn 2 (Gemini using debug" in history
# The JSON structure should be preserved for tools to parse
# This allows other tools to understand the investigation state
turn_2_start = history.find("--- Turn 2 (Gemini using debug")
turn_2_content = history[turn_2_start:]
assert "{\n" in turn_2_content # JSON formatting preserved
assert '"continuation_id"' in turn_2_content


@@ -19,7 +19,8 @@ from config import MCP_PROMPT_SIZE_LIMIT
from tools.analyze import AnalyzeTool
from tools.chat import ChatTool
from tools.codereview import CodeReviewTool
-from tools.debug import DebugIssueTool
+# from tools.debug import DebugIssueTool # Commented out - debug tool refactored
from tools.precommit import Precommit
from tools.thinkdeep import ThinkDeepTool

@@ -250,25 +251,30 @@ class TestLargePromptHandling:
# The core fix ensures large prompts are detected at the right time
assert output["status"] in ["success", "files_required_to_continue", "resend_prompt"]

-@pytest.mark.asyncio
-async def test_debug_large_error_description(self, large_prompt):
-"""Test that debug tool detects large error_description."""
-tool = DebugIssueTool()
-result = await tool.execute({"prompt": large_prompt})
-assert len(result) == 1
-output = json.loads(result[0].text)
-assert output["status"] == "resend_prompt"
-@pytest.mark.asyncio
-async def test_debug_large_error_context(self, large_prompt, normal_prompt):
-"""Test that debug tool detects large error_context."""
-tool = DebugIssueTool()
-result = await tool.execute({"prompt": normal_prompt, "error_context": large_prompt})
-assert len(result) == 1
-output = json.loads(result[0].text)
-assert output["status"] == "resend_prompt"

+# NOTE: Debug tool tests have been commented out because the debug tool has been
+# refactored to use a self-investigation pattern instead of accepting a prompt field.
+# The new debug tool requires fields like: step, step_number, total_steps, next_step_required, findings
+# and doesn't have the "resend_prompt" functionality for large prompts.
+# @pytest.mark.asyncio
+# async def test_debug_large_error_description(self, large_prompt):
+# """Test that debug tool detects large error_description."""
+# tool = DebugIssueTool()
+# result = await tool.execute({"prompt": large_prompt})
+#
+# assert len(result) == 1
+# output = json.loads(result[0].text)
+# assert output["status"] == "resend_prompt"
+# @pytest.mark.asyncio
+# async def test_debug_large_error_context(self, large_prompt, normal_prompt):
+# """Test that debug tool detects large error_context."""
+# tool = DebugIssueTool()
+# result = await tool.execute({"prompt": normal_prompt, "error_context": large_prompt})
+#
+# assert len(result) == 1
+# output = json.loads(result[0].text)
+# assert output["status"] == "resend_prompt"

@pytest.mark.asyncio
async def test_analyze_large_question(self, large_prompt):


@@ -13,7 +13,8 @@ import pytest
from tools.analyze import AnalyzeTool
from tools.chat import ChatTool
from tools.codereview import CodeReviewTool
-from tools.debug import DebugIssueTool
+# from tools.debug import DebugIssueTool # Commented out - debug tool refactored
from tools.precommit import Precommit
from tools.thinkdeep import ThinkDeepTool

@@ -182,33 +183,37 @@ class TestPromptRegression:
output = json.loads(result[0].text)
assert output["status"] == "success"

-@pytest.mark.asyncio
-async def test_debug_normal_error(self, mock_model_response):
-"""Test debug tool with normal error description."""
-tool = DebugIssueTool()
-with patch.object(tool, "get_model_provider") as mock_get_provider:
-mock_provider = MagicMock()
-mock_provider.get_provider_type.return_value = MagicMock(value="google")
-mock_provider.supports_thinking_mode.return_value = False
-mock_provider.generate_content.return_value = mock_model_response(
-"Root cause: The variable is undefined. Fix: Initialize it..."
-)
-mock_get_provider.return_value = mock_provider
-result = await tool.execute(
-{
-"prompt": "TypeError: Cannot read property 'name' of undefined",
-"error_context": "at line 42 in user.js\n console.log(user.name)",
-"runtime_info": "Node.js v16.14.0",
-}
-)
-assert len(result) == 1
-output = json.loads(result[0].text)
-assert output["status"] == "success"
-assert "Next Steps:" in output["content"]
-assert "Root cause" in output["content"]

+# NOTE: Debug tool test has been commented out because the debug tool has been
+# refactored to use a self-investigation pattern instead of accepting prompt/error_context fields.
+# The new debug tool requires fields like: step, step_number, total_steps, next_step_required, findings
+# @pytest.mark.asyncio
+# async def test_debug_normal_error(self, mock_model_response):
+# """Test debug tool with normal error description."""
+# tool = DebugIssueTool()
+#
+# with patch.object(tool, "get_model_provider") as mock_get_provider:
+# mock_provider = MagicMock()
+# mock_provider.get_provider_type.return_value = MagicMock(value="google")
+# mock_provider.supports_thinking_mode.return_value = False
+# mock_provider.generate_content.return_value = mock_model_response(
+# "Root cause: The variable is undefined. Fix: Initialize it..."
+# )
+# mock_get_provider.return_value = mock_provider
+#
+# result = await tool.execute(
+# {
+# "prompt": "TypeError: Cannot read property 'name' of undefined",
+# "error_context": "at line 42 in user.js\n console.log(user.name)",
+# "runtime_info": "Node.js v16.14.0",
+# }
+# )
+#
+# assert len(result) == 1
+# output = json.loads(result[0].text)
+# assert output["status"] == "success"
+# assert "Next Steps:" in output["content"]
+# assert "Root cause" in output["content"]

@pytest.mark.asyncio
async def test_analyze_normal_question(self, mock_model_response):


@@ -6,7 +6,7 @@ import json
import pytest

-from tools import AnalyzeTool, ChatTool, CodeReviewTool, DebugIssueTool, ThinkDeepTool
+from tools import AnalyzeTool, ChatTool, CodeReviewTool, ThinkDeepTool

class TestThinkDeepTool:

@@ -183,94 +183,6 @@ class TestCodeReviewTool:
ModelProviderRegistry._instance = None
class TestDebugIssueTool:
"""Test the debug tool"""
@pytest.fixture
def tool(self):
return DebugIssueTool()
def test_tool_metadata(self, tool):
"""Test tool metadata"""
assert tool.get_name() == "debug"
assert "DEBUG & ROOT CAUSE ANALYSIS" in tool.get_description()
assert tool.get_default_temperature() == 0.2
schema = tool.get_input_schema()
assert "prompt" in schema["properties"]
assert schema["required"] == ["prompt"]
@pytest.mark.asyncio
async def test_execute_with_context(self, tool):
"""Test execution with error context using real integration testing"""
import importlib
import os
# Save original environment
original_env = {
"OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY"),
"DEFAULT_MODEL": os.environ.get("DEFAULT_MODEL"),
}
try:
# Set up environment for real provider resolution
os.environ["OPENAI_API_KEY"] = "sk-test-key-debug-context-test-not-real"
os.environ["DEFAULT_MODEL"] = "o3-mini"
# Clear other provider keys to isolate to OpenAI
for key in ["GEMINI_API_KEY", "XAI_API_KEY", "OPENROUTER_API_KEY"]:
os.environ.pop(key, None)
# Reload config and clear registry
import config
importlib.reload(config)
from providers.registry import ModelProviderRegistry
ModelProviderRegistry._instance = None
# Test with real provider resolution
try:
result = await tool.execute(
{
"prompt": "Test fails intermittently",
"error_context": "AssertionError in test_async",
"previous_attempts": "Added sleep, still fails",
"model": "o3-mini",
}
)
# If we get here, check the response format
assert len(result) == 1
# Should contain debug analysis
assert result[0].text is not None
except Exception as e:
# Expected: API call will fail with fake key
error_msg = str(e)
# Should NOT be a mock-related error
assert "MagicMock" not in error_msg
assert "'<' not supported between instances" not in error_msg
# Should be a real provider error
assert any(
phrase in error_msg
for phrase in ["API", "key", "authentication", "provider", "network", "connection"]
)
finally:
# Restore environment
for key, value in original_env.items():
if value is not None:
os.environ[key] = value
else:
os.environ.pop(key, None)
# Reload config and clear registry
importlib.reload(config)
ModelProviderRegistry._instance = None
class TestAnalyzeTool:
"""Test the analyze tool"""

@@ -400,23 +312,6 @@ class TestAbsolutePathValidation:
assert "must be FULL absolute paths" in response["content"]
assert "../parent/file.py" in response["content"]
@pytest.mark.asyncio
async def test_debug_tool_relative_path_rejected(self):
"""Test that debug tool rejects relative paths"""
tool = DebugIssueTool()
result = await tool.execute(
{
"prompt": "Something broke",
"files": ["src/main.py"], # relative path
}
)
assert len(result) == 1
response = json.loads(result[0].text)
assert response["status"] == "error"
assert "must be FULL absolute paths" in response["content"]
assert "src/main.py" in response["content"]
@pytest.mark.asyncio
async def test_thinkdeep_tool_relative_path_rejected(self):
"""Test that thinkdeep tool rejects relative paths"""


@@ -1,7 +1,9 @@
""" """
Debug Issue tool - Root cause analysis and debugging assistance Debug Issue tool - Root cause analysis and debugging assistance with systematic investigation
""" """
import json
import logging
from typing import TYPE_CHECKING, Any, Optional from typing import TYPE_CHECKING, Any, Optional
from pydantic import Field from pydantic import Field
@@ -14,155 +16,207 @@ from systemprompts import DEBUG_ISSUE_PROMPT
from .base import BaseTool, ToolRequest from .base import BaseTool, ToolRequest
# Field descriptions to avoid duplication between Pydantic and JSON schema logger = logging.getLogger(__name__)
DEBUG_FIELD_DESCRIPTIONS = {
"prompt": ( # Field descriptions for the investigation steps
"MANDATORY: You MUST first think deep about the issue, what it is, why it might be happening, what code might be involved, " DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS = {
"is it an error stemming out of the code directly or is it a side-effect of some part of the existing code. If it's an error " "step": (
"message, could it be coming from an external resource and NOT directly from the project? What part of the code seems most likely" "Your current investigation step. For the first step, describe the issue/error to investigate. "
"the culprit. MUST try and ZERO IN on the issue and surrounding code. Include all the details into the prompt that you can provide: " "For subsequent steps, describe what you're investigating, what code you're examining, "
"error messages, symptoms, when it occurs, steps to reproduce, environment details, " "what patterns you're looking for, or what hypothesis you're testing."
"recent changes, and any other relevant information. Mention any previous attempts at fixing this issue, "
"including any past fix that was in place but has now regressed. "
"The more context available, the better the analysis. "
"PERFORM SYSTEMATIC INVESTIGATION: You MUST begin by thinking hard and performing a thorough investigation using a systematic approach. "
"First understand the issue, find the code that may be causing it or code that is breaking, as well as any related code that could have caused this as a side effect. "
"You MUST maintain detailed investigation notes in a DEBUGGING_{issue_description}.md file within the project folder, "
"updating it as it performs step-by-step analysis of the code, trying to determine the actual root cause and understanding how a minimal, appropriate fix can be found. "
"This file MUST contain functions, methods, files visited OR determined to be part of the problem. You MUST update this and remove any references that it finds to be irrelevant during its investigation. "
"CRITICAL: If after thorough investigation You has very high confidence that NO BUG EXISTS that correlates to the reported symptoms, "
"You should consider the possibility that the reported issue may not actually be present, may be a misunderstanding, or may be conflated with something else entirely. "
"In such cases, you should gather more information from the user through targeted questioning rather than continue hunting for non-existent bugs. "
"Once complete, you MUST provide also pass in this file into the files parameter of this tool. "
"It is ESSENTIAL that this detailed work is performed by you before sharing all the relevant details with its development assistant. This will greatly help in zeroing in on the root cause."
), ),
"step_number": "Current step number in the investigation sequence (starts at 1)",
"total_steps": "Current estimate of total investigation steps needed (can be adjusted as investigation progresses)",
"next_step_required": "Whether another investigation step is required",
"findings": ( "findings": (
"You MUST first perform its own investigation, gather its findings and analysis. Include: steps taken to analyze the issue, " "Current findings from this investigation step. Include code patterns discovered, "
"code patterns discovered, initial hypotheses formed, any relevant classes/functions/methods examined, " "potential causes identified, hypotheses formed, or evidence gathered."
"and any preliminary conclusions. If investigation yields no concrete evidence of a bug correlating to the reported symptoms, "
"You should clearly state this finding and consider that the issue may not exist as described. "
"This provides context for the assistant model's analysis."
), ),
"files": ( "files_checked": (
"Essential files for debugging - ONLY include files that are directly related to the issue, " "List of files you've examined so far in the investigation (cumulative list). "
"contain the problematic code, or are necessary for understanding the root cause. " "Include all files you've looked at, even if they turned out to be irrelevant."
"This can include any relevant log files, error description documents, investigation documents, "
"Your own findings as a document, related code that may help with analysis."
"DO NOT include every file scanned during investigation (must be FULL absolute paths - DO NOT SHORTEN)."
), ),
"error_context": "Stack trace, snippet from logs, or additional error context. For very large text you MUST instead" "relevant_files": (
"save the context as a temporary file within the project folder and share it as a FULL absolute file path - DO NOT SHORTEN" "List of files that are definitely related to the issue (subset of files_checked). "
"reference to the files parameter.", "Only include files that contain code directly related to the problem."
"images": "Optional images showing error screens, UI issues, logs displays, or visual debugging information", ),
"relevant_methods": (
"List of specific methods/functions that are involved in the issue. "
"Format: 'ClassName.methodName' or 'functionName'"
),
"hypothesis": (
"Your current working hypothesis about the root cause. This can be updated/revised "
"as the investigation progresses."
),
"confidence": "Your confidence level in the current hypothesis: 'low', 'medium', or 'high'",
"backtrack_from_step": "If you need to revise a previous finding, which step number to backtrack from",
"continuation_id": "Thread continuation ID for multi-turn investigation sessions",
"images": (
"Optional images showing error screens, UI issues, logs displays, or visual debugging information "
"that help understand the issue (must be FULL absolute paths - DO NOT SHORTEN)"
),
}
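# Illustrative only (not part of this change): a minimal first investigation step using the
# required fields above might look like:
#
#     {
#         "step": "Investigate intermittent RuntimeError raised during session cleanup",
#         "step_number": 1,
#         "total_steps": 3,
#         "next_step_required": True,
#         "findings": "Error appears while iterating over the sessions dict",
#     }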
# Field descriptions for the final debug request
DEBUG_FIELD_DESCRIPTIONS = {
"initial_issue": "The original issue description that started the investigation",
"investigation_summary": "Complete summary of the systematic investigation performed",
"findings": "Consolidated findings from all investigation steps",
"files": "Essential files identified during investigation (must be FULL absolute paths - DO NOT SHORTEN)",
"error_context": "Stack trace, logs, or error context discovered during investigation",
"relevant_methods": "List of methods/functions identified as involved in the issue",
"hypothesis": "Final hypothesis about the root cause after investigation",
"images": "Optional images showing error screens, UI issues, or visual debugging information",
} }
class DebugIssueRequest(ToolRequest): class DebugInvestigationRequest(ToolRequest):
"""Request model for debug tool""" """Request model for debug investigation steps"""
prompt: str = Field(..., description=DEBUG_FIELD_DESCRIPTIONS["prompt"]) # Required fields for each investigation step
findings: Optional[str] = Field(None, description=DEBUG_FIELD_DESCRIPTIONS["findings"]) step: str = Field(..., description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["step"])
files: Optional[list[str]] = Field(None, description=DEBUG_FIELD_DESCRIPTIONS["files"]) step_number: int = Field(..., description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["step_number"])
error_context: Optional[str] = Field(None, description=DEBUG_FIELD_DESCRIPTIONS["error_context"]) total_steps: int = Field(..., description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["total_steps"])
images: Optional[list[str]] = Field(None, description=DEBUG_FIELD_DESCRIPTIONS["images"]) next_step_required: bool = Field(..., description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["next_step_required"])
# Investigation tracking fields
findings: str = Field(..., description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["findings"])
files_checked: list[str] = Field(
default_factory=list, description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["files_checked"]
)
relevant_files: list[str] = Field(
default_factory=list, description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["relevant_files"]
)
relevant_methods: list[str] = Field(
default_factory=list, description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["relevant_methods"]
)
hypothesis: Optional[str] = Field(None, description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["hypothesis"])
confidence: Optional[str] = Field("low", description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["confidence"])
# Optional backtracking field
backtrack_from_step: Optional[int] = Field(
None, description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["backtrack_from_step"]
)
# Optional continuation field
continuation_id: Optional[str] = Field(None, description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["continuation_id"])
# Optional images for visual debugging
images: Optional[list[str]] = Field(default=None, description=DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["images"])
# Override inherited fields to exclude them
model: Optional[str] = Field(default=None, exclude=True)
temperature: Optional[float] = Field(default=None, exclude=True)
thinking_mode: Optional[str] = Field(default=None, exclude=True)
use_websearch: Optional[bool] = Field(default=None, exclude=True)
class DebugIssueTool(BaseTool): class DebugIssueTool(BaseTool):
"""Advanced debugging and root cause analysis tool""" """Advanced debugging tool with systematic self-investigation"""
def __init__(self):
super().__init__()
self.investigation_history = []
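# Aggregate view across all investigation steps; rebuilt from investigation_history after backtracking (see _reprocess_consolidated_findings)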
self.consolidated_findings = {
"files_checked": set(),
"relevant_files": set(),
"relevant_methods": set(),
"findings": [],
"hypotheses": [],
"images": [],
}
def get_name(self) -> str: def get_name(self) -> str:
return "debug" return "debug"
def get_description(self) -> str: def get_description(self) -> str:
return ( return (
"DEBUG & ROOT CAUSE ANALYSIS - Expert debugging for complex issues with systematic investigation support. " "DEBUG & ROOT CAUSE ANALYSIS - Systematic self-investigation followed by expert analysis. "
"Use this when you need to debug code, find out why something is failing, identify root causes, " "This tool guides you through a step-by-step investigation process where you:\n\n"
"trace errors, or diagnose issues. " "1. Start with step 1: describe the issue to investigate\n"
"MANDATORY: Claud you MUST first think deep and follow these instructions when using this tool" "2. Continue with investigation steps: examine code, trace errors, test hypotheses\n"
"SYSTEMATIC INVESTIGATION WORKFLOW: " "3. Track findings, relevant files, and methods throughout\n"
"You MUST begin by thinking hard and performing a thorough investigation using a systematic approach. " "4. Update hypotheses as understanding evolves\n"
"First understand the issue, find the code that may be causing it or code that is breaking, as well as any related code that could have caused this as a side effect. " "5. Backtrack and revise findings when needed\n"
"You MUST maintain detailed investigation notes while it performs its analysis, " "6. Once investigation is complete, receive expert analysis\n\n"
"updating it as it performs step-by-step analysis of the code, trying to determine the actual root cause and understanding how a minimal, appropriate fix can be found. " "The tool enforces systematic investigation methodology:\n"
"This file MUST contain functions, methods, files visited OR determined to be part of the problem. You MUST update this and remove any references that it finds to be irrelevant during its investigation. " "- Methodical code examination and evidence collection\n"
"Once complete, You MUST provide Zen's debug tool with this file passed into the files parameter. " "- Hypothesis formation and validation\n"
"1. INVESTIGATE SYSTEMATICALLY: You MUST think and use a methodical approach to trace through error reports, " "- File and method tracking for context\n"
"examine code, and gather evidence step by step " "- Confidence assessment and revision capabilities\n\n"
"2. DOCUMENT FINDINGS: Maintain detailed investigation notes to " "Perfect for: complex bugs, mysterious errors, performance issues, "
"keep the user informed during its initial investigation. This investigation MUST be shared with this tool for the assistant " "race conditions, memory leaks, integration problems."
"to be able to help more effectively. "
"3. USE TRACER TOOL: For complex method calls, class references, or side effects use Zen's tracer tool and include its output as part of the "
"prompt or additional context "
"4. COLLECT EVIDENCE: Document important discoveries and validation attempts "
"5. PROVIDE COMPREHENSIVE FINDINGS: Pass complete findings to this tool for expert analysis "
"INVESTIGATION METHODOLOGY: "
"- Start with error messages/symptoms and work backwards to root cause "
"- Examine code flow and identify potential failure points "
"- Use tracer tool for complex method interactions and dependencies if and as needed but continue with the investigation after using it "
"- Test hypotheses against actual code and logs and confirm the idea holds "
"- Document everything systematically "
"- CRITICAL: If investigation yields no concrete evidence of a bug, consider that the reported issue may not exist as described and gather more information through questioning "
"ESSENTIAL FILES ONLY: Include only files (documents, code etc) directly related to the issue. "
"Focus on quality over quantity for assistant model analysis. "
"STRUCTURED OUTPUT: Assistant models return JSON responses with hypothesis "
"ranking, evidence correlation, and actionable fixes. "
"Choose thinking_mode based on issue complexity: 'low' for simple errors, "
"'medium' for standard debugging (default), 'high' for complex system issues, "
"'max' for extremely challenging bugs requiring deepest analysis. "
"Note: If you're not currently using a top-tier model such as Opus 4 or above, these tools can provide enhanced capabilities."
) )
def get_input_schema(self) -> dict[str, Any]: def get_input_schema(self) -> dict[str, Any]:
schema = { schema = {
"type": "object", "type": "object",
"properties": { "properties": {
"prompt": { # Investigation step fields
"step": {
"type": "string", "type": "string",
"description": DEBUG_FIELD_DESCRIPTIONS["prompt"], "description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["step"],
},
"step_number": {
"type": "integer",
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["step_number"],
"minimum": 1,
},
"total_steps": {
"type": "integer",
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["total_steps"],
"minimum": 1,
},
"next_step_required": {
"type": "boolean",
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["next_step_required"],
}, },
"model": self.get_model_field_schema(),
"findings": { "findings": {
"type": "string", "type": "string",
"description": DEBUG_FIELD_DESCRIPTIONS["findings"], "description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["findings"],
}, },
"files": { "files_checked": {
"type": "array", "type": "array",
"items": {"type": "string"}, "items": {"type": "string"},
"description": DEBUG_FIELD_DESCRIPTIONS["files"], "description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["files_checked"],
}, },
"error_context": { "relevant_files": {
"type": "array",
"items": {"type": "string"},
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["relevant_files"],
},
"relevant_methods": {
"type": "array",
"items": {"type": "string"},
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["relevant_methods"],
},
"hypothesis": {
"type": "string", "type": "string",
"description": DEBUG_FIELD_DESCRIPTIONS["error_context"], "description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["hypothesis"],
},
"confidence": {
"type": "string",
"enum": ["low", "medium", "high"],
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["confidence"],
},
"backtrack_from_step": {
"type": "integer",
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["backtrack_from_step"],
"minimum": 1,
},
"continuation_id": {
"type": "string",
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["continuation_id"],
}, },
"images": { "images": {
"type": "array", "type": "array",
"items": {"type": "string"}, "items": {"type": "string"},
"description": DEBUG_FIELD_DESCRIPTIONS["images"], "description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["images"],
},
"temperature": {
"type": "number",
"description": "Temperature (0-1, default 0.2 for accuracy)",
"minimum": 0,
"maximum": 1,
},
"thinking_mode": {
"type": "string",
"enum": ["minimal", "low", "medium", "high", "max"],
"description": "Thinking depth: minimal (0.5% of model max), low (8%), medium (33%), high (67%), max (100% of model max)",
},
"use_websearch": {
"type": "boolean",
"description": "Enable web search for documentation, best practices, and current information. Particularly useful for: brainstorming sessions, architectural design discussions, exploring industry best practices, working with specific frameworks/technologies, researching solutions to complex problems, or when current documentation and community insights would enhance the analysis.",
"default": True,
},
"continuation_id": {
"type": "string",
"description": "Thread continuation ID for multi-turn conversations. Can be used to continue conversations across different tools. Only provide this if continuing a previous conversation thread.",
}, },
}, },
"required": ["prompt"] + (["model"] if self.is_effective_auto_mode() else []), # Required fields for investigation
"required": ["step", "step_number", "total_steps", "next_step_required", "findings"],
} }
return schema return schema
def get_system_prompt(self) -> str: def get_system_prompt(self) -> str:
@@ -171,8 +225,6 @@ class DebugIssueTool(BaseTool):
def get_default_temperature(self) -> float: def get_default_temperature(self) -> float:
return TEMPERATURE_ANALYTICAL return TEMPERATURE_ANALYTICAL
# Line numbers are enabled by default from base class for precise error location
def get_model_category(self) -> "ToolModelCategory": def get_model_category(self) -> "ToolModelCategory":
"""Debug requires deep analysis and reasoning""" """Debug requires deep analysis and reasoning"""
from tools.models import ToolModelCategory from tools.models import ToolModelCategory
@@ -180,138 +232,342 @@ class DebugIssueTool(BaseTool):
return ToolModelCategory.EXTENDED_REASONING return ToolModelCategory.EXTENDED_REASONING
def get_request_model(self): def get_request_model(self):
return DebugIssueRequest return DebugInvestigationRequest
def requires_model(self) -> bool:
"""
Debug tool manages its own model interactions.
It doesn't need model during investigation steps, only for final analysis.
"""
return False
async def execute(self, arguments: dict[str, Any]) -> list:
"""
Override execute to implement self-investigation pattern.
Investigation Flow:
1. Claude calls debug with investigation steps
2. Tool tracks findings, files, methods progressively
3. Once investigation is complete, tool calls AI model for expert analysis
4. Returns structured response combining investigation + expert analysis
"""
from mcp.types import TextContent
from utils.conversation_memory import add_turn, create_thread
try:
# Validate request
request = DebugInvestigationRequest(**arguments)
# Adjust total steps if needed
if request.step_number > request.total_steps:
request.total_steps = request.step_number
# Handle continuation
continuation_id = request.continuation_id
# Create thread for first step
if not continuation_id and request.step_number == 1:
continuation_id = create_thread("debug", arguments)
# Store initial issue description
self.initial_issue = request.step
# Handle backtracking first if requested
if request.backtrack_from_step:
# Remove findings after the backtrack point
self.investigation_history = [
s for s in self.investigation_history if s["step_number"] < request.backtrack_from_step
]
# Reprocess consolidated findings to match truncated history
self._reprocess_consolidated_findings()
# Log if step number needs correction
expected_step_number = len(self.investigation_history) + 1
if request.step_number != expected_step_number:
logger.debug(
f"Step number adjusted from {request.step_number} to {expected_step_number} after backtracking"
)
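# Worked example of the backtracking rule above (step numbers are hypothetical):
# with steps 1-4 recorded and backtrack_from_step=3, steps 3 and 4 are discarded,
# consolidated findings are rebuilt from steps 1-2, and the incoming request is
# treated as the new step 3.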
# Process investigation step
step_data = {
"step": request.step,
"step_number": request.step_number,
"findings": request.findings,
"files_checked": request.files_checked,
"relevant_files": request.relevant_files,
"relevant_methods": request.relevant_methods,
"hypothesis": request.hypothesis,
"confidence": request.confidence,
"images": request.images,
}
# Store in history
self.investigation_history.append(step_data)
# Update consolidated findings
self.consolidated_findings["files_checked"].update(request.files_checked)
self.consolidated_findings["relevant_files"].update(request.relevant_files)
self.consolidated_findings["relevant_methods"].update(request.relevant_methods)
self.consolidated_findings["findings"].append(f"Step {request.step_number}: {request.findings}")
if request.hypothesis:
self.consolidated_findings["hypotheses"].append(
{"step": request.step_number, "hypothesis": request.hypothesis, "confidence": request.confidence}
)
if request.images:
self.consolidated_findings["images"].extend(request.images)
# Build response
response_data = {
"status": "investigation_in_progress",
"step_number": request.step_number,
"total_steps": request.total_steps,
"next_step_required": request.next_step_required,
"investigation_status": {
"files_checked": len(self.consolidated_findings["files_checked"]),
"relevant_files": len(self.consolidated_findings["relevant_files"]),
"relevant_methods": len(self.consolidated_findings["relevant_methods"]),
"hypotheses_formed": len(self.consolidated_findings["hypotheses"]),
"images_collected": len(set(self.consolidated_findings["images"])),
"current_confidence": request.confidence,
},
"output": {
"instructions": "Continue systematic investigation. Present findings clearly and proceed to next step if required.",
"format": "systematic_investigation",
},
}
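# Sketch of the intermediate response returned while next_step_required is True
# (counts below are placeholders, not real output):
#   {
#       "status": "investigation_in_progress",
#       "step_number": 1,
#       "total_steps": 3,
#       "next_step_required": True,
#       "investigation_status": {"files_checked": 1, "relevant_files": 1, "hypotheses_formed": 0, ...},
#       "output": {"format": "systematic_investigation", ...}
#   }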
if continuation_id:
response_data["continuation_id"] = continuation_id
# If investigation is complete, call the AI model for expert analysis
if not request.next_step_required:
response_data["status"] = "calling_expert_analysis"
response_data["investigation_complete"] = True
# Prepare consolidated investigation summary
investigation_summary = self._prepare_investigation_summary()
# Call the AI model with full context
expert_analysis = await self._call_expert_analysis(
initial_issue=getattr(self, "initial_issue", request.step),
investigation_summary=investigation_summary,
relevant_files=list(self.consolidated_findings["relevant_files"]),
relevant_methods=list(self.consolidated_findings["relevant_methods"]),
final_hypothesis=request.hypothesis,
error_context=self._extract_error_context(),
images=list(set(self.consolidated_findings["images"])), # Unique images
model_info=arguments.get("_model_context"),
model_override=arguments.get("model"), # Pass model selection from final step
)
# Combine investigation and expert analysis
response_data["expert_analysis"] = expert_analysis
response_data["complete_investigation"] = {
"initial_issue": getattr(self, "initial_issue", request.step),
"steps_taken": len(self.investigation_history),
"files_examined": list(self.consolidated_findings["files_checked"]),
"relevant_files": list(self.consolidated_findings["relevant_files"]),
"relevant_methods": list(self.consolidated_findings["relevant_methods"]),
"investigation_summary": investigation_summary,
}
response_data["next_steps"] = (
"Investigation complete with expert analysis. Present the findings, hypotheses, "
"and recommended fixes to the user. Focus on the most likely root cause and "
"provide actionable implementation guidance."
)
else:
response_data["next_steps"] = (
f"Continue investigation with step {request.step_number + 1}. "
f"Focus on: examining relevant code, testing hypotheses, gathering evidence."
)
# Store in conversation memory
if continuation_id:
add_turn(
thread_id=continuation_id,
role="assistant",
content=json.dumps(response_data, indent=2),
tool_name="debug",
files=list(self.consolidated_findings["relevant_files"]),
images=request.images,
)
raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**size_check).model_dump_json()}") return [TextContent(type="text", text=json.dumps(response_data, indent=2))]
if request.error_context: except Exception as e:
size_check = self.check_prompt_size(request.error_context) logger.error(f"Error in debug investigation: {e}", exc_info=True)
if size_check: error_data = {
from tools.models import ToolOutput "status": "investigation_failed",
"error": str(e),
"step_number": arguments.get("step_number", 0),
}
return [TextContent(type="text", text=json.dumps(error_data, indent=2))]
raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**size_check).model_dump_json()}") def _reprocess_consolidated_findings(self):
"""Reprocess consolidated findings after backtracking"""
self.consolidated_findings = {
"files_checked": set(),
"relevant_files": set(),
"relevant_methods": set(),
"findings": [],
"hypotheses": [],
"images": [],
}
for step in self.investigation_history:
self.consolidated_findings["files_checked"].update(step.get("files_checked", []))
self.consolidated_findings["relevant_files"].update(step.get("relevant_files", []))
self.consolidated_findings["relevant_methods"].update(step.get("relevant_methods", []))
self.consolidated_findings["findings"].append(f"Step {step['step_number']}: {step['findings']}")
if step.get("hypothesis"):
self.consolidated_findings["hypotheses"].append(
{
"step": step["step_number"],
"hypothesis": step["hypothesis"],
"confidence": step.get("confidence", "low"),
}
)
if step.get("images"):
self.consolidated_findings["images"].extend(step["images"])
def _prepare_investigation_summary(self) -> str:
"""Prepare a comprehensive summary of the investigation"""
summary_parts = [
"=== SYSTEMATIC INVESTIGATION SUMMARY ===",
f"Total steps: {len(self.investigation_history)}",
f"Files examined: {len(self.consolidated_findings['files_checked'])}",
f"Relevant files identified: {len(self.consolidated_findings['relevant_files'])}",
f"Methods/functions involved: {len(self.consolidated_findings['relevant_methods'])}",
"",
"=== INVESTIGATION PROGRESSION ===",
]
for finding in self.consolidated_findings["findings"]:
summary_parts.append(finding)
if self.consolidated_findings["hypotheses"]:
summary_parts.extend(
[
if request.error_context: "",
context_parts.append(f"\n=== ERROR CONTEXT/STACK TRACE ===\n{request.error_context}\n=== END CONTEXT ===") "=== HYPOTHESIS EVOLUTION ===",
]
)
for hyp in self.consolidated_findings["hypotheses"]:
summary_parts.append(f"Step {hyp['step']} ({hyp['confidence']} confidence): {hyp['hypothesis']}")
return "\n".join(summary_parts)
def _extract_error_context(self) -> Optional[str]:
"""Extract error context from investigation findings"""
error_patterns = ["error", "exception", "stack trace", "traceback", "failure"]
error_context_parts = []
for finding in self.consolidated_findings["findings"]:
if any(pattern in finding.lower() for pattern in error_patterns):
error_context_parts.append(finding)
return "\n".join(error_context_parts) if error_context_parts else None
async def _call_expert_analysis(
self,
initial_issue: str,
investigation_summary: str,
relevant_files: list[str],
relevant_methods: list[str],
final_hypothesis: Optional[str],
error_context: Optional[str],
images: list[str],
model_info: Optional[Any] = None,
model_override: Optional[str] = None,
) -> dict:
"""Call AI model for expert analysis of the investigation"""
# Prepare the debug prompt with all investigation context
prompt_parts = [
f"=== ISSUE DESCRIPTION ===\n{initial_issue}\n=== END DESCRIPTION ===",
f"\n=== CLAUDE'S INVESTIGATION FINDINGS ===\n{investigation_summary}\n=== END FINDINGS ===",
]
if error_context:
prompt_parts.append(f"\n=== ERROR CONTEXT/STACK TRACE ===\n{error_context}\n=== END CONTEXT ===")
if relevant_methods:
prompt_parts.append(
"\n=== RELEVANT METHODS/FUNCTIONS ===\n"
+ "\n".join(f"- {method}" for method in relevant_methods)
+ "\n=== END METHODS ==="
)
if final_hypothesis:
prompt_parts.append(f"\n=== FINAL HYPOTHESIS ===\n{final_hypothesis}\n=== END HYPOTHESIS ===")
if images:
prompt_parts.append(
"\n=== VISUAL DEBUGGING INFORMATION ===\n"
+ "\n".join(f"- {img}" for img in images)
+ "\n=== END VISUAL INFORMATION ==="
)
# Add file content if we have relevant files
if relevant_files:
file_content, _ = self._prepare_file_content_for_prompt(relevant_files, None, "Essential debugging files")
if file_content:
prompt_parts.append(
f"\n=== ESSENTIAL FILES FOR DEBUGGING ===\n{file_content}\n=== END ESSENTIAL FILES ==="
)
full_prompt = "\n".join(prompt_parts)
# Get appropriate model and provider
from config import DEFAULT_MODEL
from providers.registry import ModelProviderRegistry
model_name = model_override or DEFAULT_MODEL # Use override if provided
provider = ModelProviderRegistry.get_provider_for_model(model_name)
if not provider:
return {"error": f"No provider available for model {model_name}", "status": "provider_error"}
# Generate AI response
try:
full_analysis_prompt = f"{self.get_system_prompt()}\n\n{full_prompt}\n\nPlease debug this issue following the structured format in the system prompt."
# Prepare generation kwargs
generation_kwargs = {
"prompt": full_analysis_prompt,
"model_name": model_name,
"system_prompt": "", # Already included in prompt
"temperature": self.get_default_temperature(),
"thinking_mode": "high", # High thinking for debug analysis
}
# Add images if available
if images:
generation_kwargs["images"] = images
model_response = provider.generate_content(**generation_kwargs)
if model_response.content:
# Try to parse as JSON
try:
analysis_result = json.loads(model_response.content.strip())
return analysis_result
except json.JSONDecodeError:
# Return as text if not valid JSON
return {
"status": "analysis_complete",
"raw_analysis": model_response.content,
"parse_error": "Response was not valid JSON",
}
else:
return {"error": "No response from model", "status": "empty_response"}
except Exception as e:
logger.error(f"Error calling expert analysis: {e}", exc_info=True)
return {"error": str(e), "status": "analysis_error"}
# Stub implementations for base class requirements
async def prepare_prompt(self, request) -> str:
return "" # Not used - execute() is overridden
def format_response(self, response: str, request, model_info: dict = None) -> str:
return response # Not used - execute() is overridden