Vastly improved debug tool and related instructions
Accompanying simulation test; cleanup: a single source of truth for parameter descriptions
@@ -14,6 +14,7 @@ from .test_content_validation import ContentValidationTest
from .test_conversation_chain_validation import ConversationChainValidationTest
from .test_cross_tool_comprehensive import CrossToolComprehensiveTest
from .test_cross_tool_continuation import CrossToolContinuationTest
from .test_debug_validation import DebugValidationTest
from .test_line_number_validation import LineNumberValidationTest
from .test_logs_validation import LogsValidationTest
from .test_model_thinking_config import TestModelThinkingConfig
@@ -48,6 +49,7 @@ TEST_REGISTRY = {
    "token_allocation_validation": TokenAllocationValidationTest,
    "testgen_validation": TestGenValidationTest,
    "refactor_validation": RefactorValidationTest,
    "debug_validation": DebugValidationTest,
    "conversation_chain_validation": ConversationChainValidationTest,
    "vision_capability": VisionCapabilityTest,
    "xai_models": XAIModelsTest,
@@ -76,6 +78,7 @@ __all__ = [
    "TokenAllocationValidationTest",
    "TestGenValidationTest",
    "RefactorValidationTest",
    "DebugValidationTest",
    "ConversationChainValidationTest",
    "VisionCapabilityTest",
    "XAIModelsTest",
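A minimal sketch (not part of this commit) of how a test registered above might be looked up and run by name; whether the runner instantiates the class with arguments is an assumption:

# hypothetical lookup through the registry exported by __init__.py above
from simulator_tests import TEST_REGISTRY

test_cls = TEST_REGISTRY["debug_validation"]  # -> DebugValidationTest
passed = test_cls().run_test()  # run_test() returns True on success, False on failure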
simulator_tests/test_debug_validation.py (new file, 436 lines)
@@ -0,0 +1,436 @@
#!/usr/bin/env python3
"""
Debug Tool Validation Test

Tests the debug tool with real bugs to validate:
- Proper execution with flash model
- Actual bug identification and analysis
- Hypothesis generation for root causes
- Log validation for tool execution
"""
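# Registered as "debug_validation" in TEST_REGISTRY and exported via __all__
# (see the accompanying __init__.py changes above).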
import json

from .base_test import BaseSimulatorTest


class DebugValidationTest(BaseSimulatorTest):
    """Test debug tool with actual bug scenarios"""

    @property
    def test_name(self) -> str:
        return "debug_validation"

    @property
    def test_description(self) -> str:
        return "Debug tool validation with actual bugs"

    def run_test(self) -> bool:
        """Test debug tool with real bugs"""
        try:
            self.logger.info("Test: Debug tool validation")

            # Setup test files directory first
            self.setup_test_files()

            # Create a Python file with a subtle but realistic bug
            buggy_code = """#!/usr/bin/env python3
import json
import requests
from datetime import datetime, timedelta

class UserSessionManager:
    def __init__(self):
        self.active_sessions = {}
        self.session_timeout = 30 * 60  # 30 minutes in seconds

    def create_session(self, user_id, user_data):
        \"\"\"Create a new user session\"\"\"
        session_id = f"sess_{user_id}_{datetime.now().timestamp()}"

        session_info = {
            'user_id': user_id,
            'user_data': user_data,
            'created_at': datetime.now(),
            'last_activity': datetime.now(),
            'expires_at': datetime.now() + timedelta(seconds=self.session_timeout)
        }

        self.active_sessions[session_id] = session_info
        return session_id

    def validate_session(self, session_id):
        \"\"\"Check if session is valid and not expired\"\"\"
        if session_id not in self.active_sessions:
            return False

        session = self.active_sessions[session_id]
        current_time = datetime.now()

        # Check if session has expired
        if current_time > session['expires_at']:
            del self.active_sessions[session_id]
            return False

        # Update last activity
        session['last_activity'] = current_time
        return True

    def cleanup_expired_sessions(self):
        \"\"\"Remove expired sessions from memory\"\"\"
        current_time = datetime.now()
        expired_sessions = []

        for session_id, session in self.active_sessions.items():
            if current_time > session['expires_at']:
                expired_sessions.append(session_id)

        for session_id in expired_sessions:
            del self.active_sessions[session_id]

        return len(expired_sessions)

class APIHandler:
    def __init__(self):
        self.session_manager = UserSessionManager()
        self.request_count = 0

    def authenticate_user(self, username, password):
        \"\"\"Authenticate user and create session\"\"\"
        # Simulate API call to auth service
        auth_response = self._call_auth_service(username, password)

        if auth_response.get('success'):
            user_data = auth_response.get('user_data', {})
            session_id = self.session_manager.create_session(
                user_data['id'], user_data
            )
            return {'success': True, 'session_id': session_id}

        return {'success': False, 'error': 'Authentication failed'}

    def process_request(self, session_id, request_data):
        \"\"\"Process an API request with session validation\"\"\"
        self.request_count += 1

        # Validate session before processing
        if not self.session_manager.validate_session(session_id):
            return {'error': 'Invalid or expired session', 'code': 401}

        # Simulate request processing
        try:
            result = self._process_business_logic(request_data)
            return {'success': True, 'data': result}
        except Exception as e:
            return {'error': str(e), 'code': 500}

    def _call_auth_service(self, username, password):
        \"\"\"Simulate external authentication service call\"\"\"
        # Simulate network delay and response
        import time
        time.sleep(0.1)

        # Mock successful authentication
        if username and password:
            return {
                'success': True,
                'user_data': {
                    'id': hash(username) % 10000,
                    'username': username,
                    'roles': ['user']
                }
            }
        return {'success': False}

    def _process_business_logic(self, request_data):
        \"\"\"Simulate business logic processing\"\"\"
        if not request_data:
            raise ValueError("Invalid request data")

        # Simulate some processing
        return {
            'processed_at': datetime.now().isoformat(),
            'request_id': self.request_count,
            'status': 'completed'
        }

# Global API handler instance
api_handler = APIHandler()

def handle_api_request(session_id, request_data):
    \"\"\"Main API request handler\"\"\"
    return api_handler.process_request(session_id, request_data)
"""
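            # Planted bug (inferred from the indicator checks later in this
            # test): active_sessions is a plain dict shared across requests
            # with no locking, and cleanup_expired_sessions() iterates over it
            # while validate_session() may delete entries for concurrent
            # requests, so under load iteration can intermittently fail with
            # "RuntimeError: dictionary changed size during iteration".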
            # Create test file with subtle bug
            test_file = self.create_additional_test_file("session_manager.py", buggy_code)
            self.logger.info(f" ✅ Created test file with subtle bug: {test_file}")

            # Create a realistic problem description with subtle symptoms
            error_description = """ISSUE DESCRIPTION:
Our API service is experiencing intermittent session validation failures in production.

SYMPTOMS OBSERVED:
- Users randomly get "Invalid or expired session" errors even with valid sessions
- The issue happens more frequently during high-traffic periods
- Sessions that should still be valid (created < 30 minutes ago) are being rejected
- The problem occurs on maybe 2-3% of requests but is hard to reproduce consistently
- Server logs show session validation failing but no clear pattern

ENVIRONMENT:
- Python 3.13 API service
- Running in production with multiple concurrent users
- Redis not used for session storage (in-memory only)
- Load balancer distributes requests across multiple instances

RECENT CHANGES:
- Increased session timeout from 15 to 30 minutes last week
- Added cleanup routine to remove expired sessions
- No major code changes to session management

USER IMPACT:
- Users have to re-authenticate randomly
- Affects user experience and causes complaints
- Seems to happen more on busy days

The code looks correct to me, but something is causing valid sessions to be treated as expired or invalid. I'm not sure what's causing this intermittent behavior."""
            error_file = self.create_additional_test_file("error_description.txt", error_description)
            self.logger.info(f" ✅ Created error description file: {error_file}")

            # Call debug tool with flash model and realistic problem description
            self.logger.info(" 🔍 Calling debug tool to investigate session validation issues...")
            response, continuation_id = self.call_mcp_tool(
                "debug",
                {
                    "prompt": "Investigate why our API is experiencing intermittent session validation failures in production",
                    "files": [test_file, error_file],
                    "findings": "Users getting 'Invalid or expired session' errors randomly, occurs more during high traffic, sessions should still be valid",
                    "error_context": "Sessions created < 30 minutes ago being rejected, happens ~2-3% of requests, load balanced environment",
                    "systematic_investigation": True,
                    "model": "flash",
                    "thinking_mode": "medium",
                },
            )
            if not response:
                self.logger.error("Failed to get debug response")
                return False

            self.logger.info(" ✅ Got debug response")

            # Parse response to validate bug identification
            try:
                response_data = json.loads(response)
                self.logger.debug(f"Response keys: {list(response_data.keys())}")

                # Extract the actual content if it's wrapped
                if "content" in response_data:
                    content = response_data["content"]
                    # Handle markdown JSON blocks
                    if content.startswith("```json"):
                        content = content[7:]
                    if content.endswith("```"):
                        content = content[:-3]
                    content = content.strip()
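                    # e.g. a wrapped payload like '```json\n{...}\n```' is
                    # reduced to the bare '{...}' by the slicing above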
                    # Parse the inner JSON
                    inner_data = json.loads(content)
                    self.logger.debug(f"Inner data keys: {list(inner_data.keys())}")
                else:
                    inner_data = response_data
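                # Hypothetical sketch of the structured shape the checks below
                # assume (field names taken from these assertions, not from a
                # published schema):
                #   {"status": "analysis_complete",
                #    "hypotheses": [{"confidence": "high", "reasoning": "..."}]}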
                # Check for structured debug analysis (should have analysis_complete status)
                if inner_data.get("status") == "analysis_complete":
                    self.logger.info(" ✅ Got structured debug analysis")

                    # Validate hypothesis generation
                    hypotheses = inner_data.get("hypotheses", [])
                    if not hypotheses:
                        self.logger.error("No hypotheses found in debug analysis")
                        return False

                    self.logger.info(f" 🧠 Found {len(hypotheses)} hypotheses")

                    # Check if the model identified the real bug: dictionary modification during iteration
                    analysis_text = json.dumps(inner_data).lower()

                    # Look for the actual bug - modifying dictionary while iterating
                    bug_indicators = [
                        "dictionary",
                        "iteration",
                        "modify",
                        "concurrent",
                        "runtime error",
                        "dictionary changed size during iteration",
                        "cleanup_expired_sessions",
                        "active_sessions",
                        "del",
                        "removing while iterating",
                    ]

                    found_indicators = [indicator for indicator in bug_indicators if indicator in analysis_text]

                    # Check for specific mentions of the problematic pattern
                    dictionary_bug_patterns = [
                        "modifying dictionary while iterating",
                        "dictionary changed size",
                        "concurrent modification",
                        "iterating over dictionary",
                        "del.*active_sessions",
                        "cleanup.*iteration",
                    ]
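                    # Unlike bug_indicators (plain substrings), these are regular
                    # expressions for re.search below; "del.*active_sessions"
                    # matches "del" followed later by "active_sessions".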
                    import re

                    pattern_matches = []
                    for pattern in dictionary_bug_patterns:
                        if re.search(pattern, analysis_text):
                            pattern_matches.append(pattern)

                    if len(found_indicators) >= 3 or len(pattern_matches) >= 1:
                        self.logger.info(" ✅ Flash identified the dictionary iteration bug")
                        self.logger.info(f" Found indicators: {found_indicators[:3]}")
                        if pattern_matches:
                            self.logger.info(f" Pattern matches: {pattern_matches}")
                    else:
                        self.logger.error(" ❌ Flash missed the dictionary iteration bug")
                        self.logger.error(f" Found only: {found_indicators}")
                        return False

                    # Validate hypothesis quality (should have confidence levels and reasoning)
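                    # Illustrative example of a hypothesis that would pass the
                    # checks below (not actual tool output):
                    #   {"confidence": "medium",
                    #    "reasoning": "validate_session() deletes entries while
                    #     cleanup_expired_sessions() iterates over the dict"}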
                    valid_hypotheses = 0
                    for i, hypothesis in enumerate(hypotheses[:3]):  # Check top 3
                        confidence = hypothesis.get("confidence", "").lower()
                        reasoning = hypothesis.get("reasoning", "")

                        if confidence in ["high", "medium", "low"] and len(reasoning) > 20:
                            valid_hypotheses += 1
                            self.logger.debug(f" Hypothesis {i+1}: {confidence} confidence, good reasoning")
                        else:
                            self.logger.debug(f" Hypothesis {i+1}: weak ({confidence}, {len(reasoning)} chars)")

                    if valid_hypotheses >= 2:
                        self.logger.info(f" ✅ Found {valid_hypotheses} well-structured hypotheses")
                    else:
                        self.logger.error(f" ❌ Only {valid_hypotheses} well-structured hypotheses")
                        return False

                    # Check for line-specific references
                    if "line" in analysis_text or "lines" in analysis_text:
                        self.logger.info(" 📍 Analysis includes line-specific references")
                    else:
                        self.logger.warning(" ⚠️ No line-specific references found")

                else:
                    # Non-structured response - check for dictionary iteration bug identification
                    self.logger.info(" 📝 Got general debug response")

                    response_text = response.lower()
                    # Check for the specific bug in general response
                    bug_indicators = [
                        "dictionary",
                        "iteration",
                        "modify",
                        "concurrent",
                        "active_sessions",
                        "cleanup",
                        "del ",
                        "removing",
                        "changed size",
                    ]

                    found_indicators = [indicator for indicator in bug_indicators if indicator in response_text]

                    if len(found_indicators) >= 3:
                        self.logger.info(f" ✅ Found {len(found_indicators)} relevant indicators in response")
                        self.logger.info(f" Found: {found_indicators}")
                    else:
                        self.logger.error(f" ❌ Only found {len(found_indicators)} relevant indicators")
                        self.logger.error(f" Found: {found_indicators}")
                        return False
            except json.JSONDecodeError as e:
                self.logger.error(f"Failed to parse debug response as JSON: {e}")
                # For non-JSON responses, check for dictionary iteration bug
                response_text = response.lower()

                bug_indicators = [
                    "dictionary",
                    "iteration",
                    "modify",
                    "concurrent",
                    "active_sessions",
                    "cleanup",
                    "del ",
                    "removing",
                ]

                found_indicators = [indicator for indicator in bug_indicators if indicator in response_text]

                if len(found_indicators) >= 3:
                    self.logger.info(f" ✅ Text response found {len(found_indicators)} relevant indicators")
                else:
                    self.logger.error(f" ❌ Text response only found {len(found_indicators)} relevant indicators")
                    return False

            # Validate logs
            self.logger.info(" 📋 Validating execution logs...")

            # Get server logs from the actual log file inside the container
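            # Equivalent manual check:
            #   docker exec <container_name> tail -500 /tmp/mcp_server.log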
            result = self.run_command(
                ["docker", "exec", self.container_name, "tail", "-500", "/tmp/mcp_server.log"], capture_output=True
            )

            if result.returncode == 0:
                logs = result.stdout.decode() + result.stderr.decode()

                # Look for debug tool execution patterns
                debug_patterns = [
                    "debug tool",
                    "[DEBUG]",
                    "systematic investigation",
                    "Token budget",
                    "Essential files for debugging",
                ]

                patterns_found = 0
                for pattern in debug_patterns:
                    if pattern in logs:
                        patterns_found += 1
                        self.logger.debug(f" ✅ Found log pattern: {pattern}")

                if patterns_found >= 3:
                    self.logger.info(f" ✅ Log validation passed ({patterns_found}/{len(debug_patterns)} patterns)")
                else:
                    self.logger.warning(f" ⚠️ Only found {patterns_found}/{len(debug_patterns)} log patterns")
            else:
                self.logger.warning(" ⚠️ Could not retrieve Docker logs")
            # Test continuation if available
            if continuation_id:
                self.logger.info(" 🔄 Testing debug continuation...")

                follow_up_response, _ = self.call_mcp_tool(
                    "debug",
                    {
                        "prompt": "Based on your analysis, which bug should we fix first and how?",
                        "continuation_id": continuation_id,
                        "model": "flash",
                    },
                )

                if follow_up_response:
                    self.logger.info(" ✅ Debug continuation worked")
                else:
                    self.logger.warning(" ⚠️ Debug continuation failed")

            self.logger.info(" ✅ Debug tool validation completed successfully")
            return True

        except Exception as e:
            self.logger.error(f"Debug validation test failed: {e}")
            return False
        finally:
            self.cleanup_test_files()