Vastly improved debug tool and related instructions

Accompanying simulation test
Cleanup - A single source of truth for parameter descriptions
Fahad
2025-06-17 16:23:26 +04:00
parent 9bf2a2a51c
commit 044a8621a3
12 changed files with 829 additions and 238 deletions

simulator_tests/__init__.py

@@ -14,6 +14,7 @@ from .test_content_validation import ContentValidationTest
from .test_conversation_chain_validation import ConversationChainValidationTest
from .test_cross_tool_comprehensive import CrossToolComprehensiveTest
from .test_cross_tool_continuation import CrossToolContinuationTest
from .test_debug_validation import DebugValidationTest
from .test_line_number_validation import LineNumberValidationTest
from .test_logs_validation import LogsValidationTest
from .test_model_thinking_config import TestModelThinkingConfig
@@ -48,6 +49,7 @@ TEST_REGISTRY = {
"token_allocation_validation": TokenAllocationValidationTest,
"testgen_validation": TestGenValidationTest,
"refactor_validation": RefactorValidationTest,
"debug_validation": DebugValidationTest,
"conversation_chain_validation": ConversationChainValidationTest,
"vision_capability": VisionCapabilityTest,
"xai_models": XAIModelsTest,
@@ -76,6 +78,7 @@ __all__ = [
"TokenAllocationValidationTest",
"TestGenValidationTest",
"RefactorValidationTest",
"DebugValidationTest",
"ConversationChainValidationTest",
"VisionCapabilityTest",
"XAIModelsTest",

simulator_tests/test_debug_validation.py

@@ -0,0 +1,436 @@
#!/usr/bin/env python3
"""
Debug Tool Validation Test

Tests the debug tool with real bugs to validate:
- Proper execution with the flash model
- Actual bug identification and analysis
- Hypothesis generation for root causes
- Log validation for tool execution
"""

import json
import re

from .base_test import BaseSimulatorTest


class DebugValidationTest(BaseSimulatorTest):
    """Test debug tool with actual bug scenarios"""

    @property
    def test_name(self) -> str:
        return "debug_validation"

    @property
    def test_description(self) -> str:
        return "Debug tool validation with actual bugs"

    def run_test(self) -> bool:
        """Test debug tool with real bugs"""
        try:
            self.logger.info("Test: Debug tool validation")

            # Setup test files directory first
            self.setup_test_files()

            # Create a Python file with a subtle but realistic bug
            buggy_code = """#!/usr/bin/env python3
import json
import requests
from datetime import datetime, timedelta


class UserSessionManager:
    def __init__(self):
        self.active_sessions = {}
        self.session_timeout = 30 * 60  # 30 minutes in seconds

    def create_session(self, user_id, user_data):
        \"\"\"Create a new user session\"\"\"
        session_id = f"sess_{user_id}_{datetime.now().timestamp()}"
        session_info = {
            'user_id': user_id,
            'user_data': user_data,
            'created_at': datetime.now(),
            'last_activity': datetime.now(),
            'expires_at': datetime.now() + timedelta(seconds=self.session_timeout)
        }
        self.active_sessions[session_id] = session_info
        return session_id

    def validate_session(self, session_id):
        \"\"\"Check if session is valid and not expired\"\"\"
        if session_id not in self.active_sessions:
            return False

        session = self.active_sessions[session_id]
        current_time = datetime.now()

        # Check if session has expired
        if current_time > session['expires_at']:
            del self.active_sessions[session_id]
            return False

        # Update last activity
        session['last_activity'] = current_time
        return True

    def cleanup_expired_sessions(self):
        \"\"\"Remove expired sessions from memory\"\"\"
        current_time = datetime.now()
        removed = 0
        for session_id, session in self.active_sessions.items():
            if current_time > session['expires_at']:
                del self.active_sessions[session_id]
                removed += 1
        return removed


class APIHandler:
    def __init__(self):
        self.session_manager = UserSessionManager()
        self.request_count = 0

    def authenticate_user(self, username, password):
        \"\"\"Authenticate user and create session\"\"\"
        # Simulate API call to auth service
        auth_response = self._call_auth_service(username, password)

        if auth_response.get('success'):
            user_data = auth_response.get('user_data', {})
            session_id = self.session_manager.create_session(
                user_data['id'], user_data
            )
            return {'success': True, 'session_id': session_id}
        return {'success': False, 'error': 'Authentication failed'}

    def process_request(self, session_id, request_data):
        \"\"\"Process an API request with session validation\"\"\"
        self.request_count += 1

        # Validate session before processing
        if not self.session_manager.validate_session(session_id):
            return {'error': 'Invalid or expired session', 'code': 401}

        # Simulate request processing
        try:
            result = self._process_business_logic(request_data)
            return {'success': True, 'data': result}
        except Exception as e:
            return {'error': str(e), 'code': 500}

    def _call_auth_service(self, username, password):
        \"\"\"Simulate external authentication service call\"\"\"
        # Simulate network delay and response
        import time
        time.sleep(0.1)

        # Mock successful authentication
        if username and password:
            return {
                'success': True,
                'user_data': {
                    'id': hash(username) % 10000,
                    'username': username,
                    'roles': ['user']
                }
            }
        return {'success': False}

    def _process_business_logic(self, request_data):
        \"\"\"Simulate business logic processing\"\"\"
        if not request_data:
            raise ValueError("Invalid request data")

        # Simulate some processing
        return {
            'processed_at': datetime.now().isoformat(),
            'request_id': self.request_count,
            'status': 'completed'
        }


# Global API handler instance
api_handler = APIHandler()


def handle_api_request(session_id, request_data):
    \"\"\"Main API request handler\"\"\"
    return api_handler.process_request(session_id, request_data)
"""

            # Create test file with subtle bug
            test_file = self.create_additional_test_file("session_manager.py", buggy_code)
            self.logger.info(f"  ✅ Created test file with subtle bug: {test_file}")

            # Create a realistic problem description with subtle symptoms
            error_description = """ISSUE DESCRIPTION:
Our API service is experiencing intermittent session validation failures in production.

SYMPTOMS OBSERVED:
- Users randomly get "Invalid or expired session" errors even with valid sessions
- The issue happens more frequently during high-traffic periods
- Sessions that should still be valid (created < 30 minutes ago) are being rejected
- The problem occurs on roughly 2-3% of requests and is hard to reproduce consistently
- Server logs show session validation failing, but with no clear pattern

ENVIRONMENT:
- Python 3.13 API service
- Running in production with multiple concurrent users
- Redis is not used for session storage (in-memory only)
- A load balancer distributes requests across multiple instances

RECENT CHANGES:
- Increased session timeout from 15 to 30 minutes last week
- Added a cleanup routine to remove expired sessions
- No major code changes to session management

USER IMPACT:
- Users have to re-authenticate at random
- Hurts user experience and causes complaints
- Seems to happen more on busy days

The code looks correct to me, but something is causing valid sessions to be treated as expired or invalid. I'm not sure what's causing this intermittent behavior."""

            error_file = self.create_additional_test_file("error_description.txt", error_description)
            self.logger.info(f"  ✅ Created error description file: {error_file}")

            # Call the debug tool with the flash model and the realistic problem description
            self.logger.info("  🔍 Calling debug tool to investigate session validation issues...")
            response, continuation_id = self.call_mcp_tool(
                "debug",
                {
                    "prompt": "Investigate why our API is experiencing intermittent session validation failures in production",
                    "files": [test_file, error_file],
                    "findings": "Users getting 'Invalid or expired session' errors randomly, occurs more during high traffic, sessions should still be valid",
                    "error_context": "Sessions created < 30 minutes ago being rejected, happens ~2-3% of requests, load balanced environment",
                    "systematic_investigation": True,
                    "model": "flash",
                    "thinking_mode": "medium",
                },
            )
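            # call_mcp_tool returns the raw response text plus an optional
            # continuation_id; the continuation path is exercised near the end
            # of this test.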

            if not response:
                self.logger.error("Failed to get debug response")
                return False

            self.logger.info("  ✅ Got debug response")

            # Parse response to validate bug identification
            try:
                response_data = json.loads(response)
                self.logger.debug(f"Response keys: {list(response_data.keys())}")

                # Extract the actual content if it's wrapped
                if "content" in response_data:
                    content = response_data["content"]

                    # Handle markdown JSON blocks
                    if content.startswith("```json"):
                        content = content[7:]
                    if content.endswith("```"):
                        content = content[:-3]
                    content = content.strip()

                    # Parse the inner JSON
                    inner_data = json.loads(content)
                    self.logger.debug(f"Inner data keys: {list(inner_data.keys())}")
                else:
                    inner_data = response_data
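                # Illustrative wrapped shape this parser accepts (not a guaranteed schema):
                #     {"content": "```json\n{\"status\": \"analysis_complete\", ...}\n```"}
                # A bare top-level JSON object takes the else branch above.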

                # Check for structured debug analysis (should have analysis_complete status)
                if inner_data.get("status") == "analysis_complete":
                    self.logger.info("  ✅ Got structured debug analysis")

                    # Validate hypothesis generation
                    hypotheses = inner_data.get("hypotheses", [])
                    if not hypotheses:
                        self.logger.error("No hypotheses found in debug analysis")
                        return False

                    self.logger.info(f"  🧠 Found {len(hypotheses)} hypotheses")

                    # Check if the model identified the real bug:
                    # dictionary modification during iteration
                    analysis_text = json.dumps(inner_data).lower()

                    # Literal indicators of the bug - modifying the dict while iterating
                    bug_indicators = [
                        "dictionary",
                        "iteration",
                        "modify",
                        "concurrent",
                        "runtime error",
                        "dictionary changed size during iteration",
                        "cleanup_expired_sessions",
                        "active_sessions",
                        "del",
                        "removing while iterating",
                    ]
                    found_indicators = [indicator for indicator in bug_indicators if indicator in analysis_text]

                    # Regex patterns for specific mentions of the problematic pattern
                    dictionary_bug_patterns = [
                        "modifying dictionary while iterating",
                        "dictionary changed size",
                        "concurrent modification",
                        "iterating over dictionary",
                        r"del.*active_sessions",
                        r"cleanup.*iteration",
                    ]
                    pattern_matches = [pattern for pattern in dictionary_bug_patterns if re.search(pattern, analysis_text)]

                    if len(found_indicators) >= 3 or len(pattern_matches) >= 1:
                        self.logger.info("  ✅ Flash identified the dictionary iteration bug")
                        self.logger.info(f"  Found indicators: {found_indicators[:3]}")
                        if pattern_matches:
                            self.logger.info(f"  Pattern matches: {pattern_matches}")
                    else:
                        self.logger.error("  ❌ Flash missed the dictionary iteration bug")
                        self.logger.error(f"  Found only: {found_indicators}")
                        return False

                    # Validate hypothesis quality (each should have a confidence level and reasoning)
                    valid_hypotheses = 0
                    for i, hypothesis in enumerate(hypotheses[:3]):  # Check top 3
                        confidence = hypothesis.get("confidence", "").lower()
                        reasoning = hypothesis.get("reasoning", "")
                        if confidence in ["high", "medium", "low"] and len(reasoning) > 20:
                            valid_hypotheses += 1
                            self.logger.debug(f"  Hypothesis {i+1}: {confidence} confidence, good reasoning")
                        else:
                            self.logger.debug(f"  Hypothesis {i+1}: weak ({confidence}, {len(reasoning)} chars)")

                    if valid_hypotheses >= 2:
                        self.logger.info(f"  ✅ Found {valid_hypotheses} well-structured hypotheses")
                    else:
                        self.logger.error(f"  ❌ Only {valid_hypotheses} well-structured hypotheses")
                        return False
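                    # Illustrative hypothesis entry that passes the quality gate;
                    # keys other than "confidence" and "reasoning" are assumptions:
                    #     {"confidence": "high",
                    #      "hypothesis": "cleanup mutates active_sessions during iteration",
                    #      "reasoning": "del inside the items() loop raises RuntimeError..."}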

                    # Check for line-specific references
                    if "line" in analysis_text:
                        self.logger.info("  📍 Analysis includes line-specific references")
                    else:
                        self.logger.warning("  ⚠️ No line-specific references found")

                else:
                    # Non-structured response - check for dictionary iteration bug identification
                    self.logger.info("  📝 Got general debug response")
                    response_text = response.lower()

                    # Check for the specific bug in the general response
                    bug_indicators = [
                        "dictionary",
                        "iteration",
                        "modify",
                        "concurrent",
                        "active_sessions",
                        "cleanup",
                        "del ",
                        "removing",
                        "changed size",
                    ]
                    found_indicators = [indicator for indicator in bug_indicators if indicator in response_text]

                    if len(found_indicators) >= 3:
                        self.logger.info(f"  ✅ Found {len(found_indicators)} relevant indicators in response")
                        self.logger.info(f"  Found: {found_indicators}")
                    else:
                        self.logger.error(f"  ❌ Only found {len(found_indicators)} relevant indicators")
                        self.logger.error(f"  Found: {found_indicators}")
                        return False

            except json.JSONDecodeError as e:
                self.logger.error(f"Failed to parse debug response as JSON: {e}")
                # For non-JSON responses, scan the plain text for the same indicators
                response_text = response.lower()
                bug_indicators = [
                    "dictionary",
                    "iteration",
                    "modify",
                    "concurrent",
                    "active_sessions",
                    "cleanup",
                    "del ",
                    "removing",
                ]
                found_indicators = [indicator for indicator in bug_indicators if indicator in response_text]
                if len(found_indicators) >= 3:
                    self.logger.info(f"  ✅ Text response found {len(found_indicators)} relevant indicators")
                else:
                    self.logger.error(f"  ❌ Text response only found {len(found_indicators)} relevant indicators")
                    return False

            # Validate logs
            self.logger.info("  📋 Validating execution logs...")

            # Get server logs from the actual log file inside the container
            result = self.run_command(
                ["docker", "exec", self.container_name, "tail", "-500", "/tmp/mcp_server.log"],
                capture_output=True,
            )

            if result.returncode == 0:
                logs = result.stdout.decode() + result.stderr.decode()

                # Look for debug tool execution patterns
                debug_patterns = [
                    "debug tool",
                    "[DEBUG]",
                    "systematic investigation",
                    "Token budget",
                    "Essential files for debugging",
                ]

                patterns_found = 0
                for pattern in debug_patterns:
                    if pattern in logs:
                        patterns_found += 1
                        self.logger.debug(f"  ✅ Found log pattern: {pattern}")

                if patterns_found >= 3:
                    self.logger.info(f"  ✅ Log validation passed ({patterns_found}/{len(debug_patterns)} patterns)")
                else:
                    self.logger.warning(f"  ⚠️ Only found {patterns_found}/{len(debug_patterns)} log patterns")
            else:
                self.logger.warning("  ⚠️ Could not retrieve Docker logs")

            # Test continuation if available
            if continuation_id:
                self.logger.info("  🔄 Testing debug continuation...")
                follow_up_response, _ = self.call_mcp_tool(
                    "debug",
                    {
                        "prompt": "Based on your analysis, which bug should we fix first and how?",
                        "continuation_id": continuation_id,
                        "model": "flash",
                    },
                )
                if follow_up_response:
                    self.logger.info("  ✅ Debug continuation worked")
                else:
                    self.logger.warning("  ⚠️ Debug continuation failed")

            self.logger.info("  ✅ Debug tool validation completed successfully")
            return True

        except Exception as e:
            self.logger.error(f"Debug validation test failed: {e}")
            return False
        finally:
            self.cleanup_test_files()
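For reference, the remediation the debug tool is expected to converge on is to stop mutating `active_sessions` while iterating over it. A minimal sketch of the conventional fix, shown here for the reader rather than shipped in the fixture:

    # Safe variant of cleanup_expired_sessions: snapshot first, then delete
    def cleanup_expired_sessions(self):
        current_time = datetime.now()
        expired = [sid for sid, session in self.active_sessions.items()
                   if current_time > session['expires_at']]
        for sid in expired:
            del self.active_sessions[sid]
        return len(expired)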