#!/usr/bin/env python3
"""
DebugWorkflow Tool Validation Test

Tests the debug tool's capabilities using the new workflow architecture.
This validates that the new workflow-based implementation maintains
all the functionality of the original debug tool.
"""

import json
from typing import Optional

from .conversation_base_test import ConversationBaseTest


class DebugValidationTest(ConversationBaseTest):
    """Test debug tool with new workflow architecture"""

    @property
    def test_name(self) -> str:
        return "debug_validation"

    @property
    def test_description(self) -> str:
        return "Debug tool validation with new workflow architecture"

    def run_test(self) -> bool:
        """Test debug tool capabilities"""
        # Set up the test environment
        self.setUp()

        try:
            self.logger.info("Test: DebugWorkflow tool validation (new architecture)")

            # Create a Python file with a subtle but realistic bug
            self._create_buggy_code()

            # Test 1: Single investigation session with multiple steps
            if not self._test_single_investigation_session():
                return False

            # Test 2: Investigation flow that requires refinement
            if not self._test_investigation_refine_flow():
                return False

            # Test 3: Complete investigation with expert analysis
            if not self._test_complete_investigation_with_analysis():
                return False

            # Test 4: Certain confidence behavior
            if not self._test_certain_confidence():
                return False

            # Test 5: Context-aware file embedding
            if not self._test_context_aware_file_embedding():
                return False

            # Test 6: Multi-step file context optimization
            if not self._test_multi_step_file_context():
                return False

            self.logger.info(" ✅ All debug validation tests passed")
            return True

        except Exception as e:
            self.logger.error(f"DebugWorkflow validation test failed: {e}")
            return False

    def _create_buggy_code(self):
        """Create test files with a subtle bug for debugging"""
        # Create a Python file with dictionary iteration bug
        buggy_code = """#!/usr/bin/env python3

import json
from datetime import datetime, timedelta

class SessionManager:
    def __init__(self):
        self.active_sessions = {}
        self.session_timeout = 30 * 60  # 30 minutes in seconds

    def create_session(self, user_id, user_data):
        \"\"\"Create a new user session\"\"\"
        session_id = f"sess_{user_id}_{datetime.now().timestamp()}"
        session_info = {
            'user_id': user_id,
            'user_data': user_data,
            'created_at': datetime.now(),
            'expires_at': datetime.now() + timedelta(seconds=self.session_timeout)
        }
        self.active_sessions[session_id] = session_info
        return session_id

    def validate_session(self, session_id):
        \"\"\"Check if session is valid and not expired\"\"\"
        if session_id not in self.active_sessions:
            return False

        session = self.active_sessions[session_id]
        current_time = datetime.now()

        # Check if session has expired
        if current_time > session['expires_at']:
            del self.active_sessions[session_id]
            return False

        return True

    def cleanup_expired_sessions(self):
        \"\"\"Remove expired sessions from memory\"\"\"
        current_time = datetime.now()
        expired_count = 0

        # BUG: Modifying dictionary while iterating over it
        for session_id, session in self.active_sessions.items():
            if current_time > session['expires_at']:
                del self.active_sessions[session_id]  # This causes RuntimeError
                expired_count += 1

        return expired_count
"""

        # Create test file with subtle bug
        self.buggy_file = self.create_additional_test_file("session_manager.py", buggy_code)
        self.logger.info(f" ✅ Created test file with subtle bug: {self.buggy_file}")

        # Create error description
        error_description = """ISSUE DESCRIPTION:
Our session management system is experiencing intermittent failures during cleanup operations.

SYMPTOMS:
- Random RuntimeError: dictionary changed size during iteration
- Occurs during high load when many sessions expire simultaneously
- Error happens in cleanup_expired_sessions method
- Affects about 5% of cleanup operations

ERROR LOG:
RuntimeError: dictionary changed size during iteration
  File "session_manager.py", line 44, in cleanup_expired_sessions
    for session_id, session in self.active_sessions.items():
"""

        self.error_file = self.create_additional_test_file("error_description.txt", error_description)
        self.logger.info(f" ✅ Created error description file: {self.error_file}")

    def _test_single_investigation_session(self) -> bool:
        """Test a complete investigation session with multiple steps"""
        try:
            self.logger.info(" 1.1: Testing single investigation session")

            # Step 1: Start investigation
            self.logger.info(" 1.1.1: Step 1 - Initial investigation")
            response1, continuation_id = self.call_mcp_tool(
                "debug",
                {
                    "step": "I need to investigate intermittent RuntimeError during session cleanup. Let me start by examining the error description and understanding the symptoms.",
                    "step_number": 1,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "RuntimeError occurs during dictionary iteration in cleanup_expired_sessions method. Error happens intermittently during high load.",
                    "files_checked": [self.error_file],
                    "relevant_files": [self.error_file],
                },
            )

            if not response1 or not continuation_id:
                self.logger.error("Failed to get initial investigation response")
                return False

            # Parse and validate JSON response
            response1_data = self._parse_debug_response(response1)
            if not response1_data:
                return False

            # Validate step 1 response structure - expect pause_for_investigation for next_step_required=True
            if not self._validate_step_response(response1_data, 1, 4, True, "pause_for_investigation"):
                return False

            self.logger.info(f" ✅ Step 1 successful, continuation_id: {continuation_id}")

            # Step 2: Examine the code
            self.logger.info(" 1.1.2: Step 2 - Code examination")
            response2, _ = self.call_mcp_tool(
                "debug",
                {
                    "step": "Now examining the session_manager.py file to understand the cleanup_expired_sessions implementation and identify the root cause.",
                    "step_number": 2,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Found the issue: cleanup_expired_sessions modifies self.active_sessions dictionary while iterating over it with .items(). This causes RuntimeError when del is called during iteration.",
                    "files_checked": [self.error_file, self.buggy_file],
                    "relevant_files": [self.buggy_file],
                    "relevant_context": ["SessionManager.cleanup_expired_sessions"],
                    "hypothesis": "Dictionary is being modified during iteration causing RuntimeError",
                    "confidence": "high",
                    "continuation_id": continuation_id,
                },
            )

            if not response2:
                self.logger.error("Failed to continue investigation to step 2")
                return False

            response2_data = self._parse_debug_response(response2)
            if not self._validate_step_response(response2_data, 2, 4, True, "pause_for_investigation"):
                return False

            # Check investigation status tracking
            investigation_status = response2_data.get("investigation_status", {})
            if investigation_status.get("files_checked", 0) < 2:
                self.logger.error("Files checked count not properly tracked")
                return False

            if investigation_status.get("relevant_context", 0) != 1:
                self.logger.error("Relevant context not properly tracked")
                return False

            if investigation_status.get("current_confidence") != "high":
                self.logger.error("Confidence level not properly tracked")
                return False

            self.logger.info(" ✅ Step 2 successful with proper tracking")

            # Store continuation_id for next test
            self.investigation_continuation_id = continuation_id

            return True

        except Exception as e:
            self.logger.error(f"Single investigation session test failed: {e}")
            return False

    def _test_investigation_refine_flow(self) -> bool:
        """Test investigation flow that requires refining the approach"""
        try:
            self.logger.info(" 1.2: Testing investigation refinement workflow")

            # Start a new investigation for testing refinement behaviour
            self.logger.info(" 1.2.1: Start investigation for refinement test")
            response1, continuation_id = self.call_mcp_tool(
                "debug",
                {
                    "step": "Investigating performance degradation in data processing pipeline",
                    "step_number": 1,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Initial analysis shows slow database queries",
                    "files_checked": ["/db/queries.py"],
                    "relevant_files": ["/db/queries.py"],
                },
            )

            if not response1 or not continuation_id:
                self.logger.error("Failed to start refinement test investigation")
                return False

            # Step 2: Wrong direction
            self.logger.info(" 1.2.2: Step 2 - Wrong investigation path")
            response2, _ = self.call_mcp_tool(
                "debug",
                {
                    "step": "Focusing on database optimization strategies",
                    "step_number": 2,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Database queries seem optimized, might be looking in wrong place",
                    "files_checked": ["/db/queries.py", "/db/indexes.py"],
                    "relevant_files": [],
                    "hypothesis": "Database performance issues",
                    "confidence": "low",
                    "continuation_id": continuation_id,
                },
            )

            if not response2:
                self.logger.error("Failed to continue to step 2")
                return False

            # Step 3: Backtrack from step 2
            self.logger.info(" 1.2.3: Step 3 - Refine investigation path")
            response3, _ = self.call_mcp_tool(
                "debug",
                {
                    "step": "Refocusing - the issue might not be database related. Let me investigate the data processing algorithm instead.",
                    "step_number": 3,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Found inefficient nested loops in data processor causing O(n²) complexity",
                    "files_checked": ["/processor/algorithm.py"],
                    "relevant_files": ["/processor/algorithm.py"],
                    "relevant_context": ["DataProcessor.process_batch"],
                    "hypothesis": "Inefficient algorithm causing performance issues",
                    "confidence": "medium",
                    "continuation_id": continuation_id,
                },
            )

            if not response3:
                self.logger.error("Failed to refine investigation")
                return False

            response3_data = self._parse_debug_response(response3)
            if not self._validate_step_response(response3_data, 3, 4, True, "pause_for_investigation"):
                return False

            self.logger.info(" ✅ Investigation refinement working correctly")
            return True

        except Exception as e:
            self.logger.error(f"Investigation refinement test failed: {e}")
            return False

    def _test_complete_investigation_with_analysis(self) -> bool:
        """Test complete investigation ending with expert analysis"""
        try:
            self.logger.info(" 1.3: Testing complete investigation with expert analysis")

            # Use the continuation from first test
            continuation_id = getattr(self, "investigation_continuation_id", None)
            if not continuation_id:
                # Start fresh if no continuation available
                self.logger.info(" 1.3.0: Starting fresh investigation")
                response0, continuation_id = self.call_mcp_tool(
                    "debug",
                    {
                        "step": "Investigating the dictionary iteration bug in session cleanup",
                        "step_number": 1,
                        "total_steps": 2,
                        "next_step_required": True,
                        "findings": "Found dictionary modification during iteration",
                        "files_checked": [self.buggy_file],
                        "relevant_files": [self.buggy_file],
                        "relevant_context": ["SessionManager.cleanup_expired_sessions"],
                    },
                )

                if not response0 or not continuation_id:
                    self.logger.error("Failed to start fresh investigation")
                    return False

            # Final step - trigger expert analysis
            self.logger.info(" 1.3.1: Final step - complete investigation")
            response_final, _ = self.call_mcp_tool(
                "debug",
                {
                    "step": "Investigation complete. The root cause is confirmed: cleanup_expired_sessions modifies the dictionary while iterating, causing RuntimeError.",
                    "step_number": 2,
                    "total_steps": 2,
                    "next_step_required": False,  # Final step - triggers expert analysis
                    "findings": "Root cause identified: del self.active_sessions[session_id] on line 46 modifies dictionary during iteration starting at line 44. Fix: collect expired IDs first, then delete.",
                    "files_checked": [self.buggy_file],
                    "relevant_files": [self.buggy_file],
                    "relevant_context": ["SessionManager.cleanup_expired_sessions"],
                    "hypothesis": "Dictionary modification during iteration causes RuntimeError in cleanup_expired_sessions",
                    "confidence": "high",
                    "continuation_id": continuation_id,
                    "model": "flash",  # Use flash for expert analysis
                },
            )

            if not response_final:
                self.logger.error("Failed to complete investigation")
                return False

            response_final_data = self._parse_debug_response(response_final)
            if not response_final_data:
                return False

            # Validate final response structure - expect calling_expert_analysis for next_step_required=False
            if response_final_data.get("status") != "calling_expert_analysis":
                self.logger.error(
                    f"Expected status 'calling_expert_analysis', got '{response_final_data.get('status')}'"
                )
                return False

            if not response_final_data.get("investigation_complete"):
                self.logger.error("Expected investigation_complete=true for final step")
                return False

            # Check for expert analysis
            if "expert_analysis" not in response_final_data:
                self.logger.error("Missing expert_analysis in final response")
                return False

            expert_analysis = response_final_data.get("expert_analysis", {})

            # Check for expected analysis content (checking common patterns)
            analysis_text = json.dumps(expert_analysis, ensure_ascii=False).lower()

            # Look for bug identification
            bug_indicators = ["dictionary", "iteration", "modify", "runtime", "error", "del"]
            found_indicators = sum(1 for indicator in bug_indicators if indicator in analysis_text)

            if found_indicators >= 3:
                self.logger.info(" ✅ Expert analysis identified the bug correctly")
            else:
                self.logger.warning(
                    f" ⚠️ Expert analysis may not have fully identified the bug (found {found_indicators}/6 indicators)"
                )

            # Check complete investigation summary
            if "complete_investigation" not in response_final_data:
                self.logger.error("Missing complete_investigation in final response")
                return False

            complete_investigation = response_final_data["complete_investigation"]
            if not complete_investigation.get("relevant_context"):
                self.logger.error("Missing relevant context in complete investigation")
                return False

            if "SessionManager.cleanup_expired_sessions" not in complete_investigation["relevant_context"]:
                self.logger.error("Expected method not found in investigation summary")
                return False

            self.logger.info(" ✅ Complete investigation with expert analysis successful")
            return True

        except Exception as e:
            self.logger.error(f"Complete investigation test failed: {e}")
            return False
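
    # Assumed response shape when confidence == "certain" (a sketch pieced together
    # from the assertions in _test_certain_confidence below; the expert model call
    # is skipped entirely):
    #
    #     {
    #         "status": "certain_confidence_proceed_with_fix",
    #         "skip_expert_analysis": true,
    #         "expert_analysis": {"status": "skipped_due_to_certain_confidence"},
    #     }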

    def _test_certain_confidence(self) -> bool:
        """Test certain confidence behavior - should skip expert analysis"""
        try:
            self.logger.info(" 1.4: Testing certain confidence behavior")

            # Test certain confidence - should skip expert analysis
            self.logger.info(" 1.4.1: Certain confidence investigation")
            response_certain, _ = self.call_mcp_tool(
                "debug",
                {
                    "step": "I have confirmed the exact root cause with 100% certainty: dictionary modification during iteration.",
                    "step_number": 1,
                    "total_steps": 1,
                    "next_step_required": False,  # Final step
                    "findings": "The bug is on lines 44-47: the for loop iterates over dict.items() while del modifies the dict inside the loop. Fix is simple: collect expired IDs first, then delete after iteration.",
                    "files_checked": [self.buggy_file],
                    "relevant_files": [self.buggy_file],
                    "relevant_context": ["SessionManager.cleanup_expired_sessions"],
                    "hypothesis": "Dictionary modification during iteration causes RuntimeError - fix is straightforward",
                    "confidence": "certain",  # This should skip expert analysis
                    "model": "flash",
                },
            )

            if not response_certain:
                self.logger.error("Failed to test certain confidence")
                return False

            response_certain_data = self._parse_debug_response(response_certain)
            if not response_certain_data:
                return False

            # Validate certain confidence response - should skip expert analysis
            if response_certain_data.get("status") != "certain_confidence_proceed_with_fix":
                self.logger.error(
                    f"Expected status 'certain_confidence_proceed_with_fix', got '{response_certain_data.get('status')}'"
                )
                return False

            if not response_certain_data.get("skip_expert_analysis"):
                self.logger.error("Expected skip_expert_analysis=true for certain confidence")
                return False

            expert_analysis = response_certain_data.get("expert_analysis", {})
            if expert_analysis.get("status") != "skipped_due_to_certain_confidence":
                self.logger.error("Expert analysis should be skipped for certain confidence")
                return False

            self.logger.info(" ✅ Certain confidence behavior working correctly")
            return True

        except Exception as e:
            self.logger.error(f"Certain confidence test failed: {e}")
            return False

    def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
        """Call an MCP tool in-process - override for debug-specific response handling"""
        # Use in-process implementation to maintain conversation memory
        response_text, _ = self.call_mcp_tool_direct(tool_name, params)
        if not response_text:
            return None, None

        # Extract continuation_id from debug response specifically
        continuation_id = self._extract_debug_continuation_id(response_text)
        return response_text, continuation_id

    def _extract_debug_continuation_id(self, response_text: str) -> Optional[str]:
        """Extract continuation_id from debug response"""
        try:
            # Parse the response
            response_data = json.loads(response_text)
            return response_data.get("continuation_id")
        except json.JSONDecodeError as e:
            self.logger.debug(f"Failed to parse response for debug continuation_id: {e}")
            return None

    def _parse_debug_response(self, response_text: str) -> dict:
        """Parse debug tool JSON response"""
        try:
            # Parse the response - it should be direct JSON
            return json.loads(response_text)
        except json.JSONDecodeError as e:
            self.logger.error(f"Failed to parse debug response as JSON: {e}")
            self.logger.error(f"Response text: {response_text[:500]}...")
            return {}
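
    # For orientation, intermediate step responses are assumed to carry at least
    # the following shape (an illustrative sketch assembled from the fields
    # _validate_step_response checks; not an authoritative schema):
    #
    #     {
    #         "status": "pause_for_investigation",
    #         "step_number": 1,
    #         "total_steps": 4,
    #         "next_step_required": true,
    #         "continuation_id": "<id>",
    #         "investigation_status": {"files_checked": 2, "relevant_context": 1},
    #         "next_steps": "Continue the investigation..."
    #     }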

    def _validate_step_response(
        self,
        response_data: dict,
        expected_step: int,
        expected_total: int,
        expected_next_required: bool,
        expected_status: str,
    ) -> bool:
        """Validate a debug investigation step response structure"""
        try:
            # Check status
            if response_data.get("status") != expected_status:
                self.logger.error(f"Expected status '{expected_status}', got '{response_data.get('status')}'")
                return False

            # Check step number
            if response_data.get("step_number") != expected_step:
                self.logger.error(f"Expected step_number {expected_step}, got {response_data.get('step_number')}")
                return False

            # Check total steps
            if response_data.get("total_steps") != expected_total:
                self.logger.error(f"Expected total_steps {expected_total}, got {response_data.get('total_steps')}")
                return False

            # Check next_step_required
            if response_data.get("next_step_required") != expected_next_required:
                self.logger.error(
                    f"Expected next_step_required {expected_next_required}, got {response_data.get('next_step_required')}"
                )
                return False

            # Check investigation_status exists
            if "investigation_status" not in response_data:
                self.logger.error("Missing investigation_status in response")
                return False

            # Check next_steps guidance
            if not response_data.get("next_steps"):
                self.logger.error("Missing next_steps guidance in response")
                return False

            return True

        except Exception as e:
            self.logger.error(f"Error validating step response: {e}")
            return False

    def _test_context_aware_file_embedding(self) -> bool:
        """Test context-aware file embedding optimization"""
        try:
            self.logger.info(" 1.5: Testing context-aware file embedding")

            # Create multiple test files for context testing
            file1_content = """#!/usr/bin/env python3

def process_data(data):
    \"\"\"Process incoming data\"\"\"
    result = []
    for item in data:
        if item.get('valid'):
            result.append(item['value'])
    return result
"""

            file2_content = """#!/usr/bin/env python3

def validate_input(data):
    \"\"\"Validate input data\"\"\"
    if not isinstance(data, list):
        raise ValueError("Data must be a list")

    for item in data:
        if not isinstance(item, dict):
            raise ValueError("Items must be dictionaries")
        if 'value' not in item:
            raise ValueError("Items must have 'value' key")

    return True
"""

            # Create test files
            file1 = self.create_additional_test_file("data_processor.py", file1_content)
            file2 = self.create_additional_test_file("validator.py", file2_content)

            # Test 1: New conversation, intermediate step - should only reference files
            self.logger.info(" 1.5.1: New conversation intermediate step (should reference only)")
            response1, continuation_id = self.call_mcp_tool(
                "debug",
                {
                    "step": "Starting investigation of data processing pipeline",
                    "step_number": 1,
                    "total_steps": 3,
                    "next_step_required": True,  # Intermediate step
                    "findings": "Initial analysis of data processing components",
                    "files_checked": [file1, file2],
                    "relevant_files": [file1],  # This should be referenced, not embedded
                    "relevant_context": ["process_data"],
                    "hypothesis": "Investigating data flow",
                    "confidence": "low",
                    "model": "flash",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error("Failed to start context-aware file embedding test")
                return False

            response1_data = self._parse_debug_response(response1)
            if not response1_data:
                return False

            # Check file context - should be reference_only for intermediate step
            file_context = response1_data.get("file_context", {})
            if file_context.get("type") != "reference_only":
                self.logger.error(f"Expected reference_only file context, got: {file_context.get('type')}")
                return False

            if "Files referenced but not embedded" not in file_context.get("context_optimization", ""):
                self.logger.error("Expected context optimization message for reference_only")
                return False

            self.logger.info(" ✅ Intermediate step correctly uses reference_only file context")

            # Test 2: Intermediate step with continuation - should still only reference
            self.logger.info(" 1.5.2: Intermediate step with continuation (should reference only)")
            response2, _ = self.call_mcp_tool(
                "debug",
                {
                    "step": "Continuing investigation with more detailed analysis",
                    "step_number": 2,
                    "total_steps": 3,
                    "next_step_required": True,  # Still intermediate
                    "continuation_id": continuation_id,
                    "findings": "Found potential issues in validation logic",
                    "files_checked": [file1, file2],
                    "relevant_files": [file1, file2],  # Both files referenced
                    "relevant_context": ["process_data", "validate_input"],
                    "hypothesis": "Validation might be too strict",
                    "confidence": "medium",
                    "model": "flash",
                },
            )

            if not response2:
                self.logger.error("Failed to continue to step 2")
                return False

            response2_data = self._parse_debug_response(response2)
            if not response2_data:
                return False

            # Check file context - should still be reference_only
            file_context2 = response2_data.get("file_context", {})
            if file_context2.get("type") != "reference_only":
                self.logger.error(f"Expected reference_only file context for step 2, got: {file_context2.get('type')}")
                return False

            # Should include reference note
            if not file_context2.get("note"):
                self.logger.error("Expected file reference note for intermediate step")
                return False

            reference_note = file_context2.get("note", "")
            if "data_processor.py" not in reference_note or "validator.py" not in reference_note:
                self.logger.error("File reference note should mention both files")
                return False

            self.logger.info(" ✅ Intermediate step with continuation correctly uses reference_only")

            # Test 3: Final step - should embed files for expert analysis
            self.logger.info(" 1.5.3: Final step (should embed files)")
            response3, _ = self.call_mcp_tool(
                "debug",
                {
                    "step": "Investigation complete - identified the root cause",
                    "step_number": 3,
                    "total_steps": 3,
                    "next_step_required": False,  # Final step - should embed files
                    "continuation_id": continuation_id,
                    "findings": "Root cause: validator is rejecting valid data due to strict type checking",
                    "files_checked": [file1, file2],
                    "relevant_files": [file1, file2],  # Should be fully embedded
                    "relevant_context": ["process_data", "validate_input"],
                    "hypothesis": "Validation logic is too restrictive for valid edge cases",
                    "confidence": "high",
                    "model": "flash",
                },
            )

            if not response3:
                self.logger.error("Failed to complete to final step")
                return False

            response3_data = self._parse_debug_response(response3)
            if not response3_data:
                return False

            # Check file context - should be fully_embedded for final step
            file_context3 = response3_data.get("file_context", {})
            if file_context3.get("type") != "fully_embedded":
                self.logger.error(
                    f"Expected fully_embedded file context for final step, got: {file_context3.get('type')}"
                )
                return False

            if "Full file content embedded for expert analysis" not in file_context3.get("context_optimization", ""):
                self.logger.error("Expected expert analysis optimization message for fully_embedded")
                return False

            # Should show files embedded count
            files_embedded = file_context3.get("files_embedded", 0)
            if files_embedded == 0:
                # This is OK - files might already be in conversation history
                self.logger.info(
                    " Files embedded count is 0 - files already in conversation history (smart deduplication)"
                )
            else:
                self.logger.info(f" ✅ Files embedded count: {files_embedded}")

            self.logger.info(" ✅ Final step correctly uses fully_embedded file context")

            # Verify expert analysis was called for final step
            if response3_data.get("status") != "calling_expert_analysis":
                self.logger.error("Final step should trigger expert analysis")
                return False

            if "expert_analysis" not in response3_data:
                self.logger.error("Expert analysis should be present in final step")
                return False

            self.logger.info(" ✅ Context-aware file embedding test completed successfully")
            return True

        except Exception as e:
            self.logger.error(f"Context-aware file embedding test failed: {e}")
            return False

    def _test_multi_step_file_context(self) -> bool:
        """Test multi-step workflow with proper file context transitions"""
        try:
            self.logger.info(" 1.6: Testing multi-step file context optimization")

            # Create a complex scenario with multiple files
            config_content = """#!/usr/bin/env python3

import os

DATABASE_URL = os.getenv('DATABASE_URL', 'sqlite:///app.db')
DEBUG_MODE = os.getenv('DEBUG', 'False').lower() == 'true'

MAX_CONNECTIONS = int(os.getenv('MAX_CONNECTIONS', '10'))

# Bug: This will cause issues when MAX_CONNECTIONS is not a valid integer
CACHE_SIZE = MAX_CONNECTIONS * 2  # Problematic if MAX_CONNECTIONS is invalid
"""

            server_content = """#!/usr/bin/env python3

from config import DATABASE_URL, DEBUG_MODE, CACHE_SIZE
import sqlite3

class DatabaseServer:
    def __init__(self):
        self.connection_pool = []
        self.cache_size = CACHE_SIZE  # This will fail if CACHE_SIZE is invalid

    def connect(self):
        try:
            conn = sqlite3.connect(DATABASE_URL)
            self.connection_pool.append(conn)
            return conn
        except Exception as e:
            print(f"Connection failed: {e}")
            return None
"""

            # Create test files
            config_file = self.create_additional_test_file("config.py", config_content)
            server_file = self.create_additional_test_file("database_server.py", server_content)
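
            # The remediation the final step should converge on - an illustrative
            # sketch only, not part of the code under test: validate the environment
            # variable instead of assuming it parses.
            #
            #     try:
            #         MAX_CONNECTIONS = int(os.getenv('MAX_CONNECTIONS', '10'))
            #     except ValueError:
            #         MAX_CONNECTIONS = 10  # fall back to a safe default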

            # Step 1: Start investigation (new conversation)
            self.logger.info(" 1.6.1: Step 1 - Start investigation")
            response1, continuation_id = self.call_mcp_tool(
                "debug",
                {
                    "step": "Investigating application startup failures in production environment",
                    "step_number": 1,
                    "total_steps": 4,
                    "next_step_required": True,
                    "findings": "Application fails to start with configuration errors",
                    "files_checked": [config_file],
                    "relevant_files": [config_file],
                    "relevant_context": [],
                    "hypothesis": "Configuration issue causing startup failure",
                    "confidence": "low",
                    "model": "flash",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error("Failed to start multi-step file context test")
                return False

            response1_data = self._parse_debug_response(response1)

            # Validate step 1 - should use reference_only
            file_context1 = response1_data.get("file_context", {})
            if file_context1.get("type") != "reference_only":
                self.logger.error("Step 1 should use reference_only file context")
                return False

            self.logger.info(" ✅ Step 1: reference_only file context")

            # Step 2: Expand investigation
            self.logger.info(" 1.6.2: Step 2 - Expand investigation")
            response2, _ = self.call_mcp_tool(
                "debug",
                {
                    "step": "Found configuration issue - investigating database server initialization",
                    "step_number": 2,
                    "total_steps": 4,
                    "next_step_required": True,
                    "continuation_id": continuation_id,
                    "findings": "MAX_CONNECTIONS environment variable contains invalid value, causing CACHE_SIZE calculation to fail",
                    "files_checked": [config_file, server_file],
                    "relevant_files": [config_file, server_file],
                    "relevant_context": ["DatabaseServer.__init__"],
                    "hypothesis": "Invalid environment variable causing integer conversion error",
                    "confidence": "medium",
                    "model": "flash",
                },
            )

            if not response2:
                self.logger.error("Failed to continue to step 2")
                return False

            response2_data = self._parse_debug_response(response2)

            # Validate step 2 - should still use reference_only
            file_context2 = response2_data.get("file_context", {})
            if file_context2.get("type") != "reference_only":
                self.logger.error("Step 2 should use reference_only file context")
                return False

            # Should reference both files
            reference_note = file_context2.get("note", "")
            if "config.py" not in reference_note or "database_server.py" not in reference_note:
                self.logger.error("Step 2 should reference both files in note")
                return False

            self.logger.info(" ✅ Step 2: reference_only file context with multiple files")

            # Step 3: Deep analysis
            self.logger.info(" 1.6.3: Step 3 - Deep analysis")
            response3, _ = self.call_mcp_tool(
                "debug",
                {
                    "step": "Analyzing the exact error propagation path and impact",
                    "step_number": 3,
                    "total_steps": 4,
                    "next_step_required": True,
                    "continuation_id": continuation_id,
                    "findings": "Error occurs in config.py line 8 when MAX_CONNECTIONS is not numeric, then propagates to DatabaseServer.__init__",
                    "files_checked": [config_file, server_file],
                    "relevant_files": [config_file, server_file],
                    "relevant_context": ["DatabaseServer.__init__"],
                    "hypothesis": "Need proper error handling and validation for environment variables",
                    "confidence": "high",
                    "model": "flash",
                },
            )

            if not response3:
                self.logger.error("Failed to continue to step 3")
                return False

            response3_data = self._parse_debug_response(response3)

            # Validate step 3 - should still use reference_only
            file_context3 = response3_data.get("file_context", {})
            if file_context3.get("type") != "reference_only":
                self.logger.error("Step 3 should use reference_only file context")
                return False

            self.logger.info(" ✅ Step 3: reference_only file context")

            # Step 4: Final analysis with expert consultation
            self.logger.info(" 1.6.4: Step 4 - Final step with expert analysis")
            response4, _ = self.call_mcp_tool(
                "debug",
                {
                    "step": "Investigation complete - root cause identified with solution",
                    "step_number": 4,
                    "total_steps": 4,
                    "next_step_required": False,  # Final step - should embed files
                    "continuation_id": continuation_id,
                    "findings": "Root cause: config.py assumes MAX_CONNECTIONS env var is always a valid integer. Fix: add try/except with default value and proper validation.",
                    "files_checked": [config_file, server_file],
                    "relevant_files": [config_file, server_file],
                    "relevant_context": ["DatabaseServer.__init__"],
                    "hypothesis": "Environment variable validation needed with proper error handling",
                    "confidence": "high",
                    "model": "flash",
                },
            )

            if not response4:
                self.logger.error("Failed to complete to final step")
                return False

            response4_data = self._parse_debug_response(response4)

            # Validate step 4 - should use fully_embedded for expert analysis
            file_context4 = response4_data.get("file_context", {})
            if file_context4.get("type") != "fully_embedded":
                self.logger.error("Step 4 (final) should use fully_embedded file context")
                return False

            if "expert analysis" not in file_context4.get("context_optimization", "").lower():
                self.logger.error("Final step should mention expert analysis in context optimization")
                return False

            # Verify expert analysis was triggered
            if response4_data.get("status") != "calling_expert_analysis":
                self.logger.error("Final step should trigger expert analysis")
                return False

            # Check that expert analysis has file context
            expert_analysis = response4_data.get("expert_analysis", {})
            if not expert_analysis:
                self.logger.error("Expert analysis should be present in final step")
                return False

            self.logger.info(" ✅ Step 4: fully_embedded file context with expert analysis")

            # Validate the complete workflow progression
            progression_summary = {
                "step_1": "reference_only (new conversation, intermediate)",
                "step_2": "reference_only (continuation, intermediate)",
                "step_3": "reference_only (continuation, intermediate)",
                "step_4": "fully_embedded (continuation, final)",
            }

            self.logger.info(" 📋 File context progression:")
            for step, context_type in progression_summary.items():
                self.logger.info(f" {step}: {context_type}")

            self.logger.info(" ✅ Multi-step file context optimization test completed successfully")
            return True

        except Exception as e:
            self.logger.error(f"Multi-step file context test failed: {e}")
            return False