my-pal-mcp-server/tests/test_debug_certain_confidence.py
"""
Integration tests for the debug tool's 'certain' confidence feature.
Tests the complete workflow where Claude identifies obvious bugs with absolute certainty
and can skip expensive expert analysis for minimal fixes.
"""
import json
from unittest.mock import patch
import pytest
from tools.debug import DebugIssueTool
class TestDebugCertainConfidence:
"""Integration tests for certain confidence optimization."""
def setup_method(self):
"""Set up test tool instance."""
self.tool = DebugIssueTool()
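        # setup_method runs before every test, so each test gets a fresh tool
        # instance and any state injected directly onto self.tool does not leak.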

    @pytest.mark.asyncio
    async def test_certain_confidence_skips_expert_analysis(self):
        """Test that certain confidence with valid minimal fix skips expert analysis."""
        # Simulate a multi-step investigation ending with certain confidence
        # Step 1: Initial investigation
        with patch("utils.conversation_memory.create_thread", return_value="debug-certain-uuid"):
            with patch("utils.conversation_memory.add_turn"):
                result1 = await self.tool.execute(
                    {
                        "step": "Investigating Python ImportError in user authentication module",
                        "step_number": 1,
                        "total_steps": 2,
                        "next_step_required": True,
                        "findings": "Users cannot log in, getting 'ModuleNotFoundError: No module named hashlib'",
                        "files_checked": ["/auth/user_auth.py"],
                        "relevant_files": ["/auth/user_auth.py"],
                        "hypothesis": "Missing import statement",
                        "confidence": "medium",
                        "continuation_id": None,
                    }
                )

        # Verify step 1 response
        response1 = json.loads(result1[0].text)
        assert response1["status"] == "investigation_in_progress"
        assert response1["step_number"] == 1
        continuation_id = response1["continuation_id"]
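        # The continuation_id returned by step 1 threads the same investigation
        # into step 2 below instead of starting a new conversation thread.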

        # Step 2: Final step with certain confidence (simple import fix)
        with patch("utils.conversation_memory.add_turn"):
            result2 = await self.tool.execute(
                {
                    "step": "Found the exact issue and fix",
                    "step_number": 2,
                    "total_steps": 2,
                    "next_step_required": False,  # Final step
                    "findings": "Missing 'import hashlib' statement at top of user_auth.py file, line 3. Simple one-line fix required.",
                    "files_checked": ["/auth/user_auth.py"],
                    "relevant_files": ["/auth/user_auth.py"],
                    "relevant_methods": ["UserAuth.hash_password"],
                    "hypothesis": "Missing import hashlib statement causes ModuleNotFoundError when hash_password method is called",
                    "confidence": "certain",  # Highest confidence level - should skip expert analysis
                    "continuation_id": continuation_id,
                }
            )

        # Verify final response skipped expert analysis
        response2 = json.loads(result2[0].text)

        # Should indicate certain confidence was used
        assert response2["status"] == "certain_confidence_proceed_with_fix"
        assert response2["investigation_complete"] is True
        assert response2["skip_expert_analysis"] is True

        # Expert analysis should be marked as skipped
        assert response2["expert_analysis"]["status"] == "skipped_due_to_certain_confidence"
        assert (
            response2["expert_analysis"]["reason"] == "Claude identified exact root cause with minimal fix requirement"
        )

        # Should have complete investigation summary
        assert "complete_investigation" in response2
        assert response2["complete_investigation"]["confidence_level"] == "certain"
        assert response2["complete_investigation"]["steps_taken"] == 2

        # Next steps should guide Claude to implement the fix directly
        assert "CERTAIN confidence" in response2["next_steps"]
        assert "minimal fix" in response2["next_steps"]
        assert "without requiring further consultation" in response2["next_steps"]

    @pytest.mark.asyncio
    async def test_certain_confidence_always_trusted(self):
        """Test that certain confidence is always trusted, even for complex issues."""
        # Set up investigation state
        self.tool.initial_issue = "Any kind of issue"
        self.tool.investigation_history = [
            {
                "step_number": 1,
                "step": "Initial investigation",
                "findings": "Some findings",
                "files_checked": [],
                "relevant_files": [],
                "relevant_methods": [],
                "hypothesis": None,
                "confidence": "low",
            }
        ]
        self.tool.consolidated_findings = {
            "files_checked": set(),
            "relevant_files": set(),
            "relevant_methods": set(),
            "findings": ["Step 1: Some findings"],
            "hypotheses": [],
            "images": [],
        }

        # Final step with certain confidence - should ALWAYS be trusted
        with patch("utils.conversation_memory.add_turn"):
            result = await self.tool.execute(
                {
                    "step": "Found the issue and fix",
                    "step_number": 2,
                    "total_steps": 2,
                    "next_step_required": False,  # Final step
                    "findings": "Complex or simple, doesn't matter - Claude says certain",
                    "files_checked": ["/any/file.py"],
                    "relevant_files": ["/any/file.py"],
                    "relevant_methods": ["any_method"],
                    "hypothesis": "Claude has decided this is certain - trust the judgment",
                    "confidence": "certain",  # Should always be trusted
                    "continuation_id": "debug-trust-uuid",
                }
            )

        # Verify certain is always trusted
        response = json.loads(result[0].text)

        # Should proceed with certain confidence
        assert response["status"] == "certain_confidence_proceed_with_fix"
        assert response["investigation_complete"] is True
        assert response["skip_expert_analysis"] is True

        # Expert analysis should be skipped
        assert response["expert_analysis"]["status"] == "skipped_due_to_certain_confidence"

        # Next steps should guide Claude to implement fix directly
        assert "CERTAIN confidence" in response["next_steps"]

    @pytest.mark.asyncio
    async def test_regular_high_confidence_still_uses_expert_analysis(self):
        """Test that regular 'high' confidence still triggers expert analysis."""
        # Set up investigation state
        self.tool.initial_issue = "Session validation issue"
        self.tool.investigation_history = [
            {
                "step_number": 1,
                "step": "Initial investigation",
                "findings": "Found session issue",
                "files_checked": [],
                "relevant_files": [],
                "relevant_methods": [],
                "hypothesis": None,
                "confidence": "low",
            }
        ]
        self.tool.consolidated_findings = {
            "files_checked": set(),
            "relevant_files": {"/api/sessions.py"},
            "relevant_methods": {"SessionManager.validate"},
            "findings": ["Step 1: Found session issue"],
            "hypotheses": [],
            "images": [],
        }

        # Mock expert analysis
        mock_expert_response = {
            "status": "analysis_complete",
            "summary": "Expert analysis of session validation",
            "hypotheses": [
                {
                    "name": "SESSION_VALIDATION_BUG",
                    "confidence": "High",
                    "root_cause": "Session timeout not properly handled",
                }
            ],
        }
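        # Patching _call_expert_analysis with the canned response above keeps the
        # test offline while still driving the normal expert-analysis code path.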

        # Final step with regular 'high' confidence (should trigger expert analysis)
        with patch("utils.conversation_memory.add_turn"):
            with patch.object(self.tool, "_call_expert_analysis", return_value=mock_expert_response):
                with patch.object(self.tool, "_prepare_file_content_for_prompt", return_value=("file content", 100)):
                    result = await self.tool.execute(
                        {
                            "step": "Identified likely root cause",
                            "step_number": 2,
                            "total_steps": 2,
                            "next_step_required": False,  # Final step
                            "findings": "Session validation fails when timeout occurs during user activity",
                            "files_checked": ["/api/sessions.py"],
                            "relevant_files": ["/api/sessions.py"],
                            "relevant_methods": ["SessionManager.validate", "SessionManager.cleanup"],
                            "hypothesis": "Session timeout handling bug causes validation failures",
                            "confidence": "high",  # Regular high confidence, NOT certain
                            "continuation_id": "debug-regular-uuid",
                        }
                    )

        # Verify expert analysis was called (not skipped)
        response = json.loads(result[0].text)

        # Should call expert analysis normally
        assert response["status"] == "calling_expert_analysis"
        assert response["investigation_complete"] is True
        assert "skip_expert_analysis" not in response  # Should not be present

        # Expert analysis should be present with real results
        assert response["expert_analysis"]["status"] == "analysis_complete"
        assert response["expert_analysis"]["summary"] == "Expert analysis of session validation"

        # Next steps should indicate normal investigation completion (not certain confidence)
        assert "INVESTIGATION IS COMPLETE" in response["next_steps"]
        assert "certain" not in response["next_steps"].lower()

    def test_certain_confidence_schema_requirements(self):
        """Test that certain confidence is properly described in the schema for Claude's guidance."""
        # The schema description should guide Claude on proper certain usage
        schema = self.tool.get_input_schema()
        confidence_description = schema["properties"]["confidence"]["description"]

        # Should emphasize it's only for when root cause and fix are confirmed
        assert "root cause" in confidence_description.lower()
        assert "minimal fix" in confidence_description.lower()
        assert "confirmed" in confidence_description.lower()

        # Should emphasize trust in Claude's judgment
        assert "absolutely" in confidence_description.lower() or "certain" in confidence_description.lower()

        # Should mention no thought-partner assistance needed
        assert "thought-partner" in confidence_description.lower() or "assistance" in confidence_description.lower()

    @pytest.mark.asyncio
    async def test_confidence_enum_validation(self):
        """Test that certain is properly included in confidence enum validation."""
        # Valid confidence values should not raise errors
        valid_confidences = ["low", "medium", "high", "certain"]
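        # "exploring" is also a valid enum value (see test_tool_schema_includes_certain),
        # but this loop only needs to confirm that "certain" validates alongside the others.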
        for confidence in valid_confidences:
            # This should not raise validation errors
            with patch("utils.conversation_memory.create_thread", return_value="test-uuid"):
                with patch("utils.conversation_memory.add_turn"):
                    result = await self.tool.execute(
                        {
                            "step": f"Test step with {confidence} confidence",
                            "step_number": 1,
                            "total_steps": 1,
                            "next_step_required": False,
                            "findings": "Test findings",
                            "confidence": confidence,
                        }
                    )

            # Should get valid response
            response = json.loads(result[0].text)
            assert "error" not in response or response.get("status") != "investigation_failed"

    def test_tool_schema_includes_certain(self):
        """Test that the tool schema properly includes certain in the confidence enum."""
        schema = self.tool.get_input_schema()
        confidence_property = schema["properties"]["confidence"]

        assert confidence_property["type"] == "string"
        assert "certain" in confidence_property["enum"]
        assert confidence_property["enum"] == ["exploring", "low", "medium", "high", "certain"]

        # Check that description explains certain usage
        description = confidence_property["description"]
        assert "certain" in description.lower()
        assert "root cause" in description.lower()
        assert "minimal fix" in description.lower()
        assert "thought-partner" in description.lower()

    @pytest.mark.asyncio
    async def test_certain_confidence_preserves_investigation_data(self):
        """Test that the certain confidence path preserves all investigation data properly."""
        # Multi-step investigation leading to certain
        with patch("utils.conversation_memory.create_thread", return_value="preserve-data-uuid"):
            with patch("utils.conversation_memory.add_turn"):
                # Step 1
                await self.tool.execute(
                    {
                        "step": "Initial investigation of login failure",
                        "step_number": 1,
                        "total_steps": 3,
                        "next_step_required": True,
                        "findings": "Users can't log in after password reset",
                        "files_checked": ["/auth/password.py"],
                        "relevant_files": ["/auth/password.py"],
                        "confidence": "low",
                    }
                )

                # Step 2
                await self.tool.execute(
                    {
                        "step": "Examining password validation logic",
                        "step_number": 2,
                        "total_steps": 3,
                        "next_step_required": True,
                        "findings": "Password hash function not imported correctly",
                        "files_checked": ["/auth/password.py", "/utils/crypto.py"],
                        "relevant_files": ["/auth/password.py"],
                        "relevant_methods": ["PasswordManager.validate_password"],
                        "hypothesis": "Import statement issue",
                        "confidence": "medium",
                        "continuation_id": "preserve-data-uuid",
                    }
                )

                # Step 3: Final with certain
                result = await self.tool.execute(
                    {
                        "step": "Found exact issue and fix",
                        "step_number": 3,
                        "total_steps": 3,
                        "next_step_required": False,
                        "findings": "Missing 'from utils.crypto import hash_password' at line 5",
                        "files_checked": ["/auth/password.py", "/utils/crypto.py"],
                        "relevant_files": ["/auth/password.py"],
                        "relevant_methods": ["PasswordManager.validate_password", "hash_password"],
                        "hypothesis": "Missing import statement for hash_password function",
                        "confidence": "certain",
                        "continuation_id": "preserve-data-uuid",
                    }
                )

        # Verify all investigation data is preserved
        response = json.loads(result[0].text)
        assert response["status"] == "certain_confidence_proceed_with_fix"

        investigation = response["complete_investigation"]
        assert investigation["steps_taken"] == 3
        assert len(investigation["files_examined"]) == 2  # Both files from all steps
        assert "/auth/password.py" in investigation["files_examined"]
        assert "/utils/crypto.py" in investigation["files_examined"]
        assert len(investigation["relevant_files"]) == 1
        assert len(investigation["relevant_methods"]) == 2
        assert investigation["confidence_level"] == "certain"

        # Should have complete investigation summary
        assert "SYSTEMATIC INVESTIGATION SUMMARY" in investigation["investigation_summary"]
        assert (
            "Steps taken: 3" in investigation["investigation_summary"]
            or "Total steps: 3" in investigation["investigation_summary"]
        )