364 lines
16 KiB
Python
364 lines
16 KiB
Python
"""
|
|
Integration tests for the debug tool's 'certain' confidence feature.
|
|
|
|
Tests the complete workflow where Claude identifies obvious bugs with absolute certainty
|
|
and can skip expensive expert analysis for minimal fixes.
|
|
"""
|
|
|
|
import json
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
|
|
from tools.debug import DebugIssueTool
|
|
|
|
|
|
class TestDebugCertainConfidence:
    """Integration tests for the 'certain' confidence optimization of the debug tool.

    Each test drives ``DebugIssueTool.execute`` with a request dictionary and inspects
    the JSON document carried by the first item of the returned result.
    """

    # Patch targets shared by the tests; kept in one place so the ``with`` statements stay flat.
    _CREATE_THREAD = "utils.conversation_memory.create_thread"
    _ADD_TURN = "utils.conversation_memory.add_turn"

    def setup_method(self):
        """Create a fresh tool instance before every test."""
        self.tool = DebugIssueTool()

    @staticmethod
    def _load_response(result):
        """Decode the JSON text of the first item in an ``execute`` result."""
        return json.loads(result[0].text)

    @pytest.mark.asyncio
    async def test_certain_confidence_skips_expert_analysis(self):
        """Test that certain confidence with valid minimal fix skips expert analysis."""
        # Simulate a multi-step investigation ending with certain confidence

        # Step 1: Initial investigation
        with patch(self._CREATE_THREAD, return_value="debug-certain-uuid"), patch(self._ADD_TURN):
            result1 = await self.tool.execute(
                {
                    "step": "Investigating Python ImportError in user authentication module",
                    "step_number": 1,
                    "total_steps": 2,
                    "next_step_required": True,
                    "findings": "Users cannot log in, getting 'ModuleNotFoundError: No module named hashlib'",
                    "files_checked": ["/auth/user_auth.py"],
                    "relevant_files": ["/auth/user_auth.py"],
                    "hypothesis": "Missing import statement",
                    "confidence": "medium",
                    "continuation_id": None,
                }
            )

        # Verify step 1 response
        response1 = self._load_response(result1)
        assert response1["status"] == "investigation_in_progress"
        assert response1["step_number"] == 1
        continuation_id = response1["continuation_id"]

        # Step 2: Final step with certain confidence (simple import fix)
        with patch(self._ADD_TURN):
            result2 = await self.tool.execute(
                {
                    "step": "Found the exact issue and fix",
                    "step_number": 2,
                    "total_steps": 2,
                    "next_step_required": False,  # Final step
                    "findings": "Missing 'import hashlib' statement at top of user_auth.py file, line 3. Simple one-line fix required.",
                    "files_checked": ["/auth/user_auth.py"],
                    "relevant_files": ["/auth/user_auth.py"],
                    "relevant_methods": ["UserAuth.hash_password"],
                    "hypothesis": "Missing import hashlib statement causes ModuleNotFoundError when hash_password method is called",
                    "confidence": "certain",  # 'certain' confidence - should skip expert analysis
                    "continuation_id": continuation_id,
                }
            )

        # Verify final response skipped expert analysis
        response2 = self._load_response(result2)

        # Should indicate certain confidence was used
        assert response2["status"] == "certain_confidence_proceed_with_fix"
        assert response2["investigation_complete"] is True
        assert response2["skip_expert_analysis"] is True

        # Expert analysis should be marked as skipped
        assert response2["expert_analysis"]["status"] == "skipped_due_to_certain_confidence"
        assert (
            response2["expert_analysis"]["reason"] == "Claude identified exact root cause with minimal fix requirement"
        )

        # Should have complete investigation summary
        assert "complete_investigation" in response2
        assert response2["complete_investigation"]["confidence_level"] == "certain"
        assert response2["complete_investigation"]["steps_taken"] == 2

        # Next steps should guide Claude to implement the fix directly
        assert "CERTAIN confidence" in response2["next_steps"]
        assert "minimal fix" in response2["next_steps"]
        assert "without requiring further consultation" in response2["next_steps"]

    @pytest.mark.asyncio
    async def test_certain_confidence_always_trusted(self):
        """Test that certain confidence is always trusted, even for complex issues."""

        # Set up investigation state directly instead of replaying step 1 through execute()
        self.tool.initial_issue = "Any kind of issue"
        self.tool.investigation_history = [
            {
                "step_number": 1,
                "step": "Initial investigation",
                "findings": "Some findings",
                "files_checked": [],
                "relevant_files": [],
                "relevant_methods": [],
                "hypothesis": None,
                "confidence": "low",
            }
        ]
        self.tool.consolidated_findings = {
            "files_checked": set(),
            "relevant_files": set(),
            "relevant_methods": set(),
            "findings": ["Step 1: Some findings"],
            "hypotheses": [],
            "images": [],
        }

        # Final step with certain confidence - should ALWAYS be trusted
        with patch(self._ADD_TURN):
            result = await self.tool.execute(
                {
                    "step": "Found the issue and fix",
                    "step_number": 2,
                    "total_steps": 2,
                    "next_step_required": False,  # Final step
                    "findings": "Complex or simple, doesn't matter - Claude says certain",
                    "files_checked": ["/any/file.py"],
                    "relevant_files": ["/any/file.py"],
                    "relevant_methods": ["any_method"],
                    "hypothesis": "Claude has decided this is certain - trust the judgment",
                    "confidence": "certain",  # Should always be trusted
                    "continuation_id": "debug-trust-uuid",
                }
            )

        # Verify certain is always trusted
        response = self._load_response(result)

        # Should proceed with certain confidence
        assert response["status"] == "certain_confidence_proceed_with_fix"
        assert response["investigation_complete"] is True
        assert response["skip_expert_analysis"] is True

        # Expert analysis should be skipped
        assert response["expert_analysis"]["status"] == "skipped_due_to_certain_confidence"

        # Next steps should guide Claude to implement fix directly
        assert "CERTAIN confidence" in response["next_steps"]

    @pytest.mark.asyncio
    async def test_regular_high_confidence_still_uses_expert_analysis(self):
        """Test that regular 'high' confidence still triggers expert analysis."""

        # Set up investigation state
        self.tool.initial_issue = "Session validation issue"
        self.tool.investigation_history = [
            {
                "step_number": 1,
                "step": "Initial investigation",
                "findings": "Found session issue",
                "files_checked": [],
                "relevant_files": [],
                "relevant_methods": [],
                "hypothesis": None,
                "confidence": "low",
            }
        ]
        self.tool.consolidated_findings = {
            "files_checked": set(),
            "relevant_files": {"/api/sessions.py"},
            "relevant_methods": {"SessionManager.validate"},
            "findings": ["Step 1: Found session issue"],
            "hypotheses": [],
            "images": [],
        }

        # Mock expert analysis
        mock_expert_response = {
            "status": "analysis_complete",
            "summary": "Expert analysis of session validation",
            "hypotheses": [
                {
                    "name": "SESSION_VALIDATION_BUG",
                    "confidence": "High",
                    "root_cause": "Session timeout not properly handled",
                }
            ],
        }

        # Build the patchers up front so all three can be entered by one flat ``with``.
        expert_patch = patch.object(self.tool, "_call_expert_analysis", return_value=mock_expert_response)
        file_content_patch = patch.object(
            self.tool, "_prepare_file_content_for_prompt", return_value=("file content", 100)
        )

        # Final step with regular 'high' confidence (should trigger expert analysis)
        with patch(self._ADD_TURN), expert_patch, file_content_patch:
            result = await self.tool.execute(
                {
                    "step": "Identified likely root cause",
                    "step_number": 2,
                    "total_steps": 2,
                    "next_step_required": False,  # Final step
                    "findings": "Session validation fails when timeout occurs during user activity",
                    "files_checked": ["/api/sessions.py"],
                    "relevant_files": ["/api/sessions.py"],
                    "relevant_methods": ["SessionManager.validate", "SessionManager.cleanup"],
                    "hypothesis": "Session timeout handling bug causes validation failures",
                    "confidence": "high",  # Regular high confidence, NOT certain
                    "continuation_id": "debug-regular-uuid",
                }
            )

        # Verify expert analysis was called (not skipped)
        response = self._load_response(result)

        # Should call expert analysis normally
        assert response["status"] == "calling_expert_analysis"
        assert response["investigation_complete"] is True
        assert "skip_expert_analysis" not in response  # Should not be present

        # Expert analysis should be present with real results
        assert response["expert_analysis"]["status"] == "analysis_complete"
        assert response["expert_analysis"]["summary"] == "Expert analysis of session validation"

        # Next steps should indicate normal investigation completion (not certain confidence)
        assert "INVESTIGATION IS COMPLETE" in response["next_steps"]
        assert "certain" not in response["next_steps"].lower()

    def test_certain_confidence_schema_requirements(self):
        """Test that certain confidence is properly described in schema for Claude's guidance."""

        # The schema description should guide Claude on proper certain usage
        schema = self.tool.get_input_schema()
        # Lower-case once; every check below is case-insensitive.
        confidence_description = schema["properties"]["confidence"]["description"].lower()

        # Should emphasize it's only when root cause and fix are confirmed
        assert "root cause" in confidence_description
        assert "minimal fix" in confidence_description
        assert "confirmed" in confidence_description

        # Should emphasize trust in Claude's judgment
        assert "absolutely" in confidence_description or "certain" in confidence_description

        # Should mention no thought-partner assistance needed
        assert "thought-partner" in confidence_description or "assistance" in confidence_description

    @pytest.mark.asyncio
    async def test_confidence_enum_validation(self):
        """Test that every confidence value of the schema enum, including certain, is accepted."""

        # Same values that test_tool_schema_includes_certain expects in the schema enum,
        # so a value advertised by the schema is also exercised through execute().
        valid_confidences = ["exploring", "low", "medium", "high", "certain"]

        for confidence in valid_confidences:
            # This should not raise validation errors
            with patch(self._CREATE_THREAD, return_value="test-uuid"), patch(self._ADD_TURN):
                result = await self.tool.execute(
                    {
                        "step": f"Test step with {confidence} confidence",
                        "step_number": 1,
                        "total_steps": 1,
                        "next_step_required": False,
                        "findings": "Test findings",
                        "confidence": confidence,
                    }
                )

            # Should get valid response; the message identifies which value failed
            response = self._load_response(result)
            assert (
                "error" not in response or response.get("status") != "investigation_failed"
            ), f"confidence={confidence!r} produced a failed investigation: {response}"

    def test_tool_schema_includes_certain(self):
        """Test that the tool schema properly includes certain in confidence enum."""
        schema = self.tool.get_input_schema()

        confidence_property = schema["properties"]["confidence"]
        assert confidence_property["type"] == "string"
        assert "certain" in confidence_property["enum"]
        assert confidence_property["enum"] == ["exploring", "low", "medium", "high", "certain"]

        # Check that description explains certain usage
        description = confidence_property["description"].lower()
        assert "certain" in description
        assert "root cause" in description
        assert "minimal fix" in description
        assert "thought-partner" in description

    @pytest.mark.asyncio
    async def test_certain_confidence_preserves_investigation_data(self):
        """Test that certain confidence path preserves all investigation data properly."""

        # Multi-step investigation leading to certain
        with patch(self._CREATE_THREAD, return_value="preserve-data-uuid"), patch(self._ADD_TURN):
            # Step 1
            await self.tool.execute(
                {
                    "step": "Initial investigation of login failure",
                    "step_number": 1,
                    "total_steps": 3,
                    "next_step_required": True,
                    "findings": "Users can't log in after password reset",
                    "files_checked": ["/auth/password.py"],
                    "relevant_files": ["/auth/password.py"],
                    "confidence": "low",
                }
            )

            # Step 2
            await self.tool.execute(
                {
                    "step": "Examining password validation logic",
                    "step_number": 2,
                    "total_steps": 3,
                    "next_step_required": True,
                    "findings": "Password hash function not imported correctly",
                    "files_checked": ["/auth/password.py", "/utils/crypto.py"],
                    "relevant_files": ["/auth/password.py"],
                    "relevant_methods": ["PasswordManager.validate_password"],
                    "hypothesis": "Import statement issue",
                    "confidence": "medium",
                    "continuation_id": "preserve-data-uuid",
                }
            )

            # Step 3: Final with certain
            result = await self.tool.execute(
                {
                    "step": "Found exact issue and fix",
                    "step_number": 3,
                    "total_steps": 3,
                    "next_step_required": False,
                    "findings": "Missing 'from utils.crypto import hash_password' at line 5",
                    "files_checked": ["/auth/password.py", "/utils/crypto.py"],
                    "relevant_files": ["/auth/password.py"],
                    "relevant_methods": ["PasswordManager.validate_password", "hash_password"],
                    "hypothesis": "Missing import statement for hash_password function",
                    "confidence": "certain",
                    "continuation_id": "preserve-data-uuid",
                }
            )

        # Verify all investigation data is preserved
        response = self._load_response(result)

        assert response["status"] == "certain_confidence_proceed_with_fix"

        investigation = response["complete_investigation"]
        assert investigation["steps_taken"] == 3
        assert len(investigation["files_examined"]) == 2  # Both files from all steps
        assert "/auth/password.py" in investigation["files_examined"]
        assert "/utils/crypto.py" in investigation["files_examined"]
        assert len(investigation["relevant_files"]) == 1
        assert len(investigation["relevant_methods"]) == 2
        assert investigation["confidence_level"] == "certain"

        # Should have complete investigation summary
        summary = investigation["investigation_summary"]
        assert "SYSTEMATIC INVESTIGATION SUMMARY" in summary
        assert "Steps taken: 3" in summary or "Total steps: 3" in summary
|