certain confidence - no need to waste tokens on another assistant when it's a no-brainer fix for Claude

This commit is contained in:
Fahad
2025-06-19 17:05:43 +04:00
parent 91acc0bd26
commit 79abb9ca7e
12 changed files with 998 additions and 61 deletions

View File

@@ -53,8 +53,8 @@ jobs:
pip install -r requirements-dev.txt
- name: Run black formatter check
run: black --check .
run: black --check . --exclude="test_simulation_files/"
- name: Run ruff linter
run: ruff check .
run: ruff check . --exclude test_simulation_files

View File

@@ -370,7 +370,10 @@ Nice!
**[📖 Read More](docs/tools/precommit.md)** - Multi-repository validation and change analysis
### 7. `debug` - Expert Debugging Assistant
Systematic investigation-guided debugging that walks Claude through step-by-step root cause analysis. Claude performs methodical code examination, evidence collection, and hypothesis formation before receiving expert analysis from the selected AI model.
Systematic investigation-guided debugging that walks Claude through step-by-step root cause analysis. Claude performs
methodical code examination, evidence collection, and hypothesis formation before receiving expert analysis from the
selected AI model. When Claude's confidence reaches **100% certainty** during the investigative workflow, expert analysis
via another model is skipped to save on tokens and cost, and Claude proceeds directly to fixing the issue.
```
See logs under /Users/me/project/diagnostics.log and related code under the sync folder. Logs show that sync

View File

@@ -67,16 +67,16 @@ echo "📋 Step 1: Running Linting and Formatting Checks"
echo "--------------------------------------------------"
echo "🔧 Running ruff linting with auto-fix..."
$RUFF check --fix
$RUFF check --fix --exclude test_simulation_files
echo "🎨 Running black code formatting..."
$BLACK .
$BLACK . --exclude="test_simulation_files/"
echo "📦 Running import sorting with isort..."
$ISORT . --skip-glob=".zen_venv/*"
$ISORT . --skip-glob=".zen_venv/*" --skip-glob="test_simulation_files/*"
echo "✅ Verifying all linting passes..."
$RUFF check
$RUFF check --exclude test_simulation_files
echo "✅ Step 1 Complete: All linting and formatting checks passed!"
echo ""

View File

@@ -14,7 +14,7 @@ import os
# These values are used in server responses and for tracking releases
# IMPORTANT: This is the single source of truth for version and author info
# Semantic versioning: MAJOR.MINOR.PATCH
__version__ = "5.2.1"
__version__ = "5.2.2"
# Last update date in ISO format
__updated__ = "2025-06-19"
# Primary maintainer

View File

@@ -27,7 +27,8 @@ The debug tool implements a **systematic investigation methodology** where Claud
5. **Completion**: Once investigation is thorough, Claude signals completion
**Expert Analysis Phase:**
After Claude completes the investigation, the tool automatically calls the selected AI model with:
After Claude completes the investigation, it automatically calls the selected AI model (unless confidence is **certain**,
in which case expert analysis is bypassed) with:
- Complete investigation summary with all steps and findings
- Relevant files and methods identified during investigation
- Final hypothesis and confidence assessment
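
For reference, when confidence is **certain** the tool returns a completion payload instead of calling the expert model. A sketch of its shape, reconstructed from this commit's integration tests (field values are illustrative):

```python
# Sketch of the payload returned when confidence == "certain"
# (shape taken from this commit's tests; values are illustrative).
certain_response = {
    "status": "certain_confidence_proceed_with_fix",
    "investigation_complete": True,
    "skip_expert_analysis": True,
    "expert_analysis": {
        "status": "skipped_due_to_certain_confidence",
        "reason": "Claude identified exact root cause with minimal fix requirement",
    },
    "complete_investigation": {
        "confidence_level": "certain",
        "steps_taken": 2,  # illustrative
        "final_hypothesis": "Missing 'import hashlib' statement",  # illustrative
    },
    "next_steps": "Investigation complete with CERTAIN confidence. ...",
}
```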

View File

@@ -122,7 +122,7 @@ try:
file_handler = RotatingFileHandler(
log_dir / "mcp_server.log",
maxBytes=20 * 1024 * 1024, # 20MB max file size
backupCount=10, # Keep 10 rotated files (200MB total)
backupCount=5, # Keep 5 rotated files (100MB total)
encoding="utf-8",
)
file_handler.setLevel(getattr(logging, log_level, logging.INFO))
@@ -133,8 +133,8 @@ try:
mcp_logger = logging.getLogger("mcp_activity")
mcp_file_handler = RotatingFileHandler(
log_dir / "mcp_activity.log",
maxBytes=20 * 1024 * 1024, # 20MB max file size
backupCount=5, # Keep 5 rotated files (100MB total)
maxBytes=10 * 1024 * 1024, # 10MB max file size
backupCount=2, # Keep 2 rotated files (20MB total)
encoding="utf-8",
)
mcp_file_handler.setLevel(logging.INFO)
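
As a sanity check on the sizes noted in the comments, a `RotatingFileHandler`'s worst-case footprint is `maxBytes * (backupCount + 1)`: the active file plus each rotated backup. A minimal sketch using the values from this change:

```python
def rotation_budget(max_bytes: int, backup_count: int) -> int:
    """Worst-case disk usage: the active log plus backup_count rotated files."""
    return max_bytes * (backup_count + 1)

# mcp_server.log: 20MB active + 5 backups -> 120MB worst case (100MB of backups)
assert rotation_budget(20 * 1024 * 1024, 5) == 120 * 1024 * 1024
# mcp_activity.log: 10MB active + 2 backups -> 30MB worst case (20MB of backups)
assert rotation_budget(10 * 1024 * 1024, 2) == 30 * 1024 * 1024
```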

View File

@@ -14,6 +14,7 @@ from .test_content_validation import ContentValidationTest
from .test_conversation_chain_validation import ConversationChainValidationTest
from .test_cross_tool_comprehensive import CrossToolComprehensiveTest
from .test_cross_tool_continuation import CrossToolContinuationTest
from .test_debug_certain_confidence import DebugCertainConfidenceTest
from .test_debug_validation import DebugValidationTest
from .test_line_number_validation import LineNumberValidationTest
from .test_logs_validation import LogsValidationTest
@@ -55,6 +56,7 @@ TEST_REGISTRY = {
"testgen_validation": TestGenValidationTest,
"refactor_validation": RefactorValidationTest,
"debug_validation": DebugValidationTest,
"debug_certain_confidence": DebugCertainConfidenceTest,
"conversation_chain_validation": ConversationChainValidationTest,
"vision_capability": VisionCapabilityTest,
"xai_models": XAIModelsTest,
@@ -86,6 +88,7 @@ __all__ = [
"TestGenValidationTest",
"RefactorValidationTest",
"DebugValidationTest",
"DebugCertainConfidenceTest",
"ConversationChainValidationTest",
"VisionCapabilityTest",
"XAIModelsTest",

View File

@@ -0,0 +1,560 @@
#!/usr/bin/env python3
"""
Debug Tool Certain Confidence Simulator Test
Tests the debug tool's 'certain' confidence feature in a realistic simulation:
- Multi-step investigation leading to certain confidence
- Validation that expert analysis is skipped for obvious bugs
- Verification that certain confidence is always trusted
- Ensures token optimization works correctly for minimal fixes
"""
import json
from typing import Optional
from .conversation_base_test import ConversationBaseTest
class DebugCertainConfidenceTest(ConversationBaseTest):
"""Test debug tool's certain confidence optimization feature"""
@property
def test_name(self) -> str:
return "debug_certain_confidence"
@property
def test_description(self) -> str:
return "Debug tool certain confidence optimization validation"
def run_test(self) -> bool:
"""Test debug tool certain confidence capabilities"""
# Set up the test environment
self.setUp()
try:
self.logger.info("Test: Debug tool certain confidence validation")
# Create test files with obvious bugs for certain scenarios
self._create_obvious_bug_scenarios()
# Test 1: Obvious import error with certain confidence
if not self._test_obvious_import_error_certain():
return False
# Test 2: Certain confidence is always trusted
if not self._test_certain_always_trusted():
return False
# Test 3: Regular high confidence still triggers expert analysis
if not self._test_regular_high_confidence_expert_analysis():
return False
# Test 4: Multi-step investigation ending in certain
if not self._test_multi_step_investigation_certain():
return False
self.logger.info(" ✅ All debug certain confidence tests passed")
return True
except Exception as e:
self.logger.error(f"Debug certain confidence test failed: {e}")
return False
def _create_obvious_bug_scenarios(self):
"""Create test files with obvious bugs perfect for certain confidence"""
# Scenario 1: Missing import statement (very obvious)
missing_import_code = """#!/usr/bin/env python3
import os
import sys
# import hashlib # <-- Missing import!
class UserAuth:
def __init__(self, secret_key):
self.secret_key = secret_key
def hash_password(self, password):
# This will fail with NameError: name 'hashlib' is not defined
salt = os.urandom(32)
return hashlib.pbkdf2_hmac('sha256', password.encode(), salt, 100000)
def verify_password(self, password, stored_hash):
# This function also uses hashlib
return hashlib.pbkdf2_hmac('sha256', password.encode(), stored_hash[:32], 100000) == stored_hash[32:]
"""
# Scenario 2: Typo in method name (obvious once spotted)
typo_bug_code = """#!/usr/bin/env python3
class Calculator:
def __init__(self):
self.history = []
def add_numbers(self, a, b):
result = a + b
self.history.append(f"{a} + {b} = {result}")
return result
def calculate_total(self, numbers):
total = 0
for num in numbers:
# Typo: should be add_numbers, not add_number
total = self.add_number(total, num) # AttributeError: 'Calculator' object has no attribute 'add_number'
return total
"""
# Scenario 3: Indentation error (Python syntax error)
indentation_error_code = """#!/usr/bin/env python3
def process_data(data_list):
results = []
for item in data_list:
if item > 0:
processed = item * 2
results.append(processed) # IndentationError: unindent does not match any outer indentation level
return results
def main():
data = [1, 2, 3, 4, 5]
print(process_data(data))
"""
# Create test files
self.missing_import_file = self.create_additional_test_file("user_auth.py", missing_import_code)
self.typo_bug_file = self.create_additional_test_file("calculator.py", typo_bug_code)
self.indentation_file = self.create_additional_test_file("data_processor.py", indentation_error_code)
self.logger.info(" ✅ Created obvious bug scenarios:")
self.logger.info(f" - Missing import: {self.missing_import_file}")
self.logger.info(f" - Method typo: {self.typo_bug_file}")
self.logger.info(f" - Indentation error: {self.indentation_file}")
# Create error logs for context
import_error_log = """ERROR: User authentication failing during login
Traceback (most recent call last):
File "user_auth.py", line 12, in hash_password
return hashlib.pbkdf2_hmac('sha256', password.encode(), salt, 100000)
NameError: name 'hashlib' is not defined
This happens every time a user tries to log in. The error occurs in the password hashing function.
"""
self.error_log_file = self.create_additional_test_file("error.log", import_error_log)
self.logger.info(f" - Error log: {self.error_log_file}")
def _test_obvious_import_error_certain(self) -> bool:
"""Test certain confidence with obvious missing import error"""
try:
self.logger.info(" 1.1: Testing obvious import error with certain confidence")
# Step 1: Initial investigation
self.logger.info(" 1.1.1: Step 1 - Initial problem description")
response1, continuation_id = self.call_mcp_tool_direct(
"debug",
{
"step": "Investigating NameError in user authentication - users cannot log in due to 'name hashlib is not defined' error.",
"step_number": 1,
"total_steps": 2,
"next_step_required": True,
"findings": "NameError occurs in hash_password method when trying to use hashlib.pbkdf2_hmac. Error happens on every login attempt.",
"files_checked": [self.error_log_file],
"relevant_files": [self.error_log_file],
"hypothesis": "Missing import statement for hashlib module",
"confidence": "medium",
},
)
if not response1 or not continuation_id:
self.logger.error("Failed to get initial investigation response")
return False
response1_data = self._parse_debug_response(response1)
if not self._validate_investigation_response(response1_data, 1, True, "investigation_in_progress"):
return False
self.logger.info(f" ✅ Step 1 successful, continuation_id: {continuation_id}")
# Step 2: Examine code and identify obvious fix - use certain confidence
self.logger.info(" 1.1.2: Step 2 - Found exact issue and simple fix (certain)")
response2, _ = self.call_mcp_tool_direct(
"debug",
{
"step": "Found the exact issue and the minimal fix required",
"step_number": 2,
"total_steps": 2,
"next_step_required": False, # Final step
"findings": "Missing 'import hashlib' statement at the top of user_auth.py file. The error occurs because hashlib is used in hash_password() method on line 12 but never imported. Simple one-line fix: add 'import hashlib' after line 2.",
"files_checked": [self.error_log_file, self.missing_import_file],
"relevant_files": [self.missing_import_file],
"relevant_methods": ["UserAuth.hash_password", "UserAuth.verify_password"],
"hypothesis": "Missing 'import hashlib' statement causes NameError when hash_password method executes",
"confidence": "certain", # Use certain - should skip expert analysis
"continuation_id": continuation_id,
"model": "flash", # Specify model for consistency
},
)
if not response2:
self.logger.error("Failed to complete investigation with certain confidence")
return False
response2_data = self._parse_debug_response(response2)
if not response2_data:
return False
# Validate certain response structure
expected_status = "certain_confidence_proceed_with_fix"
if response2_data.get("status") != expected_status:
self.logger.error(f"Expected status '{expected_status}', got '{response2_data.get('status')}'")
return False
if not response2_data.get("investigation_complete"):
self.logger.error("Expected investigation_complete=true for certain confidence")
return False
if not response2_data.get("skip_expert_analysis"):
self.logger.error("Expected skip_expert_analysis=true for certain confidence")
return False
# Verify expert analysis is marked as skipped
expert_analysis = response2_data.get("expert_analysis", {})
if expert_analysis.get("status") != "skipped_due_to_certain_confidence":
self.logger.error("Expert analysis should be marked as skipped for certain confidence")
return False
# Check for proper investigation summary
complete_investigation = response2_data.get("complete_investigation", {})
if complete_investigation.get("confidence_level") != "certain":
self.logger.error("Expected confidence_level='certain' in complete investigation")
return False
if complete_investigation.get("steps_taken") != 2:
self.logger.error("Expected steps_taken=2 in complete investigation")
return False
# Verify next steps guidance
next_steps = response2_data.get("next_steps", "")
if "CERTAIN confidence" not in next_steps:
self.logger.error("Expected 'CERTAIN confidence' in next_steps guidance")
return False
if "minimal fix" not in next_steps:
self.logger.error("Expected 'minimal fix' guidance in next_steps")
return False
self.logger.info(" ✅ Certain confidence skipped expert analysis correctly")
return True
except Exception as e:
self.logger.error(f"Obvious import error certain test failed: {e}")
return False
def _test_certain_always_trusted(self) -> bool:
"""Test that certain confidence is always trusted regardless of complexity"""
try:
self.logger.info(" 1.2: Testing that certain confidence is always trusted")
# Single step investigation with certain - should always be trusted
self.logger.info(" 1.2.1: Direct certain confidence (always trusted)")
response, _ = self.call_mcp_tool_direct(
"debug",
{
"step": "Found the exact root cause and minimal fix for this complex issue",
"step_number": 1,
"total_steps": 1,
"next_step_required": False, # Final step
"findings": "After thorough investigation, identified that the issue is caused by method name typo in Calculator.calculate_total() - calls self.add_number() instead of self.add_numbers(). Simple fix: change line 14 from 'add_number' to 'add_numbers'.",
"files_checked": [self.typo_bug_file],
"relevant_files": [self.typo_bug_file],
"relevant_methods": ["Calculator.calculate_total", "Calculator.add_numbers"],
"hypothesis": "Method name typo in calculate_total() calls non-existent add_number() instead of add_numbers()",
"confidence": "certain", # Should always be trusted
"model": "flash",
},
)
if not response:
self.logger.error("Failed to get certain confidence response")
return False
response_data = self._parse_debug_response(response)
if not response_data:
return False
# Verify certain is trusted regardless of complexity
if response_data.get("status") != "certain_confidence_proceed_with_fix":
self.logger.error("Certain confidence should always be trusted")
return False
if not response_data.get("skip_expert_analysis"):
self.logger.error("Expert analysis should be skipped for certain confidence")
return False
# Ensure expert analysis is marked as skipped
expert_analysis = response_data.get("expert_analysis", {})
if expert_analysis.get("status") != "skipped_due_to_certain_confidence":
self.logger.error("Expert analysis status should indicate certain skip")
return False
self.logger.info(" ✅ Certain confidence always trusted correctly")
return True
except Exception as e:
self.logger.error(f"Certain always trusted test failed: {e}")
return False
def _test_regular_high_confidence_expert_analysis(self) -> bool:
"""Test that regular 'high' confidence still triggers expert analysis"""
try:
self.logger.info(" 1.3: Testing that regular 'high' confidence triggers expert analysis")
# Investigation with regular high confidence (not certain)
self.logger.info(" 1.3.1: High confidence (not certain) - should trigger expert analysis")
response, _ = self.call_mcp_tool_direct(
"debug",
{
"step": "Identified likely root cause with strong evidence",
"step_number": 1,
"total_steps": 1,
"next_step_required": False, # Final step
"findings": "IndentationError in data_processor.py line 8 - results.append(processed) is incorrectly indented. Should align with the 'if' statement above it.",
"files_checked": [self.indentation_file],
"relevant_files": [self.indentation_file],
"relevant_methods": ["process_data"],
"hypothesis": "Incorrect indentation causes IndentationError in process_data function",
"confidence": "high", # Regular high confidence, NOT certain
"model": "flash",
},
)
if not response:
self.logger.error("Failed to get high confidence response")
return False
response_data = self._parse_debug_response(response)
if not response_data:
return False
# Verify that regular high confidence triggers expert analysis
if response_data.get("status") != "calling_expert_analysis":
self.logger.error(
f"Expected 'calling_expert_analysis' for high confidence, got '{response_data.get('status')}'"
)
return False
if response_data.get("skip_expert_analysis"):
self.logger.error("Expert analysis should NOT be skipped for regular high confidence")
return False
# Verify expert analysis was called
expert_analysis = response_data.get("expert_analysis", {})
if not expert_analysis:
self.logger.error("Expected expert analysis for regular high confidence")
return False
# Check that expert analysis has content
if "status" not in expert_analysis:
self.logger.error("Expert analysis should have status field")
return False
self.logger.info(" ✅ Regular high confidence triggers expert analysis correctly")
return True
except Exception as e:
self.logger.error(f"Regular high confidence test failed: {e}")
return False
def _test_multi_step_investigation_certain(self) -> bool:
"""Test multi-step investigation that ends with certain confidence"""
try:
self.logger.info(" 1.4: Testing multi-step investigation ending with certain")
# Step 1: Start investigation
self.logger.info(" 1.4.1: Step 1 - Initial investigation")
response1, continuation_id = self.call_mcp_tool_direct(
"debug",
{
"step": "Investigating Python syntax error in data processing module",
"step_number": 1,
"total_steps": 3,
"next_step_required": True,
"findings": "IndentationError reported when running data_processor.py - 'unindent does not match any outer indentation level'",
"files_checked": [self.indentation_file],
"relevant_files": [],
"hypothesis": "Indentation inconsistency in Python code",
"confidence": "low",
},
)
if not response1 or not continuation_id:
self.logger.error("Failed to start multi-step investigation")
return False
# Step 2: Examine code structure
self.logger.info(" 1.4.2: Step 2 - Code examination")
response2, _ = self.call_mcp_tool_direct(
"debug",
{
"step": "Examining the indentation structure in process_data function",
"step_number": 2,
"total_steps": 3,
"next_step_required": True,
"findings": "Found the issue: line 8 'results.append(processed)' is indented incorrectly. It should align with the 'if' statement, not be at the same level as the 'for' loop.",
"files_checked": [self.indentation_file],
"relevant_files": [self.indentation_file],
"relevant_methods": ["process_data"],
"hypothesis": "Line 8 has incorrect indentation level causing IndentationError",
"confidence": "medium",
"continuation_id": continuation_id,
},
)
if not response2:
self.logger.error("Failed to continue to step 2")
return False
# Step 3: Confirm fix with certain confidence
self.logger.info(" 1.4.3: Step 3 - Confirmed fix (certain)")
response3, _ = self.call_mcp_tool_direct(
"debug",
{
"step": "Confirmed the exact issue and simple fix",
"step_number": 3,
"total_steps": 3,
"next_step_required": False, # Final step
"findings": "Confirmed: line 8 'results.append(processed)' needs to be indented 4 more spaces to align with line 6 'if item > 0:'. This is a simple indentation fix.",
"files_checked": [self.indentation_file],
"relevant_files": [self.indentation_file],
"relevant_methods": ["process_data"],
"hypothesis": "IndentationError on line 8 due to incorrect indentation level - needs 4 more spaces",
"confidence": "certain", # Final step with certain
"continuation_id": continuation_id,
"model": "flash",
},
)
if not response3:
self.logger.error("Failed to complete multi-step investigation")
return False
response3_data = self._parse_debug_response(response3)
if not response3_data:
return False
# Validate multi-step certain response
if response3_data.get("status") != "certain_confidence_proceed_with_fix":
self.logger.error("Expected certain status for final step")
return False
if not response3_data.get("skip_expert_analysis"):
self.logger.error("Expected expert analysis to be skipped for certain")
return False
# Verify investigation preserves steps (at least the current step)
complete_investigation = response3_data.get("complete_investigation", {})
steps_taken = complete_investigation.get("steps_taken", 0)
if steps_taken < 1:
self.logger.error("Expected at least 1 step in complete investigation")
return False
# Check that investigation summary includes progression
investigation_summary = complete_investigation.get("investigation_summary", "")
if "Total steps:" not in investigation_summary and "Steps taken:" not in investigation_summary:
self.logger.error("Investigation summary should show steps information")
return False
self.logger.info(" ✅ Multi-step investigation with certain ending successful")
return True
except Exception as e:
self.logger.error(f"Multi-step investigation certain test failed: {e}")
return False
def call_mcp_tool_direct(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
"""Call an MCP tool directly in-process to maintain conversation memory"""
try:
# Get the tool instance
if tool_name not in self._tools:
self.logger.error(f"Tool '{tool_name}' not found in available tools")
return None, None
tool = self._tools[tool_name]
# Execute the tool with proper async handling
loop = self._get_event_loop()
# Call the tool's execute method
result = loop.run_until_complete(tool.execute(params))
if not result or len(result) == 0:
self.logger.error(f"Tool '{tool_name}' returned empty result")
return None, None
# Extract the text content from the result
response_text = result[0].text if hasattr(result[0], "text") else str(result[0])
# Extract continuation_id from debug response if present
continuation_id = self._extract_debug_continuation_id(response_text)
return response_text, continuation_id
except Exception as e:
self.logger.error(f"Failed to call tool '{tool_name}' directly: {e}")
return None, None
def _extract_debug_continuation_id(self, response_text: str) -> Optional[str]:
"""Extract continuation_id from debug response"""
try:
response_data = json.loads(response_text)
return response_data.get("continuation_id")
except json.JSONDecodeError as e:
self.logger.debug(f"Failed to parse response for debug continuation_id: {e}")
return None
def _parse_debug_response(self, response_text: str) -> dict:
"""Parse debug tool JSON response"""
try:
return json.loads(response_text)
except json.JSONDecodeError as e:
self.logger.error(f"Failed to parse debug response as JSON: {e}")
self.logger.error(f"Response text: {response_text[:500]}...")
return {}
def _validate_investigation_response(
self,
response_data: dict,
expected_step: int,
expected_next_required: bool,
expected_status: str,
) -> bool:
"""Validate debug investigation response structure"""
try:
# Check status
if response_data.get("status") != expected_status:
self.logger.error(f"Expected status '{expected_status}', got '{response_data.get('status')}'")
return False
# Check step number
if response_data.get("step_number") != expected_step:
self.logger.error(f"Expected step_number {expected_step}, got {response_data.get('step_number')}")
return False
# Check next_step_required
if response_data.get("next_step_required") != expected_next_required:
self.logger.error(
f"Expected next_step_required {expected_next_required}, got {response_data.get('next_step_required')}"
)
return False
# Basic structure checks
if "investigation_status" not in response_data:
self.logger.error("Missing investigation_status in response")
return False
if not response_data.get("next_steps"):
self.logger.error("Missing next_steps guidance in response")
return False
return True
except Exception as e:
self.logger.error(f"Error validating investigation response: {e}")
return False

View File

@@ -0,0 +1,363 @@
"""
Integration tests for the debug tool's 'certain' confidence feature.
Tests the complete workflow where Claude identifies obvious bugs with absolute certainty
and can skip expensive expert analysis for minimal fixes.
"""
import json
from unittest.mock import patch
import pytest
from tools.debug import DebugIssueTool
class TestDebugCertainConfidence:
"""Integration tests for certain confidence optimization."""
def setup_method(self):
"""Set up test tool instance."""
self.tool = DebugIssueTool()
@pytest.mark.asyncio
async def test_certain_confidence_skips_expert_analysis(self):
"""Test that certain confidence with valid minimal fix skips expert analysis."""
# Simulate a multi-step investigation ending with certain confidence
# Step 1: Initial investigation
with patch("utils.conversation_memory.create_thread", return_value="debug-certain-uuid"):
with patch("utils.conversation_memory.add_turn"):
result1 = await self.tool.execute(
{
"step": "Investigating Python ImportError in user authentication module",
"step_number": 1,
"total_steps": 2,
"next_step_required": True,
"findings": "Users cannot log in, getting 'ModuleNotFoundError: No module named hashlib'",
"files_checked": ["/auth/user_auth.py"],
"relevant_files": ["/auth/user_auth.py"],
"hypothesis": "Missing import statement",
"confidence": "medium",
"continuation_id": None,
}
)
# Verify step 1 response
response1 = json.loads(result1[0].text)
assert response1["status"] == "investigation_in_progress"
assert response1["step_number"] == 1
continuation_id = response1["continuation_id"]
# Step 2: Final step with certain confidence (simple import fix)
with patch("utils.conversation_memory.add_turn"):
result2 = await self.tool.execute(
{
"step": "Found the exact issue and fix",
"step_number": 2,
"total_steps": 2,
"next_step_required": False, # Final step
"findings": "Missing 'import hashlib' statement at top of user_auth.py file, line 3. Simple one-line fix required.",
"files_checked": ["/auth/user_auth.py"],
"relevant_files": ["/auth/user_auth.py"],
"relevant_methods": ["UserAuth.hash_password"],
"hypothesis": "Missing import hashlib statement causes ModuleNotFoundError when hash_password method is called",
"confidence": "certain", # NAILEDIT confidence - should skip expert analysis
"continuation_id": continuation_id,
}
)
# Verify final response skipped expert analysis
response2 = json.loads(result2[0].text)
# Should indicate certain confidence was used
assert response2["status"] == "certain_confidence_proceed_with_fix"
assert response2["investigation_complete"] is True
assert response2["skip_expert_analysis"] is True
# Expert analysis should be marked as skipped
assert response2["expert_analysis"]["status"] == "skipped_due_to_certain_confidence"
assert (
response2["expert_analysis"]["reason"] == "Claude identified exact root cause with minimal fix requirement"
)
# Should have complete investigation summary
assert "complete_investigation" in response2
assert response2["complete_investigation"]["confidence_level"] == "certain"
assert response2["complete_investigation"]["steps_taken"] == 2
# Next steps should guide Claude to implement the fix directly
assert "CERTAIN confidence" in response2["next_steps"]
assert "minimal fix" in response2["next_steps"]
assert "without requiring further consultation" in response2["next_steps"]
@pytest.mark.asyncio
async def test_certain_confidence_always_trusted(self):
"""Test that certain confidence is always trusted, even for complex issues."""
# Set up investigation state
self.tool.initial_issue = "Any kind of issue"
self.tool.investigation_history = [
{
"step_number": 1,
"step": "Initial investigation",
"findings": "Some findings",
"files_checked": [],
"relevant_files": [],
"relevant_methods": [],
"hypothesis": None,
"confidence": "low",
}
]
self.tool.consolidated_findings = {
"files_checked": set(),
"relevant_files": set(),
"relevant_methods": set(),
"findings": ["Step 1: Some findings"],
"hypotheses": [],
"images": [],
}
# Final step with certain confidence - should ALWAYS be trusted
with patch("utils.conversation_memory.add_turn"):
result = await self.tool.execute(
{
"step": "Found the issue and fix",
"step_number": 2,
"total_steps": 2,
"next_step_required": False, # Final step
"findings": "Complex or simple, doesn't matter - Claude says certain",
"files_checked": ["/any/file.py"],
"relevant_files": ["/any/file.py"],
"relevant_methods": ["any_method"],
"hypothesis": "Claude has decided this is certain - trust the judgment",
"confidence": "certain", # Should always be trusted
"continuation_id": "debug-trust-uuid",
}
)
# Verify certain is always trusted
response = json.loads(result[0].text)
# Should proceed with certain confidence
assert response["status"] == "certain_confidence_proceed_with_fix"
assert response["investigation_complete"] is True
assert response["skip_expert_analysis"] is True
# Expert analysis should be skipped
assert response["expert_analysis"]["status"] == "skipped_due_to_certain_confidence"
# Next steps should guide Claude to implement fix directly
assert "CERTAIN confidence" in response["next_steps"]
@pytest.mark.asyncio
async def test_regular_high_confidence_still_uses_expert_analysis(self):
"""Test that regular 'high' confidence still triggers expert analysis."""
# Set up investigation state
self.tool.initial_issue = "Session validation issue"
self.tool.investigation_history = [
{
"step_number": 1,
"step": "Initial investigation",
"findings": "Found session issue",
"files_checked": [],
"relevant_files": [],
"relevant_methods": [],
"hypothesis": None,
"confidence": "low",
}
]
self.tool.consolidated_findings = {
"files_checked": set(),
"relevant_files": {"/api/sessions.py"},
"relevant_methods": {"SessionManager.validate"},
"findings": ["Step 1: Found session issue"],
"hypotheses": [],
"images": [],
}
# Mock expert analysis
mock_expert_response = {
"status": "analysis_complete",
"summary": "Expert analysis of session validation",
"hypotheses": [
{
"name": "SESSION_VALIDATION_BUG",
"confidence": "High",
"root_cause": "Session timeout not properly handled",
}
],
}
# Final step with regular 'high' confidence (should trigger expert analysis)
with patch("utils.conversation_memory.add_turn"):
with patch.object(self.tool, "_call_expert_analysis", return_value=mock_expert_response):
with patch.object(self.tool, "_prepare_file_content_for_prompt", return_value=("file content", 100)):
result = await self.tool.execute(
{
"step": "Identified likely root cause",
"step_number": 2,
"total_steps": 2,
"next_step_required": False, # Final step
"findings": "Session validation fails when timeout occurs during user activity",
"files_checked": ["/api/sessions.py"],
"relevant_files": ["/api/sessions.py"],
"relevant_methods": ["SessionManager.validate", "SessionManager.cleanup"],
"hypothesis": "Session timeout handling bug causes validation failures",
"confidence": "high", # Regular high confidence, NOT certain
"continuation_id": "debug-regular-uuid",
}
)
# Verify expert analysis was called (not skipped)
response = json.loads(result[0].text)
# Should call expert analysis normally
assert response["status"] == "calling_expert_analysis"
assert response["investigation_complete"] is True
assert "skip_expert_analysis" not in response # Should not be present
# Expert analysis should be present with real results
assert response["expert_analysis"]["status"] == "analysis_complete"
assert response["expert_analysis"]["summary"] == "Expert analysis of session validation"
# Next steps should indicate normal investigation completion (not certain confidence)
assert "INVESTIGATION IS COMPLETE" in response["next_steps"]
assert "certain" not in response["next_steps"].lower()
def test_certain_confidence_schema_requirements(self):
"""Test that certain confidence is properly described in schema for Claude's guidance."""
# The schema description should guide Claude on proper certain usage
schema = self.tool.get_input_schema()
confidence_description = schema["properties"]["confidence"]["description"]
# Should emphasize it's only when root cause and fix are confirmed
assert "root cause" in confidence_description.lower()
assert "minimal fix" in confidence_description.lower()
assert "confirmed" in confidence_description.lower()
# Should emphasize trust in Claude's judgment
assert "absolutely" in confidence_description.lower() or "certain" in confidence_description.lower()
# Should mention no thought-partner assistance needed
assert "thought-partner" in confidence_description.lower() or "assistance" in confidence_description.lower()
@pytest.mark.asyncio
async def test_confidence_enum_validation(self):
"""Test that certain is properly included in confidence enum validation."""
# Valid confidence values should not raise errors
valid_confidences = ["low", "medium", "high", "certain"]
for confidence in valid_confidences:
# This should not raise validation errors
with patch("utils.conversation_memory.create_thread", return_value="test-uuid"):
with patch("utils.conversation_memory.add_turn"):
result = await self.tool.execute(
{
"step": f"Test step with {confidence} confidence",
"step_number": 1,
"total_steps": 1,
"next_step_required": False,
"findings": "Test findings",
"confidence": confidence,
}
)
# Should get valid response
response = json.loads(result[0].text)
assert "error" not in response or response.get("status") != "investigation_failed"
def test_tool_schema_includes_certain(self):
"""Test that the tool schema properly includes certain in confidence enum."""
schema = self.tool.get_input_schema()
confidence_property = schema["properties"]["confidence"]
assert confidence_property["type"] == "string"
assert "certain" in confidence_property["enum"]
assert confidence_property["enum"] == ["exploring", "low", "medium", "high", "certain"]
# Check that description explains certain usage
description = confidence_property["description"]
assert "certain" in description.lower()
assert "root cause" in description.lower()
assert "minimal fix" in description.lower()
assert "thought-partner" in description.lower()
@pytest.mark.asyncio
async def test_certain_confidence_preserves_investigation_data(self):
"""Test that certain confidence path preserves all investigation data properly."""
# Multi-step investigation leading to certain
with patch("utils.conversation_memory.create_thread", return_value="preserve-data-uuid"):
with patch("utils.conversation_memory.add_turn"):
# Step 1
await self.tool.execute(
{
"step": "Initial investigation of login failure",
"step_number": 1,
"total_steps": 3,
"next_step_required": True,
"findings": "Users can't log in after password reset",
"files_checked": ["/auth/password.py"],
"relevant_files": ["/auth/password.py"],
"confidence": "low",
}
)
# Step 2
await self.tool.execute(
{
"step": "Examining password validation logic",
"step_number": 2,
"total_steps": 3,
"next_step_required": True,
"findings": "Password hash function not imported correctly",
"files_checked": ["/auth/password.py", "/utils/crypto.py"],
"relevant_files": ["/auth/password.py"],
"relevant_methods": ["PasswordManager.validate_password"],
"hypothesis": "Import statement issue",
"confidence": "medium",
"continuation_id": "preserve-data-uuid",
}
)
# Step 3: Final with certain
result = await self.tool.execute(
{
"step": "Found exact issue and fix",
"step_number": 3,
"total_steps": 3,
"next_step_required": False,
"findings": "Missing 'from utils.crypto import hash_password' at line 5",
"files_checked": ["/auth/password.py", "/utils/crypto.py"],
"relevant_files": ["/auth/password.py"],
"relevant_methods": ["PasswordManager.validate_password", "hash_password"],
"hypothesis": "Missing import statement for hash_password function",
"confidence": "certain",
"continuation_id": "preserve-data-uuid",
}
)
# Verify all investigation data is preserved
response = json.loads(result[0].text)
assert response["status"] == "certain_confidence_proceed_with_fix"
investigation = response["complete_investigation"]
assert investigation["steps_taken"] == 3
assert len(investigation["files_examined"]) == 2 # Both files from all steps
assert "/auth/password.py" in investigation["files_examined"]
assert "/utils/crypto.py" in investigation["files_examined"]
assert len(investigation["relevant_files"]) == 1
assert len(investigation["relevant_methods"]) == 2
assert investigation["confidence_level"] == "certain"
# Should have complete investigation summary
assert "SYSTEMATIC INVESTIGATION SUMMARY" in investigation["investigation_summary"]
assert (
"Steps taken: 3" in investigation["investigation_summary"]
or "Total steps: 3" in investigation["investigation_summary"]
)

View File

@@ -163,7 +163,7 @@ class TestImageSupportIntegration:
images_field = schema["properties"]["images"]
assert images_field["type"] == "array"
assert images_field["items"]["type"] == "string"
assert "error screens" in images_field["description"].lower()
assert "screenshots" in images_field["description"].lower()
def test_tool_image_validation_limits(self):
"""Test that tools validate image size limits using real provider resolution."""

View File

@@ -21,47 +21,62 @@ logger = logging.getLogger(__name__)
# Field descriptions for the investigation steps
DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS = {
"step": (
"Describe what you're currently investigating by beginning to think deeply about the issue, its root cause"
"and possible reasons. Prepare and learn about the related code first. In step 1, clearly state the issue to investigate and begin "
"thinking deeply about not just the described issue, but possible underlying causes, side-effects, or external "
"components that might contribute to it. Follow the code flow carefully—bugs may originate "
"in one part of the code-dependencies, or upstream logic may not be immediately visible. Bugs and issues can "
"arise due to poor logic, incorrect assumptions, bad input or failures elsewhere."
"In all subsequent steps, continue uncovering relevant code, examining patterns, and formulating hypotheses "
"with deliberate attention to detail."
"Describe what you're currently investigating by thinking deeply about the issue and its possible causes. "
"In step 1, clearly state the issue and begin forming an investigative direction. Consider not only obvious "
"failures, but also subtle contributing factors like upstream logic, invalid inputs, missing preconditions, "
"or hidden side effects. Map out the flow of related functions or modules. Identify call paths where input "
"values or branching logic could cause instability. In concurrent systems, watch for race conditions, shared "
"state, or timing dependencies. In all later steps, continue exploring with precision: trace deeper "
"dependencies, verify hypotheses, and adapt your understanding as you uncover more evidence."
),
"step_number": (
"The index of the current step in the investigation sequence, beginning at 1. Each step should build upon or "
"revise the previous one."
),
"total_steps": (
"Your current estimate for how many steps will be needed to complete the investigation. Adjust as new findings emerge."
),
"next_step_required": (
"Set to true if you plan to continue the investigation with another step. False means you believe the root "
"cause is known or the investigation is complete."
),
"step_number": "Current step number in the investigation sequence (starts at 1).",
"total_steps": "Estimate of total investigation steps expected (adjustable as the process evolves).",
"next_step_required": "Whether another investigation step is needed after this one.",
"findings": (
"Summarize discoveries in this step. Think critically and include relevant code behavior, suspicious patterns, "
"evidence collected, and any partial conclusions or leads."
"Summarize everything discovered in this step. Include new clues, unexpected behavior, evidence from code or "
"logs, or disproven theories. Be specific and avoid vague language—document what you now know and how it "
"affects your hypothesis. In later steps, confirm or disprove past findings with reason."
),
"files_checked": (
"List all files (as absolute paths, do not clip or shrink file names) examined during the investigation so far. "
"Include even files ruled out, as this tracks your exploration path."
),
"relevant_files": (
"Subset of files_checked (as full absolute paths) that contain code directly relevant to the issue. Only list those that are directly tied to the root cause or its effects."
"Subset of files_checked (as full absolute paths) that contain code directly relevant to the issue. Only list "
"those that are directly tied to the root cause or its effects. This could include the cause, trigger, or "
"place of manifestation."
),
"relevant_methods": (
"List specific methods/functions clearly tied to the issue. Use 'ClassName.methodName' or 'functionName' format."
"List methods or functions that are central to the issue, in the format 'ClassName.methodName' or 'functionName'. "
"Prioritize those that influence or process inputs, drive branching, or pass state between modules."
),
"hypothesis": (
"Formulate your current best guess about the underlying cause. This is a working theory and may evolve based on further evidence."
"A concrete theory for what's causing the issue based on the evidence so far. This can include suspected "
"failures, incorrect assumptions, or violated constraints. You are encouraged to revise or abandon it in later "
"steps as needed."
),
"confidence": (
"How confident you are in the current hypothesis: "
"'low' (initial theory), 'medium' (good evidence), 'high' (strong to very strong evidence), "
"'nailedit' (ONLY use for final step and ONLY when you have found the EXACT root cause with 100% certainty AND "
"identified a simple, minimal fix that requires no expert consultation. Use this ONLY "
"for obvious bugs and logic errors that you ABSOLUTELY are certain about and have no doubts because you have"
"successfully mapped out the code flow and the root cause behind the issue."
"Indicate your current confidence in the hypothesis. Use: 'exploring' (starting out), 'low' (early idea), "
"'medium' (some supporting evidence), 'high' (strong evidence), 'certain' (only when the root cause and minimal "
"fix are both confirmed). Do NOT use 'certain' unless the issue can be fully resolved with a fix, use 'high' "
"instead when in doubt. Using 'certain' prevents you from taking assistance from another thought-partner."
),
"backtrack_from_step": (
"If an earlier finding or hypothesis needs to be revised or discarded, specify the step number from which to "
"start over. Use this to acknowledge investigative dead ends and correct the course."
),
"backtrack_from_step": "If a previous step needs revision, specify the step number to backtrack from.",
"continuation_id": "Continuation token used for linking multi-step investigations and continuing conversations after discovery.",
"images": (
"Optional. Include full absolute paths to visual debugging images (UI issues, logs, error screens) that help clarify the issue."
"Optional list of absolute paths to screenshots or UI visuals that clarify the issue. "
"Only include if they materially assist understanding or hypothesis formulation."
),
}
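
Putting the fields together, a final step that claims certainty looks roughly like this (a sketch mirroring the simulator tests added in this commit; paths and IDs are illustrative):

```python
# Sketch of a final debug step using the fields described above.
final_step = {
    "step": "Confirmed the exact issue and the minimal fix",
    "step_number": 2,
    "total_steps": 2,
    "next_step_required": False,  # final step
    "findings": "Missing 'import hashlib' at the top of user_auth.py",
    "files_checked": ["/abs/path/user_auth.py"],
    "relevant_files": ["/abs/path/user_auth.py"],
    "relevant_methods": ["UserAuth.hash_password"],
    "hypothesis": "Missing import causes NameError when hash_password runs",
    "confidence": "certain",  # expert analysis will be skipped
    "continuation_id": "debug-uuid",  # illustrative
}
```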
@@ -204,7 +219,7 @@ class DebugIssueTool(BaseTool):
},
"confidence": {
"type": "string",
"enum": ["low", "medium", "high", "nailedit"],
"enum": ["exploring", "low", "medium", "high", "certain"],
"description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["confidence"],
},
"backtrack_from_step": {
@@ -356,9 +371,9 @@ class DebugIssueTool(BaseTool):
response_data["investigation_complete"] = True
# Check if Claude has absolute certainty and can proceed with minimal fix
if request.confidence == "nailedit":
# Trust Claude's judgment completely - if it says nailedit, skip expert analysis
response_data["status"] = "nailedit_confidence_proceed_with_fix"
if request.confidence == "certain":
# Trust Claude's judgment completely - if it says certain, skip expert analysis
response_data["status"] = "certain_confidence_proceed_with_fix"
investigation_summary = self._prepare_investigation_summary()
response_data["complete_investigation"] = {
@@ -369,20 +384,21 @@ class DebugIssueTool(BaseTool):
"relevant_methods": list(self.consolidated_findings["relevant_methods"]),
"investigation_summary": investigation_summary,
"final_hypothesis": request.hypothesis,
"confidence_level": "nailedit",
"confidence_level": "certain",
}
response_data["next_steps"] = (
"Investigation complete with NAILED-IT confidence. You have identified the exact "
"root cause and a minimal fix. Proceed directly with implementing the simple fix "
"without requiring expert consultation. Focus on the precise, minimal change needed."
"Investigation complete with CERTAIN confidence. You have identified the exact "
"root cause and a minimal fix. MANDATORY: Present the user with the root cause analysis"
"and IMMEDIATELY proceed with implementing the simple fix without requiring further "
"consultation. Focus on the precise, minimal change needed."
)
response_data["skip_expert_analysis"] = True
response_data["expert_analysis"] = {
"status": "skipped_due_to_nailedit_confidence",
"status": "skipped_due_to_certain_confidence",
"reason": "Claude identified exact root cause with minimal fix requirement",
}
else:
# Standard expert analysis for high/medium/low confidence
# Standard expert analysis for exploring/low/medium/high confidence
response_data["status"] = "calling_expert_analysis"
# Prepare consolidated investigation summary
@@ -413,9 +429,11 @@ class DebugIssueTool(BaseTool):
"investigation_summary": investigation_summary,
}
response_data["next_steps"] = (
"Investigation complete with expert analysis. Present the findings, hypotheses, "
"and recommended fixes to the user. Focus on the most likely root cause and "
"provide actionable implementation guidance."
"INVESTIGATION IS COMPLETE. YOU MUST now summarize and present ALL key findings, confirmed "
"hypotheses, and exact recommended fixes. Clearly identify the most likely root cause and "
"provide concrete, actionable implementation guidance. Highlight affected code paths and display "
"reasoning that led to this conclusion—make it easy for a developer to understand exactly where "
"the problem lies."
)
else:
response_data["next_steps"] = (

View File

@@ -1,11 +0,0 @@
"""
Zen MCP Server - Entry point
The main implementation is in server.py
"""
import asyncio
from server import main
if __name__ == "__main__":
asyncio.run(main())