certain confidence: no need to waste tokens on another assistant when it's a no-brainer fix for Claude
.github/workflows/test.yml (vendored, 4 changed lines)
@@ -53,8 +53,8 @@ jobs:
          pip install -r requirements-dev.txt

      - name: Run black formatter check
-        run: black --check .
+        run: black --check . --exclude="test_simulation_files/"

      - name: Run ruff linter
-        run: ruff check .
+        run: ruff check . --exclude test_simulation_files
@@ -370,7 +370,10 @@ Nice!

**[📖 Read More](docs/tools/precommit.md)** - Multi-repository validation and change analysis

### 7. `debug` - Expert Debugging Assistant

-Systematic investigation-guided debugging that walks Claude through step-by-step root cause analysis. Claude performs methodical code examination, evidence collection, and hypothesis formation before receiving expert analysis from the selected AI model.
+Systematic investigation-guided debugging that walks Claude through step-by-step root cause analysis. Claude performs
+methodical code examination, evidence collection, and hypothesis formation before receiving expert analysis from the
+selected AI model. When Claude's confidence reaches **100% certainty** during the investigative workflow, expert analysis
+via another model is skipped to save on tokens and cost, and Claude proceeds directly to fixing the issue.

```
See logs under /Users/me/project/diagnostics.log and related code under the sync folder. Logs show that sync
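To make the new skip behavior concrete, here is a rough sketch of the final debug step that triggers it. The values below are illustrative only, not taken from this commit; the field names and the "certain" confidence level match the debug tool schema changed later in this diff.

```python
# Illustrative sketch of a final debug step that triggers the token-saving skip.
# All values are made up for illustration; only the field names and the
# "certain" confidence level come from this commit.
final_step_arguments = {
    "step": "Confirmed the exact root cause and the one-line fix",
    "step_number": 2,
    "total_steps": 2,
    "next_step_required": False,      # final step of the investigation
    "findings": "Missing 'import hashlib' at the top of user_auth.py",
    "relevant_files": ["/abs/path/user_auth.py"],
    "relevant_methods": ["UserAuth.hash_password"],
    "hypothesis": "NameError caused by a missing import",
    "confidence": "certain",          # skips the expert-analysis call entirely
}
```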
@@ -67,16 +67,16 @@ echo "📋 Step 1: Running Linting and Formatting Checks"
echo "--------------------------------------------------"

echo "🔧 Running ruff linting with auto-fix..."
-$RUFF check --fix
+$RUFF check --fix --exclude test_simulation_files

echo "🎨 Running black code formatting..."
-$BLACK .
+$BLACK . --exclude="test_simulation_files/"

echo "📦 Running import sorting with isort..."
-$ISORT . --skip-glob=".zen_venv/*"
+$ISORT . --skip-glob=".zen_venv/*" --skip-glob="test_simulation_files/*"

echo "✅ Verifying all linting passes..."
-$RUFF check
+$RUFF check --exclude test_simulation_files

echo "✅ Step 1 Complete: All linting and formatting checks passed!"
echo ""
@@ -14,7 +14,7 @@ import os
# These values are used in server responses and for tracking releases
# IMPORTANT: This is the single source of truth for version and author info
# Semantic versioning: MAJOR.MINOR.PATCH
-__version__ = "5.2.1"
+__version__ = "5.2.2"
# Last update date in ISO format
__updated__ = "2025-06-19"
# Primary maintainer
@@ -27,7 +27,8 @@ The debug tool implements a **systematic investigation methodology** where Claud
5. **Completion**: Once investigation is thorough, Claude signals completion

**Expert Analysis Phase:**
-After Claude completes the investigation, the tool automatically calls the selected AI model with:
+After Claude completes the investigation, it automatically calls the selected AI model with (unless confidence is **certain**,
+in which case expert analysis is bypassed):
- Complete investigation summary with all steps and findings
- Relevant files and methods identified during investigation
- Final hypothesis and confidence assessment
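Judging from the unit tests added later in this commit, the response Claude receives when expert analysis is bypassed has roughly this shape. This is a sketch assembled from the test assertions, not an authoritative schema.

```python
# Approximate response shape when confidence == "certain" (reconstructed from the
# assertions in tests/test_debug_certain_confidence.py below).
certain_response = {
    "status": "certain_confidence_proceed_with_fix",
    "investigation_complete": True,
    "skip_expert_analysis": True,
    "expert_analysis": {
        "status": "skipped_due_to_certain_confidence",
        "reason": "Claude identified exact root cause with minimal fix requirement",
    },
    "complete_investigation": {
        "confidence_level": "certain",
        "steps_taken": 2,  # example value
        # plus files_examined, relevant_files, relevant_methods, final_hypothesis,
        # and an investigation_summary string
    },
    "next_steps": "Investigation complete with CERTAIN confidence. ...",
}
```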
@@ -122,7 +122,7 @@ try:
    file_handler = RotatingFileHandler(
        log_dir / "mcp_server.log",
        maxBytes=20 * 1024 * 1024,  # 20MB max file size
-       backupCount=10,  # Keep 10 rotated files (200MB total)
+       backupCount=5,  # Keep 5 rotated files (100MB total)
        encoding="utf-8",
    )
    file_handler.setLevel(getattr(logging, log_level, logging.INFO))
@@ -133,8 +133,8 @@ try:
    mcp_logger = logging.getLogger("mcp_activity")
    mcp_file_handler = RotatingFileHandler(
        log_dir / "mcp_activity.log",
-       maxBytes=20 * 1024 * 1024,  # 20MB max file size
-       backupCount=5,  # Keep 5 rotated files (100MB total)
+       maxBytes=10 * 1024 * 1024,  # 10MB max file size
+       backupCount=2,  # Keep 2 rotated files (20MB total)
        encoding="utf-8",
    )
    mcp_file_handler.setLevel(logging.INFO)
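For context on the sizing comments above: a RotatingFileHandler keeps one active file plus backupCount rotated copies, so the rotated backups alone occupy roughly maxBytes × backupCount on disk. A quick sanity check of the new values:

```python
def rotated_backup_footprint(max_bytes: int, backup_count: int) -> int:
    """Disk used by rotated backups alone; add one more max_bytes for the active log."""
    return max_bytes * backup_count

# mcp_server.log: 20MB per file, 5 backups -> ~100MB of backups (~120MB with the active file)
assert rotated_backup_footprint(20 * 1024 * 1024, 5) == 100 * 1024 * 1024
# mcp_activity.log: 10MB per file, 2 backups -> ~20MB of backups (~30MB with the active file)
assert rotated_backup_footprint(10 * 1024 * 1024, 2) == 20 * 1024 * 1024
```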
@@ -14,6 +14,7 @@ from .test_content_validation import ContentValidationTest
from .test_conversation_chain_validation import ConversationChainValidationTest
from .test_cross_tool_comprehensive import CrossToolComprehensiveTest
from .test_cross_tool_continuation import CrossToolContinuationTest
+from .test_debug_certain_confidence import DebugCertainConfidenceTest
from .test_debug_validation import DebugValidationTest
from .test_line_number_validation import LineNumberValidationTest
from .test_logs_validation import LogsValidationTest

@@ -55,6 +56,7 @@ TEST_REGISTRY = {
    "testgen_validation": TestGenValidationTest,
    "refactor_validation": RefactorValidationTest,
    "debug_validation": DebugValidationTest,
+   "debug_certain_confidence": DebugCertainConfidenceTest,
    "conversation_chain_validation": ConversationChainValidationTest,
    "vision_capability": VisionCapabilityTest,
    "xai_models": XAIModelsTest,

@@ -86,6 +88,7 @@ __all__ = [
    "TestGenValidationTest",
    "RefactorValidationTest",
    "DebugValidationTest",
+   "DebugCertainConfidenceTest",
    "ConversationChainValidationTest",
    "VisionCapabilityTest",
    "XAIModelsTest",
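With the registry entry in place, the new simulator test can presumably be selected by its key. A minimal sketch follows; the real simulator runner and the base-class constructor arguments are not shown in this diff, so treat the call below as hypothetical.

```python
# Hypothetical invocation; constructor arguments of ConversationBaseTest are assumed.
from simulator_tests import TEST_REGISTRY

test_cls = TEST_REGISTRY["debug_certain_confidence"]
test = test_cls()  # assumed no-arg construction
print("PASS" if test.run_test() else "FAIL")  # run_test() returns True on success
```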
simulator_tests/test_debug_certain_confidence.py (new file, 560 lines)
@@ -0,0 +1,560 @@
#!/usr/bin/env python3
"""
Debug Tool Certain Confidence Simulator Test

Tests the debug tool's 'certain' confidence feature in a realistic simulation:
- Multi-step investigation leading to certain confidence
- Validation that expert analysis is skipped for obvious bugs
- Verification that certain confidence is always trusted
- Ensures token optimization works correctly for minimal fixes
"""

import json
from typing import Optional

from .conversation_base_test import ConversationBaseTest


class DebugCertainConfidenceTest(ConversationBaseTest):
    """Test debug tool's certain confidence optimization feature"""

    @property
    def test_name(self) -> str:
        return "debug_certain_confidence"

    @property
    def test_description(self) -> str:
        return "Debug tool certain confidence optimization validation"

    def run_test(self) -> bool:
        """Test debug tool certain confidence capabilities"""
        # Set up the test environment
        self.setUp()

        try:
            self.logger.info("Test: Debug tool certain confidence validation")

            # Create test files with obvious bugs for certain scenarios
            self._create_obvious_bug_scenarios()

            # Test 1: Obvious import error with certain confidence
            if not self._test_obvious_import_error_certain():
                return False

            # Test 2: Certain confidence is always trusted
            if not self._test_certain_always_trusted():
                return False

            # Test 3: Regular high confidence still triggers expert analysis
            if not self._test_regular_high_confidence_expert_analysis():
                return False

            # Test 4: Multi-step investigation ending in certain
            if not self._test_multi_step_investigation_certain():
                return False

            self.logger.info(" ✅ All debug certain confidence tests passed")
            return True

        except Exception as e:
            self.logger.error(f"Debug certain confidence test failed: {e}")
            return False

    def _create_obvious_bug_scenarios(self):
        """Create test files with obvious bugs perfect for certain confidence"""

        # Scenario 1: Missing import statement (very obvious)
        missing_import_code = """#!/usr/bin/env python3
import os
import sys
# import hashlib  # <-- Missing import!


class UserAuth:
    def __init__(self, secret_key):
        self.secret_key = secret_key

    def hash_password(self, password):
        # This will fail with NameError: name 'hashlib' is not defined
        salt = os.urandom(32)
        return hashlib.pbkdf2_hmac('sha256', password.encode(), salt, 100000)

    def verify_password(self, password, stored_hash):
        # This function also uses hashlib
        return hashlib.pbkdf2_hmac('sha256', password.encode(), stored_hash[:32], 100000) == stored_hash[32:]
"""

        # Scenario 2: Typo in method name (obvious once spotted)
        typo_bug_code = """#!/usr/bin/env python3
class Calculator:
    def __init__(self):
        self.history = []

    def add_numbers(self, a, b):
        result = a + b
        self.history.append(f"{a} + {b} = {result}")
        return result

    def calculate_total(self, numbers):
        total = 0
        for num in numbers:
            # Typo: should be add_numbers, not add_number
            total = self.add_number(total, num)  # NameError: no method 'add_number'
        return total
"""

        # Scenario 3: Indentation error (Python syntax error)
        indentation_error_code = """#!/usr/bin/env python3
def process_data(data_list):
    results = []
    for item in data_list:
        if item > 0:
            processed = item * 2
          results.append(processed)  # IndentationError: unindent does not match any outer indentation level
    return results


def main():
    data = [1, 2, 3, 4, 5]
    print(process_data(data))
"""

        # Create test files
        self.missing_import_file = self.create_additional_test_file("user_auth.py", missing_import_code)
        self.typo_bug_file = self.create_additional_test_file("calculator.py", typo_bug_code)
        self.indentation_file = self.create_additional_test_file("data_processor.py", indentation_error_code)

        self.logger.info(" ✅ Created obvious bug scenarios:")
        self.logger.info(f" - Missing import: {self.missing_import_file}")
        self.logger.info(f" - Method typo: {self.typo_bug_file}")
        self.logger.info(f" - Indentation error: {self.indentation_file}")

        # Create error logs for context
        import_error_log = """ERROR: User authentication failing during login
Traceback (most recent call last):
  File "user_auth.py", line 12, in hash_password
    return hashlib.pbkdf2_hmac('sha256', password.encode(), salt, 100000)
NameError: name 'hashlib' is not defined

This happens every time a user tries to log in. The error occurs in the password hashing function.
"""

        self.error_log_file = self.create_additional_test_file("error.log", import_error_log)
        self.logger.info(f" - Error log: {self.error_log_file}")

    def _test_obvious_import_error_certain(self) -> bool:
        """Test certain confidence with obvious missing import error"""
        try:
            self.logger.info(" 1.1: Testing obvious import error with certain confidence")

            # Step 1: Initial investigation
            self.logger.info(" 1.1.1: Step 1 - Initial problem description")
            response1, continuation_id = self.call_mcp_tool_direct(
                "debug",
                {
                    "step": "Investigating NameError in user authentication - users cannot log in due to 'name hashlib is not defined' error.",
                    "step_number": 1,
                    "total_steps": 2,
                    "next_step_required": True,
                    "findings": "NameError occurs in hash_password method when trying to use hashlib.pbkdf2_hmac. Error happens on every login attempt.",
                    "files_checked": [self.error_log_file],
                    "relevant_files": [self.error_log_file],
                    "hypothesis": "Missing import statement for hashlib module",
                    "confidence": "medium",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error("Failed to get initial investigation response")
                return False

            response1_data = self._parse_debug_response(response1)
            if not self._validate_investigation_response(response1_data, 1, True, "investigation_in_progress"):
                return False

            self.logger.info(f" ✅ Step 1 successful, continuation_id: {continuation_id}")

            # Step 2: Examine code and identify obvious fix - use certain confidence
            self.logger.info(" 1.1.2: Step 2 - Found exact issue and simple fix (certain)")
            response2, _ = self.call_mcp_tool_direct(
                "debug",
                {
                    "step": "Found the exact issue and the minimal fix required",
                    "step_number": 2,
                    "total_steps": 2,
                    "next_step_required": False,  # Final step
                    "findings": "Missing 'import hashlib' statement at the top of user_auth.py file. The error occurs because hashlib is used in hash_password() method on line 12 but never imported. Simple one-line fix: add 'import hashlib' after line 2.",
                    "files_checked": [self.error_log_file, self.missing_import_file],
                    "relevant_files": [self.missing_import_file],
                    "relevant_methods": ["UserAuth.hash_password", "UserAuth.verify_password"],
                    "hypothesis": "Missing 'import hashlib' statement causes NameError when hash_password method executes",
                    "confidence": "certain",  # Use certain - should skip expert analysis
                    "continuation_id": continuation_id,
                    "model": "flash",  # Specify model for consistency
                },
            )

            if not response2:
                self.logger.error("Failed to complete investigation with certain confidence")
                return False

            response2_data = self._parse_debug_response(response2)
            if not response2_data:
                return False

            # Validate certain response structure
            expected_status = "certain_confidence_proceed_with_fix"
            if response2_data.get("status") != expected_status:
                self.logger.error(f"Expected status '{expected_status}', got '{response2_data.get('status')}'")
                return False

            if not response2_data.get("investigation_complete"):
                self.logger.error("Expected investigation_complete=true for certain confidence")
                return False

            if not response2_data.get("skip_expert_analysis"):
                self.logger.error("Expected skip_expert_analysis=true for certain confidence")
                return False

            # Verify expert analysis is marked as skipped
            expert_analysis = response2_data.get("expert_analysis", {})
            if expert_analysis.get("status") != "skipped_due_to_certain_confidence":
                self.logger.error("Expert analysis should be marked as skipped for certain confidence")
                return False

            # Check for proper investigation summary
            complete_investigation = response2_data.get("complete_investigation", {})
            if complete_investigation.get("confidence_level") != "certain":
                self.logger.error("Expected confidence_level='certain' in complete investigation")
                return False

            if complete_investigation.get("steps_taken") != 2:
                self.logger.error("Expected steps_taken=2 in complete investigation")
                return False

            # Verify next steps guidance
            next_steps = response2_data.get("next_steps", "")
            if "CERTAIN confidence" not in next_steps:
                self.logger.error("Expected 'CERTAIN confidence' in next_steps guidance")
                return False

            if "minimal fix" not in next_steps:
                self.logger.error("Expected 'minimal fix' guidance in next_steps")
                return False

            self.logger.info(" ✅ Certain confidence skipped expert analysis correctly")
            return True

        except Exception as e:
            self.logger.error(f"Obvious import error certain test failed: {e}")
            return False

    def _test_certain_always_trusted(self) -> bool:
        """Test that certain confidence is always trusted regardless of complexity"""
        try:
            self.logger.info(" 1.2: Testing that certain confidence is always trusted")

            # Single step investigation with certain - should always be trusted
            self.logger.info(" 1.2.1: Direct certain confidence (always trusted)")
            response, _ = self.call_mcp_tool_direct(
                "debug",
                {
                    "step": "Found the exact root cause and minimal fix for this complex issue",
                    "step_number": 1,
                    "total_steps": 1,
                    "next_step_required": False,  # Final step
                    "findings": "After thorough investigation, identified that the issue is caused by method name typo in Calculator.calculate_total() - calls self.add_number() instead of self.add_numbers(). Simple fix: change line 14 from 'add_number' to 'add_numbers'.",
                    "files_checked": [self.typo_bug_file],
                    "relevant_files": [self.typo_bug_file],
                    "relevant_methods": ["Calculator.calculate_total", "Calculator.add_numbers"],
                    "hypothesis": "Method name typo in calculate_total() calls non-existent add_number() instead of add_numbers()",
                    "confidence": "certain",  # Should always be trusted
                    "model": "flash",
                },
            )

            if not response:
                self.logger.error("Failed to get certain confidence response")
                return False

            response_data = self._parse_debug_response(response)
            if not response_data:
                return False

            # Verify certain is trusted regardless of complexity
            if response_data.get("status") != "certain_confidence_proceed_with_fix":
                self.logger.error("Certain confidence should always be trusted")
                return False

            if not response_data.get("skip_expert_analysis"):
                self.logger.error("Expert analysis should be skipped for certain confidence")
                return False

            # Ensure expert analysis is marked as skipped
            expert_analysis = response_data.get("expert_analysis", {})
            if expert_analysis.get("status") != "skipped_due_to_certain_confidence":
                self.logger.error("Expert analysis status should indicate certain skip")
                return False

            self.logger.info(" ✅ Certain confidence always trusted correctly")
            return True

        except Exception as e:
            self.logger.error(f"Certain always trusted test failed: {e}")
            return False

    def _test_regular_high_confidence_expert_analysis(self) -> bool:
        """Test that regular 'high' confidence still triggers expert analysis"""
        try:
            self.logger.info(" 1.3: Testing that regular 'high' confidence triggers expert analysis")

            # Investigation with regular high confidence (not certain)
            self.logger.info(" 1.3.1: High confidence (not certain) - should trigger expert analysis")
            response, _ = self.call_mcp_tool_direct(
                "debug",
                {
                    "step": "Identified likely root cause with strong evidence",
                    "step_number": 1,
                    "total_steps": 1,
                    "next_step_required": False,  # Final step
                    "findings": "IndentationError in data_processor.py line 8 - results.append(processed) is incorrectly indented. Should align with the 'if' statement above it.",
                    "files_checked": [self.indentation_file],
                    "relevant_files": [self.indentation_file],
                    "relevant_methods": ["process_data"],
                    "hypothesis": "Incorrect indentation causes IndentationError in process_data function",
                    "confidence": "high",  # Regular high confidence, NOT certain
                    "model": "flash",
                },
            )

            if not response:
                self.logger.error("Failed to get high confidence response")
                return False

            response_data = self._parse_debug_response(response)
            if not response_data:
                return False

            # Verify that regular high confidence triggers expert analysis
            if response_data.get("status") != "calling_expert_analysis":
                self.logger.error(
                    f"Expected 'calling_expert_analysis' for high confidence, got '{response_data.get('status')}'"
                )
                return False

            if response_data.get("skip_expert_analysis"):
                self.logger.error("Expert analysis should NOT be skipped for regular high confidence")
                return False

            # Verify expert analysis was called
            expert_analysis = response_data.get("expert_analysis", {})
            if not expert_analysis:
                self.logger.error("Expected expert analysis for regular high confidence")
                return False

            # Check that expert analysis has content
            if "status" not in expert_analysis:
                self.logger.error("Expert analysis should have status field")
                return False

            self.logger.info(" ✅ Regular high confidence triggers expert analysis correctly")
            return True

        except Exception as e:
            self.logger.error(f"Regular high confidence test failed: {e}")
            return False

    def _test_multi_step_investigation_certain(self) -> bool:
        """Test multi-step investigation that ends with certain confidence"""
        try:
            self.logger.info(" 1.4: Testing multi-step investigation ending with certain")

            # Step 1: Start investigation
            self.logger.info(" 1.4.1: Step 1 - Initial investigation")
            response1, continuation_id = self.call_mcp_tool_direct(
                "debug",
                {
                    "step": "Investigating Python syntax error in data processing module",
                    "step_number": 1,
                    "total_steps": 3,
                    "next_step_required": True,
                    "findings": "IndentationError reported when running data_processor.py - 'unindent does not match any outer indentation level'",
                    "files_checked": [self.indentation_file],
                    "relevant_files": [],
                    "hypothesis": "Indentation inconsistency in Python code",
                    "confidence": "low",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error("Failed to start multi-step investigation")
                return False

            # Step 2: Examine code structure
            self.logger.info(" 1.4.2: Step 2 - Code examination")
            response2, _ = self.call_mcp_tool_direct(
                "debug",
                {
                    "step": "Examining the indentation structure in process_data function",
                    "step_number": 2,
                    "total_steps": 3,
                    "next_step_required": True,
                    "findings": "Found the issue: line 8 'results.append(processed)' is indented incorrectly. It should align with the 'if' statement, not be at the same level as the 'for' loop.",
                    "files_checked": [self.indentation_file],
                    "relevant_files": [self.indentation_file],
                    "relevant_methods": ["process_data"],
                    "hypothesis": "Line 8 has incorrect indentation level causing IndentationError",
                    "confidence": "medium",
                    "continuation_id": continuation_id,
                },
            )

            if not response2:
                self.logger.error("Failed to continue to step 2")
                return False

            # Step 3: Confirm fix with certain confidence
            self.logger.info(" 1.4.3: Step 3 - Confirmed fix (certain)")
            response3, _ = self.call_mcp_tool_direct(
                "debug",
                {
                    "step": "Confirmed the exact issue and simple fix",
                    "step_number": 3,
                    "total_steps": 3,
                    "next_step_required": False,  # Final step
                    "findings": "Confirmed: line 8 'results.append(processed)' needs to be indented 4 more spaces to align with line 6 'if item > 0:'. This is a simple indentation fix.",
                    "files_checked": [self.indentation_file],
                    "relevant_files": [self.indentation_file],
                    "relevant_methods": ["process_data"],
                    "hypothesis": "IndentationError on line 8 due to incorrect indentation level - needs 4 more spaces",
                    "confidence": "certain",  # Final step with certain
                    "continuation_id": continuation_id,
                    "model": "flash",
                },
            )

            if not response3:
                self.logger.error("Failed to complete multi-step investigation")
                return False

            response3_data = self._parse_debug_response(response3)
            if not response3_data:
                return False

            # Validate multi-step certain response
            if response3_data.get("status") != "certain_confidence_proceed_with_fix":
                self.logger.error("Expected certain status for final step")
                return False

            if not response3_data.get("skip_expert_analysis"):
                self.logger.error("Expected expert analysis to be skipped for certain")
                return False

            # Verify investigation preserves steps (at least the current step)
            complete_investigation = response3_data.get("complete_investigation", {})
            steps_taken = complete_investigation.get("steps_taken", 0)
            if steps_taken < 1:
                self.logger.error("Expected at least 1 step in complete investigation")
                return False

            # Check that investigation summary includes progression
            investigation_summary = complete_investigation.get("investigation_summary", "")
            if "Total steps:" not in investigation_summary and "Steps taken:" not in investigation_summary:
                self.logger.error("Investigation summary should show steps information")
                return False

            self.logger.info(" ✅ Multi-step investigation with certain ending successful")
            return True

        except Exception as e:
            self.logger.error(f"Multi-step investigation certain test failed: {e}")
            return False

    def call_mcp_tool_direct(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
        """Call an MCP tool directly in-process to maintain conversation memory"""
        try:
            # Get the tool instance
            if tool_name not in self._tools:
                self.logger.error(f"Tool '{tool_name}' not found in available tools")
                return None, None

            tool = self._tools[tool_name]

            # Execute the tool with proper async handling
            loop = self._get_event_loop()

            # Call the tool's execute method
            result = loop.run_until_complete(tool.execute(params))

            if not result or len(result) == 0:
                self.logger.error(f"Tool '{tool_name}' returned empty result")
                return None, None

            # Extract the text content from the result
            response_text = result[0].text if hasattr(result[0], "text") else str(result[0])

            # Extract continuation_id from debug response if present
            continuation_id = self._extract_debug_continuation_id(response_text)

            return response_text, continuation_id

        except Exception as e:
            self.logger.error(f"Failed to call tool '{tool_name}' directly: {e}")
            return None, None

    def _extract_debug_continuation_id(self, response_text: str) -> Optional[str]:
        """Extract continuation_id from debug response"""
        try:
            response_data = json.loads(response_text)
            return response_data.get("continuation_id")
        except json.JSONDecodeError as e:
            self.logger.debug(f"Failed to parse response for debug continuation_id: {e}")
            return None

    def _parse_debug_response(self, response_text: str) -> dict:
        """Parse debug tool JSON response"""
        try:
            return json.loads(response_text)
        except json.JSONDecodeError as e:
            self.logger.error(f"Failed to parse debug response as JSON: {e}")
            self.logger.error(f"Response text: {response_text[:500]}...")
            return {}

    def _validate_investigation_response(
        self,
        response_data: dict,
        expected_step: int,
        expected_next_required: bool,
        expected_status: str,
    ) -> bool:
        """Validate debug investigation response structure"""
        try:
            # Check status
            if response_data.get("status") != expected_status:
                self.logger.error(f"Expected status '{expected_status}', got '{response_data.get('status')}'")
                return False

            # Check step number
            if response_data.get("step_number") != expected_step:
                self.logger.error(f"Expected step_number {expected_step}, got {response_data.get('step_number')}")
                return False

            # Check next_step_required
            if response_data.get("next_step_required") != expected_next_required:
                self.logger.error(
                    f"Expected next_step_required {expected_next_required}, got {response_data.get('next_step_required')}"
                )
                return False

            # Basic structure checks
            if "investigation_status" not in response_data:
                self.logger.error("Missing investigation_status in response")
                return False

            if not response_data.get("next_steps"):
                self.logger.error("Missing next_steps guidance in response")
                return False

            return True

        except Exception as e:
            self.logger.error(f"Error validating investigation response: {e}")
            return False
tests/test_debug_certain_confidence.py (new file, 363 lines)
@@ -0,0 +1,363 @@
"""
Integration tests for the debug tool's 'certain' confidence feature.

Tests the complete workflow where Claude identifies obvious bugs with absolute certainty
and can skip expensive expert analysis for minimal fixes.
"""

import json
from unittest.mock import patch

import pytest

from tools.debug import DebugIssueTool


class TestDebugCertainConfidence:
    """Integration tests for certain confidence optimization."""

    def setup_method(self):
        """Set up test tool instance."""
        self.tool = DebugIssueTool()

    @pytest.mark.asyncio
    async def test_certain_confidence_skips_expert_analysis(self):
        """Test that certain confidence with valid minimal fix skips expert analysis."""
        # Simulate a multi-step investigation ending with certain confidence

        # Step 1: Initial investigation
        with patch("utils.conversation_memory.create_thread", return_value="debug-certain-uuid"):
            with patch("utils.conversation_memory.add_turn"):
                result1 = await self.tool.execute(
                    {
                        "step": "Investigating Python ImportError in user authentication module",
                        "step_number": 1,
                        "total_steps": 2,
                        "next_step_required": True,
                        "findings": "Users cannot log in, getting 'ModuleNotFoundError: No module named hashlib'",
                        "files_checked": ["/auth/user_auth.py"],
                        "relevant_files": ["/auth/user_auth.py"],
                        "hypothesis": "Missing import statement",
                        "confidence": "medium",
                        "continuation_id": None,
                    }
                )

        # Verify step 1 response
        response1 = json.loads(result1[0].text)
        assert response1["status"] == "investigation_in_progress"
        assert response1["step_number"] == 1
        continuation_id = response1["continuation_id"]

        # Step 2: Final step with certain confidence (simple import fix)
        with patch("utils.conversation_memory.add_turn"):
            result2 = await self.tool.execute(
                {
                    "step": "Found the exact issue and fix",
                    "step_number": 2,
                    "total_steps": 2,
                    "next_step_required": False,  # Final step
                    "findings": "Missing 'import hashlib' statement at top of user_auth.py file, line 3. Simple one-line fix required.",
                    "files_checked": ["/auth/user_auth.py"],
                    "relevant_files": ["/auth/user_auth.py"],
                    "relevant_methods": ["UserAuth.hash_password"],
                    "hypothesis": "Missing import hashlib statement causes ModuleNotFoundError when hash_password method is called",
                    "confidence": "certain",  # Certain confidence - should skip expert analysis
                    "continuation_id": continuation_id,
                }
            )

        # Verify final response skipped expert analysis
        response2 = json.loads(result2[0].text)

        # Should indicate certain confidence was used
        assert response2["status"] == "certain_confidence_proceed_with_fix"
        assert response2["investigation_complete"] is True
        assert response2["skip_expert_analysis"] is True

        # Expert analysis should be marked as skipped
        assert response2["expert_analysis"]["status"] == "skipped_due_to_certain_confidence"
        assert (
            response2["expert_analysis"]["reason"] == "Claude identified exact root cause with minimal fix requirement"
        )

        # Should have complete investigation summary
        assert "complete_investigation" in response2
        assert response2["complete_investigation"]["confidence_level"] == "certain"
        assert response2["complete_investigation"]["steps_taken"] == 2

        # Next steps should guide Claude to implement the fix directly
        assert "CERTAIN confidence" in response2["next_steps"]
        assert "minimal fix" in response2["next_steps"]
        assert "without requiring further consultation" in response2["next_steps"]

    @pytest.mark.asyncio
    async def test_certain_confidence_always_trusted(self):
        """Test that certain confidence is always trusted, even for complex issues."""

        # Set up investigation state
        self.tool.initial_issue = "Any kind of issue"
        self.tool.investigation_history = [
            {
                "step_number": 1,
                "step": "Initial investigation",
                "findings": "Some findings",
                "files_checked": [],
                "relevant_files": [],
                "relevant_methods": [],
                "hypothesis": None,
                "confidence": "low",
            }
        ]
        self.tool.consolidated_findings = {
            "files_checked": set(),
            "relevant_files": set(),
            "relevant_methods": set(),
            "findings": ["Step 1: Some findings"],
            "hypotheses": [],
            "images": [],
        }

        # Final step with certain confidence - should ALWAYS be trusted
        with patch("utils.conversation_memory.add_turn"):
            result = await self.tool.execute(
                {
                    "step": "Found the issue and fix",
                    "step_number": 2,
                    "total_steps": 2,
                    "next_step_required": False,  # Final step
                    "findings": "Complex or simple, doesn't matter - Claude says certain",
                    "files_checked": ["/any/file.py"],
                    "relevant_files": ["/any/file.py"],
                    "relevant_methods": ["any_method"],
                    "hypothesis": "Claude has decided this is certain - trust the judgment",
                    "confidence": "certain",  # Should always be trusted
                    "continuation_id": "debug-trust-uuid",
                }
            )

        # Verify certain is always trusted
        response = json.loads(result[0].text)

        # Should proceed with certain confidence
        assert response["status"] == "certain_confidence_proceed_with_fix"
        assert response["investigation_complete"] is True
        assert response["skip_expert_analysis"] is True

        # Expert analysis should be skipped
        assert response["expert_analysis"]["status"] == "skipped_due_to_certain_confidence"

        # Next steps should guide Claude to implement fix directly
        assert "CERTAIN confidence" in response["next_steps"]

    @pytest.mark.asyncio
    async def test_regular_high_confidence_still_uses_expert_analysis(self):
        """Test that regular 'high' confidence still triggers expert analysis."""

        # Set up investigation state
        self.tool.initial_issue = "Session validation issue"
        self.tool.investigation_history = [
            {
                "step_number": 1,
                "step": "Initial investigation",
                "findings": "Found session issue",
                "files_checked": [],
                "relevant_files": [],
                "relevant_methods": [],
                "hypothesis": None,
                "confidence": "low",
            }
        ]
        self.tool.consolidated_findings = {
            "files_checked": set(),
            "relevant_files": {"/api/sessions.py"},
            "relevant_methods": {"SessionManager.validate"},
            "findings": ["Step 1: Found session issue"],
            "hypotheses": [],
            "images": [],
        }

        # Mock expert analysis
        mock_expert_response = {
            "status": "analysis_complete",
            "summary": "Expert analysis of session validation",
            "hypotheses": [
                {
                    "name": "SESSION_VALIDATION_BUG",
                    "confidence": "High",
                    "root_cause": "Session timeout not properly handled",
                }
            ],
        }

        # Final step with regular 'high' confidence (should trigger expert analysis)
        with patch("utils.conversation_memory.add_turn"):
            with patch.object(self.tool, "_call_expert_analysis", return_value=mock_expert_response):
                with patch.object(self.tool, "_prepare_file_content_for_prompt", return_value=("file content", 100)):
                    result = await self.tool.execute(
                        {
                            "step": "Identified likely root cause",
                            "step_number": 2,
                            "total_steps": 2,
                            "next_step_required": False,  # Final step
                            "findings": "Session validation fails when timeout occurs during user activity",
                            "files_checked": ["/api/sessions.py"],
                            "relevant_files": ["/api/sessions.py"],
                            "relevant_methods": ["SessionManager.validate", "SessionManager.cleanup"],
                            "hypothesis": "Session timeout handling bug causes validation failures",
                            "confidence": "high",  # Regular high confidence, NOT certain
                            "continuation_id": "debug-regular-uuid",
                        }
                    )

        # Verify expert analysis was called (not skipped)
        response = json.loads(result[0].text)

        # Should call expert analysis normally
        assert response["status"] == "calling_expert_analysis"
        assert response["investigation_complete"] is True
        assert "skip_expert_analysis" not in response  # Should not be present

        # Expert analysis should be present with real results
        assert response["expert_analysis"]["status"] == "analysis_complete"
        assert response["expert_analysis"]["summary"] == "Expert analysis of session validation"

        # Next steps should indicate normal investigation completion (not certain confidence)
        assert "INVESTIGATION IS COMPLETE" in response["next_steps"]
        assert "certain" not in response["next_steps"].lower()

    def test_certain_confidence_schema_requirements(self):
        """Test that certain confidence is properly described in schema for Claude's guidance."""

        # The schema description should guide Claude on proper certain usage
        schema = self.tool.get_input_schema()
        confidence_description = schema["properties"]["confidence"]["description"]

        # Should emphasize it's only when root cause and fix are confirmed
        assert "root cause" in confidence_description.lower()
        assert "minimal fix" in confidence_description.lower()
        assert "confirmed" in confidence_description.lower()

        # Should emphasize trust in Claude's judgment
        assert "absolutely" in confidence_description.lower() or "certain" in confidence_description.lower()

        # Should mention no thought-partner assistance needed
        assert "thought-partner" in confidence_description.lower() or "assistance" in confidence_description.lower()

    @pytest.mark.asyncio
    async def test_confidence_enum_validation(self):
        """Test that certain is properly included in confidence enum validation."""

        # Valid confidence values should not raise errors
        valid_confidences = ["low", "medium", "high", "certain"]

        for confidence in valid_confidences:
            # This should not raise validation errors
            with patch("utils.conversation_memory.create_thread", return_value="test-uuid"):
                with patch("utils.conversation_memory.add_turn"):
                    result = await self.tool.execute(
                        {
                            "step": f"Test step with {confidence} confidence",
                            "step_number": 1,
                            "total_steps": 1,
                            "next_step_required": False,
                            "findings": "Test findings",
                            "confidence": confidence,
                        }
                    )

            # Should get valid response
            response = json.loads(result[0].text)
            assert "error" not in response or response.get("status") != "investigation_failed"

    def test_tool_schema_includes_certain(self):
        """Test that the tool schema properly includes certain in confidence enum."""
        schema = self.tool.get_input_schema()

        confidence_property = schema["properties"]["confidence"]
        assert confidence_property["type"] == "string"
        assert "certain" in confidence_property["enum"]
        assert confidence_property["enum"] == ["exploring", "low", "medium", "high", "certain"]

        # Check that description explains certain usage
        description = confidence_property["description"]
        assert "certain" in description.lower()
        assert "root cause" in description.lower()
        assert "minimal fix" in description.lower()
        assert "thought-partner" in description.lower()

    @pytest.mark.asyncio
    async def test_certain_confidence_preserves_investigation_data(self):
        """Test that certain confidence path preserves all investigation data properly."""

        # Multi-step investigation leading to certain
        with patch("utils.conversation_memory.create_thread", return_value="preserve-data-uuid"):
            with patch("utils.conversation_memory.add_turn"):
                # Step 1
                await self.tool.execute(
                    {
                        "step": "Initial investigation of login failure",
                        "step_number": 1,
                        "total_steps": 3,
                        "next_step_required": True,
                        "findings": "Users can't log in after password reset",
                        "files_checked": ["/auth/password.py"],
                        "relevant_files": ["/auth/password.py"],
                        "confidence": "low",
                    }
                )

                # Step 2
                await self.tool.execute(
                    {
                        "step": "Examining password validation logic",
                        "step_number": 2,
                        "total_steps": 3,
                        "next_step_required": True,
                        "findings": "Password hash function not imported correctly",
                        "files_checked": ["/auth/password.py", "/utils/crypto.py"],
                        "relevant_files": ["/auth/password.py"],
                        "relevant_methods": ["PasswordManager.validate_password"],
                        "hypothesis": "Import statement issue",
                        "confidence": "medium",
                        "continuation_id": "preserve-data-uuid",
                    }
                )

                # Step 3: Final with certain
                result = await self.tool.execute(
                    {
                        "step": "Found exact issue and fix",
                        "step_number": 3,
                        "total_steps": 3,
                        "next_step_required": False,
                        "findings": "Missing 'from utils.crypto import hash_password' at line 5",
                        "files_checked": ["/auth/password.py", "/utils/crypto.py"],
                        "relevant_files": ["/auth/password.py"],
                        "relevant_methods": ["PasswordManager.validate_password", "hash_password"],
                        "hypothesis": "Missing import statement for hash_password function",
                        "confidence": "certain",
                        "continuation_id": "preserve-data-uuid",
                    }
                )

        # Verify all investigation data is preserved
        response = json.loads(result[0].text)

        assert response["status"] == "certain_confidence_proceed_with_fix"

        investigation = response["complete_investigation"]
        assert investigation["steps_taken"] == 3
        assert len(investigation["files_examined"]) == 2  # Both files from all steps
        assert "/auth/password.py" in investigation["files_examined"]
        assert "/utils/crypto.py" in investigation["files_examined"]
        assert len(investigation["relevant_files"]) == 1
        assert len(investigation["relevant_methods"]) == 2
        assert investigation["confidence_level"] == "certain"

        # Should have complete investigation summary
        assert "SYSTEMATIC INVESTIGATION SUMMARY" in investigation["investigation_summary"]
        assert (
            "Steps taken: 3" in investigation["investigation_summary"]
            or "Total steps: 3" in investigation["investigation_summary"]
        )
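To run just these new unit tests locally (assuming pytest and pytest-asyncio are installed, as the asyncio markers above imply), something along these lines should work:

```python
import pytest

# Select only the new certain-confidence tests; "-v" prints one line per test.
raise SystemExit(pytest.main(["tests/test_debug_certain_confidence.py", "-v"]))
```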
@@ -163,7 +163,7 @@ class TestImageSupportIntegration:
        images_field = schema["properties"]["images"]
        assert images_field["type"] == "array"
        assert images_field["items"]["type"] == "string"
-       assert "error screens" in images_field["description"].lower()
+       assert "screenshots" in images_field["description"].lower()

    def test_tool_image_validation_limits(self):
        """Test that tools validate image size limits using real provider resolution."""
@@ -21,47 +21,62 @@ logger = logging.getLogger(__name__)

# Field descriptions for the investigation steps
DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS = {
    "step": (
-        "Describe what you're currently investigating by beginning to think deeply about the issue, its root cause"
-        "and possible reasons. Prepare and learn about the related code first. In step 1, clearly state the issue to investigate and begin "
-        "thinking deeply about not just the described issue, but possible underlying causes, side-effects, or external "
-        "components that might contribute to it. Follow the code flow carefully—bugs may originate "
-        "in one part of the code-dependencies, or upstream logic may not be immediately visible. Bugs and issues can "
-        "arise due to poor logic, incorrect assumptions, bad input or failures elsewhere."
-        "In all subsequent steps, continue uncovering relevant code, examining patterns, and formulating hypotheses "
-        "with deliberate attention to detail."
+        "Describe what you're currently investigating by thinking deeply about the issue and its possible causes. "
+        "In step 1, clearly state the issue and begin forming an investigative direction. Consider not only obvious "
+        "failures, but also subtle contributing factors like upstream logic, invalid inputs, missing preconditions, "
+        "or hidden side effects. Map out the flow of related functions or modules. Identify call paths where input "
+        "values or branching logic could cause instability. In concurrent systems, watch for race conditions, shared "
+        "state, or timing dependencies. In all later steps, continue exploring with precision: trace deeper "
+        "dependencies, verify hypotheses, and adapt your understanding as you uncover more evidence."
    ),
-    "step_number": (
-        "The index of the current step in the investigation sequence, beginning at 1. Each step should build upon or "
-        "revise the previous one."
-    ),
-    "total_steps": (
-        "Your current estimate for how many steps will be needed to complete the investigation. Adjust as new findings emerge."
-    ),
-    "next_step_required": (
-        "Set to true if you plan to continue the investigation with another step. False means you believe the root "
-        "cause is known or the investigation is complete."
-    ),
+    "step_number": "Current step number in the investigation sequence (starts at 1).",
+    "total_steps": "Estimate of total investigation steps expected (adjustable as the process evolves).",
+    "next_step_required": "Whether another investigation step is needed after this one.",
    "findings": (
-        "Summarize discoveries in this step. Think critically and include relevant code behavior, suspicious patterns, "
-        "evidence collected, and any partial conclusions or leads."
+        "Summarize everything discovered in this step. Include new clues, unexpected behavior, evidence from code or "
+        "logs, or disproven theories. Be specific and avoid vague language—document what you now know and how it "
+        "affects your hypothesis. In later steps, confirm or disprove past findings with reason."
    ),
    "files_checked": (
        "List all files (as absolute paths, do not clip or shrink file names) examined during the investigation so far. "
        "Include even files ruled out, as this tracks your exploration path."
    ),
    "relevant_files": (
-        "Subset of files_checked (as full absolute paths) that contain code directly relevant to the issue. Only list those that are directly tied to the root cause or its effects."
+        "Subset of files_checked (as full absolute paths) that contain code directly relevant to the issue. Only list "
+        "those that are directly tied to the root cause or its effects. This could include the cause, trigger, or "
+        "place of manifestation."
    ),
    "relevant_methods": (
-        "List specific methods/functions clearly tied to the issue. Use 'ClassName.methodName' or 'functionName' format."
+        "List methods or functions that are central to the issue, in the format 'ClassName.methodName' or 'functionName'. "
+        "Prioritize those that influence or process inputs, drive branching, or pass state between modules."
    ),
    "hypothesis": (
-        "Formulate your current best guess about the underlying cause. This is a working theory and may evolve based on further evidence."
+        "A concrete theory for what's causing the issue based on the evidence so far. This can include suspected "
+        "failures, incorrect assumptions, or violated constraints. You are encouraged to revise or abandon it in later "
+        "steps as needed."
    ),
    "confidence": (
-        "How confident you are in the current hypothesis: "
-        "'low' (initial theory), 'medium' (good evidence), 'high' (strong to very strong evidence), "
-        "'nailedit' (ONLY use for final step and ONLY when you have found the EXACT root cause with 100% certainty AND "
-        "identified a simple, minimal fix that requires no expert consultation. Use this ONLY "
-        "for obvious bugs and logic errors that you ABSOLUTELY are certain about and have no doubts because you have"
-        "successfully mapped out the code flow and the root cause behind the issue."
+        "Indicate your current confidence in the hypothesis. Use: 'exploring' (starting out), 'low' (early idea), "
+        "'medium' (some supporting evidence), 'high' (strong evidence), 'certain' (only when the root cause and minimal "
+        "fix are both confirmed). Do NOT use 'certain' unless the issue can be fully resolved with a fix, use 'high' "
+        "instead when in doubt. Using 'certain' prevents you from taking assistance from another thought-partner."
    ),
-    "backtrack_from_step": (
-        "If an earlier finding or hypothesis needs to be revised or discarded, specify the step number from which to "
-        "start over. Use this to acknowledge investigative dead ends and correct the course."
-    ),
+    "backtrack_from_step": "If a previous step needs revision, specify the step number to backtrack from.",
    "continuation_id": "Continuation token used for linking multi-step investigations and continuing conversations after discovery.",
    "images": (
-        "Optional. Include full absolute paths to visual debugging images (UI issues, logs, error screens) that help clarify the issue."
+        "Optional list of absolute paths to screenshots or UI visuals that clarify the issue. "
+        "Only include if they materially assist understanding or hypothesis formulation."
    ),
}
@@ -204,7 +219,7 @@ class DebugIssueTool(BaseTool):
            },
            "confidence": {
                "type": "string",
-               "enum": ["low", "medium", "high", "nailedit"],
+               "enum": ["exploring", "low", "medium", "high", "certain"],
                "description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["confidence"],
            },
            "backtrack_from_step": {
@@ -356,9 +371,9 @@ class DebugIssueTool(BaseTool):
            response_data["investigation_complete"] = True

            # Check if Claude has absolute certainty and can proceed with minimal fix
-           if request.confidence == "nailedit":
-               # Trust Claude's judgment completely - if it says nailedit, skip expert analysis
-               response_data["status"] = "nailedit_confidence_proceed_with_fix"
+           if request.confidence == "certain":
+               # Trust Claude's judgment completely - if it says certain, skip expert analysis
+               response_data["status"] = "certain_confidence_proceed_with_fix"

                investigation_summary = self._prepare_investigation_summary()
                response_data["complete_investigation"] = {
@@ -369,20 +384,21 @@ class DebugIssueTool(BaseTool):
                    "relevant_methods": list(self.consolidated_findings["relevant_methods"]),
                    "investigation_summary": investigation_summary,
                    "final_hypothesis": request.hypothesis,
-                   "confidence_level": "nailedit",
+                   "confidence_level": "certain",
                }
                response_data["next_steps"] = (
-                   "Investigation complete with NAILED-IT confidence. You have identified the exact "
-                   "root cause and a minimal fix. Proceed directly with implementing the simple fix "
-                   "without requiring expert consultation. Focus on the precise, minimal change needed."
+                   "Investigation complete with CERTAIN confidence. You have identified the exact "
+                   "root cause and a minimal fix. MANDATORY: Present the user with the root cause analysis "
+                   "and IMMEDIATELY proceed with implementing the simple fix without requiring further "
+                   "consultation. Focus on the precise, minimal change needed."
                )
                response_data["skip_expert_analysis"] = True
                response_data["expert_analysis"] = {
-                   "status": "skipped_due_to_nailedit_confidence",
+                   "status": "skipped_due_to_certain_confidence",
                    "reason": "Claude identified exact root cause with minimal fix requirement",
                }
            else:
-               # Standard expert analysis for high/medium/low confidence
+               # Standard expert analysis for exploring/low/medium/high confidence
                response_data["status"] = "calling_expert_analysis"

                # Prepare consolidated investigation summary

@@ -413,9 +429,11 @@ class DebugIssueTool(BaseTool):
                    "investigation_summary": investigation_summary,
                }
                response_data["next_steps"] = (
-                   "Investigation complete with expert analysis. Present the findings, hypotheses, "
-                   "and recommended fixes to the user. Focus on the most likely root cause and "
-                   "provide actionable implementation guidance."
+                   "INVESTIGATION IS COMPLETE. YOU MUST now summarize and present ALL key findings, confirmed "
+                   "hypotheses, and exact recommended fixes. Clearly identify the most likely root cause and "
+                   "provide concrete, actionable implementation guidance. Highlight affected code paths and display "
+                   "reasoning that led to this conclusion—make it easy for a developer to understand exactly where "
+                   "the problem lies."
                )
            else:
                response_data["next_steps"] = (
@@ -1,11 +0,0 @@
-"""
-Zen MCP Server - Entry point
-
-The main implementation is in server.py
-"""
-
-import asyncio
-
-from server import main
-
-if __name__ == "__main__":
-    asyncio.run(main())