diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4513ec6..810b72e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -53,8 +53,8 @@ jobs: pip install -r requirements-dev.txt - name: Run black formatter check - run: black --check . + run: black --check . --exclude="test_simulation_files/" - name: Run ruff linter - run: ruff check . + run: ruff check . --exclude test_simulation_files diff --git a/README.md b/README.md index f5162af..946ba62 100644 --- a/README.md +++ b/README.md @@ -370,7 +370,10 @@ Nice! **[📖 Read More](docs/tools/precommit.md)** - Multi-repository validation and change analysis ### 7. `debug` - Expert Debugging Assistant -Systematic investigation-guided debugging that walks Claude through step-by-step root cause analysis. Claude performs methodical code examination, evidence collection, and hypothesis formation before receiving expert analysis from the selected AI model. +Systematic investigation-guided debugging that walks Claude through step-by-step root cause analysis. Claude performs +methodical code examination, evidence collection, and hypothesis formation before receiving expert analysis from the +selected AI model. When Claude's confidence reaches **100% certainty** during the investigative workflow, expert analysis +via another model is skipped to save on tokens and cost, and Claude proceeds directly to fixing the issue. ``` See logs under /Users/me/project/diagnostics.log and related code under the sync folder. Logs show that sync diff --git a/code_quality_checks.sh b/code_quality_checks.sh index 4852f9c..d88da5c 100755 --- a/code_quality_checks.sh +++ b/code_quality_checks.sh @@ -67,16 +67,16 @@ echo "📋 Step 1: Running Linting and Formatting Checks" echo "--------------------------------------------------" echo "🔧 Running ruff linting with auto-fix..." -$RUFF check --fix +$RUFF check --fix --exclude test_simulation_files echo "🎨 Running black code formatting..." -$BLACK . +$BLACK . --exclude="test_simulation_files/" echo "📦 Running import sorting with isort..." -$ISORT . --skip-glob=".zen_venv/*" +$ISORT . --skip-glob=".zen_venv/*" --skip-glob="test_simulation_files/*" echo "✅ Verifying all linting passes..." -$RUFF check +$RUFF check --exclude test_simulation_files echo "✅ Step 1 Complete: All linting and formatting checks passed!" echo "" diff --git a/config.py b/config.py index 4ac8ad1..6856d83 100644 --- a/config.py +++ b/config.py @@ -14,7 +14,7 @@ import os # These values are used in server responses and for tracking releases # IMPORTANT: This is the single source of truth for version and author info # Semantic versioning: MAJOR.MINOR.PATCH -__version__ = "5.2.1" +__version__ = "5.2.2" # Last update date in ISO format __updated__ = "2025-06-19" # Primary maintainer diff --git a/docs/tools/debug.md b/docs/tools/debug.md index b5bfccb..ddec9a3 100644 --- a/docs/tools/debug.md +++ b/docs/tools/debug.md @@ -27,7 +27,8 @@ The debug tool implements a **systematic investigation methodology** where Claud 5. 
**Completion**: Once investigation is thorough, Claude signals completion
 
 **Expert Analysis Phase:**
-After Claude completes the investigation, the tool automatically calls the selected AI model with:
+After Claude completes the investigation, the tool automatically calls the selected AI model (unless confidence is
+**certain**, in which case expert analysis is bypassed) with:
 - Complete investigation summary with all steps and findings
 - Relevant files and methods identified during investigation
 - Final hypothesis and confidence assessment
diff --git a/server.py b/server.py
index bf729a6..d79ad28 100644
--- a/server.py
+++ b/server.py
@@ -122,7 +122,7 @@ try:
     file_handler = RotatingFileHandler(
         log_dir / "mcp_server.log",
         maxBytes=20 * 1024 * 1024,  # 20MB max file size
-        backupCount=10,  # Keep 10 rotated files (200MB total)
+        backupCount=5,  # Keep 5 rotated files (100MB total)
         encoding="utf-8",
     )
     file_handler.setLevel(getattr(logging, log_level, logging.INFO))
@@ -133,8 +133,8 @@ try:
     mcp_logger = logging.getLogger("mcp_activity")
     mcp_file_handler = RotatingFileHandler(
         log_dir / "mcp_activity.log",
-        maxBytes=20 * 1024 * 1024,  # 20MB max file size
-        backupCount=5,  # Keep 5 rotated files (100MB total)
+        maxBytes=10 * 1024 * 1024,  # 10MB max file size
+        backupCount=2,  # Keep 2 rotated files (20MB total)
         encoding="utf-8",
     )
     mcp_file_handler.setLevel(logging.INFO)
diff --git a/simulator_tests/__init__.py b/simulator_tests/__init__.py
index 5f86d71..e1e49a3 100644
--- a/simulator_tests/__init__.py
+++ b/simulator_tests/__init__.py
@@ -14,6 +14,7 @@ from .test_content_validation import ContentValidationTest
 from .test_conversation_chain_validation import ConversationChainValidationTest
 from .test_cross_tool_comprehensive import CrossToolComprehensiveTest
 from .test_cross_tool_continuation import CrossToolContinuationTest
+from .test_debug_certain_confidence import DebugCertainConfidenceTest
 from .test_debug_validation import DebugValidationTest
 from .test_line_number_validation import LineNumberValidationTest
 from .test_logs_validation import LogsValidationTest
@@ -55,6 +56,7 @@ TEST_REGISTRY = {
     "testgen_validation": TestGenValidationTest,
     "refactor_validation": RefactorValidationTest,
     "debug_validation": DebugValidationTest,
+    "debug_certain_confidence": DebugCertainConfidenceTest,
     "conversation_chain_validation": ConversationChainValidationTest,
     "vision_capability": VisionCapabilityTest,
     "xai_models": XAIModelsTest,
@@ -86,6 +88,7 @@ __all__ = [
     "TestGenValidationTest",
     "RefactorValidationTest",
     "DebugValidationTest",
+    "DebugCertainConfidenceTest",
     "ConversationChainValidationTest",
     "VisionCapabilityTest",
     "XAIModelsTest",
diff --git a/simulator_tests/test_debug_certain_confidence.py b/simulator_tests/test_debug_certain_confidence.py
new file mode 100644
index 0000000..256b11b
--- /dev/null
+++ b/simulator_tests/test_debug_certain_confidence.py
@@ -0,0 +1,560 @@
+#!/usr/bin/env python3
+"""
+Debug Tool Certain Confidence Simulator Test
+
+Tests the debug tool's 'certain' confidence feature in a realistic simulation:
+- Multi-step investigation leading to certain confidence
+- Validation that expert analysis is skipped for obvious bugs
+- Verification that certain confidence is always trusted
+- Ensures token optimization works correctly for minimal fixes
+"""
+
+import json
+from typing import Optional
+
+from .conversation_base_test import ConversationBaseTest
+
+
+class DebugCertainConfidenceTest(ConversationBaseTest):
+    """Test debug tool's certain confidence optimization feature"""
+
+    
@property
+    def test_name(self) -> str:
+        return "debug_certain_confidence"
+
+    @property
+    def test_description(self) -> str:
+        return "Debug tool certain confidence optimization validation"
+
+    def run_test(self) -> bool:
+        """Test debug tool certain confidence capabilities"""
+        # Set up the test environment
+        self.setUp()
+
+        try:
+            self.logger.info("Test: Debug tool certain confidence validation")
+
+            # Create test files with obvious bugs for certain scenarios
+            self._create_obvious_bug_scenarios()
+
+            # Test 1: Obvious import error with certain confidence
+            if not self._test_obvious_import_error_certain():
+                return False
+
+            # Test 2: Certain confidence is always trusted
+            if not self._test_certain_always_trusted():
+                return False
+
+            # Test 3: Regular high confidence still triggers expert analysis
+            if not self._test_regular_high_confidence_expert_analysis():
+                return False
+
+            # Test 4: Multi-step investigation ending in certain
+            if not self._test_multi_step_investigation_certain():
+                return False
+
+            self.logger.info("  ✅ All debug certain confidence tests passed")
+            return True
+
+        except Exception as e:
+            self.logger.error(f"Debug certain confidence test failed: {e}")
+            return False
+
+    def _create_obvious_bug_scenarios(self):
+        """Create test files with obvious bugs perfect for certain confidence"""
+
+        # Scenario 1: Missing import statement (very obvious)
+        missing_import_code = """#!/usr/bin/env python3
+import os
+import sys
+# import hashlib  # <-- Missing import!
+
+class UserAuth:
+    def __init__(self, secret_key):
+        self.secret_key = secret_key
+
+    def hash_password(self, password):
+        # This will fail with NameError: name 'hashlib' is not defined
+        salt = os.urandom(32)
+        return hashlib.pbkdf2_hmac('sha256', password.encode(), salt, 100000)
+
+    def verify_password(self, password, stored_hash):
+        # This function also uses hashlib
+        return hashlib.pbkdf2_hmac('sha256', password.encode(), stored_hash[:32], 100000) == stored_hash[32:]
+"""
+
+        # Scenario 2: Typo in method name (obvious once spotted)
+        typo_bug_code = """#!/usr/bin/env python3
+class Calculator:
+    def __init__(self):
+        self.history = []
+
+    def add_numbers(self, a, b):
+        result = a + b
+        self.history.append(f"{a} + {b} = {result}")
+        return result
+
+    def calculate_total(self, numbers):
+        total = 0
+        for num in numbers:
+            # Typo: should be add_numbers, not add_number
+            total = self.add_number(total, num)  # AttributeError: 'Calculator' object has no attribute 'add_number'
+        return total
+"""
+
+        # Scenario 3: Indentation error (Python syntax error)
+        indentation_error_code = """#!/usr/bin/env python3
+def process_data(data_list):
+    results = []
+    for item in data_list:
+        if item > 0:
+            processed = item * 2
+          results.append(processed)  # IndentationError: unindent does not match any outer indentation level
+    return results
+
+def main():
+    data = [1, 2, 3, 4, 5]
+    print(process_data(data))
+"""
+
+        # Create test files
+        self.missing_import_file = self.create_additional_test_file("user_auth.py", missing_import_code)
+        self.typo_bug_file = self.create_additional_test_file("calculator.py", typo_bug_code)
+        self.indentation_file = self.create_additional_test_file("data_processor.py", indentation_error_code)
+
+        self.logger.info("  ✅ Created obvious bug scenarios:")
+        self.logger.info(f"    - Missing import: {self.missing_import_file}")
+        self.logger.info(f"    - Method typo: {self.typo_bug_file}")
+        self.logger.info(f"    - Indentation error: {self.indentation_file}")
+
+        # Create error logs for context
+        import_error_log = """ERROR: User authentication 
failing during login +Traceback (most recent call last): + File "user_auth.py", line 12, in hash_password + return hashlib.pbkdf2_hmac('sha256', password.encode(), salt, 100000) +NameError: name 'hashlib' is not defined + +This happens every time a user tries to log in. The error occurs in the password hashing function. +""" + + self.error_log_file = self.create_additional_test_file("error.log", import_error_log) + self.logger.info(f" - Error log: {self.error_log_file}") + + def _test_obvious_import_error_certain(self) -> bool: + """Test certain confidence with obvious missing import error""" + try: + self.logger.info(" 1.1: Testing obvious import error with certain confidence") + + # Step 1: Initial investigation + self.logger.info(" 1.1.1: Step 1 - Initial problem description") + response1, continuation_id = self.call_mcp_tool_direct( + "debug", + { + "step": "Investigating NameError in user authentication - users cannot log in due to 'name hashlib is not defined' error.", + "step_number": 1, + "total_steps": 2, + "next_step_required": True, + "findings": "NameError occurs in hash_password method when trying to use hashlib.pbkdf2_hmac. Error happens on every login attempt.", + "files_checked": [self.error_log_file], + "relevant_files": [self.error_log_file], + "hypothesis": "Missing import statement for hashlib module", + "confidence": "medium", + }, + ) + + if not response1 or not continuation_id: + self.logger.error("Failed to get initial investigation response") + return False + + response1_data = self._parse_debug_response(response1) + if not self._validate_investigation_response(response1_data, 1, True, "investigation_in_progress"): + return False + + self.logger.info(f" ✅ Step 1 successful, continuation_id: {continuation_id}") + + # Step 2: Examine code and identify obvious fix - use certain confidence + self.logger.info(" 1.1.2: Step 2 - Found exact issue and simple fix (certain)") + response2, _ = self.call_mcp_tool_direct( + "debug", + { + "step": "Found the exact issue and the minimal fix required", + "step_number": 2, + "total_steps": 2, + "next_step_required": False, # Final step + "findings": "Missing 'import hashlib' statement at the top of user_auth.py file. The error occurs because hashlib is used in hash_password() method on line 12 but never imported. 
Simple one-line fix: add 'import hashlib' after line 2.", + "files_checked": [self.error_log_file, self.missing_import_file], + "relevant_files": [self.missing_import_file], + "relevant_methods": ["UserAuth.hash_password", "UserAuth.verify_password"], + "hypothesis": "Missing 'import hashlib' statement causes NameError when hash_password method executes", + "confidence": "certain", # Use certain - should skip expert analysis + "continuation_id": continuation_id, + "model": "flash", # Specify model for consistency + }, + ) + + if not response2: + self.logger.error("Failed to complete investigation with certain confidence") + return False + + response2_data = self._parse_debug_response(response2) + if not response2_data: + return False + + # Validate certain response structure + expected_status = "certain_confidence_proceed_with_fix" + if response2_data.get("status") != expected_status: + self.logger.error(f"Expected status '{expected_status}', got '{response2_data.get('status')}'") + return False + + if not response2_data.get("investigation_complete"): + self.logger.error("Expected investigation_complete=true for certain confidence") + return False + + if not response2_data.get("skip_expert_analysis"): + self.logger.error("Expected skip_expert_analysis=true for certain confidence") + return False + + # Verify expert analysis is marked as skipped + expert_analysis = response2_data.get("expert_analysis", {}) + if expert_analysis.get("status") != "skipped_due_to_certain_confidence": + self.logger.error("Expert analysis should be marked as skipped for certain confidence") + return False + + # Check for proper investigation summary + complete_investigation = response2_data.get("complete_investigation", {}) + if complete_investigation.get("confidence_level") != "certain": + self.logger.error("Expected confidence_level='certain' in complete investigation") + return False + + if complete_investigation.get("steps_taken") != 2: + self.logger.error("Expected steps_taken=2 in complete investigation") + return False + + # Verify next steps guidance + next_steps = response2_data.get("next_steps", "") + if "CERTAIN confidence" not in next_steps: + self.logger.error("Expected 'CERTAIN confidence' in next_steps guidance") + return False + + if "minimal fix" not in next_steps: + self.logger.error("Expected 'minimal fix' guidance in next_steps") + return False + + self.logger.info(" ✅ Certain confidence skipped expert analysis correctly") + return True + + except Exception as e: + self.logger.error(f"Obvious import error certain test failed: {e}") + return False + + def _test_certain_always_trusted(self) -> bool: + """Test that certain confidence is always trusted regardless of complexity""" + try: + self.logger.info(" 1.2: Testing that certain confidence is always trusted") + + # Single step investigation with certain - should always be trusted + self.logger.info(" 1.2.1: Direct certain confidence (always trusted)") + response, _ = self.call_mcp_tool_direct( + "debug", + { + "step": "Found the exact root cause and minimal fix for this complex issue", + "step_number": 1, + "total_steps": 1, + "next_step_required": False, # Final step + "findings": "After thorough investigation, identified that the issue is caused by method name typo in Calculator.calculate_total() - calls self.add_number() instead of self.add_numbers(). 
Simple fix: change line 14 from 'add_number' to 'add_numbers'.", + "files_checked": [self.typo_bug_file], + "relevant_files": [self.typo_bug_file], + "relevant_methods": ["Calculator.calculate_total", "Calculator.add_numbers"], + "hypothesis": "Method name typo in calculate_total() calls non-existent add_number() instead of add_numbers()", + "confidence": "certain", # Should always be trusted + "model": "flash", + }, + ) + + if not response: + self.logger.error("Failed to get certain confidence response") + return False + + response_data = self._parse_debug_response(response) + if not response_data: + return False + + # Verify certain is trusted regardless of complexity + if response_data.get("status") != "certain_confidence_proceed_with_fix": + self.logger.error("Certain confidence should always be trusted") + return False + + if not response_data.get("skip_expert_analysis"): + self.logger.error("Expert analysis should be skipped for certain confidence") + return False + + # Ensure expert analysis is marked as skipped + expert_analysis = response_data.get("expert_analysis", {}) + if expert_analysis.get("status") != "skipped_due_to_certain_confidence": + self.logger.error("Expert analysis status should indicate certain skip") + return False + + self.logger.info(" ✅ Certain confidence always trusted correctly") + return True + + except Exception as e: + self.logger.error(f"Certain always trusted test failed: {e}") + return False + + def _test_regular_high_confidence_expert_analysis(self) -> bool: + """Test that regular 'high' confidence still triggers expert analysis""" + try: + self.logger.info(" 1.3: Testing that regular 'high' confidence triggers expert analysis") + + # Investigation with regular high confidence (not certain) + self.logger.info(" 1.3.1: High confidence (not certain) - should trigger expert analysis") + response, _ = self.call_mcp_tool_direct( + "debug", + { + "step": "Identified likely root cause with strong evidence", + "step_number": 1, + "total_steps": 1, + "next_step_required": False, # Final step + "findings": "IndentationError in data_processor.py line 8 - results.append(processed) is incorrectly indented. 
Should align with the 'if' statement above it.", + "files_checked": [self.indentation_file], + "relevant_files": [self.indentation_file], + "relevant_methods": ["process_data"], + "hypothesis": "Incorrect indentation causes IndentationError in process_data function", + "confidence": "high", # Regular high confidence, NOT certain + "model": "flash", + }, + ) + + if not response: + self.logger.error("Failed to get high confidence response") + return False + + response_data = self._parse_debug_response(response) + if not response_data: + return False + + # Verify that regular high confidence triggers expert analysis + if response_data.get("status") != "calling_expert_analysis": + self.logger.error( + f"Expected 'calling_expert_analysis' for high confidence, got '{response_data.get('status')}'" + ) + return False + + if response_data.get("skip_expert_analysis"): + self.logger.error("Expert analysis should NOT be skipped for regular high confidence") + return False + + # Verify expert analysis was called + expert_analysis = response_data.get("expert_analysis", {}) + if not expert_analysis: + self.logger.error("Expected expert analysis for regular high confidence") + return False + + # Check that expert analysis has content + if "status" not in expert_analysis: + self.logger.error("Expert analysis should have status field") + return False + + self.logger.info(" ✅ Regular high confidence triggers expert analysis correctly") + return True + + except Exception as e: + self.logger.error(f"Regular high confidence test failed: {e}") + return False + + def _test_multi_step_investigation_certain(self) -> bool: + """Test multi-step investigation that ends with certain confidence""" + try: + self.logger.info(" 1.4: Testing multi-step investigation ending with certain") + + # Step 1: Start investigation + self.logger.info(" 1.4.1: Step 1 - Initial investigation") + response1, continuation_id = self.call_mcp_tool_direct( + "debug", + { + "step": "Investigating Python syntax error in data processing module", + "step_number": 1, + "total_steps": 3, + "next_step_required": True, + "findings": "IndentationError reported when running data_processor.py - 'unindent does not match any outer indentation level'", + "files_checked": [self.indentation_file], + "relevant_files": [], + "hypothesis": "Indentation inconsistency in Python code", + "confidence": "low", + }, + ) + + if not response1 or not continuation_id: + self.logger.error("Failed to start multi-step investigation") + return False + + # Step 2: Examine code structure + self.logger.info(" 1.4.2: Step 2 - Code examination") + response2, _ = self.call_mcp_tool_direct( + "debug", + { + "step": "Examining the indentation structure in process_data function", + "step_number": 2, + "total_steps": 3, + "next_step_required": True, + "findings": "Found the issue: line 8 'results.append(processed)' is indented incorrectly. 
It should align with the 'if' statement, not be at the same level as the 'for' loop.", + "files_checked": [self.indentation_file], + "relevant_files": [self.indentation_file], + "relevant_methods": ["process_data"], + "hypothesis": "Line 8 has incorrect indentation level causing IndentationError", + "confidence": "medium", + "continuation_id": continuation_id, + }, + ) + + if not response2: + self.logger.error("Failed to continue to step 2") + return False + + # Step 3: Confirm fix with certain confidence + self.logger.info(" 1.4.3: Step 3 - Confirmed fix (certain)") + response3, _ = self.call_mcp_tool_direct( + "debug", + { + "step": "Confirmed the exact issue and simple fix", + "step_number": 3, + "total_steps": 3, + "next_step_required": False, # Final step + "findings": "Confirmed: line 8 'results.append(processed)' needs to be indented 4 more spaces to align with line 6 'if item > 0:'. This is a simple indentation fix.", + "files_checked": [self.indentation_file], + "relevant_files": [self.indentation_file], + "relevant_methods": ["process_data"], + "hypothesis": "IndentationError on line 8 due to incorrect indentation level - needs 4 more spaces", + "confidence": "certain", # Final step with certain + "continuation_id": continuation_id, + "model": "flash", + }, + ) + + if not response3: + self.logger.error("Failed to complete multi-step investigation") + return False + + response3_data = self._parse_debug_response(response3) + if not response3_data: + return False + + # Validate multi-step certain response + if response3_data.get("status") != "certain_confidence_proceed_with_fix": + self.logger.error("Expected certain status for final step") + return False + + if not response3_data.get("skip_expert_analysis"): + self.logger.error("Expected expert analysis to be skipped for certain") + return False + + # Verify investigation preserves steps (at least the current step) + complete_investigation = response3_data.get("complete_investigation", {}) + steps_taken = complete_investigation.get("steps_taken", 0) + if steps_taken < 1: + self.logger.error("Expected at least 1 step in complete investigation") + return False + + # Check that investigation summary includes progression + investigation_summary = complete_investigation.get("investigation_summary", "") + if "Total steps:" not in investigation_summary and "Steps taken:" not in investigation_summary: + self.logger.error("Investigation summary should show steps information") + return False + + self.logger.info(" ✅ Multi-step investigation with certain ending successful") + return True + + except Exception as e: + self.logger.error(f"Multi-step investigation certain test failed: {e}") + return False + + def call_mcp_tool_direct(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]: + """Call an MCP tool directly in-process to maintain conversation memory""" + try: + # Get the tool instance + if tool_name not in self._tools: + self.logger.error(f"Tool '{tool_name}' not found in available tools") + return None, None + + tool = self._tools[tool_name] + + # Execute the tool with proper async handling + loop = self._get_event_loop() + + # Call the tool's execute method + result = loop.run_until_complete(tool.execute(params)) + + if not result or len(result) == 0: + self.logger.error(f"Tool '{tool_name}' returned empty result") + return None, None + + # Extract the text content from the result + response_text = result[0].text if hasattr(result[0], "text") else str(result[0]) + + # Extract continuation_id from debug response 
if present + continuation_id = self._extract_debug_continuation_id(response_text) + + return response_text, continuation_id + + except Exception as e: + self.logger.error(f"Failed to call tool '{tool_name}' directly: {e}") + return None, None + + def _extract_debug_continuation_id(self, response_text: str) -> Optional[str]: + """Extract continuation_id from debug response""" + try: + response_data = json.loads(response_text) + return response_data.get("continuation_id") + except json.JSONDecodeError as e: + self.logger.debug(f"Failed to parse response for debug continuation_id: {e}") + return None + + def _parse_debug_response(self, response_text: str) -> dict: + """Parse debug tool JSON response""" + try: + return json.loads(response_text) + except json.JSONDecodeError as e: + self.logger.error(f"Failed to parse debug response as JSON: {e}") + self.logger.error(f"Response text: {response_text[:500]}...") + return {} + + def _validate_investigation_response( + self, + response_data: dict, + expected_step: int, + expected_next_required: bool, + expected_status: str, + ) -> bool: + """Validate debug investigation response structure""" + try: + # Check status + if response_data.get("status") != expected_status: + self.logger.error(f"Expected status '{expected_status}', got '{response_data.get('status')}'") + return False + + # Check step number + if response_data.get("step_number") != expected_step: + self.logger.error(f"Expected step_number {expected_step}, got {response_data.get('step_number')}") + return False + + # Check next_step_required + if response_data.get("next_step_required") != expected_next_required: + self.logger.error( + f"Expected next_step_required {expected_next_required}, got {response_data.get('next_step_required')}" + ) + return False + + # Basic structure checks + if "investigation_status" not in response_data: + self.logger.error("Missing investigation_status in response") + return False + + if not response_data.get("next_steps"): + self.logger.error("Missing next_steps guidance in response") + return False + + return True + + except Exception as e: + self.logger.error(f"Error validating investigation response: {e}") + return False diff --git a/tests/test_debug_certain_confidence.py b/tests/test_debug_certain_confidence.py new file mode 100644 index 0000000..1772170 --- /dev/null +++ b/tests/test_debug_certain_confidence.py @@ -0,0 +1,363 @@ +""" +Integration tests for the debug tool's 'certain' confidence feature. + +Tests the complete workflow where Claude identifies obvious bugs with absolute certainty +and can skip expensive expert analysis for minimal fixes. 
+"""
+
+import json
+from unittest.mock import patch
+
+import pytest
+
+from tools.debug import DebugIssueTool
+
+
+class TestDebugCertainConfidence:
+    """Integration tests for certain confidence optimization."""
+
+    def setup_method(self):
+        """Set up test tool instance."""
+        self.tool = DebugIssueTool()
+
+    @pytest.mark.asyncio
+    async def test_certain_confidence_skips_expert_analysis(self):
+        """Test that certain confidence with valid minimal fix skips expert analysis."""
+        # Simulate a multi-step investigation ending with certain confidence
+
+        # Step 1: Initial investigation
+        with patch("utils.conversation_memory.create_thread", return_value="debug-certain-uuid"):
+            with patch("utils.conversation_memory.add_turn"):
+                result1 = await self.tool.execute(
+                    {
+                        "step": "Investigating Python NameError in user authentication module",
+                        "step_number": 1,
+                        "total_steps": 2,
+                        "next_step_required": True,
+                        "findings": "Users cannot log in, getting 'NameError: name hashlib is not defined'",
+                        "files_checked": ["/auth/user_auth.py"],
+                        "relevant_files": ["/auth/user_auth.py"],
+                        "hypothesis": "Missing import statement",
+                        "confidence": "medium",
+                        "continuation_id": None,
+                    }
+                )
+
+        # Verify step 1 response
+        response1 = json.loads(result1[0].text)
+        assert response1["status"] == "investigation_in_progress"
+        assert response1["step_number"] == 1
+        continuation_id = response1["continuation_id"]
+
+        # Step 2: Final step with certain confidence (simple import fix)
+        with patch("utils.conversation_memory.add_turn"):
+            result2 = await self.tool.execute(
+                {
+                    "step": "Found the exact issue and fix",
+                    "step_number": 2,
+                    "total_steps": 2,
+                    "next_step_required": False,  # Final step
+                    "findings": "Missing 'import hashlib' statement at top of user_auth.py file, line 3. 
Simple one-line fix required.",
+                    "files_checked": ["/auth/user_auth.py"],
+                    "relevant_files": ["/auth/user_auth.py"],
+                    "relevant_methods": ["UserAuth.hash_password"],
+                    "hypothesis": "Missing import hashlib statement causes NameError when hash_password method is called",
+                    "confidence": "certain",  # CERTAIN confidence - should skip expert analysis
+                    "continuation_id": continuation_id,
+                }
+            )
+
+        # Verify final response skipped expert analysis
+        response2 = json.loads(result2[0].text)
+
+        # Should indicate certain confidence was used
+        assert response2["status"] == "certain_confidence_proceed_with_fix"
+        assert response2["investigation_complete"] is True
+        assert response2["skip_expert_analysis"] is True
+
+        # Expert analysis should be marked as skipped
+        assert response2["expert_analysis"]["status"] == "skipped_due_to_certain_confidence"
+        assert (
+            response2["expert_analysis"]["reason"] == "Claude identified exact root cause with minimal fix requirement"
+        )
+
+        # Should have complete investigation summary
+        assert "complete_investigation" in response2
+        assert response2["complete_investigation"]["confidence_level"] == "certain"
+        assert response2["complete_investigation"]["steps_taken"] == 2
+
+        # Next steps should guide Claude to implement the fix directly
+        assert "CERTAIN confidence" in response2["next_steps"]
+        assert "minimal fix" in response2["next_steps"]
+        assert "without requiring further consultation" in response2["next_steps"]
+
+    @pytest.mark.asyncio
+    async def test_certain_confidence_always_trusted(self):
+        """Test that certain confidence is always trusted, even for complex issues."""
+
+        # Set up investigation state
+        self.tool.initial_issue = "Any kind of issue"
+        self.tool.investigation_history = [
+            {
+                "step_number": 1,
+                "step": "Initial investigation",
+                "findings": "Some findings",
+                "files_checked": [],
+                "relevant_files": [],
+                "relevant_methods": [],
+                "hypothesis": None,
+                "confidence": "low",
+            }
+        ]
+        self.tool.consolidated_findings = {
+            "files_checked": set(),
+            "relevant_files": set(),
+            "relevant_methods": set(),
+            "findings": ["Step 1: Some findings"],
+            "hypotheses": [],
+            "images": [],
+        }
+
+        # Final step with certain confidence - should ALWAYS be trusted
+        with patch("utils.conversation_memory.add_turn"):
+            result = await self.tool.execute(
+                {
+                    "step": "Found the issue and fix",
+                    "step_number": 2,
+                    "total_steps": 2,
+                    "next_step_required": False,  # Final step
+                    "findings": "Complex or simple, doesn't matter - Claude says certain",
+                    "files_checked": ["/any/file.py"],
+                    "relevant_files": ["/any/file.py"],
+                    "relevant_methods": ["any_method"],
+                    "hypothesis": "Claude has decided this is certain - trust the judgment",
+                    "confidence": "certain",  # Should always be trusted
+                    "continuation_id": "debug-trust-uuid",
+                }
+            )
+
+        # Verify certain is always trusted
+        response = json.loads(result[0].text)
+
+        # Should proceed with certain confidence
+        assert response["status"] == "certain_confidence_proceed_with_fix"
+        assert response["investigation_complete"] is True
+        assert response["skip_expert_analysis"] is True
+
+        # Expert analysis should be skipped
+        assert response["expert_analysis"]["status"] == "skipped_due_to_certain_confidence"
+
+        # Next steps should guide Claude to implement fix directly
+        assert "CERTAIN confidence" in response["next_steps"]
+
+    @pytest.mark.asyncio
+    async def test_regular_high_confidence_still_uses_expert_analysis(self):
+        """Test that regular 'high' confidence still triggers expert analysis."""
+
+        # Set up 
investigation state + self.tool.initial_issue = "Session validation issue" + self.tool.investigation_history = [ + { + "step_number": 1, + "step": "Initial investigation", + "findings": "Found session issue", + "files_checked": [], + "relevant_files": [], + "relevant_methods": [], + "hypothesis": None, + "confidence": "low", + } + ] + self.tool.consolidated_findings = { + "files_checked": set(), + "relevant_files": {"/api/sessions.py"}, + "relevant_methods": {"SessionManager.validate"}, + "findings": ["Step 1: Found session issue"], + "hypotheses": [], + "images": [], + } + + # Mock expert analysis + mock_expert_response = { + "status": "analysis_complete", + "summary": "Expert analysis of session validation", + "hypotheses": [ + { + "name": "SESSION_VALIDATION_BUG", + "confidence": "High", + "root_cause": "Session timeout not properly handled", + } + ], + } + + # Final step with regular 'high' confidence (should trigger expert analysis) + with patch("utils.conversation_memory.add_turn"): + with patch.object(self.tool, "_call_expert_analysis", return_value=mock_expert_response): + with patch.object(self.tool, "_prepare_file_content_for_prompt", return_value=("file content", 100)): + result = await self.tool.execute( + { + "step": "Identified likely root cause", + "step_number": 2, + "total_steps": 2, + "next_step_required": False, # Final step + "findings": "Session validation fails when timeout occurs during user activity", + "files_checked": ["/api/sessions.py"], + "relevant_files": ["/api/sessions.py"], + "relevant_methods": ["SessionManager.validate", "SessionManager.cleanup"], + "hypothesis": "Session timeout handling bug causes validation failures", + "confidence": "high", # Regular high confidence, NOT certain + "continuation_id": "debug-regular-uuid", + } + ) + + # Verify expert analysis was called (not skipped) + response = json.loads(result[0].text) + + # Should call expert analysis normally + assert response["status"] == "calling_expert_analysis" + assert response["investigation_complete"] is True + assert "skip_expert_analysis" not in response # Should not be present + + # Expert analysis should be present with real results + assert response["expert_analysis"]["status"] == "analysis_complete" + assert response["expert_analysis"]["summary"] == "Expert analysis of session validation" + + # Next steps should indicate normal investigation completion (not certain confidence) + assert "INVESTIGATION IS COMPLETE" in response["next_steps"] + assert "certain" not in response["next_steps"].lower() + + def test_certain_confidence_schema_requirements(self): + """Test that certain confidence is properly described in schema for Claude's guidance.""" + + # The schema description should guide Claude on proper certain usage + schema = self.tool.get_input_schema() + confidence_description = schema["properties"]["confidence"]["description"] + + # Should emphasize it's only when root cause and fix are confirmed + assert "root cause" in confidence_description.lower() + assert "minimal fix" in confidence_description.lower() + assert "confirmed" in confidence_description.lower() + + # Should emphasize trust in Claude's judgment + assert "absolutely" in confidence_description.lower() or "certain" in confidence_description.lower() + + # Should mention no thought-partner assistance needed + assert "thought-partner" in confidence_description.lower() or "assistance" in confidence_description.lower() + + @pytest.mark.asyncio + async def test_confidence_enum_validation(self): + """Test that certain is 
properly included in confidence enum validation.""" + + # Valid confidence values should not raise errors + valid_confidences = ["low", "medium", "high", "certain"] + + for confidence in valid_confidences: + # This should not raise validation errors + with patch("utils.conversation_memory.create_thread", return_value="test-uuid"): + with patch("utils.conversation_memory.add_turn"): + result = await self.tool.execute( + { + "step": f"Test step with {confidence} confidence", + "step_number": 1, + "total_steps": 1, + "next_step_required": False, + "findings": "Test findings", + "confidence": confidence, + } + ) + + # Should get valid response + response = json.loads(result[0].text) + assert "error" not in response or response.get("status") != "investigation_failed" + + def test_tool_schema_includes_certain(self): + """Test that the tool schema properly includes certain in confidence enum.""" + schema = self.tool.get_input_schema() + + confidence_property = schema["properties"]["confidence"] + assert confidence_property["type"] == "string" + assert "certain" in confidence_property["enum"] + assert confidence_property["enum"] == ["exploring", "low", "medium", "high", "certain"] + + # Check that description explains certain usage + description = confidence_property["description"] + assert "certain" in description.lower() + assert "root cause" in description.lower() + assert "minimal fix" in description.lower() + assert "thought-partner" in description.lower() + + @pytest.mark.asyncio + async def test_certain_confidence_preserves_investigation_data(self): + """Test that certain confidence path preserves all investigation data properly.""" + + # Multi-step investigation leading to certain + with patch("utils.conversation_memory.create_thread", return_value="preserve-data-uuid"): + with patch("utils.conversation_memory.add_turn"): + # Step 1 + await self.tool.execute( + { + "step": "Initial investigation of login failure", + "step_number": 1, + "total_steps": 3, + "next_step_required": True, + "findings": "Users can't log in after password reset", + "files_checked": ["/auth/password.py"], + "relevant_files": ["/auth/password.py"], + "confidence": "low", + } + ) + + # Step 2 + await self.tool.execute( + { + "step": "Examining password validation logic", + "step_number": 2, + "total_steps": 3, + "next_step_required": True, + "findings": "Password hash function not imported correctly", + "files_checked": ["/auth/password.py", "/utils/crypto.py"], + "relevant_files": ["/auth/password.py"], + "relevant_methods": ["PasswordManager.validate_password"], + "hypothesis": "Import statement issue", + "confidence": "medium", + "continuation_id": "preserve-data-uuid", + } + ) + + # Step 3: Final with certain + result = await self.tool.execute( + { + "step": "Found exact issue and fix", + "step_number": 3, + "total_steps": 3, + "next_step_required": False, + "findings": "Missing 'from utils.crypto import hash_password' at line 5", + "files_checked": ["/auth/password.py", "/utils/crypto.py"], + "relevant_files": ["/auth/password.py"], + "relevant_methods": ["PasswordManager.validate_password", "hash_password"], + "hypothesis": "Missing import statement for hash_password function", + "confidence": "certain", + "continuation_id": "preserve-data-uuid", + } + ) + + # Verify all investigation data is preserved + response = json.loads(result[0].text) + + assert response["status"] == "certain_confidence_proceed_with_fix" + + investigation = response["complete_investigation"] + assert investigation["steps_taken"] == 3 + 
assert len(investigation["files_examined"]) == 2 # Both files from all steps + assert "/auth/password.py" in investigation["files_examined"] + assert "/utils/crypto.py" in investigation["files_examined"] + assert len(investigation["relevant_files"]) == 1 + assert len(investigation["relevant_methods"]) == 2 + assert investigation["confidence_level"] == "certain" + + # Should have complete investigation summary + assert "SYSTEMATIC INVESTIGATION SUMMARY" in investigation["investigation_summary"] + assert ( + "Steps taken: 3" in investigation["investigation_summary"] + or "Total steps: 3" in investigation["investigation_summary"] + ) diff --git a/tests/test_image_support_integration.py b/tests/test_image_support_integration.py index 1e38e01..a3d12c1 100644 --- a/tests/test_image_support_integration.py +++ b/tests/test_image_support_integration.py @@ -163,7 +163,7 @@ class TestImageSupportIntegration: images_field = schema["properties"]["images"] assert images_field["type"] == "array" assert images_field["items"]["type"] == "string" - assert "error screens" in images_field["description"].lower() + assert "screenshots" in images_field["description"].lower() def test_tool_image_validation_limits(self): """Test that tools validate image size limits using real provider resolution.""" diff --git a/tools/debug.py b/tools/debug.py index 88200e6..4ee196e 100644 --- a/tools/debug.py +++ b/tools/debug.py @@ -21,47 +21,62 @@ logger = logging.getLogger(__name__) # Field descriptions for the investigation steps DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS = { "step": ( - "Describe what you're currently investigating by beginning to think deeply about the issue, its root cause" - "and possible reasons. Prepare and learn about the related code first. In step 1, clearly state the issue to investigate and begin " - "thinking deeply about not just the described issue, but possible underlying causes, side-effects, or external " - "components that might contribute to it. Follow the code flow carefully—bugs may originate " - "in one part of the code-dependencies, or upstream logic may not be immediately visible. Bugs and issues can " - "arise due to poor logic, incorrect assumptions, bad input or failures elsewhere." - "In all subsequent steps, continue uncovering relevant code, examining patterns, and formulating hypotheses " - "with deliberate attention to detail." + "Describe what you're currently investigating by thinking deeply about the issue and its possible causes. " + "In step 1, clearly state the issue and begin forming an investigative direction. Consider not only obvious " + "failures, but also subtle contributing factors like upstream logic, invalid inputs, missing preconditions, " + "or hidden side effects. Map out the flow of related functions or modules. Identify call paths where input " + "values or branching logic could cause instability. In concurrent systems, watch for race conditions, shared " + "state, or timing dependencies. In all later steps, continue exploring with precision: trace deeper " + "dependencies, verify hypotheses, and adapt your understanding as you uncover more evidence." + ), + "step_number": ( + "The index of the current step in the investigation sequence, beginning at 1. Each step should build upon or " + "revise the previous one." + ), + "total_steps": ( + "Your current estimate for how many steps will be needed to complete the investigation. Adjust as new findings emerge." 
+    ),
+    "next_step_required": (
+        "Set to true if you plan to continue the investigation with another step. False means you believe the root "
+        "cause is known or the investigation is complete."
     ),
-    "step_number": "Current step number in the investigation sequence (starts at 1).",
-    "total_steps": "Estimate of total investigation steps expected (adjustable as the process evolves).",
-    "next_step_required": "Whether another investigation step is needed after this one.",
     "findings": (
-        "Summarize discoveries in this step. Think critically and include relevant code behavior, suspicious patterns, "
-        "evidence collected, and any partial conclusions or leads."
+        "Summarize everything discovered in this step. Include new clues, unexpected behavior, evidence from code or "
+        "logs, or disproven theories. Be specific and avoid vague language—document what you now know and how it "
+        "affects your hypothesis. In later steps, confirm or disprove earlier findings and explain why."
     ),
     "files_checked": (
        "List all files (as absolute paths, do not clip or shrink file names) examined during the investigation so far. "
        "Include even files ruled out, as this tracks your exploration path."
     ),
     "relevant_files": (
-        "Subset of files_checked (as full absolute paths) that contain code directly relevant to the issue. Only list those that are directly tied to the root cause or its effects."
+        "Subset of files_checked (as full absolute paths) that contain code directly relevant to the issue. Only list "
+        "those that are directly tied to the root cause or its effects. This could include the cause, trigger, or "
+        "place of manifestation."
     ),
     "relevant_methods": (
-        "List specific methods/functions clearly tied to the issue. Use 'ClassName.methodName' or 'functionName' format."
+        "List methods or functions that are central to the issue, in the format 'ClassName.methodName' or 'functionName'. "
+        "Prioritize those that influence or process inputs, drive branching, or pass state between modules."
     ),
     "hypothesis": (
-        "Formulate your current best guess about the underlying cause. This is a working theory and may evolve based on further evidence."
+        "A concrete theory for what's causing the issue based on the evidence so far. This can include suspected "
+        "failures, incorrect assumptions, or violated constraints. You are encouraged to revise or abandon it in later "
+        "steps as needed."
     ),
     "confidence": (
-        "How confident you are in the current hypothesis: "
-        "'low' (initial theory), 'medium' (good evidence), 'high' (strong to very strong evidence), "
-        "'nailedit' (ONLY use for final step and ONLY when you have found the EXACT root cause with 100% certainty AND "
-        "identified a simple, minimal fix that requires no expert consultation. Use this ONLY "
-        "for obvious bugs and logic errors that you ABSOLUTELY are certain about and have no doubts because you have"
-        "successfully mapped out the code flow and the root cause behind the issue."
+        "Indicate your current confidence in the hypothesis. Use: 'exploring' (starting out), 'low' (early idea), "
+        "'medium' (some supporting evidence), 'high' (strong evidence), 'certain' (only when the root cause and minimal "
+        "fix are both confirmed). Do NOT use 'certain' unless the issue can be fully resolved with a fix; when in "
+        "doubt, use 'high' instead. Using 'certain' prevents you from receiving assistance from another thought-partner."
+    ),
+    "backtrack_from_step": (
+        "If an earlier finding or hypothesis needs to be revised or discarded, specify the step number from which to "
+        "start over. Use this to acknowledge investigative dead ends and correct the course."
     ),
-    "backtrack_from_step": "If a previous step needs revision, specify the step number to backtrack from.",
     "continuation_id": "Continuation token used for linking multi-step investigations and continuing conversations after discovery.",
     "images": (
-        "Optional. Include full absolute paths to visual debugging images (UI issues, logs, error screens) that help clarify the issue."
+        "Optional list of absolute paths to screenshots or UI visuals that clarify the issue. "
+        "Only include if they materially assist understanding or hypothesis formulation."
     ),
 }
@@ -204,7 +219,7 @@ class DebugIssueTool(BaseTool):
             },
             "confidence": {
                 "type": "string",
-                "enum": ["low", "medium", "high", "nailedit"],
+                "enum": ["exploring", "low", "medium", "high", "certain"],
                 "description": DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS["confidence"],
             },
             "backtrack_from_step": {
@@ -356,9 +371,9 @@ class DebugIssueTool(BaseTool):
             response_data["investigation_complete"] = True
 
             # Check if Claude has absolute certainty and can proceed with minimal fix
-            if request.confidence == "nailedit":
-                # Trust Claude's judgment completely - if it says nailedit, skip expert analysis
-                response_data["status"] = "nailedit_confidence_proceed_with_fix"
+            if request.confidence == "certain":
+                # Trust Claude's judgment completely - if it says certain, skip expert analysis
+                response_data["status"] = "certain_confidence_proceed_with_fix"
 
                 investigation_summary = self._prepare_investigation_summary()
                 response_data["complete_investigation"] = {
@@ -369,20 +384,21 @@ class DebugIssueTool(BaseTool):
                     "relevant_methods": list(self.consolidated_findings["relevant_methods"]),
                     "investigation_summary": investigation_summary,
                     "final_hypothesis": request.hypothesis,
-                    "confidence_level": "nailedit",
+                    "confidence_level": "certain",
                 }
                 response_data["next_steps"] = (
-                    "Investigation complete with NAILED-IT confidence. You have identified the exact "
-                    "root cause and a minimal fix. Proceed directly with implementing the simple fix "
-                    "without requiring expert consultation. Focus on the precise, minimal change needed."
+                    "Investigation complete with CERTAIN confidence. You have identified the exact "
+                    "root cause and a minimal fix. MANDATORY: Present the user with the root cause analysis "
+                    "and IMMEDIATELY proceed with implementing the simple fix without requiring further "
+                    "consultation. Focus on the precise, minimal change needed."
                 )
                 response_data["skip_expert_analysis"] = True
                 response_data["expert_analysis"] = {
-                    "status": "skipped_due_to_nailedit_confidence",
+                    "status": "skipped_due_to_certain_confidence",
                     "reason": "Claude identified exact root cause with minimal fix requirement",
                 }
             else:
-                # Standard expert analysis for high/medium/low confidence
+                # Standard expert analysis for exploring/low/medium/high confidence
                 response_data["status"] = "calling_expert_analysis"
 
                 # Prepare consolidated investigation summary
@@ -413,9 +429,11 @@ class DebugIssueTool(BaseTool):
                     "investigation_summary": investigation_summary,
                 }
                 response_data["next_steps"] = (
-                    "Investigation complete with expert analysis. Present the findings, hypotheses, "
-                    "and recommended fixes to the user. Focus on the most likely root cause and "
-                    "provide actionable implementation guidance."
+                    "INVESTIGATION IS COMPLETE. 
YOU MUST now summarize and present ALL key findings, confirmed "
+                    "hypotheses, and exact recommended fixes. Clearly identify the most likely root cause and "
+                    "provide concrete, actionable implementation guidance. Highlight affected code paths and explain "
+                    "the reasoning that led to this conclusion, making it easy for a developer to understand exactly "
+                    "where the problem lies."
                 )
             else:
                 response_data["next_steps"] = (
diff --git a/zen_server.py b/zen_server.py
deleted file mode 100755
index 0f31a58..0000000
--- a/zen_server.py
+++ /dev/null
@@ -1,11 +0,0 @@
-"""
-Zen MCP Server - Entry point
-The main implementation is in server.py
-"""
-
-import asyncio
-
-from server import main
-
-if __name__ == "__main__":
-    asyncio.run(main())
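
For reviewers, here is the shape of the new certain-confidence response, reconstructed from the assertions in the tests above. This is an abridged sketch, not verbatim server output: the file paths, hypothesis text, and any fields not asserted in the tests are illustrative and may differ in practice.

```python
# Abridged sketch of the response when the final step carries confidence
# "certain"; reconstructed from the test assertions, values are illustrative.
certain_response = {
    "status": "certain_confidence_proceed_with_fix",
    "investigation_complete": True,
    "skip_expert_analysis": True,
    "expert_analysis": {
        "status": "skipped_due_to_certain_confidence",
        "reason": "Claude identified exact root cause with minimal fix requirement",
    },
    "complete_investigation": {
        "confidence_level": "certain",
        "steps_taken": 2,  # one entry per investigation step
        "relevant_files": ["/auth/user_auth.py"],  # illustrative path from the tests
        "relevant_methods": ["UserAuth.hash_password"],
        "final_hypothesis": "Missing 'import hashlib' statement causes NameError",
    },
    "next_steps": "Investigation complete with CERTAIN confidence. ...",  # truncated
}
```

Any confidence below `certain` (including `high`) still routes through `calling_expert_analysis`, so the token and cost savings apply only when the root cause and the minimal fix are both confirmed.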