Make code lookup mandatory in between investigation steps for better results

2025-06-19 17:42:41 +04:00
parent 79abb9ca7e
commit 7900fdd21c
8 changed files with 108 additions and 33 deletions
--- a/config.py
+++ b/config.py
@@ -14,7 +14,7 @@ import os
 # These values are used in server responses and for tracking releases
 # IMPORTANT: This is the single source of truth for version and author info
 # Semantic versioning: MAJOR.MINOR.PATCH
-__version__ = "5.2.2"
+__version__ = "5.2.3"
 # Last update date in ISO format
 __updated__ = "2025-06-19"
 # Primary maintainer
--- a/simulator_tests/test_debug_validation.py
+++ b/simulator_tests/test_debug_validation.py
@@ -537,10 +537,8 @@ RuntimeError: dictionary changed size during iteration
                self.logger.error("Missing investigation_status in response")
                return False
-            # Check output guidance exists
+            # Output field removed in favor of contextual next_steps
-            if "output" not in response_data:
+            # No longer checking for "output" field as it was redundant
-                self.logger.error("Missing output guidance in response")
-                return False
            # Check next_steps guidance
            if not response_data.get("next_steps"):
--- a/tests/test_collaboration.py
+++ b/tests/test_collaboration.py
@@ -95,10 +95,13 @@ class TestDynamicContextRequests:
        # Parse the response - new debug tool returns structured JSON
        response_data = json.loads(result[0].text)
-        assert response_data["status"] == "investigation_in_progress"
+        # Debug tool now returns "pause_for_investigation" to force actual investigation
+        assert response_data["status"] == "pause_for_investigation"
        assert response_data["step_number"] == 1
        assert response_data["next_step_required"] is True
        assert response_data["investigation_status"]["current_confidence"] == "high"
+        assert response_data["investigation_required"] is True
+        assert "required_actions" in response_data
    @pytest.mark.asyncio
    @patch("tools.base.BaseTool.get_model_provider")
--- a/tests/test_debug.py
+++ b/tests/test_debug.py
@@ -133,13 +133,16 @@ class TestDebugTool:
        parsed_response = json.loads(result[0].text)
-        assert parsed_response["status"] == "investigation_in_progress"
+        # Debug tool now returns "pause_for_investigation" for ongoing steps
+        assert parsed_response["status"] == "pause_for_investigation"
        assert parsed_response["step_number"] == 1
        assert parsed_response["total_steps"] == 5
        assert parsed_response["next_step_required"] is True
        assert parsed_response["continuation_id"] == "debug-uuid-123"
        assert parsed_response["investigation_status"]["files_checked"] == 1
        assert parsed_response["investigation_status"]["relevant_files"] == 1
+        assert parsed_response["investigation_required"] is True
+        assert "required_actions" in parsed_response
    @pytest.mark.asyncio
    async def test_execute_subsequent_investigation_step(self):
@@ -317,6 +320,7 @@ class TestDebugTool:
            result = await tool.execute(arguments)
        # Should return a list with TextContent
+        # Debug tool now returns "pause_for_investigation" for ongoing steps
        assert len(result) == 1
        response_text = result[0].text
@@ -325,7 +329,7 @@ class TestDebugTool:
        parsed_response = json.loads(response_text)
-        assert parsed_response["status"] == "investigation_in_progress"
+        assert parsed_response["status"] == "pause_for_investigation"
        # After backtracking from step 2, history should have step 1 plus the new step
        assert len(tool.investigation_history) == 2  # Step 1 + new step 3
        assert tool.investigation_history[0]["step_number"] == 1
@@ -502,6 +506,7 @@ class TestDebugToolIntegration:
                result = await self.tool.execute(arguments)
        # Verify response structure
+        # Debug tool now returns "pause_for_investigation" for ongoing steps
        assert len(result) == 1
        response_text = result[0].text
@@ -510,7 +515,7 @@ class TestDebugToolIntegration:
        parsed_response = json.loads(response_text)
-        assert parsed_response["status"] == "investigation_in_progress"
+        assert parsed_response["status"] == "pause_for_investigation"
        assert parsed_response["step_number"] == 1
        assert parsed_response["continuation_id"] == "debug-flow-uuid"
--- a/tests/test_debug_certain_confidence.py
+++ b/tests/test_debug_certain_confidence.py
@@ -45,8 +45,10 @@ class TestDebugCertainConfidence:
        # Verify step 1 response
        response1 = json.loads(result1[0].text)
-        assert response1["status"] == "investigation_in_progress"
+        assert response1["status"] == "pause_for_investigation"
        assert response1["step_number"] == 1
+        assert response1["investigation_required"] is True
+        assert "required_actions" in response1
        continuation_id = response1["continuation_id"]
        # Step 2: Final step with certain confidence (simple import fix)
--- a/tests/test_debug_comprehensive_workflow.py
+++ b/tests/test_debug_comprehensive_workflow.py
@@ -43,7 +43,7 @@ class TestDebugComprehensiveWorkflow:
        # Verify step 1 response
        assert len(result1) == 1
        response1 = json.loads(result1[0].text)
-        assert response1["status"] == "investigation_in_progress"
+        assert response1["status"] == "pause_for_investigation"
        assert response1["step_number"] == 1
        assert response1["continuation_id"] == "debug-workflow-uuid"
@@ -56,7 +56,8 @@ class TestDebugComprehensiveWorkflow:
            if args and len(args) >= 3:
                assert args[0] == "debug-workflow-uuid"
                assert args[1] == "assistant"
-                assert json.loads(args[2])["status"] == "investigation_in_progress"
+                # Debug tool now returns "pause_for_investigation" for ongoing steps
+                assert json.loads(args[2])["status"] == "pause_for_investigation"
        # Step 2: Continue investigation with findings
        with patch("utils.conversation_memory.add_turn") as mock_add_turn:
@@ -78,7 +79,8 @@ class TestDebugComprehensiveWorkflow:
        # Verify step 2 response
        response2 = json.loads(result2[0].text)
-        assert response2["status"] == "investigation_in_progress"
+        # Debug tool now returns "pause_for_investigation" for ongoing steps
+        assert response2["status"] == "pause_for_investigation"
        assert response2["step_number"] == 2
        assert response2["investigation_status"]["files_checked"] == 2
        assert response2["investigation_status"]["relevant_methods"] == 2
@@ -268,9 +270,12 @@ class TestDebugComprehensiveWorkflow:
                states.append(json.loads(result[0].text))
        # Verify initial state
-        assert states[0]["status"] == "investigation_in_progress"
+        # Debug tool now returns "pause_for_investigation" for ongoing steps
+        assert states[0]["status"] == "pause_for_investigation"
        assert states[0]["step_number"] == 1
        assert states[0]["next_step_required"] is True
+        assert states[0]["investigation_required"] is True
+        assert "required_actions" in states[0]
        # Final state (triggers expert analysis)
        mock_expert_response = {"status": "analysis_complete", "summary": "Test complete"}
--- a/tests/test_debug_continuation.py
+++ b/tests/test_debug_continuation.py
@@ -39,8 +39,10 @@ class TestDebugContinuation:
        assert len(result) == 1
        response = json.loads(result[0].text)
-        assert response["status"] == "investigation_in_progress"
+        assert response["status"] == "pause_for_investigation"
        assert response["continuation_id"] == "debug-test-uuid-123"
+        assert response["investigation_required"] is True
+        assert "required_actions" in response
    def test_debug_conversation_formatting(self):
        """Test that debug tool's structured output is properly formatted in conversation history."""
--- a/tools/debug.py
+++ b/tools/debug.py
@@ -157,16 +157,18 @@ class DebugIssueTool(BaseTool):
            "DEBUG & ROOT CAUSE ANALYSIS - Systematic self-investigation followed by expert analysis. "
            "This tool guides you through a step-by-step investigation process where you:\n\n"
            "1. Start with step 1: describe the issue to investigate\n"
-            "2. Continue with investigation steps: examine code, trace errors, test hypotheses\n"
+            "2. STOP and investigate using appropriate tools\n"
-            "3. Track findings, relevant files, and methods throughout\n"
+            "3. Report findings in step 2 with concrete evidence from actual code\n"
-            "4. Update hypotheses as understanding evolves\n"
+            "4. Continue investigating between each debug step\n"
-            "5. Backtrack and revise findings when needed\n"
+            "5. Track findings, relevant files, and methods throughout\n"
-            "6. Once investigation is complete, receive expert analysis\n\n"
+            "6. Update hypotheses as understanding evolves\n"
-            "The tool enforces systematic investigation methodology:\n"
+            "7. Once investigation is complete, receive expert analysis\n\n"
-            "- Methodical code examination and evidence collection\n"
+            "IMPORTANT: This tool enforces investigation between steps:\n"
-            "- Hypothesis formation and validation\n"
+            "- After each debug call, you MUST investigate before calling debug again\n"
-            "- File and method tracking for context\n"
+            "- Each step must include NEW evidence from code examination\n"
-            "- Confidence assessment and revision capabilities\n\n"
+            "- No recursive debug calls without actual investigation work\n"
+            "- The tool will specify which step number to use next\n"
+            "- Follow the required_actions list for investigation guidance\n\n"
            "Perfect for: complex bugs, mysterious errors, performance issues, "
            "race conditions, memory leaks, integration problems."
        )
@@ -357,10 +359,6 @@ class DebugIssueTool(BaseTool):
                    "images_collected": len(set(self.consolidated_findings["images"])),
                    "current_confidence": request.confidence,
                },
-                "output": {
-                    "instructions": "Continue systematic investigation. Present findings clearly and proceed to next step if required.",
-                    "format": "systematic_investigation",
-                },
            }
            if continuation_id:
@@ -436,9 +434,71 @@ class DebugIssueTool(BaseTool):
                        "the problem lies."
                    )
            else:
                # CRITICAL: Force Claude to actually investigate before calling debug again
                response_data["status"] = "pause_for_investigation"
                response_data["investigation_required"] = True
                if request.step_number == 1:
                    # Initial investigation tasks
                    response_data["required_actions"] = [
                        "Search for code related to the reported issue or symptoms",
                        "Examine relevant files and understand the current implementation",
                        "Understand the project structure and locate relevant modules",
                        "Identify how the affected functionality is supposed to work",
                    ]
                    response_data["next_steps"] = (
-                    f"Continue investigation with step {request.step_number + 1}. "
+                        f"MANDATORY: DO NOT call the debug tool again immediately. You MUST first investigate "
-                    f"Focus on: examining relevant code, testing hypotheses, gathering evidence."
+                        f"the codebase using appropriate tools. Search for relevant code, examine implementations, "
                        f"understand the logic flow. Only call debug again AFTER you have gathered concrete evidence "
                        f"and examined actual code. When you call debug next time, use step_number: {request.step_number + 1} "
                        f"and report the specific files you've examined and findings you've discovered."
                    )
                elif request.step_number >= 2 and request.confidence in ["exploring", "low"]:
                    # Need deeper investigation
                    response_data["required_actions"] = [
                        "Examine the specific files you've identified as relevant",
                        "Trace method calls and data flow through the system",
                        "Check for edge cases, boundary conditions, and assumptions in the code",
                        "Look for related configuration, dependencies, or external factors",
                    ]
                    response_data["next_steps"] = (
                        f"STOP! Do NOT call debug again yet. Based on your findings, you've identified potential areas "
                        f"but need concrete evidence. MANDATORY ACTIONS before calling debug step {request.step_number + 1}:\n"
                        f"1. Examine ALL files in your relevant_files list\n"
                        f"2. Trace how data flows through {', '.join(request.relevant_methods[:3]) if request.relevant_methods else 'the identified components'}\n"
                        f"3. Look for logic errors, incorrect assumptions, missing validations\n"
                        f"4. Check interactions between components and external dependencies\n"
                        f"Only call debug again with step_number: {request.step_number + 1} AFTER completing these investigations."
                    )
                elif request.confidence in ["medium", "high"]:
                    # Close to root cause - need confirmation
                    response_data["required_actions"] = [
                        "Examine the exact code sections where you believe the issue occurs",
                        "Trace the execution path that leads to the failure",
                        "Verify your hypothesis with concrete code evidence",
                        "Check for any similar patterns elsewhere in the codebase",
                    ]
                    response_data["next_steps"] = (
                        f"WAIT! Your hypothesis needs verification. DO NOT call debug immediately. REQUIRED ACTIONS:\n"
                        f"1. Examine the exact lines where the issue occurs\n"
                        f"2. Trace backwards: how does data get to this point? What transforms it?\n"
                        f"3. Check all assumptions: are inputs validated? Are nulls handled?\n"
                        f"4. Look for the EXACT line where expected != actual behavior\n"
                        f"Document these findings with specific file:line references, then call debug with step_number: {request.step_number + 1}."
                    )
                else:
                    # General investigation needed
                    response_data["required_actions"] = [
                        "Continue examining the code paths identified in your hypothesis",
                        "Gather more evidence using appropriate investigation tools",
                        "Test edge cases and boundary conditions",
                        "Look for patterns that confirm or refute your theory",
                    ]
                    response_data["next_steps"] = (
                        f"PAUSE INVESTIGATION. Before calling debug step {request.step_number + 1}, you MUST examine code. "
                        f"Required: Read files from your files_checked list, search for patterns in your hypothesis, "
                        f"trace execution flow. Your next debug call (step_number: {request.step_number + 1}) must include "
                        f"NEW evidence from actual code examination, not just theories. NO recursive debug calls without investigation work!"
                    )
            # Store in conversation memory