Make code lookup mandatory between investigation steps for better results

2025-06-19 17:42:41 +04:00
parent 79abb9ca7e
commit 7900fdd21c
8 changed files with 108 additions and 33 deletions
--- a/config.py
+++ b/config.py
@@ -14,7 +14,7 @@ import os
 # These values are used in server responses and for tracking releases
 # IMPORTANT: This is the single source of truth for version and author info
 # Semantic versioning: MAJOR.MINOR.PATCH
-__version__ = "5.2.2"
+__version__ = "5.2.3"
 # Last update date in ISO format
 __updated__ = "2025-06-19"
 # Primary maintainer
--- a/simulator_tests/test_debug_validation.py
+++ b/simulator_tests/test_debug_validation.py
@@ -537,10 +537,8 @@ RuntimeError: dictionary changed size during iteration
                self.logger.error("Missing investigation_status in response")
                return False

-            # Check output guidance exists
-            if "output" not in response_data:
-                self.logger.error("Missing output guidance in response")
-                return False
+            # The "output" field was removed in favor of contextual next_steps,
+            # so its presence is no longer checked here (it was redundant).

            # Check next_steps guidance
            if not response_data.get("next_steps"):
--- a/tests/test_collaboration.py
+++ b/tests/test_collaboration.py
@@ -95,10 +95,13 @@ class TestDynamicContextRequests:

        # Parse the response - new debug tool returns structured JSON
        response_data = json.loads(result[0].text)
-        assert response_data["status"] == "investigation_in_progress"
+        # Debug tool now returns "pause_for_investigation" to force actual investigation
+        assert response_data["status"] == "pause_for_investigation"
        assert response_data["step_number"] == 1
        assert response_data["next_step_required"] is True
        assert response_data["investigation_status"]["current_confidence"] == "high"
+        assert response_data["investigation_required"] is True
+        assert "required_actions" in response_data

    @pytest.mark.asyncio
    @patch("tools.base.BaseTool.get_model_provider")
--- a/tests/test_debug.py
+++ b/tests/test_debug.py
@@ -133,13 +133,16 @@ class TestDebugTool:

        parsed_response = json.loads(result[0].text)

-        assert parsed_response["status"] == "investigation_in_progress"
+        # Debug tool now returns "pause_for_investigation" for ongoing steps
+        assert parsed_response["status"] == "pause_for_investigation"
        assert parsed_response["step_number"] == 1
        assert parsed_response["total_steps"] == 5
        assert parsed_response["next_step_required"] is True
        assert parsed_response["continuation_id"] == "debug-uuid-123"
        assert parsed_response["investigation_status"]["files_checked"] == 1
        assert parsed_response["investigation_status"]["relevant_files"] == 1
+        assert parsed_response["investigation_required"] is True
+        assert "required_actions" in parsed_response

    @pytest.mark.asyncio
    async def test_execute_subsequent_investigation_step(self):
@@ -317,6 +320,7 @@ class TestDebugTool:
            result = await tool.execute(arguments)

        # Should return a list with TextContent
+        # (Ongoing steps now report status "pause_for_investigation"; asserted below.)
        assert len(result) == 1
        response_text = result[0].text

@@ -325,7 +329,7 @@ class TestDebugTool:

        parsed_response = json.loads(response_text)

-        assert parsed_response["status"] == "investigation_in_progress"
+        assert parsed_response["status"] == "pause_for_investigation"
        # After backtracking from step 2, history should have step 1 plus the new step
        assert len(tool.investigation_history) == 2  # Step 1 + new step 3
        assert tool.investigation_history[0]["step_number"] == 1
@@ -502,6 +506,7 @@ class TestDebugToolIntegration:
                result = await self.tool.execute(arguments)

        # Verify response structure
+        # (Ongoing steps now report status "pause_for_investigation"; asserted below.)
        assert len(result) == 1
        response_text = result[0].text

@@ -510,7 +515,7 @@ class TestDebugToolIntegration:

        parsed_response = json.loads(response_text)

-        assert parsed_response["status"] == "investigation_in_progress"
+        assert parsed_response["status"] == "pause_for_investigation"
        assert parsed_response["step_number"] == 1
        assert parsed_response["continuation_id"] == "debug-flow-uuid"

--- a/tests/test_debug_certain_confidence.py
+++ b/tests/test_debug_certain_confidence.py
@@ -45,8 +45,10 @@ class TestDebugCertainConfidence:

        # Verify step 1 response
        response1 = json.loads(result1[0].text)
-        assert response1["status"] == "investigation_in_progress"
+        assert response1["status"] == "pause_for_investigation"
        assert response1["step_number"] == 1
+        assert response1["investigation_required"] is True
+        assert "required_actions" in response1
        continuation_id = response1["continuation_id"]

        # Step 2: Final step with certain confidence (simple import fix)
--- a/tests/test_debug_comprehensive_workflow.py
+++ b/tests/test_debug_comprehensive_workflow.py
@@ -43,7 +43,7 @@ class TestDebugComprehensiveWorkflow:
        # Verify step 1 response
        assert len(result1) == 1
        response1 = json.loads(result1[0].text)
-        assert response1["status"] == "investigation_in_progress"
+        assert response1["status"] == "pause_for_investigation"
        assert response1["step_number"] == 1
        assert response1["continuation_id"] == "debug-workflow-uuid"

@@ -56,7 +56,8 @@ class TestDebugComprehensiveWorkflow:
            if args and len(args) >= 3:
                assert args[0] == "debug-workflow-uuid"
                assert args[1] == "assistant"
-                assert json.loads(args[2])["status"] == "investigation_in_progress"
+                # Debug tool now returns "pause_for_investigation" for ongoing steps
+                assert json.loads(args[2])["status"] == "pause_for_investigation"

        # Step 2: Continue investigation with findings
        with patch("utils.conversation_memory.add_turn") as mock_add_turn:
@@ -78,7 +79,8 @@ class TestDebugComprehensiveWorkflow:

        # Verify step 2 response
        response2 = json.loads(result2[0].text)
-        assert response2["status"] == "investigation_in_progress"
+        # Debug tool now returns "pause_for_investigation" for ongoing steps
+        assert response2["status"] == "pause_for_investigation"
        assert response2["step_number"] == 2
        assert response2["investigation_status"]["files_checked"] == 2
        assert response2["investigation_status"]["relevant_methods"] == 2
@@ -268,9 +270,12 @@ class TestDebugComprehensiveWorkflow:
                states.append(json.loads(result[0].text))

        # Verify initial state
-        assert states[0]["status"] == "investigation_in_progress"
+        # Debug tool now returns "pause_for_investigation" for ongoing steps
+        assert states[0]["status"] == "pause_for_investigation"
        assert states[0]["step_number"] == 1
        assert states[0]["next_step_required"] is True
+        assert states[0]["investigation_required"] is True
+        assert "required_actions" in states[0]

        # Final state (triggers expert analysis)
        mock_expert_response = {"status": "analysis_complete", "summary": "Test complete"}
--- a/tests/test_debug_continuation.py
+++ b/tests/test_debug_continuation.py
@@ -39,8 +39,10 @@ class TestDebugContinuation:

        assert len(result) == 1
        response = json.loads(result[0].text)
-        assert response["status"] == "investigation_in_progress"
+        assert response["status"] == "pause_for_investigation"
        assert response["continuation_id"] == "debug-test-uuid-123"
+        assert response["investigation_required"] is True
+        assert "required_actions" in response

    def test_debug_conversation_formatting(self):
        """Test that debug tool's structured output is properly formatted in conversation history."""
--- a/tools/debug.py
+++ b/tools/debug.py
@@ -157,16 +157,18 @@ class DebugIssueTool(BaseTool):
            "DEBUG & ROOT CAUSE ANALYSIS - Systematic self-investigation followed by expert analysis. "
            "This tool guides you through a step-by-step investigation process where you:\n\n"
            "1. Start with step 1: describe the issue to investigate\n"
-            "2. Continue with investigation steps: examine code, trace errors, test hypotheses\n"
-            "3. Track findings, relevant files, and methods throughout\n"
-            "4. Update hypotheses as understanding evolves\n"
-            "5. Backtrack and revise findings when needed\n"
-            "6. Once investigation is complete, receive expert analysis\n\n"
-            "The tool enforces systematic investigation methodology:\n"
-            "- Methodical code examination and evidence collection\n"
-            "- Hypothesis formation and validation\n"
-            "- File and method tracking for context\n"
-            "- Confidence assessment and revision capabilities\n\n"
+            "2. STOP and investigate using appropriate tools\n"
+            "3. Report findings in step 2 with concrete evidence from actual code\n"
+            "4. Continue investigating between each debug step\n"
+            "5. Track findings, relevant files, and methods throughout\n"
+            "6. Update hypotheses as understanding evolves\n"
+            "7. Once investigation is complete, receive expert analysis\n\n"
+            "IMPORTANT: This tool enforces investigation between steps:\n"
+            "- After each debug call, you MUST investigate before calling debug again\n"
+            "- Each step must include NEW evidence from code examination\n"
+            "- No recursive debug calls without actual investigation work\n"
+            "- The tool will specify which step number to use next\n"
+            "- Follow the required_actions list for investigation guidance\n\n"
            "Perfect for: complex bugs, mysterious errors, performance issues, "
            "race conditions, memory leaks, integration problems."
        )
@@ -357,10 +359,6 @@ class DebugIssueTool(BaseTool):
                    "images_collected": len(set(self.consolidated_findings["images"])),
                    "current_confidence": request.confidence,
                },
-                "output": {
-                    "instructions": "Continue systematic investigation. Present findings clearly and proceed to next step if required.",
-                    "format": "systematic_investigation",
-                },
            }

            if continuation_id:
@@ -436,10 +434,72 @@ class DebugIssueTool(BaseTool):
                        "the problem lies."
                    )
            else:
-                response_data["next_steps"] = (
-                    f"Continue investigation with step {request.step_number + 1}. "
-                    f"Focus on: examining relevant code, testing hypotheses, gathering evidence."
-                )
+                # CRITICAL: Force Claude to actually investigate before calling debug again
+                response_data["status"] = "pause_for_investigation"
+                response_data["investigation_required"] = True
+
+                if request.step_number == 1:
+                    # Initial investigation tasks
+                    response_data["required_actions"] = [
+                        "Search for code related to the reported issue or symptoms",
+                        "Examine relevant files and understand the current implementation",
+                        "Understand the project structure and locate relevant modules",
+                        "Identify how the affected functionality is supposed to work",
+                    ]
+                    response_data["next_steps"] = (
+                        f"MANDATORY: DO NOT call the debug tool again immediately. You MUST first investigate "
+                        f"the codebase using appropriate tools. Search for relevant code, examine implementations, "
+                        f"understand the logic flow. Only call debug again AFTER you have gathered concrete evidence "
+                        f"and examined actual code. When you call debug next time, use step_number: {request.step_number + 1} "
+                        f"and report the specific files you've examined and findings you've discovered."
+                    )
+                elif request.step_number >= 2 and request.confidence in ["exploring", "low"]:
+                    # Need deeper investigation
+                    response_data["required_actions"] = [
+                        "Examine the specific files you've identified as relevant",
+                        "Trace method calls and data flow through the system",
+                        "Check for edge cases, boundary conditions, and assumptions in the code",
+                        "Look for related configuration, dependencies, or external factors",
+                    ]
+                    response_data["next_steps"] = (
+                        f"STOP! Do NOT call debug again yet. Based on your findings, you've identified potential areas "
+                        f"but need concrete evidence. MANDATORY ACTIONS before calling debug step {request.step_number + 1}:\n"
+                        f"1. Examine ALL files in your relevant_files list\n"
+                        f"2. Trace how data flows through {', '.join(request.relevant_methods[:3]) if request.relevant_methods else 'the identified components'}\n"
+                        f"3. Look for logic errors, incorrect assumptions, missing validations\n"
+                        f"4. Check interactions between components and external dependencies\n"
+                        f"Only call debug again with step_number: {request.step_number + 1} AFTER completing these investigations."
+                    )
+                elif request.confidence in ["medium", "high"]:
+                    # Close to root cause - need confirmation
+                    response_data["required_actions"] = [
+                        "Examine the exact code sections where you believe the issue occurs",
+                        "Trace the execution path that leads to the failure",
+                        "Verify your hypothesis with concrete code evidence",
+                        "Check for any similar patterns elsewhere in the codebase",
+                    ]
+                    response_data["next_steps"] = (
+                        f"WAIT! Your hypothesis needs verification. DO NOT call debug immediately. REQUIRED ACTIONS:\n"
+                        f"1. Examine the exact lines where the issue occurs\n"
+                        f"2. Trace backwards: how does data get to this point? What transforms it?\n"
+                        f"3. Check all assumptions: are inputs validated? Are nulls handled?\n"
+                        f"4. Look for the EXACT line where expected != actual behavior\n"
+                        f"Document these findings with specific file:line references, then call debug with step_number: {request.step_number + 1}."
+                    )
+                else:
+                    # General investigation needed
+                    response_data["required_actions"] = [
+                        "Continue examining the code paths identified in your hypothesis",
+                        "Gather more evidence using appropriate investigation tools",
+                        "Test edge cases and boundary conditions",
+                        "Look for patterns that confirm or refute your theory",
+                    ]
+                    response_data["next_steps"] = (
+                        f"PAUSE INVESTIGATION. Before calling debug step {request.step_number + 1}, you MUST examine code. "
+                        f"Required: Read files from your files_checked list, search for patterns in your hypothesis, "
+                        f"trace execution flow. Your next debug call (step_number: {request.step_number + 1}) must include "
+                        f"NEW evidence from actual code examination, not just theories. NO recursive debug calls without investigation work!"
+                    )

            # Store in conversation memory
            if continuation_id: