🚀 Major Enhancement: Workflow-Based Tool Architecture v5.5.0 (#95)

* WIP: new workflow architecture

* WIP: further improvements and cleanup

* WIP: cleanup and docs, replace old tool with new

* WIP: new planner implementation using workflow

* WIP: precommit tool working as a workflow instead of a basic tool
Supports passing use_assistant_model=False to skip external models entirely and use Claude only (see the sketch below)
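A minimal sketch of how a caller might opt out of the external model. The field names mirror the workflow request fields used in the tests further down; the Claude-only fallback shown here is an assumption of this sketch, not the repository's exact implementation:

```python
# Hypothetical request to a workflow tool (field names match the tests below;
# the Claude-only behavior when use_assistant_model=False is an assumption).
request = {
    "step": "Validate staged changes before commit",
    "step_number": 1,
    "total_steps": 1,
    "next_step_required": False,   # final step
    "findings": "All checks reviewed locally",
    "use_assistant_model": False,  # skip external models; use Claude only
}
```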

* WIP: precommit workflow version swapped in for the old one

* WIP: codereview

* WIP: replaced codereview

* WIP: replaced refactor

* WIP: workflow for thinkdeep

* WIP: ensure files get embedded correctly

* WIP: thinkdeep replaced with workflow version

* WIP: improved messaging when an external model's response is received

* WIP: analyze tool swapped

* WIP: updated tests
* Extract only the content when building conversation history
* Use "relevant_files" for workflow tools only (see the sketch below)

* WIP: fixed get_completion_next_steps_message missing param

* Fixed tests
Request files consistently

* Fixed tests

* New testgen workflow tool
Updated docs

* Swap testgen workflow

* Fix CI test failures by excluding API-dependent tests

- Update GitHub Actions workflow to exclude simulation tests that require API keys
- Fix collaboration tests to properly mock workflow tool expert analysis calls (see the sketch below)
- Update test assertions to handle new workflow tool response format
- Ensure unit tests run without external API dependencies in CI

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
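
A minimal sketch of the mocking approach, assuming a hypothetical `WorkflowTool` class; the real patch targets in the repository differ, so this only illustrates how the expert-analysis call can be stubbed out so unit tests need no API keys in CI:

```python
from unittest.mock import patch

class WorkflowTool:
    """Stand-in for a workflow tool; class and method names are assumptions."""

    def _call_expert_analysis(self, findings: str) -> dict:
        raise RuntimeError("would call an external model API")

    def run_final_step(self, findings: str) -> dict:
        # Final steps (next_step_required=False) normally consult the expert model.
        expert = self._call_expert_analysis(findings)
        return {"status": "calling_expert_analysis", "expert_analysis": expert}

def test_final_step_without_api_keys():
    tool = WorkflowTool()
    with patch.object(WorkflowTool, "_call_expert_analysis",
                      return_value={"status": "analysis_complete"}):
        result = tool.run_final_step("root cause identified")
    assert result["status"] == "calling_expert_analysis"
    assert result["expert_analysis"]["status"] == "analysis_complete"
```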

* WIP - Update tests to match new tools

---------

Co-authored-by: Claude <noreply@anthropic.com>
Author: Beehive Innovations (committed by GitHub)
Date: 2025-06-21 00:08:11 +04:00
Commit: 69a3121452 (parent 4dae6e457e)
76 changed files with 17111 additions and 7725 deletions


@@ -1,13 +1,10 @@
#!/usr/bin/env python3
"""
Debug Tool Self-Investigation Validation Test
DebugWorkflow Tool Validation Test
Tests the debug tool's systematic self-investigation capabilities including:
- Step-by-step investigation with proper JSON responses
- Progressive tracking of findings, files, and methods
- Hypothesis formation and confidence tracking
- Backtracking and revision capabilities
- Final expert analysis after investigation completion
Tests the debug tool's capabilities using the new workflow architecture.
This validates that the new workflow-based implementation maintains
all the functionality of the original debug tool.
"""
import json
@@ -17,7 +14,7 @@ from .conversation_base_test import ConversationBaseTest
class DebugValidationTest(ConversationBaseTest):
"""Test debug tool's self-investigation and expert analysis features"""
"""Test debug tool with new workflow architecture"""
@property
def test_name(self) -> str:
@@ -25,15 +22,15 @@ class DebugValidationTest(ConversationBaseTest):
@property
def test_description(self) -> str:
return "Debug tool self-investigation pattern validation"
return "Debug tool validation with new workflow architecture"
def run_test(self) -> bool:
"""Test debug tool self-investigation capabilities"""
"""Test debug tool capabilities"""
# Set up the test environment
self.setUp()
try:
self.logger.info("Test: Debug tool self-investigation validation")
self.logger.info("Test: DebugWorkflow tool validation (new architecture)")
# Create a Python file with a subtle but realistic bug
self._create_buggy_code()
@@ -50,11 +47,23 @@ class DebugValidationTest(ConversationBaseTest):
if not self._test_complete_investigation_with_analysis():
return False
# Test 4: Certain confidence behavior
if not self._test_certain_confidence():
return False
# Test 5: Context-aware file embedding
if not self._test_context_aware_file_embedding():
return False
# Test 6: Multi-step file context optimization
if not self._test_multi_step_file_context():
return False
self.logger.info(" ✅ All debug validation tests passed")
return True
except Exception as e:
self.logger.error(f"Debug validation test failed: {e}")
self.logger.error(f"DebugWorkflow validation test failed: {e}")
return False
def _create_buggy_code(self):
@@ -164,8 +173,8 @@ RuntimeError: dictionary changed size during iteration
if not response1_data:
return False
# Validate step 1 response structure
if not self._validate_step_response(response1_data, 1, 4, True, "investigation_in_progress"):
# Validate step 1 response structure - expect pause_for_investigation for next_step_required=True
if not self._validate_step_response(response1_data, 1, 4, True, "pause_for_investigation"):
return False
self.logger.info(f" ✅ Step 1 successful, continuation_id: {continuation_id}")
@@ -194,7 +203,7 @@ RuntimeError: dictionary changed size during iteration
return False
response2_data = self._parse_debug_response(response2)
if not self._validate_step_response(response2_data, 2, 4, True, "investigation_in_progress"):
if not self._validate_step_response(response2_data, 2, 4, True, "pause_for_investigation"):
return False
# Check investigation status tracking
@@ -213,35 +222,6 @@ RuntimeError: dictionary changed size during iteration
self.logger.info(" ✅ Step 2 successful with proper tracking")
# Step 3: Validate hypothesis
self.logger.info(" 1.1.3: Step 3 - Hypothesis validation")
response3, _ = self.call_mcp_tool(
"debug",
{
"step": "Confirming the bug pattern: the for loop iterates over self.active_sessions.items() while del self.active_sessions[session_id] modifies the dictionary inside the loop.",
"step_number": 3,
"total_steps": 4,
"next_step_required": True,
"findings": "Confirmed: Line 44-47 shows classic dictionary modification during iteration bug. The fix would be to collect expired session IDs first, then delete them after iteration completes.",
"files_checked": [self.buggy_file],
"relevant_files": [self.buggy_file],
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
"hypothesis": "Dictionary modification during iteration in cleanup_expired_sessions causes RuntimeError",
"confidence": "high",
"continuation_id": continuation_id,
},
)
if not response3:
self.logger.error("Failed to continue investigation to step 3")
return False
response3_data = self._parse_debug_response(response3)
if not self._validate_step_response(response3_data, 3, 4, True, "investigation_in_progress"):
return False
self.logger.info(" ✅ Investigation session progressing successfully")
# Store continuation_id for next test
self.investigation_continuation_id = continuation_id
return True
@@ -321,7 +301,7 @@ RuntimeError: dictionary changed size during iteration
return False
response3_data = self._parse_debug_response(response3)
if not self._validate_step_response(response3_data, 3, 4, True, "investigation_in_progress"):
if not self._validate_step_response(response3_data, 3, 4, True, "pause_for_investigation"):
return False
self.logger.info(" ✅ Backtracking working correctly")
@@ -386,7 +366,7 @@ RuntimeError: dictionary changed size during iteration
if not response_final_data:
return False
# Validate final response structure
# Validate final response structure - expect calling_expert_analysis for next_step_required=False
if response_final_data.get("status") != "calling_expert_analysis":
self.logger.error(
f"Expected status 'calling_expert_analysis', got '{response_final_data.get('status')}'"
@@ -433,38 +413,67 @@ RuntimeError: dictionary changed size during iteration
return False
self.logger.info(" ✅ Complete investigation with expert analysis successful")
# Validate logs
self.logger.info(" 📋 Validating execution logs...")
# Get server logs
logs = self.get_recent_server_logs(500)
# Look for debug tool execution patterns
debug_patterns = [
"debug tool",
"investigation",
"Expert analysis",
"calling_expert_analysis",
]
patterns_found = 0
for pattern in debug_patterns:
if pattern in logs:
patterns_found += 1
self.logger.debug(f" ✅ Found log pattern: {pattern}")
if patterns_found >= 2:
self.logger.info(f" ✅ Log validation passed ({patterns_found}/{len(debug_patterns)} patterns)")
else:
self.logger.warning(f" ⚠️ Only found {patterns_found}/{len(debug_patterns)} log patterns")
return True
except Exception as e:
self.logger.error(f"Complete investigation test failed: {e}")
return False
def _test_certain_confidence(self) -> bool:
"""Test certain confidence behavior - should skip expert analysis"""
try:
self.logger.info(" 1.4: Testing certain confidence behavior")
# Test certain confidence - should skip expert analysis
self.logger.info(" 1.4.1: Certain confidence investigation")
response_certain, _ = self.call_mcp_tool(
"debug",
{
"step": "I have confirmed the exact root cause with 100% certainty: dictionary modification during iteration.",
"step_number": 1,
"total_steps": 1,
"next_step_required": False, # Final step
"findings": "The bug is on line 44-47: for loop iterates over dict.items() while del modifies the dict inside the loop. Fix is simple: collect expired IDs first, then delete after iteration.",
"files_checked": [self.buggy_file],
"relevant_files": [self.buggy_file],
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
"hypothesis": "Dictionary modification during iteration causes RuntimeError - fix is straightforward",
"confidence": "certain", # This should skip expert analysis
"model": "flash",
},
)
if not response_certain:
self.logger.error("Failed to test certain confidence")
return False
response_certain_data = self._parse_debug_response(response_certain)
if not response_certain_data:
return False
# Validate certain confidence response - should skip expert analysis
if response_certain_data.get("status") != "certain_confidence_proceed_with_fix":
self.logger.error(
f"Expected status 'certain_confidence_proceed_with_fix', got '{response_certain_data.get('status')}'"
)
return False
if not response_certain_data.get("skip_expert_analysis"):
self.logger.error("Expected skip_expert_analysis=true for certain confidence")
return False
expert_analysis = response_certain_data.get("expert_analysis", {})
if expert_analysis.get("status") != "skipped_due_to_certain_confidence":
self.logger.error("Expert analysis should be skipped for certain confidence")
return False
self.logger.info(" ✅ Certain confidence behavior working correctly")
return True
except Exception as e:
self.logger.error(f"Certain confidence test failed: {e}")
return False
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
"""Call an MCP tool in-process - override for debug-specific response handling"""
# Use in-process implementation to maintain conversation memory
@@ -537,9 +546,6 @@ RuntimeError: dictionary changed size during iteration
self.logger.error("Missing investigation_status in response")
return False
# Output field removed in favor of contextual next_steps
# No longer checking for "output" field as it was redundant
# Check next_steps guidance
if not response_data.get("next_steps"):
self.logger.error("Missing next_steps guidance in response")
@@ -550,3 +556,406 @@ RuntimeError: dictionary changed size during iteration
except Exception as e:
self.logger.error(f"Error validating step response: {e}")
return False
def _test_context_aware_file_embedding(self) -> bool:
"""Test context-aware file embedding optimization"""
try:
self.logger.info(" 1.5: Testing context-aware file embedding")
# Create multiple test files for context testing
file1_content = """#!/usr/bin/env python3
def process_data(data):
\"\"\"Process incoming data\"\"\"
result = []
for item in data:
if item.get('valid'):
result.append(item['value'])
return result
"""
file2_content = """#!/usr/bin/env python3
def validate_input(data):
\"\"\"Validate input data\"\"\"
if not isinstance(data, list):
raise ValueError("Data must be a list")
for item in data:
if not isinstance(item, dict):
raise ValueError("Items must be dictionaries")
if 'value' not in item:
raise ValueError("Items must have 'value' key")
return True
"""
# Create test files
file1 = self.create_additional_test_file("data_processor.py", file1_content)
file2 = self.create_additional_test_file("validator.py", file2_content)
# Test 1: New conversation, intermediate step - should only reference files
self.logger.info(" 1.5.1: New conversation intermediate step (should reference only)")
response1, continuation_id = self.call_mcp_tool(
"debug",
{
"step": "Starting investigation of data processing pipeline",
"step_number": 1,
"total_steps": 3,
"next_step_required": True, # Intermediate step
"findings": "Initial analysis of data processing components",
"files_checked": [file1, file2],
"relevant_files": [file1], # This should be referenced, not embedded
"relevant_methods": ["process_data"],
"hypothesis": "Investigating data flow",
"confidence": "low",
"model": "flash",
},
)
if not response1 or not continuation_id:
self.logger.error("Failed to start context-aware file embedding test")
return False
response1_data = self._parse_debug_response(response1)
if not response1_data:
return False
# Check file context - should be reference_only for intermediate step
file_context = response1_data.get("file_context", {})
if file_context.get("type") != "reference_only":
self.logger.error(f"Expected reference_only file context, got: {file_context.get('type')}")
return False
if "Files referenced but not embedded" not in file_context.get("context_optimization", ""):
self.logger.error("Expected context optimization message for reference_only")
return False
self.logger.info(" ✅ Intermediate step correctly uses reference_only file context")
# Test 2: Intermediate step with continuation - should still only reference
self.logger.info(" 1.5.2: Intermediate step with continuation (should reference only)")
response2, _ = self.call_mcp_tool(
"debug",
{
"step": "Continuing investigation with more detailed analysis",
"step_number": 2,
"total_steps": 3,
"next_step_required": True, # Still intermediate
"continuation_id": continuation_id,
"findings": "Found potential issues in validation logic",
"files_checked": [file1, file2],
"relevant_files": [file1, file2], # Both files referenced
"relevant_methods": ["process_data", "validate_input"],
"hypothesis": "Validation might be too strict",
"confidence": "medium",
"model": "flash",
},
)
if not response2:
self.logger.error("Failed to continue to step 2")
return False
response2_data = self._parse_debug_response(response2)
if not response2_data:
return False
# Check file context - should still be reference_only
file_context2 = response2_data.get("file_context", {})
if file_context2.get("type") != "reference_only":
self.logger.error(f"Expected reference_only file context for step 2, got: {file_context2.get('type')}")
return False
# Should include reference note
if not file_context2.get("note"):
self.logger.error("Expected file reference note for intermediate step")
return False
reference_note = file_context2.get("note", "")
if "data_processor.py" not in reference_note or "validator.py" not in reference_note:
self.logger.error("File reference note should mention both files")
return False
self.logger.info(" ✅ Intermediate step with continuation correctly uses reference_only")
# Test 3: Final step - should embed files for expert analysis
self.logger.info(" 1.5.3: Final step (should embed files)")
response3, _ = self.call_mcp_tool(
"debug",
{
"step": "Investigation complete - identified the root cause",
"step_number": 3,
"total_steps": 3,
"next_step_required": False, # Final step - should embed files
"continuation_id": continuation_id,
"findings": "Root cause: validator is rejecting valid data due to strict type checking",
"files_checked": [file1, file2],
"relevant_files": [file1, file2], # Should be fully embedded
"relevant_methods": ["process_data", "validate_input"],
"hypothesis": "Validation logic is too restrictive for valid edge cases",
"confidence": "high",
"model": "flash",
},
)
if not response3:
self.logger.error("Failed to complete to final step")
return False
response3_data = self._parse_debug_response(response3)
if not response3_data:
return False
# Check file context - should be fully_embedded for final step
file_context3 = response3_data.get("file_context", {})
if file_context3.get("type") != "fully_embedded":
self.logger.error(
f"Expected fully_embedded file context for final step, got: {file_context3.get('type')}"
)
return False
if "Full file content embedded for expert analysis" not in file_context3.get("context_optimization", ""):
self.logger.error("Expected expert analysis optimization message for fully_embedded")
return False
# Should show files embedded count
files_embedded = file_context3.get("files_embedded", 0)
if files_embedded == 0:
# This is OK - files might already be in conversation history
self.logger.info(
" Files embedded count is 0 - files already in conversation history (smart deduplication)"
)
else:
self.logger.info(f" ✅ Files embedded count: {files_embedded}")
self.logger.info(" ✅ Final step correctly uses fully_embedded file context")
# Verify expert analysis was called for final step
if response3_data.get("status") != "calling_expert_analysis":
self.logger.error("Final step should trigger expert analysis")
return False
if "expert_analysis" not in response3_data:
self.logger.error("Expert analysis should be present in final step")
return False
self.logger.info(" ✅ Context-aware file embedding test completed successfully")
return True
except Exception as e:
self.logger.error(f"Context-aware file embedding test failed: {e}")
return False
def _test_multi_step_file_context(self) -> bool:
"""Test multi-step workflow with proper file context transitions"""
try:
self.logger.info(" 1.6: Testing multi-step file context optimization")
# Create a complex scenario with multiple files
config_content = """#!/usr/bin/env python3
import os
DATABASE_URL = os.getenv('DATABASE_URL', 'sqlite:///app.db')
DEBUG_MODE = os.getenv('DEBUG', 'False').lower() == 'true'
MAX_CONNECTIONS = int(os.getenv('MAX_CONNECTIONS', '10'))
# Bug: This will cause issues when MAX_CONNECTIONS is not a valid integer
CACHE_SIZE = MAX_CONNECTIONS * 2 # Problematic if MAX_CONNECTIONS is invalid
"""
server_content = """#!/usr/bin/env python3
from config import DATABASE_URL, DEBUG_MODE, CACHE_SIZE
import sqlite3
class DatabaseServer:
def __init__(self):
self.connection_pool = []
self.cache_size = CACHE_SIZE # This will fail if CACHE_SIZE is invalid
def connect(self):
try:
conn = sqlite3.connect(DATABASE_URL)
self.connection_pool.append(conn)
return conn
except Exception as e:
print(f"Connection failed: {e}")
return None
"""
# Create test files
config_file = self.create_additional_test_file("config.py", config_content)
server_file = self.create_additional_test_file("database_server.py", server_content)
# Step 1: Start investigation (new conversation)
self.logger.info(" 1.6.1: Step 1 - Start investigation")
response1, continuation_id = self.call_mcp_tool(
"debug",
{
"step": "Investigating application startup failures in production environment",
"step_number": 1,
"total_steps": 4,
"next_step_required": True,
"findings": "Application fails to start with configuration errors",
"files_checked": [config_file],
"relevant_files": [config_file],
"relevant_methods": [],
"hypothesis": "Configuration issue causing startup failure",
"confidence": "low",
"model": "flash",
},
)
if not response1 or not continuation_id:
self.logger.error("Failed to start multi-step file context test")
return False
response1_data = self._parse_debug_response(response1)
# Validate step 1 - should use reference_only
file_context1 = response1_data.get("file_context", {})
if file_context1.get("type") != "reference_only":
self.logger.error("Step 1 should use reference_only file context")
return False
self.logger.info(" ✅ Step 1: reference_only file context")
# Step 2: Expand investigation
self.logger.info(" 1.6.2: Step 2 - Expand investigation")
response2, _ = self.call_mcp_tool(
"debug",
{
"step": "Found configuration issue - investigating database server initialization",
"step_number": 2,
"total_steps": 4,
"next_step_required": True,
"continuation_id": continuation_id,
"findings": "MAX_CONNECTIONS environment variable contains invalid value, causing CACHE_SIZE calculation to fail",
"files_checked": [config_file, server_file],
"relevant_files": [config_file, server_file],
"relevant_methods": ["DatabaseServer.__init__"],
"hypothesis": "Invalid environment variable causing integer conversion error",
"confidence": "medium",
"model": "flash",
},
)
if not response2:
self.logger.error("Failed to continue to step 2")
return False
response2_data = self._parse_debug_response(response2)
# Validate step 2 - should still use reference_only
file_context2 = response2_data.get("file_context", {})
if file_context2.get("type") != "reference_only":
self.logger.error("Step 2 should use reference_only file context")
return False
# Should reference both files
reference_note = file_context2.get("note", "")
if "config.py" not in reference_note or "database_server.py" not in reference_note:
self.logger.error("Step 2 should reference both files in note")
return False
self.logger.info(" ✅ Step 2: reference_only file context with multiple files")
# Step 3: Deep analysis
self.logger.info(" 1.6.3: Step 3 - Deep analysis")
response3, _ = self.call_mcp_tool(
"debug",
{
"step": "Analyzing the exact error propagation path and impact",
"step_number": 3,
"total_steps": 4,
"next_step_required": True,
"continuation_id": continuation_id,
"findings": "Error occurs in config.py line 8 when MAX_CONNECTIONS is not numeric, then propagates to DatabaseServer.__init__",
"files_checked": [config_file, server_file],
"relevant_files": [config_file, server_file],
"relevant_methods": ["DatabaseServer.__init__"],
"hypothesis": "Need proper error handling and validation for environment variables",
"confidence": "high",
"model": "flash",
},
)
if not response3:
self.logger.error("Failed to continue to step 3")
return False
response3_data = self._parse_debug_response(response3)
# Validate step 3 - should still use reference_only
file_context3 = response3_data.get("file_context", {})
if file_context3.get("type") != "reference_only":
self.logger.error("Step 3 should use reference_only file context")
return False
self.logger.info(" ✅ Step 3: reference_only file context")
# Step 4: Final analysis with expert consultation
self.logger.info(" 1.6.4: Step 4 - Final step with expert analysis")
response4, _ = self.call_mcp_tool(
"debug",
{
"step": "Investigation complete - root cause identified with solution",
"step_number": 4,
"total_steps": 4,
"next_step_required": False, # Final step - should embed files
"continuation_id": continuation_id,
"findings": "Root cause: config.py assumes MAX_CONNECTIONS env var is always a valid integer. Fix: add try/except with default value and proper validation.",
"files_checked": [config_file, server_file],
"relevant_files": [config_file, server_file],
"relevant_methods": ["DatabaseServer.__init__"],
"hypothesis": "Environment variable validation needed with proper error handling",
"confidence": "high",
"model": "flash",
},
)
if not response4:
self.logger.error("Failed to complete to final step")
return False
response4_data = self._parse_debug_response(response4)
# Validate step 4 - should use fully_embedded for expert analysis
file_context4 = response4_data.get("file_context", {})
if file_context4.get("type") != "fully_embedded":
self.logger.error("Step 4 (final) should use fully_embedded file context")
return False
if "expert analysis" not in file_context4.get("context_optimization", "").lower():
self.logger.error("Final step should mention expert analysis in context optimization")
return False
# Verify expert analysis was triggered
if response4_data.get("status") != "calling_expert_analysis":
self.logger.error("Final step should trigger expert analysis")
return False
# Check that expert analysis has file context
expert_analysis = response4_data.get("expert_analysis", {})
if not expert_analysis:
self.logger.error("Expert analysis should be present in final step")
return False
self.logger.info(" ✅ Step 4: fully_embedded file context with expert analysis")
# Validate the complete workflow progression
progression_summary = {
"step_1": "reference_only (new conversation, intermediate)",
"step_2": "reference_only (continuation, intermediate)",
"step_3": "reference_only (continuation, intermediate)",
"step_4": "fully_embedded (continuation, final)",
}
self.logger.info(" 📋 File context progression:")
for step, context_type in progression_summary.items():
self.logger.info(f" {step}: {context_type}")
self.logger.info(" ✅ Multi-step file context optimization test completed successfully")
return True
except Exception as e:
self.logger.error(f"Multi-step file context test failed: {e}")
return False