🚀 Major Enhancement: Workflow-Based Tool Architecture v5.5.0 (#95)
* WIP: new workflow architecture
* WIP: further improvements and cleanup
* WIP: cleanup and docs, replace old tool with new
* WIP: cleanup and docs, replace old tool with new
* WIP: new planner implementation using workflow
* WIP: precommit tool working as a workflow instead of a basic tool; adds support for passing False to use_assistant_model to skip external models completely and use Claude only
* WIP: precommit workflow version swapped with old
* WIP: codereview
* WIP: replaced codereview
* WIP: replaced codereview
* WIP: replaced refactor
* WIP: workflow for thinkdeep
* WIP: ensure files get embedded correctly
* WIP: thinkdeep replaced with workflow version
* WIP: improved messaging when an external model's response is received
* WIP: analyze tool swapped
* WIP: updated tests
* Extract only the content when building history
* Use "relevant_files" for workflow tools only
* WIP: updated tests
* Extract only the content when building history
* Use "relevant_files" for workflow tools only
* WIP: fixed get_completion_next_steps_message missing param
* Fixed tests; request files consistently
* Fixed tests; request files consistently
* Fixed tests
* New testgen workflow tool; updated docs
* Swap testgen workflow
* Fix CI test failures by excluding API-dependent tests
  - Update GitHub Actions workflow to exclude simulation tests that require API keys
  - Fix collaboration tests to properly mock workflow tool expert analysis calls
  - Update test assertions to handle new workflow tool response format
  - Ensure unit tests run without external API dependencies in CI

  🤖 Generated with [Claude Code](https://claude.ai/code)

  Co-Authored-By: Claude <noreply@anthropic.com>
* WIP - Update tests to match new tools
* WIP - Update tests to match new tools

---------

Co-authored-by: Claude <noreply@anthropic.com>
Committed by: GitHub
Parent: 4dae6e457e
Commit: 69a3121452
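The commit message above describes step-based workflow tools and an option to skip the external assistant model entirely. As a rough, non-authoritative sketch, a workflow tool call in this architecture carries the step fields used throughout the tests in this diff; the dict literal below is illustrative, and the exact placement of use_assistant_model is an assumption based on the commit message:

```python
# Illustrative request shape for the new workflow tools (field names taken from
# the test calls in this diff; the schema itself is not defined here).
workflow_step_request = {
    "step": "Investigate the reported failure and record findings",
    "step_number": 1,            # 1-based index of the current step
    "total_steps": 3,            # planned number of steps
    "next_step_required": True,  # False on the final step hands off to expert analysis
    "findings": "Initial observations recorded by the client model",
    "relevant_files": ["/absolute/path/to/file.py"],
    "model": "flash",
    # Per the commit message, passing False skips external models completely
    # and lets Claude work alone (assumed field placement).
    "use_assistant_model": False,
}
```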
@@ -6,7 +6,9 @@ Each test is in its own file for better organization and maintainability.
"""

from .base_test import BaseSimulatorTest
from .test_analyze_validation import AnalyzeValidationTest
from .test_basic_conversation import BasicConversationTest
from .test_codereview_validation import CodeReviewValidationTest
from .test_consensus_conversation import TestConsensusConversation
from .test_consensus_stance import TestConsensusStance
from .test_consensus_three_models import TestConsensusThreeModels
@@ -27,10 +29,12 @@ from .test_openrouter_models import OpenRouterModelsTest
from .test_per_tool_deduplication import PerToolDeduplicationTest
from .test_planner_continuation_history import PlannerContinuationHistoryTest
from .test_planner_validation import PlannerValidationTest
from .test_precommitworkflow_validation import PrecommitWorkflowValidationTest

# Redis validation test removed - no longer needed for standalone server
from .test_refactor_validation import RefactorValidationTest
from .test_testgen_validation import TestGenValidationTest
from .test_thinkdeep_validation import ThinkDeepWorkflowValidationTest
from .test_token_allocation_validation import TokenAllocationValidationTest
from .test_vision_capability import VisionCapabilityTest
from .test_xai_models import XAIModelsTest
@@ -38,6 +42,7 @@ from .test_xai_models import XAIModelsTest

# Test registry for dynamic loading
TEST_REGISTRY = {
    "basic_conversation": BasicConversationTest,
    "codereview_validation": CodeReviewValidationTest,
    "content_validation": ContentValidationTest,
    "per_tool_deduplication": PerToolDeduplicationTest,
    "cross_tool_continuation": CrossToolContinuationTest,
@@ -52,8 +57,10 @@ TEST_REGISTRY = {
    "openrouter_models": OpenRouterModelsTest,
    "planner_validation": PlannerValidationTest,
    "planner_continuation_history": PlannerContinuationHistoryTest,
    "precommit_validation": PrecommitWorkflowValidationTest,
    "token_allocation_validation": TokenAllocationValidationTest,
    "testgen_validation": TestGenValidationTest,
    "thinkdeep_validation": ThinkDeepWorkflowValidationTest,
    "refactor_validation": RefactorValidationTest,
    "debug_validation": DebugValidationTest,
    "debug_certain_confidence": DebugCertainConfidenceTest,
@@ -63,19 +70,20 @@ TEST_REGISTRY = {
    "consensus_conversation": TestConsensusConversation,
    "consensus_stance": TestConsensusStance,
    "consensus_three_models": TestConsensusThreeModels,
    "analyze_validation": AnalyzeValidationTest,
    # "o3_pro_expensive": O3ProExpensiveTest,  # COMMENTED OUT - too expensive to run by default
}

__all__ = [
    "BaseSimulatorTest",
    "BasicConversationTest",
    "CodeReviewValidationTest",
    "ContentValidationTest",
    "PerToolDeduplicationTest",
    "CrossToolContinuationTest",
    "CrossToolComprehensiveTest",
    "LineNumberValidationTest",
    "LogsValidationTest",
    # "RedisValidationTest",  # Removed - no longer needed for standalone server
    "TestModelThinkingConfig",
    "O3ModelSelectionTest",
    "O3ProExpensiveTest",
@@ -84,8 +92,10 @@ __all__ = [
    "OpenRouterModelsTest",
    "PlannerValidationTest",
    "PlannerContinuationHistoryTest",
    "PrecommitWorkflowValidationTest",
    "TokenAllocationValidationTest",
    "TestGenValidationTest",
    "ThinkDeepWorkflowValidationTest",
    "RefactorValidationTest",
    "DebugValidationTest",
    "DebugCertainConfidenceTest",
@@ -95,5 +105,6 @@ __all__ = [
    "TestConsensusConversation",
    "TestConsensusStance",
    "TestConsensusThreeModels",
    "AnalyzeValidationTest",
    "TEST_REGISTRY",
]
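The registry above exists so the simulator runner can load tests dynamically by name. A minimal sketch of how that lookup could be driven, assuming a no-argument constructor and relying only on the run_test() -> bool interface the tests in this diff define:

```python
from simulator_tests import TEST_REGISTRY


def run_selected(names: list[str]) -> bool:
    """Instantiate and run registered simulator tests by name (sketch only)."""
    all_passed = True
    for name in names:
        test_cls = TEST_REGISTRY[name]  # e.g. DebugValidationTest
        test = test_cls()               # assumed no-argument constructor
        all_passed = test.run_test() and all_passed
    return all_passed


# Example: run the two workflow-tool suites touched by this change
run_selected(["debug_validation", "planner_validation"])
```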
@@ -228,6 +228,10 @@ class Calculator:

        # Look for continuation_id in various places
        if isinstance(response_data, dict):
            # Check for direct continuation_id field (new workflow tools)
            if "continuation_id" in response_data:
                return response_data["continuation_id"]

            # Check metadata
            metadata = response_data.get("metadata", {})
            if "thread_id" in metadata:

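For reference, the full lookup order the hunk above implements can be written as a self-contained helper; the function name and the JSON-parsing wrapper are illustrative assumptions rather than code from this change:

```python
import json
from typing import Optional


def extract_continuation_id(response_text: str) -> Optional[str]:
    """Return a continuation id from a tool response, if one is present (sketch)."""
    try:
        response_data = json.loads(response_text)
    except (json.JSONDecodeError, TypeError):
        return None
    if isinstance(response_data, dict):
        # New workflow tools expose continuation_id at the top level
        if "continuation_id" in response_data:
            return response_data["continuation_id"]
        # Fall back to the thread_id carried in metadata
        metadata = response_data.get("metadata", {})
        if "thread_id" in metadata:
            return metadata["thread_id"]
    return None
```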
@@ -80,8 +80,10 @@ class ConversationBaseTest(BaseSimulatorTest):
        if project_root not in sys.path:
            sys.path.insert(0, project_root)

        # Import tools from server
        from server import TOOLS
        # Import and configure providers first (this is what main() does)
        from server import TOOLS, configure_providers

        configure_providers()

        self._tools = TOOLS
        self.logger.debug(f"Imported {len(self._tools)} tools for in-process testing")

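With configure_providers() called and TOOLS imported, the base class can dispatch tool calls in-process instead of talking to a running MCP server. A rough sketch of what such a dispatch might look like; the async execute() entry point and its return type are assumptions about the tool interface, not something shown in this diff:

```python
import asyncio


def call_tool_in_process(tools: dict, name: str, arguments: dict):
    """Dispatch a tool call against the in-memory TOOLS mapping (illustrative only)."""
    tool = tools[name]  # e.g. tools["debug"] or tools["planner"]
    # Assumed interface: each tool object exposes an async execute(arguments)
    # entry point that returns the serialized tool output.
    return asyncio.run(tool.execute(arguments))
```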
simulator_tests/test_analyze_validation.py (new file, 1079 lines): diff suppressed because it is too large
simulator_tests/test_codereview_validation.py (new file, 1027 lines): diff suppressed because it is too large
@@ -62,7 +62,7 @@ class CrossToolContinuationTest(ConversationBaseTest):
        self.logger.info(" 1: Testing chat -> thinkdeep -> codereview")

        # Start with chat
        chat_response, chat_id = self.call_mcp_tool_direct(
        chat_response, chat_id = self.call_mcp_tool(
            "chat",
            {
                "prompt": "Please use low thinking mode. Look at this Python code and tell me what you think about it",
@@ -76,11 +76,15 @@ class CrossToolContinuationTest(ConversationBaseTest):
            return False

        # Continue with thinkdeep
        thinkdeep_response, _ = self.call_mcp_tool_direct(
        thinkdeep_response, _ = self.call_mcp_tool(
            "thinkdeep",
            {
                "prompt": "Please use low thinking mode. Think deeply about potential performance issues in this code",
                "files": [self.test_files["python"]],  # Same file should be deduplicated
                "step": "Think deeply about potential performance issues in this code. Please use low thinking mode.",
                "step_number": 1,
                "total_steps": 1,
                "next_step_required": False,
                "findings": "Building on previous chat analysis to examine performance issues",
                "relevant_files": [self.test_files["python"]],  # Same file should be deduplicated
                "continuation_id": chat_id,
                "model": "flash",
            },
@@ -91,11 +95,15 @@ class CrossToolContinuationTest(ConversationBaseTest):
            return False

        # Continue with codereview
        codereview_response, _ = self.call_mcp_tool_direct(
        codereview_response, _ = self.call_mcp_tool(
            "codereview",
            {
                "files": [self.test_files["python"]],  # Same file should be deduplicated
                "prompt": "Building on our previous analysis, provide a comprehensive code review",
                "step": "Building on our previous analysis, provide a comprehensive code review",
                "step_number": 1,
                "total_steps": 1,
                "next_step_required": False,
                "findings": "Continuing from previous chat and thinkdeep analysis for comprehensive review",
                "relevant_files": [self.test_files["python"]],  # Same file should be deduplicated
                "continuation_id": chat_id,
                "model": "flash",
            },
@@ -118,11 +126,15 @@ class CrossToolContinuationTest(ConversationBaseTest):
        self.logger.info(" 2: Testing analyze -> debug -> thinkdeep")

        # Start with analyze
        analyze_response, analyze_id = self.call_mcp_tool_direct(
        analyze_response, analyze_id = self.call_mcp_tool(
            "analyze",
            {
                "files": [self.test_files["python"]],
                "prompt": "Analyze this code for quality and performance issues",
                "step": "Analyze this code for quality and performance issues",
                "step_number": 1,
                "total_steps": 1,
                "next_step_required": False,
                "findings": "Starting analysis of Python code for quality and performance issues",
                "relevant_files": [self.test_files["python"]],
                "model": "flash",
            },
        )
@@ -132,11 +144,15 @@ class CrossToolContinuationTest(ConversationBaseTest):
            return False

        # Continue with debug
        debug_response, _ = self.call_mcp_tool_direct(
        debug_response, _ = self.call_mcp_tool(
            "debug",
            {
                "files": [self.test_files["python"]],  # Same file should be deduplicated
                "prompt": "Based on our analysis, help debug the performance issue in fibonacci",
                "step": "Based on our analysis, help debug the performance issue in fibonacci",
                "step_number": 1,
                "total_steps": 1,
                "next_step_required": False,
                "findings": "Building on previous analysis to debug specific performance issue",
                "relevant_files": [self.test_files["python"]],  # Same file should be deduplicated
                "continuation_id": analyze_id,
                "model": "flash",
            },
@@ -147,11 +163,15 @@ class CrossToolContinuationTest(ConversationBaseTest):
            return False

        # Continue with thinkdeep
        final_response, _ = self.call_mcp_tool_direct(
        final_response, _ = self.call_mcp_tool(
            "thinkdeep",
            {
                "prompt": "Please use low thinking mode. Think deeply about the architectural implications of the issues we've found",
                "files": [self.test_files["python"]],  # Same file should be deduplicated
                "step": "Think deeply about the architectural implications of the issues we've found. Please use low thinking mode.",
                "step_number": 1,
                "total_steps": 1,
                "next_step_required": False,
                "findings": "Building on analysis and debug findings to explore architectural implications",
                "relevant_files": [self.test_files["python"]],  # Same file should be deduplicated
                "continuation_id": analyze_id,
                "model": "flash",
            },
@@ -174,7 +194,7 @@ class CrossToolContinuationTest(ConversationBaseTest):
        self.logger.info(" 3: Testing multi-file cross-tool continuation")

        # Start with both files
        multi_response, multi_id = self.call_mcp_tool_direct(
        multi_response, multi_id = self.call_mcp_tool(
            "chat",
            {
                "prompt": "Please use low thinking mode. Analyze both the Python code and configuration file",
@@ -188,11 +208,15 @@ class CrossToolContinuationTest(ConversationBaseTest):
            return False

        # Switch to codereview with same files (should use conversation history)
        multi_review, _ = self.call_mcp_tool_direct(
        multi_review, _ = self.call_mcp_tool(
            "codereview",
            {
                "files": [self.test_files["python"], self.test_files["config"]],  # Same files
                "prompt": "Review both files in the context of our previous discussion",
                "step": "Review both files in the context of our previous discussion",
                "step_number": 1,
                "total_steps": 1,
                "next_step_required": False,
                "findings": "Continuing multi-file analysis with code review perspective",
                "relevant_files": [self.test_files["python"], self.test_files["config"]],  # Same files
                "continuation_id": multi_id,
                "model": "flash",
            },

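All three scenarios above follow the same cross-tool continuation pattern: the continuation_id returned by the first call is threaded into every follow-up call, whichever tool handles it, while relevant_files repeats the same paths so deduplication can kick in. A condensed, illustrative sketch of that pattern (assuming a ConversationBaseTest-style instance is passed in):

```python
def run_cross_tool_continuation(test) -> bool:
    """Illustrative only: thread one continuation_id through two different tools."""
    # First tool starts the conversation and returns an id
    chat_response, chat_id = test.call_mcp_tool(
        "chat",
        {"prompt": "Please use low thinking mode. Look at this Python code", "model": "flash"},
    )
    if not chat_response or not chat_id:
        return False
    # Second tool reuses the same conversation via continuation_id
    review_response, _ = test.call_mcp_tool(
        "codereview",
        {
            "step": "Review the file discussed in the chat above",
            "step_number": 1,
            "total_steps": 1,
            "next_step_required": False,
            "findings": "Follow-up review building on the chat",
            "relevant_files": [test.test_files["python"]],
            "continuation_id": chat_id,  # reuse the conversation started by chat
            "model": "flash",
        },
    )
    return bool(review_response)
```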
@@ -1,13 +1,10 @@
#!/usr/bin/env python3
"""
Debug Tool Self-Investigation Validation Test
DebugWorkflow Tool Validation Test

Tests the debug tool's systematic self-investigation capabilities including:
- Step-by-step investigation with proper JSON responses
- Progressive tracking of findings, files, and methods
- Hypothesis formation and confidence tracking
- Backtracking and revision capabilities
- Final expert analysis after investigation completion
Tests the debug tool's capabilities using the new workflow architecture.
This validates that the new workflow-based implementation maintains
all the functionality of the original debug tool.
"""

import json
@@ -17,7 +14,7 @@ from .conversation_base_test import ConversationBaseTest


class DebugValidationTest(ConversationBaseTest):
    """Test debug tool's self-investigation and expert analysis features"""
    """Test debug tool with new workflow architecture"""

    @property
    def test_name(self) -> str:
@@ -25,15 +22,15 @@ class DebugValidationTest(ConversationBaseTest):

    @property
    def test_description(self) -> str:
        return "Debug tool self-investigation pattern validation"
        return "Debug tool validation with new workflow architecture"

    def run_test(self) -> bool:
        """Test debug tool self-investigation capabilities"""
        """Test debug tool capabilities"""
        # Set up the test environment
        self.setUp()

        try:
            self.logger.info("Test: Debug tool self-investigation validation")
            self.logger.info("Test: DebugWorkflow tool validation (new architecture)")

            # Create a Python file with a subtle but realistic bug
            self._create_buggy_code()
@@ -50,11 +47,23 @@ class DebugValidationTest(ConversationBaseTest):
            if not self._test_complete_investigation_with_analysis():
                return False

            # Test 4: Certain confidence behavior
            if not self._test_certain_confidence():
                return False

            # Test 5: Context-aware file embedding
            if not self._test_context_aware_file_embedding():
                return False

            # Test 6: Multi-step file context optimization
            if not self._test_multi_step_file_context():
                return False

            self.logger.info(" ✅ All debug validation tests passed")
            return True

        except Exception as e:
            self.logger.error(f"Debug validation test failed: {e}")
            self.logger.error(f"DebugWorkflow validation test failed: {e}")
            return False

    def _create_buggy_code(self):
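The added tests 4 through 6 hinge on the status values the debug workflow reports at each stage: intermediate steps pause the investigation, the final step normally calls expert analysis, and a final step declared with certain confidence skips the external model. A small sketch of that mapping as asserted by the validation steps further down; the helper itself is illustrative, not part of this change:

```python
def expected_debug_status(next_step_required: bool, confidence: str = "low") -> str:
    """Status the debug workflow test expects for a given step (sketch only)."""
    if next_step_required:
        # Intermediate steps pause so the client can keep investigating
        return "pause_for_investigation"
    if confidence == "certain":
        # Certain confidence skips the external expert model entirely
        return "certain_confidence_proceed_with_fix"
    # A normal final step hands the findings to expert analysis
    return "calling_expert_analysis"
```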
@@ -164,8 +173,8 @@ RuntimeError: dictionary changed size during iteration
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Validate step 1 response structure
|
||||
if not self._validate_step_response(response1_data, 1, 4, True, "investigation_in_progress"):
|
||||
# Validate step 1 response structure - expect pause_for_investigation for next_step_required=True
|
||||
if not self._validate_step_response(response1_data, 1, 4, True, "pause_for_investigation"):
|
||||
return False
|
||||
|
||||
self.logger.info(f" ✅ Step 1 successful, continuation_id: {continuation_id}")
|
||||
@@ -194,7 +203,7 @@ RuntimeError: dictionary changed size during iteration
|
||||
return False
|
||||
|
||||
response2_data = self._parse_debug_response(response2)
|
||||
if not self._validate_step_response(response2_data, 2, 4, True, "investigation_in_progress"):
|
||||
if not self._validate_step_response(response2_data, 2, 4, True, "pause_for_investigation"):
|
||||
return False
|
||||
|
||||
# Check investigation status tracking
|
||||
@@ -213,35 +222,6 @@ RuntimeError: dictionary changed size during iteration
|
||||
|
||||
self.logger.info(" ✅ Step 2 successful with proper tracking")
|
||||
|
||||
# Step 3: Validate hypothesis
|
||||
self.logger.info(" 1.1.3: Step 3 - Hypothesis validation")
|
||||
response3, _ = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "Confirming the bug pattern: the for loop iterates over self.active_sessions.items() while del self.active_sessions[session_id] modifies the dictionary inside the loop.",
|
||||
"step_number": 3,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Confirmed: Line 44-47 shows classic dictionary modification during iteration bug. The fix would be to collect expired session IDs first, then delete them after iteration completes.",
|
||||
"files_checked": [self.buggy_file],
|
||||
"relevant_files": [self.buggy_file],
|
||||
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
|
||||
"hypothesis": "Dictionary modification during iteration in cleanup_expired_sessions causes RuntimeError",
|
||||
"confidence": "high",
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
if not response3:
|
||||
self.logger.error("Failed to continue investigation to step 3")
|
||||
return False
|
||||
|
||||
response3_data = self._parse_debug_response(response3)
|
||||
if not self._validate_step_response(response3_data, 3, 4, True, "investigation_in_progress"):
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Investigation session progressing successfully")
|
||||
|
||||
# Store continuation_id for next test
|
||||
self.investigation_continuation_id = continuation_id
|
||||
return True
|
||||
@@ -321,7 +301,7 @@ RuntimeError: dictionary changed size during iteration
|
||||
return False
|
||||
|
||||
response3_data = self._parse_debug_response(response3)
|
||||
if not self._validate_step_response(response3_data, 3, 4, True, "investigation_in_progress"):
|
||||
if not self._validate_step_response(response3_data, 3, 4, True, "pause_for_investigation"):
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Backtracking working correctly")
|
||||
@@ -386,7 +366,7 @@ RuntimeError: dictionary changed size during iteration
|
||||
if not response_final_data:
|
||||
return False
|
||||
|
||||
# Validate final response structure
|
||||
# Validate final response structure - expect calling_expert_analysis for next_step_required=False
|
||||
if response_final_data.get("status") != "calling_expert_analysis":
|
||||
self.logger.error(
|
||||
f"Expected status 'calling_expert_analysis', got '{response_final_data.get('status')}'"
|
||||
@@ -433,38 +413,67 @@ RuntimeError: dictionary changed size during iteration
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Complete investigation with expert analysis successful")
|
||||
|
||||
# Validate logs
|
||||
self.logger.info(" 📋 Validating execution logs...")
|
||||
|
||||
# Get server logs
|
||||
logs = self.get_recent_server_logs(500)
|
||||
|
||||
# Look for debug tool execution patterns
|
||||
debug_patterns = [
|
||||
"debug tool",
|
||||
"investigation",
|
||||
"Expert analysis",
|
||||
"calling_expert_analysis",
|
||||
]
|
||||
|
||||
patterns_found = 0
|
||||
for pattern in debug_patterns:
|
||||
if pattern in logs:
|
||||
patterns_found += 1
|
||||
self.logger.debug(f" ✅ Found log pattern: {pattern}")
|
||||
|
||||
if patterns_found >= 2:
|
||||
self.logger.info(f" ✅ Log validation passed ({patterns_found}/{len(debug_patterns)} patterns)")
|
||||
else:
|
||||
self.logger.warning(f" ⚠️ Only found {patterns_found}/{len(debug_patterns)} log patterns")
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Complete investigation test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_certain_confidence(self) -> bool:
|
||||
"""Test certain confidence behavior - should skip expert analysis"""
|
||||
try:
|
||||
self.logger.info(" 1.4: Testing certain confidence behavior")
|
||||
|
||||
# Test certain confidence - should skip expert analysis
|
||||
self.logger.info(" 1.4.1: Certain confidence investigation")
|
||||
response_certain, _ = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "I have confirmed the exact root cause with 100% certainty: dictionary modification during iteration.",
|
||||
"step_number": 1,
|
||||
"total_steps": 1,
|
||||
"next_step_required": False, # Final step
|
||||
"findings": "The bug is on line 44-47: for loop iterates over dict.items() while del modifies the dict inside the loop. Fix is simple: collect expired IDs first, then delete after iteration.",
|
||||
"files_checked": [self.buggy_file],
|
||||
"relevant_files": [self.buggy_file],
|
||||
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
|
||||
"hypothesis": "Dictionary modification during iteration causes RuntimeError - fix is straightforward",
|
||||
"confidence": "certain", # This should skip expert analysis
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response_certain:
|
||||
self.logger.error("Failed to test certain confidence")
|
||||
return False
|
||||
|
||||
response_certain_data = self._parse_debug_response(response_certain)
|
||||
if not response_certain_data:
|
||||
return False
|
||||
|
||||
# Validate certain confidence response - should skip expert analysis
|
||||
if response_certain_data.get("status") != "certain_confidence_proceed_with_fix":
|
||||
self.logger.error(
|
||||
f"Expected status 'certain_confidence_proceed_with_fix', got '{response_certain_data.get('status')}'"
|
||||
)
|
||||
return False
|
||||
|
||||
if not response_certain_data.get("skip_expert_analysis"):
|
||||
self.logger.error("Expected skip_expert_analysis=true for certain confidence")
|
||||
return False
|
||||
|
||||
expert_analysis = response_certain_data.get("expert_analysis", {})
|
||||
if expert_analysis.get("status") != "skipped_due_to_certain_confidence":
|
||||
self.logger.error("Expert analysis should be skipped for certain confidence")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Certain confidence behavior working correctly")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Certain confidence test failed: {e}")
|
||||
return False
|
||||
|
||||
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
|
||||
"""Call an MCP tool in-process - override for debug-specific response handling"""
|
||||
# Use in-process implementation to maintain conversation memory
|
||||
@@ -537,9 +546,6 @@ RuntimeError: dictionary changed size during iteration
|
||||
self.logger.error("Missing investigation_status in response")
|
||||
return False
|
||||
|
||||
# Output field removed in favor of contextual next_steps
|
||||
# No longer checking for "output" field as it was redundant
|
||||
|
||||
# Check next_steps guidance
|
||||
if not response_data.get("next_steps"):
|
||||
self.logger.error("Missing next_steps guidance in response")
|
||||
@@ -550,3 +556,406 @@ RuntimeError: dictionary changed size during iteration
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error validating step response: {e}")
|
||||
return False
|
||||
|
||||
def _test_context_aware_file_embedding(self) -> bool:
|
||||
"""Test context-aware file embedding optimization"""
|
||||
try:
|
||||
self.logger.info(" 1.5: Testing context-aware file embedding")
|
||||
|
||||
# Create multiple test files for context testing
|
||||
file1_content = """#!/usr/bin/env python3
|
||||
def process_data(data):
|
||||
\"\"\"Process incoming data\"\"\"
|
||||
result = []
|
||||
for item in data:
|
||||
if item.get('valid'):
|
||||
result.append(item['value'])
|
||||
return result
|
||||
"""
|
||||
|
||||
file2_content = """#!/usr/bin/env python3
|
||||
def validate_input(data):
|
||||
\"\"\"Validate input data\"\"\"
|
||||
if not isinstance(data, list):
|
||||
raise ValueError("Data must be a list")
|
||||
|
||||
for item in data:
|
||||
if not isinstance(item, dict):
|
||||
raise ValueError("Items must be dictionaries")
|
||||
if 'value' not in item:
|
||||
raise ValueError("Items must have 'value' key")
|
||||
|
||||
return True
|
||||
"""
|
||||
|
||||
# Create test files
|
||||
file1 = self.create_additional_test_file("data_processor.py", file1_content)
|
||||
file2 = self.create_additional_test_file("validator.py", file2_content)
|
||||
|
||||
# Test 1: New conversation, intermediate step - should only reference files
|
||||
self.logger.info(" 1.5.1: New conversation intermediate step (should reference only)")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "Starting investigation of data processing pipeline",
|
||||
"step_number": 1,
|
||||
"total_steps": 3,
|
||||
"next_step_required": True, # Intermediate step
|
||||
"findings": "Initial analysis of data processing components",
|
||||
"files_checked": [file1, file2],
|
||||
"relevant_files": [file1], # This should be referenced, not embedded
|
||||
"relevant_methods": ["process_data"],
|
||||
"hypothesis": "Investigating data flow",
|
||||
"confidence": "low",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start context-aware file embedding test")
|
||||
return False
|
||||
|
||||
response1_data = self._parse_debug_response(response1)
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Check file context - should be reference_only for intermediate step
|
||||
file_context = response1_data.get("file_context", {})
|
||||
if file_context.get("type") != "reference_only":
|
||||
self.logger.error(f"Expected reference_only file context, got: {file_context.get('type')}")
|
||||
return False
|
||||
|
||||
if "Files referenced but not embedded" not in file_context.get("context_optimization", ""):
|
||||
self.logger.error("Expected context optimization message for reference_only")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Intermediate step correctly uses reference_only file context")
|
||||
|
||||
# Test 2: Intermediate step with continuation - should still only reference
|
||||
self.logger.info(" 1.5.2: Intermediate step with continuation (should reference only)")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "Continuing investigation with more detailed analysis",
|
||||
"step_number": 2,
|
||||
"total_steps": 3,
|
||||
"next_step_required": True, # Still intermediate
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Found potential issues in validation logic",
|
||||
"files_checked": [file1, file2],
|
||||
"relevant_files": [file1, file2], # Both files referenced
|
||||
"relevant_methods": ["process_data", "validate_input"],
|
||||
"hypothesis": "Validation might be too strict",
|
||||
"confidence": "medium",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue to step 2")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_debug_response(response2)
|
||||
if not response2_data:
|
||||
return False
|
||||
|
||||
# Check file context - should still be reference_only
|
||||
file_context2 = response2_data.get("file_context", {})
|
||||
if file_context2.get("type") != "reference_only":
|
||||
self.logger.error(f"Expected reference_only file context for step 2, got: {file_context2.get('type')}")
|
||||
return False
|
||||
|
||||
# Should include reference note
|
||||
if not file_context2.get("note"):
|
||||
self.logger.error("Expected file reference note for intermediate step")
|
||||
return False
|
||||
|
||||
reference_note = file_context2.get("note", "")
|
||||
if "data_processor.py" not in reference_note or "validator.py" not in reference_note:
|
||||
self.logger.error("File reference note should mention both files")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Intermediate step with continuation correctly uses reference_only")
|
||||
|
||||
# Test 3: Final step - should embed files for expert analysis
|
||||
self.logger.info(" 1.5.3: Final step (should embed files)")
|
||||
response3, _ = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "Investigation complete - identified the root cause",
|
||||
"step_number": 3,
|
||||
"total_steps": 3,
|
||||
"next_step_required": False, # Final step - should embed files
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Root cause: validator is rejecting valid data due to strict type checking",
|
||||
"files_checked": [file1, file2],
|
||||
"relevant_files": [file1, file2], # Should be fully embedded
|
||||
"relevant_methods": ["process_data", "validate_input"],
|
||||
"hypothesis": "Validation logic is too restrictive for valid edge cases",
|
||||
"confidence": "high",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response3:
|
||||
self.logger.error("Failed to complete to final step")
|
||||
return False
|
||||
|
||||
response3_data = self._parse_debug_response(response3)
|
||||
if not response3_data:
|
||||
return False
|
||||
|
||||
# Check file context - should be fully_embedded for final step
|
||||
file_context3 = response3_data.get("file_context", {})
|
||||
if file_context3.get("type") != "fully_embedded":
|
||||
self.logger.error(
|
||||
f"Expected fully_embedded file context for final step, got: {file_context3.get('type')}"
|
||||
)
|
||||
return False
|
||||
|
||||
if "Full file content embedded for expert analysis" not in file_context3.get("context_optimization", ""):
|
||||
self.logger.error("Expected expert analysis optimization message for fully_embedded")
|
||||
return False
|
||||
|
||||
# Should show files embedded count
|
||||
files_embedded = file_context3.get("files_embedded", 0)
|
||||
if files_embedded == 0:
|
||||
# This is OK - files might already be in conversation history
|
||||
self.logger.info(
|
||||
" ℹ️ Files embedded count is 0 - files already in conversation history (smart deduplication)"
|
||||
)
|
||||
else:
|
||||
self.logger.info(f" ✅ Files embedded count: {files_embedded}")
|
||||
|
||||
self.logger.info(" ✅ Final step correctly uses fully_embedded file context")
|
||||
|
||||
# Verify expert analysis was called for final step
|
||||
if response3_data.get("status") != "calling_expert_analysis":
|
||||
self.logger.error("Final step should trigger expert analysis")
|
||||
return False
|
||||
|
||||
if "expert_analysis" not in response3_data:
|
||||
self.logger.error("Expert analysis should be present in final step")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Context-aware file embedding test completed successfully")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Context-aware file embedding test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_multi_step_file_context(self) -> bool:
|
||||
"""Test multi-step workflow with proper file context transitions"""
|
||||
try:
|
||||
self.logger.info(" 1.6: Testing multi-step file context optimization")
|
||||
|
||||
# Create a complex scenario with multiple files
|
||||
config_content = """#!/usr/bin/env python3
|
||||
import os
|
||||
|
||||
DATABASE_URL = os.getenv('DATABASE_URL', 'sqlite:///app.db')
|
||||
DEBUG_MODE = os.getenv('DEBUG', 'False').lower() == 'true'
|
||||
MAX_CONNECTIONS = int(os.getenv('MAX_CONNECTIONS', '10'))
|
||||
|
||||
# Bug: This will cause issues when MAX_CONNECTIONS is not a valid integer
|
||||
CACHE_SIZE = MAX_CONNECTIONS * 2 # Problematic if MAX_CONNECTIONS is invalid
|
||||
"""
|
||||
|
||||
server_content = """#!/usr/bin/env python3
|
||||
from config import DATABASE_URL, DEBUG_MODE, CACHE_SIZE
|
||||
import sqlite3
|
||||
|
||||
class DatabaseServer:
|
||||
def __init__(self):
|
||||
self.connection_pool = []
|
||||
self.cache_size = CACHE_SIZE # This will fail if CACHE_SIZE is invalid
|
||||
|
||||
def connect(self):
|
||||
try:
|
||||
conn = sqlite3.connect(DATABASE_URL)
|
||||
self.connection_pool.append(conn)
|
||||
return conn
|
||||
except Exception as e:
|
||||
print(f"Connection failed: {e}")
|
||||
return None
|
||||
"""
|
||||
|
||||
# Create test files
|
||||
config_file = self.create_additional_test_file("config.py", config_content)
|
||||
server_file = self.create_additional_test_file("database_server.py", server_content)
|
||||
|
||||
# Step 1: Start investigation (new conversation)
|
||||
self.logger.info(" 1.6.1: Step 1 - Start investigation")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "Investigating application startup failures in production environment",
|
||||
"step_number": 1,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Application fails to start with configuration errors",
|
||||
"files_checked": [config_file],
|
||||
"relevant_files": [config_file],
|
||||
"relevant_methods": [],
|
||||
"hypothesis": "Configuration issue causing startup failure",
|
||||
"confidence": "low",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start multi-step file context test")
|
||||
return False
|
||||
|
||||
response1_data = self._parse_debug_response(response1)
|
||||
|
||||
# Validate step 1 - should use reference_only
|
||||
file_context1 = response1_data.get("file_context", {})
|
||||
if file_context1.get("type") != "reference_only":
|
||||
self.logger.error("Step 1 should use reference_only file context")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 1: reference_only file context")
|
||||
|
||||
# Step 2: Expand investigation
|
||||
self.logger.info(" 1.6.2: Step 2 - Expand investigation")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "Found configuration issue - investigating database server initialization",
|
||||
"step_number": 2,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "MAX_CONNECTIONS environment variable contains invalid value, causing CACHE_SIZE calculation to fail",
|
||||
"files_checked": [config_file, server_file],
|
||||
"relevant_files": [config_file, server_file],
|
||||
"relevant_methods": ["DatabaseServer.__init__"],
|
||||
"hypothesis": "Invalid environment variable causing integer conversion error",
|
||||
"confidence": "medium",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue to step 2")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_debug_response(response2)
|
||||
|
||||
# Validate step 2 - should still use reference_only
|
||||
file_context2 = response2_data.get("file_context", {})
|
||||
if file_context2.get("type") != "reference_only":
|
||||
self.logger.error("Step 2 should use reference_only file context")
|
||||
return False
|
||||
|
||||
# Should reference both files
|
||||
reference_note = file_context2.get("note", "")
|
||||
if "config.py" not in reference_note or "database_server.py" not in reference_note:
|
||||
self.logger.error("Step 2 should reference both files in note")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 2: reference_only file context with multiple files")
|
||||
|
||||
# Step 3: Deep analysis
|
||||
self.logger.info(" 1.6.3: Step 3 - Deep analysis")
|
||||
response3, _ = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "Analyzing the exact error propagation path and impact",
|
||||
"step_number": 3,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Error occurs in config.py line 8 when MAX_CONNECTIONS is not numeric, then propagates to DatabaseServer.__init__",
|
||||
"files_checked": [config_file, server_file],
|
||||
"relevant_files": [config_file, server_file],
|
||||
"relevant_methods": ["DatabaseServer.__init__"],
|
||||
"hypothesis": "Need proper error handling and validation for environment variables",
|
||||
"confidence": "high",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response3:
|
||||
self.logger.error("Failed to continue to step 3")
|
||||
return False
|
||||
|
||||
response3_data = self._parse_debug_response(response3)
|
||||
|
||||
# Validate step 3 - should still use reference_only
|
||||
file_context3 = response3_data.get("file_context", {})
|
||||
if file_context3.get("type") != "reference_only":
|
||||
self.logger.error("Step 3 should use reference_only file context")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 3: reference_only file context")
|
||||
|
||||
# Step 4: Final analysis with expert consultation
|
||||
self.logger.info(" 1.6.4: Step 4 - Final step with expert analysis")
|
||||
response4, _ = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "Investigation complete - root cause identified with solution",
|
||||
"step_number": 4,
|
||||
"total_steps": 4,
|
||||
"next_step_required": False, # Final step - should embed files
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Root cause: config.py assumes MAX_CONNECTIONS env var is always a valid integer. Fix: add try/except with default value and proper validation.",
|
||||
"files_checked": [config_file, server_file],
|
||||
"relevant_files": [config_file, server_file],
|
||||
"relevant_methods": ["DatabaseServer.__init__"],
|
||||
"hypothesis": "Environment variable validation needed with proper error handling",
|
||||
"confidence": "high",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response4:
|
||||
self.logger.error("Failed to complete to final step")
|
||||
return False
|
||||
|
||||
response4_data = self._parse_debug_response(response4)
|
||||
|
||||
# Validate step 4 - should use fully_embedded for expert analysis
|
||||
file_context4 = response4_data.get("file_context", {})
|
||||
if file_context4.get("type") != "fully_embedded":
|
||||
self.logger.error("Step 4 (final) should use fully_embedded file context")
|
||||
return False
|
||||
|
||||
if "expert analysis" not in file_context4.get("context_optimization", "").lower():
|
||||
self.logger.error("Final step should mention expert analysis in context optimization")
|
||||
return False
|
||||
|
||||
# Verify expert analysis was triggered
|
||||
if response4_data.get("status") != "calling_expert_analysis":
|
||||
self.logger.error("Final step should trigger expert analysis")
|
||||
return False
|
||||
|
||||
# Check that expert analysis has file context
|
||||
expert_analysis = response4_data.get("expert_analysis", {})
|
||||
if not expert_analysis:
|
||||
self.logger.error("Expert analysis should be present in final step")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 4: fully_embedded file context with expert analysis")
|
||||
|
||||
# Validate the complete workflow progression
|
||||
progression_summary = {
|
||||
"step_1": "reference_only (new conversation, intermediate)",
|
||||
"step_2": "reference_only (continuation, intermediate)",
|
||||
"step_3": "reference_only (continuation, intermediate)",
|
||||
"step_4": "fully_embedded (continuation, final)",
|
||||
}
|
||||
|
||||
self.logger.info(" 📋 File context progression:")
|
||||
for step, context_type in progression_summary.items():
|
||||
self.logger.info(f" {step}: {context_type}")
|
||||
|
||||
self.logger.info(" ✅ Multi-step file context optimization test completed successfully")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Multi-step file context test failed: {e}")
|
||||
return False
|
||||
|
||||
@@ -60,14 +60,18 @@ def divide(x, y):
        # Step 1: precommit tool with dummy file (low thinking mode)
        self.logger.info(" Step 1: precommit tool with dummy file")
        precommit_params = {
            "step": "Initial analysis of dummy_code.py for commit readiness. Please give me a quick one line reply.",
            "step_number": 1,
            "total_steps": 2,
            "next_step_required": True,
            "findings": "Starting pre-commit validation of dummy_code.py",
            "path": os.getcwd(),  # Use current working directory as the git repo path
            "files": [dummy_file_path],
            "prompt": "Please give me a quick one line reply. Review this code for commit readiness",
            "relevant_files": [dummy_file_path],
            "thinking_mode": "low",
            "model": "flash",
        }

        response1, continuation_id = self.call_mcp_tool_direct("precommit", precommit_params)
        response1, continuation_id = self.call_mcp_tool("precommit", precommit_params)
        if not response1:
            self.logger.error(" ❌ Step 1: precommit tool failed")
            return False
@@ -86,13 +90,17 @@ def divide(x, y):
        # Step 2: codereview tool with same file (NO continuation - fresh conversation)
        self.logger.info(" Step 2: codereview tool with same file (fresh conversation)")
        codereview_params = {
            "files": [dummy_file_path],
            "prompt": "Please give me a quick one line reply. General code review for quality and best practices",
            "step": "Initial code review of dummy_code.py for quality and best practices. Please give me a quick one line reply.",
            "step_number": 1,
            "total_steps": 1,
            "next_step_required": False,
            "findings": "Starting code review of dummy_code.py",
            "relevant_files": [dummy_file_path],
            "thinking_mode": "low",
            "model": "flash",
        }

        response2, _ = self.call_mcp_tool_direct("codereview", codereview_params)
        response2, _ = self.call_mcp_tool("codereview", codereview_params)
        if not response2:
            self.logger.error(" ❌ Step 2: codereview tool failed")
            return False
@@ -115,14 +123,18 @@ def subtract(a, b):
        # Continue precommit with both files
        continue_params = {
            "continuation_id": continuation_id,
            "step": "Continue analysis with new_feature.py added. Please give me a quick one line reply about both files.",
            "step_number": 2,
            "total_steps": 2,
            "next_step_required": False,
            "findings": "Continuing pre-commit validation with both dummy_code.py and new_feature.py",
            "path": os.getcwd(),  # Use current working directory as the git repo path
            "files": [dummy_file_path, new_file_path],  # Old + new file
            "prompt": "Please give me a quick one line reply. Now also review the new feature file along with the previous one",
            "relevant_files": [dummy_file_path, new_file_path],  # Old + new file
            "thinking_mode": "low",
            "model": "flash",
        }

        response3, _ = self.call_mcp_tool_direct("precommit", continue_params)
        response3, _ = self.call_mcp_tool("precommit", continue_params)
        if not response3:
            self.logger.error(" ❌ Step 3: precommit continuation failed")
            return False

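The hunk above mixes two conversation modes: the codereview call deliberately omits continuation_id and therefore starts a fresh conversation, while the second precommit step reuses the id returned by step 1. A compressed, illustrative sketch of that flow; parameter values are placeholders and the helper is not part of this change:

```python
import os


def run_precommit_then_review(test, dummy_file_path: str, new_file_path: str) -> bool:
    """Illustrative only: continued vs. fresh conversations across workflow tools."""
    # Step 1: open a precommit workflow and keep its continuation_id
    _, continuation_id = test.call_mcp_tool(
        "precommit",
        {"step": "Initial analysis", "step_number": 1, "total_steps": 2,
         "next_step_required": True, "findings": "Starting validation",
         "path": os.getcwd(), "relevant_files": [dummy_file_path], "model": "flash"},
    )
    # Fresh conversation: no continuation_id, so codereview starts from scratch
    test.call_mcp_tool(
        "codereview",
        {"step": "Initial review", "step_number": 1, "total_steps": 1,
         "next_step_required": False, "findings": "Starting review",
         "relevant_files": [dummy_file_path], "model": "flash"},
    )
    # Continued conversation: the second precommit step reuses the stored id
    response3, _ = test.call_mcp_tool(
        "precommit",
        {"continuation_id": continuation_id, "step": "Add the new file",
         "step_number": 2, "total_steps": 2, "next_step_required": False,
         "findings": "Validating both files", "path": os.getcwd(),
         "relevant_files": [dummy_file_path, new_file_path], "model": "flash"},
    )
    return bool(response3)
```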
@@ -1,13 +1,11 @@
#!/usr/bin/env python3
"""
Planner Tool Validation Test
PlannerWorkflow Tool Validation Test

Tests the planner tool's sequential planning capabilities including:
- Step-by-step planning with proper JSON responses
- Continuation logic across planning sessions
- Branching and revision capabilities
- Previous plan context loading
- Plan completion and summary storage
Tests the planner tool's capabilities using the new workflow architecture.
This validates that the new workflow-based implementation maintains all the
functionality of the original planner tool while using the workflow pattern
like the debug tool.
"""

import json
@@ -17,7 +15,7 @@ from .conversation_base_test import ConversationBaseTest


class PlannerValidationTest(ConversationBaseTest):
    """Test planner tool's sequential planning and continuation features"""
    """Test planner tool with new workflow architecture"""

    @property
    def test_name(self) -> str:
@@ -25,49 +23,62 @@ class PlannerValidationTest(ConversationBaseTest):

    @property
    def test_description(self) -> str:
        return "Planner tool sequential planning and continuation validation"
        return "PlannerWorkflow tool validation with new workflow architecture"

    def run_test(self) -> bool:
        """Test planner tool sequential planning capabilities"""
        """Test planner tool capabilities"""
        # Set up the test environment
        self.setUp()

        try:
            self.logger.info("Test: Planner tool validation")
            self.logger.info("Test: PlannerWorkflow tool validation (new architecture)")

            # Test 1: Single planning session with multiple steps
            # Test 1: Single planning session with workflow architecture
            if not self._test_single_planning_session():
                return False

            # Test 2: Plan completion and continuation to new planning session
            if not self._test_plan_continuation():
            # Test 2: Planning with continuation using workflow
            if not self._test_planning_with_continuation():
                return False

            # Test 3: Branching and revision capabilities
            # Test 3: Complex plan with deep thinking pauses
            if not self._test_complex_plan_deep_thinking():
                return False

            # Test 4: Self-contained completion (no expert analysis)
            if not self._test_self_contained_completion():
                return False

            # Test 5: Branching and revision with workflow
            if not self._test_branching_and_revision():
                return False

            # Test 6: Workflow file context behavior
            if not self._test_workflow_file_context():
                return False

            self.logger.info(" ✅ All planner validation tests passed")
            return True

        except Exception as e:
            self.logger.error(f"Planner validation test failed: {e}")
            self.logger.error(f"PlannerWorkflow validation test failed: {e}")
            return False

    def _test_single_planning_session(self) -> bool:
        """Test a complete planning session with multiple steps"""
        """Test a complete planning session with workflow architecture"""
        try:
            self.logger.info(" 1.1: Testing single planning session")
            self.logger.info(" 1.1: Testing single planning session with workflow")

            # Step 1: Start planning
            self.logger.info(" 1.1.1: Step 1 - Initial planning step")
            response1, continuation_id = self.call_mcp_tool(
                "planner",
                {
                    "step": "I need to plan a microservices migration for our monolithic e-commerce platform. Let me start by understanding the current architecture and identifying the key business domains.",
                    "step": "I need to plan a comprehensive API redesign for our legacy system. Let me start by analyzing the current state and identifying key requirements for the new API architecture.",
                    "step_number": 1,
                    "total_steps": 5,
                    "total_steps": 4,
                    "next_step_required": True,
                    "model": "flash",
                },
            )

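The workflow responses validated in the hunks below carry their status under a tool-specific key ending in "_status" alongside a required_actions list, and a final planner step reports status "planner_complete" with planning_complete set and a plan_summary. A tiny sketch of the flexible status lookup the test performs; the helper name is illustrative:

```python
def find_workflow_status(response_data: dict) -> dict:
    """Return the tool-specific *_status block from a workflow response (sketch)."""
    for key, value in response_data.items():
        if key.endswith("_status"):  # e.g. a planner-specific status key
            return value if isinstance(value, dict) else {}
    return {}
```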
@@ -80,22 +91,44 @@ class PlannerValidationTest(ConversationBaseTest):
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Validate step 1 response structure
|
||||
if not self._validate_step_response(response1_data, 1, 5, True, "planning_success"):
|
||||
# Validate step 1 response structure - expect pause_for_planner for next_step_required=True
|
||||
if not self._validate_step_response(response1_data, 1, 4, True, "pause_for_planner"):
|
||||
return False
|
||||
|
||||
self.logger.info(f" ✅ Step 1 successful, continuation_id: {continuation_id}")
|
||||
# Debug: Log the actual response structure to see what we're getting
|
||||
self.logger.debug(f"Response structure: {list(response1_data.keys())}")
|
||||
|
||||
# Check workflow-specific response structure (more flexible)
|
||||
status_key = None
|
||||
for key in response1_data.keys():
|
||||
if key.endswith("_status"):
|
||||
status_key = key
|
||||
break
|
||||
|
||||
if not status_key:
|
||||
self.logger.error(f"Missing workflow status field in response: {list(response1_data.keys())}")
|
||||
return False
|
||||
|
||||
self.logger.debug(f"Found status field: {status_key}")
|
||||
|
||||
# Check required_actions for workflow guidance
|
||||
if not response1_data.get("required_actions"):
|
||||
self.logger.error("Missing required_actions in workflow response")
|
||||
return False
|
||||
|
||||
self.logger.info(f" ✅ Step 1 successful with workflow, continuation_id: {continuation_id}")
|
||||
|
||||
# Step 2: Continue planning
|
||||
self.logger.info(" 1.1.2: Step 2 - Domain identification")
|
||||
self.logger.info(" 1.1.2: Step 2 - API domain analysis")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Based on my analysis, I can identify the main business domains: User Management, Product Catalog, Order Processing, Payment, and Inventory. Let me plan how to extract these into separate services.",
|
||||
"step": "After analyzing the current API, I can identify three main domains: User Management, Content Management, and Analytics. Let me design the new API structure with RESTful endpoints and proper versioning.",
|
||||
"step_number": 2,
|
||||
"total_steps": 5,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
@@ -104,21 +137,39 @@ class PlannerValidationTest(ConversationBaseTest):
|
||||
return False
|
||||
|
||||
response2_data = self._parse_planner_response(response2)
|
||||
if not self._validate_step_response(response2_data, 2, 5, True, "planning_success"):
|
||||
if not self._validate_step_response(response2_data, 2, 4, True, "pause_for_planner"):
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 2 successful")
|
||||
# Check step history tracking in workflow (more flexible)
|
||||
status_key = None
|
||||
for key in response2_data.keys():
|
||||
if key.endswith("_status"):
|
||||
status_key = key
|
||||
break
|
||||
|
||||
# Step 3: Final step
|
||||
if status_key:
|
||||
workflow_status = response2_data.get(status_key, {})
|
||||
step_history_length = workflow_status.get("step_history_length", 0)
|
||||
if step_history_length < 2:
|
||||
self.logger.error(f"Step history not properly tracked in workflow: {step_history_length}")
|
||||
return False
|
||||
self.logger.debug(f"Step history length: {step_history_length}")
|
||||
else:
|
||||
self.logger.warning("No workflow status found, skipping step history check")
|
||||
|
||||
self.logger.info(" ✅ Step 2 successful with workflow tracking")
|
||||
|
||||
# Step 3: Final step - should trigger completion
|
||||
self.logger.info(" 1.1.3: Step 3 - Final planning step")
|
||||
response3, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Now I'll create a phased migration strategy: Phase 1 - Extract User Management, Phase 2 - Product Catalog and Inventory, Phase 3 - Order Processing and Payment services. This completes the initial migration plan.",
|
||||
"step": "API redesign plan complete: Phase 1 - User Management API, Phase 2 - Content Management API, Phase 3 - Analytics API. Each phase includes proper authentication, rate limiting, and comprehensive documentation.",
|
||||
"step_number": 3,
|
||||
"total_steps": 3, # Adjusted total
|
||||
"next_step_required": False, # Final step
|
||||
"next_step_required": False, # Final step - should complete without expert analysis
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
@@ -127,125 +178,329 @@ class PlannerValidationTest(ConversationBaseTest):
|
||||
return False
|
||||
|
||||
response3_data = self._parse_planner_response(response3)
|
||||
if not self._validate_final_step_response(response3_data, 3, 3):
|
||||
if not response3_data:
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Planning session completed successfully")
|
||||
# Validate final response structure - should be self-contained completion
|
||||
if response3_data.get("status") != "planner_complete":
|
||||
self.logger.error(f"Expected status 'planner_complete', got '{response3_data.get('status')}'")
|
||||
return False
|
||||
|
||||
if not response3_data.get("planning_complete"):
|
||||
self.logger.error("Expected planning_complete=true for final step")
|
||||
return False
|
||||
|
||||
# Should NOT have expert_analysis (self-contained)
|
||||
if "expert_analysis" in response3_data:
|
||||
self.logger.error("PlannerWorkflow should be self-contained without expert analysis")
|
||||
return False
|
||||
|
||||
# Check plan_summary exists
|
||||
if not response3_data.get("plan_summary"):
|
||||
self.logger.error("Missing plan_summary in final step")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Planning session completed successfully with workflow architecture")
|
||||
|
||||
# Store continuation_id for next test
|
||||
self.migration_continuation_id = continuation_id
|
||||
self.api_continuation_id = continuation_id
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Single planning session test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_plan_continuation(self) -> bool:
|
||||
"""Test continuing from a previous completed plan"""
|
||||
def _test_planning_with_continuation(self) -> bool:
|
||||
"""Test planning continuation with workflow architecture"""
|
||||
try:
|
||||
self.logger.info(" 1.2: Testing plan continuation with previous context")
|
||||
self.logger.info(" 1.2: Testing planning continuation with workflow")
|
||||
|
||||
# Start a new planning session using the continuation_id from previous completed plan
|
||||
self.logger.info(" 1.2.1: New planning session with previous plan context")
|
||||
response1, new_continuation_id = self.call_mcp_tool(
|
||||
# Use continuation from previous test if available
|
||||
continuation_id = getattr(self, "api_continuation_id", None)
|
||||
if not continuation_id:
|
||||
# Start fresh if no continuation available
|
||||
self.logger.info(" 1.2.0: Starting fresh planning session")
|
||||
response0, continuation_id = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Planning API security strategy",
|
||||
"step_number": 1,
|
||||
"total_steps": 2,
|
||||
"next_step_required": True,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
if not response0 or not continuation_id:
|
||||
self.logger.error("Failed to start fresh planning session")
|
||||
return False
|
||||
|
||||
# Test continuation step
|
||||
self.logger.info(" 1.2.1: Continue planning session")
|
||||
response1, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Now that I have the microservices migration plan, let me plan the database strategy. I need to decide how to handle data consistency across the new services.",
|
||||
"step_number": 1, # New planning session starts at step 1
|
||||
"total_steps": 4,
|
||||
"step": "Building on the API redesign, let me now plan the security implementation with OAuth 2.0, API keys, and rate limiting strategies.",
|
||||
"step_number": 2,
|
||||
"total_steps": 2,
|
||||
"next_step_required": True,
|
||||
"continuation_id": self.migration_continuation_id, # Use previous plan's continuation_id
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not new_continuation_id:
|
||||
self.logger.error("Failed to start new planning session with context")
|
||||
if not response1:
|
||||
self.logger.error("Failed to continue planning")
|
||||
return False
|
||||
|
||||
response1_data = self._parse_planner_response(response1)
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Should have previous plan context
|
||||
if "previous_plan_context" not in response1_data:
|
||||
self.logger.error("Expected previous_plan_context in new planning session")
|
||||
# Validate continuation behavior
|
||||
if not self._validate_step_response(response1_data, 2, 2, True, "pause_for_planner"):
|
||||
return False
|
||||
|
||||
# Check for key terms from the previous plan
|
||||
context = response1_data["previous_plan_context"].lower()
|
||||
if "migration" not in context and "plan" not in context:
|
||||
self.logger.error("Previous plan context doesn't contain expected content")
|
||||
# Check that continuation_id is preserved
|
||||
if response1_data.get("continuation_id") != continuation_id:
|
||||
self.logger.error("Continuation ID not preserved in workflow")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ New planning session loaded previous plan context")
|
||||
self.logger.info(" ✅ Planning continuation working with workflow")
|
||||
return True
|
||||
|
||||
# Continue the new planning session (step 2+ should NOT load context)
|
||||
self.logger.info(" 1.2.2: Continue new planning session (no context loading)")
|
||||
except Exception as e:
|
||||
self.logger.error(f"Planning continuation test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_complex_plan_deep_thinking(self) -> bool:
|
||||
"""Test complex plan with deep thinking pauses"""
|
||||
try:
|
||||
self.logger.info(" 1.3: Testing complex plan with deep thinking pauses")
|
||||
|
||||
# Start complex plan (≥5 steps) - should trigger deep thinking
|
||||
self.logger.info(" 1.3.1: Step 1 of complex plan (should trigger deep thinking)")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "I need to plan a complete digital transformation for our enterprise organization, including cloud migration, process automation, and cultural change management.",
|
||||
"step_number": 1,
|
||||
"total_steps": 8, # Complex plan ≥5 steps
|
||||
"next_step_required": True,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start complex planning")
|
||||
return False
|
||||
|
||||
response1_data = self._parse_planner_response(response1)
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Should trigger deep thinking pause for complex plan
|
||||
if response1_data.get("status") != "pause_for_deep_thinking":
|
||||
self.logger.error("Expected deep thinking pause for complex plan step 1")
|
||||
return False
|
||||
|
||||
if not response1_data.get("thinking_required"):
|
||||
self.logger.error("Expected thinking_required=true for complex plan")
|
||||
return False
|
||||
|
||||
# Check required thinking actions
|
||||
required_thinking = response1_data.get("required_thinking", [])
|
||||
if len(required_thinking) < 4:
|
||||
self.logger.error("Expected comprehensive thinking requirements for complex plan")
|
||||
return False
|
||||
|
||||
# Check for deep thinking guidance in next_steps
|
||||
next_steps = response1_data.get("next_steps", "")
|
||||
if "MANDATORY" not in next_steps or "deep thinking" not in next_steps.lower():
|
||||
self.logger.error("Expected mandatory deep thinking guidance")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Complex plan step 1 correctly triggered deep thinking pause")
|
||||
|
||||
# Step 2 of complex plan - should also trigger deep thinking
|
||||
self.logger.info(" 1.3.2: Step 2 of complex plan (should trigger deep thinking)")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "I'll implement a database-per-service pattern with eventual consistency using event sourcing for cross-service communication.",
|
||||
"step": "After deep analysis, I can see this transformation requires three parallel tracks: Technical Infrastructure, Business Process, and Human Capital. Let me design the coordination strategy.",
|
||||
"step_number": 2,
|
||||
"total_steps": 4,
|
||||
"total_steps": 8,
|
||||
"next_step_required": True,
|
||||
"continuation_id": new_continuation_id, # Same continuation, step 2
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue new planning session")
|
||||
self.logger.error("Failed to continue complex planning")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_planner_response(response2)
|
||||
if not response2_data:
|
||||
return False
|
||||
|
||||
# Step 2+ should NOT have previous_plan_context (only step 1 with continuation_id gets context)
|
||||
if "previous_plan_context" in response2_data:
|
||||
self.logger.error("Step 2 should NOT have previous_plan_context")
|
||||
# Step 2 should also trigger deep thinking for complex plans
|
||||
if response2_data.get("status") != "pause_for_deep_thinking":
|
||||
self.logger.error("Expected deep thinking pause for complex plan step 2")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 2 correctly has no previous context (as expected)")
|
||||
self.logger.info(" ✅ Complex plan step 2 correctly triggered deep thinking pause")
|
||||
|
||||
# Step 4 of complex plan - should use normal flow (after step 3)
|
||||
self.logger.info(" 1.3.3: Step 4 of complex plan (should use normal flow)")
|
||||
response4, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Now moving to tactical planning: Phase 1 execution details with specific timelines and resource allocation for the technical infrastructure track.",
|
||||
"step_number": 4,
|
||||
"total_steps": 8,
|
||||
"next_step_required": True,
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response4:
|
||||
self.logger.error("Failed to continue to step 4")
|
||||
return False
|
||||
|
||||
response4_data = self._parse_planner_response(response4)
|
||||
if not response4_data:
|
||||
return False
|
||||
|
||||
# Step 4 should use normal flow (no more deep thinking pauses)
|
||||
if response4_data.get("status") != "pause_for_planner":
|
||||
self.logger.error("Expected normal planning flow for step 4")
|
||||
return False
|
||||
|
||||
if response4_data.get("thinking_required"):
|
||||
self.logger.error("Step 4 should not require special thinking pause")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Complex plan transitions to normal flow after step 3")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Plan continuation test failed: {e}")
|
||||
self.logger.error(f"Complex plan deep thinking test failed: {e}")
|
||||
return False
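# Illustrative sketch only: a step-1 payload for a complex plan (total_steps >= 5)
# that would pass the deep-thinking checks above. Field names come from the
# assertions; the variable name and list entries are placeholder examples.
_EXAMPLE_DEEP_THINKING_PAUSE = {
    "status": "pause_for_deep_thinking",
    "thinking_required": True,
    "required_thinking": ["scope", "dependencies", "risks", "sequencing"],  # >= 4 items
    "next_steps": "MANDATORY: pause for deep thinking before the next step ...",
}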
|
||||
|
||||
def _test_branching_and_revision(self) -> bool:
|
||||
"""Test branching and revision capabilities"""
|
||||
def _test_self_contained_completion(self) -> bool:
|
||||
"""Test self-contained completion without expert analysis"""
|
||||
try:
|
||||
self.logger.info(" 1.3: Testing branching and revision capabilities")
|
||||
self.logger.info(" 1.4: Testing self-contained completion")
|
||||
|
||||
# Start a new planning session for testing branching
|
||||
self.logger.info(" 1.3.1: Start planning session for branching test")
|
||||
# Simple planning session that should complete without expert analysis
|
||||
self.logger.info(" 1.4.1: Simple planning session")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Let me plan the deployment strategy for the microservices. I'll consider different deployment options.",
|
||||
"step": "Planning a simple website redesign with new color scheme and improved navigation.",
|
||||
"step_number": 1,
|
||||
"total_steps": 4,
|
||||
"total_steps": 2,
|
||||
"next_step_required": True,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start branching test planning session")
|
||||
self.logger.error("Failed to start simple planning")
|
||||
return False
|
||||
|
||||
# Test branching
|
||||
self.logger.info(" 1.3.2: Create a branch from step 1")
|
||||
# Final step - should complete without expert analysis
|
||||
self.logger.info(" 1.4.2: Final step - self-contained completion")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Branch A: I'll explore Kubernetes deployment with service mesh (Istio) for advanced traffic management and observability.",
|
||||
"step": "Website redesign plan complete: Phase 1 - Update color palette and typography, Phase 2 - Redesign navigation structure and user flows.",
|
||||
"step_number": 2,
|
||||
"total_steps": 2,
|
||||
"next_step_required": False, # Final step
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to complete simple planning")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_planner_response(response2)
|
||||
if not response2_data:
|
||||
return False
|
||||
|
||||
# Validate self-contained completion
|
||||
if response2_data.get("status") != "planner_complete":
|
||||
self.logger.error("Expected self-contained completion status")
|
||||
return False
|
||||
|
||||
# Should NOT call expert analysis
|
||||
if "expert_analysis" in response2_data:
|
||||
self.logger.error("PlannerWorkflow should not call expert analysis")
|
||||
return False
|
||||
|
||||
# Should have planning_complete flag
|
||||
if not response2_data.get("planning_complete"):
|
||||
self.logger.error("Expected planning_complete=true")
|
||||
return False
|
||||
|
||||
# Should have plan_summary
|
||||
if not response2_data.get("plan_summary"):
|
||||
self.logger.error("Expected plan_summary in completion")
|
||||
return False
|
||||
|
||||
# Check completion instructions
|
||||
output = response2_data.get("output", {})
|
||||
if not output.get("instructions"):
|
||||
self.logger.error("Missing output instructions for plan presentation")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Self-contained completion working correctly")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Self-contained completion test failed: {e}")
|
||||
return False
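# Illustrative sketch only: the completion "output" block read by the check above.
# Only the asserted "instructions" key is shown; the value is a placeholder.
_EXAMPLE_COMPLETION_OUTPUT = {"instructions": "Present the complete plan to the user ..."}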
|
||||
|
||||
def _test_branching_and_revision(self) -> bool:
|
||||
"""Test branching and revision with workflow architecture"""
|
||||
try:
|
||||
self.logger.info(" 1.5: Testing branching and revision with workflow")
|
||||
|
||||
# Start planning session for branching test
|
||||
self.logger.info(" 1.5.1: Start planning for branching test")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Planning mobile app development strategy with different technology options to evaluate.",
|
||||
"step_number": 1,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start branching test")
|
||||
return False
|
||||
|
||||
# Create branch
|
||||
self.logger.info(" 1.5.2: Create branch for React Native approach")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Branch A: React Native approach - cross-platform development with shared codebase, faster development cycle, and consistent UI across platforms.",
|
||||
"step_number": 2,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"is_branch_point": True,
|
||||
"branch_from_step": 1,
|
||||
"branch_id": "kubernetes-istio",
|
||||
"branch_id": "react-native",
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
@@ -257,34 +512,35 @@ class PlannerValidationTest(ConversationBaseTest):
|
||||
if not response2_data:
|
||||
return False
|
||||
|
||||
# Validate branching metadata
|
||||
# Validate branching in workflow
|
||||
metadata = response2_data.get("metadata", {})
|
||||
if not metadata.get("is_branch_point"):
|
||||
self.logger.error("Branch point not properly recorded in metadata")
|
||||
self.logger.error("Branch point not recorded in workflow")
|
||||
return False
|
||||
|
||||
if metadata.get("branch_id") != "kubernetes-istio":
|
||||
if metadata.get("branch_id") != "react-native":
|
||||
self.logger.error("Branch ID not properly recorded")
|
||||
return False
|
||||
|
||||
if "kubernetes-istio" not in metadata.get("branches", []):
|
||||
self.logger.error("Branch not recorded in branches list")
|
||||
if "react-native" not in metadata.get("branches", []):
|
||||
self.logger.error("Branch not added to branches list")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Branching working correctly")
|
||||
self.logger.info(" ✅ Branching working with workflow architecture")
|
||||
|
||||
# Test revision
|
||||
self.logger.info(" 1.3.3: Revise step 2")
|
||||
self.logger.info(" 1.5.3: Test revision capability")
|
||||
response3, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Revision: Actually, let me revise the Kubernetes approach. I'll use a simpler deployment initially, then migrate to Kubernetes later.",
|
||||
"step": "Revision of step 2: After consideration, let me revise the React Native approach to include performance optimizations and native module integration for critical features.",
|
||||
"step_number": 3,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"is_step_revision": True,
|
||||
"revises_step_number": 2,
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
@@ -296,23 +552,87 @@ class PlannerValidationTest(ConversationBaseTest):
|
||||
if not response3_data:
|
||||
return False
|
||||
|
||||
# Validate revision metadata
|
||||
# Validate revision in workflow
|
||||
metadata = response3_data.get("metadata", {})
|
||||
if not metadata.get("is_step_revision"):
|
||||
self.logger.error("Step revision not properly recorded in metadata")
|
||||
self.logger.error("Step revision not recorded in workflow")
|
||||
return False
|
||||
|
||||
if metadata.get("revises_step_number") != 2:
|
||||
self.logger.error("Revised step number not properly recorded")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Revision working correctly")
|
||||
self.logger.info(" ✅ Revision working with workflow architecture")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Branching and revision test failed: {e}")
|
||||
return False
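# Illustrative sketch only: the metadata fields the branching and revision checks
# above look for. Values mirror the request parameters used in this test; the
# variable names are placeholders.
_EXAMPLE_BRANCH_METADATA = {
    "is_branch_point": True,
    "branch_id": "react-native",
    "branches": ["react-native"],
}
_EXAMPLE_REVISION_METADATA = {
    "is_step_revision": True,
    "revises_step_number": 2,
}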
|
||||
|
||||
def _test_workflow_file_context(self) -> bool:
|
||||
"""Test workflow file context behavior (should be minimal for planner)"""
|
||||
try:
|
||||
self.logger.info(" 1.6: Testing workflow file context behavior")
|
||||
|
||||
# Planner typically doesn't use files, but test the workflow handles this correctly
|
||||
self.logger.info(" 1.6.1: Planning step with no files (normal case)")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Planning data architecture for analytics platform.",
|
||||
"step_number": 1,
|
||||
"total_steps": 2,
|
||||
"next_step_required": True,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start workflow file context test")
|
||||
return False
|
||||
|
||||
response1_data = self._parse_planner_response(response1)
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Planner workflow should not have file_context since it doesn't use files
|
||||
if "file_context" in response1_data:
|
||||
self.logger.info(" ℹ️ Workflow file context present but should be minimal for planner")
|
||||
|
||||
# Final step
|
||||
self.logger.info(" 1.6.2: Final step (should complete without file embedding)")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Data architecture plan complete with data lakes, processing pipelines, and analytics layers.",
|
||||
"step_number": 2,
|
||||
"total_steps": 2,
|
||||
"next_step_required": False,
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to complete workflow file context test")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_planner_response(response2)
|
||||
if not response2_data:
|
||||
return False
|
||||
|
||||
# Final step should complete self-contained
|
||||
if response2_data.get("status") != "planner_complete":
|
||||
self.logger.error("Expected self-contained completion for planner workflow")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Workflow file context behavior appropriate for planner")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Workflow file context test failed: {e}")
|
||||
return False
|
||||
|
||||
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
|
||||
"""Call an MCP tool in-process - override for planner-specific response handling"""
|
||||
# Use in-process implementation to maintain conversation memory
|
||||
@@ -329,7 +649,7 @@ class PlannerValidationTest(ConversationBaseTest):
|
||||
def _extract_planner_continuation_id(self, response_text: str) -> Optional[str]:
|
||||
"""Extract continuation_id from planner response"""
|
||||
try:
|
||||
# Parse the response - it's now direct JSON, not wrapped
|
||||
# Parse the response
|
||||
response_data = json.loads(response_text)
|
||||
return response_data.get("continuation_id")
|
||||
|
||||
@@ -340,7 +660,7 @@ class PlannerValidationTest(ConversationBaseTest):
|
||||
def _parse_planner_response(self, response_text: str) -> dict:
|
||||
"""Parse planner tool JSON response"""
|
||||
try:
|
||||
# Parse the response - it's now direct JSON, not wrapped
|
||||
# Parse the response - it should be direct JSON
|
||||
return json.loads(response_text)
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
@@ -356,7 +676,7 @@ class PlannerValidationTest(ConversationBaseTest):
|
||||
expected_next_required: bool,
|
||||
expected_status: str,
|
||||
) -> bool:
|
||||
"""Validate a planning step response structure"""
|
||||
"""Validate a planner step response structure"""
|
||||
try:
|
||||
# Check status
|
||||
if response_data.get("status") != expected_status:
|
||||
@@ -380,16 +700,11 @@ class PlannerValidationTest(ConversationBaseTest):
|
||||
)
|
||||
return False
|
||||
|
||||
# Check that step_content exists
|
||||
# Check step_content exists
|
||||
if not response_data.get("step_content"):
|
||||
self.logger.error("Missing step_content in response")
|
||||
return False
|
||||
|
||||
# Check metadata exists
|
||||
if "metadata" not in response_data:
|
||||
self.logger.error("Missing metadata in response")
|
||||
return False
|
||||
|
||||
# Check next_steps guidance
|
||||
if not response_data.get("next_steps"):
|
||||
self.logger.error("Missing next_steps guidance in response")
|
||||
@@ -400,40 +715,3 @@ class PlannerValidationTest(ConversationBaseTest):
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error validating step response: {e}")
|
||||
return False
|
||||
|
||||
def _validate_final_step_response(self, response_data: dict, expected_step: int, expected_total: int) -> bool:
|
||||
"""Validate a final planning step response"""
|
||||
try:
|
||||
# Basic step validation
|
||||
if not self._validate_step_response(
|
||||
response_data, expected_step, expected_total, False, "planning_success"
|
||||
):
|
||||
return False
|
||||
|
||||
# Check planning_complete flag
|
||||
if not response_data.get("planning_complete"):
|
||||
self.logger.error("Expected planning_complete=true for final step")
|
||||
return False
|
||||
|
||||
# Check plan_summary exists
|
||||
if not response_data.get("plan_summary"):
|
||||
self.logger.error("Missing plan_summary in final step")
|
||||
return False
|
||||
|
||||
# Check plan_summary contains expected content
|
||||
plan_summary = response_data.get("plan_summary", "")
|
||||
if "COMPLETE PLAN:" not in plan_summary:
|
||||
self.logger.error("plan_summary doesn't contain 'COMPLETE PLAN:' marker")
|
||||
return False
|
||||
|
||||
# Check next_steps mentions completion
|
||||
next_steps = response_data.get("next_steps", "")
|
||||
if "complete" not in next_steps.lower():
|
||||
self.logger.error("next_steps doesn't indicate planning completion")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error validating final step response: {e}")
|
||||
return False
simulator_tests/test_planner_validation_old.py (new file, 439 lines)
@@ -0,0 +1,439 @@
#!/usr/bin/env python3
|
||||
"""
|
||||
Planner Tool Validation Test
|
||||
|
||||
Tests the planner tool's sequential planning capabilities including:
|
||||
- Step-by-step planning with proper JSON responses
|
||||
- Continuation logic across planning sessions
|
||||
- Branching and revision capabilities
|
||||
- Previous plan context loading
|
||||
- Plan completion and summary storage
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
from .conversation_base_test import ConversationBaseTest
|
||||
|
||||
|
||||
class PlannerValidationTest(ConversationBaseTest):
|
||||
"""Test planner tool's sequential planning and continuation features"""
|
||||
|
||||
@property
|
||||
def test_name(self) -> str:
|
||||
return "planner_validation"
|
||||
|
||||
@property
|
||||
def test_description(self) -> str:
|
||||
return "Planner tool sequential planning and continuation validation"
|
||||
|
||||
def run_test(self) -> bool:
|
||||
"""Test planner tool sequential planning capabilities"""
|
||||
# Set up the test environment
|
||||
self.setUp()
|
||||
|
||||
try:
|
||||
self.logger.info("Test: Planner tool validation")
|
||||
|
||||
# Test 1: Single planning session with multiple steps
|
||||
if not self._test_single_planning_session():
|
||||
return False
|
||||
|
||||
# Test 2: Plan completion and continuation to new planning session
|
||||
if not self._test_plan_continuation():
|
||||
return False
|
||||
|
||||
# Test 3: Branching and revision capabilities
|
||||
if not self._test_branching_and_revision():
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ All planner validation tests passed")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Planner validation test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_single_planning_session(self) -> bool:
|
||||
"""Test a complete planning session with multiple steps"""
|
||||
try:
|
||||
self.logger.info(" 1.1: Testing single planning session")
|
||||
|
||||
# Step 1: Start planning
|
||||
self.logger.info(" 1.1.1: Step 1 - Initial planning step")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "I need to plan a microservices migration for our monolithic e-commerce platform. Let me start by understanding the current architecture and identifying the key business domains.",
|
||||
"step_number": 1,
|
||||
"total_steps": 5,
|
||||
"next_step_required": True,
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to get initial planning response")
|
||||
return False
|
||||
|
||||
# Parse and validate JSON response
|
||||
response1_data = self._parse_planner_response(response1)
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Validate step 1 response structure
|
||||
if not self._validate_step_response(response1_data, 1, 5, True, "planning_success"):
|
||||
return False
|
||||
|
||||
self.logger.info(f" ✅ Step 1 successful, continuation_id: {continuation_id}")
|
||||
|
||||
# Step 2: Continue planning
|
||||
self.logger.info(" 1.1.2: Step 2 - Domain identification")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Based on my analysis, I can identify the main business domains: User Management, Product Catalog, Order Processing, Payment, and Inventory. Let me plan how to extract these into separate services.",
|
||||
"step_number": 2,
|
||||
"total_steps": 5,
|
||||
"next_step_required": True,
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue planning to step 2")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_planner_response(response2)
|
||||
if not self._validate_step_response(response2_data, 2, 5, True, "planning_success"):
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 2 successful")
|
||||
|
||||
# Step 3: Final step
|
||||
self.logger.info(" 1.1.3: Step 3 - Final planning step")
|
||||
response3, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Now I'll create a phased migration strategy: Phase 1 - Extract User Management, Phase 2 - Product Catalog and Inventory, Phase 3 - Order Processing and Payment services. This completes the initial migration plan.",
|
||||
"step_number": 3,
|
||||
"total_steps": 3, # Adjusted total
|
||||
"next_step_required": False, # Final step
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
if not response3:
|
||||
self.logger.error("Failed to complete planning session")
|
||||
return False
|
||||
|
||||
response3_data = self._parse_planner_response(response3)
|
||||
if not self._validate_final_step_response(response3_data, 3, 3):
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Planning session completed successfully")
|
||||
|
||||
# Store continuation_id for next test
|
||||
self.migration_continuation_id = continuation_id
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Single planning session test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_plan_continuation(self) -> bool:
|
||||
"""Test continuing from a previous completed plan"""
|
||||
try:
|
||||
self.logger.info(" 1.2: Testing plan continuation with previous context")
|
||||
|
||||
# Start a new planning session using the continuation_id from previous completed plan
|
||||
self.logger.info(" 1.2.1: New planning session with previous plan context")
|
||||
response1, new_continuation_id = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Now that I have the microservices migration plan, let me plan the database strategy. I need to decide how to handle data consistency across the new services.",
|
||||
"step_number": 1, # New planning session starts at step 1
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"continuation_id": self.migration_continuation_id, # Use previous plan's continuation_id
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not new_continuation_id:
|
||||
self.logger.error("Failed to start new planning session with context")
|
||||
return False
|
||||
|
||||
response1_data = self._parse_planner_response(response1)
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Should have previous plan context
|
||||
if "previous_plan_context" not in response1_data:
|
||||
self.logger.error("Expected previous_plan_context in new planning session")
|
||||
return False
|
||||
|
||||
# Check for key terms from the previous plan
|
||||
context = response1_data["previous_plan_context"].lower()
|
||||
if "migration" not in context and "plan" not in context:
|
||||
self.logger.error("Previous plan context doesn't contain expected content")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ New planning session loaded previous plan context")
|
||||
|
||||
# Continue the new planning session (step 2+ should NOT load context)
|
||||
self.logger.info(" 1.2.2: Continue new planning session (no context loading)")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "I'll implement a database-per-service pattern with eventual consistency using event sourcing for cross-service communication.",
|
||||
"step_number": 2,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"continuation_id": new_continuation_id, # Same continuation, step 2
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue new planning session")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_planner_response(response2)
|
||||
if not response2_data:
|
||||
return False
|
||||
|
||||
# Step 2+ should NOT have previous_plan_context (only step 1 with continuation_id gets context)
|
||||
if "previous_plan_context" in response2_data:
|
||||
self.logger.error("Step 2 should NOT have previous_plan_context")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 2 correctly has no previous context (as expected)")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Plan continuation test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_branching_and_revision(self) -> bool:
|
||||
"""Test branching and revision capabilities"""
|
||||
try:
|
||||
self.logger.info(" 1.3: Testing branching and revision capabilities")
|
||||
|
||||
# Start a new planning session for testing branching
|
||||
self.logger.info(" 1.3.1: Start planning session for branching test")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Let me plan the deployment strategy for the microservices. I'll consider different deployment options.",
|
||||
"step_number": 1,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start branching test planning session")
|
||||
return False
|
||||
|
||||
# Test branching
|
||||
self.logger.info(" 1.3.2: Create a branch from step 1")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Branch A: I'll explore Kubernetes deployment with service mesh (Istio) for advanced traffic management and observability.",
|
||||
"step_number": 2,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"is_branch_point": True,
|
||||
"branch_from_step": 1,
|
||||
"branch_id": "kubernetes-istio",
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to create branch")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_planner_response(response2)
|
||||
if not response2_data:
|
||||
return False
|
||||
|
||||
# Validate branching metadata
|
||||
metadata = response2_data.get("metadata", {})
|
||||
if not metadata.get("is_branch_point"):
|
||||
self.logger.error("Branch point not properly recorded in metadata")
|
||||
return False
|
||||
|
||||
if metadata.get("branch_id") != "kubernetes-istio":
|
||||
self.logger.error("Branch ID not properly recorded")
|
||||
return False
|
||||
|
||||
if "kubernetes-istio" not in metadata.get("branches", []):
|
||||
self.logger.error("Branch not recorded in branches list")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Branching working correctly")
|
||||
|
||||
# Test revision
|
||||
self.logger.info(" 1.3.3: Revise step 2")
|
||||
response3, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Revision: Actually, let me revise the Kubernetes approach. I'll use a simpler deployment initially, then migrate to Kubernetes later.",
|
||||
"step_number": 3,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"is_step_revision": True,
|
||||
"revises_step_number": 2,
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
if not response3:
|
||||
self.logger.error("Failed to create revision")
|
||||
return False
|
||||
|
||||
response3_data = self._parse_planner_response(response3)
|
||||
if not response3_data:
|
||||
return False
|
||||
|
||||
# Validate revision metadata
|
||||
metadata = response3_data.get("metadata", {})
|
||||
if not metadata.get("is_step_revision"):
|
||||
self.logger.error("Step revision not properly recorded in metadata")
|
||||
return False
|
||||
|
||||
if metadata.get("revises_step_number") != 2:
|
||||
self.logger.error("Revised step number not properly recorded")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Revision working correctly")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Branching and revision test failed: {e}")
|
||||
return False
|
||||
|
||||
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
|
||||
"""Call an MCP tool in-process - override for planner-specific response handling"""
|
||||
# Use in-process implementation to maintain conversation memory
|
||||
response_text, _ = self.call_mcp_tool_direct(tool_name, params)
|
||||
|
||||
if not response_text:
|
||||
return None, None
|
||||
|
||||
# Extract continuation_id from planner response specifically
|
||||
continuation_id = self._extract_planner_continuation_id(response_text)
|
||||
|
||||
return response_text, continuation_id
|
||||
|
||||
def _extract_planner_continuation_id(self, response_text: str) -> Optional[str]:
|
||||
"""Extract continuation_id from planner response"""
|
||||
try:
|
||||
# Parse the response - it's now direct JSON, not wrapped
|
||||
response_data = json.loads(response_text)
|
||||
return response_data.get("continuation_id")
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
self.logger.debug(f"Failed to parse response for planner continuation_id: {e}")
|
||||
return None
|
||||
|
||||
def _parse_planner_response(self, response_text: str) -> dict:
|
||||
"""Parse planner tool JSON response"""
|
||||
try:
|
||||
# Parse the response - it's now direct JSON, not wrapped
|
||||
return json.loads(response_text)
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
self.logger.error(f"Failed to parse planner response as JSON: {e}")
|
||||
self.logger.error(f"Response text: {response_text[:500]}...")
|
||||
return {}
|
||||
|
||||
def _validate_step_response(
|
||||
self,
|
||||
response_data: dict,
|
||||
expected_step: int,
|
||||
expected_total: int,
|
||||
expected_next_required: bool,
|
||||
expected_status: str,
|
||||
) -> bool:
|
||||
"""Validate a planning step response structure"""
|
||||
try:
|
||||
# Check status
|
||||
if response_data.get("status") != expected_status:
|
||||
self.logger.error(f"Expected status '{expected_status}', got '{response_data.get('status')}'")
|
||||
return False
|
||||
|
||||
# Check step number
|
||||
if response_data.get("step_number") != expected_step:
|
||||
self.logger.error(f"Expected step_number {expected_step}, got {response_data.get('step_number')}")
|
||||
return False
|
||||
|
||||
# Check total steps
|
||||
if response_data.get("total_steps") != expected_total:
|
||||
self.logger.error(f"Expected total_steps {expected_total}, got {response_data.get('total_steps')}")
|
||||
return False
|
||||
|
||||
# Check next_step_required
|
||||
if response_data.get("next_step_required") != expected_next_required:
|
||||
self.logger.error(
|
||||
f"Expected next_step_required {expected_next_required}, got {response_data.get('next_step_required')}"
|
||||
)
|
||||
return False
|
||||
|
||||
# Check that step_content exists
|
||||
if not response_data.get("step_content"):
|
||||
self.logger.error("Missing step_content in response")
|
||||
return False
|
||||
|
||||
# Check metadata exists
|
||||
if "metadata" not in response_data:
|
||||
self.logger.error("Missing metadata in response")
|
||||
return False
|
||||
|
||||
# Check next_steps guidance
|
||||
if not response_data.get("next_steps"):
|
||||
self.logger.error("Missing next_steps guidance in response")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error validating step response: {e}")
|
||||
return False
|
||||
|
||||
def _validate_final_step_response(self, response_data: dict, expected_step: int, expected_total: int) -> bool:
|
||||
"""Validate a final planning step response"""
|
||||
try:
|
||||
# Basic step validation
|
||||
if not self._validate_step_response(
|
||||
response_data, expected_step, expected_total, False, "planning_success"
|
||||
):
|
||||
return False
|
||||
|
||||
# Check planning_complete flag
|
||||
if not response_data.get("planning_complete"):
|
||||
self.logger.error("Expected planning_complete=true for final step")
|
||||
return False
|
||||
|
||||
# Check plan_summary exists
|
||||
if not response_data.get("plan_summary"):
|
||||
self.logger.error("Missing plan_summary in final step")
|
||||
return False
|
||||
|
||||
# Check plan_summary contains expected content
|
||||
plan_summary = response_data.get("plan_summary", "")
|
||||
if "COMPLETE PLAN:" not in plan_summary:
|
||||
self.logger.error("plan_summary doesn't contain 'COMPLETE PLAN:' marker")
|
||||
return False
|
||||
|
||||
# Check next_steps mentions completion
|
||||
next_steps = response_data.get("next_steps", "")
|
||||
if "complete" not in next_steps.lower():
|
||||
self.logger.error("next_steps doesn't indicate planning completion")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error validating final step response: {e}")
|
||||
return False
simulator_tests/test_precommitworkflow_validation.py (new file, 1081 lines)
File diff suppressed because it is too large
@@ -2,18 +2,19 @@
|
||||
"""
|
||||
TestGen Tool Validation Test
|
||||
|
||||
Tests the testgen tool by:
|
||||
- Creating a test code file with a specific function
|
||||
- Using testgen to generate tests with a specific function name
|
||||
- Validating that the output contains the expected test function
|
||||
- Confirming the format matches test generation patterns
|
||||
Tests the testgen tool's capabilities using the workflow architecture.
|
||||
This validates that the workflow-based implementation guides Claude through
|
||||
systematic test generation analysis before creating comprehensive test suites.
|
||||
"""
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
from .conversation_base_test import ConversationBaseTest
|
||||
|
||||
|
||||
class TestGenValidationTest(BaseSimulatorTest):
|
||||
"""Test testgen tool validation with specific function name"""
|
||||
class TestGenValidationTest(ConversationBaseTest):
|
||||
"""Test testgen tool with workflow architecture"""
|
||||
|
||||
@property
|
||||
def test_name(self) -> str:
|
||||
@@ -21,111 +22,812 @@ class TestGenValidationTest(BaseSimulatorTest):
|
||||
|
||||
@property
|
||||
def test_description(self) -> str:
|
||||
return "TestGen tool validation with specific test function"
|
||||
return "TestGen tool validation with step-by-step test planning"
|
||||
|
||||
def run_test(self) -> bool:
|
||||
"""Test testgen tool with specific function name validation"""
|
||||
"""Test testgen tool capabilities"""
|
||||
# Set up the test environment
|
||||
self.setUp()
|
||||
|
||||
try:
|
||||
self.logger.info("Test: TestGen tool validation")
|
||||
|
||||
# Setup test files
|
||||
self.setup_test_files()
|
||||
# Create sample code files to test
|
||||
self._create_test_code_files()
|
||||
|
||||
# Create a specific code file for test generation
|
||||
test_code_content = '''"""
|
||||
Sample authentication module for testing testgen
|
||||
"""
|
||||
|
||||
class UserAuthenticator:
|
||||
"""Handles user authentication logic"""
|
||||
|
||||
def __init__(self):
|
||||
self.failed_attempts = {}
|
||||
self.max_attempts = 3
|
||||
|
||||
def validate_password(self, username, password):
|
||||
"""Validate user password with security checks"""
|
||||
if not username or not password:
|
||||
return False
|
||||
|
||||
if username in self.failed_attempts:
|
||||
if self.failed_attempts[username] >= self.max_attempts:
|
||||
return False # Account locked
|
||||
|
||||
# Simple validation for demo
|
||||
if len(password) < 8:
|
||||
self._record_failed_attempt(username)
|
||||
return False
|
||||
|
||||
if password == "password123": # Demo valid password
|
||||
self._reset_failed_attempts(username)
|
||||
return True
|
||||
|
||||
self._record_failed_attempt(username)
|
||||
return False
|
||||
|
||||
def _record_failed_attempt(self, username):
|
||||
"""Record a failed login attempt"""
|
||||
self.failed_attempts[username] = self.failed_attempts.get(username, 0) + 1
|
||||
|
||||
def _reset_failed_attempts(self, username):
|
||||
"""Reset failed attempts after successful login"""
|
||||
if username in self.failed_attempts:
|
||||
del self.failed_attempts[username]
|
||||
'''
|
||||
|
||||
# Create the auth code file
|
||||
auth_file = self.create_additional_test_file("user_auth.py", test_code_content)
|
||||
|
||||
# Test testgen tool with specific requirements
|
||||
self.logger.info(" 1.1: Generate tests with specific function name")
|
||||
response, continuation_id = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"files": [auth_file],
|
||||
"prompt": "Generate comprehensive tests for the UserAuthenticator.validate_password method. Include tests for edge cases, security scenarios, and account locking. Use the specific test function name 'test_password_validation_edge_cases' for one of the test methods.",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response:
|
||||
self.logger.error("Failed to get testgen response")
|
||||
# Test 1: Single investigation session with multiple steps
|
||||
if not self._test_single_test_generation_session():
|
||||
return False
|
||||
|
||||
self.logger.info(" 1.2: Validate response contains expected test function")
|
||||
|
||||
# Check that the response contains the specific test function name
|
||||
if "test_password_validation_edge_cases" not in response:
|
||||
self.logger.error("Response does not contain the requested test function name")
|
||||
self.logger.debug(f"Response content: {response[:500]}...")
|
||||
# Test 2: Test generation with pattern following
|
||||
if not self._test_generation_with_pattern_following():
|
||||
return False
|
||||
|
||||
# Check for common test patterns
|
||||
test_patterns = [
|
||||
"def test_", # Test function definition
|
||||
"assert", # Assertion statements
|
||||
"UserAuthenticator", # Class being tested
|
||||
"validate_password", # Method being tested
|
||||
]
|
||||
|
||||
missing_patterns = []
|
||||
for pattern in test_patterns:
|
||||
if pattern not in response:
|
||||
missing_patterns.append(pattern)
|
||||
|
||||
if missing_patterns:
|
||||
self.logger.error(f"Response missing expected test patterns: {missing_patterns}")
|
||||
self.logger.debug(f"Response content: {response[:500]}...")
|
||||
# Test 3: Complete test generation with expert analysis
|
||||
if not self._test_complete_generation_with_analysis():
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ TestGen tool validation successful")
|
||||
self.logger.info(" ✅ Generated tests contain expected function name")
|
||||
self.logger.info(" ✅ Generated tests follow proper test patterns")
|
||||
# Test 4: Certain confidence behavior
|
||||
if not self._test_certain_confidence():
|
||||
return False
|
||||
|
||||
# Test 5: Context-aware file embedding
|
||||
if not self._test_context_aware_file_embedding():
|
||||
return False
|
||||
|
||||
# Test 6: Multi-step test planning
|
||||
if not self._test_multi_step_test_planning():
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ All testgen validation tests passed")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"TestGen validation test failed: {e}")
|
||||
return False
|
||||
finally:
|
||||
self.cleanup_test_files()
|
||||
|
||||
def _create_test_code_files(self):
|
||||
"""Create sample code files for test generation"""
|
||||
# Create a calculator module with various functions
|
||||
calculator_code = """#!/usr/bin/env python3
|
||||
\"\"\"
|
||||
Simple calculator module for demonstration
|
||||
\"\"\"
|
||||
|
||||
def add(a, b):
|
||||
\"\"\"Add two numbers\"\"\"
|
||||
return a + b
|
||||
|
||||
def subtract(a, b):
|
||||
\"\"\"Subtract b from a\"\"\"
|
||||
return a - b
|
||||
|
||||
def multiply(a, b):
|
||||
\"\"\"Multiply two numbers\"\"\"
|
||||
return a * b
|
||||
|
||||
def divide(a, b):
|
||||
\"\"\"Divide a by b\"\"\"
|
||||
if b == 0:
|
||||
raise ValueError("Cannot divide by zero")
|
||||
return a / b
|
||||
|
||||
def calculate_percentage(value, percentage):
|
||||
\"\"\"Calculate percentage of a value\"\"\"
|
||||
if percentage < 0:
|
||||
raise ValueError("Percentage cannot be negative")
|
||||
if percentage > 100:
|
||||
raise ValueError("Percentage cannot exceed 100")
|
||||
return (value * percentage) / 100
|
||||
|
||||
def power(base, exponent):
|
||||
\"\"\"Calculate base raised to exponent\"\"\"
|
||||
if base == 0 and exponent < 0:
|
||||
raise ValueError("Cannot raise 0 to negative power")
|
||||
return base ** exponent
|
||||
"""
|
||||
|
||||
# Create test file
|
||||
self.calculator_file = self.create_additional_test_file("calculator.py", calculator_code)
|
||||
self.logger.info(f" ✅ Created calculator module: {self.calculator_file}")
|
||||
|
||||
# Create a simple existing test file to use as pattern
|
||||
existing_test = """#!/usr/bin/env python3
|
||||
import pytest
|
||||
from calculator import add, subtract
|
||||
|
||||
class TestCalculatorBasic:
|
||||
\"\"\"Test basic calculator operations\"\"\"
|
||||
|
||||
def test_add_positive_numbers(self):
|
||||
\"\"\"Test adding two positive numbers\"\"\"
|
||||
assert add(2, 3) == 5
|
||||
assert add(10, 20) == 30
|
||||
|
||||
def test_add_negative_numbers(self):
|
||||
\"\"\"Test adding negative numbers\"\"\"
|
||||
assert add(-5, -3) == -8
|
||||
assert add(-10, 5) == -5
|
||||
|
||||
def test_subtract_positive(self):
|
||||
\"\"\"Test subtracting positive numbers\"\"\"
|
||||
assert subtract(10, 3) == 7
|
||||
assert subtract(5, 5) == 0
|
||||
"""
|
||||
|
||||
self.existing_test_file = self.create_additional_test_file("test_calculator_basic.py", existing_test)
|
||||
self.logger.info(f" ✅ Created existing test file: {self.existing_test_file}")
|
||||
|
||||
def _test_single_test_generation_session(self) -> bool:
|
||||
"""Test a complete test generation session with multiple steps"""
|
||||
try:
|
||||
self.logger.info(" 1.1: Testing single test generation session")
|
||||
|
||||
# Step 1: Start investigation
|
||||
self.logger.info(" 1.1.1: Step 1 - Initial test planning")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "I need to generate comprehensive tests for the calculator module. Let me start by analyzing the code structure and understanding the functionality.",
|
||||
"step_number": 1,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Calculator module contains 6 functions: add, subtract, multiply, divide, calculate_percentage, and power. Each has specific error conditions that need testing.",
|
||||
"files_checked": [self.calculator_file],
|
||||
"relevant_files": [self.calculator_file],
|
||||
"relevant_context": ["add", "subtract", "multiply", "divide", "calculate_percentage", "power"],
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to get initial test planning response")
|
||||
return False
|
||||
|
||||
# Parse and validate JSON response
|
||||
response1_data = self._parse_testgen_response(response1)
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Validate step 1 response structure
|
||||
if not self._validate_step_response(response1_data, 1, 4, True, "pause_for_test_analysis"):
|
||||
return False
|
||||
|
||||
self.logger.info(f" ✅ Step 1 successful, continuation_id: {continuation_id}")
|
||||
|
||||
# Step 2: Analyze test requirements
|
||||
self.logger.info(" 1.1.2: Step 2 - Test requirements analysis")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "Now analyzing the test requirements for each function, identifying edge cases and boundary conditions.",
|
||||
"step_number": 2,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Identified key test scenarios: (1) divide - zero division error, (2) calculate_percentage - negative/over 100 validation, (3) power - zero to negative power error. Need tests for normal cases and edge cases.",
|
||||
"files_checked": [self.calculator_file],
|
||||
"relevant_files": [self.calculator_file],
|
||||
"relevant_context": ["divide", "calculate_percentage", "power"],
|
||||
"confidence": "medium",
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue test planning to step 2")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_testgen_response(response2)
|
||||
if not self._validate_step_response(response2_data, 2, 4, True, "pause_for_test_analysis"):
|
||||
return False
|
||||
|
||||
# Check test generation status tracking
|
||||
test_status = response2_data.get("test_generation_status", {})
|
||||
if test_status.get("test_scenarios_identified", 0) < 3:
|
||||
self.logger.error("Test scenarios not properly tracked")
|
||||
return False
|
||||
|
||||
if test_status.get("analysis_confidence") != "medium":
|
||||
self.logger.error("Confidence level not properly tracked")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 2 successful with proper tracking")
|
||||
|
||||
# Store continuation_id for next test
|
||||
self.test_continuation_id = continuation_id
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Single test generation session test failed: {e}")
|
||||
return False
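# Illustrative sketch only: the step-2 tracking fields asserted above. Field names
# come from the checks; the variable name and counts are placeholder examples.
_EXAMPLE_TESTGEN_STEP2 = {
    "status": "pause_for_test_analysis",
    "test_generation_status": {
        "test_scenarios_identified": 3,  # must be >= 3
        "analysis_confidence": "medium",
    },
}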
|
||||
|
||||
def _test_generation_with_pattern_following(self) -> bool:
|
||||
"""Test test generation following existing patterns"""
|
||||
try:
|
||||
self.logger.info(" 1.2: Testing test generation with pattern following")
|
||||
|
||||
# Start a new investigation with existing test patterns
|
||||
self.logger.info(" 1.2.1: Start test generation with pattern reference")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "Generating tests for remaining calculator functions following existing test patterns",
|
||||
"step_number": 1,
|
||||
"total_steps": 3,
|
||||
"next_step_required": True,
|
||||
"findings": "Found existing test pattern using pytest with class-based organization and descriptive test names",
|
||||
"files_checked": [self.calculator_file, self.existing_test_file],
|
||||
"relevant_files": [self.calculator_file, self.existing_test_file],
|
||||
"relevant_context": ["TestCalculatorBasic", "multiply", "divide", "calculate_percentage", "power"],
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start pattern following test")
|
||||
return False
|
||||
|
||||
# Step 2: Analyze patterns
|
||||
self.logger.info(" 1.2.2: Step 2 - Pattern analysis")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "Analyzing the existing test patterns to maintain consistency",
|
||||
"step_number": 2,
|
||||
"total_steps": 3,
|
||||
"next_step_required": True,
|
||||
"findings": "Existing tests use: class-based organization (TestCalculatorBasic), descriptive method names (test_operation_scenario), multiple assertions per test, pytest framework",
|
||||
"files_checked": [self.existing_test_file],
|
||||
"relevant_files": [self.calculator_file, self.existing_test_file],
|
||||
"confidence": "high",
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue to step 2")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Pattern analysis successful")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Pattern following test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_complete_generation_with_analysis(self) -> bool:
|
||||
"""Test complete test generation ending with expert analysis"""
|
||||
try:
|
||||
self.logger.info(" 1.3: Testing complete test generation with expert analysis")
|
||||
|
||||
# Use the continuation from first test or start fresh
|
||||
continuation_id = getattr(self, "test_continuation_id", None)
|
||||
if not continuation_id:
|
||||
# Start fresh if no continuation available
|
||||
self.logger.info(" 1.3.0: Starting fresh test generation")
|
||||
response0, continuation_id = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "Analyzing calculator module for comprehensive test generation",
|
||||
"step_number": 1,
|
||||
"total_steps": 2,
|
||||
"next_step_required": True,
|
||||
"findings": "Identified 6 functions needing tests with various edge cases",
|
||||
"files_checked": [self.calculator_file],
|
||||
"relevant_files": [self.calculator_file],
|
||||
"relevant_context": ["add", "subtract", "multiply", "divide", "calculate_percentage", "power"],
|
||||
},
|
||||
)
|
||||
if not response0 or not continuation_id:
|
||||
self.logger.error("Failed to start fresh test generation")
|
||||
return False
|
||||
|
||||
# Final step - trigger expert analysis
|
||||
self.logger.info(" 1.3.1: Final step - complete test planning")
|
||||
response_final, _ = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "Test planning complete. Identified all test scenarios including edge cases, error conditions, and boundary values for comprehensive coverage.",
|
||||
"step_number": 2,
|
||||
"total_steps": 2,
|
||||
"next_step_required": False, # Final step - triggers expert analysis
|
||||
"findings": "Complete test plan: normal operations, edge cases (zero, negative), error conditions (divide by zero, invalid percentage, zero to negative power), boundary values",
|
||||
"files_checked": [self.calculator_file],
|
||||
"relevant_files": [self.calculator_file],
|
||||
"relevant_context": ["add", "subtract", "multiply", "divide", "calculate_percentage", "power"],
|
||||
"confidence": "high",
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash", # Use flash for expert analysis
|
||||
},
|
||||
)
|
||||
|
||||
if not response_final:
|
||||
self.logger.error("Failed to complete test generation")
|
||||
return False
|
||||
|
||||
response_final_data = self._parse_testgen_response(response_final)
|
||||
if not response_final_data:
|
||||
return False
|
||||
|
||||
# Validate final response structure
|
||||
if response_final_data.get("status") != "calling_expert_analysis":
|
||||
self.logger.error(
|
||||
f"Expected status 'calling_expert_analysis', got '{response_final_data.get('status')}'"
|
||||
)
|
||||
return False
|
||||
|
||||
if not response_final_data.get("test_generation_complete"):
|
||||
self.logger.error("Expected test_generation_complete=true for final step")
|
||||
return False
|
||||
|
||||
# Check for expert analysis
|
||||
if "expert_analysis" not in response_final_data:
|
||||
self.logger.error("Missing expert_analysis in final response")
|
||||
return False
|
||||
|
||||
expert_analysis = response_final_data.get("expert_analysis", {})
|
||||
|
||||
# Check for expected analysis content
|
||||
analysis_text = json.dumps(expert_analysis).lower()
|
||||
|
||||
# Look for test generation indicators
|
||||
test_indicators = ["test", "edge", "boundary", "error", "coverage", "pytest"]
|
||||
found_indicators = sum(1 for indicator in test_indicators if indicator in analysis_text)
|
||||
|
||||
if found_indicators >= 4:
|
||||
self.logger.info(" ✅ Expert analysis provided comprehensive test suggestions")
|
||||
else:
|
||||
self.logger.warning(
|
||||
f" ⚠️ Expert analysis may not have fully addressed test generation (found {found_indicators}/6 indicators)"
|
||||
)
|
||||
|
||||
# Check complete test generation summary
|
||||
if "complete_test_generation" not in response_final_data:
|
||||
self.logger.error("Missing complete_test_generation in final response")
|
||||
return False
|
||||
|
||||
complete_generation = response_final_data["complete_test_generation"]
|
||||
if not complete_generation.get("relevant_context"):
|
||||
self.logger.error("Missing relevant context in complete test generation")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Complete test generation with expert analysis successful")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Complete test generation test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_certain_confidence(self) -> bool:
|
||||
"""Test certain confidence behavior - should skip expert analysis"""
|
||||
try:
|
||||
self.logger.info(" 1.4: Testing certain confidence behavior")
|
||||
|
||||
# Test certain confidence - should skip expert analysis
|
||||
self.logger.info(" 1.4.1: Certain confidence test generation")
|
||||
response_certain, _ = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "I have fully analyzed the code and identified all test scenarios with 100% certainty. Test plan is complete.",
|
||||
"step_number": 1,
|
||||
"total_steps": 1,
|
||||
"next_step_required": False, # Final step
|
||||
"findings": "Complete test coverage plan: all functions covered with normal cases, edge cases, and error conditions. Ready for implementation.",
|
||||
"files_checked": [self.calculator_file],
|
||||
"relevant_files": [self.calculator_file],
|
||||
"relevant_context": ["add", "subtract", "multiply", "divide", "calculate_percentage", "power"],
|
||||
"confidence": "certain", # This should skip expert analysis
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response_certain:
|
||||
self.logger.error("Failed to test certain confidence")
|
||||
return False
|
||||
|
||||
response_certain_data = self._parse_testgen_response(response_certain)
|
||||
if not response_certain_data:
|
||||
return False
|
||||
|
||||
# Validate certain confidence response - should skip expert analysis
|
||||
if response_certain_data.get("status") != "test_generation_complete_ready_for_implementation":
|
||||
self.logger.error(
|
||||
f"Expected status 'test_generation_complete_ready_for_implementation', got '{response_certain_data.get('status')}'"
|
||||
)
|
||||
return False
|
||||
|
||||
if not response_certain_data.get("skip_expert_analysis"):
|
||||
self.logger.error("Expected skip_expert_analysis=true for certain confidence")
|
||||
return False
|
||||
|
||||
expert_analysis = response_certain_data.get("expert_analysis", {})
|
||||
if expert_analysis.get("status") != "skipped_due_to_certain_test_confidence":
|
||||
self.logger.error("Expert analysis should be skipped for certain confidence")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Certain confidence behavior working correctly")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Certain confidence test failed: {e}")
|
||||
return False
|
||||
|
||||
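The confidence handling exercised above reduces to a small decision rule. The sketch below is illustrative only: it restates the statuses asserted by these tests (a final step with confidence "certain" skips the external model, any other final step calls expert analysis, intermediate steps are not terminal) and is not part of the tool implementation.

from typing import Optional  # already imported at the top of this module


def expected_testgen_final_status(confidence: str, next_step_required: bool) -> Optional[str]:
    """Sketch of the terminal-status contract these tests assert (illustrative only)."""
    if next_step_required:
        # Intermediate steps are not terminal; the tool pauses for further investigation
        return None
    if confidence == "certain":
        # Certain confidence skips the external model entirely
        return "test_generation_complete_ready_for_implementation"
    # Any other final step hands the findings to the expert model
    return "calling_expert_analysis"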
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
|
||||
"""Call an MCP tool in-process - override for testgen-specific response handling"""
|
||||
# Use in-process implementation to maintain conversation memory
|
||||
response_text, _ = self.call_mcp_tool_direct(tool_name, params)
|
||||
|
||||
if not response_text:
|
||||
return None, None
|
||||
|
||||
# Extract continuation_id from testgen response specifically
|
||||
continuation_id = self._extract_testgen_continuation_id(response_text)
|
||||
|
||||
return response_text, continuation_id
|
||||
|
||||
def _extract_testgen_continuation_id(self, response_text: str) -> Optional[str]:
|
||||
"""Extract continuation_id from testgen response"""
|
||||
try:
|
||||
# Parse the response
|
||||
response_data = json.loads(response_text)
|
||||
return response_data.get("continuation_id")
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
self.logger.debug(f"Failed to parse response for testgen continuation_id: {e}")
|
||||
return None
|
||||
|
||||
def _parse_testgen_response(self, response_text: str) -> dict:
|
||||
"""Parse testgen tool JSON response"""
|
||||
try:
|
||||
# Parse the response - it should be direct JSON
|
||||
return json.loads(response_text)
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
self.logger.error(f"Failed to parse testgen response as JSON: {e}")
|
||||
self.logger.error(f"Response text: {response_text[:500]}...")
|
||||
return {}
|
||||
|
||||
def _validate_step_response(
|
||||
self,
|
||||
response_data: dict,
|
||||
expected_step: int,
|
||||
expected_total: int,
|
||||
expected_next_required: bool,
|
||||
expected_status: str,
|
||||
) -> bool:
|
||||
"""Validate a test generation step response structure"""
|
||||
try:
|
||||
# Check status
|
||||
if response_data.get("status") != expected_status:
|
||||
self.logger.error(f"Expected status '{expected_status}', got '{response_data.get('status')}'")
|
||||
return False
|
||||
|
||||
# Check step number
|
||||
if response_data.get("step_number") != expected_step:
|
||||
self.logger.error(f"Expected step_number {expected_step}, got {response_data.get('step_number')}")
|
||||
return False
|
||||
|
||||
# Check total steps
|
||||
if response_data.get("total_steps") != expected_total:
|
||||
self.logger.error(f"Expected total_steps {expected_total}, got {response_data.get('total_steps')}")
|
||||
return False
|
||||
|
||||
# Check next_step_required
|
||||
if response_data.get("next_step_required") != expected_next_required:
|
||||
self.logger.error(
|
||||
f"Expected next_step_required {expected_next_required}, got {response_data.get('next_step_required')}"
|
||||
)
|
||||
return False
|
||||
|
||||
# Check test_generation_status exists
|
||||
if "test_generation_status" not in response_data:
|
||||
self.logger.error("Missing test_generation_status in response")
|
||||
return False
|
||||
|
||||
# Check next_steps guidance
|
||||
if not response_data.get("next_steps"):
|
||||
self.logger.error("Missing next_steps guidance in response")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error validating step response: {e}")
|
||||
return False
|
||||
|
||||
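For reference, a response that satisfies _validate_step_response has roughly the shape sketched below. Only the field names and checks mirror the code above; the concrete values are placeholders.

# Illustrative only: a minimal response dict that would pass _validate_step_response
# when its fields match the expected_step, expected_total, expected_next_required and
# expected_status arguments supplied by the caller.
example_step_response = {
    "status": "<expected_status passed by the caller>",
    "step_number": 1,  # must equal expected_step
    "total_steps": 2,  # must equal expected_total
    "next_step_required": True,  # must equal expected_next_required
    "test_generation_status": {},  # only presence is checked
    "next_steps": "Continue the investigation in step 2",  # must be non-empty
}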
def _test_context_aware_file_embedding(self) -> bool:
|
||||
"""Test context-aware file embedding optimization"""
|
||||
try:
|
||||
self.logger.info(" 1.5: Testing context-aware file embedding")
|
||||
|
||||
# Create additional test files
|
||||
utils_code = """#!/usr/bin/env python3
|
||||
def validate_number(n):
|
||||
\"\"\"Validate if input is a number\"\"\"
|
||||
return isinstance(n, (int, float))
|
||||
|
||||
def format_result(result):
|
||||
\"\"\"Format calculation result\"\"\"
|
||||
if isinstance(result, float):
|
||||
return round(result, 2)
|
||||
return result
|
||||
"""
|
||||
|
||||
math_helpers_code = """#!/usr/bin/env python3
|
||||
import math
|
||||
|
||||
def factorial(n):
|
||||
\"\"\"Calculate factorial of n\"\"\"
|
||||
if n < 0:
|
||||
raise ValueError("Factorial not defined for negative numbers")
|
||||
return math.factorial(n)
|
||||
|
||||
def is_prime(n):
|
||||
\"\"\"Check if number is prime\"\"\"
|
||||
if n < 2:
|
||||
return False
|
||||
for i in range(2, int(n**0.5) + 1):
|
||||
if n % i == 0:
|
||||
return False
|
||||
return True
|
||||
"""
|
||||
|
||||
# Create test files
|
||||
utils_file = self.create_additional_test_file("utils.py", utils_code)
|
||||
math_file = self.create_additional_test_file("math_helpers.py", math_helpers_code)
|
||||
|
||||
# Test 1: New conversation, intermediate step - should only reference files
|
||||
self.logger.info(" 1.5.1: New conversation intermediate step (should reference only)")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "Starting test generation for utility modules",
|
||||
"step_number": 1,
|
||||
"total_steps": 3,
|
||||
"next_step_required": True, # Intermediate step
|
||||
"findings": "Initial analysis of utility functions",
|
||||
"files_checked": [utils_file, math_file],
|
||||
"relevant_files": [utils_file], # This should be referenced, not embedded
|
||||
"relevant_context": ["validate_number", "format_result"],
|
||||
"confidence": "low",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start context-aware file embedding test")
|
||||
return False
|
||||
|
||||
response1_data = self._parse_testgen_response(response1)
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Check file context - should be reference_only for intermediate step
|
||||
file_context = response1_data.get("file_context", {})
|
||||
if file_context.get("type") != "reference_only":
|
||||
self.logger.error(f"Expected reference_only file context, got: {file_context.get('type')}")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Intermediate step correctly uses reference_only file context")
|
||||
|
||||
# Test 2: Final step - should embed files for expert analysis
|
||||
self.logger.info(" 1.5.2: Final step (should embed files)")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "Test planning complete - all test scenarios identified",
|
||||
"step_number": 2,
|
||||
"total_steps": 2,
|
||||
"next_step_required": False, # Final step - should embed files
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Complete test plan for all utility functions with edge cases",
|
||||
"files_checked": [utils_file, math_file],
|
||||
"relevant_files": [utils_file, math_file], # Should be fully embedded
|
||||
"relevant_context": ["validate_number", "format_result", "factorial", "is_prime"],
|
||||
"confidence": "high",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to complete the final step")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_testgen_response(response2)
|
||||
if not response2_data:
|
||||
return False
|
||||
|
||||
# Check file context - should be fully_embedded for final step
|
||||
file_context2 = response2_data.get("file_context", {})
|
||||
if file_context2.get("type") != "fully_embedded":
|
||||
self.logger.error(
|
||||
f"Expected fully_embedded file context for final step, got: {file_context2.get('type')}"
|
||||
)
|
||||
return False
|
||||
|
||||
# Verify expert analysis was called for final step
|
||||
if response2_data.get("status") != "calling_expert_analysis":
|
||||
self.logger.error("Final step should trigger expert analysis")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Context-aware file embedding test completed successfully")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Context-aware file embedding test failed: {e}")
|
||||
return False
|
||||
|
||||
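The file-context behaviour asserted in 1.5 amounts to a simple rule, sketched below for clarity. This restates the assertions above (intermediate steps reference files, the final step embeds them for expert analysis); it is not the tool's actual implementation.

def expected_file_context_type(next_step_required: bool) -> str:
    """Sketch of the context-aware embedding contract asserted above (illustrative only)."""
    if next_step_required:
        # Intermediate steps keep conversation context small: files are referenced, not embedded
        return "reference_only"
    # The final step embeds full file content so the expert model sees everything it needs
    return "fully_embedded"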
def _test_multi_step_test_planning(self) -> bool:
|
||||
"""Test multi-step test planning with complex code"""
|
||||
try:
|
||||
self.logger.info(" 1.6: Testing multi-step test planning")
|
||||
|
||||
# Create a complex class to test
|
||||
complex_code = """#!/usr/bin/env python3
|
||||
import asyncio
|
||||
from typing import Any, Dict, List
|
||||
|
||||
class DataProcessor:
|
||||
\"\"\"Complex data processor with async operations\"\"\"
|
||||
|
||||
def __init__(self, batch_size: int = 100):
|
||||
self.batch_size = batch_size
|
||||
self.processed_count = 0
|
||||
self.error_count = 0
|
||||
self.cache: Dict[str, Any] = {}
|
||||
|
||||
async def process_batch(self, items: List[dict]) -> List[dict]:
|
||||
\"\"\"Process a batch of items asynchronously\"\"\"
|
||||
if not items:
|
||||
return []
|
||||
|
||||
if len(items) > self.batch_size:
|
||||
raise ValueError(f"Batch size {len(items)} exceeds limit {self.batch_size}")
|
||||
|
||||
results = []
|
||||
for item in items:
|
||||
try:
|
||||
result = await self._process_single_item(item)
|
||||
results.append(result)
|
||||
self.processed_count += 1
|
||||
except Exception as e:
|
||||
self.error_count += 1
|
||||
results.append({"error": str(e), "item": item})
|
||||
|
||||
return results
|
||||
|
||||
async def _process_single_item(self, item: dict) -> dict:
|
||||
\"\"\"Process a single item with caching\"\"\"
|
||||
item_id = item.get('id')
|
||||
if not item_id:
|
||||
raise ValueError("Item must have an ID")
|
||||
|
||||
# Check cache
|
||||
if item_id in self.cache:
|
||||
return self.cache[item_id]
|
||||
|
||||
# Simulate async processing
|
||||
await asyncio.sleep(0.01)
|
||||
|
||||
processed = {
|
||||
'id': item_id,
|
||||
'processed': True,
|
||||
'value': item.get('value', 0) * 2
|
||||
}
|
||||
|
||||
# Cache result
|
||||
self.cache[item_id] = processed
|
||||
return processed
|
||||
|
||||
def get_stats(self) -> Dict[str, float]:
|
||||
\"\"\"Get processing statistics\"\"\"
|
||||
return {
|
||||
'processed': self.processed_count,
|
||||
'errors': self.error_count,
|
||||
'cache_size': len(self.cache),
|
||||
'success_rate': self.processed_count / (self.processed_count + self.error_count) if (self.processed_count + self.error_count) > 0 else 0
|
||||
}
|
||||
"""
|
||||
|
||||
# Create test file
|
||||
processor_file = self.create_additional_test_file("data_processor.py", complex_code)
|
||||
|
||||
# Step 1: Start investigation
|
||||
self.logger.info(" 1.6.1: Step 1 - Start complex test planning")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "Analyzing complex DataProcessor class for comprehensive test generation",
|
||||
"step_number": 1,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "DataProcessor is an async class with caching, error handling, and statistics. Need async test patterns.",
|
||||
"files_checked": [processor_file],
|
||||
"relevant_files": [processor_file],
|
||||
"relevant_context": ["DataProcessor", "process_batch", "_process_single_item", "get_stats"],
|
||||
"confidence": "low",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start multi-step test planning")
|
||||
return False
|
||||
|
||||
response1_data = self._parse_testgen_response(response1)
|
||||
|
||||
# Validate step 1
|
||||
file_context1 = response1_data.get("file_context", {})
|
||||
if file_context1.get("type") != "reference_only":
|
||||
self.logger.error("Step 1 should use reference_only file context")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 1: Started complex test planning")
|
||||
|
||||
# Step 2: Analyze async patterns
|
||||
self.logger.info(" 1.6.2: Step 2 - Async pattern analysis")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "Analyzing async patterns and edge cases for testing",
|
||||
"step_number": 2,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Key test areas: async batch processing, cache behavior, error handling, batch size limits, empty items, statistics calculation",
|
||||
"files_checked": [processor_file],
|
||||
"relevant_files": [processor_file],
|
||||
"relevant_context": ["process_batch", "_process_single_item"],
|
||||
"confidence": "medium",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue to step 2")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 2: Async patterns analyzed")
|
||||
|
||||
# Step 3: Edge case identification
|
||||
self.logger.info(" 1.6.3: Step 3 - Edge case identification")
|
||||
response3, _ = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "Identifying all edge cases and boundary conditions",
|
||||
"step_number": 3,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Edge cases: empty batch, oversized batch, items without ID, cache hits/misses, concurrent processing, error accumulation",
|
||||
"files_checked": [processor_file],
|
||||
"relevant_files": [processor_file],
|
||||
"confidence": "high",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response3:
|
||||
self.logger.error("Failed to continue to step 3")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 3: Edge cases identified")
|
||||
|
||||
# Step 4: Final test plan with expert analysis
|
||||
self.logger.info(" 1.6.4: Step 4 - Complete test plan")
|
||||
response4, _ = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "Test planning complete with comprehensive coverage strategy",
|
||||
"step_number": 4,
|
||||
"total_steps": 4,
|
||||
"next_step_required": False, # Final step
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Complete async test suite plan: unit tests for each method, integration tests for batch processing, edge case coverage, performance tests",
|
||||
"files_checked": [processor_file],
|
||||
"relevant_files": [processor_file],
|
||||
"confidence": "high",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response4:
|
||||
self.logger.error("Failed to complete the final step")
|
||||
return False
|
||||
|
||||
response4_data = self._parse_testgen_response(response4)
|
||||
|
||||
# Validate final step
|
||||
if response4_data.get("status") != "calling_expert_analysis":
|
||||
self.logger.error("Final step should trigger expert analysis")
|
||||
return False
|
||||
|
||||
file_context4 = response4_data.get("file_context", {})
|
||||
if file_context4.get("type") != "fully_embedded":
|
||||
self.logger.error("Final step should use fully_embedded file context")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Multi-step test planning completed successfully")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Multi-step test planning test failed: {e}")
|
||||
return False
|
||||
|
||||
950 simulator_tests/test_thinkdeep_validation.py Normal file
@@ -0,0 +1,950 @@
#!/usr/bin/env python3
|
||||
"""
|
||||
ThinkDeep Tool Validation Test
|
||||
|
||||
Tests the thinkdeep tool's capabilities using the new workflow architecture.
|
||||
This validates that the workflow-based deep thinking implementation provides
|
||||
step-by-step thinking with expert analysis integration.
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
from .conversation_base_test import ConversationBaseTest
|
||||
|
||||
|
||||
class ThinkDeepWorkflowValidationTest(ConversationBaseTest):
|
||||
"""Test thinkdeep tool with new workflow architecture"""
|
||||
|
||||
@property
|
||||
def test_name(self) -> str:
|
||||
return "thinkdeep_validation"
|
||||
|
||||
@property
|
||||
def test_description(self) -> str:
|
||||
return "ThinkDeep workflow tool validation with new workflow architecture"
|
||||
|
||||
def run_test(self) -> bool:
|
||||
"""Test thinkdeep tool capabilities"""
|
||||
# Set up the test environment
|
||||
self.setUp()
|
||||
|
||||
try:
|
||||
self.logger.info("Test: ThinkDeepWorkflow tool validation (new architecture)")
|
||||
|
||||
# Create test files for thinking context
|
||||
self._create_thinking_context()
|
||||
|
||||
# Test 1: Single thinking session with multiple steps
|
||||
if not self._test_single_thinking_session():
|
||||
return False
|
||||
|
||||
# Test 2: Thinking with backtracking
|
||||
if not self._test_thinking_with_backtracking():
|
||||
return False
|
||||
|
||||
# Test 3: Complete thinking with expert analysis
|
||||
if not self._test_complete_thinking_with_analysis():
|
||||
return False
|
||||
|
||||
# Test 4: Certain confidence behavior
|
||||
if not self._test_certain_confidence():
|
||||
return False
|
||||
|
||||
# Test 5: Context-aware file embedding
|
||||
if not self._test_context_aware_file_embedding():
|
||||
return False
|
||||
|
||||
# Test 6: Multi-step file context optimization
|
||||
if not self._test_multi_step_file_context():
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ All thinkdeep validation tests passed")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"ThinkDeep validation test failed: {e}")
|
||||
return False
|
||||
|
||||
def _create_thinking_context(self):
|
||||
"""Create test files for deep thinking context"""
|
||||
# Create architecture document
|
||||
architecture_doc = """# Microservices Architecture Design
|
||||
|
||||
## Current System
|
||||
- Monolithic application with 500k LOC
|
||||
- Single PostgreSQL database
|
||||
- Peak load: 10k requests/minute
|
||||
- Team size: 25 developers
|
||||
- Deployment: Manual, 2-week cycles
|
||||
|
||||
## Proposed Migration to Microservices
|
||||
|
||||
### Benefits
|
||||
- Independent deployments
|
||||
- Technology diversity
|
||||
- Team autonomy
|
||||
- Scalability improvements
|
||||
|
||||
### Challenges
|
||||
- Data consistency
|
||||
- Network latency
|
||||
- Operational complexity
|
||||
- Transaction management
|
||||
|
||||
### Key Considerations
|
||||
- Service boundaries
|
||||
- Data migration strategy
|
||||
- Communication patterns
|
||||
- Monitoring and observability
|
||||
"""
|
||||
|
||||
# Create requirements document
|
||||
requirements_doc = """# Migration Requirements
|
||||
|
||||
## Business Goals
|
||||
- Reduce deployment cycle from 2 weeks to daily
|
||||
- Support 50k requests/minute by Q4
|
||||
- Enable A/B testing capabilities
|
||||
- Improve system resilience
|
||||
|
||||
## Technical Constraints
|
||||
- Zero downtime migration
|
||||
- Maintain data consistency
|
||||
- Budget: $200k for infrastructure
|
||||
- Timeline: 6 months
|
||||
- Existing team skills: Java, Spring Boot
|
||||
|
||||
## Success Metrics
|
||||
- Deployment frequency: 10x improvement
|
||||
- System availability: 99.9%
|
||||
- Response time: <200ms p95
|
||||
- Developer productivity: 30% improvement
|
||||
"""
|
||||
|
||||
# Create performance analysis
|
||||
performance_analysis = """# Current Performance Analysis
|
||||
|
||||
## Database Bottlenecks
|
||||
- Connection pool exhaustion during peak hours
|
||||
- Complex joins affecting query performance
|
||||
- Lock contention on user_sessions table
|
||||
- Read replica lag causing data inconsistency
|
||||
|
||||
## Application Issues
|
||||
- Memory leaks in background processing
|
||||
- Thread pool starvation
|
||||
- Cache invalidation storms
|
||||
- Session clustering problems
|
||||
|
||||
## Infrastructure Limits
|
||||
- Single server deployment
|
||||
- Manual scaling processes
|
||||
- Limited monitoring capabilities
|
||||
- No circuit breaker patterns
|
||||
"""
|
||||
|
||||
# Create test files
|
||||
self.architecture_file = self.create_additional_test_file("architecture_design.md", architecture_doc)
|
||||
self.requirements_file = self.create_additional_test_file("migration_requirements.md", requirements_doc)
|
||||
self.performance_file = self.create_additional_test_file("performance_analysis.md", performance_analysis)
|
||||
|
||||
self.logger.info(" ✅ Created thinking context files:")
|
||||
self.logger.info(f" - {self.architecture_file}")
|
||||
self.logger.info(f" - {self.requirements_file}")
|
||||
self.logger.info(f" - {self.performance_file}")
|
||||
|
||||
def _test_single_thinking_session(self) -> bool:
|
||||
"""Test a complete thinking session with multiple steps"""
|
||||
try:
|
||||
self.logger.info(" 1.1: Testing single thinking session")
|
||||
|
||||
# Step 1: Start thinking analysis
|
||||
self.logger.info(" 1.1.1: Step 1 - Initial thinking analysis")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "I need to think deeply about the microservices migration strategy. Let me analyze the trade-offs, risks, and implementation approach systematically.",
|
||||
"step_number": 1,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Initial analysis shows significant architectural complexity but potential for major scalability and development velocity improvements. Need to carefully consider migration strategy and service boundaries.",
|
||||
"files_checked": [self.architecture_file, self.requirements_file],
|
||||
"relevant_files": [self.architecture_file, self.requirements_file],
|
||||
"relevant_context": ["microservices_migration", "service_boundaries", "data_consistency"],
|
||||
"confidence": "low",
|
||||
"problem_context": "Enterprise application migration from monolith to microservices",
|
||||
"focus_areas": ["architecture", "scalability", "risk_assessment"],
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to get initial thinking response")
|
||||
return False
|
||||
|
||||
# Parse and validate JSON response
|
||||
response1_data = self._parse_thinkdeep_response(response1)
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Validate step 1 response structure - expect pause_for_thinkdeep for next_step_required=True
|
||||
if not self._validate_step_response(response1_data, 1, 4, True, "pause_for_thinkdeep"):
|
||||
return False
|
||||
|
||||
self.logger.info(f" ✅ Step 1 successful, continuation_id: {continuation_id}")
|
||||
|
||||
# Step 2: Deep analysis
|
||||
self.logger.info(" 1.1.2: Step 2 - Deep analysis of alternatives")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Analyzing different migration approaches: strangler fig pattern vs big bang vs gradual extraction. Each has different risk profiles and timelines.",
|
||||
"step_number": 2,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Strangler fig pattern emerges as best approach: lower risk, incremental value delivery, team learning curve management. Key insight: start with read-only services to minimize data consistency issues.",
|
||||
"files_checked": [self.architecture_file, self.requirements_file, self.performance_file],
|
||||
"relevant_files": [self.architecture_file, self.performance_file],
|
||||
"relevant_context": ["strangler_fig_pattern", "service_extraction", "risk_mitigation"],
|
||||
"issues_found": [
|
||||
{"severity": "high", "description": "Data consistency challenges during migration"},
|
||||
{"severity": "medium", "description": "Team skill gap in distributed systems"},
|
||||
],
|
||||
"confidence": "medium",
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue thinking to step 2")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_thinkdeep_response(response2)
|
||||
if not self._validate_step_response(response2_data, 2, 4, True, "pause_for_thinkdeep"):
|
||||
return False
|
||||
|
||||
# Check thinking status tracking
|
||||
thinking_status = response2_data.get("thinking_status", {})
|
||||
if thinking_status.get("files_checked", 0) < 3:
|
||||
self.logger.error("Files checked count not properly tracked")
|
||||
return False
|
||||
|
||||
if thinking_status.get("thinking_confidence") != "medium":
|
||||
self.logger.error("Confidence level not properly tracked")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 2 successful with proper tracking")
|
||||
|
||||
# Store continuation_id for next test
|
||||
self.thinking_continuation_id = continuation_id
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Single thinking session test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_thinking_with_backtracking(self) -> bool:
|
||||
"""Test thinking with backtracking to revise analysis"""
|
||||
try:
|
||||
self.logger.info(" 1.2: Testing thinking with backtracking")
|
||||
|
||||
# Start a new thinking session for testing backtracking
|
||||
self.logger.info(" 1.2.1: Start thinking for backtracking test")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Thinking about optimal database architecture for the new microservices",
|
||||
"step_number": 1,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Initial thought: each service should have its own database for independence",
|
||||
"files_checked": [self.architecture_file],
|
||||
"relevant_files": [self.architecture_file],
|
||||
"relevant_context": ["database_per_service", "data_independence"],
|
||||
"confidence": "low",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start backtracking test thinking")
|
||||
return False
|
||||
|
||||
# Step 2: Initial direction
|
||||
self.logger.info(" 1.2.2: Step 2 - Initial analysis direction")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Exploring database-per-service pattern implementation",
|
||||
"step_number": 2,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Database-per-service creates significant complexity for transactions and reporting",
|
||||
"files_checked": [self.architecture_file, self.performance_file],
|
||||
"relevant_files": [self.performance_file],
|
||||
"relevant_context": ["database_per_service", "transaction_management"],
|
||||
"issues_found": [
|
||||
{"severity": "high", "description": "Cross-service transactions become complex"},
|
||||
{"severity": "medium", "description": "Reporting queries span multiple databases"},
|
||||
],
|
||||
"confidence": "low",
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue to step 2")
|
||||
return False
|
||||
|
||||
# Step 3: Backtrack and revise approach
|
||||
self.logger.info(" 1.2.3: Step 3 - Backtrack and revise thinking")
|
||||
response3, _ = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Backtracking - maybe shared database with service-specific schemas is better initially. Then gradually extract databases as services mature.",
|
||||
"step_number": 3,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Hybrid approach: shared database with bounded contexts, then gradual extraction. This reduces initial complexity while preserving migration path to full service independence.",
|
||||
"files_checked": [self.architecture_file, self.requirements_file],
|
||||
"relevant_files": [self.architecture_file, self.requirements_file],
|
||||
"relevant_context": ["shared_database", "bounded_contexts", "gradual_extraction"],
|
||||
"confidence": "medium",
|
||||
"backtrack_from_step": 2, # Backtrack from step 2
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
if not response3:
|
||||
self.logger.error("Failed to backtrack")
|
||||
return False
|
||||
|
||||
response3_data = self._parse_thinkdeep_response(response3)
|
||||
if not self._validate_step_response(response3_data, 3, 4, True, "pause_for_thinkdeep"):
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Backtracking working correctly")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Backtracking test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_complete_thinking_with_analysis(self) -> bool:
|
||||
"""Test complete thinking ending with expert analysis"""
|
||||
try:
|
||||
self.logger.info(" 1.3: Testing complete thinking with expert analysis")
|
||||
|
||||
# Use the continuation from first test
|
||||
continuation_id = getattr(self, "thinking_continuation_id", None)
|
||||
if not continuation_id:
|
||||
# Start fresh if no continuation available
|
||||
self.logger.info(" 1.3.0: Starting fresh thinking session")
|
||||
response0, continuation_id = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Thinking about the complete microservices migration strategy",
|
||||
"step_number": 1,
|
||||
"total_steps": 2,
|
||||
"next_step_required": True,
|
||||
"findings": "Comprehensive analysis of migration approaches and risks",
|
||||
"files_checked": [self.architecture_file, self.requirements_file],
|
||||
"relevant_files": [self.architecture_file, self.requirements_file],
|
||||
"relevant_context": ["migration_strategy", "risk_assessment"],
|
||||
},
|
||||
)
|
||||
if not response0 or not continuation_id:
|
||||
self.logger.error("Failed to start fresh thinking session")
|
||||
return False
|
||||
|
||||
# Final step - trigger expert analysis
|
||||
self.logger.info(" 1.3.1: Final step - complete thinking analysis")
|
||||
response_final, _ = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Thinking analysis complete. I've thoroughly considered the migration strategy, risks, and implementation approach.",
|
||||
"step_number": 2,
|
||||
"total_steps": 2,
|
||||
"next_step_required": False, # Final step - triggers expert analysis
|
||||
"findings": "Comprehensive migration strategy: strangler fig pattern with shared database initially, gradual service extraction based on business value and technical feasibility. Key success factors: team training, monitoring infrastructure, and incremental rollout.",
|
||||
"files_checked": [self.architecture_file, self.requirements_file, self.performance_file],
|
||||
"relevant_files": [self.architecture_file, self.requirements_file, self.performance_file],
|
||||
"relevant_context": ["strangler_fig", "migration_strategy", "risk_mitigation", "team_readiness"],
|
||||
"issues_found": [
|
||||
{"severity": "medium", "description": "Team needs distributed systems training"},
|
||||
{"severity": "low", "description": "Monitoring tools need upgrade"},
|
||||
],
|
||||
"confidence": "high",
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash", # Use flash for expert analysis
|
||||
},
|
||||
)
|
||||
|
||||
if not response_final:
|
||||
self.logger.error("Failed to complete thinking")
|
||||
return False
|
||||
|
||||
response_final_data = self._parse_thinkdeep_response(response_final)
|
||||
if not response_final_data:
|
||||
return False
|
||||
|
||||
# Validate final response structure - accept both expert analysis and special statuses
|
||||
valid_final_statuses = ["calling_expert_analysis", "files_required_to_continue"]
|
||||
if response_final_data.get("status") not in valid_final_statuses:
|
||||
self.logger.error(
|
||||
f"Expected status in {valid_final_statuses}, got '{response_final_data.get('status')}'"
|
||||
)
|
||||
return False
|
||||
|
||||
if not response_final_data.get("thinking_complete"):
|
||||
self.logger.error("Expected thinking_complete=true for final step")
|
||||
return False
|
||||
|
||||
# Check for expert analysis or special status content
|
||||
if response_final_data.get("status") == "calling_expert_analysis":
|
||||
if "expert_analysis" not in response_final_data:
|
||||
self.logger.error("Missing expert_analysis in final response")
|
||||
return False
|
||||
expert_analysis = response_final_data.get("expert_analysis", {})
|
||||
else:
|
||||
# For special statuses like files_required_to_continue, analysis may be in content
|
||||
expert_analysis = response_final_data.get("content", "{}")
|
||||
if isinstance(expert_analysis, str):
|
||||
try:
|
||||
expert_analysis = json.loads(expert_analysis)
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
expert_analysis = {"analysis": expert_analysis}
|
||||
|
||||
# Check for expected analysis content (checking common patterns)
|
||||
analysis_text = json.dumps(expert_analysis).lower()
|
||||
|
||||
# Look for thinking analysis validation
|
||||
thinking_indicators = ["migration", "strategy", "microservices", "risk", "approach", "implementation"]
|
||||
found_indicators = sum(1 for indicator in thinking_indicators if indicator in analysis_text)
|
||||
|
||||
if found_indicators >= 3:
|
||||
self.logger.info(" ✅ Expert analysis validated the thinking correctly")
|
||||
else:
|
||||
self.logger.warning(
|
||||
f" ⚠️ Expert analysis may not have fully validated the thinking (found {found_indicators}/6 indicators)"
|
||||
)
|
||||
|
||||
# Check complete thinking summary
|
||||
if "complete_thinking" not in response_final_data:
|
||||
self.logger.error("Missing complete_thinking in final response")
|
||||
return False
|
||||
|
||||
complete_thinking = response_final_data["complete_thinking"]
|
||||
if not complete_thinking.get("relevant_context"):
|
||||
self.logger.error("Missing relevant context in complete thinking")
|
||||
return False
|
||||
|
||||
if "migration_strategy" not in complete_thinking["relevant_context"]:
|
||||
self.logger.error("Expected context not found in thinking summary")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Complete thinking with expert analysis successful")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Complete thinking test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_certain_confidence(self) -> bool:
|
||||
"""Test certain confidence behavior - should skip expert analysis"""
|
||||
try:
|
||||
self.logger.info(" 1.4: Testing certain confidence behavior")
|
||||
|
||||
# Test certain confidence - should skip expert analysis
|
||||
self.logger.info(" 1.4.1: Certain confidence thinking")
|
||||
response_certain, _ = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "I have thoroughly analyzed all aspects of the migration strategy with complete certainty.",
|
||||
"step_number": 1,
|
||||
"total_steps": 1,
|
||||
"next_step_required": False, # Final step
|
||||
"findings": "Definitive conclusion: strangler fig pattern with phased database extraction is the optimal approach. Risk mitigation through team training and robust monitoring. Timeline: 6 months with monthly service extractions.",
|
||||
"files_checked": [self.architecture_file, self.requirements_file, self.performance_file],
|
||||
"relevant_files": [self.architecture_file, self.requirements_file],
|
||||
"relevant_context": ["migration_complete_strategy", "implementation_plan"],
|
||||
"confidence": "certain", # This should skip expert analysis
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response_certain:
|
||||
self.logger.error("Failed to test certain confidence")
|
||||
return False
|
||||
|
||||
response_certain_data = self._parse_thinkdeep_response(response_certain)
|
||||
if not response_certain_data:
|
||||
return False
|
||||
|
||||
# Validate certain confidence response - should skip expert analysis
|
||||
if response_certain_data.get("status") != "deep_thinking_complete_ready_for_implementation":
|
||||
self.logger.error(
|
||||
f"Expected status 'deep_thinking_complete_ready_for_implementation', got '{response_certain_data.get('status')}'"
|
||||
)
|
||||
return False
|
||||
|
||||
if not response_certain_data.get("skip_expert_analysis"):
|
||||
self.logger.error("Expected skip_expert_analysis=true for certain confidence")
|
||||
return False
|
||||
|
||||
expert_analysis = response_certain_data.get("expert_analysis", {})
|
||||
if expert_analysis.get("status") != "skipped_due_to_certain_thinking_confidence":
|
||||
self.logger.error("Expert analysis should be skipped for certain confidence")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Certain confidence behavior working correctly")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Certain confidence test failed: {e}")
|
||||
return False
|
||||
|
||||
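The certain-confidence path asserted above implies a response shaped roughly as follows. The status strings are copied from the assertions; everything else is illustrative.

# Illustrative only: the skip-expert-analysis payload these assertions expect
example_certain_confidence_response = {
    "status": "deep_thinking_complete_ready_for_implementation",
    "skip_expert_analysis": True,
    "expert_analysis": {
        "status": "skipped_due_to_certain_thinking_confidence",
    },
}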
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
|
||||
"""Call an MCP tool in-process - override for thinkdeep-specific response handling"""
|
||||
# Use in-process implementation to maintain conversation memory
|
||||
response_text, _ = self.call_mcp_tool_direct(tool_name, params)
|
||||
|
||||
if not response_text:
|
||||
return None, None
|
||||
|
||||
# Extract continuation_id from thinkdeep response specifically
|
||||
continuation_id = self._extract_thinkdeep_continuation_id(response_text)
|
||||
|
||||
return response_text, continuation_id
|
||||
|
||||
def _extract_thinkdeep_continuation_id(self, response_text: str) -> Optional[str]:
|
||||
"""Extract continuation_id from thinkdeep response"""
|
||||
try:
|
||||
# Parse the response
|
||||
response_data = json.loads(response_text)
|
||||
return response_data.get("continuation_id")
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
self.logger.debug(f"Failed to parse response for thinkdeep continuation_id: {e}")
|
||||
return None
|
||||
|
||||
def _parse_thinkdeep_response(self, response_text: str) -> dict:
|
||||
"""Parse thinkdeep tool JSON response"""
|
||||
try:
|
||||
# Parse the response - it should be direct JSON
|
||||
return json.loads(response_text)
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
self.logger.error(f"Failed to parse thinkdeep response as JSON: {e}")
|
||||
self.logger.error(f"Response text: {response_text[:500]}...")
|
||||
return {}
|
||||
|
||||
def _validate_step_response(
|
||||
self,
|
||||
response_data: dict,
|
||||
expected_step: int,
|
||||
expected_total: int,
|
||||
expected_next_required: bool,
|
||||
expected_status: str,
|
||||
) -> bool:
|
||||
"""Validate a thinkdeep thinking step response structure"""
|
||||
try:
|
||||
# Check status
|
||||
if response_data.get("status") != expected_status:
|
||||
self.logger.error(f"Expected status '{expected_status}', got '{response_data.get('status')}'")
|
||||
return False
|
||||
|
||||
# Check step number
|
||||
if response_data.get("step_number") != expected_step:
|
||||
self.logger.error(f"Expected step_number {expected_step}, got {response_data.get('step_number')}")
|
||||
return False
|
||||
|
||||
# Check total steps
|
||||
if response_data.get("total_steps") != expected_total:
|
||||
self.logger.error(f"Expected total_steps {expected_total}, got {response_data.get('total_steps')}")
|
||||
return False
|
||||
|
||||
# Check next_step_required
|
||||
if response_data.get("next_step_required") != expected_next_required:
|
||||
self.logger.error(
|
||||
f"Expected next_step_required {expected_next_required}, got {response_data.get('next_step_required')}"
|
||||
)
|
||||
return False
|
||||
|
||||
# Check thinking_status exists
|
||||
if "thinking_status" not in response_data:
|
||||
self.logger.error("Missing thinking_status in response")
|
||||
return False
|
||||
|
||||
# Check next_steps guidance
|
||||
if not response_data.get("next_steps"):
|
||||
self.logger.error("Missing next_steps guidance in response")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error validating step response: {e}")
|
||||
return False
|
||||
|
||||
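As with the testgen validator, a response that passes this check has roughly the following shape. The field names and the pause_for_thinkdeep status come from the assertions in this file; the values are placeholders.

# Illustrative only: a minimal step response accepted by _validate_step_response above
example_thinkdeep_step_response = {
    "status": "pause_for_thinkdeep",  # intermediate-step status expected by these tests
    "step_number": 2,
    "total_steps": 4,
    "next_step_required": True,
    "thinking_status": {  # presence is required; earlier tests also read:
        "files_checked": 3,  # files_checked count (see 1.1.2)
        "thinking_confidence": "medium",  # confidence tracking (see 1.1.2)
    },
    "next_steps": "Continue the analysis in step 3",
}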
def _test_context_aware_file_embedding(self) -> bool:
|
||||
"""Test context-aware file embedding optimization"""
|
||||
try:
|
||||
self.logger.info(" 1.5: Testing context-aware file embedding")
|
||||
|
||||
# Create additional test files for context testing
|
||||
strategy_doc = """# Implementation Strategy
|
||||
|
||||
## Phase 1: Foundation (Month 1-2)
|
||||
- Set up monitoring and logging infrastructure
|
||||
- Establish CI/CD pipelines for microservices
|
||||
- Team training on distributed systems concepts
|
||||
|
||||
## Phase 2: Initial Services (Month 3-4)
|
||||
- Extract read-only services (user profiles, product catalog)
|
||||
- Implement API gateway
|
||||
- Set up service discovery
|
||||
|
||||
## Phase 3: Core Services (Month 5-6)
|
||||
- Extract transaction services
|
||||
- Implement saga patterns for distributed transactions
|
||||
- Performance optimization and monitoring
|
||||
"""
|
||||
|
||||
tech_stack_doc = """# Technology Stack Decisions
|
||||
|
||||
## Service Framework
|
||||
- Spring Boot 2.7 (team familiarity)
|
||||
- Docker containers
|
||||
- Kubernetes orchestration
|
||||
|
||||
## Communication
|
||||
- REST APIs for synchronous communication
|
||||
- Apache Kafka for asynchronous messaging
|
||||
- gRPC for high-performance internal communication
|
||||
|
||||
## Data Layer
|
||||
- PostgreSQL (existing expertise)
|
||||
- Redis for caching
|
||||
- Elasticsearch for search and analytics
|
||||
|
||||
## Monitoring
|
||||
- Prometheus + Grafana
|
||||
- Distributed tracing with Jaeger
|
||||
- Centralized logging with ELK stack
|
||||
"""
|
||||
|
||||
# Create test files
|
||||
strategy_file = self.create_additional_test_file("implementation_strategy.md", strategy_doc)
|
||||
tech_stack_file = self.create_additional_test_file("tech_stack.md", tech_stack_doc)
|
||||
|
||||
# Test 1: New conversation, intermediate step - should only reference files
|
||||
self.logger.info(" 1.5.1: New conversation intermediate step (should reference only)")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Starting deep thinking about implementation timeline and technology choices",
|
||||
"step_number": 1,
|
||||
"total_steps": 3,
|
||||
"next_step_required": True, # Intermediate step
|
||||
"findings": "Initial analysis of implementation strategy and technology stack decisions",
|
||||
"files_checked": [strategy_file, tech_stack_file],
|
||||
"relevant_files": [strategy_file], # This should be referenced, not embedded
|
||||
"relevant_context": ["implementation_timeline", "technology_selection"],
|
||||
"confidence": "low",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start context-aware file embedding test")
|
||||
return False
|
||||
|
||||
response1_data = self._parse_thinkdeep_response(response1)
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Check file context - should be reference_only for intermediate step
|
||||
file_context = response1_data.get("file_context", {})
|
||||
if file_context.get("type") != "reference_only":
|
||||
self.logger.error(f"Expected reference_only file context, got: {file_context.get('type')}")
|
||||
return False
|
||||
|
||||
if "Files referenced but not embedded" not in file_context.get("context_optimization", ""):
|
||||
self.logger.error("Expected context optimization message for reference_only")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Intermediate step correctly uses reference_only file context")
|
||||
|
||||
# Test 2: Final step - should embed files for expert analysis
|
||||
self.logger.info(" 1.5.2: Final step (should embed files)")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Thinking analysis complete - comprehensive evaluation of implementation approach",
|
||||
"step_number": 2,
|
||||
"total_steps": 2,
|
||||
"next_step_required": False, # Final step - should embed files
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Complete analysis: phased implementation with proven technology stack minimizes risk while maximizing team effectiveness. Timeline is realistic with proper training and infrastructure setup.",
|
||||
"files_checked": [strategy_file, tech_stack_file],
|
||||
"relevant_files": [strategy_file, tech_stack_file], # Should be fully embedded
|
||||
"relevant_context": ["implementation_plan", "technology_decisions", "risk_management"],
|
||||
"confidence": "high",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to complete the final step")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_thinkdeep_response(response2)
|
||||
if not response2_data:
|
||||
return False
|
||||
|
||||
# Check file context - should be fully_embedded for final step
|
||||
file_context2 = response2_data.get("file_context", {})
|
||||
if file_context2.get("type") != "fully_embedded":
|
||||
self.logger.error(
|
||||
f"Expected fully_embedded file context for final step, got: {file_context2.get('type')}"
|
||||
)
|
||||
return False
|
||||
|
||||
if "Full file content embedded for expert analysis" not in file_context2.get("context_optimization", ""):
|
||||
self.logger.error("Expected expert analysis optimization message for fully_embedded")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Final step correctly uses fully_embedded file context")
|
||||
|
||||
# Verify expert analysis was called for final step
|
||||
if response2_data.get("status") != "calling_expert_analysis":
|
||||
self.logger.error("Final step should trigger expert analysis")
|
||||
return False
|
||||
|
||||
if "expert_analysis" not in response2_data:
|
||||
self.logger.error("Expert analysis should be present in final step")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Context-aware file embedding test completed successfully")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Context-aware file embedding test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_multi_step_file_context(self) -> bool:
|
||||
"""Test multi-step workflow with proper file context transitions"""
|
||||
try:
|
||||
self.logger.info(" 1.6: Testing multi-step file context optimization")
|
||||
|
||||
# Create a complex scenario with multiple thinking documents
|
||||
risk_analysis = """# Risk Analysis
|
||||
|
||||
## Technical Risks
|
||||
- Service mesh complexity
|
||||
- Data consistency challenges
|
||||
- Performance degradation during migration
|
||||
- Operational overhead increase
|
||||
|
||||
## Business Risks
|
||||
- Extended development timelines
|
||||
- Potential system instability
|
||||
- Team productivity impact
|
||||
- Customer experience disruption
|
||||
|
||||
## Mitigation Strategies
|
||||
- Gradual rollout with feature flags
|
||||
- Comprehensive monitoring and alerting
|
||||
- Rollback procedures for each phase
|
||||
- Customer communication plan
|
||||
"""
|
||||
|
||||
success_metrics = """# Success Metrics and KPIs
|
||||
|
||||
## Development Velocity
|
||||
- Deployment frequency: Target 10x improvement
|
||||
- Lead time for changes: <2 hours
|
||||
- Mean time to recovery: <30 minutes
|
||||
- Change failure rate: <5%
|
||||
|
||||
## System Performance
|
||||
- Response time: <200ms p95
|
||||
- System availability: 99.9%
|
||||
- Throughput: 50k requests/minute
|
||||
- Resource utilization: 70% optimal
|
||||
|
||||
## Business Impact
|
||||
- Developer satisfaction: >8/10
|
||||
- Time to market: 50% reduction
|
||||
- Operational costs: 20% reduction
|
||||
- System reliability: 99.9% uptime
|
||||
"""
|
||||
|
||||
# Create test files
|
||||
risk_file = self.create_additional_test_file("risk_analysis.md", risk_analysis)
|
||||
metrics_file = self.create_additional_test_file("success_metrics.md", success_metrics)
|
||||
|
||||
# Step 1: Start thinking analysis (new conversation)
|
||||
self.logger.info(" 1.6.1: Step 1 - Start thinking analysis")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Beginning comprehensive analysis of migration risks and success criteria",
|
||||
"step_number": 1,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Initial assessment of risk factors and success metrics for microservices migration",
|
||||
"files_checked": [risk_file],
|
||||
"relevant_files": [risk_file],
|
||||
"relevant_context": ["risk_assessment", "migration_planning"],
|
||||
"confidence": "low",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start multi-step file context test")
|
||||
return False
|
||||
|
||||
response1_data = self._parse_thinkdeep_response(response1)
|
||||
|
||||
# Validate step 1 - should use reference_only
|
||||
file_context1 = response1_data.get("file_context", {})
|
||||
if file_context1.get("type") != "reference_only":
|
||||
self.logger.error("Step 1 should use reference_only file context")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 1: reference_only file context")
|
||||
|
||||
# Step 2: Expand thinking analysis
|
||||
self.logger.info(" 1.6.2: Step 2 - Expand thinking analysis")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Deepening analysis by correlating risks with success metrics",
|
||||
"step_number": 2,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Key insight: technical risks directly impact business metrics. Need balanced approach prioritizing high-impact, low-risk improvements first.",
|
||||
"files_checked": [risk_file, metrics_file],
|
||||
"relevant_files": [risk_file, metrics_file],
|
||||
"relevant_context": ["risk_metric_correlation", "priority_matrix"],
|
||||
"confidence": "medium",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue to step 2")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_thinkdeep_response(response2)
|
||||
|
||||
# Validate step 2 - should still use reference_only
|
||||
file_context2 = response2_data.get("file_context", {})
|
||||
if file_context2.get("type") != "reference_only":
|
||||
self.logger.error("Step 2 should use reference_only file context")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 2: reference_only file context with multiple files")
|
||||
|
||||
# Step 3: Deep analysis
|
||||
self.logger.info(" 1.6.3: Step 3 - Deep strategic analysis")
|
||||
response3, _ = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Synthesizing risk mitigation strategies with measurable success criteria",
|
||||
"step_number": 3,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Strategic framework emerging: phase-gate approach with clear go/no-go criteria at each milestone. Emphasis on early wins to build confidence and momentum.",
|
||||
"files_checked": [risk_file, metrics_file, self.requirements_file],
|
||||
"relevant_files": [risk_file, metrics_file, self.requirements_file],
|
||||
"relevant_context": ["phase_gate_approach", "milestone_criteria", "early_wins"],
|
||||
"confidence": "high",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response3:
|
||||
self.logger.error("Failed to continue to step 3")
|
||||
return False
|
||||
|
||||
response3_data = self._parse_thinkdeep_response(response3)
|
||||
|
||||
# Validate step 3 - should still use reference_only
|
||||
file_context3 = response3_data.get("file_context", {})
|
||||
if file_context3.get("type") != "reference_only":
|
||||
self.logger.error("Step 3 should use reference_only file context")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 3: reference_only file context")
|
||||
|
||||
# Step 4: Final analysis with expert consultation
|
||||
self.logger.info(" 1.6.4: Step 4 - Final step with expert analysis")
|
||||
response4, _ = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Thinking analysis complete - comprehensive strategic framework developed",
|
||||
"step_number": 4,
|
||||
"total_steps": 4,
|
||||
"next_step_required": False, # Final step - should embed files
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Complete strategic framework: risk-balanced migration with measurable success criteria, phase-gate governance, and clear rollback procedures. Framework aligns technical execution with business objectives.",
|
||||
"files_checked": [risk_file, metrics_file, self.requirements_file, self.architecture_file],
|
||||
"relevant_files": [risk_file, metrics_file, self.requirements_file, self.architecture_file],
|
||||
"relevant_context": ["strategic_framework", "governance_model", "success_measurement"],
|
||||
"confidence": "high",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response4:
|
||||
self.logger.error("Failed to complete the final step")
|
||||
return False
|
||||
|
||||
response4_data = self._parse_thinkdeep_response(response4)
|
||||
|
||||
# Validate step 4 - should use fully_embedded for expert analysis
|
||||
file_context4 = response4_data.get("file_context", {})
|
||||
if file_context4.get("type") != "fully_embedded":
|
||||
self.logger.error("Step 4 (final) should use fully_embedded file context")
|
||||
return False
|
||||
|
||||
if "expert analysis" not in file_context4.get("context_optimization", "").lower():
|
||||
self.logger.error("Final step should mention expert analysis in context optimization")
|
||||
return False
|
||||
|
||||
# Verify expert analysis was triggered
|
||||
if response4_data.get("status") != "calling_expert_analysis":
|
||||
self.logger.error("Final step should trigger expert analysis")
|
||||
return False
|
||||
|
||||
# Check that expert analysis has file context
|
||||
expert_analysis = response4_data.get("expert_analysis", {})
|
||||
if not expert_analysis:
|
||||
self.logger.error("Expert analysis should be present in final step")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 4: fully_embedded file context with expert analysis")
|
||||
|
||||
# Validate the complete workflow progression
|
||||
progression_summary = {
|
||||
"step_1": "reference_only (new conversation, intermediate)",
|
||||
"step_2": "reference_only (continuation, intermediate)",
|
||||
"step_3": "reference_only (continuation, intermediate)",
|
||||
"step_4": "fully_embedded (continuation, final)",
|
||||
}
|
||||
|
||||
self.logger.info(" 📋 File context progression:")
|
||||
for step, context_type in progression_summary.items():
|
||||
self.logger.info(f" {step}: {context_type}")
|
||||
|
||||
self.logger.info(" ✅ Multi-step file context optimization test completed successfully")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Multi-step file context test failed: {e}")
|