Add DocGen tool with comprehensive documentation generation capabilities (#109)
* WIP: new workflow architecture
* WIP: further improvements and cleanup
* WIP: cleanup and docs, replace old tool with new
* WIP: new planner implementation using workflow
* WIP: precommit tool working as a workflow instead of a basic tool; support for passing False to use_assistant_model to skip external models completely and use Claude only
* WIP: precommit workflow version swapped with old
* WIP: codereview
* WIP: replaced codereview
* WIP: replaced refactor
* WIP: workflow for thinkdeep
* WIP: ensure files get embedded correctly
* WIP: thinkdeep replaced with workflow version
* WIP: improved messaging when an external model's response is received
* WIP: analyze tool swapped
* WIP: updated tests
* Extract only the content when building history
* Use "relevant_files" for workflow tools only
* WIP: fixed get_completion_next_steps_message missing param
* Fixed tests; request for files consistently
* New testgen workflow tool; updated docs
* Swap testgen workflow
* Fix CI test failures by excluding API-dependent tests: update GitHub Actions workflow to exclude simulation tests that require API keys, fix collaboration tests to properly mock workflow tool expert analysis calls, update test assertions to handle the new workflow tool response format, and ensure unit tests run without external API dependencies in CI
* WIP: update tests to match new tools
* Should help with https://github.com/BeehiveInnovations/zen-mcp-server/issues/97; clear Python cache when running the script (https://github.com/BeehiveInnovations/zen-mcp-server/issues/96); improved retry error logging; cleanup
* WIP: chat tool using new architecture and improved code sharing
* Removed todo
* Cleanup old name
* Tweak wordings; migrate old tests
* Support for Flash 2.0 and Flash Lite 2.0; fixed test
* Improved consensus to use the workflow base class
* Allow images
* Replaced old consensus tool
* Cleanup tests
* Tests for prompt size
* New tool: docgen; fixes https://github.com/BeehiveInnovations/zen-mcp-server/issues/107; use available token size limits (https://github.com/BeehiveInnovations/zen-mcp-server/issues/105)
* Improved docgen prompt; exclude TestGen from pytest inclusion
* Updated errors
* Lint
* DocGen instructed not to fix bugs, surface them and stick to documentation
* Stop Claude from being lazy and only documenting a small handful
* More style rules

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-authored-by: Claude <noreply@anthropic.com>
Committed by: GitHub
Parent: 0655590a51
Commit: c960bcb720
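The workflow-style tools referenced throughout this commit take step-based arguments instead of a single prompt. A minimal sketch of one investigation step, using the field names the simulator tests below pass to analyze/debug/codereview; the exact request schema lives in the tool implementations and is assumed here:

step_request = {
    "step": "Starting comprehensive code analysis to find security issues",  # what this step does
    "step_number": 1,               # current step in the investigation
    "total_steps": 2,               # planned number of steps
    "next_step_required": True,     # False on the final step
    "findings": "Initial findings recorded for this step",
    "relevant_files": ["/path/to/code.py"],  # workflow tools use relevant_files, per this commit
    "model": "flash",
    "thinking_mode": "low",
    # "use_assistant_model": False,  # per the commit message: skip the external model, use Claude only
}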
@@ -8,6 +8,7 @@ Each test is in its own file for better organization and maintainability.
from .base_test import BaseSimulatorTest
from .test_analyze_validation import AnalyzeValidationTest
from .test_basic_conversation import BasicConversationTest
from .test_chat_simple_validation import ChatSimpleValidationTest
from .test_codereview_validation import CodeReviewValidationTest
from .test_consensus_conversation import TestConsensusConversation
from .test_consensus_stance import TestConsensusStance
@@ -30,6 +31,7 @@ from .test_per_tool_deduplication import PerToolDeduplicationTest
from .test_planner_continuation_history import PlannerContinuationHistoryTest
from .test_planner_validation import PlannerValidationTest
from .test_precommitworkflow_validation import PrecommitWorkflowValidationTest
from .test_prompt_size_limit_bug import PromptSizeLimitBugTest

# Redis validation test removed - no longer needed for standalone server
from .test_refactor_validation import RefactorValidationTest
@@ -42,6 +44,7 @@ from .test_xai_models import XAIModelsTest
# Test registry for dynamic loading
TEST_REGISTRY = {
    "basic_conversation": BasicConversationTest,
    "chat_validation": ChatSimpleValidationTest,
    "codereview_validation": CodeReviewValidationTest,
    "content_validation": ContentValidationTest,
    "per_tool_deduplication": PerToolDeduplicationTest,
@@ -71,12 +74,14 @@ TEST_REGISTRY = {
    "consensus_stance": TestConsensusStance,
    "consensus_three_models": TestConsensusThreeModels,
    "analyze_validation": AnalyzeValidationTest,
    "prompt_size_limit_bug": PromptSizeLimitBugTest,
    # "o3_pro_expensive": O3ProExpensiveTest,  # COMMENTED OUT - too expensive to run by default
}

__all__ = [
    "BaseSimulatorTest",
    "BasicConversationTest",
    "ChatSimpleValidationTest",
    "CodeReviewValidationTest",
    "ContentValidationTest",
    "PerToolDeduplicationTest",
@@ -106,5 +111,6 @@ __all__ = [
    "TestConsensusStance",
    "TestConsensusThreeModels",
    "AnalyzeValidationTest",
    "PromptSizeLimitBugTest",
    "TEST_REGISTRY",
]
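The TEST_REGISTRY added above is what the simulator runner uses for dynamic loading. A minimal sketch of looking a test up by name and running it; run_named_test is illustrative and not part of the repository:

from simulator_tests import TEST_REGISTRY

def run_named_test(name: str, verbose: bool = False) -> bool:
    # e.g. name = "chat_validation" or "prompt_size_limit_bug"
    test_cls = TEST_REGISTRY[name]
    test = test_cls(verbose=verbose)  # constructor mirrors main() in test_prompt_size_limit_bug.py below
    return test.run_test()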
simulator_tests/test_chat_simple_validation.py (new file, 509 lines)
@@ -0,0 +1,509 @@
#!/usr/bin/env python3
"""
Chat Simple Tool Validation Test

Comprehensive test for the new ChatSimple tool implementation that validates:
- Basic conversation flow without continuation_id (new chats)
- Continuing existing conversations with continuation_id (continued chats)
- File handling with conversation context (chats with files)
- Image handling in conversations (chat with images)
- Continuing conversations with files from previous turns (continued chats with files previously)
- Temperature validation for different models
- Image limit validation per model
- Conversation context preservation across turns
"""


from .conversation_base_test import ConversationBaseTest


class ChatSimpleValidationTest(ConversationBaseTest):
    """Test ChatSimple tool functionality and validation"""

    @property
    def test_name(self) -> str:
        return "_validation"

    @property
    def test_description(self) -> str:
        return "Comprehensive validation of ChatSimple tool implementation"

    def run_test(self) -> bool:
        """Run comprehensive ChatSimple validation tests"""
        try:
            # Set up the test environment for in-process testing
            self.setUp()

            self.logger.info("Test: ChatSimple tool validation")

            # Run all test scenarios
            if not self.test_new_conversation_no_continuation():
                return False

            if not self.test_continue_existing_conversation():
                return False

            if not self.test_file_handling_with_conversation():
                return False

            if not self.test_temperature_validation_edge_cases():
                return False

            if not self.test_image_limits_per_model():
                return False

            if not self.test_conversation_context_preservation():
                return False

            if not self.test_chat_with_images():
                return False

            if not self.test_continued_chat_with_previous_files():
                return False

            self.logger.info(" ✅ All ChatSimple validation tests passed")
            return True

        except Exception as e:
            self.logger.error(f"ChatSimple validation test failed: {e}")
            return False

    def test_new_conversation_no_continuation(self) -> bool:
        """Test ChatSimple creates new conversation without continuation_id"""
        try:
            self.logger.info(" 1. Test new conversation without continuation_id")

            # Call chat without continuation_id
            response, continuation_id = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Hello! Please use low thinking mode. Can you explain what MCP tools are?",
                    "model": "flash",
                    "temperature": 0.7,
                    "thinking_mode": "low",
                },
            )

            if not response:
                self.logger.error(" ❌ Failed to get response from chat")
                return False

            if not continuation_id:
                self.logger.error(" ❌ No continuation_id returned for new conversation")
                return False

            # Verify response mentions MCP or tools
            if "MCP" not in response and "tool" not in response.lower():
                self.logger.error(" ❌ Response doesn't seem to address the question about MCP tools")
                return False

            self.logger.info(f" ✅ New conversation created with continuation_id: {continuation_id}")
            self.new_continuation_id = continuation_id  # Store for next test
            return True

        except Exception as e:
            self.logger.error(f" ❌ New conversation test failed: {e}")
            return False

    def test_continue_existing_conversation(self) -> bool:
        """Test ChatSimple continues conversation with valid continuation_id"""
        try:
            self.logger.info(" 2. Test continuing existing conversation")

            if not hasattr(self, "new_continuation_id"):
                self.logger.error(" ❌ No continuation_id from previous test")
                return False

            # Continue the conversation
            response, continuation_id = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Can you give me a specific example of how an MCP tool might work?",
                    "continuation_id": self.new_continuation_id,
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response:
                self.logger.error(" ❌ Failed to continue conversation")
                return False

            # Continuation ID should be the same
            if continuation_id != self.new_continuation_id:
                self.logger.error(f" ❌ Continuation ID changed: {self.new_continuation_id} -> {continuation_id}")
                return False

            # Response should be contextual (mentioning previous discussion)
            if "example" not in response.lower():
                self.logger.error(" ❌ Response doesn't seem to provide an example as requested")
                return False

            self.logger.info(" ✅ Successfully continued conversation with same continuation_id")
            return True

        except Exception as e:
            self.logger.error(f" ❌ Continue conversation test failed: {e}")
            return False

    def test_file_handling_with_conversation(self) -> bool:
        """Test ChatSimple handles files correctly in conversation context"""
        try:
            self.logger.info(" 3. Test file handling with conversation")

            # Setup test files
            self.setup_test_files()

            # Start new conversation with a file
            response1, continuation_id = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Analyze this Python code and tell me what the Calculator class does",
                    "files": [self.test_files["python"]],
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error(" ❌ Failed to start conversation with file")
                return False

            # Continue with same file (should be deduplicated)
            response2, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. What methods does the Calculator class have?",
                    "files": [self.test_files["python"]],  # Same file
                    "continuation_id": continuation_id,
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response2:
                self.logger.error(" ❌ Failed to continue with same file")
                return False

            # Response should mention add and multiply methods
            if "add" not in response2.lower() or "multiply" not in response2.lower():
                self.logger.error(" ❌ Response doesn't mention Calculator methods")
                return False

            self.logger.info(" ✅ File handling with conversation working correctly")
            return True

        except Exception as e:
            self.logger.error(f" ❌ File handling test failed: {e}")
            return False
        finally:
            self.cleanup_test_files()

    def test_temperature_validation_edge_cases(self) -> bool:
        """Test temperature is corrected for model limits (too high/low)"""
        try:
            self.logger.info(" 4. Test temperature validation edge cases")

            # Test 1: Temperature exactly at limit (should work)
            response1, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Hello, this is a test with max temperature",
                    "model": "flash",
                    "temperature": 1.0,  # At the limit
                    "thinking_mode": "low",
                },
            )

            if not response1:
                self.logger.error(" ❌ Failed with temperature 1.0")
                return False

            # Test 2: Temperature at minimum (should work)
            response2, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Another test message with min temperature",
                    "model": "flash",
                    "temperature": 0.0,  # At minimum
                    "thinking_mode": "low",
                },
            )

            if not response2:
                self.logger.error(" ❌ Failed with temperature 0.0")
                return False

            # Test 3: Check that invalid temperatures are rejected by validation
            # This should result in an error response from the tool, not a crash
            try:
                response3, _ = self.call_mcp_tool_direct(
                    "chat",
                    {
                        "prompt": "Please use low thinking mode. Test with invalid temperature",
                        "model": "flash",
                        "temperature": 1.5,  # Too high - should be validated
                        "thinking_mode": "low",
                    },
                )

                # If we get here, check if it's an error response
                if response3 and "validation error" in response3.lower():
                    self.logger.info(" ✅ Invalid temperature properly rejected by validation")
                else:
                    self.logger.warning(" ⚠️ High temperature not properly validated")
            except Exception:
                # Expected - validation should reject this
                self.logger.info(" ✅ Invalid temperature properly rejected")

            self.logger.info(" ✅ Temperature validation working correctly")
            return True

        except Exception as e:
            self.logger.error(f" ❌ Temperature validation test failed: {e}")
            return False

    def test_image_limits_per_model(self) -> bool:
        """Test image validation respects model-specific limits"""
        try:
            self.logger.info(" 5. Test image limits per model")

            # Create test image data URLs (small base64 images)
            small_image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="

            # Test 1: Model that doesn't support images
            response1, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Can you see this image?",
                    "model": "local-llama",  # Text-only model
                    "images": [small_image],
                    "thinking_mode": "low",
                },
            )

            # Should get an error about image support
            if response1 and "does not support image" not in response1:
                self.logger.warning(" ⚠️ Model without image support didn't reject images properly")

            # Test 2: Too many images for a model
            many_images = [small_image] * 25  # Most models support max 20

            response2, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Analyze these images",
                    "model": "gemini-2.5-flash",  # Supports max 16 images
                    "images": many_images,
                    "thinking_mode": "low",
                },
            )

            # Should get an error about too many images
            if response2 and "too many images" not in response2.lower():
                self.logger.warning(" ⚠️ Model didn't reject excessive image count")

            # Test 3: Valid image count
            response3, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. This is a test with one image",
                    "model": "gemini-2.5-flash",
                    "images": [small_image],
                    "thinking_mode": "low",
                },
            )

            if not response3:
                self.logger.error(" ❌ Failed with valid image count")
                return False

            self.logger.info(" ✅ Image validation working correctly")
            return True

        except Exception as e:
            self.logger.error(f" ❌ Image limits test failed: {e}")
            return False

    def test_conversation_context_preservation(self) -> bool:
        """Test ChatSimple preserves context across turns"""
        try:
            self.logger.info(" 6. Test conversation context preservation")

            # Start conversation with specific context
            response1, continuation_id = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. My name is TestUser and I'm working on a Python project called TestProject",
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error(" ❌ Failed to start conversation")
                return False

            # Continue and reference previous context
            response2, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. What's my name and what project am I working on?",
                    "continuation_id": continuation_id,
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response2:
                self.logger.error(" ❌ Failed to continue conversation")
                return False

            # Check if context was preserved
            if "TestUser" not in response2 or "TestProject" not in response2:
                self.logger.error(" ❌ Context not preserved across conversation turns")
                self.logger.debug(f" Response: {response2[:200]}...")
                return False

            self.logger.info(" ✅ Conversation context preserved correctly")
            return True

        except Exception as e:
            self.logger.error(f" ❌ Context preservation test failed: {e}")
            return False

    def test_chat_with_images(self) -> bool:
        """Test ChatSimple handles images correctly in conversation"""
        try:
            self.logger.info(" 7. Test chat with images")

            # Create test image data URL (small base64 image)
            small_image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="

            # Start conversation with image
            response1, continuation_id = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. I'm sharing an image with you. Can you acknowledge that you received it?",
                    "images": [small_image],
                    "model": "gemini-2.5-flash",  # Model that supports images
                    "thinking_mode": "low",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error(" ❌ Failed to start conversation with image")
                return False

            # Verify response acknowledges the image
            if "image" not in response1.lower():
                self.logger.warning(" ⚠️ Response doesn't acknowledge receiving image")

            # Continue conversation referencing the image
            response2, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. What did you see in that image I shared earlier?",
                    "continuation_id": continuation_id,
                    "model": "gemini-2.5-flash",
                    "thinking_mode": "low",
                },
            )

            if not response2:
                self.logger.error(" ❌ Failed to continue conversation about image")
                return False

            # Test with multiple images
            multiple_images = [small_image, small_image]  # Two identical small images
            response3, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Here are two images for comparison",
                    "images": multiple_images,
                    "model": "gemini-2.5-flash",
                    "thinking_mode": "low",
                },
            )

            if not response3:
                self.logger.error(" ❌ Failed with multiple images")
                return False

            self.logger.info(" ✅ Chat with images working correctly")
            return True

        except Exception as e:
            self.logger.error(f" ❌ Chat with images test failed: {e}")
            return False

    def test_continued_chat_with_previous_files(self) -> bool:
        """Test continuing conversation where files were shared in previous turns"""
        try:
            self.logger.info(" 8. Test continued chat with files from previous turns")

            # Setup test files
            self.setup_test_files()

            # Start conversation with files
            response1, continuation_id = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Here are some files for you to analyze",
                    "files": [self.test_files["python"], self.test_files["config"]],
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error(" ❌ Failed to start conversation with files")
                return False

            # Continue conversation without new files (should remember previous files)
            response2, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. From the files I shared earlier, what types of files were there?",
                    "continuation_id": continuation_id,
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response2:
                self.logger.error(" ❌ Failed to continue conversation")
                return False

            # Check if response references the files from previous turn
            if "python" not in response2.lower() and "config" not in response2.lower():
                self.logger.warning(" ⚠️ Response doesn't reference previous files properly")

            # Continue with a different question about same files (should still remember them)
            response3, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Can you tell me what functions were defined in the Python file from our earlier discussion?",
                    "continuation_id": continuation_id,
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response3:
                self.logger.error(" ❌ Failed to continue conversation about Python file")
                return False

            # Should reference functions from the Python file (fibonacci, factorial, Calculator, etc.)
            response_lower = response3.lower()
            if not ("fibonacci" in response_lower or "factorial" in response_lower or "calculator" in response_lower):
                self.logger.warning(" ⚠️ Response doesn't reference Python file contents from earlier turn")

            self.logger.info(" ✅ Continued chat with previous files working correctly")
            return True

        except Exception as e:
            self.logger.error(f" ❌ Continued chat with files test failed: {e}")
            return False
        finally:
            self.cleanup_test_files()
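The image scenarios above expect the tool layer to return errors such as "does not support image" and "too many images" instead of calling the provider. A minimal sketch of that kind of capability check; the parameters and limits are assumptions for illustration, not the server's actual API:

def validate_images(images: list[str], model_name: str, supports_images: bool, max_images: int) -> str | None:
    """Return an error message when the image payload exceeds the model's limits, else None."""
    if images and not supports_images:
        return f"Model {model_name} does not support images"
    if len(images) > max_images:
        return f"Too many images: {len(images)} provided, {model_name} allows at most {max_images}"
    return None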
@@ -21,7 +21,12 @@ class CrossToolComprehensiveTest(ConversationBaseTest):

    def call_mcp_tool(self, tool_name: str, params: dict) -> tuple:
        """Call an MCP tool in-process"""
        response_text, continuation_id = self.call_mcp_tool_direct(tool_name, params)
        # Use the new method for workflow tools
        workflow_tools = ["analyze", "debug", "codereview", "precommit", "refactor", "thinkdeep"]
        if tool_name in workflow_tools:
            response_text, continuation_id = super().call_mcp_tool(tool_name, params)
        else:
            response_text, continuation_id = self.call_mcp_tool_direct(tool_name, params)
        return response_text, continuation_id

    @property
@@ -96,8 +101,12 @@ def hash_pwd(pwd):
        # Step 2: Use analyze tool to do deeper analysis (fresh conversation)
        self.logger.info(" Step 2: analyze tool - Deep code analysis (fresh)")
        analyze_params = {
            "files": [auth_file],
            "prompt": "Find vulnerabilities",
            "step": "Starting comprehensive code analysis to find security vulnerabilities in the authentication system",
            "step_number": 1,
            "total_steps": 2,
            "next_step_required": True,
            "findings": "Initial analysis will focus on security vulnerabilities in authentication code",
            "relevant_files": [auth_file],
            "thinking_mode": "low",
            "model": "flash",
        }
@@ -133,8 +142,12 @@ def hash_pwd(pwd):
        # Step 4: Use debug tool to identify specific issues
        self.logger.info(" Step 4: debug tool - Identify specific problems")
        debug_params = {
            "files": [auth_file, config_file_path],
            "prompt": "Fix auth issues",
            "step": "Starting debug investigation to identify and fix authentication security issues",
            "step_number": 1,
            "total_steps": 2,
            "next_step_required": True,
            "findings": "Investigating authentication vulnerabilities found in previous analysis",
            "relevant_files": [auth_file, config_file_path],
            "thinking_mode": "low",
            "model": "flash",
        }
@@ -153,9 +166,13 @@ def hash_pwd(pwd):
        if continuation_id4:
            self.logger.info(" Step 5: debug continuation - Additional analysis")
            debug_continue_params = {
                "step": "Continuing debug investigation to fix password hashing implementation",
                "step_number": 2,
                "total_steps": 2,
                "next_step_required": False,
                "findings": "Building on previous analysis to fix weak password hashing",
                "continuation_id": continuation_id4,
                "files": [auth_file, config_file_path],
                "prompt": "Fix password hashing",
                "relevant_files": [auth_file, config_file_path],
                "thinking_mode": "low",
                "model": "flash",
            }
@@ -168,8 +185,12 @@ def hash_pwd(pwd):
        # Step 6: Use codereview for comprehensive review
        self.logger.info(" Step 6: codereview tool - Comprehensive code review")
        codereview_params = {
            "files": [auth_file, config_file_path],
            "prompt": "Security review",
            "step": "Starting comprehensive security code review of authentication system",
            "step_number": 1,
            "total_steps": 2,
            "next_step_required": True,
            "findings": "Performing thorough security review of authentication code and configuration",
            "relevant_files": [auth_file, config_file_path],
            "thinking_mode": "low",
            "model": "flash",
        }
@@ -201,9 +222,13 @@ def secure_login(user, pwd):
        improved_file = self.create_additional_test_file("auth_improved.py", improved_code)

        precommit_params = {
            "step": "Starting pre-commit validation of improved authentication code",
            "step_number": 1,
            "total_steps": 2,
            "next_step_required": True,
            "findings": "Validating improved authentication implementation before commit",
            "path": self.test_dir,
            "files": [auth_file, config_file_path, improved_file],
            "prompt": "Ready to commit",
            "relevant_files": [auth_file, config_file_path, improved_file],
            "thinking_mode": "low",
            "model": "flash",
        }
simulator_tests/test_prompt_size_limit_bug.py (new file, 206 lines)
@@ -0,0 +1,206 @@
#!/usr/bin/env python3
"""
Prompt Size Limit Bug Test

This test reproduces a critical bug where the prompt size limit check
incorrectly includes conversation history when validating incoming prompts
from Claude to MCP. The limit should ONLY apply to the actual prompt text
sent by the user, not the entire conversation context.

Bug Scenario:
- User starts a conversation with chat tool
- Continues conversation multiple times (building up history)
- On subsequent continuation, a short prompt (150 chars) triggers
  "resend_prompt" error claiming >50k characters

Expected Behavior:
- Only count the actual prompt parameter for size limit
- Conversation history should NOT count toward prompt size limit
- Only the user's actual input should be validated against 50k limit
"""

from .conversation_base_test import ConversationBaseTest


class PromptSizeLimitBugTest(ConversationBaseTest):
    """Test to reproduce and verify fix for prompt size limit bug"""

    @property
    def test_name(self) -> str:
        return "prompt_size_limit_bug"

    @property
    def test_description(self) -> str:
        return "Reproduce prompt size limit bug with conversation continuation"

    def run_test(self) -> bool:
        """Test prompt size limit bug reproduction using in-process calls"""
        try:
            self.logger.info("🐛 Test: Prompt size limit bug reproduction (in-process)")

            # Setup test environment
            self.setUp()

            # Create a test file to provide context
            test_file_content = """
# Test SwiftUI-like Framework Implementation

struct ContentView: View {
    @State private var counter = 0

    var body: some View {
        VStack {
            Text("Count: \\(counter)")
            Button("Increment") {
                counter += 1
            }
        }
    }
}

class Renderer {
    static let shared = Renderer()

    func render(view: View) {
        // Implementation details for UIKit/AppKit rendering
    }
}

protocol View {
    var body: some View { get }
}
"""
            test_file_path = self.create_additional_test_file("SwiftFramework.swift", test_file_content)

            # Step 1: Start initial conversation
            self.logger.info(" Step 1: Start conversation with initial context")

            initial_prompt = "I'm building a SwiftUI-like framework. Can you help me design the architecture?"

            response1, continuation_id = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": initial_prompt,
                    "files": [test_file_path],
                    "model": "flash",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error(" ❌ Failed to start initial conversation")
                return False

            self.logger.info(f" ✅ Initial conversation started: {continuation_id[:8]}...")

            # Step 2: Continue conversation multiple times to build substantial history
            conversation_prompts = [
                "That's helpful! Can you elaborate on the View protocol design?",
                "How should I implement the State property wrapper?",
                "What's the best approach for the VStack layout implementation?",
                "Should I use UIKit directly or create an abstraction layer?",
                "Smart approach! For the rendering layer, would you suggest UIKit/AppKit directly?",
            ]

            for i, prompt in enumerate(conversation_prompts, 2):
                self.logger.info(f" Step {i}: Continue conversation (exchange {i})")

                response, _ = self.call_mcp_tool_direct(
                    "chat",
                    {
                        "prompt": prompt,
                        "continuation_id": continuation_id,
                        "model": "flash",
                    },
                )

                if not response:
                    self.logger.error(f" ❌ Failed at exchange {i}")
                    return False

                self.logger.info(f" ✅ Exchange {i} completed")

            # Step 3: Send short prompt that should NOT trigger size limit
            self.logger.info(" Step 7: Send short prompt (should NOT trigger size limit)")

            # This is a very short prompt - should not trigger the bug after fix
            short_prompt = "Thanks! This gives me a solid foundation to start prototyping."

            self.logger.info(f" Short prompt length: {len(short_prompt)} characters")

            response_final, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": short_prompt,
                    "continuation_id": continuation_id,
                    "model": "flash",
                },
            )

            if not response_final:
                self.logger.error(" ❌ Final short prompt failed")
                return False

            # Parse the response to check for the bug
            import json

            try:
                response_data = json.loads(response_final)
                status = response_data.get("status", "")

                if status == "resend_prompt":
                    # This is the bug! Short prompt incorrectly triggering size limit
                    metadata = response_data.get("metadata", {})
                    prompt_size = metadata.get("prompt_size", 0)

                    self.logger.error(
                        f" 🐛 BUG STILL EXISTS: Short prompt ({len(short_prompt)} chars) triggered resend_prompt"
                    )
                    self.logger.error(f" Reported prompt_size: {prompt_size} (should be ~{len(short_prompt)})")
                    self.logger.error(" This indicates conversation history is still being counted")

                    return False  # Bug still exists

                elif status in ["success", "continuation_available"]:
                    self.logger.info(" ✅ Short prompt processed correctly - bug appears to be FIXED!")
                    self.logger.info(f" Prompt length: {len(short_prompt)} chars, Status: {status}")
                    return True

                else:
                    self.logger.warning(f" ⚠️ Unexpected status: {status}")
                    # Check if this might be a non-JSON response (successful execution)
                    if len(response_final) > 0 and not response_final.startswith('{"'):
                        self.logger.info(" ✅ Non-JSON response suggests successful tool execution")
                        return True
                    return False

            except json.JSONDecodeError:
                # Non-JSON response often means successful tool execution
                self.logger.info(" ✅ Non-JSON response suggests successful tool execution (bug likely fixed)")
                self.logger.debug(f" Response preview: {response_final[:200]}...")
                return True

        except Exception as e:
            self.logger.error(f"Prompt size limit bug test failed: {e}")
            import traceback

            self.logger.debug(f"Full traceback: {traceback.format_exc()}")
            return False


def main():
    """Run the prompt size limit bug test"""
    import sys

    verbose = "--verbose" in sys.argv or "-v" in sys.argv
    test = PromptSizeLimitBugTest(verbose=verbose)

    success = test.run_test()
    if success:
        print("Bug reproduction test completed - check logs for details")
    else:
        print("Test failed to complete")
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()
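The docstring above spells out the intended behavior: only the incoming prompt parameter counts toward the 50k-character limit, never the rebuilt conversation history. A minimal sketch of that check; the constant name and the exact payload shape are assumptions based on the fields this test reads back:

MCP_PROMPT_SIZE_LIMIT = 50_000  # assumed constant; the real limit lives in the server configuration

def check_prompt_size(user_prompt: str) -> dict | None:
    """Return a resend_prompt payload only when the user's own prompt text is too large."""
    if len(user_prompt) > MCP_PROMPT_SIZE_LIMIT:
        return {
            "status": "resend_prompt",
            "metadata": {"prompt_size": len(user_prompt), "limit": MCP_PROMPT_SIZE_LIMIT},
        }
    return None  # conversation history is deliberately not counted here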
@@ -947,37 +947,37 @@ class DataContainer:
            return False

    def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
        """Call an MCP tool in-process - override for refactorworkflow-specific response handling"""
        """Call an MCP tool in-process - override for refactor-specific response handling"""
        # Use in-process implementation to maintain conversation memory
        response_text, _ = self.call_mcp_tool_direct(tool_name, params)

        if not response_text:
            return None, None

        # Extract continuation_id from refactorworkflow response specifically
        continuation_id = self._extract_refactorworkflow_continuation_id(response_text)
        # Extract continuation_id from refactor response specifically
        continuation_id = self._extract_refactor_continuation_id(response_text)

        return response_text, continuation_id

    def _extract_refactorworkflow_continuation_id(self, response_text: str) -> Optional[str]:
        """Extract continuation_id from refactorworkflow response"""
    def _extract_refactor_continuation_id(self, response_text: str) -> Optional[str]:
        """Extract continuation_id from refactor response"""
        try:
            # Parse the response
            response_data = json.loads(response_text)
            return response_data.get("continuation_id")

        except json.JSONDecodeError as e:
            self.logger.debug(f"Failed to parse response for refactorworkflow continuation_id: {e}")
            self.logger.debug(f"Failed to parse response for refactor continuation_id: {e}")
            return None

    def _parse_refactor_response(self, response_text: str) -> dict:
        """Parse refactorworkflow tool JSON response"""
        """Parse refactor tool JSON response"""
        try:
            # Parse the response - it should be direct JSON
            return json.loads(response_text)

        except json.JSONDecodeError as e:
            self.logger.error(f"Failed to parse refactorworkflow response as JSON: {e}")
            self.logger.error(f"Failed to parse refactor response as JSON: {e}")
            self.logger.error(f"Response text: {response_text[:500]}...")
            return {}

@@ -989,7 +989,7 @@ class DataContainer:
        expected_next_required: bool,
        expected_status: str,
    ) -> bool:
        """Validate a refactorworkflow investigation step response structure"""
        """Validate a refactor investigation step response structure"""
        try:
            # Check status
            if response_data.get("status") != expected_status:
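The last hunk is cut off inside the step-validation helper. A sketch of what such a check typically covers in these tests, assuming the response fields used elsewhere in this diff (status, step_number, next_step_required); the helper name and exact checks are illustrative:

def validate_step_response(response_data: dict, expected_step: int, expected_next_required: bool, expected_status: str) -> bool:
    # Mirror the signature shown in the truncated hunk above
    if response_data.get("status") != expected_status:
        return False
    if response_data.get("step_number") != expected_step:
        return False
    if response_data.get("next_step_required") != expected_next_required:
        return False
    return True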