Add DocGen tool with comprehensive documentation generation capabilities (#109)
* WIP: new workflow architecture
* WIP: further improvements and cleanup
* WIP: cleanup and docs, replace old tool with new
* WIP: new planner implementation using workflow
* WIP: precommit tool working as a workflow instead of a basic tool; support for passing False to use_assistant_model to skip external models completely and use Claude only
* WIP: precommit workflow version swapped with old
* WIP: codereview
* WIP: replaced codereview
* WIP: replaced refactor
* WIP: workflow for thinkdeep
* WIP: ensure files get embedded correctly
* WIP: thinkdeep replaced with workflow version
* WIP: improved messaging when an external model's response is received
* WIP: analyze tool swapped
* WIP: updated tests
* Extract only the content when building history
* Use "relevant_files" for workflow tools only
* WIP: fixed get_completion_next_steps_message missing param
* Fixed tests; request for files consistently
* New testgen workflow tool; updated docs
* Swap testgen workflow
* Fix CI test failures by excluding API-dependent tests: update GitHub Actions workflow to exclude simulation tests that require API keys, fix collaboration tests to properly mock workflow tool expert analysis calls, update test assertions to handle the new workflow tool response format, and ensure unit tests run without external API dependencies in CI
* WIP: update tests to match new tools
* Should help with https://github.com/BeehiveInnovations/zen-mcp-server/issues/97; clear Python cache when running the script (https://github.com/BeehiveInnovations/zen-mcp-server/issues/96); improved retry error logging; cleanup
* WIP: chat tool using new architecture and improved code sharing
* Removed todo
* Cleanup old name
* Tweak wordings; migrate old tests
* Support for Flash 2.0 and Flash Lite 2.0; fixed test
* Improved consensus to use the workflow base class
* Allow images
* Replaced old consensus tool
* Cleanup tests
* Tests for prompt size
* New tool: docgen; fixes https://github.com/BeehiveInnovations/zen-mcp-server/issues/107; use available token size limits (https://github.com/BeehiveInnovations/zen-mcp-server/issues/105)
* Improved docgen prompt; exclude TestGen from pytest inclusion
* Updated errors
* Lint
* DocGen instructed not to fix bugs, surface them and stick to documentation
* Stop Claude from being lazy and only documenting a small handful
* More style rules

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-authored-by: Claude <noreply@anthropic.com>
Committed by: GitHub
Parent: 0655590a51
Commit: c960bcb720
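The workflow-style tools referenced throughout this commit take step-based arguments instead of a single prompt. A minimal sketch of one investigation step, using the field names the simulator tests below pass to analyze/debug/codereview; the exact request schema lives in the tool implementations and is assumed here:

step_request = {
    "step": "Starting comprehensive code analysis to find security issues",  # what this step does
    "step_number": 1,               # current step in the investigation
    "total_steps": 2,               # planned number of steps
    "next_step_required": True,     # False on the final step
    "findings": "Initial findings recorded for this step",
    "relevant_files": ["/path/to/code.py"],  # workflow tools use relevant_files, per this commit
    "model": "flash",
    "thinking_mode": "low",
    # "use_assistant_model": False,  # per the commit message: skip the external model, use Claude only
}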
@@ -8,6 +8,7 @@ Each test is in its own file for better organization and maintainability.
from .base_test import BaseSimulatorTest
from .test_analyze_validation import AnalyzeValidationTest
from .test_basic_conversation import BasicConversationTest
from .test_chat_simple_validation import ChatSimpleValidationTest
from .test_codereview_validation import CodeReviewValidationTest
from .test_consensus_conversation import TestConsensusConversation
from .test_consensus_stance import TestConsensusStance
@@ -30,6 +31,7 @@ from .test_per_tool_deduplication import PerToolDeduplicationTest
from .test_planner_continuation_history import PlannerContinuationHistoryTest
from .test_planner_validation import PlannerValidationTest
from .test_precommitworkflow_validation import PrecommitWorkflowValidationTest
from .test_prompt_size_limit_bug import PromptSizeLimitBugTest

# Redis validation test removed - no longer needed for standalone server
from .test_refactor_validation import RefactorValidationTest
@@ -42,6 +44,7 @@ from .test_xai_models import XAIModelsTest
# Test registry for dynamic loading
TEST_REGISTRY = {
    "basic_conversation": BasicConversationTest,
    "chat_validation": ChatSimpleValidationTest,
    "codereview_validation": CodeReviewValidationTest,
    "content_validation": ContentValidationTest,
    "per_tool_deduplication": PerToolDeduplicationTest,
@@ -71,12 +74,14 @@ TEST_REGISTRY = {
    "consensus_stance": TestConsensusStance,
    "consensus_three_models": TestConsensusThreeModels,
    "analyze_validation": AnalyzeValidationTest,
    "prompt_size_limit_bug": PromptSizeLimitBugTest,
    # "o3_pro_expensive": O3ProExpensiveTest,  # COMMENTED OUT - too expensive to run by default
}

__all__ = [
    "BaseSimulatorTest",
    "BasicConversationTest",
    "ChatSimpleValidationTest",
    "CodeReviewValidationTest",
    "ContentValidationTest",
    "PerToolDeduplicationTest",
@@ -106,5 +111,6 @@ __all__ = [
    "TestConsensusStance",
    "TestConsensusThreeModels",
    "AnalyzeValidationTest",
    "PromptSizeLimitBugTest",
    "TEST_REGISTRY",
]
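The TEST_REGISTRY added above is what the simulator runner uses for dynamic loading. A minimal sketch of looking a test up by name and running it; run_named_test is illustrative and not part of the repository:

from simulator_tests import TEST_REGISTRY

def run_named_test(name: str, verbose: bool = False) -> bool:
    # e.g. name = "chat_validation" or "prompt_size_limit_bug"
    test_cls = TEST_REGISTRY[name]
    test = test_cls(verbose=verbose)  # constructor mirrors main() in test_prompt_size_limit_bug.py below
    return test.run_test()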
simulator_tests/test_chat_simple_validation.py (new file, 509 lines)
@@ -0,0 +1,509 @@
#!/usr/bin/env python3
"""
Chat Simple Tool Validation Test

Comprehensive test for the new ChatSimple tool implementation that validates:
- Basic conversation flow without continuation_id (new chats)
- Continuing existing conversations with continuation_id (continued chats)
- File handling with conversation context (chats with files)
- Image handling in conversations (chat with images)
- Continuing conversations with files from previous turns (continued chats with files previously)
- Temperature validation for different models
- Image limit validation per model
- Conversation context preservation across turns
"""


from .conversation_base_test import ConversationBaseTest


class ChatSimpleValidationTest(ConversationBaseTest):
    """Test ChatSimple tool functionality and validation"""

    @property
    def test_name(self) -> str:
        return "_validation"

    @property
    def test_description(self) -> str:
        return "Comprehensive validation of ChatSimple tool implementation"

    def run_test(self) -> bool:
        """Run comprehensive ChatSimple validation tests"""
        try:
            # Set up the test environment for in-process testing
            self.setUp()

            self.logger.info("Test: ChatSimple tool validation")

            # Run all test scenarios
            if not self.test_new_conversation_no_continuation():
                return False

            if not self.test_continue_existing_conversation():
                return False

            if not self.test_file_handling_with_conversation():
                return False

            if not self.test_temperature_validation_edge_cases():
                return False

            if not self.test_image_limits_per_model():
                return False

            if not self.test_conversation_context_preservation():
                return False

            if not self.test_chat_with_images():
                return False

            if not self.test_continued_chat_with_previous_files():
                return False

            self.logger.info(" ✅ All ChatSimple validation tests passed")
            return True

        except Exception as e:
            self.logger.error(f"ChatSimple validation test failed: {e}")
            return False

    def test_new_conversation_no_continuation(self) -> bool:
        """Test ChatSimple creates new conversation without continuation_id"""
        try:
            self.logger.info(" 1. Test new conversation without continuation_id")

            # Call chat without continuation_id
            response, continuation_id = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Hello! Please use low thinking mode. Can you explain what MCP tools are?",
                    "model": "flash",
                    "temperature": 0.7,
                    "thinking_mode": "low",
                },
            )

            if not response:
                self.logger.error(" ❌ Failed to get response from chat")
                return False

            if not continuation_id:
                self.logger.error(" ❌ No continuation_id returned for new conversation")
                return False

            # Verify response mentions MCP or tools
            if "MCP" not in response and "tool" not in response.lower():
                self.logger.error(" ❌ Response doesn't seem to address the question about MCP tools")
                return False

            self.logger.info(f" ✅ New conversation created with continuation_id: {continuation_id}")
            self.new_continuation_id = continuation_id  # Store for next test
            return True

        except Exception as e:
            self.logger.error(f" ❌ New conversation test failed: {e}")
            return False

    def test_continue_existing_conversation(self) -> bool:
        """Test ChatSimple continues conversation with valid continuation_id"""
        try:
            self.logger.info(" 2. Test continuing existing conversation")

            if not hasattr(self, "new_continuation_id"):
                self.logger.error(" ❌ No continuation_id from previous test")
                return False

            # Continue the conversation
            response, continuation_id = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Can you give me a specific example of how an MCP tool might work?",
                    "continuation_id": self.new_continuation_id,
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response:
                self.logger.error(" ❌ Failed to continue conversation")
                return False

            # Continuation ID should be the same
            if continuation_id != self.new_continuation_id:
                self.logger.error(f" ❌ Continuation ID changed: {self.new_continuation_id} -> {continuation_id}")
                return False

            # Response should be contextual (mentioning previous discussion)
            if "example" not in response.lower():
                self.logger.error(" ❌ Response doesn't seem to provide an example as requested")
                return False

            self.logger.info(" ✅ Successfully continued conversation with same continuation_id")
            return True

        except Exception as e:
            self.logger.error(f" ❌ Continue conversation test failed: {e}")
            return False

    def test_file_handling_with_conversation(self) -> bool:
        """Test ChatSimple handles files correctly in conversation context"""
        try:
            self.logger.info(" 3. Test file handling with conversation")

            # Setup test files
            self.setup_test_files()

            # Start new conversation with a file
            response1, continuation_id = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Analyze this Python code and tell me what the Calculator class does",
                    "files": [self.test_files["python"]],
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error(" ❌ Failed to start conversation with file")
                return False

            # Continue with same file (should be deduplicated)
            response2, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. What methods does the Calculator class have?",
                    "files": [self.test_files["python"]],  # Same file
                    "continuation_id": continuation_id,
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response2:
                self.logger.error(" ❌ Failed to continue with same file")
                return False

            # Response should mention add and multiply methods
            if "add" not in response2.lower() or "multiply" not in response2.lower():
                self.logger.error(" ❌ Response doesn't mention Calculator methods")
                return False

            self.logger.info(" ✅ File handling with conversation working correctly")
            return True

        except Exception as e:
            self.logger.error(f" ❌ File handling test failed: {e}")
            return False
        finally:
            self.cleanup_test_files()

    def test_temperature_validation_edge_cases(self) -> bool:
        """Test temperature is corrected for model limits (too high/low)"""
        try:
            self.logger.info(" 4. Test temperature validation edge cases")

            # Test 1: Temperature exactly at limit (should work)
            response1, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Hello, this is a test with max temperature",
                    "model": "flash",
                    "temperature": 1.0,  # At the limit
                    "thinking_mode": "low",
                },
            )

            if not response1:
                self.logger.error(" ❌ Failed with temperature 1.0")
                return False

            # Test 2: Temperature at minimum (should work)
            response2, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Another test message with min temperature",
                    "model": "flash",
                    "temperature": 0.0,  # At minimum
                    "thinking_mode": "low",
                },
            )

            if not response2:
                self.logger.error(" ❌ Failed with temperature 0.0")
                return False

            # Test 3: Check that invalid temperatures are rejected by validation
            # This should result in an error response from the tool, not a crash
            try:
                response3, _ = self.call_mcp_tool_direct(
                    "chat",
                    {
                        "prompt": "Please use low thinking mode. Test with invalid temperature",
                        "model": "flash",
                        "temperature": 1.5,  # Too high - should be validated
                        "thinking_mode": "low",
                    },
                )

                # If we get here, check if it's an error response
                if response3 and "validation error" in response3.lower():
                    self.logger.info(" ✅ Invalid temperature properly rejected by validation")
                else:
                    self.logger.warning(" ⚠️ High temperature not properly validated")
            except Exception:
                # Expected - validation should reject this
                self.logger.info(" ✅ Invalid temperature properly rejected")

            self.logger.info(" ✅ Temperature validation working correctly")
            return True

        except Exception as e:
            self.logger.error(f" ❌ Temperature validation test failed: {e}")
            return False

    def test_image_limits_per_model(self) -> bool:
        """Test image validation respects model-specific limits"""
        try:
            self.logger.info(" 5. Test image limits per model")

            # Create test image data URLs (small base64 images)
            small_image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="

            # Test 1: Model that doesn't support images
            response1, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Can you see this image?",
                    "model": "local-llama",  # Text-only model
                    "images": [small_image],
                    "thinking_mode": "low",
                },
            )

            # Should get an error about image support
            if response1 and "does not support image" not in response1:
                self.logger.warning(" ⚠️ Model without image support didn't reject images properly")

            # Test 2: Too many images for a model
            many_images = [small_image] * 25  # Most models support max 20

            response2, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Analyze these images",
                    "model": "gemini-2.5-flash",  # Supports max 16 images
                    "images": many_images,
                    "thinking_mode": "low",
                },
            )

            # Should get an error about too many images
            if response2 and "too many images" not in response2.lower():
                self.logger.warning(" ⚠️ Model didn't reject excessive image count")

            # Test 3: Valid image count
            response3, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. This is a test with one image",
                    "model": "gemini-2.5-flash",
                    "images": [small_image],
                    "thinking_mode": "low",
                },
            )

            if not response3:
                self.logger.error(" ❌ Failed with valid image count")
                return False

            self.logger.info(" ✅ Image validation working correctly")
            return True

        except Exception as e:
            self.logger.error(f" ❌ Image limits test failed: {e}")
            return False

    def test_conversation_context_preservation(self) -> bool:
        """Test ChatSimple preserves context across turns"""
        try:
            self.logger.info(" 6. Test conversation context preservation")

            # Start conversation with specific context
            response1, continuation_id = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. My name is TestUser and I'm working on a Python project called TestProject",
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error(" ❌ Failed to start conversation")
                return False

            # Continue and reference previous context
            response2, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. What's my name and what project am I working on?",
                    "continuation_id": continuation_id,
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response2:
                self.logger.error(" ❌ Failed to continue conversation")
                return False

            # Check if context was preserved
            if "TestUser" not in response2 or "TestProject" not in response2:
                self.logger.error(" ❌ Context not preserved across conversation turns")
                self.logger.debug(f" Response: {response2[:200]}...")
                return False

            self.logger.info(" ✅ Conversation context preserved correctly")
            return True

        except Exception as e:
            self.logger.error(f" ❌ Context preservation test failed: {e}")
            return False

    def test_chat_with_images(self) -> bool:
        """Test ChatSimple handles images correctly in conversation"""
        try:
            self.logger.info(" 7. Test chat with images")

            # Create test image data URL (small base64 image)
            small_image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="

            # Start conversation with image
            response1, continuation_id = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. I'm sharing an image with you. Can you acknowledge that you received it?",
                    "images": [small_image],
                    "model": "gemini-2.5-flash",  # Model that supports images
                    "thinking_mode": "low",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error(" ❌ Failed to start conversation with image")
                return False

            # Verify response acknowledges the image
            if "image" not in response1.lower():
                self.logger.warning(" ⚠️ Response doesn't acknowledge receiving image")

            # Continue conversation referencing the image
            response2, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. What did you see in that image I shared earlier?",
                    "continuation_id": continuation_id,
                    "model": "gemini-2.5-flash",
                    "thinking_mode": "low",
                },
            )

            if not response2:
                self.logger.error(" ❌ Failed to continue conversation about image")
                return False

            # Test with multiple images
            multiple_images = [small_image, small_image]  # Two identical small images
            response3, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Here are two images for comparison",
                    "images": multiple_images,
                    "model": "gemini-2.5-flash",
                    "thinking_mode": "low",
                },
            )

            if not response3:
                self.logger.error(" ❌ Failed with multiple images")
                return False

            self.logger.info(" ✅ Chat with images working correctly")
            return True

        except Exception as e:
            self.logger.error(f" ❌ Chat with images test failed: {e}")
            return False

    def test_continued_chat_with_previous_files(self) -> bool:
        """Test continuing conversation where files were shared in previous turns"""
        try:
            self.logger.info(" 8. Test continued chat with files from previous turns")

            # Setup test files
            self.setup_test_files()

            # Start conversation with files
            response1, continuation_id = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Here are some files for you to analyze",
                    "files": [self.test_files["python"], self.test_files["config"]],
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error(" ❌ Failed to start conversation with files")
                return False

            # Continue conversation without new files (should remember previous files)
            response2, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. From the files I shared earlier, what types of files were there?",
                    "continuation_id": continuation_id,
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response2:
                self.logger.error(" ❌ Failed to continue conversation")
                return False

            # Check if response references the files from previous turn
            if "python" not in response2.lower() and "config" not in response2.lower():
                self.logger.warning(" ⚠️ Response doesn't reference previous files properly")

            # Continue with a different question about same files (should still remember them)
            response3, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Can you tell me what functions were defined in the Python file from our earlier discussion?",
                    "continuation_id": continuation_id,
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response3:
                self.logger.error(" ❌ Failed to continue conversation about Python file")
                return False

            # Should reference functions from the Python file (fibonacci, factorial, Calculator, etc.)
            response_lower = response3.lower()
            if not ("fibonacci" in response_lower or "factorial" in response_lower or "calculator" in response_lower):
                self.logger.warning(" ⚠️ Response doesn't reference Python file contents from earlier turn")

            self.logger.info(" ✅ Continued chat with previous files working correctly")
            return True

        except Exception as e:
            self.logger.error(f" ❌ Continued chat with files test failed: {e}")
            return False
        finally:
            self.cleanup_test_files()
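The image scenarios above expect the tool layer to return errors such as "does not support image" and "too many images" instead of calling the provider. A minimal sketch of that kind of capability check; the parameters and limits are assumptions for illustration, not the server's actual API:

def validate_images(images: list[str], model_name: str, supports_images: bool, max_images: int) -> str | None:
    """Return an error message when the image payload exceeds the model's limits, else None."""
    if images and not supports_images:
        return f"Model {model_name} does not support images"
    if len(images) > max_images:
        return f"Too many images: {len(images)} provided, {model_name} allows at most {max_images}"
    return None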
@@ -21,7 +21,12 @@ class CrossToolComprehensiveTest(ConversationBaseTest):

    def call_mcp_tool(self, tool_name: str, params: dict) -> tuple:
        """Call an MCP tool in-process"""
        response_text, continuation_id = self.call_mcp_tool_direct(tool_name, params)
        # Use the new method for workflow tools
        workflow_tools = ["analyze", "debug", "codereview", "precommit", "refactor", "thinkdeep"]
        if tool_name in workflow_tools:
            response_text, continuation_id = super().call_mcp_tool(tool_name, params)
        else:
            response_text, continuation_id = self.call_mcp_tool_direct(tool_name, params)
        return response_text, continuation_id

    @property
@@ -96,8 +101,12 @@ def hash_pwd(pwd):
        # Step 2: Use analyze tool to do deeper analysis (fresh conversation)
        self.logger.info(" Step 2: analyze tool - Deep code analysis (fresh)")
        analyze_params = {
            "files": [auth_file],
            "prompt": "Find vulnerabilities",
            "step": "Starting comprehensive code analysis to find security vulnerabilities in the authentication system",
            "step_number": 1,
            "total_steps": 2,
            "next_step_required": True,
            "findings": "Initial analysis will focus on security vulnerabilities in authentication code",
            "relevant_files": [auth_file],
            "thinking_mode": "low",
            "model": "flash",
        }
@@ -133,8 +142,12 @@ def hash_pwd(pwd):
        # Step 4: Use debug tool to identify specific issues
        self.logger.info(" Step 4: debug tool - Identify specific problems")
        debug_params = {
            "files": [auth_file, config_file_path],
            "prompt": "Fix auth issues",
            "step": "Starting debug investigation to identify and fix authentication security issues",
            "step_number": 1,
            "total_steps": 2,
            "next_step_required": True,
            "findings": "Investigating authentication vulnerabilities found in previous analysis",
            "relevant_files": [auth_file, config_file_path],
            "thinking_mode": "low",
            "model": "flash",
        }
@@ -153,9 +166,13 @@ def hash_pwd(pwd):
        if continuation_id4:
            self.logger.info(" Step 5: debug continuation - Additional analysis")
            debug_continue_params = {
                "step": "Continuing debug investigation to fix password hashing implementation",
                "step_number": 2,
                "total_steps": 2,
                "next_step_required": False,
                "findings": "Building on previous analysis to fix weak password hashing",
                "continuation_id": continuation_id4,
                "files": [auth_file, config_file_path],
                "prompt": "Fix password hashing",
                "relevant_files": [auth_file, config_file_path],
                "thinking_mode": "low",
                "model": "flash",
            }
@@ -168,8 +185,12 @@ def hash_pwd(pwd):
        # Step 6: Use codereview for comprehensive review
        self.logger.info(" Step 6: codereview tool - Comprehensive code review")
        codereview_params = {
            "files": [auth_file, config_file_path],
            "prompt": "Security review",
            "step": "Starting comprehensive security code review of authentication system",
            "step_number": 1,
            "total_steps": 2,
            "next_step_required": True,
            "findings": "Performing thorough security review of authentication code and configuration",
            "relevant_files": [auth_file, config_file_path],
            "thinking_mode": "low",
            "model": "flash",
        }
@@ -201,9 +222,13 @@ def secure_login(user, pwd):
        improved_file = self.create_additional_test_file("auth_improved.py", improved_code)

        precommit_params = {
            "step": "Starting pre-commit validation of improved authentication code",
            "step_number": 1,
            "total_steps": 2,
            "next_step_required": True,
            "findings": "Validating improved authentication implementation before commit",
            "path": self.test_dir,
            "files": [auth_file, config_file_path, improved_file],
            "prompt": "Ready to commit",
            "relevant_files": [auth_file, config_file_path, improved_file],
            "thinking_mode": "low",
            "model": "flash",
        }
simulator_tests/test_prompt_size_limit_bug.py (new file, 206 lines)
@@ -0,0 +1,206 @@
#!/usr/bin/env python3
"""
Prompt Size Limit Bug Test

This test reproduces a critical bug where the prompt size limit check
incorrectly includes conversation history when validating incoming prompts
from Claude to MCP. The limit should ONLY apply to the actual prompt text
sent by the user, not the entire conversation context.

Bug Scenario:
- User starts a conversation with chat tool
- Continues conversation multiple times (building up history)
- On subsequent continuation, a short prompt (150 chars) triggers
  "resend_prompt" error claiming >50k characters

Expected Behavior:
- Only count the actual prompt parameter for size limit
- Conversation history should NOT count toward prompt size limit
- Only the user's actual input should be validated against 50k limit
"""

from .conversation_base_test import ConversationBaseTest


class PromptSizeLimitBugTest(ConversationBaseTest):
    """Test to reproduce and verify fix for prompt size limit bug"""

    @property
    def test_name(self) -> str:
        return "prompt_size_limit_bug"

    @property
    def test_description(self) -> str:
        return "Reproduce prompt size limit bug with conversation continuation"

    def run_test(self) -> bool:
        """Test prompt size limit bug reproduction using in-process calls"""
        try:
            self.logger.info("🐛 Test: Prompt size limit bug reproduction (in-process)")

            # Setup test environment
            self.setUp()

            # Create a test file to provide context
            test_file_content = """
# Test SwiftUI-like Framework Implementation

struct ContentView: View {
    @State private var counter = 0

    var body: some View {
        VStack {
            Text("Count: \\(counter)")
            Button("Increment") {
                counter += 1
            }
        }
    }
}

class Renderer {
    static let shared = Renderer()

    func render(view: View) {
        // Implementation details for UIKit/AppKit rendering
    }
}

protocol View {
    var body: some View { get }
}
"""
            test_file_path = self.create_additional_test_file("SwiftFramework.swift", test_file_content)

            # Step 1: Start initial conversation
            self.logger.info(" Step 1: Start conversation with initial context")

            initial_prompt = "I'm building a SwiftUI-like framework. Can you help me design the architecture?"

            response1, continuation_id = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": initial_prompt,
                    "files": [test_file_path],
                    "model": "flash",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error(" ❌ Failed to start initial conversation")
                return False

            self.logger.info(f" ✅ Initial conversation started: {continuation_id[:8]}...")

            # Step 2: Continue conversation multiple times to build substantial history
            conversation_prompts = [
                "That's helpful! Can you elaborate on the View protocol design?",
                "How should I implement the State property wrapper?",
                "What's the best approach for the VStack layout implementation?",
                "Should I use UIKit directly or create an abstraction layer?",
                "Smart approach! For the rendering layer, would you suggest UIKit/AppKit directly?",
            ]

            for i, prompt in enumerate(conversation_prompts, 2):
                self.logger.info(f" Step {i}: Continue conversation (exchange {i})")

                response, _ = self.call_mcp_tool_direct(
                    "chat",
                    {
                        "prompt": prompt,
                        "continuation_id": continuation_id,
                        "model": "flash",
                    },
                )

                if not response:
                    self.logger.error(f" ❌ Failed at exchange {i}")
                    return False

                self.logger.info(f" ✅ Exchange {i} completed")

            # Step 3: Send short prompt that should NOT trigger size limit
            self.logger.info(" Step 7: Send short prompt (should NOT trigger size limit)")

            # This is a very short prompt - should not trigger the bug after fix
            short_prompt = "Thanks! This gives me a solid foundation to start prototyping."

            self.logger.info(f" Short prompt length: {len(short_prompt)} characters")

            response_final, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": short_prompt,
                    "continuation_id": continuation_id,
                    "model": "flash",
                },
            )

            if not response_final:
                self.logger.error(" ❌ Final short prompt failed")
                return False

            # Parse the response to check for the bug
            import json

            try:
                response_data = json.loads(response_final)
                status = response_data.get("status", "")

                if status == "resend_prompt":
                    # This is the bug! Short prompt incorrectly triggering size limit
                    metadata = response_data.get("metadata", {})
                    prompt_size = metadata.get("prompt_size", 0)

                    self.logger.error(
                        f" 🐛 BUG STILL EXISTS: Short prompt ({len(short_prompt)} chars) triggered resend_prompt"
                    )
                    self.logger.error(f" Reported prompt_size: {prompt_size} (should be ~{len(short_prompt)})")
                    self.logger.error(" This indicates conversation history is still being counted")

                    return False  # Bug still exists

                elif status in ["success", "continuation_available"]:
                    self.logger.info(" ✅ Short prompt processed correctly - bug appears to be FIXED!")
                    self.logger.info(f" Prompt length: {len(short_prompt)} chars, Status: {status}")
                    return True

                else:
                    self.logger.warning(f" ⚠️ Unexpected status: {status}")
                    # Check if this might be a non-JSON response (successful execution)
                    if len(response_final) > 0 and not response_final.startswith('{"'):
                        self.logger.info(" ✅ Non-JSON response suggests successful tool execution")
                        return True
                    return False

            except json.JSONDecodeError:
                # Non-JSON response often means successful tool execution
                self.logger.info(" ✅ Non-JSON response suggests successful tool execution (bug likely fixed)")
                self.logger.debug(f" Response preview: {response_final[:200]}...")
                return True

        except Exception as e:
            self.logger.error(f"Prompt size limit bug test failed: {e}")
            import traceback

            self.logger.debug(f"Full traceback: {traceback.format_exc()}")
            return False


def main():
    """Run the prompt size limit bug test"""
    import sys

    verbose = "--verbose" in sys.argv or "-v" in sys.argv
    test = PromptSizeLimitBugTest(verbose=verbose)

    success = test.run_test()
    if success:
        print("Bug reproduction test completed - check logs for details")
    else:
        print("Test failed to complete")
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()
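The docstring above spells out the intended behavior: only the incoming prompt parameter counts toward the 50k-character limit, never the rebuilt conversation history. A minimal sketch of that check; the constant name and the exact payload shape are assumptions based on the fields this test reads back:

MCP_PROMPT_SIZE_LIMIT = 50_000  # assumed constant; the real limit lives in the server configuration

def check_prompt_size(user_prompt: str) -> dict | None:
    """Return a resend_prompt payload only when the user's own prompt text is too large."""
    if len(user_prompt) > MCP_PROMPT_SIZE_LIMIT:
        return {
            "status": "resend_prompt",
            "metadata": {"prompt_size": len(user_prompt), "limit": MCP_PROMPT_SIZE_LIMIT},
        }
    return None  # conversation history is deliberately not counted here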
@@ -947,37 +947,37 @@ class DataContainer:
            return False

    def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
        """Call an MCP tool in-process - override for refactorworkflow-specific response handling"""
        """Call an MCP tool in-process - override for refactor-specific response handling"""
        # Use in-process implementation to maintain conversation memory
        response_text, _ = self.call_mcp_tool_direct(tool_name, params)

        if not response_text:
            return None, None

        # Extract continuation_id from refactorworkflow response specifically
        continuation_id = self._extract_refactorworkflow_continuation_id(response_text)
        # Extract continuation_id from refactor response specifically
        continuation_id = self._extract_refactor_continuation_id(response_text)

        return response_text, continuation_id

    def _extract_refactorworkflow_continuation_id(self, response_text: str) -> Optional[str]:
        """Extract continuation_id from refactorworkflow response"""
    def _extract_refactor_continuation_id(self, response_text: str) -> Optional[str]:
        """Extract continuation_id from refactor response"""
        try:
            # Parse the response
            response_data = json.loads(response_text)
            return response_data.get("continuation_id")

        except json.JSONDecodeError as e:
            self.logger.debug(f"Failed to parse response for refactorworkflow continuation_id: {e}")
            self.logger.debug(f"Failed to parse response for refactor continuation_id: {e}")
            return None

    def _parse_refactor_response(self, response_text: str) -> dict:
        """Parse refactorworkflow tool JSON response"""
        """Parse refactor tool JSON response"""
        try:
            # Parse the response - it should be direct JSON
            return json.loads(response_text)

        except json.JSONDecodeError as e:
            self.logger.error(f"Failed to parse refactorworkflow response as JSON: {e}")
            self.logger.error(f"Failed to parse refactor response as JSON: {e}")
            self.logger.error(f"Response text: {response_text[:500]}...")
            return {}

@@ -989,7 +989,7 @@ class DataContainer:
        expected_next_required: bool,
        expected_status: str,
    ) -> bool:
        """Validate a refactorworkflow investigation step response structure"""
        """Validate a refactor investigation step response structure"""
        try:
            # Check status
            if response_data.get("status") != expected_status:
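The last hunk is cut off inside the step-validation helper. A sketch of what such a check typically covers in these tests, assuming the response fields used elsewhere in this diff (status, step_number, next_step_required); the helper name and exact checks are illustrative:

def validate_step_response(response_data: dict, expected_step: int, expected_next_required: bool, expected_status: str) -> bool:
    # Mirror the signature shown in the truncated hunk above
    if response_data.get("status") != expected_status:
        return False
    if response_data.get("step_number") != expected_step:
        return False
    if response_data.get("next_step_required") != expected_next_required:
        return False
    return True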