* WIP: new workflow architecture
* WIP: further improvements and cleanup
* WIP: cleanup and docs, replace old tool with new
* WIP: new planner implementation using workflow
* WIP: precommit tool working as a workflow instead of a basic tool
  Support for passing False to use_assistant_model to skip external models completely and use Claude only
* WIP: precommit workflow version swapped with old
* WIP: codereview
* WIP: replaced codereview
* WIP: replaced refactor
* WIP: workflow for thinkdeep
* WIP: ensure files get embedded correctly
* WIP: thinkdeep replaced with workflow version
* WIP: improved messaging when an external model's response is received
* WIP: analyze tool swapped
* WIP: updated tests
  Extract only the content when building history
  Use "relevant_files" for workflow tools only
* WIP: fixed get_completion_next_steps_message missing param
* Fixed tests
  Request for files consistently
* New testgen workflow tool
  Updated docs
* Swap testgen workflow
* Fix CI test failures by excluding API-dependent tests
  - Update GitHub Actions workflow to exclude simulation tests that require API keys
  - Fix collaboration tests to properly mock workflow tool expert analysis calls
  - Update test assertions to handle new workflow tool response format
  - Ensure unit tests run without external API dependencies in CI
  🤖 Generated with [Claude Code](https://claude.ai/code)
  Co-Authored-By: Claude <noreply@anthropic.com>
* WIP - Update tests to match new tools
* Should help with https://github.com/BeehiveInnovations/zen-mcp-server/issues/97
  Clear Python cache when running script: https://github.com/BeehiveInnovations/zen-mcp-server/issues/96
  Improved retry error logging
  Cleanup
* WIP - chat tool using new architecture and improved code sharing
* Removed todo
* Cleanup old name
* Tweak wordings
  Migrate old tests
* Support for Flash 2.0 and Flash Lite 2.0
  Fixed test
* Improved consensus to use the workflow base class
* Allow images
* Replaced old consensus tool
* Cleanup tests
* Tests for prompt size
* New tool: docgen
  Fixes: https://github.com/BeehiveInnovations/zen-mcp-server/issues/107
  Use available token size limits: https://github.com/BeehiveInnovations/zen-mcp-server/issues/105
* Improved docgen prompt
  Exclude TestGen from pytest inclusion
* Updated errors
* Lint
* DocGen instructed not to fix bugs, surface them and stick to d
* WIP
* Stop Claude from being lazy and only documenting a small handful
* More style rules

---------

Co-authored-by: Claude <noreply@anthropic.com>
510 lines
20 KiB
Python
#!/usr/bin/env python3
"""
Chat Simple Tool Validation Test

Comprehensive test for the new ChatSimple tool implementation that validates:
- Basic conversation flow without continuation_id (new chats)
- Continuing existing conversations with continuation_id (continued chats)
- File handling with conversation context (chats with files)
- Image handling in conversations (chat with images)
- Continuing conversations with files from previous turns (continued chats with previously shared files)
- Temperature validation for different models
- Image limit validation per model
- Conversation context preservation across turns
"""

from .conversation_base_test import ConversationBaseTest
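
# NOTE: a minimal sketch of the harness contract assumed throughout this file,
# inferred from the usage below rather than from an authoritative API reference.
# ConversationBaseTest is expected to provide call_mcp_tool_direct(tool_name,
# arguments), which invokes the MCP tool in-process and returns a
# (response_text, continuation_id) tuple:
#
#     response, continuation_id = self.call_mcp_tool_direct(
#         "chat", {"prompt": "Hello", "model": "flash"}
#     )
#
# It is also assumed to provide setUp(), setup_test_files()/cleanup_test_files(),
# self.test_files, and self.logger.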


class ChatSimpleValidationTest(ConversationBaseTest):
    """Test ChatSimple tool functionality and validation"""

    @property
    def test_name(self) -> str:
        return "_validation"

    @property
    def test_description(self) -> str:
        return "Comprehensive validation of ChatSimple tool implementation"

    def run_test(self) -> bool:
        """Run comprehensive ChatSimple validation tests"""
        try:
            # Set up the test environment for in-process testing
            self.setUp()

            self.logger.info("Test: ChatSimple tool validation")

            # Run all test scenarios
            if not self.test_new_conversation_no_continuation():
                return False

            if not self.test_continue_existing_conversation():
                return False

            if not self.test_file_handling_with_conversation():
                return False

            if not self.test_temperature_validation_edge_cases():
                return False

            if not self.test_image_limits_per_model():
                return False

            if not self.test_conversation_context_preservation():
                return False

            if not self.test_chat_with_images():
                return False

            if not self.test_continued_chat_with_previous_files():
                return False

            self.logger.info(" ✅ All ChatSimple validation tests passed")
            return True

        except Exception as e:
            self.logger.error(f"ChatSimple validation test failed: {e}")
            return False

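    # NOTE: the scenarios below are order-dependent; test 1 stores
    # self.new_continuation_id, which test 2 consumes to continue the chat.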
    def test_new_conversation_no_continuation(self) -> bool:
        """Test ChatSimple creates new conversation without continuation_id"""
        try:
            self.logger.info(" 1. Test new conversation without continuation_id")

            # Call chat without continuation_id
            response, continuation_id = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Hello! Please use low thinking mode. Can you explain what MCP tools are?",
                    "model": "flash",
                    "temperature": 0.7,
                    "thinking_mode": "low",
                },
            )

            if not response:
                self.logger.error(" ❌ Failed to get response from chat")
                return False

            if not continuation_id:
                self.logger.error(" ❌ No continuation_id returned for new conversation")
                return False

            # Verify response mentions MCP or tools
            if "MCP" not in response and "tool" not in response.lower():
                self.logger.error(" ❌ Response doesn't seem to address the question about MCP tools")
                return False

            self.logger.info(f" ✅ New conversation created with continuation_id: {continuation_id}")
            self.new_continuation_id = continuation_id  # Store for next test
            return True

        except Exception as e:
            self.logger.error(f" ❌ New conversation test failed: {e}")
            return False

    def test_continue_existing_conversation(self) -> bool:
        """Test ChatSimple continues conversation with valid continuation_id"""
        try:
            self.logger.info(" 2. Test continuing existing conversation")

            if not hasattr(self, "new_continuation_id"):
                self.logger.error(" ❌ No continuation_id from previous test")
                return False

            # Continue the conversation
            response, continuation_id = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Can you give me a specific example of how an MCP tool might work?",
                    "continuation_id": self.new_continuation_id,
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response:
                self.logger.error(" ❌ Failed to continue conversation")
                return False

            # Continuation ID should be the same
            if continuation_id != self.new_continuation_id:
                self.logger.error(f" ❌ Continuation ID changed: {self.new_continuation_id} -> {continuation_id}")
                return False

            # Response should address the follow-up request for an example,
            # showing the previous discussion carried over
            if "example" not in response.lower():
                self.logger.error(" ❌ Response doesn't seem to provide an example as requested")
                return False

            self.logger.info(" ✅ Successfully continued conversation with same continuation_id")
            return True

        except Exception as e:
            self.logger.error(f" ❌ Continue conversation test failed: {e}")
            return False

    def test_file_handling_with_conversation(self) -> bool:
        """Test ChatSimple handles files correctly in conversation context"""
        try:
            self.logger.info(" 3. Test file handling with conversation")

            # Setup test files
            self.setup_test_files()

            # Start new conversation with a file
            response1, continuation_id = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Analyze this Python code and tell me what the Calculator class does",
                    "files": [self.test_files["python"]],
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error(" ❌ Failed to start conversation with file")
                return False

            # Continue with same file (should be deduplicated)
            response2, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. What methods does the Calculator class have?",
                    "files": [self.test_files["python"]],  # Same file
                    "continuation_id": continuation_id,
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response2:
                self.logger.error(" ❌ Failed to continue with same file")
                return False

            # Response should mention add and multiply methods
            if "add" not in response2.lower() or "multiply" not in response2.lower():
                self.logger.error(" ❌ Response doesn't mention Calculator methods")
                return False

            self.logger.info(" ✅ File handling with conversation working correctly")
            return True

        except Exception as e:
            self.logger.error(f" ❌ File handling test failed: {e}")
            return False
        finally:
            self.cleanup_test_files()

    def test_temperature_validation_edge_cases(self) -> bool:
        """Test temperature validation at and beyond model limits (too high/low)"""
        try:
            self.logger.info(" 4. Test temperature validation edge cases")

            # Test 1: Temperature exactly at limit (should work)
            response1, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Hello, this is a test with max temperature",
                    "model": "flash",
                    "temperature": 1.0,  # At the limit
                    "thinking_mode": "low",
                },
            )

            if not response1:
                self.logger.error(" ❌ Failed with temperature 1.0")
                return False

            # Test 2: Temperature at minimum (should work)
            response2, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Another test message with min temperature",
                    "model": "flash",
                    "temperature": 0.0,  # At minimum
                    "thinking_mode": "low",
                },
            )

            if not response2:
                self.logger.error(" ❌ Failed with temperature 0.0")
                return False

            # Test 3: Check that invalid temperatures are rejected by validation
            # This should result in an error response from the tool, not a crash
            try:
                response3, _ = self.call_mcp_tool_direct(
                    "chat",
                    {
                        "prompt": "Please use low thinking mode. Test with invalid temperature",
                        "model": "flash",
                        "temperature": 1.5,  # Too high - should be validated
                        "thinking_mode": "low",
                    },
                )

                # If we get here, check if it's an error response
                if response3 and "validation error" in response3.lower():
                    self.logger.info(" ✅ Invalid temperature properly rejected by validation")
                else:
                    self.logger.warning(" ⚠️ High temperature not properly validated")
            except Exception:
                # Expected - validation should reject this
                self.logger.info(" ✅ Invalid temperature properly rejected")

            self.logger.info(" ✅ Temperature validation working correctly")
            return True

        except Exception as e:
            self.logger.error(f" ❌ Temperature validation test failed: {e}")
            return False

    def test_image_limits_per_model(self) -> bool:
        """Test image validation respects model-specific limits"""
        try:
            self.logger.info(" 5. Test image limits per model")

            # Create a test image data URL (a tiny base64-encoded 1x1-pixel PNG)
            small_image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="

            # Test 1: Model that doesn't support images
            response1, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Can you see this image?",
                    "model": "local-llama",  # Text-only model
                    "images": [small_image],
                    "thinking_mode": "low",
                },
            )

            # Should get an error about image support
            if response1 and "does not support image" not in response1:
                self.logger.warning(" ⚠️ Model without image support didn't reject images properly")

            # Test 2: Too many images for a model
            many_images = [small_image] * 25  # Most models support max 20

            response2, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Analyze these images",
                    "model": "gemini-2.5-flash",  # Supports max 16 images
                    "images": many_images,
                    "thinking_mode": "low",
                },
            )

            # Should get an error about too many images
            if response2 and "too many images" not in response2.lower():
                self.logger.warning(" ⚠️ Model didn't reject excessive image count")

            # Test 3: Valid image count
            response3, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. This is a test with one image",
                    "model": "gemini-2.5-flash",
                    "images": [small_image],
                    "thinking_mode": "low",
                },
            )

            if not response3:
                self.logger.error(" ❌ Failed with valid image count")
                return False

            self.logger.info(" ✅ Image validation working correctly")
            return True

        except Exception as e:
            self.logger.error(f" ❌ Image limits test failed: {e}")
            return False

    def test_conversation_context_preservation(self) -> bool:
        """Test ChatSimple preserves context across turns"""
        try:
            self.logger.info(" 6. Test conversation context preservation")

            # Start conversation with specific context
            response1, continuation_id = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. My name is TestUser and I'm working on a Python project called TestProject",
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error(" ❌ Failed to start conversation")
                return False

            # Continue and reference previous context
            response2, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. What's my name and what project am I working on?",
                    "continuation_id": continuation_id,
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response2:
                self.logger.error(" ❌ Failed to continue conversation")
                return False

            # Check if context was preserved
            if "TestUser" not in response2 or "TestProject" not in response2:
                self.logger.error(" ❌ Context not preserved across conversation turns")
                self.logger.debug(f" Response: {response2[:200]}...")
                return False

            self.logger.info(" ✅ Conversation context preserved correctly")
            return True

        except Exception as e:
            self.logger.error(f" ❌ Context preservation test failed: {e}")
            return False

    def test_chat_with_images(self) -> bool:
        """Test ChatSimple handles images correctly in conversation"""
        try:
            self.logger.info(" 7. Test chat with images")

            # Create test image data URL (the same tiny 1x1-pixel base64 PNG)
            small_image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="

            # Start conversation with image
            response1, continuation_id = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. I'm sharing an image with you. Can you acknowledge that you received it?",
                    "images": [small_image],
                    "model": "gemini-2.5-flash",  # Model that supports images
                    "thinking_mode": "low",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error(" ❌ Failed to start conversation with image")
                return False

            # Verify response acknowledges the image
            if "image" not in response1.lower():
                self.logger.warning(" ⚠️ Response doesn't acknowledge receiving image")

            # Continue conversation referencing the image
            response2, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. What did you see in that image I shared earlier?",
                    "continuation_id": continuation_id,
                    "model": "gemini-2.5-flash",
                    "thinking_mode": "low",
                },
            )

            if not response2:
                self.logger.error(" ❌ Failed to continue conversation about image")
                return False

            # Test with multiple images
            multiple_images = [small_image, small_image]  # Two identical small images
            response3, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Here are two images for comparison",
                    "images": multiple_images,
                    "model": "gemini-2.5-flash",
                    "thinking_mode": "low",
                },
            )

            if not response3:
                self.logger.error(" ❌ Failed with multiple images")
                return False

            self.logger.info(" ✅ Chat with images working correctly")
            return True

        except Exception as e:
            self.logger.error(f" ❌ Chat with images test failed: {e}")
            return False

    def test_continued_chat_with_previous_files(self) -> bool:
        """Test continuing conversation where files were shared in previous turns"""
        try:
            self.logger.info(" 8. Test continued chat with files from previous turns")

            # Setup test files
            self.setup_test_files()

            # Start conversation with files
            response1, continuation_id = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Here are some files for you to analyze",
                    "files": [self.test_files["python"], self.test_files["config"]],
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response1 or not continuation_id:
                self.logger.error(" ❌ Failed to start conversation with files")
                return False

            # Continue conversation without new files (should remember previous files)
            response2, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. From the files I shared earlier, what types of files were there?",
                    "continuation_id": continuation_id,
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response2:
                self.logger.error(" ❌ Failed to continue conversation")
                return False

            # Check if response references the files from previous turn
            if "python" not in response2.lower() and "config" not in response2.lower():
                self.logger.warning(" ⚠️ Response doesn't reference previous files properly")

            # Continue with a different question about same files (should still remember them)
            response3, _ = self.call_mcp_tool_direct(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Can you tell me what functions were defined in the Python file from our earlier discussion?",
                    "continuation_id": continuation_id,
                    "model": "flash",
                    "thinking_mode": "low",
                },
            )

            if not response3:
                self.logger.error(" ❌ Failed to continue conversation about Python file")
                return False

            # Should reference functions from the Python file (fibonacci, factorial, Calculator, etc.)
            response_lower = response3.lower()
            if not ("fibonacci" in response_lower or "factorial" in response_lower or "calculator" in response_lower):
                self.logger.warning(" ⚠️ Response doesn't reference Python file contents from earlier turn")

            self.logger.info(" ✅ Continued chat with previous files working correctly")
            return True

        except Exception as e:
            self.logger.error(f" ❌ Continued chat with files test failed: {e}")
            return False
        finally:
            self.cleanup_test_files()
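

# Hypothetical standalone entry point; a sketch, not part of the original test.
# Because of the relative import at the top, run it as a module within its
# package (e.g. `python -m <package>.chat_simple_validation`; the path is
# illustrative). The no-argument construction below is an assumption; adjust
# to whatever ConversationBaseTest actually requires.
if __name__ == "__main__":
    import logging
    import sys

    logging.basicConfig(level=logging.INFO)
    test = ChatSimpleValidationTest()  # assumed: no required constructor args
    sys.exit(0 if test.run_test() else 1)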