Add DocGen tool with comprehensive documentation generation capabilities (#109)

* WIP: new workflow architecture

* WIP: further improvements and cleanup

* WIP: cleanup and docs, replace old tool with new

* WIP: cleanup and docs, replace old tool with new

* WIP: new planner implementation using workflow

* WIP: precommit tool working as a workflow instead of a basic tool
Support passing use_assistant_model=False to skip external models entirely and use Claude only
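
A minimal sketch of what such a call could look like, reusing the step-based fields that appear in the workflow diffs further below; the exact request schema is assumed rather than confirmed, and the path value is a placeholder:

precommit_request = {
    "step": "Validate staged changes before commit",
    "step_number": 1,
    "total_steps": 1,
    "next_step_required": False,
    "findings": "Initial review of the staged diff",
    "path": "/path/to/repo",           # placeholder repository path
    "model": "flash",
    "use_assistant_model": False,      # skip external models entirely; Claude handles the step alone
}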

* WIP: precommit workflow version swapped with old

* WIP: codereview

* WIP: replaced codereview

* WIP: replaced codereview

* WIP: replaced refactor

* WIP: workflow for thinkdeep

* WIP: ensure files get embedded correctly

* WIP: thinkdeep replaced with workflow version

* WIP: improved messaging when an external model's response is received

* WIP: analyze tool swapped

* WIP: updated tests
* Extract only the content when building history
* Use "relevant_files" for workflow tools only

* WIP: updated tests
* Extract only the content when building history
* Use "relevant_files" for workflow tools only

* WIP: fixed get_completion_next_steps_message missing param

* Fixed tests
Request files consistently

* Fixed tests
Request files consistently

* Fixed tests

* New testgen workflow tool
Updated docs

* Swap testgen workflow

* Fix CI test failures by excluding API-dependent tests

- Update GitHub Actions workflow to exclude simulation tests that require API keys
- Fix collaboration tests to properly mock workflow tool expert analysis calls (see the mock sketch below)
- Update test assertions to handle new workflow tool response format
- Ensure unit tests run without external API dependencies in CI

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
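
A rough illustration of the mocking approach referenced above, so unit tests never reach a real provider; the class and method names here are invented stand-ins, not the repository's actual identifiers:

from unittest.mock import patch

class FakeWorkflowTool:
    """Stand-in for a workflow tool; real class and method names in the repo may differ."""

    def _call_expert_analysis(self, findings):
        raise RuntimeError("would contact an external model API")

    def run_step(self, findings):
        return self._call_expert_analysis(findings)

# Patch the assumed expert-analysis hook so CI runs without API keys.
with patch.object(FakeWorkflowTool, "_call_expert_analysis",
                  return_value={"status": "analysis_complete", "findings": "stubbed"}):
    result = FakeWorkflowTool().run_step("initial findings")
    assert result["status"] == "analysis_complete"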

* WIP - Update tests to match new tools

* WIP - Update tests to match new tools

* WIP - Update tests to match new tools

* Should help with https://github.com/BeehiveInnovations/zen-mcp-server/issues/97
Clear python cache when running script: https://github.com/BeehiveInnovations/zen-mcp-server/issues/96
Improved retry error logging
Cleanup
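
For the cache-clearing item (issue 96), one plausible interpretation is removing stale bytecode before launch; the run script may implement this differently (for example in shell), so treat this as a sketch only:

import shutil
from pathlib import Path

def clear_python_cache(root: str = ".") -> None:
    """Remove every __pycache__ directory under root so stale bytecode is not reused."""
    for cache_dir in Path(root).rglob("__pycache__"):
        shutil.rmtree(cache_dir, ignore_errors=True)

clear_python_cache()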

* WIP - chat tool using new architecture and improved code sharing

* Removed todo

* Removed todo

* Cleanup old name

* Tweak wording

* Tweak wording
Migrate old tests

* Support for Flash 2.0 and Flash Lite 2.0

* Support for Flash 2.0 and Flash Lite 2.0

* Support for Flash 2.0 and Flash Lite 2.0
Fixed test

* Improved consensus to use the workflow base class

* Improved consensus to use the workflow base class

* Allow images

* Allow images

* Replaced old consensus tool

* Cleanup tests

* Tests for prompt size

* New tool: docgen
Tests for prompt size
Fixes: https://github.com/BeehiveInnovations/zen-mcp-server/issues/107
Use available token size limits: https://github.com/BeehiveInnovations/zen-mcp-server/issues/105
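
The prompt-size change is exercised by the new PromptSizeLimitBugTest further below: only the caller's prompt argument should be measured against the roughly 50k-character limit, never the rebuilt conversation history. A hedged sketch of that check, with the constant name and exact threshold assumed:

MCP_PROMPT_SIZE_LIMIT = 50_000  # assumed threshold (~50k characters, per the bug description)

def check_prompt_size(user_prompt: str, conversation_history: str):
    """Validate only the user's prompt; the conversation history is deliberately ignored."""
    if len(user_prompt) > MCP_PROMPT_SIZE_LIMIT:
        return {
            "status": "resend_prompt",  # status the regression test below asserts against
            "metadata": {"prompt_size": len(user_prompt)},
        }
    return None  # within limits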

* Improved docgen prompt
Exclude TestGen from pytest inclusion
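
On the pytest exclusion: pytest auto-collects classes whose names start with "Test", so TestGen-related classes can be picked up by accident. One standard opt-out is the __test__ flag; the class below is a generic stand-in rather than the repository's actual change:

class TestGenExample:
    # Tell pytest not to collect this class even though its name starts with "Test".
    __test__ = False

    def generate_tests(self, source_code: str) -> str:
        return "# generated tests for:\n" + source_code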

* Updated errors

* Lint

* DocGen instructed not to fix bugs but to surface them and stick to documentation

* WIP

* Stop Claude from being lazy and only documenting a small handful of items

* More style rules

---------

Co-authored-by: Claude <noreply@anthropic.com>
Commit c960bcb720 (parent 0655590a51)
Authored by Beehive Innovations on 2025-06-21 23:21:19 -07:00, committed by GitHub
58 changed files with 5492 additions and 5558 deletions


@@ -8,6 +8,7 @@ Each test is in its own file for better organization and maintainability.
from .base_test import BaseSimulatorTest
from .test_analyze_validation import AnalyzeValidationTest
from .test_basic_conversation import BasicConversationTest
from .test_chat_simple_validation import ChatSimpleValidationTest
from .test_codereview_validation import CodeReviewValidationTest
from .test_consensus_conversation import TestConsensusConversation
from .test_consensus_stance import TestConsensusStance
@@ -30,6 +31,7 @@ from .test_per_tool_deduplication import PerToolDeduplicationTest
from .test_planner_continuation_history import PlannerContinuationHistoryTest
from .test_planner_validation import PlannerValidationTest
from .test_precommitworkflow_validation import PrecommitWorkflowValidationTest
from .test_prompt_size_limit_bug import PromptSizeLimitBugTest
# Redis validation test removed - no longer needed for standalone server
from .test_refactor_validation import RefactorValidationTest
@@ -42,6 +44,7 @@ from .test_xai_models import XAIModelsTest
# Test registry for dynamic loading
TEST_REGISTRY = {
"basic_conversation": BasicConversationTest,
"chat_validation": ChatSimpleValidationTest,
"codereview_validation": CodeReviewValidationTest,
"content_validation": ContentValidationTest,
"per_tool_deduplication": PerToolDeduplicationTest,
@@ -71,12 +74,14 @@ TEST_REGISTRY = {
"consensus_stance": TestConsensusStance,
"consensus_three_models": TestConsensusThreeModels,
"analyze_validation": AnalyzeValidationTest,
"prompt_size_limit_bug": PromptSizeLimitBugTest,
# "o3_pro_expensive": O3ProExpensiveTest, # COMMENTED OUT - too expensive to run by default
}
__all__ = [
"BaseSimulatorTest",
"BasicConversationTest",
"ChatSimpleValidationTest",
"CodeReviewValidationTest",
"ContentValidationTest",
"PerToolDeduplicationTest",
@@ -106,5 +111,6 @@ __all__ = [
"TestConsensusStance",
"TestConsensusThreeModels",
"AnalyzeValidationTest",
"PromptSizeLimitBugTest",
"TEST_REGISTRY",
]


@@ -0,0 +1,509 @@
#!/usr/bin/env python3
"""
Chat Simple Tool Validation Test
Comprehensive test for the new ChatSimple tool implementation that validates:
- Basic conversation flow without continuation_id (new chats)
- Continuing existing conversations with continuation_id (continued chats)
- File handling with conversation context (chats with files)
- Image handling in conversations (chat with images)
- Continuing conversations with files from previous turns (continued chats with files previously)
- Temperature validation for different models
- Image limit validation per model
- Conversation context preservation across turns
"""
from .conversation_base_test import ConversationBaseTest
class ChatSimpleValidationTest(ConversationBaseTest):
"""Test ChatSimple tool functionality and validation"""
@property
def test_name(self) -> str:
return "chat_validation"
@property
def test_description(self) -> str:
return "Comprehensive validation of ChatSimple tool implementation"
def run_test(self) -> bool:
"""Run comprehensive ChatSimple validation tests"""
try:
# Set up the test environment for in-process testing
self.setUp()
self.logger.info("Test: ChatSimple tool validation")
# Run all test scenarios
if not self.test_new_conversation_no_continuation():
return False
if not self.test_continue_existing_conversation():
return False
if not self.test_file_handling_with_conversation():
return False
if not self.test_temperature_validation_edge_cases():
return False
if not self.test_image_limits_per_model():
return False
if not self.test_conversation_context_preservation():
return False
if not self.test_chat_with_images():
return False
if not self.test_continued_chat_with_previous_files():
return False
self.logger.info(" ✅ All ChatSimple validation tests passed")
return True
except Exception as e:
self.logger.error(f"ChatSimple validation test failed: {e}")
return False
def test_new_conversation_no_continuation(self) -> bool:
"""Test ChatSimple creates new conversation without continuation_id"""
try:
self.logger.info(" 1. Test new conversation without continuation_id")
# Call chat without continuation_id
response, continuation_id = self.call_mcp_tool_direct(
"chat",
{
"prompt": "Hello! Please use low thinking mode. Can you explain what MCP tools are?",
"model": "flash",
"temperature": 0.7,
"thinking_mode": "low",
},
)
if not response:
self.logger.error(" ❌ Failed to get response from chat")
return False
if not continuation_id:
self.logger.error(" ❌ No continuation_id returned for new conversation")
return False
# Verify response mentions MCP or tools
if "MCP" not in response and "tool" not in response.lower():
self.logger.error(" ❌ Response doesn't seem to address the question about MCP tools")
return False
self.logger.info(f" ✅ New conversation created with continuation_id: {continuation_id}")
self.new_continuation_id = continuation_id # Store for next test
return True
except Exception as e:
self.logger.error(f" ❌ New conversation test failed: {e}")
return False
def test_continue_existing_conversation(self) -> bool:
"""Test ChatSimple continues conversation with valid continuation_id"""
try:
self.logger.info(" 2. Test continuing existing conversation")
if not hasattr(self, "new_continuation_id"):
self.logger.error(" ❌ No continuation_id from previous test")
return False
# Continue the conversation
response, continuation_id = self.call_mcp_tool_direct(
"chat",
{
"prompt": "Please use low thinking mode. Can you give me a specific example of how an MCP tool might work?",
"continuation_id": self.new_continuation_id,
"model": "flash",
"thinking_mode": "low",
},
)
if not response:
self.logger.error(" ❌ Failed to continue conversation")
return False
# Continuation ID should be the same
if continuation_id != self.new_continuation_id:
self.logger.error(f" ❌ Continuation ID changed: {self.new_continuation_id} -> {continuation_id}")
return False
# Response should be contextual (mentioning previous discussion)
if "example" not in response.lower():
self.logger.error(" ❌ Response doesn't seem to provide an example as requested")
return False
self.logger.info(" ✅ Successfully continued conversation with same continuation_id")
return True
except Exception as e:
self.logger.error(f" ❌ Continue conversation test failed: {e}")
return False
def test_file_handling_with_conversation(self) -> bool:
"""Test ChatSimple handles files correctly in conversation context"""
try:
self.logger.info(" 3. Test file handling with conversation")
# Setup test files
self.setup_test_files()
# Start new conversation with a file
response1, continuation_id = self.call_mcp_tool_direct(
"chat",
{
"prompt": "Please use low thinking mode. Analyze this Python code and tell me what the Calculator class does",
"files": [self.test_files["python"]],
"model": "flash",
"thinking_mode": "low",
},
)
if not response1 or not continuation_id:
self.logger.error(" ❌ Failed to start conversation with file")
return False
# Continue with same file (should be deduplicated)
response2, _ = self.call_mcp_tool_direct(
"chat",
{
"prompt": "Please use low thinking mode. What methods does the Calculator class have?",
"files": [self.test_files["python"]], # Same file
"continuation_id": continuation_id,
"model": "flash",
"thinking_mode": "low",
},
)
if not response2:
self.logger.error(" ❌ Failed to continue with same file")
return False
# Response should mention add and multiply methods
if "add" not in response2.lower() or "multiply" not in response2.lower():
self.logger.error(" ❌ Response doesn't mention Calculator methods")
return False
self.logger.info(" ✅ File handling with conversation working correctly")
return True
except Exception as e:
self.logger.error(f" ❌ File handling test failed: {e}")
return False
finally:
self.cleanup_test_files()
def test_temperature_validation_edge_cases(self) -> bool:
"""Test temperature is corrected for model limits (too high/low)"""
try:
self.logger.info(" 4. Test temperature validation edge cases")
# Test 1: Temperature exactly at limit (should work)
response1, _ = self.call_mcp_tool_direct(
"chat",
{
"prompt": "Please use low thinking mode. Hello, this is a test with max temperature",
"model": "flash",
"temperature": 1.0, # At the limit
"thinking_mode": "low",
},
)
if not response1:
self.logger.error(" ❌ Failed with temperature 1.0")
return False
# Test 2: Temperature at minimum (should work)
response2, _ = self.call_mcp_tool_direct(
"chat",
{
"prompt": "Please use low thinking mode. Another test message with min temperature",
"model": "flash",
"temperature": 0.0, # At minimum
"thinking_mode": "low",
},
)
if not response2:
self.logger.error(" ❌ Failed with temperature 0.0")
return False
# Test 3: Check that invalid temperatures are rejected by validation
# This should result in an error response from the tool, not a crash
try:
response3, _ = self.call_mcp_tool_direct(
"chat",
{
"prompt": "Please use low thinking mode. Test with invalid temperature",
"model": "flash",
"temperature": 1.5, # Too high - should be validated
"thinking_mode": "low",
},
)
# If we get here, check if it's an error response
if response3 and "validation error" in response3.lower():
self.logger.info(" ✅ Invalid temperature properly rejected by validation")
else:
self.logger.warning(" ⚠️ High temperature not properly validated")
except Exception:
# Expected - validation should reject this
self.logger.info(" ✅ Invalid temperature properly rejected")
self.logger.info(" ✅ Temperature validation working correctly")
return True
except Exception as e:
self.logger.error(f" ❌ Temperature validation test failed: {e}")
return False
def test_image_limits_per_model(self) -> bool:
"""Test image validation respects model-specific limits"""
try:
self.logger.info(" 5. Test image limits per model")
# Create test image data URLs (small base64 images)
small_image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="
# Test 1: Model that doesn't support images
response1, _ = self.call_mcp_tool_direct(
"chat",
{
"prompt": "Please use low thinking mode. Can you see this image?",
"model": "local-llama", # Text-only model
"images": [small_image],
"thinking_mode": "low",
},
)
# Should get an error about image support
if response1 and "does not support image" not in response1:
self.logger.warning(" ⚠️ Model without image support didn't reject images properly")
# Test 2: Too many images for a model
many_images = [small_image] * 25 # Most models support max 20
response2, _ = self.call_mcp_tool_direct(
"chat",
{
"prompt": "Please use low thinking mode. Analyze these images",
"model": "gemini-2.5-flash", # Supports max 16 images
"images": many_images,
"thinking_mode": "low",
},
)
# Should get an error about too many images
if response2 and "too many images" not in response2.lower():
self.logger.warning(" ⚠️ Model didn't reject excessive image count")
# Test 3: Valid image count
response3, _ = self.call_mcp_tool_direct(
"chat",
{
"prompt": "Please use low thinking mode. This is a test with one image",
"model": "gemini-2.5-flash",
"images": [small_image],
"thinking_mode": "low",
},
)
if not response3:
self.logger.error(" ❌ Failed with valid image count")
return False
self.logger.info(" ✅ Image validation working correctly")
return True
except Exception as e:
self.logger.error(f" ❌ Image limits test failed: {e}")
return False
def test_conversation_context_preservation(self) -> bool:
"""Test ChatSimple preserves context across turns"""
try:
self.logger.info(" 6. Test conversation context preservation")
# Start conversation with specific context
response1, continuation_id = self.call_mcp_tool_direct(
"chat",
{
"prompt": "Please use low thinking mode. My name is TestUser and I'm working on a Python project called TestProject",
"model": "flash",
"thinking_mode": "low",
},
)
if not response1 or not continuation_id:
self.logger.error(" ❌ Failed to start conversation")
return False
# Continue and reference previous context
response2, _ = self.call_mcp_tool_direct(
"chat",
{
"prompt": "Please use low thinking mode. What's my name and what project am I working on?",
"continuation_id": continuation_id,
"model": "flash",
"thinking_mode": "low",
},
)
if not response2:
self.logger.error(" ❌ Failed to continue conversation")
return False
# Check if context was preserved
if "TestUser" not in response2 or "TestProject" not in response2:
self.logger.error(" ❌ Context not preserved across conversation turns")
self.logger.debug(f" Response: {response2[:200]}...")
return False
self.logger.info(" ✅ Conversation context preserved correctly")
return True
except Exception as e:
self.logger.error(f" ❌ Context preservation test failed: {e}")
return False
def test_chat_with_images(self) -> bool:
"""Test ChatSimple handles images correctly in conversation"""
try:
self.logger.info(" 7. Test chat with images")
# Create test image data URL (small base64 image)
small_image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="
# Start conversation with image
response1, continuation_id = self.call_mcp_tool_direct(
"chat",
{
"prompt": "Please use low thinking mode. I'm sharing an image with you. Can you acknowledge that you received it?",
"images": [small_image],
"model": "gemini-2.5-flash", # Model that supports images
"thinking_mode": "low",
},
)
if not response1 or not continuation_id:
self.logger.error(" ❌ Failed to start conversation with image")
return False
# Verify response acknowledges the image
if "image" not in response1.lower():
self.logger.warning(" ⚠️ Response doesn't acknowledge receiving image")
# Continue conversation referencing the image
response2, _ = self.call_mcp_tool_direct(
"chat",
{
"prompt": "Please use low thinking mode. What did you see in that image I shared earlier?",
"continuation_id": continuation_id,
"model": "gemini-2.5-flash",
"thinking_mode": "low",
},
)
if not response2:
self.logger.error(" ❌ Failed to continue conversation about image")
return False
# Test with multiple images
multiple_images = [small_image, small_image] # Two identical small images
response3, _ = self.call_mcp_tool_direct(
"chat",
{
"prompt": "Please use low thinking mode. Here are two images for comparison",
"images": multiple_images,
"model": "gemini-2.5-flash",
"thinking_mode": "low",
},
)
if not response3:
self.logger.error(" ❌ Failed with multiple images")
return False
self.logger.info(" ✅ Chat with images working correctly")
return True
except Exception as e:
self.logger.error(f" ❌ Chat with images test failed: {e}")
return False
def test_continued_chat_with_previous_files(self) -> bool:
"""Test continuing conversation where files were shared in previous turns"""
try:
self.logger.info(" 8. Test continued chat with files from previous turns")
# Setup test files
self.setup_test_files()
# Start conversation with files
response1, continuation_id = self.call_mcp_tool_direct(
"chat",
{
"prompt": "Please use low thinking mode. Here are some files for you to analyze",
"files": [self.test_files["python"], self.test_files["config"]],
"model": "flash",
"thinking_mode": "low",
},
)
if not response1 or not continuation_id:
self.logger.error(" ❌ Failed to start conversation with files")
return False
# Continue conversation without new files (should remember previous files)
response2, _ = self.call_mcp_tool_direct(
"chat",
{
"prompt": "Please use low thinking mode. From the files I shared earlier, what types of files were there?",
"continuation_id": continuation_id,
"model": "flash",
"thinking_mode": "low",
},
)
if not response2:
self.logger.error(" ❌ Failed to continue conversation")
return False
# Check if response references the files from previous turn
if "python" not in response2.lower() and "config" not in response2.lower():
self.logger.warning(" ⚠️ Response doesn't reference previous files properly")
# Continue with a different question about same files (should still remember them)
response3, _ = self.call_mcp_tool_direct(
"chat",
{
"prompt": "Please use low thinking mode. Can you tell me what functions were defined in the Python file from our earlier discussion?",
"continuation_id": continuation_id,
"model": "flash",
"thinking_mode": "low",
},
)
if not response3:
self.logger.error(" ❌ Failed to continue conversation about Python file")
return False
# Should reference functions from the Python file (fibonacci, factorial, Calculator, etc.)
response_lower = response3.lower()
if not ("fibonacci" in response_lower or "factorial" in response_lower or "calculator" in response_lower):
self.logger.warning(" ⚠️ Response doesn't reference Python file contents from earlier turn")
self.logger.info(" ✅ Continued chat with previous files working correctly")
return True
except Exception as e:
self.logger.error(f" ❌ Continued chat with files test failed: {e}")
return False
finally:
self.cleanup_test_files()


@@ -21,7 +21,12 @@ class CrossToolComprehensiveTest(ConversationBaseTest):
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple:
"""Call an MCP tool in-process"""
response_text, continuation_id = self.call_mcp_tool_direct(tool_name, params)
# Use the new method for workflow tools
workflow_tools = ["analyze", "debug", "codereview", "precommit", "refactor", "thinkdeep"]
if tool_name in workflow_tools:
response_text, continuation_id = super().call_mcp_tool(tool_name, params)
else:
response_text, continuation_id = self.call_mcp_tool_direct(tool_name, params)
return response_text, continuation_id
@property
@@ -96,8 +101,12 @@ def hash_pwd(pwd):
# Step 2: Use analyze tool to do deeper analysis (fresh conversation)
self.logger.info(" Step 2: analyze tool - Deep code analysis (fresh)")
analyze_params = {
"files": [auth_file],
"prompt": "Find vulnerabilities",
"step": "Starting comprehensive code analysis to find security vulnerabilities in the authentication system",
"step_number": 1,
"total_steps": 2,
"next_step_required": True,
"findings": "Initial analysis will focus on security vulnerabilities in authentication code",
"relevant_files": [auth_file],
"thinking_mode": "low",
"model": "flash",
}
@@ -133,8 +142,12 @@ def hash_pwd(pwd):
# Step 4: Use debug tool to identify specific issues
self.logger.info(" Step 4: debug tool - Identify specific problems")
debug_params = {
"files": [auth_file, config_file_path],
"prompt": "Fix auth issues",
"step": "Starting debug investigation to identify and fix authentication security issues",
"step_number": 1,
"total_steps": 2,
"next_step_required": True,
"findings": "Investigating authentication vulnerabilities found in previous analysis",
"relevant_files": [auth_file, config_file_path],
"thinking_mode": "low",
"model": "flash",
}
@@ -153,9 +166,13 @@ def hash_pwd(pwd):
if continuation_id4:
self.logger.info(" Step 5: debug continuation - Additional analysis")
debug_continue_params = {
"step": "Continuing debug investigation to fix password hashing implementation",
"step_number": 2,
"total_steps": 2,
"next_step_required": False,
"findings": "Building on previous analysis to fix weak password hashing",
"continuation_id": continuation_id4,
"files": [auth_file, config_file_path],
"prompt": "Fix password hashing",
"relevant_files": [auth_file, config_file_path],
"thinking_mode": "low",
"model": "flash",
}
@@ -168,8 +185,12 @@ def hash_pwd(pwd):
# Step 6: Use codereview for comprehensive review
self.logger.info(" Step 6: codereview tool - Comprehensive code review")
codereview_params = {
"files": [auth_file, config_file_path],
"prompt": "Security review",
"step": "Starting comprehensive security code review of authentication system",
"step_number": 1,
"total_steps": 2,
"next_step_required": True,
"findings": "Performing thorough security review of authentication code and configuration",
"relevant_files": [auth_file, config_file_path],
"thinking_mode": "low",
"model": "flash",
}
@@ -201,9 +222,13 @@ def secure_login(user, pwd):
improved_file = self.create_additional_test_file("auth_improved.py", improved_code)
precommit_params = {
"step": "Starting pre-commit validation of improved authentication code",
"step_number": 1,
"total_steps": 2,
"next_step_required": True,
"findings": "Validating improved authentication implementation before commit",
"path": self.test_dir,
"files": [auth_file, config_file_path, improved_file],
"prompt": "Ready to commit",
"relevant_files": [auth_file, config_file_path, improved_file],
"thinking_mode": "low",
"model": "flash",
}


@@ -0,0 +1,206 @@
#!/usr/bin/env python3
"""
Prompt Size Limit Bug Test
This test reproduces a critical bug where the prompt size limit check
incorrectly includes conversation history when validating incoming prompts
from Claude to MCP. The limit should ONLY apply to the actual prompt text
sent by the user, not the entire conversation context.
Bug Scenario:
- User starts a conversation with chat tool
- Continues conversation multiple times (building up history)
- On subsequent continuation, a short prompt (150 chars) triggers
"resend_prompt" error claiming >50k characters
Expected Behavior:
- Only count the actual prompt parameter for size limit
- Conversation history should NOT count toward prompt size limit
- Only the user's actual input should be validated against 50k limit
"""
from .conversation_base_test import ConversationBaseTest
class PromptSizeLimitBugTest(ConversationBaseTest):
"""Test to reproduce and verify fix for prompt size limit bug"""
@property
def test_name(self) -> str:
return "prompt_size_limit_bug"
@property
def test_description(self) -> str:
return "Reproduce prompt size limit bug with conversation continuation"
def run_test(self) -> bool:
"""Test prompt size limit bug reproduction using in-process calls"""
try:
self.logger.info("🐛 Test: Prompt size limit bug reproduction (in-process)")
# Setup test environment
self.setUp()
# Create a test file to provide context
test_file_content = """
# Test SwiftUI-like Framework Implementation
struct ContentView: View {
@State private var counter = 0
var body: some View {
VStack {
Text("Count: \\(counter)")
Button("Increment") {
counter += 1
}
}
}
}
class Renderer {
static let shared = Renderer()
func render(view: View) {
// Implementation details for UIKit/AppKit rendering
}
}
protocol View {
var body: some View { get }
}
"""
test_file_path = self.create_additional_test_file("SwiftFramework.swift", test_file_content)
# Step 1: Start initial conversation
self.logger.info(" Step 1: Start conversation with initial context")
initial_prompt = "I'm building a SwiftUI-like framework. Can you help me design the architecture?"
response1, continuation_id = self.call_mcp_tool_direct(
"chat",
{
"prompt": initial_prompt,
"files": [test_file_path],
"model": "flash",
},
)
if not response1 or not continuation_id:
self.logger.error(" ❌ Failed to start initial conversation")
return False
self.logger.info(f" ✅ Initial conversation started: {continuation_id[:8]}...")
# Step 2: Continue conversation multiple times to build substantial history
conversation_prompts = [
"That's helpful! Can you elaborate on the View protocol design?",
"How should I implement the State property wrapper?",
"What's the best approach for the VStack layout implementation?",
"Should I use UIKit directly or create an abstraction layer?",
"Smart approach! For the rendering layer, would you suggest UIKit/AppKit directly?",
]
for i, prompt in enumerate(conversation_prompts, 2):
self.logger.info(f" Step {i}: Continue conversation (exchange {i})")
response, _ = self.call_mcp_tool_direct(
"chat",
{
"prompt": prompt,
"continuation_id": continuation_id,
"model": "flash",
},
)
if not response:
self.logger.error(f" ❌ Failed at exchange {i}")
return False
self.logger.info(f" ✅ Exchange {i} completed")
# Step 3: Send short prompt that should NOT trigger size limit
self.logger.info(" Step 7: Send short prompt (should NOT trigger size limit)")
# This is a very short prompt - should not trigger the bug after fix
short_prompt = "Thanks! This gives me a solid foundation to start prototyping."
self.logger.info(f" Short prompt length: {len(short_prompt)} characters")
response_final, _ = self.call_mcp_tool_direct(
"chat",
{
"prompt": short_prompt,
"continuation_id": continuation_id,
"model": "flash",
},
)
if not response_final:
self.logger.error(" ❌ Final short prompt failed")
return False
# Parse the response to check for the bug
import json
try:
response_data = json.loads(response_final)
status = response_data.get("status", "")
if status == "resend_prompt":
# This is the bug! Short prompt incorrectly triggering size limit
metadata = response_data.get("metadata", {})
prompt_size = metadata.get("prompt_size", 0)
self.logger.error(
f" 🐛 BUG STILL EXISTS: Short prompt ({len(short_prompt)} chars) triggered resend_prompt"
)
self.logger.error(f" Reported prompt_size: {prompt_size} (should be ~{len(short_prompt)})")
self.logger.error(" This indicates conversation history is still being counted")
return False # Bug still exists
elif status in ["success", "continuation_available"]:
self.logger.info(" ✅ Short prompt processed correctly - bug appears to be FIXED!")
self.logger.info(f" Prompt length: {len(short_prompt)} chars, Status: {status}")
return True
else:
self.logger.warning(f" ⚠️ Unexpected status: {status}")
# Check if this might be a non-JSON response (successful execution)
if len(response_final) > 0 and not response_final.startswith('{"'):
self.logger.info(" ✅ Non-JSON response suggests successful tool execution")
return True
return False
except json.JSONDecodeError:
# Non-JSON response often means successful tool execution
self.logger.info(" ✅ Non-JSON response suggests successful tool execution (bug likely fixed)")
self.logger.debug(f" Response preview: {response_final[:200]}...")
return True
except Exception as e:
self.logger.error(f"Prompt size limit bug test failed: {e}")
import traceback
self.logger.debug(f"Full traceback: {traceback.format_exc()}")
return False
def main():
"""Run the prompt size limit bug test"""
import sys
verbose = "--verbose" in sys.argv or "-v" in sys.argv
test = PromptSizeLimitBugTest(verbose=verbose)
success = test.run_test()
if success:
print("Bug reproduction test completed - check logs for details")
else:
print("Test failed to complete")
sys.exit(0 if success else 1)
if __name__ == "__main__":
main()


@@ -947,37 +947,37 @@ class DataContainer:
return False
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
"""Call an MCP tool in-process - override for refactorworkflow-specific response handling"""
"""Call an MCP tool in-process - override for refactor-specific response handling"""
# Use in-process implementation to maintain conversation memory
response_text, _ = self.call_mcp_tool_direct(tool_name, params)
if not response_text:
return None, None
# Extract continuation_id from refactorworkflow response specifically
continuation_id = self._extract_refactorworkflow_continuation_id(response_text)
# Extract continuation_id from refactor response specifically
continuation_id = self._extract_refactor_continuation_id(response_text)
return response_text, continuation_id
def _extract_refactorworkflow_continuation_id(self, response_text: str) -> Optional[str]:
"""Extract continuation_id from refactorworkflow response"""
def _extract_refactor_continuation_id(self, response_text: str) -> Optional[str]:
"""Extract continuation_id from refactor response"""
try:
# Parse the response
response_data = json.loads(response_text)
return response_data.get("continuation_id")
except json.JSONDecodeError as e:
self.logger.debug(f"Failed to parse response for refactorworkflow continuation_id: {e}")
self.logger.debug(f"Failed to parse response for refactor continuation_id: {e}")
return None
def _parse_refactor_response(self, response_text: str) -> dict:
"""Parse refactorworkflow tool JSON response"""
"""Parse refactor tool JSON response"""
try:
# Parse the response - it should be direct JSON
return json.loads(response_text)
except json.JSONDecodeError as e:
self.logger.error(f"Failed to parse refactorworkflow response as JSON: {e}")
self.logger.error(f"Failed to parse refactor response as JSON: {e}")
self.logger.error(f"Response text: {response_text[:500]}...")
return {}
@@ -989,7 +989,7 @@ class DataContainer:
expected_next_required: bool,
expected_status: str,
) -> bool:
"""Validate a refactorworkflow investigation step response structure"""
"""Validate a refactor investigation step response structure"""
try:
# Check status
if response_data.get("status") != expected_status: