Add live-simulation tests validating that conversation continuation and preservation work across requests
1626 communication_simulator_test.py (Executable file → Normal file)
File diff suppressed because it is too large
1994 communication_simulator_test_old.py (Executable file)
File diff suppressed because it is too large
35 simulator_tests/__init__.py (Normal file)
@@ -0,0 +1,35 @@
"""
Communication Simulator Tests Package

This package contains individual test modules for the Gemini MCP Communication Simulator.
Each test is in its own file for better organization and maintainability.
"""

from .base_test import BaseSimulatorTest
from .test_basic_conversation import BasicConversationTest
from .test_content_validation import ContentValidationTest
from .test_per_tool_deduplication import PerToolDeduplicationTest
from .test_cross_tool_continuation import CrossToolContinuationTest
from .test_logs_validation import LogsValidationTest
from .test_redis_validation import RedisValidationTest

# Test registry for dynamic loading
TEST_REGISTRY = {
    "basic_conversation": BasicConversationTest,
    "content_validation": ContentValidationTest,
    "per_tool_deduplication": PerToolDeduplicationTest,
    "cross_tool_continuation": CrossToolContinuationTest,
    "logs_validation": LogsValidationTest,
    "redis_validation": RedisValidationTest,
}

__all__ = [
    'BaseSimulatorTest',
    'BasicConversationTest',
    'ContentValidationTest',
    'PerToolDeduplicationTest',
    'CrossToolContinuationTest',
    'LogsValidationTest',
    'RedisValidationTest',
    'TEST_REGISTRY'
]
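As an aside, TEST_REGISTRY is what makes the runner data-driven: a test class can be looked up and executed by name. A minimal sketch of that pattern (this standalone runner script is illustrative, not part of the commit):

import argparse

from simulator_tests import TEST_REGISTRY

# Hypothetical standalone runner: map a CLI test name onto its class.
parser = argparse.ArgumentParser(description="Run one simulator test by name")
parser.add_argument("test", choices=sorted(TEST_REGISTRY))
parser.add_argument("--verbose", action="store_true")
args = parser.parse_args()

test_cls = TEST_REGISTRY[args.test]  # dynamic lookup by registry key
passed = test_cls(verbose=args.verbose).run_test()
raise SystemExit(0 if passed else 1)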
255 simulator_tests/base_test.py (Normal file)
@@ -0,0 +1,255 @@
#!/usr/bin/env python3
"""
Base Test Class for Communication Simulator Tests

Provides common functionality and utilities for all simulator tests.
"""

import json
import logging
import os
import subprocess
import tempfile
import time
from typing import Optional, Tuple


class BaseSimulatorTest:
    """Base class for all communication simulator tests"""

    def __init__(self, verbose: bool = False):
        self.verbose = verbose
        self.test_files = {}
        self.test_dir = None
        self.container_name = "gemini-mcp-server"
        self.redis_container = "gemini-mcp-redis"

        # Configure logging
        log_level = logging.DEBUG if verbose else logging.INFO
        logging.basicConfig(level=log_level, format="%(asctime)s - %(levelname)s - %(message)s")
        self.logger = logging.getLogger(self.__class__.__name__)

    def setup_test_files(self):
        """Create test files for the simulation"""
        # Test Python file
        python_content = '''"""
Sample Python module for testing MCP conversation continuity
"""

def fibonacci(n):
    """Calculate fibonacci number recursively"""
    if n <= 1:
        return n
    return fibonacci(n-1) + fibonacci(n-2)

def factorial(n):
    """Calculate factorial iteratively"""
    result = 1
    for i in range(1, n + 1):
        result *= i
    return result

class Calculator:
    """Simple calculator class"""

    def __init__(self):
        self.history = []

    def add(self, a, b):
        result = a + b
        self.history.append(f"{a} + {b} = {result}")
        return result

    def multiply(self, a, b):
        result = a * b
        self.history.append(f"{a} * {b} = {result}")
        return result
'''

        # Test configuration file
        config_content = """{
    "database": {
        "host": "localhost",
        "port": 5432,
        "name": "testdb",
        "ssl": true
    },
    "cache": {
        "redis_url": "redis://localhost:6379",
        "ttl": 3600
    },
    "logging": {
        "level": "INFO",
        "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    }
}"""

        # Create files in the current project directory
        current_dir = os.getcwd()
        self.test_dir = os.path.join(current_dir, "test_simulation_files")
        os.makedirs(self.test_dir, exist_ok=True)

        test_py = os.path.join(self.test_dir, "test_module.py")
        test_config = os.path.join(self.test_dir, "config.json")

        with open(test_py, "w") as f:
            f.write(python_content)
        with open(test_config, "w") as f:
            f.write(config_content)

        self.test_files = {"python": test_py, "config": test_config}
        self.logger.debug(f"Created test files: {list(self.test_files.values())}")

    def call_mcp_tool(self, tool_name: str, params: dict) -> Tuple[Optional[str], Optional[str]]:
        """Call an MCP tool via Claude CLI (docker exec)"""
        try:
            # Prepare the MCP initialization and tool call sequence
            init_request = {
                "jsonrpc": "2.0",
                "id": 1,
                "method": "initialize",
                "params": {
                    "protocolVersion": "2024-11-05",
                    "capabilities": {"tools": {}},
                    "clientInfo": {"name": "communication-simulator", "version": "1.0.0"},
                },
            }

            # Send initialized notification
            initialized_notification = {"jsonrpc": "2.0", "method": "notifications/initialized"}

            # Prepare the tool call request
            tool_request = {
                "jsonrpc": "2.0",
                "id": 2,
                "method": "tools/call",
                "params": {"name": tool_name, "arguments": params},
            }

            # Combine all messages
            messages = [json.dumps(init_request), json.dumps(initialized_notification), json.dumps(tool_request)]

            # Join with newlines as MCP expects
            input_data = "\n".join(messages) + "\n"

            # Simulate Claude CLI calling the MCP server via docker exec
            docker_cmd = ["docker", "exec", "-i", self.container_name, "python", "server.py"]

            self.logger.debug(f"Calling MCP tool {tool_name} with proper initialization")

            # Execute the command
            result = subprocess.run(
                docker_cmd, input=input_data, text=True, capture_output=True, timeout=300  # 5 minute timeout
            )

            if result.returncode != 0:
                self.logger.error(f"Docker exec failed: {result.stderr}")
                return None, None

            # Parse the response - look for the tool call response
            response_data = self._parse_mcp_response(result.stdout, expected_id=2)
            if not response_data:
                return None, None

            # Extract continuation_id if present
            continuation_id = self._extract_continuation_id(response_data)

            return response_data, continuation_id

        except subprocess.TimeoutExpired:
            self.logger.error(f"MCP tool call timed out: {tool_name}")
            return None, None
        except Exception as e:
            self.logger.error(f"MCP tool call failed: {e}")
            return None, None

    def _parse_mcp_response(self, stdout: str, expected_id: int = 2) -> Optional[str]:
        """Parse MCP JSON-RPC response from stdout"""
        try:
            lines = stdout.strip().split("\n")
            for line in lines:
                if line.strip() and line.startswith("{"):
                    response = json.loads(line)
                    # Look for the tool call response with the expected ID
                    if response.get("id") == expected_id and "result" in response:
                        # Extract the actual content from the response
                        result = response["result"]
                        # Handle new response format with 'content' array
                        if isinstance(result, dict) and "content" in result:
                            content_array = result["content"]
                            if isinstance(content_array, list) and len(content_array) > 0:
                                return content_array[0].get("text", "")
                        # Handle legacy format
                        elif isinstance(result, list) and len(result) > 0:
                            return result[0].get("text", "")
                    elif response.get("id") == expected_id and "error" in response:
                        self.logger.error(f"MCP error: {response['error']}")
                        return None

            # If we get here, log all responses for debugging
            self.logger.warning(f"No valid tool call response found for ID {expected_id}")
            self.logger.debug(f"Full stdout: {stdout}")
            return None

        except json.JSONDecodeError as e:
            self.logger.error(f"Failed to parse MCP response: {e}")
            self.logger.debug(f"Stdout that failed to parse: {stdout}")
            return None

    def _extract_continuation_id(self, response_text: str) -> Optional[str]:
        """Extract continuation_id from response metadata"""
        try:
            # Parse the response text as JSON to look for continuation metadata
            response_data = json.loads(response_text)

            # Look for continuation_id in various places
            if isinstance(response_data, dict):
                # Check metadata
                metadata = response_data.get("metadata", {})
                if "thread_id" in metadata:
                    return metadata["thread_id"]

                # Check follow_up_request
                follow_up = response_data.get("follow_up_request", {})
                if follow_up and "continuation_id" in follow_up:
                    return follow_up["continuation_id"]

                # Check continuation_offer
                continuation_offer = response_data.get("continuation_offer", {})
                if continuation_offer and "continuation_id" in continuation_offer:
                    return continuation_offer["continuation_id"]

            self.logger.debug(f"No continuation_id found in response: {response_data}")
            return None

        except json.JSONDecodeError as e:
            self.logger.debug(f"Failed to parse response for continuation_id: {e}")
            return None

    def run_command(self, cmd: list[str], check: bool = True, capture_output: bool = False, **kwargs):
        """Run a shell command with logging"""
        if self.verbose:
            self.logger.debug(f"Running: {' '.join(cmd)}")

        return subprocess.run(cmd, check=check, capture_output=capture_output, **kwargs)

    def cleanup_test_files(self):
        """Clean up test files"""
        if hasattr(self, "test_dir") and self.test_dir and os.path.exists(self.test_dir):
            import shutil

            shutil.rmtree(self.test_dir)
            self.logger.debug(f"Removed test files directory: {self.test_dir}")

    def run_test(self) -> bool:
        """Run the test - to be implemented by subclasses"""
        raise NotImplementedError("Subclasses must implement run_test()")

    @property
    def test_name(self) -> str:
        """Get the test name - to be implemented by subclasses"""
        raise NotImplementedError("Subclasses must implement test_name property")

    @property
    def test_description(self) -> str:
        """Get the test description - to be implemented by subclasses"""
        raise NotImplementedError("Subclasses must implement test_description property")
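For reference, the stdin payload that call_mcp_tool builds is three newline-delimited JSON-RPC messages; for a chat call they look roughly like this (values abridged, following the code above):

{"jsonrpc": "2.0", "id": 1, "method": "initialize", "params": {"protocolVersion": "2024-11-05", "capabilities": {"tools": {}}, "clientInfo": {"name": "communication-simulator", "version": "1.0.0"}}}
{"jsonrpc": "2.0", "method": "notifications/initialized"}
{"jsonrpc": "2.0", "id": 2, "method": "tools/call", "params": {"name": "chat", "arguments": {"prompt": "..."}}}

The server's reply carrying "id": 2 is the one _parse_mcp_response extracts; any other lines on stdout are skipped.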
83 simulator_tests/test_basic_conversation.py (Normal file)
@@ -0,0 +1,83 @@
#!/usr/bin/env python3
"""
Basic Conversation Flow Test

Tests basic conversation continuity with the chat tool, including:
- Initial chat with file analysis
- Continuing conversation with same file (deduplication)
- Adding additional files to ongoing conversation
"""

from .base_test import BaseSimulatorTest


class BasicConversationTest(BaseSimulatorTest):
    """Test basic conversation flow with chat tool"""

    @property
    def test_name(self) -> str:
        return "basic_conversation"

    @property
    def test_description(self) -> str:
        return "Basic conversation flow with chat tool"

    def run_test(self) -> bool:
        """Test basic conversation flow with chat tool"""
        try:
            self.logger.info("📝 Test: Basic conversation flow")

            # Setup test files
            self.setup_test_files()

            # Initial chat tool call with file
            self.logger.info("  1.1: Initial chat with file analysis")
            response1, continuation_id = self.call_mcp_tool(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Analyze this Python code and explain what it does",
                    "files": [self.test_files["python"]],
                },
            )

            if not response1 or not continuation_id:
                self.logger.error("Failed to get initial response with continuation_id")
                return False

            self.logger.info(f"  ✅ Got continuation_id: {continuation_id}")

            # Continue conversation with same file (should be deduplicated)
            self.logger.info("  1.2: Continue conversation with same file")
            response2, _ = self.call_mcp_tool(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Now focus on the Calculator class specifically. Are there any improvements you'd suggest?",
                    "files": [self.test_files["python"]],  # Same file - should be deduplicated
                    "continuation_id": continuation_id,
                },
            )

            if not response2:
                self.logger.error("Failed to continue conversation")
                return False

            # Continue with additional file
            self.logger.info("  1.3: Continue conversation with additional file")
            response3, _ = self.call_mcp_tool(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Now also analyze this configuration file and see how it might relate to the Python code",
                    "files": [self.test_files["python"], self.test_files["config"]],
                    "continuation_id": continuation_id,
                },
            )

            if not response3:
                self.logger.error("Failed to continue with additional file")
                return False

            self.logger.info("  ✅ Basic conversation flow working")
            return True

        except Exception as e:
            self.logger.error(f"Basic conversation flow test failed: {e}")
            return False
        finally:
            self.cleanup_test_files()
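Assuming the gemini-mcp-server container is already running, a single test like this one can be exercised standalone; a sketch, not part of the commit:

from simulator_tests import BasicConversationTest

# Verbose mode enables DEBUG logging via BaseSimulatorTest.__init__.
test = BasicConversationTest(verbose=True)
print("PASS" if test.run_test() else "FAIL")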
177 simulator_tests/test_content_validation.py (Normal file)
@@ -0,0 +1,177 @@
#!/usr/bin/env python3
"""
Content Validation Test

Tests that tools don't duplicate file content in their responses.
This test is specifically designed to catch content duplication bugs.
"""

import json
import os

from .base_test import BaseSimulatorTest


class ContentValidationTest(BaseSimulatorTest):
    """Test that tools don't duplicate file content in their responses"""

    @property
    def test_name(self) -> str:
        return "content_validation"

    @property
    def test_description(self) -> str:
        return "Content validation and duplicate detection"

    def run_test(self) -> bool:
        """Test that tools don't duplicate file content in their responses"""
        try:
            self.logger.info("📄 Test: Content validation and duplicate detection")

            # Setup test files first
            self.setup_test_files()

            # Create a test file with distinctive content for validation
            validation_content = '''"""
Configuration file for content validation testing
This content should appear only ONCE in any tool response
"""

# Configuration constants
MAX_CONTENT_TOKENS = 800_000  # This line should appear exactly once
TEMPERATURE_ANALYTICAL = 0.2  # This should also appear exactly once
UNIQUE_VALIDATION_MARKER = "CONTENT_VALIDATION_TEST_12345"

# Database settings
DATABASE_CONFIG = {
    "host": "localhost",
    "port": 5432,
    "name": "validation_test_db"
}
'''

            validation_file = os.path.join(self.test_dir, "validation_config.py")
            with open(validation_file, "w") as f:
                f.write(validation_content)

            # Test 1: Precommit tool with files parameter (where the bug occurred)
            self.logger.info("  1: Testing precommit tool content duplication")

            # Call precommit tool with the validation file
            response1, thread_id = self.call_mcp_tool(
                "precommit",
                {
                    "path": os.getcwd(),
                    "files": [validation_file],
                    "original_request": "Test for content duplication in precommit tool",
                },
            )

            if response1:
                # Parse response and check for content duplication
                try:
                    response_data = json.loads(response1)
                    content = response_data.get("content", "")

                    # Count occurrences of distinctive markers
                    max_content_count = content.count("MAX_CONTENT_TOKENS = 800_000")
                    temp_analytical_count = content.count("TEMPERATURE_ANALYTICAL = 0.2")
                    unique_marker_count = content.count("UNIQUE_VALIDATION_MARKER")

                    # Validate no duplication
                    duplication_detected = False
                    issues = []

                    if max_content_count > 1:
                        issues.append(f"MAX_CONTENT_TOKENS appears {max_content_count} times")
                        duplication_detected = True

                    if temp_analytical_count > 1:
                        issues.append(f"TEMPERATURE_ANALYTICAL appears {temp_analytical_count} times")
                        duplication_detected = True

                    if unique_marker_count > 1:
                        issues.append(f"UNIQUE_VALIDATION_MARKER appears {unique_marker_count} times")
                        duplication_detected = True

                    if duplication_detected:
                        self.logger.error(f"  ❌ Content duplication detected in precommit tool: {'; '.join(issues)}")
                        return False
                    else:
                        self.logger.info("  ✅ No content duplication in precommit tool")

                except json.JSONDecodeError:
                    self.logger.warning("  ⚠️ Could not parse precommit response as JSON")

            else:
                self.logger.warning("  ⚠️ Precommit tool failed to respond")

            # Test 2: Other tools that use files parameter
            tools_to_test = [
                ("chat", {"prompt": "Please use low thinking mode. Analyze this config file", "files": [validation_file]}),
                ("codereview", {"files": [validation_file], "context": "Please use low thinking mode. Review this configuration"}),
                ("analyze", {"files": [validation_file], "analysis_type": "code_quality"})
            ]

            for tool_name, params in tools_to_test:
                self.logger.info(f"  2.{tool_name}: Testing {tool_name} tool content duplication")

                response, _ = self.call_mcp_tool(tool_name, params)
                if response:
                    try:
                        response_data = json.loads(response)
                        content = response_data.get("content", "")

                        # Check for duplication
                        marker_count = content.count("UNIQUE_VALIDATION_MARKER")
                        if marker_count > 1:
                            self.logger.error(f"  ❌ Content duplication in {tool_name}: marker appears {marker_count} times")
                            return False
                        else:
                            self.logger.info(f"  ✅ No content duplication in {tool_name}")

                    except json.JSONDecodeError:
                        self.logger.warning(f"  ⚠️ Could not parse {tool_name} response")
                else:
                    self.logger.warning(f"  ⚠️ {tool_name} tool failed to respond")

            # Test 3: Cross-tool content validation with file deduplication
            self.logger.info("  3: Testing cross-tool content consistency")

            if thread_id:
                # Continue conversation with same file - content should be deduplicated in conversation history
                response2, _ = self.call_mcp_tool(
                    "chat",
                    {
                        "prompt": "Please use low thinking mode. Continue analyzing this configuration file",
                        "files": [validation_file],  # Same file should be deduplicated
                        "continuation_id": thread_id,
                    },
                )

                if response2:
                    try:
                        response_data = json.loads(response2)
                        content = response_data.get("content", "")

                        # In continuation, the file content shouldn't be duplicated either
                        marker_count = content.count("UNIQUE_VALIDATION_MARKER")
                        if marker_count > 1:
                            self.logger.error(f"  ❌ Content duplication in cross-tool continuation: marker appears {marker_count} times")
                            return False
                        else:
                            self.logger.info("  ✅ No content duplication in cross-tool continuation")

                    except json.JSONDecodeError:
                        self.logger.warning("  ⚠️ Could not parse continuation response")

            # Cleanup
            os.remove(validation_file)

            self.logger.info("  ✅ All content validation tests passed")
            return True

        except Exception as e:
            self.logger.error(f"Content validation test failed: {e}")
            return False
        finally:
            self.cleanup_test_files()
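The marker-counting logic above repeats for each tool and could be factored into one helper. A sketch, assuming responses remain JSON objects with a "content" field:

import json

def find_duplicated_markers(response_text: str, markers: list[str]) -> list[str]:
    """Return the markers whose text occurs more than once in a response's content."""
    content = json.loads(response_text).get("content", "")
    return [m for m in markers if content.count(m) > 1]

An empty return value means no duplication, e.g. find_duplicated_markers(response1, ["UNIQUE_VALIDATION_MARKER"]) == [].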
196 simulator_tests/test_cross_tool_continuation.py (Normal file)
@@ -0,0 +1,196 @@
#!/usr/bin/env python3
"""
Cross-Tool Continuation Test

Tests comprehensive cross-tool continuation scenarios to ensure
conversation context is maintained when switching between different tools.
"""

from .base_test import BaseSimulatorTest


class CrossToolContinuationTest(BaseSimulatorTest):
    """Test comprehensive cross-tool continuation scenarios"""

    @property
    def test_name(self) -> str:
        return "cross_tool_continuation"

    @property
    def test_description(self) -> str:
        return "Cross-tool conversation continuation scenarios"

    def run_test(self) -> bool:
        """Test comprehensive cross-tool continuation scenarios"""
        try:
            self.logger.info("🔧 Test: Cross-tool continuation scenarios")

            # Setup test files
            self.setup_test_files()

            success_count = 0
            total_scenarios = 3

            # Scenario 1: chat -> thinkdeep -> codereview
            if self._test_chat_thinkdeep_codereview():
                success_count += 1

            # Scenario 2: analyze -> debug -> thinkdeep
            if self._test_analyze_debug_thinkdeep():
                success_count += 1

            # Scenario 3: Multi-file cross-tool continuation
            if self._test_multi_file_continuation():
                success_count += 1

            self.logger.info(f"  ✅ Cross-tool continuation scenarios completed: {success_count}/{total_scenarios} scenarios passed")

            # Consider successful if at least one scenario worked
            return success_count > 0

        except Exception as e:
            self.logger.error(f"Cross-tool continuation test failed: {e}")
            return False
        finally:
            self.cleanup_test_files()

    def _test_chat_thinkdeep_codereview(self) -> bool:
        """Test chat -> thinkdeep -> codereview scenario"""
        try:
            self.logger.info("  1: Testing chat -> thinkdeep -> codereview")

            # Start with chat
            chat_response, chat_id = self.call_mcp_tool(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Look at this Python code and tell me what you think about it",
                    "files": [self.test_files["python"]],
                },
            )

            if not chat_response or not chat_id:
                self.logger.error("Failed to start chat conversation")
                return False

            # Continue with thinkdeep
            thinkdeep_response, _ = self.call_mcp_tool(
                "thinkdeep",
                {
                    "prompt": "Please use low thinking mode. Think deeply about potential performance issues in this code",
                    "files": [self.test_files["python"]],  # Same file should be deduplicated
                    "continuation_id": chat_id,
                },
            )

            if not thinkdeep_response:
                self.logger.error("Failed chat -> thinkdeep continuation")
                return False

            # Continue with codereview
            codereview_response, _ = self.call_mcp_tool(
                "codereview",
                {
                    "files": [self.test_files["python"]],  # Same file should be deduplicated
                    "context": "Building on our previous analysis, provide a comprehensive code review",
                    "continuation_id": chat_id,
                },
            )

            if not codereview_response:
                self.logger.error("Failed thinkdeep -> codereview continuation")
                return False

            self.logger.info("  ✅ chat -> thinkdeep -> codereview working")
            return True

        except Exception as e:
            self.logger.error(f"Chat -> thinkdeep -> codereview scenario failed: {e}")
            return False

    def _test_analyze_debug_thinkdeep(self) -> bool:
        """Test analyze -> debug -> thinkdeep scenario"""
        try:
            self.logger.info("  2: Testing analyze -> debug -> thinkdeep")

            # Start with analyze
            analyze_response, analyze_id = self.call_mcp_tool(
                "analyze", {"files": [self.test_files["python"]], "analysis_type": "code_quality"}
            )

            if not analyze_response or not analyze_id:
                self.logger.warning("Failed to start analyze conversation, skipping scenario 2")
                return False

            # Continue with debug
            debug_response, _ = self.call_mcp_tool(
                "debug",
                {
                    "files": [self.test_files["python"]],  # Same file should be deduplicated
                    "issue_description": "Based on our analysis, help debug the performance issue in fibonacci",
                    "continuation_id": analyze_id,
                },
            )

            if not debug_response:
                self.logger.warning("  ⚠️ analyze -> debug continuation failed")
                return False

            # Continue with thinkdeep
            final_response, _ = self.call_mcp_tool(
                "thinkdeep",
                {
                    "prompt": "Please use low thinking mode. Think deeply about the architectural implications of the issues we've found",
                    "files": [self.test_files["python"]],  # Same file should be deduplicated
                    "continuation_id": analyze_id,
                },
            )

            if not final_response:
                self.logger.warning("  ⚠️ debug -> thinkdeep continuation failed")
                return False

            self.logger.info("  ✅ analyze -> debug -> thinkdeep working")
            return True

        except Exception as e:
            self.logger.error(f"Analyze -> debug -> thinkdeep scenario failed: {e}")
            return False

    def _test_multi_file_continuation(self) -> bool:
        """Test multi-file cross-tool continuation"""
        try:
            self.logger.info("  3: Testing multi-file cross-tool continuation")

            # Start with both files
            multi_response, multi_id = self.call_mcp_tool(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Analyze both the Python code and configuration file",
                    "files": [self.test_files["python"], self.test_files["config"]],
                },
            )

            if not multi_response or not multi_id:
                self.logger.warning("Failed to start multi-file conversation, skipping scenario 3")
                return False

            # Switch to codereview with same files (should use conversation history)
            multi_review, _ = self.call_mcp_tool(
                "codereview",
                {
                    "files": [self.test_files["python"], self.test_files["config"]],  # Same files
                    "context": "Review both files in the context of our previous discussion",
                    "continuation_id": multi_id,
                },
            )

            if not multi_review:
                self.logger.warning("  ⚠️ Multi-file cross-tool continuation failed")
                return False

            self.logger.info("  ✅ Multi-file cross-tool continuation working")
            return True

        except Exception as e:
            self.logger.error(f"Multi-file continuation scenario failed: {e}")
            return False
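All three scenarios share one shape: the first tool call opens a thread, and every later call reuses its continuation_id. New scenarios could therefore be expressed as data; a hypothetical sketch (run_scenario is not part of this commit):

def run_scenario(test, steps):
    """steps is a list of (tool_name, params) tuples; the first step opens the thread."""
    thread_id = None
    for tool_name, params in steps:
        if thread_id:
            params = {**params, "continuation_id": thread_id}
        response, new_id = test.call_mcp_tool(tool_name, params)
        if not response:
            return False
        thread_id = thread_id or new_id
    return True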
99 simulator_tests/test_logs_validation.py (Normal file)
@@ -0,0 +1,99 @@
#!/usr/bin/env python3
"""
Docker Logs Validation Test

Validates Docker logs to confirm file deduplication behavior and
conversation threading is working properly.
"""

from .base_test import BaseSimulatorTest


class LogsValidationTest(BaseSimulatorTest):
    """Validate Docker logs to confirm file deduplication behavior"""

    @property
    def test_name(self) -> str:
        return "logs_validation"

    @property
    def test_description(self) -> str:
        return "Docker logs validation"

    def run_test(self) -> bool:
        """Validate Docker logs to confirm file deduplication behavior"""
        try:
            self.logger.info("📋 Test: Validating Docker logs for file deduplication...")

            # Get server logs from both main container and activity logs
            result = self.run_command(["docker", "logs", self.container_name], capture_output=True)

            if result.returncode != 0:
                self.logger.error(f"Failed to get Docker logs: {result.stderr}")
                return False

            main_logs = result.stdout.decode() + result.stderr.decode()

            # Also get activity logs for more detailed conversation tracking
            activity_result = self.run_command(
                ["docker", "exec", self.container_name, "cat", "/tmp/mcp_activity.log"], capture_output=True
            )

            activity_logs = ""
            if activity_result.returncode == 0:
                activity_logs = activity_result.stdout.decode()

            logs = main_logs + "\n" + activity_logs

            # Look for conversation threading patterns that indicate the system is working
            conversation_patterns = [
                "CONVERSATION_RESUME",
                "CONVERSATION_CONTEXT",
                "previous turns loaded",
                "tool embedding",
                "files included",
                "files truncated",
                "already in conversation history",
            ]

            conversation_lines = []
            for line in logs.split("\n"):
                for pattern in conversation_patterns:
                    if pattern.lower() in line.lower():
                        conversation_lines.append(line.strip())
                        break

            # Look for evidence of conversation threading and file handling
            conversation_threading_found = False
            multi_turn_conversations = False

            for line in conversation_lines:
                lower_line = line.lower()
                if "conversation_resume" in lower_line:
                    conversation_threading_found = True
                    self.logger.debug(f"📄 Conversation threading: {line}")
                elif "previous turns loaded" in lower_line:
                    multi_turn_conversations = True
                    self.logger.debug(f"📄 Multi-turn conversation: {line}")
                elif "already in conversation" in lower_line:
                    self.logger.info(f"✅ Found explicit deduplication: {line}")
                    return True

            # Conversation threading with multiple turns is evidence of file deduplication working
            if conversation_threading_found and multi_turn_conversations:
                self.logger.info("✅ Conversation threading with multi-turn context working")
                self.logger.info(
                    "✅ File deduplication working implicitly (files embedded once in conversation history)"
                )
                return True
            elif conversation_threading_found:
                self.logger.info("✅ Conversation threading detected")
                return True
            else:
                self.logger.warning("⚠️ No clear evidence of conversation threading in logs")
                self.logger.debug(f"Found {len(conversation_lines)} conversation-related log lines")
                return False

        except Exception as e:
            self.logger.error(f"Log validation failed: {e}")
            return False
101 simulator_tests/test_per_tool_deduplication.py (Normal file)
@@ -0,0 +1,101 @@
#!/usr/bin/env python3
"""
Per-Tool File Deduplication Test

Tests file deduplication for each individual MCP tool to ensure
that files are properly deduplicated within single-tool conversations.
"""

from .base_test import BaseSimulatorTest


class PerToolDeduplicationTest(BaseSimulatorTest):
    """Test file deduplication for each individual tool"""

    @property
    def test_name(self) -> str:
        return "per_tool_deduplication"

    @property
    def test_description(self) -> str:
        return "File deduplication for individual tools"

    def run_test(self) -> bool:
        """Test file deduplication for each individual tool"""
        try:
            self.logger.info("📄 Test: Per-tool file deduplication")

            # Setup test files
            self.setup_test_files()

            tools_to_test = [
                (
                    "thinkdeep",
                    {
                        "prompt": "Please use low thinking mode. Think deeply about this Python code and identify potential architectural improvements",
                        "files": [self.test_files["python"]],
                    },
                ),
                ("analyze", {"files": [self.test_files["python"]], "analysis_type": "architecture"}),
                (
                    "debug",
                    {
                        "files": [self.test_files["python"]],
                        "issue_description": "The fibonacci function seems slow for large numbers",
                    },
                ),
                (
                    "codereview",
                    {
                        "files": [self.test_files["python"]],
                        "context": "General code review for quality and best practices",
                    },
                ),
            ]

            successful_tests = 0
            total_tests = len(tools_to_test)

            for tool_name, initial_params in tools_to_test:
                self.logger.info(f"  {tool_name}: Testing {tool_name} tool file deduplication")

                # Initial call
                response1, continuation_id = self.call_mcp_tool(tool_name, initial_params)
                if not response1:
                    self.logger.warning(f"  ⚠️ {tool_name} tool initial call failed, skipping")
                    continue

                if not continuation_id:
                    self.logger.warning(f"  ⚠️ {tool_name} tool didn't provide continuation_id, skipping")
                    continue

                # Continue with same file - should be deduplicated
                continue_params = initial_params.copy()
                continue_params["continuation_id"] = continuation_id

                if tool_name == "thinkdeep":
                    continue_params["prompt"] = "Please use low thinking mode. Now focus specifically on the recursive fibonacci implementation"
                elif tool_name == "analyze":
                    continue_params["analysis_type"] = "performance"
                elif tool_name == "debug":
                    continue_params["issue_description"] = "How can we optimize the fibonacci function?"
                elif tool_name == "codereview":
                    continue_params["context"] = "Focus on the Calculator class implementation"

                response2, _ = self.call_mcp_tool(tool_name, continue_params)
                if response2:
                    self.logger.info(f"  ✅ {tool_name} tool file deduplication working")
                    successful_tests += 1
                else:
                    self.logger.warning(f"  ⚠️ {tool_name} tool continuation failed")

            self.logger.info(f"  ✅ Per-tool file deduplication tests completed: {successful_tests}/{total_tests} tools passed")

            # Consider test successful if at least one tool worked
            return successful_tests > 0

        except Exception as e:
            self.logger.error(f"Per-tool file deduplication test failed: {e}")
            return False
        finally:
            self.cleanup_test_files()
134 simulator_tests/test_redis_validation.py (Normal file)
@@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""
Redis Conversation Memory Validation Test

Validates that conversation memory is working via Redis by checking
for stored conversation threads and their content.
"""

import json

from .base_test import BaseSimulatorTest


class RedisValidationTest(BaseSimulatorTest):
    """Validate that conversation memory is working via Redis"""

    @property
    def test_name(self) -> str:
        return "redis_validation"

    @property
    def test_description(self) -> str:
        return "Redis conversation memory validation"

    def run_test(self) -> bool:
        """Validate that conversation memory is working via Redis"""
        try:
            self.logger.info("💾 Test: Validating conversation memory via Redis...")

            # First, test Redis connectivity
            ping_result = self.run_command(
                ["docker", "exec", self.redis_container, "redis-cli", "ping"], capture_output=True
            )

            if ping_result.returncode != 0:
                self.logger.error("Failed to connect to Redis")
                return False

            if "PONG" not in ping_result.stdout.decode():
                self.logger.error("Redis ping failed")
                return False

            self.logger.info("✅ Redis connectivity confirmed")

            # Check Redis for stored conversations
            result = self.run_command(
                ["docker", "exec", self.redis_container, "redis-cli", "KEYS", "thread:*"], capture_output=True
            )

            if result.returncode != 0:
                self.logger.error("Failed to query Redis")
                return False

            keys = result.stdout.decode().strip().split("\n")
            thread_keys = [k for k in keys if k.startswith("thread:") and k != "thread:*"]

            if thread_keys:
                self.logger.info(f"✅ Found {len(thread_keys)} conversation threads in Redis")

                # Get details of first thread
                thread_key = thread_keys[0]
                result = self.run_command(
                    ["docker", "exec", self.redis_container, "redis-cli", "GET", thread_key], capture_output=True
                )

                if result.returncode == 0:
                    thread_data = result.stdout.decode()
                    try:
                        parsed = json.loads(thread_data)
                        turns = parsed.get("turns", [])
                        self.logger.info(f"✅ Thread has {len(turns)} turns")
                        return True
                    except json.JSONDecodeError:
                        self.logger.warning("Could not parse thread data")

                return True
            else:
                # If no existing threads, create a test thread to validate Redis functionality
                self.logger.info("📝 No existing threads found, creating test thread to validate Redis...")

                test_thread_id = "test_thread_validation"
                test_data = {
                    "thread_id": test_thread_id,
                    "turns": [
                        {
                            "tool": "chat",
                            "timestamp": "2025-06-11T16:30:00Z",
                            "prompt": "Test validation prompt"
                        }
                    ]
                }

                # Store test data
                store_result = self.run_command([
                    "docker", "exec", self.redis_container, "redis-cli",
                    "SET", f"thread:{test_thread_id}", json.dumps(test_data)
                ], capture_output=True)

                if store_result.returncode != 0:
                    self.logger.error("Failed to store test data in Redis")
                    return False

                # Retrieve test data
                retrieve_result = self.run_command([
                    "docker", "exec", self.redis_container, "redis-cli",
                    "GET", f"thread:{test_thread_id}"
                ], capture_output=True)

                if retrieve_result.returncode != 0:
                    self.logger.error("Failed to retrieve test data from Redis")
                    return False

                retrieved_data = retrieve_result.stdout.decode()
                try:
                    parsed = json.loads(retrieved_data)
                    if parsed.get("thread_id") == test_thread_id:
                        self.logger.info("✅ Redis read/write validation successful")

                        # Clean up test data
                        self.run_command([
                            "docker", "exec", self.redis_container, "redis-cli",
                            "DEL", f"thread:{test_thread_id}"
                        ], capture_output=True)

                        return True
                    else:
                        self.logger.error("Retrieved data doesn't match stored data")
                        return False
                except json.JSONDecodeError:
                    self.logger.error("Could not parse retrieved test data")
                    return False

        except Exception as e:
            self.logger.error(f"Conversation memory validation failed: {e}")
            return False
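The same inspection can be done from the host with the redis-py client instead of docker exec, assuming the Redis container publishes port 6379 on localhost (both the port mapping and the pip package are assumptions about the setup):

import json

import redis  # pip install redis

r = redis.Redis(host="localhost", port=6379, decode_responses=True)
for key in r.keys("thread:*"):
    thread = json.loads(r.get(key))  # threads are stored as JSON strings
    print(key, "turns:", len(thread.get("turns", [])))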
test_simulation_files/test_module.py
@@ -2,14 +2,12 @@
 Sample Python module for testing MCP conversation continuity
 """
 
-
 def fibonacci(n):
     """Calculate fibonacci number recursively"""
     if n <= 1:
         return n
     return fibonacci(n-1) + fibonacci(n-2)
 
-
 def factorial(n):
     """Calculate factorial iteratively"""
     result = 1
@@ -17,7 +15,6 @@ def factorial(n):
         result *= i
     return result
 
-
 class Calculator:
     """Simple calculator class"""
 
16 test_simulation_files/validation_config.py (Normal file)
@@ -0,0 +1,16 @@
"""
Configuration file for content validation testing
This content should appear only ONCE in any tool response
"""

# Configuration constants
MAX_CONTENT_TOKENS = 800_000  # This line should appear exactly once
TEMPERATURE_ANALYTICAL = 0.2  # This should also appear exactly once
UNIQUE_VALIDATION_MARKER = "CONTENT_VALIDATION_TEST_12345"

# Database settings
DATABASE_CONFIG = {
    "host": "localhost",
    "port": 5432,
    "name": "validation_test_db"
}
261 tests/test_precommit_with_mock_store.py (Normal file)
@@ -0,0 +1,261 @@
"""
Enhanced tests for precommit tool using mock storage to test real logic
"""

import json
import os
import tempfile
from typing import Any, Dict, Optional
from unittest.mock import MagicMock, Mock, patch

import pytest

from tools.precommit import Precommit, PrecommitRequest


class MockRedisClient:
    """Mock Redis client that uses in-memory dictionary storage"""

    def __init__(self):
        self.data: Dict[str, str] = {}
        self.ttl_data: Dict[str, int] = {}

    def get(self, key: str) -> Optional[str]:
        return self.data.get(key)

    def set(self, key: str, value: str, ex: Optional[int] = None) -> bool:
        self.data[key] = value
        if ex:
            self.ttl_data[key] = ex
        return True

    def delete(self, key: str) -> int:
        if key in self.data:
            del self.data[key]
            self.ttl_data.pop(key, None)
            return 1
        return 0

    def exists(self, key: str) -> int:
        return 1 if key in self.data else 0


class TestPrecommitToolWithMockStore:
    """Test precommit tool with mock storage to validate actual logic"""

    @pytest.fixture
    def mock_redis(self):
        """Create mock Redis client"""
        return MockRedisClient()

    @pytest.fixture
    def tool(self, mock_redis):
        """Create tool instance with mocked Redis"""
        tool = Precommit()

        # Mock the Redis client getter to return our mock
        with patch('utils.conversation_memory.get_redis_client', return_value=mock_redis):
            yield tool

    @pytest.fixture
    def temp_repo(self):
        """Create a temporary git repository with test files"""
        import subprocess

        temp_dir = tempfile.mkdtemp()

        # Initialize git repo
        subprocess.run(['git', 'init'], cwd=temp_dir, capture_output=True)
        subprocess.run(['git', 'config', 'user.name', 'Test'], cwd=temp_dir, capture_output=True)
        subprocess.run(['git', 'config', 'user.email', 'test@example.com'], cwd=temp_dir, capture_output=True)

        # Create test config file
        config_content = '''"""Test configuration file"""

# Version and metadata
__version__ = "1.0.0"
__author__ = "Test"

# Configuration
MAX_CONTENT_TOKENS = 800_000  # 800K tokens for content
TEMPERATURE_ANALYTICAL = 0.2  # For code review, debugging
'''

        config_path = os.path.join(temp_dir, 'config.py')
        with open(config_path, 'w') as f:
            f.write(config_content)

        # Add and commit initial version
        subprocess.run(['git', 'add', '.'], cwd=temp_dir, capture_output=True)
        subprocess.run(['git', 'commit', '-m', 'Initial commit'], cwd=temp_dir, capture_output=True)

        # Modify config to create a diff
        modified_content = config_content + '\nNEW_SETTING = "test"  # Added setting\n'
        with open(config_path, 'w') as f:
            f.write(modified_content)

        yield temp_dir, config_path

        # Cleanup
        import shutil
        shutil.rmtree(temp_dir)

    @pytest.mark.asyncio
    async def test_no_duplicate_file_content_in_prompt(self, tool, temp_repo, mock_redis):
        """Test that file content doesn't appear twice in the generated prompt"""
        temp_dir, config_path = temp_repo

        # Create request with files parameter
        request = PrecommitRequest(
            path=temp_dir,
            files=[config_path],
            original_request="Test configuration changes",
        )

        # Generate the prompt
        prompt = await tool.prepare_prompt(request)

        # Test that MAX_CONTENT_TOKENS only appears once in the entire prompt
        max_content_count = prompt.count('MAX_CONTENT_TOKENS = 800_000')
        assert max_content_count == 1, f"MAX_CONTENT_TOKENS appears {max_content_count} times (should be 1)"

        # Test that the config file content only appears once
        config_content_count = prompt.count('# Configuration')
        assert config_content_count == 1, f"Config file content appears {config_content_count} times (should be 1)"

        # Verify expected sections are present
        assert "## Original Request" in prompt
        assert "Test configuration changes" in prompt
        assert "## Additional Context Files" in prompt
        assert "## Git Diffs" in prompt

    @pytest.mark.asyncio
    async def test_conversation_memory_integration(self, tool, temp_repo, mock_redis):
        """Test that conversation memory works with mock storage"""
        temp_dir, config_path = temp_repo

        # Mock conversation memory functions to use our mock redis
        with patch('utils.conversation_memory.get_redis_client', return_value=mock_redis):
            # First request - should embed file content
            request1 = PrecommitRequest(
                path=temp_dir,
                files=[config_path],
                original_request="First review",
            )

            # Simulate conversation thread creation
            from utils.conversation_memory import create_thread, add_turn
            thread_id = create_thread("precommit", {"files": [config_path]})

            # Test that file embedding works
            files_to_embed = tool.filter_new_files([config_path], None)
            assert config_path in files_to_embed, "New conversation should embed all files"

            # Add a turn to the conversation
            add_turn(thread_id, "assistant", "First response", files=[config_path], tool_name="precommit")

            # Second request with continuation - should skip already embedded files
            request2 = PrecommitRequest(
                path=temp_dir,
                files=[config_path],
                continuation_id=thread_id,
                original_request="Follow-up review",
            )

            files_to_embed_2 = tool.filter_new_files([config_path], thread_id)
            assert len(files_to_embed_2) == 0, "Continuation should skip already embedded files"

    @pytest.mark.asyncio
    async def test_prompt_structure_integrity(self, tool, temp_repo, mock_redis):
        """Test that the prompt structure is well-formed and doesn't have content duplication"""
        temp_dir, config_path = temp_repo

        request = PrecommitRequest(
            path=temp_dir,
            files=[config_path],
            original_request="Validate prompt structure",
            review_type="full",
            severity_filter="high",
        )

        prompt = await tool.prepare_prompt(request)

        # Split prompt into sections
        sections = {
            "original_request": "## Original Request",
            "review_parameters": "## Review Parameters",
            "repo_summary": "## Repository Changes Summary",
            "context_files_summary": "## Context Files Summary",
            "git_diffs": "## Git Diffs",
            "additional_context": "## Additional Context Files",
            "review_instructions": "## Review Instructions",
        }

        section_indices = {}
        for name, header in sections.items():
            index = prompt.find(header)
            if index != -1:
                section_indices[name] = index

        # Verify sections appear in logical order
        assert section_indices["original_request"] < section_indices["review_parameters"]
        assert section_indices["review_parameters"] < section_indices["repo_summary"]
        assert section_indices["git_diffs"] < section_indices["additional_context"]
        assert section_indices["additional_context"] < section_indices["review_instructions"]

        # Test that file content only appears in Additional Context section
        file_content_start = section_indices["additional_context"]
        file_content_end = section_indices["review_instructions"]

        file_section = prompt[file_content_start:file_content_end]
        before_file_section = prompt[:file_content_start]
        after_file_section = prompt[file_content_end:]

        # MAX_CONTENT_TOKENS should only appear in the file section
        assert 'MAX_CONTENT_TOKENS' in file_section
        assert 'MAX_CONTENT_TOKENS' not in before_file_section
        assert 'MAX_CONTENT_TOKENS' not in after_file_section

    @pytest.mark.asyncio
    async def test_file_content_formatting(self, tool, temp_repo, mock_redis):
        """Test that file content is properly formatted without duplication"""
        temp_dir, config_path = temp_repo

        # Test the centralized file preparation method directly
        file_content = tool._prepare_file_content_for_prompt(
            [config_path],
            None,  # No continuation
            "Test files",
            max_tokens=100000,
            reserve_tokens=1000,
        )

        # Should contain file markers
        assert "--- BEGIN FILE:" in file_content
        assert "--- END FILE:" in file_content
        assert "config.py" in file_content

        # Should contain actual file content
        assert "MAX_CONTENT_TOKENS = 800_000" in file_content
        assert "__version__ = \"1.0.0\"" in file_content

        # Content should appear only once
        assert file_content.count("MAX_CONTENT_TOKENS = 800_000") == 1
        assert file_content.count("__version__ = \"1.0.0\"") == 1


def test_mock_redis_basic_operations():
    """Test that our mock Redis implementation works correctly"""
    mock_redis = MockRedisClient()

    # Test basic operations
    assert mock_redis.get("nonexistent") is None
    assert mock_redis.exists("nonexistent") == 0

    mock_redis.set("test_key", "test_value")
    assert mock_redis.get("test_key") == "test_value"
    assert mock_redis.exists("test_key") == 1

    assert mock_redis.delete("test_key") == 1
    assert mock_redis.get("test_key") is None
    assert mock_redis.delete("test_key") == 0  # Already deleted
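MockRedisClient is reusable by any test that touches conversation memory; the patching pattern is always the same. A sketch (the test body is illustrative):

from unittest.mock import patch

def test_some_conversation_logic():
    mock_redis = MockRedisClient()
    # While the patch is active, code that calls get_redis_client() receives
    # the in-memory mock, so the test runs without a live Redis instance.
    with patch("utils.conversation_memory.get_redis_client", return_value=mock_redis):
        mock_redis.set("thread:demo", "{}")
        assert mock_redis.exists("thread:demo") == 1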
tools/precommit.py
@@ -10,7 +10,7 @@ from pydantic import Field
 
 from config import MAX_CONTEXT_TOKENS
 from prompts.tool_prompts import PRECOMMIT_PROMPT
-from utils.file_utils import translate_file_paths, translate_path_for_environment
+from utils.file_utils import read_files, translate_file_paths, translate_path_for_environment
 from utils.git_utils import find_git_repositories, get_git_status, run_git_command
 from utils.token_utils import estimate_tokens
 
@@ -304,7 +304,7 @@ class Precommit(BaseTool):
             request.continuation_id,
             "Context files",
             max_tokens=remaining_tokens + 1000,  # Add back the reserve that was calculated
-            reserve_tokens=1000,  # Small reserve for formatting
+            reserve_tokens=1000  # Small reserve for formatting
         )
 
         if file_content: