Add extensive live-simulation tests to validate that conversation continuation and preservation work across requests

Fahad
2025-06-11 17:03:09 +04:00
parent ac763e0213
commit c90ac7561e
14 changed files with 3612 additions and 1420 deletions

communication_simulator_test.py: 1668 lines changed (Executable file → Normal file)

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,35 @@
"""
Communication Simulator Tests Package
This package contains individual test modules for the Gemini MCP Communication Simulator.
Each test is in its own file for better organization and maintainability.
"""
from .base_test import BaseSimulatorTest
from .test_basic_conversation import BasicConversationTest
from .test_content_validation import ContentValidationTest
from .test_per_tool_deduplication import PerToolDeduplicationTest
from .test_cross_tool_continuation import CrossToolContinuationTest
from .test_logs_validation import LogsValidationTest
from .test_redis_validation import RedisValidationTest
# Test registry for dynamic loading
TEST_REGISTRY = {
"basic_conversation": BasicConversationTest,
"content_validation": ContentValidationTest,
"per_tool_deduplication": PerToolDeduplicationTest,
"cross_tool_continuation": CrossToolContinuationTest,
"logs_validation": LogsValidationTest,
"redis_validation": RedisValidationTest,
}
__all__ = [
'BaseSimulatorTest',
'BasicConversationTest',
'ContentValidationTest',
'PerToolDeduplicationTest',
'CrossToolContinuationTest',
'LogsValidationTest',
'RedisValidationTest',
'TEST_REGISTRY'
]
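
TEST_REGISTRY enables name-based selection of tests. A minimal sketch of how a runner might consume it follows; the package name, the run_selected helper, and the CLI flags are assumptions for illustration, not part of this commit:

import argparse

from simulator_tests import TEST_REGISTRY  # package name assumed for illustration

def run_selected(test_names, verbose=False):
    """Run the named simulator tests and collect their pass/fail results."""
    results = {}
    for name in test_names:
        test_cls = TEST_REGISTRY[name]  # raises KeyError for an unknown test name
        results[name] = test_cls(verbose=verbose).run_test()
    return results

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run communication simulator tests by name")
    parser.add_argument("--tests", nargs="*", default=list(TEST_REGISTRY))
    parser.add_argument("--verbose", action="store_true")
    args = parser.parse_args()
    print(run_selected(args.tests, verbose=args.verbose))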

@@ -0,0 +1,255 @@
#!/usr/bin/env python3
"""
Base Test Class for Communication Simulator Tests
Provides common functionality and utilities for all simulator tests.
"""
import json
import logging
import os
import subprocess
import tempfile
import time
from typing import Optional, Tuple
class BaseSimulatorTest:
"""Base class for all communication simulator tests"""
def __init__(self, verbose: bool = False):
self.verbose = verbose
self.test_files = {}
self.test_dir = None
self.container_name = "gemini-mcp-server"
self.redis_container = "gemini-mcp-redis"
# Configure logging
log_level = logging.DEBUG if verbose else logging.INFO
logging.basicConfig(level=log_level, format="%(asctime)s - %(levelname)s - %(message)s")
self.logger = logging.getLogger(self.__class__.__name__)
def setup_test_files(self):
"""Create test files for the simulation"""
# Test Python file
python_content = '''"""
Sample Python module for testing MCP conversation continuity
"""
def fibonacci(n):
"""Calculate fibonacci number recursively"""
if n <= 1:
return n
return fibonacci(n-1) + fibonacci(n-2)
def factorial(n):
"""Calculate factorial iteratively"""
result = 1
for i in range(1, n + 1):
result *= i
return result
class Calculator:
"""Simple calculator class"""
def __init__(self):
self.history = []
def add(self, a, b):
result = a + b
self.history.append(f"{a} + {b} = {result}")
return result
def multiply(self, a, b):
result = a * b
self.history.append(f"{a} * {b} = {result}")
return result
'''
# Test configuration file
config_content = """{
"database": {
"host": "localhost",
"port": 5432,
"name": "testdb",
"ssl": true
},
"cache": {
"redis_url": "redis://localhost:6379",
"ttl": 3600
},
"logging": {
"level": "INFO",
"format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
}
}"""
# Create files in the current project directory
current_dir = os.getcwd()
self.test_dir = os.path.join(current_dir, "test_simulation_files")
os.makedirs(self.test_dir, exist_ok=True)
test_py = os.path.join(self.test_dir, "test_module.py")
test_config = os.path.join(self.test_dir, "config.json")
with open(test_py, "w") as f:
f.write(python_content)
with open(test_config, "w") as f:
f.write(config_content)
self.test_files = {"python": test_py, "config": test_config}
self.logger.debug(f"Created test files: {list(self.test_files.values())}")
def call_mcp_tool(self, tool_name: str, params: dict) -> Tuple[Optional[str], Optional[str]]:
"""Call an MCP tool via Claude CLI (docker exec)"""
try:
# Prepare the MCP initialization and tool call sequence
init_request = {
"jsonrpc": "2.0",
"id": 1,
"method": "initialize",
"params": {
"protocolVersion": "2024-11-05",
"capabilities": {"tools": {}},
"clientInfo": {"name": "communication-simulator", "version": "1.0.0"},
},
}
# Send initialized notification
initialized_notification = {"jsonrpc": "2.0", "method": "notifications/initialized"}
# Prepare the tool call request
tool_request = {
"jsonrpc": "2.0",
"id": 2,
"method": "tools/call",
"params": {"name": tool_name, "arguments": params},
}
# Combine all messages
messages = [json.dumps(init_request), json.dumps(initialized_notification), json.dumps(tool_request)]
# Join with newlines as MCP expects
input_data = "\n".join(messages) + "\n"
# Simulate Claude CLI calling the MCP server via docker exec
docker_cmd = ["docker", "exec", "-i", self.container_name, "python", "server.py"]
self.logger.debug(f"Calling MCP tool {tool_name} with proper initialization")
# Execute the command
result = subprocess.run(
docker_cmd, input=input_data, text=True, capture_output=True, timeout=300 # 5 minute timeout
)
if result.returncode != 0:
self.logger.error(f"Docker exec failed: {result.stderr}")
return None, None
# Parse the response - look for the tool call response
response_data = self._parse_mcp_response(result.stdout, expected_id=2)
if not response_data:
return None, None
# Extract continuation_id if present
continuation_id = self._extract_continuation_id(response_data)
return response_data, continuation_id
except subprocess.TimeoutExpired:
self.logger.error(f"MCP tool call timed out: {tool_name}")
return None, None
except Exception as e:
self.logger.error(f"MCP tool call failed: {e}")
return None, None
def _parse_mcp_response(self, stdout: str, expected_id: int = 2) -> Optional[str]:
"""Parse MCP JSON-RPC response from stdout"""
try:
lines = stdout.strip().split("\n")
for line in lines:
if line.strip() and line.startswith("{"):
response = json.loads(line)
# Look for the tool call response with the expected ID
if response.get("id") == expected_id and "result" in response:
# Extract the actual content from the response
result = response["result"]
# Handle new response format with 'content' array
if isinstance(result, dict) and "content" in result:
content_array = result["content"]
if isinstance(content_array, list) and len(content_array) > 0:
return content_array[0].get("text", "")
# Handle legacy format
elif isinstance(result, list) and len(result) > 0:
return result[0].get("text", "")
elif response.get("id") == expected_id and "error" in response:
self.logger.error(f"MCP error: {response['error']}")
return None
# If we get here, log all responses for debugging
self.logger.warning(f"No valid tool call response found for ID {expected_id}")
self.logger.debug(f"Full stdout: {stdout}")
return None
except json.JSONDecodeError as e:
self.logger.error(f"Failed to parse MCP response: {e}")
self.logger.debug(f"Stdout that failed to parse: {stdout}")
return None
def _extract_continuation_id(self, response_text: str) -> Optional[str]:
"""Extract continuation_id from response metadata"""
try:
# Parse the response text as JSON to look for continuation metadata
response_data = json.loads(response_text)
# Look for continuation_id in various places
if isinstance(response_data, dict):
# Check metadata
metadata = response_data.get("metadata", {})
if "thread_id" in metadata:
return metadata["thread_id"]
# Check follow_up_request
follow_up = response_data.get("follow_up_request", {})
if follow_up and "continuation_id" in follow_up:
return follow_up["continuation_id"]
# Check continuation_offer
continuation_offer = response_data.get("continuation_offer", {})
if continuation_offer and "continuation_id" in continuation_offer:
return continuation_offer["continuation_id"]
self.logger.debug(f"No continuation_id found in response: {response_data}")
return None
except json.JSONDecodeError as e:
self.logger.debug(f"Failed to parse response for continuation_id: {e}")
return None
def run_command(self, cmd: list[str], check: bool = True, capture_output: bool = False, **kwargs):
"""Run a shell command with logging"""
if self.verbose:
self.logger.debug(f"Running: {' '.join(cmd)}")
return subprocess.run(cmd, check=check, capture_output=capture_output, **kwargs)
def cleanup_test_files(self):
"""Clean up test files"""
if hasattr(self, "test_dir") and self.test_dir and os.path.exists(self.test_dir):
import shutil
shutil.rmtree(self.test_dir)
self.logger.debug(f"Removed test files directory: {self.test_dir}")
def run_test(self) -> bool:
"""Run the test - to be implemented by subclasses"""
raise NotImplementedError("Subclasses must implement run_test()")
@property
def test_name(self) -> str:
"""Get the test name - to be implemented by subclasses"""
raise NotImplementedError("Subclasses must implement test_name property")
@property
def test_description(self) -> str:
"""Get the test description - to be implemented by subclasses"""
raise NotImplementedError("Subclasses must implement test_description property")
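
For reference, a minimal sketch of the JSON-RPC reply shape that _parse_mcp_response and _extract_continuation_id are written against; the field values are illustrative, only the structure mirrors the parsing logic above:

import json

# One line of server stdout as the parser expects it: a tools/call result whose
# first content block carries the tool output as JSON text. Values are made up.
inner_payload = {
    "content": "...model output...",
    "continuation_offer": {"continuation_id": "example-thread-id"},
}
example_stdout_line = json.dumps({
    "jsonrpc": "2.0",
    "id": 2,  # matches expected_id for the tools/call request
    "result": {"content": [{"type": "text", "text": json.dumps(inner_payload)}]},
})
# _parse_mcp_response returns the inner "text" string; _extract_continuation_id then
# re-parses it and picks up continuation_offer.continuation_id (or metadata.thread_id).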

@@ -0,0 +1,83 @@
#!/usr/bin/env python3
"""
Basic Conversation Flow Test
Tests basic conversation continuity with the chat tool, including:
- Initial chat with file analysis
- Continuing conversation with same file (deduplication)
- Adding additional files to ongoing conversation
"""
from .base_test import BaseSimulatorTest
class BasicConversationTest(BaseSimulatorTest):
"""Test basic conversation flow with chat tool"""
@property
def test_name(self) -> str:
return "basic_conversation"
@property
def test_description(self) -> str:
return "Basic conversation flow with chat tool"
def run_test(self) -> bool:
"""Test basic conversation flow with chat tool"""
try:
self.logger.info("📝 Test: Basic conversation flow")
# Setup test files
self.setup_test_files()
# Initial chat tool call with file
self.logger.info(" 1.1: Initial chat with file analysis")
response1, continuation_id = self.call_mcp_tool(
"chat",
{"prompt": "Please use low thinking mode. Analyze this Python code and explain what it does", "files": [self.test_files["python"]]},
)
if not response1 or not continuation_id:
self.logger.error("Failed to get initial response with continuation_id")
return False
self.logger.info(f" ✅ Got continuation_id: {continuation_id}")
# Continue conversation with same file (should be deduplicated)
self.logger.info(" 1.2: Continue conversation with same file")
response2, _ = self.call_mcp_tool(
"chat",
{
"prompt": "Please use low thinking mode. Now focus on the Calculator class specifically. Are there any improvements you'd suggest?",
"files": [self.test_files["python"]], # Same file - should be deduplicated
"continuation_id": continuation_id,
},
)
if not response2:
self.logger.error("Failed to continue conversation")
return False
# Continue with additional file
self.logger.info(" 1.3: Continue conversation with additional file")
response3, _ = self.call_mcp_tool(
"chat",
{
"prompt": "Please use low thinking mode. Now also analyze this configuration file and see how it might relate to the Python code",
"files": [self.test_files["python"], self.test_files["config"]],
"continuation_id": continuation_id,
},
)
if not response3:
self.logger.error("Failed to continue with additional file")
return False
self.logger.info(" ✅ Basic conversation flow working")
return True
except Exception as e:
self.logger.error(f"Basic conversation flow test failed: {e}")
return False
finally:
self.cleanup_test_files()

@@ -0,0 +1,177 @@
#!/usr/bin/env python3
"""
Content Validation Test
Tests that tools don't duplicate file content in their responses.
This test is specifically designed to catch content duplication bugs.
"""
import json
import os
from .base_test import BaseSimulatorTest
class ContentValidationTest(BaseSimulatorTest):
"""Test that tools don't duplicate file content in their responses"""
@property
def test_name(self) -> str:
return "content_validation"
@property
def test_description(self) -> str:
return "Content validation and duplicate detection"
def run_test(self) -> bool:
"""Test that tools don't duplicate file content in their responses"""
try:
self.logger.info("📄 Test: Content validation and duplicate detection")
# Setup test files first
self.setup_test_files()
# Create a test file with distinctive content for validation
validation_content = '''"""
Configuration file for content validation testing
This content should appear only ONCE in any tool response
"""
# Configuration constants
MAX_CONTENT_TOKENS = 800_000 # This line should appear exactly once
TEMPERATURE_ANALYTICAL = 0.2 # This should also appear exactly once
UNIQUE_VALIDATION_MARKER = "CONTENT_VALIDATION_TEST_12345"
# Database settings
DATABASE_CONFIG = {
"host": "localhost",
"port": 5432,
"name": "validation_test_db"
}
'''
validation_file = os.path.join(self.test_dir, "validation_config.py")
with open(validation_file, "w") as f:
f.write(validation_content)
# Test 1: Precommit tool with files parameter (where the bug occurred)
self.logger.info(" 1: Testing precommit tool content duplication")
# Call precommit tool with the validation file
response1, thread_id = self.call_mcp_tool(
"precommit",
{
"path": os.getcwd(),
"files": [validation_file],
"original_request": "Test for content duplication in precommit tool"
}
)
if response1:
# Parse response and check for content duplication
try:
response_data = json.loads(response1)
content = response_data.get("content", "")
# Count occurrences of distinctive markers
max_content_count = content.count("MAX_CONTENT_TOKENS = 800_000")
temp_analytical_count = content.count("TEMPERATURE_ANALYTICAL = 0.2")
unique_marker_count = content.count("UNIQUE_VALIDATION_MARKER")
# Validate no duplication
duplication_detected = False
issues = []
if max_content_count > 1:
issues.append(f"MAX_CONTENT_TOKENS appears {max_content_count} times")
duplication_detected = True
if temp_analytical_count > 1:
issues.append(f"TEMPERATURE_ANALYTICAL appears {temp_analytical_count} times")
duplication_detected = True
if unique_marker_count > 1:
issues.append(f"UNIQUE_VALIDATION_MARKER appears {unique_marker_count} times")
duplication_detected = True
if duplication_detected:
self.logger.error(f" ❌ Content duplication detected in precommit tool: {'; '.join(issues)}")
return False
else:
self.logger.info(" ✅ No content duplication in precommit tool")
except json.JSONDecodeError:
self.logger.warning(" ⚠️ Could not parse precommit response as JSON")
else:
self.logger.warning(" ⚠️ Precommit tool failed to respond")
# Test 2: Other tools that use files parameter
tools_to_test = [
("chat", {"prompt": "Please use low thinking mode. Analyze this config file", "files": [validation_file]}),
("codereview", {"files": [validation_file], "context": "Please use low thinking mode. Review this configuration"}),
("analyze", {"files": [validation_file], "analysis_type": "code_quality"})
]
for tool_name, params in tools_to_test:
self.logger.info(f" 2.{tool_name}: Testing {tool_name} tool content duplication")
response, _ = self.call_mcp_tool(tool_name, params)
if response:
try:
response_data = json.loads(response)
content = response_data.get("content", "")
# Check for duplication
marker_count = content.count("UNIQUE_VALIDATION_MARKER")
if marker_count > 1:
self.logger.error(f" ❌ Content duplication in {tool_name}: marker appears {marker_count} times")
return False
else:
self.logger.info(f" ✅ No content duplication in {tool_name}")
except json.JSONDecodeError:
self.logger.warning(f" ⚠️ Could not parse {tool_name} response")
else:
self.logger.warning(f" ⚠️ {tool_name} tool failed to respond")
# Test 3: Cross-tool content validation with file deduplication
self.logger.info(" 3: Testing cross-tool content consistency")
if thread_id:
# Continue conversation with same file - content should be deduplicated in conversation history
response2, _ = self.call_mcp_tool(
"chat",
{
"prompt": "Please use low thinking mode. Continue analyzing this configuration file",
"files": [validation_file], # Same file should be deduplicated
"continuation_id": thread_id,
},
)
if response2:
try:
response_data = json.loads(response2)
content = response_data.get("content", "")
# In continuation, the file content shouldn't be duplicated either
marker_count = content.count("UNIQUE_VALIDATION_MARKER")
if marker_count > 1:
self.logger.error(f" ❌ Content duplication in cross-tool continuation: marker appears {marker_count} times")
return False
else:
self.logger.info(" ✅ No content duplication in cross-tool continuation")
except json.JSONDecodeError:
self.logger.warning(" ⚠️ Could not parse continuation response")
# Cleanup
os.remove(validation_file)
self.logger.info(" ✅ All content validation tests passed")
return True
except Exception as e:
self.logger.error(f"Content validation test failed: {e}")
return False
finally:
self.cleanup_test_files()
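
The marker-counting logic repeated above could be factored into a single helper; a sketch under the same assumptions (the helper name is illustrative and not part of this commit):

import json

def find_duplicated_markers(response_text: str, markers: list[str]) -> list[str]:
    """Return a message for each marker that appears more than once in a tool response."""
    try:
        content = json.loads(response_text).get("content", "")
    except json.JSONDecodeError:
        return []
    return [
        f"{marker} appears {content.count(marker)} times"
        for marker in markers
        if content.count(marker) > 1
    ]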

@@ -0,0 +1,196 @@
#!/usr/bin/env python3
"""
Cross-Tool Continuation Test
Tests comprehensive cross-tool continuation scenarios to ensure
conversation context is maintained when switching between different tools.
"""
from .base_test import BaseSimulatorTest
class CrossToolContinuationTest(BaseSimulatorTest):
"""Test comprehensive cross-tool continuation scenarios"""
@property
def test_name(self) -> str:
return "cross_tool_continuation"
@property
def test_description(self) -> str:
return "Cross-tool conversation continuation scenarios"
def run_test(self) -> bool:
"""Test comprehensive cross-tool continuation scenarios"""
try:
self.logger.info("🔧 Test: Cross-tool continuation scenarios")
# Setup test files
self.setup_test_files()
success_count = 0
total_scenarios = 3
# Scenario 1: chat -> thinkdeep -> codereview
if self._test_chat_thinkdeep_codereview():
success_count += 1
# Scenario 2: analyze -> debug -> thinkdeep
if self._test_analyze_debug_thinkdeep():
success_count += 1
# Scenario 3: Multi-file cross-tool continuation
if self._test_multi_file_continuation():
success_count += 1
self.logger.info(f" ✅ Cross-tool continuation scenarios completed: {success_count}/{total_scenarios} scenarios passed")
# Consider successful if at least one scenario worked
return success_count > 0
except Exception as e:
self.logger.error(f"Cross-tool continuation test failed: {e}")
return False
finally:
self.cleanup_test_files()
def _test_chat_thinkdeep_codereview(self) -> bool:
"""Test chat -> thinkdeep -> codereview scenario"""
try:
self.logger.info(" 1: Testing chat -> thinkdeep -> codereview")
# Start with chat
chat_response, chat_id = self.call_mcp_tool(
"chat",
{
"prompt": "Please use low thinking mode. Look at this Python code and tell me what you think about it",
"files": [self.test_files["python"]],
},
)
if not chat_response or not chat_id:
self.logger.error("Failed to start chat conversation")
return False
# Continue with thinkdeep
thinkdeep_response, _ = self.call_mcp_tool(
"thinkdeep",
{
"prompt": "Please use low thinking mode. Think deeply about potential performance issues in this code",
"files": [self.test_files["python"]], # Same file should be deduplicated
"continuation_id": chat_id,
},
)
if not thinkdeep_response:
self.logger.error("Failed chat -> thinkdeep continuation")
return False
# Continue with codereview
codereview_response, _ = self.call_mcp_tool(
"codereview",
{
"files": [self.test_files["python"]], # Same file should be deduplicated
"context": "Building on our previous analysis, provide a comprehensive code review",
"continuation_id": chat_id,
},
)
if not codereview_response:
self.logger.error("Failed thinkdeep -> codereview continuation")
return False
self.logger.info(" ✅ chat -> thinkdeep -> codereview working")
return True
except Exception as e:
self.logger.error(f"Chat -> thinkdeep -> codereview scenario failed: {e}")
return False
def _test_analyze_debug_thinkdeep(self) -> bool:
"""Test analyze -> debug -> thinkdeep scenario"""
try:
self.logger.info(" 2: Testing analyze -> debug -> thinkdeep")
# Start with analyze
analyze_response, analyze_id = self.call_mcp_tool(
"analyze", {"files": [self.test_files["python"]], "analysis_type": "code_quality"}
)
if not analyze_response or not analyze_id:
self.logger.warning("Failed to start analyze conversation, skipping scenario 2")
return False
# Continue with debug
debug_response, _ = self.call_mcp_tool(
"debug",
{
"files": [self.test_files["python"]], # Same file should be deduplicated
"issue_description": "Based on our analysis, help debug the performance issue in fibonacci",
"continuation_id": analyze_id,
},
)
if not debug_response:
self.logger.warning(" ⚠️ analyze -> debug continuation failed")
return False
# Continue with thinkdeep
final_response, _ = self.call_mcp_tool(
"thinkdeep",
{
"prompt": "Please use low thinking mode. Think deeply about the architectural implications of the issues we've found",
"files": [self.test_files["python"]], # Same file should be deduplicated
"continuation_id": analyze_id,
},
)
if not final_response:
self.logger.warning(" ⚠️ debug -> thinkdeep continuation failed")
return False
self.logger.info(" ✅ analyze -> debug -> thinkdeep working")
return True
except Exception as e:
self.logger.error(f"Analyze -> debug -> thinkdeep scenario failed: {e}")
return False
def _test_multi_file_continuation(self) -> bool:
"""Test multi-file cross-tool continuation"""
try:
self.logger.info(" 3: Testing multi-file cross-tool continuation")
# Start with both files
multi_response, multi_id = self.call_mcp_tool(
"chat",
{
"prompt": "Please use low thinking mode. Analyze both the Python code and configuration file",
"files": [self.test_files["python"], self.test_files["config"]],
},
)
if not multi_response or not multi_id:
self.logger.warning("Failed to start multi-file conversation, skipping scenario 3")
return False
# Switch to codereview with same files (should use conversation history)
multi_review, _ = self.call_mcp_tool(
"codereview",
{
"files": [self.test_files["python"], self.test_files["config"]], # Same files
"context": "Review both files in the context of our previous discussion",
"continuation_id": multi_id,
},
)
if not multi_review:
self.logger.warning(" ⚠️ Multi-file cross-tool continuation failed")
return False
self.logger.info(" ✅ Multi-file cross-tool continuation working")
return True
except Exception as e:
self.logger.error(f"Multi-file continuation scenario failed: {e}")
return False

@@ -0,0 +1,99 @@
#!/usr/bin/env python3
"""
Docker Logs Validation Test
Validates Docker logs to confirm file deduplication behavior and
conversation threading is working properly.
"""
from .base_test import BaseSimulatorTest
class LogsValidationTest(BaseSimulatorTest):
"""Validate Docker logs to confirm file deduplication behavior"""
@property
def test_name(self) -> str:
return "logs_validation"
@property
def test_description(self) -> str:
return "Docker logs validation"
def run_test(self) -> bool:
"""Validate Docker logs to confirm file deduplication behavior"""
try:
self.logger.info("📋 Test: Validating Docker logs for file deduplication...")
# Get server logs from both main container and activity logs
result = self.run_command(["docker", "logs", self.container_name], capture_output=True)
if result.returncode != 0:
self.logger.error(f"Failed to get Docker logs: {result.stderr}")
return False
main_logs = result.stdout.decode() + result.stderr.decode()
# Also get activity logs for more detailed conversation tracking
activity_result = self.run_command(
["docker", "exec", self.container_name, "cat", "/tmp/mcp_activity.log"], capture_output=True
)
activity_logs = ""
if activity_result.returncode == 0:
activity_logs = activity_result.stdout.decode()
logs = main_logs + "\n" + activity_logs
# Look for conversation threading patterns that indicate the system is working
conversation_patterns = [
"CONVERSATION_RESUME",
"CONVERSATION_CONTEXT",
"previous turns loaded",
"tool embedding",
"files included",
"files truncated",
"already in conversation history",
]
conversation_lines = []
for line in logs.split("\n"):
for pattern in conversation_patterns:
if pattern.lower() in line.lower():
conversation_lines.append(line.strip())
break
# Look for evidence of conversation threading and file handling
conversation_threading_found = False
multi_turn_conversations = False
for line in conversation_lines:
lower_line = line.lower()
if "conversation_resume" in lower_line:
conversation_threading_found = True
self.logger.debug(f"📄 Conversation threading: {line}")
elif "previous turns loaded" in lower_line:
multi_turn_conversations = True
self.logger.debug(f"📄 Multi-turn conversation: {line}")
elif "already in conversation" in lower_line:
self.logger.info(f"✅ Found explicit deduplication: {line}")
return True
# Conversation threading with multiple turns is evidence of file deduplication working
if conversation_threading_found and multi_turn_conversations:
self.logger.info("✅ Conversation threading with multi-turn context working")
self.logger.info(
"✅ File deduplication working implicitly (files embedded once in conversation history)"
)
return True
elif conversation_threading_found:
self.logger.info("✅ Conversation threading detected")
return True
else:
self.logger.warning("⚠️ No clear evidence of conversation threading in logs")
self.logger.debug(f"Found {len(conversation_lines)} conversation-related log lines")
return False
except Exception as e:
self.logger.error(f"Log validation failed: {e}")
return False

@@ -0,0 +1,101 @@
#!/usr/bin/env python3
"""
Per-Tool File Deduplication Test
Tests file deduplication for each individual MCP tool to ensure
that files are properly deduplicated within single-tool conversations.
"""
from .base_test import BaseSimulatorTest
class PerToolDeduplicationTest(BaseSimulatorTest):
"""Test file deduplication for each individual tool"""
@property
def test_name(self) -> str:
return "per_tool_deduplication"
@property
def test_description(self) -> str:
return "File deduplication for individual tools"
def run_test(self) -> bool:
"""Test file deduplication for each individual tool"""
try:
self.logger.info("📄 Test: Per-tool file deduplication")
# Setup test files
self.setup_test_files()
tools_to_test = [
(
"thinkdeep",
{
"prompt": "Please use low thinking mode. Think deeply about this Python code and identify potential architectural improvements",
"files": [self.test_files["python"]],
},
),
("analyze", {"files": [self.test_files["python"]], "analysis_type": "architecture"}),
(
"debug",
{
"files": [self.test_files["python"]],
"issue_description": "The fibonacci function seems slow for large numbers",
},
),
(
"codereview",
{
"files": [self.test_files["python"]],
"context": "General code review for quality and best practices",
},
),
]
successful_tests = 0
total_tests = len(tools_to_test)
for tool_name, initial_params in tools_to_test:
self.logger.info(f" {tool_name}: Testing {tool_name} tool file deduplication")
# Initial call
response1, continuation_id = self.call_mcp_tool(tool_name, initial_params)
if not response1:
self.logger.warning(f" ⚠️ {tool_name} tool initial call failed, skipping")
continue
if not continuation_id:
self.logger.warning(f" ⚠️ {tool_name} tool didn't provide continuation_id, skipping")
continue
# Continue with same file - should be deduplicated
continue_params = initial_params.copy()
continue_params["continuation_id"] = continuation_id
if tool_name == "thinkdeep":
continue_params["prompt"] = "Please use low thinking mode. Now focus specifically on the recursive fibonacci implementation"
elif tool_name == "analyze":
continue_params["analysis_type"] = "performance"
elif tool_name == "debug":
continue_params["issue_description"] = "How can we optimize the fibonacci function?"
elif tool_name == "codereview":
continue_params["context"] = "Focus on the Calculator class implementation"
response2, _ = self.call_mcp_tool(tool_name, continue_params)
if response2:
self.logger.info(f"{tool_name} tool file deduplication working")
successful_tests += 1
else:
self.logger.warning(f" ⚠️ {tool_name} tool continuation failed")
self.logger.info(f" ✅ Per-tool file deduplication tests completed: {successful_tests}/{total_tests} tools passed")
# Consider test successful if at least one tool worked
return successful_tests > 0
except Exception as e:
self.logger.error(f"Per-tool file deduplication test failed: {e}")
return False
finally:
self.cleanup_test_files()

@@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""
Redis Conversation Memory Validation Test
Validates that conversation memory is working via Redis by checking
for stored conversation threads and their content.
"""
import json
from .base_test import BaseSimulatorTest
class RedisValidationTest(BaseSimulatorTest):
"""Validate that conversation memory is working via Redis"""
@property
def test_name(self) -> str:
return "redis_validation"
@property
def test_description(self) -> str:
return "Redis conversation memory validation"
def run_test(self) -> bool:
"""Validate that conversation memory is working via Redis"""
try:
self.logger.info("💾 Test: Validating conversation memory via Redis...")
# First, test Redis connectivity
ping_result = self.run_command(
["docker", "exec", self.redis_container, "redis-cli", "ping"], capture_output=True
)
if ping_result.returncode != 0:
self.logger.error("Failed to connect to Redis")
return False
if "PONG" not in ping_result.stdout.decode():
self.logger.error("Redis ping failed")
return False
self.logger.info("✅ Redis connectivity confirmed")
# Check Redis for stored conversations
result = self.run_command(
["docker", "exec", self.redis_container, "redis-cli", "KEYS", "thread:*"], capture_output=True
)
if result.returncode != 0:
self.logger.error("Failed to query Redis")
return False
keys = result.stdout.decode().strip().split("\n")
thread_keys = [k for k in keys if k.startswith("thread:") and k != "thread:*"]
if thread_keys:
self.logger.info(f"✅ Found {len(thread_keys)} conversation threads in Redis")
# Get details of first thread
thread_key = thread_keys[0]
result = self.run_command(
["docker", "exec", self.redis_container, "redis-cli", "GET", thread_key], capture_output=True
)
if result.returncode == 0:
thread_data = result.stdout.decode()
try:
parsed = json.loads(thread_data)
turns = parsed.get("turns", [])
self.logger.info(f"✅ Thread has {len(turns)} turns")
return True
except json.JSONDecodeError:
self.logger.warning("Could not parse thread data")
return True
else:
# If no existing threads, create a test thread to validate Redis functionality
self.logger.info("📝 No existing threads found, creating test thread to validate Redis...")
test_thread_id = "test_thread_validation"
test_data = {
"thread_id": test_thread_id,
"turns": [
{
"tool": "chat",
"timestamp": "2025-06-11T16:30:00Z",
"prompt": "Test validation prompt"
}
]
}
# Store test data
store_result = self.run_command([
"docker", "exec", self.redis_container, "redis-cli",
"SET", f"thread:{test_thread_id}", json.dumps(test_data)
], capture_output=True)
if store_result.returncode != 0:
self.logger.error("Failed to store test data in Redis")
return False
# Retrieve test data
retrieve_result = self.run_command([
"docker", "exec", self.redis_container, "redis-cli",
"GET", f"thread:{test_thread_id}"
], capture_output=True)
if retrieve_result.returncode != 0:
self.logger.error("Failed to retrieve test data from Redis")
return False
retrieved_data = retrieve_result.stdout.decode()
try:
parsed = json.loads(retrieved_data)
if parsed.get("thread_id") == test_thread_id:
self.logger.info("✅ Redis read/write validation successful")
# Clean up test data
self.run_command([
"docker", "exec", self.redis_container, "redis-cli",
"DEL", f"thread:{test_thread_id}"
], capture_output=True)
return True
else:
self.logger.error("Retrieved data doesn't match stored data")
return False
except json.JSONDecodeError:
self.logger.error("Could not parse retrieved test data")
return False
except Exception as e:
self.logger.error(f"Conversation memory validation failed: {e}")
return False

@@ -2,13 +2,11 @@
Sample Python module for testing MCP conversation continuity
"""
def fibonacci(n):
"""Calculate fibonacci number recursively"""
if n <= 1:
return n
-return fibonacci(n - 1) + fibonacci(n - 2)
+return fibonacci(n-1) + fibonacci(n-2)
def factorial(n):
"""Calculate factorial iteratively"""
@@ -17,7 +15,6 @@ def factorial(n):
result *= i
return result
class Calculator:
"""Simple calculator class"""

@@ -0,0 +1,16 @@
"""
Configuration file for content validation testing
This content should appear only ONCE in any tool response
"""
# Configuration constants
MAX_CONTENT_TOKENS = 800_000 # This line should appear exactly once
TEMPERATURE_ANALYTICAL = 0.2 # This should also appear exactly once
UNIQUE_VALIDATION_MARKER = "CONTENT_VALIDATION_TEST_12345"
# Database settings
DATABASE_CONFIG = {
"host": "localhost",
"port": 5432,
"name": "validation_test_db"
}

@@ -0,0 +1,261 @@
"""
Enhanced tests for precommit tool using mock storage to test real logic
"""
import json
import tempfile
import os
from unittest.mock import Mock, patch, MagicMock
from typing import Dict, Any, Optional
import pytest
from tools.precommit import Precommit, PrecommitRequest
class MockRedisClient:
"""Mock Redis client that uses in-memory dictionary storage"""
def __init__(self):
self.data: Dict[str, str] = {}
self.ttl_data: Dict[str, int] = {}
def get(self, key: str) -> Optional[str]:
return self.data.get(key)
def set(self, key: str, value: str, ex: Optional[int] = None) -> bool:
self.data[key] = value
if ex:
self.ttl_data[key] = ex
return True
def delete(self, key: str) -> int:
if key in self.data:
del self.data[key]
self.ttl_data.pop(key, None)
return 1
return 0
def exists(self, key: str) -> int:
return 1 if key in self.data else 0
class TestPrecommitToolWithMockStore:
"""Test precommit tool with mock storage to validate actual logic"""
@pytest.fixture
def mock_redis(self):
"""Create mock Redis client"""
return MockRedisClient()
@pytest.fixture
def tool(self, mock_redis):
"""Create tool instance with mocked Redis"""
tool = Precommit()
# Mock the Redis client getter to return our mock
with patch('utils.conversation_memory.get_redis_client', return_value=mock_redis):
yield tool
@pytest.fixture
def temp_repo(self):
"""Create a temporary git repository with test files"""
import subprocess
temp_dir = tempfile.mkdtemp()
# Initialize git repo
subprocess.run(['git', 'init'], cwd=temp_dir, capture_output=True)
subprocess.run(['git', 'config', 'user.name', 'Test'], cwd=temp_dir, capture_output=True)
subprocess.run(['git', 'config', 'user.email', 'test@example.com'], cwd=temp_dir, capture_output=True)
# Create test config file
config_content = '''"""Test configuration file"""
# Version and metadata
__version__ = "1.0.0"
__author__ = "Test"
# Configuration
MAX_CONTENT_TOKENS = 800_000 # 800K tokens for content
TEMPERATURE_ANALYTICAL = 0.2 # For code review, debugging
'''
config_path = os.path.join(temp_dir, 'config.py')
with open(config_path, 'w') as f:
f.write(config_content)
# Add and commit initial version
subprocess.run(['git', 'add', '.'], cwd=temp_dir, capture_output=True)
subprocess.run(['git', 'commit', '-m', 'Initial commit'], cwd=temp_dir, capture_output=True)
# Modify config to create a diff
modified_content = config_content + '\nNEW_SETTING = "test" # Added setting\n'
with open(config_path, 'w') as f:
f.write(modified_content)
yield temp_dir, config_path
# Cleanup
import shutil
shutil.rmtree(temp_dir)
@pytest.mark.asyncio
async def test_no_duplicate_file_content_in_prompt(self, tool, temp_repo, mock_redis):
"""Test that file content doesn't appear twice in the generated prompt"""
temp_dir, config_path = temp_repo
# Create request with files parameter
request = PrecommitRequest(
path=temp_dir,
files=[config_path],
original_request="Test configuration changes"
)
# Generate the prompt
prompt = await tool.prepare_prompt(request)
# Test that MAX_CONTENT_TOKENS only appears once in the entire prompt
max_content_count = prompt.count('MAX_CONTENT_TOKENS = 800_000')
assert max_content_count == 1, f"MAX_CONTENT_TOKENS appears {max_content_count} times (should be 1)"
# Test that the config file content only appears once
config_content_count = prompt.count('# Configuration')
assert config_content_count == 1, f"Config file content appears {config_content_count} times (should be 1)"
# Verify expected sections are present
assert "## Original Request" in prompt
assert "Test configuration changes" in prompt
assert "## Additional Context Files" in prompt
assert "## Git Diffs" in prompt
@pytest.mark.asyncio
async def test_conversation_memory_integration(self, tool, temp_repo, mock_redis):
"""Test that conversation memory works with mock storage"""
temp_dir, config_path = temp_repo
# Mock conversation memory functions to use our mock redis
with patch('utils.conversation_memory.get_redis_client', return_value=mock_redis):
# First request - should embed file content
request1 = PrecommitRequest(
path=temp_dir,
files=[config_path],
original_request="First review"
)
# Simulate conversation thread creation
from utils.conversation_memory import create_thread, add_turn
thread_id = create_thread("precommit", {"files": [config_path]})
# Test that file embedding works
files_to_embed = tool.filter_new_files([config_path], None)
assert config_path in files_to_embed, "New conversation should embed all files"
# Add a turn to the conversation
add_turn(thread_id, "assistant", "First response", files=[config_path], tool_name="precommit")
# Second request with continuation - should skip already embedded files
request2 = PrecommitRequest(
path=temp_dir,
files=[config_path],
continuation_id=thread_id,
original_request="Follow-up review"
)
files_to_embed_2 = tool.filter_new_files([config_path], thread_id)
assert len(files_to_embed_2) == 0, "Continuation should skip already embedded files"
@pytest.mark.asyncio
async def test_prompt_structure_integrity(self, tool, temp_repo, mock_redis):
"""Test that the prompt structure is well-formed and doesn't have content duplication"""
temp_dir, config_path = temp_repo
request = PrecommitRequest(
path=temp_dir,
files=[config_path],
original_request="Validate prompt structure",
review_type="full",
severity_filter="high"
)
prompt = await tool.prepare_prompt(request)
# Split prompt into sections
sections = {
"original_request": "## Original Request",
"review_parameters": "## Review Parameters",
"repo_summary": "## Repository Changes Summary",
"context_files_summary": "## Context Files Summary",
"git_diffs": "## Git Diffs",
"additional_context": "## Additional Context Files",
"review_instructions": "## Review Instructions"
}
section_indices = {}
for name, header in sections.items():
index = prompt.find(header)
if index != -1:
section_indices[name] = index
# Verify sections appear in logical order
assert section_indices["original_request"] < section_indices["review_parameters"]
assert section_indices["review_parameters"] < section_indices["repo_summary"]
assert section_indices["git_diffs"] < section_indices["additional_context"]
assert section_indices["additional_context"] < section_indices["review_instructions"]
# Test that file content only appears in Additional Context section
file_content_start = section_indices["additional_context"]
file_content_end = section_indices["review_instructions"]
file_section = prompt[file_content_start:file_content_end]
before_file_section = prompt[:file_content_start]
after_file_section = prompt[file_content_end:]
# MAX_CONTENT_TOKENS should only appear in the file section
assert 'MAX_CONTENT_TOKENS' in file_section
assert 'MAX_CONTENT_TOKENS' not in before_file_section
assert 'MAX_CONTENT_TOKENS' not in after_file_section
@pytest.mark.asyncio
async def test_file_content_formatting(self, tool, temp_repo, mock_redis):
"""Test that file content is properly formatted without duplication"""
temp_dir, config_path = temp_repo
# Test the centralized file preparation method directly
file_content = tool._prepare_file_content_for_prompt(
[config_path],
None, # No continuation
"Test files",
max_tokens=100000,
reserve_tokens=1000
)
# Should contain file markers
assert "--- BEGIN FILE:" in file_content
assert "--- END FILE:" in file_content
assert "config.py" in file_content
# Should contain actual file content
assert "MAX_CONTENT_TOKENS = 800_000" in file_content
assert "__version__ = \"1.0.0\"" in file_content
# Content should appear only once
assert file_content.count("MAX_CONTENT_TOKENS = 800_000") == 1
assert file_content.count("__version__ = \"1.0.0\"") == 1
def test_mock_redis_basic_operations():
"""Test that our mock Redis implementation works correctly"""
mock_redis = MockRedisClient()
# Test basic operations
assert mock_redis.get("nonexistent") is None
assert mock_redis.exists("nonexistent") == 0
mock_redis.set("test_key", "test_value")
assert mock_redis.get("test_key") == "test_value"
assert mock_redis.exists("test_key") == 1
assert mock_redis.delete("test_key") == 1
assert mock_redis.get("test_key") is None
assert mock_redis.delete("test_key") == 0 # Already deleted
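
The deduplication contract exercised by test_conversation_memory_integration above can be restated as a short sketch; it assumes the same mocked Redis and APIs used in these tests, and the file path is hypothetical:

from unittest.mock import patch

from tools.precommit import Precommit
from utils.conversation_memory import add_turn, create_thread

def sketch_dedup_contract(mock_redis, config_path="/tmp/config.py"):
    """Illustrative only: new conversations embed files, continuations skip them."""
    # config_path is a hypothetical path; a real file may be required in practice.
    tool = Precommit()
    with patch("utils.conversation_memory.get_redis_client", return_value=mock_redis):
        thread_id = create_thread("precommit", {"files": [config_path]})
        # First turn: nothing embedded yet, so the file is selected for embedding.
        assert config_path in tool.filter_new_files([config_path], None)
        # Record a turn that embedded the file into the conversation history.
        add_turn(thread_id, "assistant", "First response", files=[config_path], tool_name="precommit")
        # Continuation: the file is already in history, so it is filtered out.
        assert len(tool.filter_new_files([config_path], thread_id)) == 0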

@@ -10,7 +10,7 @@ from pydantic import Field
from config import MAX_CONTEXT_TOKENS
from prompts.tool_prompts import PRECOMMIT_PROMPT
from utils.file_utils import translate_file_paths, translate_path_for_environment
from utils.file_utils import read_files, translate_file_paths, translate_path_for_environment
from utils.git_utils import find_git_repositories, get_git_status, run_git_command
from utils.token_utils import estimate_tokens
@@ -300,11 +300,11 @@ class Precommit(BaseTool):
# Use centralized file handling with filtering for duplicate prevention
file_content = self._prepare_file_content_for_prompt(
translated_files,
request.continuation_id,
translated_files,
request.continuation_id,
"Context files",
max_tokens=remaining_tokens + 1000, # Add back the reserve that was calculated
reserve_tokens=1000, # Small reserve for formatting
reserve_tokens=1000 # Small reserve for formatting
)
if file_content: