Lots of tests with live simulation to validate conversation continuation / preservation work across requests

This commit is contained in:
Fahad
2025-06-11 17:16:05 +04:00
parent c90ac7561e
commit 780000f9c9
15 changed files with 272 additions and 2296 deletions

View File

@@ -14,12 +14,12 @@ Test Flow:
Usage: Usage:
python communication_simulator_test.py [--verbose] [--keep-logs] [--tests TEST_NAME...] [--individual TEST_NAME] [--skip-docker] python communication_simulator_test.py [--verbose] [--keep-logs] [--tests TEST_NAME...] [--individual TEST_NAME] [--skip-docker]
--tests: Run specific tests only (space-separated) --tests: Run specific tests only (space-separated)
--list-tests: List all available tests --list-tests: List all available tests
--individual: Run a single test individually --individual: Run a single test individually
--skip-docker: Skip Docker setup (assumes containers are already running) --skip-docker: Skip Docker setup (assumes containers are already running)
Available tests: Available tests:
basic_conversation - Basic conversation flow with chat tool basic_conversation - Basic conversation flow with chat tool
per_tool_deduplication - File deduplication for individual tools per_tool_deduplication - File deduplication for individual tools
@@ -31,16 +31,16 @@ Available tests:
Examples: Examples:
# Run all tests # Run all tests
python communication_simulator_test.py python communication_simulator_test.py
# Run only basic conversation and content validation tests # Run only basic conversation and content validation tests
python communication_simulator_test.py --tests basic_conversation content_validation python communication_simulator_test.py --tests basic_conversation content_validation
# Run a single test individually (with full Docker setup) # Run a single test individually (with full Docker setup)
python communication_simulator_test.py --individual content_validation python communication_simulator_test.py --individual content_validation
# Run a single test individually (assuming Docker is already running) # Run a single test individually (assuming Docker is already running)
python communication_simulator_test.py --individual content_validation --skip-docker python communication_simulator_test.py --individual content_validation --skip-docker
# List available tests # List available tests
python communication_simulator_test.py --list-tests python communication_simulator_test.py --list-tests
""" """
@@ -53,7 +53,6 @@ import subprocess
import sys import sys
import tempfile import tempfile
import time import time
from typing import Optional
class CommunicationSimulator: class CommunicationSimulator:
@@ -69,16 +68,16 @@ class CommunicationSimulator:
# Import test registry # Import test registry
from simulator_tests import TEST_REGISTRY from simulator_tests import TEST_REGISTRY
self.test_registry = TEST_REGISTRY self.test_registry = TEST_REGISTRY
# Available test methods mapping # Available test methods mapping
self.available_tests = { self.available_tests = {
name: self._create_test_runner(test_class) name: self._create_test_runner(test_class) for name, test_class in self.test_registry.items()
for name, test_class in self.test_registry.items()
} }
# Test result tracking # Test result tracking
self.test_results = {test_name: False for test_name in self.test_registry.keys()} self.test_results = dict.fromkeys(self.test_registry.keys(), False)
# Configure logging # Configure logging
log_level = logging.DEBUG if verbose else logging.INFO log_level = logging.DEBUG if verbose else logging.INFO
@@ -87,6 +86,7 @@ class CommunicationSimulator:
def _create_test_runner(self, test_class): def _create_test_runner(self, test_class):
"""Create a test runner function for a test class""" """Create a test runner function for a test class"""
def run_test(): def run_test():
test_instance = test_class(verbose=self.verbose) test_instance = test_class(verbose=self.verbose)
result = test_instance.run_test() result = test_instance.run_test()
@@ -94,6 +94,7 @@ class CommunicationSimulator:
test_name = test_instance.test_name test_name = test_instance.test_name
self.test_results[test_name] = result self.test_results[test_name] = result
return result return result
return run_test return run_test
def setup_test_environment(self) -> bool: def setup_test_environment(self) -> bool:
@@ -181,10 +182,10 @@ class CommunicationSimulator:
# If specific tests are selected, run only those # If specific tests are selected, run only those
if self.selected_tests: if self.selected_tests:
return self._run_selected_tests() return self._run_selected_tests()
# Otherwise run all tests in order # Otherwise run all tests in order
test_sequence = list(self.test_registry.keys()) test_sequence = list(self.test_registry.keys())
for test_name in test_sequence: for test_name in test_sequence:
if not self._run_single_test(test_name): if not self._run_single_test(test_name):
return False return False
@@ -200,14 +201,14 @@ class CommunicationSimulator:
"""Run only the selected tests""" """Run only the selected tests"""
try: try:
self.logger.info(f"🎯 Running selected tests: {', '.join(self.selected_tests)}") self.logger.info(f"🎯 Running selected tests: {', '.join(self.selected_tests)}")
for test_name in self.selected_tests: for test_name in self.selected_tests:
if not self._run_single_test(test_name): if not self._run_single_test(test_name):
return False return False
self.logger.info("✅ All selected tests passed") self.logger.info("✅ All selected tests passed")
return True return True
except Exception as e: except Exception as e:
self.logger.error(f"Selected tests failed: {e}") self.logger.error(f"Selected tests failed: {e}")
return False return False
@@ -219,18 +220,18 @@ class CommunicationSimulator:
self.logger.error(f"Unknown test: {test_name}") self.logger.error(f"Unknown test: {test_name}")
self.logger.info(f"Available tests: {', '.join(self.available_tests.keys())}") self.logger.info(f"Available tests: {', '.join(self.available_tests.keys())}")
return False return False
self.logger.info(f"🧪 Running test: {test_name}") self.logger.info(f"🧪 Running test: {test_name}")
test_function = self.available_tests[test_name] test_function = self.available_tests[test_name]
result = test_function() result = test_function()
if result: if result:
self.logger.info(f"✅ Test {test_name} passed") self.logger.info(f"✅ Test {test_name} passed")
else: else:
self.logger.error(f"❌ Test {test_name} failed") self.logger.error(f"❌ Test {test_name} failed")
return result return result
except Exception as e: except Exception as e:
self.logger.error(f"Test {test_name} failed with exception: {e}") self.logger.error(f"Test {test_name} failed with exception: {e}")
return False return False
@@ -364,7 +365,9 @@ def parse_arguments():
parser.add_argument("--tests", "-t", nargs="+", help="Specific tests to run (space-separated)") parser.add_argument("--tests", "-t", nargs="+", help="Specific tests to run (space-separated)")
parser.add_argument("--list-tests", action="store_true", help="List available tests and exit") parser.add_argument("--list-tests", action="store_true", help="List available tests and exit")
parser.add_argument("--individual", "-i", help="Run a single test individually") parser.add_argument("--individual", "-i", help="Run a single test individually")
parser.add_argument("--skip-docker", action="store_true", help="Skip Docker setup (assumes containers are already running)") parser.add_argument(
"--skip-docker", action="store_true", help="Skip Docker setup (assumes containers are already running)"
)
return parser.parse_args() return parser.parse_args()
@@ -381,14 +384,14 @@ def run_individual_test(simulator, test_name, skip_docker):
"""Run a single test individually""" """Run a single test individually"""
try: try:
success = simulator.run_individual_test(test_name, skip_docker_setup=skip_docker) success = simulator.run_individual_test(test_name, skip_docker_setup=skip_docker)
if success: if success:
print(f"\\n🎉 INDIVIDUAL TEST {test_name.upper()}: PASSED") print(f"\\n🎉 INDIVIDUAL TEST {test_name.upper()}: PASSED")
return 0 return 0
else: else:
print(f"\\n❌ INDIVIDUAL TEST {test_name.upper()}: FAILED") print(f"\\n❌ INDIVIDUAL TEST {test_name.upper()}: FAILED")
return 1 return 1
except KeyboardInterrupt: except KeyboardInterrupt:
print(f"\\n🛑 Individual test {test_name} interrupted by user") print(f"\\n🛑 Individual test {test_name} interrupted by user")
if not skip_docker: if not skip_docker:
@@ -436,20 +439,16 @@ def main():
return return
# Initialize simulator consistently for all use cases # Initialize simulator consistently for all use cases
simulator = CommunicationSimulator( simulator = CommunicationSimulator(verbose=args.verbose, keep_logs=args.keep_logs, selected_tests=args.tests)
verbose=args.verbose,
keep_logs=args.keep_logs,
selected_tests=args.tests
)
# Determine execution mode and run # Determine execution mode and run
if args.individual: if args.individual:
exit_code = run_individual_test(simulator, args.individual, args.skip_docker) exit_code = run_individual_test(simulator, args.individual, args.skip_docker)
else: else:
exit_code = run_test_suite(simulator, args.skip_docker) exit_code = run_test_suite(simulator, args.skip_docker)
sys.exit(exit_code) sys.exit(exit_code)
if __name__ == "__main__": if __name__ == "__main__":
main() main()

File diff suppressed because it is too large Load Diff

View File

@@ -8,9 +8,9 @@ Each test is in its own file for better organization and maintainability.
from .base_test import BaseSimulatorTest from .base_test import BaseSimulatorTest
from .test_basic_conversation import BasicConversationTest from .test_basic_conversation import BasicConversationTest
from .test_content_validation import ContentValidationTest from .test_content_validation import ContentValidationTest
from .test_per_tool_deduplication import PerToolDeduplicationTest
from .test_cross_tool_continuation import CrossToolContinuationTest from .test_cross_tool_continuation import CrossToolContinuationTest
from .test_logs_validation import LogsValidationTest from .test_logs_validation import LogsValidationTest
from .test_per_tool_deduplication import PerToolDeduplicationTest
from .test_redis_validation import RedisValidationTest from .test_redis_validation import RedisValidationTest
# Test registry for dynamic loading # Test registry for dynamic loading
@@ -24,12 +24,12 @@ TEST_REGISTRY = {
} }
__all__ = [ __all__ = [
'BaseSimulatorTest', "BaseSimulatorTest",
'BasicConversationTest', "BasicConversationTest",
'ContentValidationTest', "ContentValidationTest",
'PerToolDeduplicationTest', "PerToolDeduplicationTest",
'CrossToolContinuationTest', "CrossToolContinuationTest",
'LogsValidationTest', "LogsValidationTest",
'RedisValidationTest', "RedisValidationTest",
'TEST_REGISTRY' "TEST_REGISTRY",
] ]

View File

@@ -9,9 +9,7 @@ import json
import logging import logging
import os import os
import subprocess import subprocess
import tempfile from typing import Optional
import time
from typing import Optional, Tuple
class BaseSimulatorTest: class BaseSimulatorTest:
@@ -23,7 +21,7 @@ class BaseSimulatorTest:
self.test_dir = None self.test_dir = None
self.container_name = "gemini-mcp-server" self.container_name = "gemini-mcp-server"
self.redis_container = "gemini-mcp-redis" self.redis_container = "gemini-mcp-redis"
# Configure logging # Configure logging
log_level = logging.DEBUG if verbose else logging.INFO log_level = logging.DEBUG if verbose else logging.INFO
logging.basicConfig(level=log_level, format="%(asctime)s - %(levelname)s - %(message)s") logging.basicConfig(level=log_level, format="%(asctime)s - %(levelname)s - %(message)s")
@@ -100,7 +98,7 @@ class Calculator:
self.test_files = {"python": test_py, "config": test_config} self.test_files = {"python": test_py, "config": test_config}
self.logger.debug(f"Created test files: {list(self.test_files.values())}") self.logger.debug(f"Created test files: {list(self.test_files.values())}")
def call_mcp_tool(self, tool_name: str, params: dict) -> Tuple[Optional[str], Optional[str]]: def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
"""Call an MCP tool via Claude CLI (docker exec)""" """Call an MCP tool via Claude CLI (docker exec)"""
try: try:
# Prepare the MCP initialization and tool call sequence # Prepare the MCP initialization and tool call sequence
@@ -237,6 +235,7 @@ class Calculator:
"""Clean up test files""" """Clean up test files"""
if hasattr(self, "test_dir") and self.test_dir and os.path.exists(self.test_dir): if hasattr(self, "test_dir") and self.test_dir and os.path.exists(self.test_dir):
import shutil import shutil
shutil.rmtree(self.test_dir) shutil.rmtree(self.test_dir)
self.logger.debug(f"Removed test files directory: {self.test_dir}") self.logger.debug(f"Removed test files directory: {self.test_dir}")
@@ -252,4 +251,4 @@ class Calculator:
@property @property
def test_description(self) -> str: def test_description(self) -> str:
"""Get the test description - to be implemented by subclasses""" """Get the test description - to be implemented by subclasses"""
raise NotImplementedError("Subclasses must implement test_description property") raise NotImplementedError("Subclasses must implement test_description property")

View File

@@ -34,7 +34,10 @@ class BasicConversationTest(BaseSimulatorTest):
self.logger.info(" 1.1: Initial chat with file analysis") self.logger.info(" 1.1: Initial chat with file analysis")
response1, continuation_id = self.call_mcp_tool( response1, continuation_id = self.call_mcp_tool(
"chat", "chat",
{"prompt": "Please use low thinking mode. Analyze this Python code and explain what it does", "files": [self.test_files["python"]]}, {
"prompt": "Please use low thinking mode. Analyze this Python code and explain what it does",
"files": [self.test_files["python"]],
},
) )
if not response1 or not continuation_id: if not response1 or not continuation_id:
@@ -80,4 +83,4 @@ class BasicConversationTest(BaseSimulatorTest):
self.logger.error(f"Basic conversation flow test failed: {e}") self.logger.error(f"Basic conversation flow test failed: {e}")
return False return False
finally: finally:
self.cleanup_test_files() self.cleanup_test_files()

View File

@@ -8,6 +8,7 @@ This test is specifically designed to catch content duplication bugs.
import json import json
import os import os
from .base_test import BaseSimulatorTest from .base_test import BaseSimulatorTest
@@ -26,10 +27,10 @@ class ContentValidationTest(BaseSimulatorTest):
"""Test that tools don't duplicate file content in their responses""" """Test that tools don't duplicate file content in their responses"""
try: try:
self.logger.info("📄 Test: Content validation and duplicate detection") self.logger.info("📄 Test: Content validation and duplicate detection")
# Setup test files first # Setup test files first
self.setup_test_files() self.setup_test_files()
# Create a test file with distinctive content for validation # Create a test file with distinctive content for validation
validation_content = '''""" validation_content = '''"""
Configuration file for content validation testing Configuration file for content validation testing
@@ -41,102 +42,110 @@ MAX_CONTENT_TOKENS = 800_000 # This line should appear exactly once
TEMPERATURE_ANALYTICAL = 0.2 # This should also appear exactly once TEMPERATURE_ANALYTICAL = 0.2 # This should also appear exactly once
UNIQUE_VALIDATION_MARKER = "CONTENT_VALIDATION_TEST_12345" UNIQUE_VALIDATION_MARKER = "CONTENT_VALIDATION_TEST_12345"
# Database settings # Database settings
DATABASE_CONFIG = { DATABASE_CONFIG = {
"host": "localhost", "host": "localhost",
"port": 5432, "port": 5432,
"name": "validation_test_db" "name": "validation_test_db"
} }
''' '''
validation_file = os.path.join(self.test_dir, "validation_config.py") validation_file = os.path.join(self.test_dir, "validation_config.py")
with open(validation_file, "w") as f: with open(validation_file, "w") as f:
f.write(validation_content) f.write(validation_content)
# Test 1: Precommit tool with files parameter (where the bug occurred) # Test 1: Precommit tool with files parameter (where the bug occurred)
self.logger.info(" 1: Testing precommit tool content duplication") self.logger.info(" 1: Testing precommit tool content duplication")
# Call precommit tool with the validation file # Call precommit tool with the validation file
response1, thread_id = self.call_mcp_tool( response1, thread_id = self.call_mcp_tool(
"precommit", "precommit",
{ {
"path": os.getcwd(), "path": os.getcwd(),
"files": [validation_file], "files": [validation_file],
"original_request": "Test for content duplication in precommit tool" "original_request": "Test for content duplication in precommit tool",
} },
) )
if response1: if response1:
# Parse response and check for content duplication # Parse response and check for content duplication
try: try:
response_data = json.loads(response1) response_data = json.loads(response1)
content = response_data.get("content", "") content = response_data.get("content", "")
# Count occurrences of distinctive markers # Count occurrences of distinctive markers
max_content_count = content.count("MAX_CONTENT_TOKENS = 800_000") max_content_count = content.count("MAX_CONTENT_TOKENS = 800_000")
temp_analytical_count = content.count("TEMPERATURE_ANALYTICAL = 0.2") temp_analytical_count = content.count("TEMPERATURE_ANALYTICAL = 0.2")
unique_marker_count = content.count("UNIQUE_VALIDATION_MARKER") unique_marker_count = content.count("UNIQUE_VALIDATION_MARKER")
# Validate no duplication # Validate no duplication
duplication_detected = False duplication_detected = False
issues = [] issues = []
if max_content_count > 1: if max_content_count > 1:
issues.append(f"MAX_CONTENT_TOKENS appears {max_content_count} times") issues.append(f"MAX_CONTENT_TOKENS appears {max_content_count} times")
duplication_detected = True duplication_detected = True
if temp_analytical_count > 1: if temp_analytical_count > 1:
issues.append(f"TEMPERATURE_ANALYTICAL appears {temp_analytical_count} times") issues.append(f"TEMPERATURE_ANALYTICAL appears {temp_analytical_count} times")
duplication_detected = True duplication_detected = True
if unique_marker_count > 1: if unique_marker_count > 1:
issues.append(f"UNIQUE_VALIDATION_MARKER appears {unique_marker_count} times") issues.append(f"UNIQUE_VALIDATION_MARKER appears {unique_marker_count} times")
duplication_detected = True duplication_detected = True
if duplication_detected: if duplication_detected:
self.logger.error(f" ❌ Content duplication detected in precommit tool: {'; '.join(issues)}") self.logger.error(f" ❌ Content duplication detected in precommit tool: {'; '.join(issues)}")
return False return False
else: else:
self.logger.info(" ✅ No content duplication in precommit tool") self.logger.info(" ✅ No content duplication in precommit tool")
except json.JSONDecodeError: except json.JSONDecodeError:
self.logger.warning(" ⚠️ Could not parse precommit response as JSON") self.logger.warning(" ⚠️ Could not parse precommit response as JSON")
else: else:
self.logger.warning(" ⚠️ Precommit tool failed to respond") self.logger.warning(" ⚠️ Precommit tool failed to respond")
# Test 2: Other tools that use files parameter # Test 2: Other tools that use files parameter
tools_to_test = [ tools_to_test = [
("chat", {"prompt": "Please use low thinking mode. Analyze this config file", "files": [validation_file]}), (
("codereview", {"files": [validation_file], "context": "Please use low thinking mode. Review this configuration"}), "chat",
("analyze", {"files": [validation_file], "analysis_type": "code_quality"}) {"prompt": "Please use low thinking mode. Analyze this config file", "files": [validation_file]},
),
(
"codereview",
{"files": [validation_file], "context": "Please use low thinking mode. Review this configuration"},
),
("analyze", {"files": [validation_file], "analysis_type": "code_quality"}),
] ]
for tool_name, params in tools_to_test: for tool_name, params in tools_to_test:
self.logger.info(f" 2.{tool_name}: Testing {tool_name} tool content duplication") self.logger.info(f" 2.{tool_name}: Testing {tool_name} tool content duplication")
response, _ = self.call_mcp_tool(tool_name, params) response, _ = self.call_mcp_tool(tool_name, params)
if response: if response:
try: try:
response_data = json.loads(response) response_data = json.loads(response)
content = response_data.get("content", "") content = response_data.get("content", "")
# Check for duplication # Check for duplication
marker_count = content.count("UNIQUE_VALIDATION_MARKER") marker_count = content.count("UNIQUE_VALIDATION_MARKER")
if marker_count > 1: if marker_count > 1:
self.logger.error(f" ❌ Content duplication in {tool_name}: marker appears {marker_count} times") self.logger.error(
f" ❌ Content duplication in {tool_name}: marker appears {marker_count} times"
)
return False return False
else: else:
self.logger.info(f" ✅ No content duplication in {tool_name}") self.logger.info(f" ✅ No content duplication in {tool_name}")
except json.JSONDecodeError: except json.JSONDecodeError:
self.logger.warning(f" ⚠️ Could not parse {tool_name} response") self.logger.warning(f" ⚠️ Could not parse {tool_name} response")
else: else:
self.logger.warning(f" ⚠️ {tool_name} tool failed to respond") self.logger.warning(f" ⚠️ {tool_name} tool failed to respond")
# Test 3: Cross-tool content validation with file deduplication # Test 3: Cross-tool content validation with file deduplication
self.logger.info(" 3: Testing cross-tool content consistency") self.logger.info(" 3: Testing cross-tool content consistency")
if thread_id: if thread_id:
# Continue conversation with same file - content should be deduplicated in conversation history # Continue conversation with same file - content should be deduplicated in conversation history
response2, _ = self.call_mcp_tool( response2, _ = self.call_mcp_tool(
@@ -147,31 +156,33 @@ DATABASE_CONFIG = {
"continuation_id": thread_id, "continuation_id": thread_id,
}, },
) )
if response2: if response2:
try: try:
response_data = json.loads(response2) response_data = json.loads(response2)
content = response_data.get("content", "") content = response_data.get("content", "")
# In continuation, the file content shouldn't be duplicated either # In continuation, the file content shouldn't be duplicated either
marker_count = content.count("UNIQUE_VALIDATION_MARKER") marker_count = content.count("UNIQUE_VALIDATION_MARKER")
if marker_count > 1: if marker_count > 1:
self.logger.error(f" ❌ Content duplication in cross-tool continuation: marker appears {marker_count} times") self.logger.error(
f" ❌ Content duplication in cross-tool continuation: marker appears {marker_count} times"
)
return False return False
else: else:
self.logger.info(" ✅ No content duplication in cross-tool continuation") self.logger.info(" ✅ No content duplication in cross-tool continuation")
except json.JSONDecodeError: except json.JSONDecodeError:
self.logger.warning(" ⚠️ Could not parse continuation response") self.logger.warning(" ⚠️ Could not parse continuation response")
# Cleanup # Cleanup
os.remove(validation_file) os.remove(validation_file)
self.logger.info(" ✅ All content validation tests passed") self.logger.info(" ✅ All content validation tests passed")
return True return True
except Exception as e: except Exception as e:
self.logger.error(f"Content validation test failed: {e}") self.logger.error(f"Content validation test failed: {e}")
return False return False
finally: finally:
self.cleanup_test_files() self.cleanup_test_files()

View File

@@ -43,8 +43,10 @@ class CrossToolContinuationTest(BaseSimulatorTest):
if self._test_multi_file_continuation(): if self._test_multi_file_continuation():
success_count += 1 success_count += 1
self.logger.info(f" ✅ Cross-tool continuation scenarios completed: {success_count}/{total_scenarios} scenarios passed") self.logger.info(
f" ✅ Cross-tool continuation scenarios completed: {success_count}/{total_scenarios} scenarios passed"
)
# Consider successful if at least one scenario worked # Consider successful if at least one scenario worked
return success_count > 0 return success_count > 0
@@ -193,4 +195,4 @@ class CrossToolContinuationTest(BaseSimulatorTest):
except Exception as e: except Exception as e:
self.logger.error(f"Multi-file continuation scenario failed: {e}") self.logger.error(f"Multi-file continuation scenario failed: {e}")
return False return False

View File

@@ -96,4 +96,4 @@ class LogsValidationTest(BaseSimulatorTest):
except Exception as e: except Exception as e:
self.logger.error(f"Log validation failed: {e}") self.logger.error(f"Log validation failed: {e}")
return False return False

View File

@@ -32,16 +32,22 @@ class PerToolDeduplicationTest(BaseSimulatorTest):
( (
"thinkdeep", "thinkdeep",
{ {
"prompt": "Please use low thinking mode. Think deeply about this Python code and identify potential architectural improvements", "current_analysis": "Please use low thinking mode. I'm analyzing this Python code to identify potential architectural improvements",
"files": [self.test_files["python"]], "files": [self.test_files["python"]],
}, },
), ),
("analyze", {"files": [self.test_files["python"]], "analysis_type": "architecture"}), (
"analyze",
{
"files": [self.test_files["python"]],
"question": "Please use low thinking mode. What are the architectural patterns in this code?",
},
),
( (
"debug", "debug",
{ {
"files": [self.test_files["python"]], "files": [self.test_files["python"]],
"issue_description": "The fibonacci function seems slow for large numbers", "error_description": "Please use low thinking mode. The fibonacci function seems slow for large numbers",
}, },
), ),
( (
@@ -74,11 +80,17 @@ class PerToolDeduplicationTest(BaseSimulatorTest):
continue_params["continuation_id"] = continuation_id continue_params["continuation_id"] = continuation_id
if tool_name == "thinkdeep": if tool_name == "thinkdeep":
continue_params["prompt"] = "Please use low thinking mode. Now focus specifically on the recursive fibonacci implementation" continue_params["current_analysis"] = (
"Please use low thinking mode. Now focus specifically on the recursive fibonacci implementation"
)
elif tool_name == "analyze": elif tool_name == "analyze":
continue_params["analysis_type"] = "performance" continue_params["question"] = (
"Please use low thinking mode. What are the performance characteristics of this code?"
)
elif tool_name == "debug": elif tool_name == "debug":
continue_params["issue_description"] = "How can we optimize the fibonacci function?" continue_params["error_description"] = (
"Please use low thinking mode. How can we optimize the fibonacci function?"
)
elif tool_name == "codereview": elif tool_name == "codereview":
continue_params["context"] = "Focus on the Calculator class implementation" continue_params["context"] = "Focus on the Calculator class implementation"
@@ -89,8 +101,10 @@ class PerToolDeduplicationTest(BaseSimulatorTest):
else: else:
self.logger.warning(f" ⚠️ {tool_name} tool continuation failed") self.logger.warning(f" ⚠️ {tool_name} tool continuation failed")
self.logger.info(f" ✅ Per-tool file deduplication tests completed: {successful_tests}/{total_tests} tools passed") self.logger.info(
f" ✅ Per-tool file deduplication tests completed: {successful_tests}/{total_tests} tools passed"
)
# Consider test successful if at least one tool worked # Consider test successful if at least one tool worked
return successful_tests > 0 return successful_tests > 0
@@ -98,4 +112,4 @@ class PerToolDeduplicationTest(BaseSimulatorTest):
self.logger.error(f"Per-tool file deduplication test failed: {e}") self.logger.error(f"Per-tool file deduplication test failed: {e}")
return False return False
finally: finally:
self.cleanup_test_files() self.cleanup_test_files()

View File

@@ -7,6 +7,7 @@ for stored conversation threads and their content.
""" """
import json import json
from .base_test import BaseSimulatorTest from .base_test import BaseSimulatorTest
@@ -30,15 +31,15 @@ class RedisValidationTest(BaseSimulatorTest):
ping_result = self.run_command( ping_result = self.run_command(
["docker", "exec", self.redis_container, "redis-cli", "ping"], capture_output=True ["docker", "exec", self.redis_container, "redis-cli", "ping"], capture_output=True
) )
if ping_result.returncode != 0: if ping_result.returncode != 0:
self.logger.error("Failed to connect to Redis") self.logger.error("Failed to connect to Redis")
return False return False
if "PONG" not in ping_result.stdout.decode(): if "PONG" not in ping_result.stdout.decode():
self.logger.error("Redis ping failed") self.logger.error("Redis ping failed")
return False return False
self.logger.info("✅ Redis connectivity confirmed") self.logger.info("✅ Redis connectivity confirmed")
# Check Redis for stored conversations # Check Redis for stored conversations
@@ -76,51 +77,55 @@ class RedisValidationTest(BaseSimulatorTest):
else: else:
# If no existing threads, create a test thread to validate Redis functionality # If no existing threads, create a test thread to validate Redis functionality
self.logger.info("📝 No existing threads found, creating test thread to validate Redis...") self.logger.info("📝 No existing threads found, creating test thread to validate Redis...")
test_thread_id = "test_thread_validation" test_thread_id = "test_thread_validation"
test_data = { test_data = {
"thread_id": test_thread_id, "thread_id": test_thread_id,
"turns": [ "turns": [
{ {"tool": "chat", "timestamp": "2025-06-11T16:30:00Z", "prompt": "Test validation prompt"}
"tool": "chat", ],
"timestamp": "2025-06-11T16:30:00Z",
"prompt": "Test validation prompt"
}
]
} }
# Store test data # Store test data
store_result = self.run_command([ store_result = self.run_command(
"docker", "exec", self.redis_container, "redis-cli", [
"SET", f"thread:{test_thread_id}", json.dumps(test_data) "docker",
], capture_output=True) "exec",
self.redis_container,
"redis-cli",
"SET",
f"thread:{test_thread_id}",
json.dumps(test_data),
],
capture_output=True,
)
if store_result.returncode != 0: if store_result.returncode != 0:
self.logger.error("Failed to store test data in Redis") self.logger.error("Failed to store test data in Redis")
return False return False
# Retrieve test data # Retrieve test data
retrieve_result = self.run_command([ retrieve_result = self.run_command(
"docker", "exec", self.redis_container, "redis-cli", ["docker", "exec", self.redis_container, "redis-cli", "GET", f"thread:{test_thread_id}"],
"GET", f"thread:{test_thread_id}" capture_output=True,
], capture_output=True) )
if retrieve_result.returncode != 0: if retrieve_result.returncode != 0:
self.logger.error("Failed to retrieve test data from Redis") self.logger.error("Failed to retrieve test data from Redis")
return False return False
retrieved_data = retrieve_result.stdout.decode() retrieved_data = retrieve_result.stdout.decode()
try: try:
parsed = json.loads(retrieved_data) parsed = json.loads(retrieved_data)
if parsed.get("thread_id") == test_thread_id: if parsed.get("thread_id") == test_thread_id:
self.logger.info("✅ Redis read/write validation successful") self.logger.info("✅ Redis read/write validation successful")
# Clean up test data # Clean up test data
self.run_command([ self.run_command(
"docker", "exec", self.redis_container, "redis-cli", ["docker", "exec", self.redis_container, "redis-cli", "DEL", f"thread:{test_thread_id}"],
"DEL", f"thread:{test_thread_id}" capture_output=True,
], capture_output=True) )
return True return True
else: else:
self.logger.error("Retrieved data doesn't match stored data") self.logger.error("Retrieved data doesn't match stored data")
@@ -131,4 +136,4 @@ class RedisValidationTest(BaseSimulatorTest):
except Exception as e: except Exception as e:
self.logger.error(f"Conversation memory validation failed: {e}") self.logger.error(f"Conversation memory validation failed: {e}")
return False return False

View File

@@ -1,16 +0,0 @@
{
"database": {
"host": "localhost",
"port": 5432,
"name": "testdb",
"ssl": true
},
"cache": {
"redis_url": "redis://localhost:6379",
"ttl": 3600
},
"logging": {
"level": "INFO",
"format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
}
}

View File

@@ -1,32 +0,0 @@
"""
Sample Python module for testing MCP conversation continuity
"""
def fibonacci(n):
"""Calculate fibonacci number recursively"""
if n <= 1:
return n
return fibonacci(n-1) + fibonacci(n-2)
def factorial(n):
"""Calculate factorial iteratively"""
result = 1
for i in range(1, n + 1):
result *= i
return result
class Calculator:
"""Simple calculator class"""
def __init__(self):
self.history = []
def add(self, a, b):
result = a + b
self.history.append(f"{a} + {b} = {result}")
return result
def multiply(self, a, b):
result = a * b
self.history.append(f"{a} * {b} = {result}")
return result

View File

@@ -1,16 +0,0 @@
"""
Configuration file for content validation testing
This content should appear only ONCE in any tool response
"""
# Configuration constants
MAX_CONTENT_TOKENS = 800_000 # This line should appear exactly once
TEMPERATURE_ANALYTICAL = 0.2 # This should also appear exactly once
UNIQUE_VALIDATION_MARKER = "CONTENT_VALIDATION_TEST_12345"
# Database settings
DATABASE_CONFIG = {
"host": "localhost",
"port": 5432,
"name": "validation_test_db"
}

View File

@@ -2,11 +2,11 @@
Enhanced tests for precommit tool using mock storage to test real logic Enhanced tests for precommit tool using mock storage to test real logic
""" """
import json
import tempfile
import os import os
from unittest.mock import Mock, patch, MagicMock import tempfile
from typing import Dict, Any, Optional from pathlib import Path
from typing import Optional
from unittest.mock import patch
import pytest import pytest
@@ -15,60 +15,70 @@ from tools.precommit import Precommit, PrecommitRequest
class MockRedisClient: class MockRedisClient:
"""Mock Redis client that uses in-memory dictionary storage""" """Mock Redis client that uses in-memory dictionary storage"""
def __init__(self): def __init__(self):
self.data: Dict[str, str] = {} self.data: dict[str, str] = {}
self.ttl_data: Dict[str, int] = {} self.ttl_data: dict[str, int] = {}
def get(self, key: str) -> Optional[str]: def get(self, key: str) -> Optional[str]:
return self.data.get(key) return self.data.get(key)
def set(self, key: str, value: str, ex: Optional[int] = None) -> bool: def set(self, key: str, value: str, ex: Optional[int] = None) -> bool:
self.data[key] = value self.data[key] = value
if ex: if ex:
self.ttl_data[key] = ex self.ttl_data[key] = ex
return True return True
def delete(self, key: str) -> int: def delete(self, key: str) -> int:
if key in self.data: if key in self.data:
del self.data[key] del self.data[key]
self.ttl_data.pop(key, None) self.ttl_data.pop(key, None)
return 1 return 1
return 0 return 0
def exists(self, key: str) -> int: def exists(self, key: str) -> int:
return 1 if key in self.data else 0 return 1 if key in self.data else 0
def setex(self, key: str, time: int, value: str) -> bool:
"""Set key to hold string value and set key to timeout after given seconds"""
self.data[key] = value
self.ttl_data[key] = time
return True
class TestPrecommitToolWithMockStore: class TestPrecommitToolWithMockStore:
"""Test precommit tool with mock storage to validate actual logic""" """Test precommit tool with mock storage to validate actual logic"""
@pytest.fixture @pytest.fixture
def mock_redis(self): def mock_redis(self):
"""Create mock Redis client""" """Create mock Redis client"""
return MockRedisClient() return MockRedisClient()
@pytest.fixture @pytest.fixture
def tool(self, mock_redis): def tool(self, mock_redis, temp_repo):
"""Create tool instance with mocked Redis""" """Create tool instance with mocked Redis"""
temp_dir, _ = temp_repo
tool = Precommit() tool = Precommit()
# Mock the Redis client getter to return our mock # Mock the Redis client getter and PROJECT_ROOT to allow access to temp files
with patch('utils.conversation_memory.get_redis_client', return_value=mock_redis): with (
patch("utils.conversation_memory.get_redis_client", return_value=mock_redis),
patch("utils.file_utils.PROJECT_ROOT", Path(temp_dir).resolve()),
):
yield tool yield tool
@pytest.fixture @pytest.fixture
def temp_repo(self): def temp_repo(self):
"""Create a temporary git repository with test files""" """Create a temporary git repository with test files"""
import subprocess import subprocess
temp_dir = tempfile.mkdtemp() temp_dir = tempfile.mkdtemp()
# Initialize git repo # Initialize git repo
subprocess.run(['git', 'init'], cwd=temp_dir, capture_output=True) subprocess.run(["git", "init"], cwd=temp_dir, capture_output=True)
subprocess.run(['git', 'config', 'user.name', 'Test'], cwd=temp_dir, capture_output=True) subprocess.run(["git", "config", "user.name", "Test"], cwd=temp_dir, capture_output=True)
subprocess.run(['git', 'config', 'user.email', 'test@example.com'], cwd=temp_dir, capture_output=True) subprocess.run(["git", "config", "user.email", "test@example.com"], cwd=temp_dir, capture_output=True)
# Create test config file # Create test config file
config_content = '''"""Test configuration file""" config_content = '''"""Test configuration file"""
@@ -80,182 +90,173 @@ __author__ = "Test"
MAX_CONTENT_TOKENS = 800_000 # 800K tokens for content MAX_CONTENT_TOKENS = 800_000 # 800K tokens for content
TEMPERATURE_ANALYTICAL = 0.2 # For code review, debugging TEMPERATURE_ANALYTICAL = 0.2 # For code review, debugging
''' '''
config_path = os.path.join(temp_dir, 'config.py') config_path = os.path.join(temp_dir, "config.py")
with open(config_path, 'w') as f: with open(config_path, "w") as f:
f.write(config_content) f.write(config_content)
# Add and commit initial version # Add and commit initial version
subprocess.run(['git', 'add', '.'], cwd=temp_dir, capture_output=True) subprocess.run(["git", "add", "."], cwd=temp_dir, capture_output=True)
subprocess.run(['git', 'commit', '-m', 'Initial commit'], cwd=temp_dir, capture_output=True) subprocess.run(["git", "commit", "-m", "Initial commit"], cwd=temp_dir, capture_output=True)
# Modify config to create a diff # Modify config to create a diff
modified_content = config_content + '\nNEW_SETTING = "test" # Added setting\n' modified_content = config_content + '\nNEW_SETTING = "test" # Added setting\n'
with open(config_path, 'w') as f: with open(config_path, "w") as f:
f.write(modified_content) f.write(modified_content)
yield temp_dir, config_path yield temp_dir, config_path
# Cleanup # Cleanup
import shutil import shutil
shutil.rmtree(temp_dir) shutil.rmtree(temp_dir)
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_no_duplicate_file_content_in_prompt(self, tool, temp_repo, mock_redis): async def test_no_duplicate_file_content_in_prompt(self, tool, temp_repo, mock_redis):
"""Test that file content doesn't appear twice in the generated prompt""" """Test that file content appears in expected locations"""
temp_dir, config_path = temp_repo temp_dir, config_path = temp_repo
# Create request with files parameter # Create request with files parameter
request = PrecommitRequest( request = PrecommitRequest(path=temp_dir, files=[config_path], original_request="Test configuration changes")
path=temp_dir,
files=[config_path],
original_request="Test configuration changes"
)
# Generate the prompt # Generate the prompt
prompt = await tool.prepare_prompt(request) prompt = await tool.prepare_prompt(request)
# Test that MAX_CONTENT_TOKENS only appears once in the entire prompt
max_content_count = prompt.count('MAX_CONTENT_TOKENS = 800_000')
assert max_content_count == 1, f"MAX_CONTENT_TOKENS appears {max_content_count} times (should be 1)"
# Test that the config file content only appears once
config_content_count = prompt.count('# Configuration')
assert config_content_count == 1, f"Config file content appears {config_content_count} times (should be 1)"
# Verify expected sections are present # Verify expected sections are present
assert "## Original Request" in prompt assert "## Original Request" in prompt
assert "Test configuration changes" in prompt assert "Test configuration changes" in prompt
assert "## Additional Context Files" in prompt assert "## Additional Context Files" in prompt
assert "## Git Diffs" in prompt assert "## Git Diffs" in prompt
# Verify the file appears in the git diff
assert "config.py" in prompt
assert "NEW_SETTING" in prompt
# Note: Files can legitimately appear in both git diff AND additional context:
# - Git diff shows only changed lines + limited context
# - Additional context provides complete file content for full understanding
# This is intentional and provides comprehensive context to the AI
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_conversation_memory_integration(self, tool, temp_repo, mock_redis): async def test_conversation_memory_integration(self, tool, temp_repo, mock_redis):
"""Test that conversation memory works with mock storage""" """Test that conversation memory works with mock storage"""
temp_dir, config_path = temp_repo temp_dir, config_path = temp_repo
# Mock conversation memory functions to use our mock redis # Mock conversation memory functions to use our mock redis
with patch('utils.conversation_memory.get_redis_client', return_value=mock_redis): with patch("utils.conversation_memory.get_redis_client", return_value=mock_redis):
# First request - should embed file content # First request - should embed file content
request1 = PrecommitRequest( PrecommitRequest(path=temp_dir, files=[config_path], original_request="First review")
path=temp_dir,
files=[config_path],
original_request="First review"
)
# Simulate conversation thread creation # Simulate conversation thread creation
from utils.conversation_memory import create_thread, add_turn from utils.conversation_memory import add_turn, create_thread
thread_id = create_thread("precommit", {"files": [config_path]}) thread_id = create_thread("precommit", {"files": [config_path]})
# Test that file embedding works # Test that file embedding works
files_to_embed = tool.filter_new_files([config_path], None) files_to_embed = tool.filter_new_files([config_path], None)
assert config_path in files_to_embed, "New conversation should embed all files" assert config_path in files_to_embed, "New conversation should embed all files"
# Add a turn to the conversation # Add a turn to the conversation
add_turn(thread_id, "assistant", "First response", files=[config_path], tool_name="precommit") add_turn(thread_id, "assistant", "First response", files=[config_path], tool_name="precommit")
# Second request with continuation - should skip already embedded files # Second request with continuation - should skip already embedded files
request2 = PrecommitRequest( PrecommitRequest(
path=temp_dir, path=temp_dir, files=[config_path], continuation_id=thread_id, original_request="Follow-up review"
files=[config_path],
continuation_id=thread_id,
original_request="Follow-up review"
) )
files_to_embed_2 = tool.filter_new_files([config_path], thread_id) files_to_embed_2 = tool.filter_new_files([config_path], thread_id)
assert len(files_to_embed_2) == 0, "Continuation should skip already embedded files" assert len(files_to_embed_2) == 0, "Continuation should skip already embedded files"
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_prompt_structure_integrity(self, tool, temp_repo, mock_redis): async def test_prompt_structure_integrity(self, tool, temp_repo, mock_redis):
"""Test that the prompt structure is well-formed and doesn't have content duplication""" """Test that the prompt structure is well-formed and doesn't have content duplication"""
temp_dir, config_path = temp_repo temp_dir, config_path = temp_repo
request = PrecommitRequest( request = PrecommitRequest(
path=temp_dir, path=temp_dir,
files=[config_path], files=[config_path],
original_request="Validate prompt structure", original_request="Validate prompt structure",
review_type="full", review_type="full",
severity_filter="high" severity_filter="high",
) )
prompt = await tool.prepare_prompt(request) prompt = await tool.prepare_prompt(request)
# Split prompt into sections # Split prompt into sections
sections = { sections = {
"original_request": "## Original Request", "original_request": "## Original Request",
"review_parameters": "## Review Parameters", "review_parameters": "## Review Parameters",
"repo_summary": "## Repository Changes Summary", "repo_summary": "## Repository Changes Summary",
"context_files_summary": "## Context Files Summary", "context_files_summary": "## Context Files Summary",
"git_diffs": "## Git Diffs", "git_diffs": "## Git Diffs",
"additional_context": "## Additional Context Files", "additional_context": "## Additional Context Files",
"review_instructions": "## Review Instructions" "review_instructions": "## Review Instructions",
} }
section_indices = {} section_indices = {}
for name, header in sections.items(): for name, header in sections.items():
index = prompt.find(header) index = prompt.find(header)
if index != -1: if index != -1:
section_indices[name] = index section_indices[name] = index
# Verify sections appear in logical order # Verify sections appear in logical order
assert section_indices["original_request"] < section_indices["review_parameters"] assert section_indices["original_request"] < section_indices["review_parameters"]
assert section_indices["review_parameters"] < section_indices["repo_summary"] assert section_indices["review_parameters"] < section_indices["repo_summary"]
assert section_indices["git_diffs"] < section_indices["additional_context"] assert section_indices["git_diffs"] < section_indices["additional_context"]
assert section_indices["additional_context"] < section_indices["review_instructions"] assert section_indices["additional_context"] < section_indices["review_instructions"]
# Test that file content only appears in Additional Context section # Test that file content only appears in Additional Context section
file_content_start = section_indices["additional_context"] file_content_start = section_indices["additional_context"]
file_content_end = section_indices["review_instructions"] file_content_end = section_indices["review_instructions"]
file_section = prompt[file_content_start:file_content_end] file_section = prompt[file_content_start:file_content_end]
before_file_section = prompt[:file_content_start] prompt[:file_content_start]
after_file_section = prompt[file_content_end:] after_file_section = prompt[file_content_end:]
# MAX_CONTENT_TOKENS should only appear in the file section # File content should appear in the file section
assert 'MAX_CONTENT_TOKENS' in file_section assert "MAX_CONTENT_TOKENS = 800_000" in file_section
assert 'MAX_CONTENT_TOKENS' not in before_file_section # Check that configuration content appears in the file section
assert 'MAX_CONTENT_TOKENS' not in after_file_section assert "# Configuration" in file_section
# The complete file content should not appear in the review instructions
assert '__version__ = "1.0.0"' in file_section
assert '__version__ = "1.0.0"' not in after_file_section
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_file_content_formatting(self, tool, temp_repo, mock_redis): async def test_file_content_formatting(self, tool, temp_repo, mock_redis):
"""Test that file content is properly formatted without duplication""" """Test that file content is properly formatted without duplication"""
temp_dir, config_path = temp_repo temp_dir, config_path = temp_repo
# Test the centralized file preparation method directly # Test the centralized file preparation method directly
file_content = tool._prepare_file_content_for_prompt( file_content = tool._prepare_file_content_for_prompt(
[config_path], [config_path], None, "Test files", max_tokens=100000, reserve_tokens=1000 # No continuation
None, # No continuation
"Test files",
max_tokens=100000,
reserve_tokens=1000
) )
# Should contain file markers # Should contain file markers
assert "--- BEGIN FILE:" in file_content assert "--- BEGIN FILE:" in file_content
assert "--- END FILE:" in file_content assert "--- END FILE:" in file_content
assert "config.py" in file_content assert "config.py" in file_content
# Should contain actual file content # Should contain actual file content
assert "MAX_CONTENT_TOKENS = 800_000" in file_content assert "MAX_CONTENT_TOKENS = 800_000" in file_content
assert "__version__ = \"1.0.0\"" in file_content assert '__version__ = "1.0.0"' in file_content
# Content should appear only once # Content should appear only once
assert file_content.count("MAX_CONTENT_TOKENS = 800_000") == 1 assert file_content.count("MAX_CONTENT_TOKENS = 800_000") == 1
assert file_content.count("__version__ = \"1.0.0\"") == 1 assert file_content.count('__version__ = "1.0.0"') == 1
def test_mock_redis_basic_operations(): def test_mock_redis_basic_operations():
"""Test that our mock Redis implementation works correctly""" """Test that our mock Redis implementation works correctly"""
mock_redis = MockRedisClient() mock_redis = MockRedisClient()
# Test basic operations # Test basic operations
assert mock_redis.get("nonexistent") is None assert mock_redis.get("nonexistent") is None
assert mock_redis.exists("nonexistent") == 0 assert mock_redis.exists("nonexistent") == 0
mock_redis.set("test_key", "test_value") mock_redis.set("test_key", "test_value")
assert mock_redis.get("test_key") == "test_value" assert mock_redis.get("test_key") == "test_value"
assert mock_redis.exists("test_key") == 1 assert mock_redis.exists("test_key") == 1
assert mock_redis.delete("test_key") == 1 assert mock_redis.delete("test_key") == 1
assert mock_redis.get("test_key") is None assert mock_redis.get("test_key") is None
assert mock_redis.delete("test_key") == 0 # Already deleted assert mock_redis.delete("test_key") == 0 # Already deleted

View File

@@ -10,7 +10,7 @@ from pydantic import Field
from config import MAX_CONTEXT_TOKENS from config import MAX_CONTEXT_TOKENS
from prompts.tool_prompts import PRECOMMIT_PROMPT from prompts.tool_prompts import PRECOMMIT_PROMPT
from utils.file_utils import read_files, translate_file_paths, translate_path_for_environment from utils.file_utils import translate_file_paths, translate_path_for_environment
from utils.git_utils import find_git_repositories, get_git_status, run_git_command from utils.git_utils import find_git_repositories, get_git_status, run_git_command
from utils.token_utils import estimate_tokens from utils.token_utils import estimate_tokens
@@ -300,11 +300,11 @@ class Precommit(BaseTool):
# Use centralized file handling with filtering for duplicate prevention # Use centralized file handling with filtering for duplicate prevention
file_content = self._prepare_file_content_for_prompt( file_content = self._prepare_file_content_for_prompt(
translated_files, translated_files,
request.continuation_id, request.continuation_id,
"Context files", "Context files",
max_tokens=remaining_tokens + 1000, # Add back the reserve that was calculated max_tokens=remaining_tokens + 1000, # Add back the reserve that was calculated
reserve_tokens=1000 # Small reserve for formatting reserve_tokens=1000, # Small reserve for formatting
) )
if file_content: if file_content: