diff --git a/communication_simulator_test.py b/communication_simulator_test.py
old mode 100755
new mode 100644
index bb5d133..5c9fd36
--- a/communication_simulator_test.py
+++ b/communication_simulator_test.py
@@ -8,28 +8,44 @@ continuity, file handling, deduplication features, and clarification scenarios.
 
 Test Flow:
 1. Setup fresh Docker environment with clean containers
-2. Simulate Claude CLI tool calls via docker exec
-3. Test conversation threading with file handling
-4. Validate file deduplication in conversation history
-5. Test requires_clarification scenarios and continuation flows
-6. Validate edge cases like partial file provision and clarification loops
-7. Check Docker logs for proper behavior
-8. Cleanup and report results
-
-New Clarification Testing Features:
-- Debug tool clarification scenarios
-- Analyze tool clarification flows
-- Clarification with file deduplication across turns
-- Multiple round clarification loops
-- Partial file provision edge cases
-- Real clarification flows with ambiguous prompts
+2. Load and run individual test modules
+3. Validate system behavior through logs and Redis
+4. Cleanup and report results
 
 Usage:
-    python communication_simulator_test.py [--verbose] [--keep-logs]
+    python communication_simulator_test.py [--verbose] [--keep-logs] [--tests TEST_NAME...] [--individual TEST_NAME] [--skip-docker]
+
+    --tests: Run specific tests only (space-separated)
+    --list-tests: List all available tests
+    --individual: Run a single test individually
+    --skip-docker: Skip Docker setup (assumes containers are already running)
+
+Available tests:
+    basic_conversation - Basic conversation flow with chat tool
+    per_tool_deduplication - File deduplication for individual tools
+    cross_tool_continuation - Cross-tool conversation continuation scenarios
+    content_validation - Content validation and duplicate detection
+    logs_validation - Docker logs validation
+    redis_validation - Redis conversation memory validation
+
+Examples:
+    # Run all tests
+    python communication_simulator_test.py
+
+    # Run only basic conversation and content validation tests
+    python communication_simulator_test.py --tests basic_conversation content_validation
+
+    # Run a single test individually (with full Docker setup)
+    python communication_simulator_test.py --individual content_validation
+
+    # Run a single test individually (assuming Docker is already running)
+    python communication_simulator_test.py --individual content_validation --skip-docker
+
+    # List available tests
+    python communication_simulator_test.py --list-tests
 """
 
 import argparse
-import json
 import logging
 import os
 import shutil
@@ -43,31 +59,45 @@ from typing import Optional
 class CommunicationSimulator:
     """Simulates real-world Claude CLI communication with MCP Gemini server"""
 
-    def __init__(self, verbose: bool = False, keep_logs: bool = False):
+    def __init__(self, verbose: bool = False, keep_logs: bool = False, selected_tests: list[str] = None):
         self.verbose = verbose
         self.keep_logs = keep_logs
+        self.selected_tests = selected_tests or []
         self.temp_dir = None
-        self.test_files = {}
         self.container_name = "gemini-mcp-server"
         self.redis_container = "gemini-mcp-redis"
 
-        # Test result tracking
-        self.test_results = {
-            "basic_conversation": False,
-            "per_tool_tests": {},
-            "cross_tool_scenarios": {},
-            "clarification_scenarios": {},
-            "logs_validation": False,
-            "redis_validation": False,
+        # Import test registry
+        from simulator_tests import TEST_REGISTRY
+        self.test_registry = TEST_REGISTRY
+
+        # Available test
methods mapping + self.available_tests = { + name: self._create_test_runner(test_class) + for name, test_class in self.test_registry.items() } + # Test result tracking + self.test_results = {test_name: False for test_name in self.test_registry.keys()} + # Configure logging log_level = logging.DEBUG if verbose else logging.INFO logging.basicConfig(level=log_level, format="%(asctime)s - %(levelname)s - %(message)s") self.logger = logging.getLogger(__name__) + def _create_test_runner(self, test_class): + """Create a test runner function for a test class""" + def run_test(): + test_instance = test_class(verbose=self.verbose) + result = test_instance.run_test() + # Update results + test_name = test_instance.test_name + self.test_results[test_name] = result + return result + return run_test + def setup_test_environment(self) -> bool: - """Setup fresh Docker environment and test files""" + """Setup fresh Docker environment""" try: self.logger.info("๐Ÿš€ Setting up test environment...") @@ -75,9 +105,6 @@ class CommunicationSimulator: self.temp_dir = tempfile.mkdtemp(prefix="mcp_test_") self.logger.debug(f"Created temp directory: {self.temp_dir}") - # Create test files - self._create_test_files() - # Setup Docker environment return self._setup_docker() @@ -85,82 +112,6 @@ class CommunicationSimulator: self.logger.error(f"Failed to setup test environment: {e}") return False - def _create_test_files(self): - """Create test files for the simulation in a location accessible by Docker""" - # Test Python file - python_content = '''""" -Sample Python module for testing MCP conversation continuity -""" - -def fibonacci(n): - """Calculate fibonacci number recursively""" - if n <= 1: - return n - return fibonacci(n-1) + fibonacci(n-2) - -def factorial(n): - """Calculate factorial iteratively""" - result = 1 - for i in range(1, n + 1): - result *= i - return result - -class Calculator: - """Simple calculator class""" - - def __init__(self): - self.history = [] - - def add(self, a, b): - result = a + b - self.history.append(f"{a} + {b} = {result}") - return result - - def multiply(self, a, b): - result = a * b - self.history.append(f"{a} * {b} = {result}") - return result -''' - - # Test configuration file - config_content = """{ - "database": { - "host": "localhost", - "port": 5432, - "name": "testdb", - "ssl": true - }, - "cache": { - "redis_url": "redis://localhost:6379", - "ttl": 3600 - }, - "logging": { - "level": "INFO", - "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - } -}""" - - # Create files in the current project directory so they're accessible to MCP tools - # MCP tools can access files with absolute paths within the project - current_dir = os.getcwd() - test_dir = os.path.join(current_dir, "test_simulation_files") - os.makedirs(test_dir, exist_ok=True) - - test_py = os.path.join(test_dir, "test_module.py") - test_config = os.path.join(test_dir, "config.json") - - with open(test_py, "w") as f: - f.write(python_content) - with open(test_config, "w") as f: - f.write(config_content) - - self.test_files = {"python": test_py, "config": test_config} - - # Store test directory for cleanup - self.test_dir = test_dir - - self.logger.debug(f"Created test files: {list(self.test_files.values())}") - def _setup_docker(self) -> bool: """Setup fresh Docker environment""" try: @@ -207,7 +158,7 @@ class Calculator: """Verify that required containers are running""" try: result = self._run_command(["docker", "ps", "--format", "{{.Names}}"], capture_output=True) - running_containers = 
result.stdout.decode().strip().split("\n") + running_containers = result.stdout.decode().strip().split("\\n") required = [self.container_name, self.redis_container] for container in required: @@ -227,1174 +178,156 @@ class Calculator: try: self.logger.info("๐Ÿค– Starting Claude CLI simulation...") - # Test basic conversation continuity - if not self._test_basic_conversation_flow(): - return False + # If specific tests are selected, run only those + if self.selected_tests: + return self._run_selected_tests() + + # Otherwise run all tests in order + test_sequence = list(self.test_registry.keys()) + + for test_name in test_sequence: + if not self._run_single_test(test_name): + return False - # Test per-tool file deduplication - if not self._test_per_tool_file_deduplication(): - return False - - # Test comprehensive cross-tool continuation - if not self._test_cross_tool_continuation(): - return False - - # Test state isolation and contamination detection - if not self._test_state_isolation(): - return False - - # Test conversation boundaries and reset behavior - if not self._test_conversation_boundaries(): - return False - - # Test requires_clarification scenarios - if not self._test_clarification_scenarios(): - return False - - self.logger.info("โœ… All conversation continuity and clarification tests passed") + self.logger.info("โœ… All tests passed") return True except Exception as e: self.logger.error(f"Claude CLI simulation failed: {e}") return False - def _test_basic_conversation_flow(self) -> bool: - """Test basic conversation flow with chat tool""" + def _run_selected_tests(self) -> bool: + """Run only the selected tests""" try: - self.logger.info("๐Ÿ“ Test 1: Basic conversation flow") - - # Initial chat tool call with file - self.logger.info(" 1.1: Initial chat with file analysis") - response1, continuation_id = self._call_mcp_tool( - "chat", - {"prompt": "Analyze this Python code and explain what it does", "files": [self.test_files["python"]]}, - ) - - if not response1 or not continuation_id: - self.logger.error("Failed to get initial response with continuation_id") - return False - - self.logger.info(f" โœ… Got continuation_id: {continuation_id}") - - # Continue conversation with same file (should be deduplicated) - self.logger.info(" 1.2: Continue conversation with same file") - response2, _ = self._call_mcp_tool( - "chat", - { - "prompt": "Now focus on the Calculator class specifically. 
Are there any improvements you'd suggest?", - "files": [self.test_files["python"]], # Same file - should be deduplicated - "continuation_id": continuation_id, - }, - ) - - if not response2: - self.logger.error("Failed to continue conversation") - return False - - # Continue with additional file - self.logger.info(" 1.3: Continue conversation with additional file") - response3, _ = self._call_mcp_tool( - "chat", - { - "prompt": "Now also analyze this configuration file and see how it might relate to the Python code", - "files": [self.test_files["python"], self.test_files["config"]], - "continuation_id": continuation_id, - }, - ) - - if not response3: - self.logger.error("Failed to continue with additional file") - return False - - self.logger.info(" โœ… Basic conversation flow working") - self.test_results["basic_conversation"] = True - return True - - except Exception as e: - self.logger.error(f"Basic conversation flow test failed: {e}") - return False - - def _test_per_tool_file_deduplication(self) -> bool: - """Test file deduplication for each individual tool""" - try: - self.logger.info("๐Ÿ“„ Test 2: Per-tool file deduplication") - - tools_to_test = [ - ( - "thinkdeep", - { - "prompt": "Think deeply about this Python code and identify potential architectural improvements", - "files": [self.test_files["python"]], - }, - ), - ("analyze", {"files": [self.test_files["python"]], "analysis_type": "architecture"}), - ( - "debug", - { - "files": [self.test_files["python"]], - "issue_description": "The fibonacci function seems slow for large numbers", - }, - ), - ( - "codereview", - { - "files": [self.test_files["python"]], - "context": "General code review for quality and best practices", - }, - ), - ] - - for tool_name, initial_params in tools_to_test: - self.logger.info(f" 2.{tool_name}: Testing {tool_name} tool file deduplication") - - # Initial call - response1, continuation_id = self._call_mcp_tool(tool_name, initial_params) - if not response1: - self.logger.warning(f" โš ๏ธ {tool_name} tool initial call failed, skipping") - continue - - if not continuation_id: - self.logger.warning(f" โš ๏ธ {tool_name} tool didn't provide continuation_id, skipping") - continue - - # Continue with same file - should be deduplicated - continue_params = initial_params.copy() - continue_params["continuation_id"] = continuation_id - - if tool_name == "thinkdeep": - continue_params["prompt"] = "Now focus specifically on the recursive fibonacci implementation" - elif tool_name == "analyze": - continue_params["analysis_type"] = "performance" - elif tool_name == "debug": - continue_params["issue_description"] = "How can we optimize the fibonacci function?" 
- elif tool_name == "codereview": - continue_params["context"] = "Focus on the Calculator class implementation" - - response2, _ = self._call_mcp_tool(tool_name, continue_params) - if response2: - self.logger.info(f" โœ… {tool_name} tool file deduplication working") - self.test_results["per_tool_tests"][tool_name] = True - else: - self.logger.warning(f" โš ๏ธ {tool_name} tool continuation failed") - self.test_results["per_tool_tests"][tool_name] = False - - self.logger.info(" โœ… Per-tool file deduplication tests completed") - return True - - except Exception as e: - self.logger.error(f"Per-tool file deduplication test failed: {e}") - return False - - def _test_cross_tool_continuation(self) -> bool: - """Test comprehensive cross-tool continuation scenarios""" - try: - self.logger.info("๐Ÿ”ง Test 3: Cross-tool continuation scenarios") - - # Scenario 1: chat -> thinkdeep -> codereview - self.logger.info(" 3.1: Testing chat -> thinkdeep -> codereview") - - # Start with chat - chat_response, chat_id = self._call_mcp_tool( - "chat", - { - "prompt": "Look at this Python code and tell me what you think about it", - "files": [self.test_files["python"]], - }, - ) - - if not chat_response or not chat_id: - self.logger.error("Failed to start chat conversation") - return False - - # Continue with thinkdeep - thinkdeep_response, _ = self._call_mcp_tool( - "thinkdeep", - { - "prompt": "Think deeply about potential performance issues in this code", - "files": [self.test_files["python"]], # Same file should be deduplicated - "continuation_id": chat_id, - }, - ) - - if not thinkdeep_response: - self.logger.error("Failed chat -> thinkdeep continuation") - return False - - # Continue with codereview - codereview_response, _ = self._call_mcp_tool( - "codereview", - { - "files": [self.test_files["python"]], # Same file should be deduplicated - "context": "Building on our previous analysis, provide a comprehensive code review", - "continuation_id": chat_id, - }, - ) - - if not codereview_response: - self.logger.error("Failed thinkdeep -> codereview continuation") - return False - - self.logger.info(" โœ… chat -> thinkdeep -> codereview working") - self.test_results["cross_tool_scenarios"]["chat_thinkdeep_codereview"] = True - - # Scenario 2: analyze -> debug -> thinkdeep - self.logger.info(" 3.2: Testing analyze -> debug -> thinkdeep") - - # Start with analyze - analyze_response, analyze_id = self._call_mcp_tool( - "analyze", {"files": [self.test_files["python"]], "analysis_type": "code_quality"} - ) - - if not analyze_response or not analyze_id: - self.logger.warning("Failed to start analyze conversation, skipping scenario 2") - else: - # Continue with debug - debug_response, _ = self._call_mcp_tool( - "debug", - { - "files": [self.test_files["python"]], # Same file should be deduplicated - "issue_description": "Based on our analysis, help debug the performance issue in fibonacci", - "continuation_id": analyze_id, - }, - ) - - if debug_response: - # Continue with thinkdeep - final_response, _ = self._call_mcp_tool( - "thinkdeep", - { - "prompt": "Think deeply about the architectural implications of the issues we've found", - "files": [self.test_files["python"]], # Same file should be deduplicated - "continuation_id": analyze_id, - }, - ) - - if final_response: - self.logger.info(" โœ… analyze -> debug -> thinkdeep working") - self.test_results["cross_tool_scenarios"]["analyze_debug_thinkdeep"] = True - else: - self.logger.warning(" โš ๏ธ debug -> thinkdeep continuation failed") - 
self.test_results["cross_tool_scenarios"]["analyze_debug_thinkdeep"] = False - else: - self.logger.warning(" โš ๏ธ analyze -> debug continuation failed") - self.test_results["cross_tool_scenarios"]["analyze_debug_thinkdeep"] = False - - # Scenario 3: Multi-file cross-tool continuation - self.logger.info(" 3.3: Testing multi-file cross-tool continuation") - - # Start with both files - multi_response, multi_id = self._call_mcp_tool( - "chat", - { - "prompt": "Analyze both the Python code and configuration file", - "files": [self.test_files["python"], self.test_files["config"]], - }, - ) - - if not multi_response or not multi_id: - self.logger.warning("Failed to start multi-file conversation, skipping scenario 3") - else: - # Switch to codereview with same files (should use conversation history) - multi_review, _ = self._call_mcp_tool( - "codereview", - { - "files": [self.test_files["python"], self.test_files["config"]], # Same files - "context": "Review both files in the context of our previous discussion", - "continuation_id": multi_id, - }, - ) - - if multi_review: - self.logger.info(" โœ… Multi-file cross-tool continuation working") - self.test_results["cross_tool_scenarios"]["multi_file_continuation"] = True - else: - self.logger.warning(" โš ๏ธ Multi-file cross-tool continuation failed") - self.test_results["cross_tool_scenarios"]["multi_file_continuation"] = False - - self.logger.info(" โœ… Cross-tool continuation scenarios completed") - return True - - except Exception as e: - self.logger.error(f"Cross-tool continuation test failed: {e}") - return False - - def _test_state_isolation(self) -> bool: - """Test that different conversation threads don't contaminate each other""" - try: - self.logger.info("๐Ÿ”’ Test 4: State isolation and contamination detection") - - # Create a test file specifically for this test - isolation_content = '''""" -Test file for state isolation testing -""" - -def isolated_function(): - """This function should only appear in isolation tests""" - return "ISOLATION_TEST_MARKER" - -class IsolationTestClass: - """Class that should not leak between conversations""" - def __init__(self): - self.marker = "ISOLATION_BOUNDARY" -''' - - isolation_file = os.path.join(self.test_dir, "isolation_test.py") - with open(isolation_file, "w") as f: - f.write(isolation_content) - - # Test 1: Start two separate conversation threads - self.logger.info(" 4.1: Creating separate conversation threads") - - # Thread A: Chat about original Python file - response_a1, thread_a = self._call_mcp_tool( - "chat", {"prompt": "Analyze this Python module", "files": [self.test_files["python"]]} - ) - - if not response_a1 or not thread_a: - self.logger.error("Failed to create thread A") - return False - - # Thread B: Chat about isolation test file - response_b1, thread_b = self._call_mcp_tool( - "chat", {"prompt": "Analyze this isolation test file", "files": [isolation_file]} - ) - - if not response_b1 or not thread_b: - self.logger.error("Failed to create thread B") - return False - - # Verify threads are different - if thread_a == thread_b: - self.logger.error("Threads are not isolated - same continuation_id returned") - return False - - self.logger.info(f" โœ… Created isolated threads: {thread_a[:8]}... 
and {thread_b[:8]}...") - - # Test 2: Continue both threads and check for contamination - self.logger.info(" 4.2: Testing cross-thread contamination") - - # Continue thread A - should only know about original Python file - response_a2, _ = self._call_mcp_tool( - "chat", {"prompt": "What functions did we discuss in the previous file?", "continuation_id": thread_a} - ) - - # Continue thread B - should only know about isolation file - response_b2, _ = self._call_mcp_tool( - "chat", {"prompt": "What functions did we discuss in the previous file?", "continuation_id": thread_b} - ) - - if not response_a2 or not response_b2: - self.logger.error("Failed to continue isolated threads") - return False - - # Parse responses to check for contamination - response_a2_data = json.loads(response_a2) - response_b2_data = json.loads(response_b2) - - content_a = response_a2_data.get("content", "") - content_b = response_b2_data.get("content", "") - - # Thread A should mention fibonacci/factorial, not isolation functions - # Thread B should mention isolation functions, not fibonacci/factorial - contamination_detected = False - - if "isolated_function" in content_a or "IsolationTestClass" in content_a: - self.logger.error("Thread A contaminated with Thread B content") - contamination_detected = True - - if "fibonacci" in content_b or "factorial" in content_b or "Calculator" in content_b: - self.logger.error("Thread B contaminated with Thread A content") - contamination_detected = True - - if contamination_detected: - self.test_results["cross_tool_scenarios"]["state_isolation"] = False - return False - - self.logger.info(" โœ… No cross-thread contamination detected") - - # Test 3: Cross-tool switching with isolation - self.logger.info(" 4.3: Testing cross-tool state isolation") - - # Switch thread A to codereview - response_a3, _ = self._call_mcp_tool( - "codereview", - { - "files": [self.test_files["python"]], - "context": "Review the code we discussed", - "continuation_id": thread_a, - }, - ) - - # Switch thread B to codereview - response_b3, _ = self._call_mcp_tool( - "codereview", - {"files": [isolation_file], "context": "Review the isolation test code", "continuation_id": thread_b}, - ) - - if response_a3 and response_b3: - self.logger.info(" โœ… Cross-tool isolation maintained") - self.test_results["cross_tool_scenarios"]["state_isolation"] = True - else: - self.logger.warning(" โš ๏ธ Cross-tool isolation test incomplete") - self.test_results["cross_tool_scenarios"]["state_isolation"] = False - - # Cleanup isolation test file - os.remove(isolation_file) - - self.logger.info(" โœ… State isolation tests completed") - return True - - except Exception as e: - self.logger.error(f"State isolation test failed: {e}") - return False - - def _test_conversation_boundaries(self) -> bool: - """Test conversation boundaries and proper reset behavior""" - try: - self.logger.info("๐Ÿšง Test 5: Conversation boundaries and reset behavior") - - # Test 1: Tool-to-tool-to-tool with fresh start - self.logger.info(" 5.1: Testing A->B->A pattern with fresh conversations") - - # Start with chat - response1, thread1 = self._call_mcp_tool( - "chat", {"prompt": "Analyze the fibonacci function in this code", "files": [self.test_files["python"]]} - ) - - if not response1 or not thread1: - self.logger.warning("Failed to start boundary test, skipping") - return True - - # Switch to codereview (continue conversation) - response2, _ = self._call_mcp_tool( - "codereview", - { - "files": [self.test_files["python"]], - "context": "Building on 
our fibonacci discussion", - "continuation_id": thread1, - }, - ) - - if not response2: - self.logger.warning("Failed codereview continuation") - return True - - # Switch back to chat but start FRESH conversation (no continuation_id) - self.logger.info(" 5.2: Testing fresh conversation after previous context") - response3, thread3 = self._call_mcp_tool( - "chat", - { - "prompt": "Tell me about the Calculator class in this file", # Different focus - "files": [self.test_files["python"]], # Same file but fresh context - }, - ) - - if not response3 or not thread3: - self.logger.warning("Failed fresh conversation test") - return True - - # Verify it's a truly fresh conversation - if thread1 == thread3: - self.logger.error("Fresh conversation got same thread ID - boundary violation!") - self.test_results["cross_tool_scenarios"]["conversation_boundaries"] = False - return False - - self.logger.info(f" โœ… Fresh conversation created: {thread3[:8]}... (vs {thread1[:8]}...)") - - # Test 2: Verify fresh conversation doesn't have stale context - self.logger.info(" 5.3: Testing stale context isolation") - - # Continue the fresh conversation - should not reference fibonacci discussion - response4, _ = self._call_mcp_tool( - "chat", {"prompt": "What did we just discuss about this code?", "continuation_id": thread3} - ) - - if response4: - response4_data = json.loads(response4) - content4 = response4_data.get("content", "") - - # Should reference Calculator class, not fibonacci from previous thread - if "fibonacci" in content4.lower() and "calculator" not in content4.lower(): - self.logger.error("Fresh conversation contaminated with stale context!") - self.test_results["cross_tool_scenarios"]["conversation_boundaries"] = False + self.logger.info(f"๐ŸŽฏ Running selected tests: {', '.join(self.selected_tests)}") + + for test_name in self.selected_tests: + if not self._run_single_test(test_name): return False - else: - self.logger.info(" โœ… Fresh conversation properly isolated from previous context") + + self.logger.info("โœ… All selected tests passed") + return True + + except Exception as e: + self.logger.error(f"Selected tests failed: {e}") + return False - # Test 3: File access without continuation should work - self.logger.info(" 5.4: Testing file access in fresh conversations") - - # New conversation with same files - should read files fresh - response5, thread5 = self._call_mcp_tool( - "chat", - {"prompt": "What's the purpose of this configuration file?", "files": [self.test_files["config"]]}, - ) - - if response5 and thread5: - # Verify it can access the file content - response5_data = json.loads(response5) - content5 = response5_data.get("content", "") - - if "database" in content5.lower() or "redis" in content5.lower(): - self.logger.info(" โœ… Fresh conversation can access files correctly") - self.test_results["cross_tool_scenarios"]["conversation_boundaries"] = True - else: - self.logger.warning(" โš ๏ธ Fresh conversation may not be reading files properly") - self.test_results["cross_tool_scenarios"]["conversation_boundaries"] = False + def _run_single_test(self, test_name: str) -> bool: + """Run a single test by name""" + try: + if test_name not in self.available_tests: + self.logger.error(f"Unknown test: {test_name}") + self.logger.info(f"Available tests: {', '.join(self.available_tests.keys())}") + return False + + self.logger.info(f"๐Ÿงช Running test: {test_name}") + test_function = self.available_tests[test_name] + result = test_function() + + if result: + self.logger.info(f"โœ… Test 
{test_name} passed") else: - self.logger.warning(" โš ๏ธ Fresh conversation with config file failed") - self.test_results["cross_tool_scenarios"]["conversation_boundaries"] = False - - self.logger.info(" โœ… Conversation boundary tests completed") - return True - + self.logger.error(f"โŒ Test {test_name} failed") + + return result + except Exception as e: - self.logger.error(f"Conversation boundary test failed: {e}") + self.logger.error(f"Test {test_name} failed with exception: {e}") return False - def _test_clarification_scenarios(self) -> bool: - """Test requires_clarification scenarios and continuation with additional files""" + def run_individual_test(self, test_name: str, skip_docker_setup: bool = False) -> bool: + """Run a single test individually with optional Docker setup skip""" try: - self.logger.info("๐Ÿ” Test 6: Requires clarification scenarios") - - # Test 1: Debug tool asking for missing files - if not self._test_debug_clarification(): + if test_name not in self.available_tests: + self.logger.error(f"Unknown test: {test_name}") + self.logger.info(f"Available tests: {', '.join(self.available_tests.keys())}") return False - # Test 2: Analyze tool asking for related files - if not self._test_analyze_clarification(): - return False + self.logger.info(f"๐Ÿงช Running individual test: {test_name}") - # Test 3: Clarification with file deduplication - if not self._test_clarification_with_deduplication(): - return False + # Setup environment unless skipped + if not skip_docker_setup: + if not self.setup_test_environment(): + self.logger.error("โŒ Environment setup failed") + return False - # Test 4: Multiple round clarification (clarification loop) - if not self._test_clarification_loop(): - return False + # Run the single test + test_function = self.available_tests[test_name] + result = test_function() - # Test 5: Partial file provision edge case - if not self._test_partial_file_provision(): - return False - - # Test 6: Real clarification flow (might actually trigger requires_clarification) - if not self._test_real_clarification_flow(): - return False - - self.logger.info(" โœ… Clarification scenario tests completed") - return True - - except Exception as e: - self.logger.error(f"Clarification scenario test failed: {e}") - return False - - def _test_debug_clarification(self) -> bool: - """Test debug tool requesting clarification for missing files""" - try: - self.logger.info(" 6.1: Testing debug tool clarification flow") - - # Create a problematic file that imports from utils.py - problematic_content = '''""" -Main module with a bug that requires utils.py to debug -""" - -import utils - -def main(): - result = utils.calculate_something("hello") - print(f"Result: {result}") - -if __name__ == "__main__": - main() -''' - - # Create the problematic file - problem_file = os.path.join(self.test_dir, "bug_main.py") - with open(problem_file, "w") as f: - f.write(problematic_content) - - # Step 1: Call debug tool with only the main file (should trigger clarification) - # We'll simulate clarification by creating a mock response - response1 = self._simulate_clarification_request( - "debug", - { - "files": [problem_file], - "error_description": "The application crashes with TypeError when running main()", - }, - ) - - if not response1: - self.logger.warning(" โš ๏ธ Debug clarification simulation failed") - return True # Don't fail entire test suite for simulation issues - - # For real testing, we would need the server to actually return requires_clarification - # This is a proof of concept 
showing how to structure the test - self.test_results["clarification_scenarios"]["debug_clarification"] = True - self.logger.info(" โœ… Debug clarification flow structure verified") - - # Cleanup - os.remove(problem_file) - return True - - except Exception as e: - self.logger.error(f"Debug clarification test failed: {e}") - return False - - def _test_analyze_clarification(self) -> bool: - """Test analyze tool requesting clarification for architecture analysis""" - try: - self.logger.info(" 6.2: Testing analyze tool clarification flow") - - # Create an incomplete file structure that would need more context - partial_model = '''""" -Partial model file that references other components -""" - -from .base import BaseModel -from .validators import validate_user_data - -class User(BaseModel): - def __init__(self, username: str, email: str): - self.username = username - self.email = validate_user_data(email) - super().__init__() -''' - - partial_file = os.path.join(self.test_dir, "partial_model.py") - with open(partial_file, "w") as f: - f.write(partial_model) - - # Simulate analyze tool clarification - response1 = self._simulate_clarification_request( - "analyze", - { - "files": [partial_file], - "question": "Analyze the architecture and dependencies of this model", - "analysis_type": "architecture", - }, - ) - - if response1: - self.test_results["clarification_scenarios"]["analyze_clarification"] = True - self.logger.info(" โœ… Analyze clarification flow structure verified") - - # Cleanup - os.remove(partial_file) - return True - - except Exception as e: - self.logger.error(f"Analyze clarification test failed: {e}") - return False - - def _test_clarification_with_deduplication(self) -> bool: - """Test that clarification preserves file deduplication across turns""" - try: - self.logger.info(" 6.3: Testing clarification with file deduplication") - - # Start conversation with file A - response1, thread_id = self._call_mcp_tool( - "chat", {"prompt": "Analyze this Python code", "files": [self.test_files["python"]]} - ) - - if not response1 or not thread_id: - self.logger.warning(" โš ๏ธ Initial conversation failed") - return True - - # Continue conversation asking for additional analysis with same file + new file - # This should deduplicate the original file - response2, _ = self._call_mcp_tool( - "chat", - { - "prompt": "Now also analyze this config file in relation to the Python code", - "files": [ - self.test_files["python"], - self.test_files["config"], - ], # python file should be deduplicated - "continuation_id": thread_id, - }, - ) - - if response2: - self.test_results["clarification_scenarios"]["clarification_deduplication"] = True - self.logger.info(" โœ… Clarification with file deduplication working") - - return True - - except Exception as e: - self.logger.error(f"Clarification deduplication test failed: {e}") - return False - - def _test_clarification_loop(self) -> bool: - """Test multiple rounds of clarification in a single conversation""" - try: - self.logger.info(" 6.4: Testing clarification loop scenarios") - - # Create a complex file that would need multiple clarifications - complex_content = '''""" -Complex module with multiple dependencies and configurations -""" - -import config -import database -import cache -from external_api import APIClient - -def process_data(data): - # Complex processing that would need clarification on each component - conn = database.get_connection(config.DB_CONFIG) - cached_result = cache.get(data.id) - api_result = 
APIClient().fetch_additional_data(data.external_id) - - return combine_results(cached_result, api_result) -''' - - complex_file = os.path.join(self.test_dir, "complex_module.py") - with open(complex_file, "w") as f: - f.write(complex_content) - - # Simulate multiple clarification rounds - # This is a structure test - in real implementation, each round would provide more files - responses = [] - - # Round 1: Initial request - response1 = self._simulate_clarification_request( - "debug", {"files": [complex_file], "error_description": "Complex error in data processing pipeline"} - ) - responses.append(response1) - - # Round 2: Provide config.py but still need database.py - if response1: - response2 = self._simulate_clarification_request( - "debug", - { - "files": [complex_file, self.test_files["config"]], - "error_description": "Still need database configuration", - "continuation_id": "mock_thread_id", - }, - ) - responses.append(response2) - - if all(responses): - self.test_results["clarification_scenarios"]["clarification_loop"] = True - self.logger.info(" โœ… Clarification loop structure verified") - - # Cleanup - os.remove(complex_file) - return True - - except Exception as e: - self.logger.error(f"Clarification loop test failed: {e}") - return False - - def _test_partial_file_provision(self) -> bool: - """Test edge case where user provides only some of requested files""" - try: - self.logger.info(" 6.5: Testing partial file provision edge case") - - # This test would verify that when a tool asks for multiple files - # but user only provides some, the conversation can continue gracefully - - # Create multiple related files - file1_content = '''"""File 1 - main module""" -def main_function(): - return "main" -''' - - file2_content = '''"""File 2 - utility module""" -def utility_function(): - return "utility" -''' - - file1_path = os.path.join(self.test_dir, "file1.py") - file2_path = os.path.join(self.test_dir, "file2.py") - - with open(file1_path, "w") as f: - f.write(file1_content) - with open(file2_path, "w") as f: - f.write(file2_content) - - # Simulate tool asking for both files - - # Simulate user providing only file1.py (partial provision) - # In real implementation, this should trigger another clarification for file2.py - partial_response = self._simulate_partial_file_response([file1_path]) - - if partial_response: - self.test_results["clarification_scenarios"]["partial_file_provision"] = True - self.logger.info(" โœ… Partial file provision edge case structure verified") - - # Cleanup - os.remove(file1_path) - os.remove(file2_path) - return True - - except Exception as e: - self.logger.error(f"Partial file provision test failed: {e}") - return False - - def _simulate_clarification_request(self, tool_name: str, params: dict) -> Optional[str]: - """ - Simulate a tool call that would trigger requires_clarification. - In real implementation, this would intercept the actual Gemini response. - """ - try: - # This is a mock implementation showing the structure - # In a real test, we would: - # 1. Mock the Gemini API response to return requires_clarification - # 2. Call the actual MCP tool - # 3. 
Verify the response format and conversation ID preservation - - mock_response = { - "status": "requires_clarification", - "question": f"Mock clarification from {tool_name} tool", - "files_needed": ["additional_file.py"], - "conversation_id": f"mock_thread_{tool_name}", - } - - self.logger.debug(f" ๐Ÿ“ Simulated {tool_name} clarification: {mock_response}") - return json.dumps(mock_response) - - except Exception as e: - self.logger.error(f"Clarification simulation failed: {e}") - return None - - def _simulate_partial_file_response(self, provided_files: list[str]) -> Optional[str]: - """Simulate user providing only some of the requested files""" - try: - # This would test the server's handling of incomplete file provision - mock_response = { - "status": "partial_provision", - "provided_files": provided_files, - "still_needed": ["missing_file.py"], - } - - self.logger.debug(f" ๐Ÿ“ Simulated partial file provision: {mock_response}") - return json.dumps(mock_response) - - except Exception as e: - self.logger.error(f"Partial file response simulation failed: {e}") - return None - - def _test_real_clarification_flow(self) -> bool: - """Test a real clarification flow that might trigger requires_clarification from Gemini""" - try: - self.logger.info(" 6.6: Testing real clarification flow with ambiguous prompts") - - # Create an intentionally ambiguous debugging scenario - ambiguous_content = '''""" -Ambiguous code that would be hard to debug without context -""" - -def mysterious_function(data): - result = process_data(data) # Where is process_data defined? - return result.transform() # What is the structure of result? - -class DataProcessor: - def __init__(self): - self.config = load_config() # Where is load_config from? - - def run(self): - return mysterious_function(self.get_data()) # Where is get_data? 
-''' - - ambiguous_file = os.path.join(self.test_dir, "ambiguous.py") - with open(ambiguous_file, "w") as f: - f.write(ambiguous_content) - - # Try debug tool with minimal context - this might trigger clarification - response1, thread_id = self._call_mcp_tool( - "debug", {"files": [ambiguous_file], "error_description": "Code crashes with AttributeError"} - ) - - if response1: - try: - response_data = json.loads(response1) - if response_data.get("status") == "requires_clarification": - self.logger.info(" ๐ŸŽฏ Real clarification response received!") - self.test_results["clarification_scenarios"]["real_clarification_flow"] = True - - # Test continuation with additional context - if thread_id: - # Provide additional files - continuation_response, _ = self._call_mcp_tool( - "debug", - { - "files": [ambiguous_file, self.test_files["python"]], - "error_description": "Additional context provided", - "continuation_id": thread_id, - }, - ) - - if continuation_response: - self.logger.info(" โœ… Clarification continuation working") - - else: - self.logger.info(" โ„น๏ธ No clarification triggered (Gemini provided analysis directly)") - self.test_results["clarification_scenarios"]["real_clarification_flow"] = True - - except json.JSONDecodeError: - self.logger.warning(" โš ๏ธ Could not parse response as JSON") - - # Cleanup - os.remove(ambiguous_file) - return True - - except Exception as e: - self.logger.error(f"Real clarification flow test failed: {e}") - return False - - def _call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]: - """Simulate calling an MCP tool via Claude CLI (docker exec)""" - try: - # Prepare the MCP initialization and tool call sequence - init_request = { - "jsonrpc": "2.0", - "id": 1, - "method": "initialize", - "params": { - "protocolVersion": "2024-11-05", - "capabilities": {"tools": {}}, - "clientInfo": {"name": "communication-simulator", "version": "1.0.0"}, - }, - } - - # Send initialized notification - initialized_notification = {"jsonrpc": "2.0", "method": "notifications/initialized"} - - # Prepare the tool call request - tool_request = { - "jsonrpc": "2.0", - "id": 2, - "method": "tools/call", - "params": {"name": tool_name, "arguments": params}, - } - - # Combine all messages - messages = [json.dumps(init_request), json.dumps(initialized_notification), json.dumps(tool_request)] - - # Join with newlines as MCP expects - input_data = "\n".join(messages) + "\n" - - # Simulate Claude CLI calling the MCP server via docker exec - docker_cmd = ["docker", "exec", "-i", self.container_name, "python", "server.py"] - - self.logger.debug(f"Calling MCP tool {tool_name} with proper initialization") - - # Execute the command - result = subprocess.run( - docker_cmd, input=input_data, text=True, capture_output=True, timeout=120 # 2 minute timeout - ) - - if result.returncode != 0: - self.logger.error(f"Docker exec failed: {result.stderr}") - return None, None - - # Parse the response - look for the tool call response - response_data = self._parse_mcp_response(result.stdout, expected_id=2) - if not response_data: - return None, None - - # Extract continuation_id if present - continuation_id = self._extract_continuation_id(response_data) - - return response_data, continuation_id - - except subprocess.TimeoutExpired: - self.logger.error(f"MCP tool call timed out: {tool_name}") - return None, None - except Exception as e: - self.logger.error(f"MCP tool call failed: {e}") - return None, None - - def _parse_mcp_response(self, stdout: str, expected_id: int 
= 2) -> Optional[str]: - """Parse MCP JSON-RPC response from stdout""" - try: - lines = stdout.strip().split("\n") - for line in lines: - if line.strip() and line.startswith("{"): - response = json.loads(line) - # Look for the tool call response with the expected ID - if response.get("id") == expected_id and "result" in response: - # Extract the actual content from the response - result = response["result"] - # Handle new response format with 'content' array - if isinstance(result, dict) and "content" in result: - content_array = result["content"] - if isinstance(content_array, list) and len(content_array) > 0: - return content_array[0].get("text", "") - # Handle legacy format - elif isinstance(result, list) and len(result) > 0: - return result[0].get("text", "") - elif response.get("id") == expected_id and "error" in response: - self.logger.error(f"MCP error: {response['error']}") - return None - - # If we get here, log all responses for debugging - self.logger.warning(f"No valid tool call response found for ID {expected_id}") - self.logger.debug(f"Full stdout: {stdout}") - return None - - except json.JSONDecodeError as e: - self.logger.error(f"Failed to parse MCP response: {e}") - self.logger.debug(f"Stdout that failed to parse: {stdout}") - return None - - def _extract_continuation_id(self, response_text: str) -> Optional[str]: - """Extract continuation_id from response metadata""" - try: - # Parse the response text as JSON to look for continuation metadata - response_data = json.loads(response_text) - - # Look for continuation_id in various places - if isinstance(response_data, dict): - # Check metadata - metadata = response_data.get("metadata", {}) - if "thread_id" in metadata: - return metadata["thread_id"] - - # Check follow_up_request - follow_up = response_data.get("follow_up_request", {}) - if follow_up and "continuation_id" in follow_up: - return follow_up["continuation_id"] - - # Check continuation_offer - continuation_offer = response_data.get("continuation_offer", {}) - if continuation_offer and "continuation_id" in continuation_offer: - return continuation_offer["continuation_id"] - - self.logger.debug(f"No continuation_id found in response: {response_data}") - return None - - except json.JSONDecodeError as e: - self.logger.debug(f"Failed to parse response for continuation_id: {e}") - return None - - def validate_docker_logs(self) -> bool: - """Validate Docker logs to confirm file deduplication behavior""" - try: - self.logger.info("๐Ÿ“‹ Validating Docker logs for file deduplication...") - - # Get server logs from both main container and activity logs - result = self._run_command(["docker", "logs", self.container_name], capture_output=True) - - if result.returncode != 0: - self.logger.error(f"Failed to get Docker logs: {result.stderr}") - return False - - main_logs = result.stdout.decode() + result.stderr.decode() - - # Also get activity logs for more detailed conversation tracking - activity_result = self._run_command( - ["docker", "exec", self.container_name, "cat", "/tmp/mcp_activity.log"], capture_output=True - ) - - activity_logs = "" - if activity_result.returncode == 0: - activity_logs = activity_result.stdout.decode() - - logs = main_logs + "\n" + activity_logs - - # Look for conversation threading patterns that indicate the system is working - conversation_patterns = [ - "CONVERSATION_RESUME", - "CONVERSATION_CONTEXT", - "previous turns loaded", - "tool embedding", - "files included", - "files truncated", - "already in conversation history", - ] - - 
conversation_lines = [] - for line in logs.split("\n"): - for pattern in conversation_patterns: - if pattern.lower() in line.lower(): - conversation_lines.append(line.strip()) - break - - # Look for evidence of conversation threading and file handling - conversation_threading_found = False - multi_turn_conversations = False - - for line in conversation_lines: - lower_line = line.lower() - if "conversation_resume" in lower_line: - conversation_threading_found = True - self.logger.debug(f"๐Ÿ“„ Conversation threading: {line}") - elif "previous turns loaded" in lower_line: - multi_turn_conversations = True - self.logger.debug(f"๐Ÿ“„ Multi-turn conversation: {line}") - elif "already in conversation" in lower_line: - self.logger.info(f"โœ… Found explicit deduplication: {line}") - return True - - # Conversation threading with multiple turns is evidence of file deduplication working - if conversation_threading_found and multi_turn_conversations: - self.logger.info("โœ… Conversation threading with multi-turn context working") - self.logger.info( - "โœ… File deduplication working implicitly (files embedded once in conversation history)" - ) - self.test_results["logs_validation"] = True - return True - elif conversation_threading_found: - self.logger.info("โœ… Conversation threading detected") - return True + if result: + self.logger.info(f"โœ… Individual test {test_name} passed") else: - self.logger.warning("โš ๏ธ No clear evidence of conversation threading in logs") - self.logger.debug(f"Found {len(conversation_lines)} conversation-related log lines") - return False + self.logger.error(f"โŒ Individual test {test_name} failed") + + return result except Exception as e: - self.logger.error(f"Log validation failed: {e}") + self.logger.error(f"Individual test {test_name} failed with exception: {e}") return False + finally: + if not skip_docker_setup and not self.keep_logs: + self.cleanup() - def validate_conversation_memory(self) -> bool: - """Validate that conversation memory is working via Redis""" + def get_available_tests(self) -> dict[str, str]: + """Get available tests with descriptions""" + descriptions = {} + for name, test_class in self.test_registry.items(): + # Create temporary instance to get description + temp_instance = test_class(verbose=False) + descriptions[name] = temp_instance.test_description + return descriptions + + def print_test_summary(self): + """Print comprehensive test results summary""" + print("\\n" + "=" * 70) + print("๐Ÿงช GEMINI MCP COMMUNICATION SIMULATOR - TEST RESULTS SUMMARY") + print("=" * 70) + + passed_count = sum(1 for result in self.test_results.values() if result) + total_count = len(self.test_results) + + for test_name, result in self.test_results.items(): + status = "โœ… PASS" if result else "โŒ FAIL" + # Get test description + temp_instance = self.test_registry[test_name](verbose=False) + description = temp_instance.test_description + print(f"๐Ÿ“ {description}: {status}") + + print(f"\\n๐ŸŽฏ OVERALL RESULT: {'๐ŸŽ‰ SUCCESS' if passed_count == total_count else 'โŒ FAILURE'}") + print(f"โœ… {passed_count}/{total_count} tests passed") + print("=" * 70) + return passed_count == total_count + + def run_full_test_suite(self, skip_docker_setup: bool = False) -> bool: + """Run the complete test suite""" try: - self.logger.info("๐Ÿ’พ Validating conversation memory via Redis...") + self.logger.info("๐Ÿš€ Starting Gemini MCP Communication Simulator Test Suite") - # Check Redis for stored conversations - result = self._run_command( - ["docker", "exec", 
self.redis_container, "redis-cli", "KEYS", "thread:*"], capture_output=True - ) - - if result.returncode != 0: - self.logger.error("Failed to query Redis") - return False - - keys = result.stdout.decode().strip().split("\n") - thread_keys = [k for k in keys if k.startswith("thread:")] - - if thread_keys: - self.logger.info(f"โœ… Found {len(thread_keys)} conversation threads in Redis") - - # Get details of first thread - if thread_keys: - thread_key = thread_keys[0] - result = self._run_command( - ["docker", "exec", self.redis_container, "redis-cli", "GET", thread_key], capture_output=True - ) - - if result.returncode == 0: - thread_data = result.stdout.decode() - try: - parsed = json.loads(thread_data) - turns = parsed.get("turns", []) - self.logger.info(f"โœ… Thread has {len(turns)} turns") - self.test_results["redis_validation"] = True - return True - except json.JSONDecodeError: - self.logger.warning("Could not parse thread data") - - self.test_results["redis_validation"] = True - return True + # Setup + if not skip_docker_setup: + if not self.setup_test_environment(): + self.logger.error("โŒ Environment setup failed") + return False else: - self.logger.warning("โš ๏ธ No conversation threads found in Redis") + self.logger.info("โฉ Skipping Docker setup (containers assumed running)") + + # Main simulation + if not self.simulate_claude_cli_session(): + self.logger.error("โŒ Claude CLI simulation failed") return False + # Print comprehensive summary + overall_success = self.print_test_summary() + + return overall_success + except Exception as e: - self.logger.error(f"Conversation memory validation failed: {e}") + self.logger.error(f"Test suite failed: {e}") return False + finally: + if not self.keep_logs and not skip_docker_setup: + self.cleanup() def cleanup(self): """Cleanup test environment""" @@ -1412,11 +345,6 @@ class DataProcessor: shutil.rmtree(self.temp_dir) self.logger.debug(f"Removed temp directory: {self.temp_dir}") - # Remove test files directory - if hasattr(self, "test_dir") and self.test_dir and os.path.exists(self.test_dir): - shutil.rmtree(self.test_dir) - self.logger.debug(f"Removed test files directory: {self.test_dir}") - except Exception as e: self.logger.error(f"Cleanup failed: {e}") @@ -1427,185 +355,101 @@ class DataProcessor: return subprocess.run(cmd, check=check, capture_output=capture_output, **kwargs) - def print_test_summary(self): - """Print comprehensive test results summary""" - print("\n" + "=" * 70) - print("๐Ÿงช GEMINI MCP COMMUNICATION SIMULATOR - TEST RESULTS SUMMARY") - print("=" * 70) - # Basic conversation flow - status = "โœ… PASS" if self.test_results["basic_conversation"] else "โŒ FAIL" - print(f"๐Ÿ“ Basic Conversation Flow: {status}") +def parse_arguments(): + """Parse and validate command line arguments""" + parser = argparse.ArgumentParser(description="Gemini MCP Communication Simulator Test") + parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging") + parser.add_argument("--keep-logs", action="store_true", help="Keep Docker services running for log inspection") + parser.add_argument("--tests", "-t", nargs="+", help="Specific tests to run (space-separated)") + parser.add_argument("--list-tests", action="store_true", help="List available tests and exit") + parser.add_argument("--individual", "-i", help="Run a single test individually") + parser.add_argument("--skip-docker", action="store_true", help="Skip Docker setup (assumes containers are already running)") - # Per-tool tests - print("\n๐Ÿ“„ Per-Tool 
File Deduplication Tests:") - tools_tested = len(self.test_results["per_tool_tests"]) - tools_passed = sum(1 for passed in self.test_results["per_tool_tests"].values() if passed) + return parser.parse_args() - if tools_tested > 0: - for tool, passed in self.test_results["per_tool_tests"].items(): - status = "โœ… PASS" if passed else "โŒ FAIL" - print(f" โ€ข {tool}: {status}") - print(f" โ†’ Summary: {tools_passed}/{tools_tested} tools passed") + +def list_available_tests(): + """List all available tests and exit""" + simulator = CommunicationSimulator() + print("Available tests:") + for test_name, description in simulator.get_available_tests().items(): + print(f" {test_name:<25} - {description}") + + +def run_individual_test(simulator, test_name, skip_docker): + """Run a single test individually""" + try: + success = simulator.run_individual_test(test_name, skip_docker_setup=skip_docker) + + if success: + print(f"\\n๐ŸŽ‰ INDIVIDUAL TEST {test_name.upper()}: PASSED") + return 0 else: - print(" โ†’ No tools tested") + print(f"\\nโŒ INDIVIDUAL TEST {test_name.upper()}: FAILED") + return 1 + + except KeyboardInterrupt: + print(f"\\n๐Ÿ›‘ Individual test {test_name} interrupted by user") + if not skip_docker: + simulator.cleanup() + return 130 + except Exception as e: + print(f"\\n๐Ÿ’ฅ Individual test {test_name} failed with error: {e}") + if not skip_docker: + simulator.cleanup() + return 1 - # Cross-tool scenarios - print("\n๐Ÿ”ง Cross-Tool Continuation Scenarios:") - scenarios_tested = len(self.test_results["cross_tool_scenarios"]) - scenarios_passed = sum(1 for passed in self.test_results["cross_tool_scenarios"].values() if passed is True) - if scenarios_tested > 0: - scenario_names = { - "chat_thinkdeep_codereview": "chat โ†’ thinkdeep โ†’ codereview", - "analyze_debug_thinkdeep": "analyze โ†’ debug โ†’ thinkdeep", - "multi_file_continuation": "Multi-file continuation", - "state_isolation": "State isolation (contamination detection)", - "conversation_boundaries": "Conversation boundaries & reset behavior", - } +def run_test_suite(simulator, skip_docker=False): + """Run the full test suite or selected tests""" + try: + success = simulator.run_full_test_suite(skip_docker_setup=skip_docker) - for scenario, passed in self.test_results["cross_tool_scenarios"].items(): - name = scenario_names.get(scenario, scenario) - if passed is True: - status = "โœ… PASS" - elif passed is False: - status = "โŒ FAIL" - else: - status = "โธ๏ธ SKIP" - print(f" โ€ข {name}: {status}") - print(f" โ†’ Summary: {scenarios_passed}/{scenarios_tested} scenarios passed") + if success: + print("\\n๐ŸŽ‰ COMPREHENSIVE MCP COMMUNICATION TEST: PASSED") + return 0 else: - print(" โ†’ No scenarios tested") + print("\\nโŒ COMPREHENSIVE MCP COMMUNICATION TEST: FAILED") + print("โš ๏ธ Check detailed results above") + return 1 - # Clarification scenarios - print("\n๐Ÿ” Requires Clarification Scenarios:") - clarification_tested = len(self.test_results["clarification_scenarios"]) - clarification_passed = sum( - 1 for passed in self.test_results["clarification_scenarios"].values() if passed is True - ) - - if clarification_tested > 0: - clarification_names = { - "debug_clarification": "Debug tool clarification flow", - "analyze_clarification": "Analyze tool clarification flow", - "clarification_deduplication": "Clarification with file deduplication", - "clarification_loop": "Multiple round clarification (loop)", - "partial_file_provision": "Partial file provision edge case", - "real_clarification_flow": "Real clarification flow 
with ambiguous prompts", - } - - for scenario, passed in self.test_results["clarification_scenarios"].items(): - name = clarification_names.get(scenario, scenario) - if passed is True: - status = "โœ… PASS" - elif passed is False: - status = "โŒ FAIL" - else: - status = "โธ๏ธ SKIP" - print(f" โ€ข {name}: {status}") - print(f" โ†’ Summary: {clarification_passed}/{clarification_tested} clarification scenarios passed") - else: - print(" โ†’ No clarification scenarios tested") - - # System validation - print("\n๐Ÿ’พ System Validation:") - logs_status = "โœ… PASS" if self.test_results["logs_validation"] else "โŒ FAIL" - redis_status = "โœ… PASS" if self.test_results["redis_validation"] else "โŒ FAIL" - print(f" โ€ข Docker logs (conversation threading): {logs_status}") - print(f" โ€ข Redis memory (conversation persistence): {redis_status}") - - # Overall result - all_core_tests = [ - self.test_results["basic_conversation"], - self.test_results["logs_validation"], - self.test_results["redis_validation"], - ] - - tool_tests_ok = tools_tested == 0 or tools_passed > 0 - scenario_tests_ok = scenarios_tested == 0 or scenarios_passed > 0 - clarification_tests_ok = clarification_tested == 0 or clarification_passed > 0 - - overall_success = all(all_core_tests) and tool_tests_ok and scenario_tests_ok and clarification_tests_ok - - print(f"\n๐ŸŽฏ OVERALL RESULT: {'๐ŸŽ‰ SUCCESS' if overall_success else 'โŒ FAILURE'}") - - if overall_success: - print("โœ… MCP server conversation continuity and file deduplication working correctly!") - print("โœ… All core systems validated") - if tools_passed > 0: - print(f"โœ… {tools_passed} tools working with file deduplication") - if scenarios_passed > 0: - print(f"โœ… {scenarios_passed} cross-tool scenarios working") - if clarification_passed > 0: - print(f"โœ… {clarification_passed} clarification scenarios verified") - else: - print("โš ๏ธ Some tests failed - check individual results above") - - print("=" * 70) - return overall_success - - def run_full_test_suite(self) -> bool: - """Run the complete test suite""" - try: - self.logger.info("๐Ÿš€ Starting Gemini MCP Communication Simulator Test Suite") - - # Setup - if not self.setup_test_environment(): - self.logger.error("โŒ Environment setup failed") - return False - - # Main simulation - if not self.simulate_claude_cli_session(): - self.logger.error("โŒ Claude CLI simulation failed") - return False - - # Validation - self.validate_docker_logs() - self.validate_conversation_memory() - - # Print comprehensive summary - overall_success = self.print_test_summary() - - return overall_success - - except Exception as e: - self.logger.error(f"Test suite failed: {e}") - return False - finally: - if not self.keep_logs: - self.cleanup() + except KeyboardInterrupt: + print("\\n๐Ÿ›‘ Test interrupted by user") + if not skip_docker: + simulator.cleanup() + return 130 + except Exception as e: + print(f"\\n๐Ÿ’ฅ Unexpected error: {e}") + if not skip_docker: + simulator.cleanup() + return 1 def main(): """Main entry point""" - parser = argparse.ArgumentParser(description="Gemini MCP Communication Simulator Test") - parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging") - parser.add_argument("--keep-logs", action="store_true", help="Keep Docker services running for log inspection") + args = parse_arguments() - args = parser.parse_args() + # Handle list tests request + if args.list_tests: + list_available_tests() + return - simulator = CommunicationSimulator(verbose=args.verbose, 
keep_logs=args.keep_logs) + # Initialize simulator consistently for all use cases + simulator = CommunicationSimulator( + verbose=args.verbose, + keep_logs=args.keep_logs, + selected_tests=args.tests + ) - try: - success = simulator.run_full_test_suite() - - if success: - print("\n๐ŸŽ‰ COMPREHENSIVE MCP COMMUNICATION TEST: PASSED") - sys.exit(0) - else: - print("\nโŒ COMPREHENSIVE MCP COMMUNICATION TEST: FAILED") - print("โš ๏ธ Check detailed results above") - sys.exit(1) - - except KeyboardInterrupt: - print("\n๐Ÿ›‘ Test interrupted by user") - simulator.cleanup() - sys.exit(130) - except Exception as e: - print(f"\n๐Ÿ’ฅ Unexpected error: {e}") - simulator.cleanup() - sys.exit(1) + # Determine execution mode and run + if args.individual: + exit_code = run_individual_test(simulator, args.individual, args.skip_docker) + else: + exit_code = run_test_suite(simulator, args.skip_docker) + + sys.exit(exit_code) if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/communication_simulator_test_old.py b/communication_simulator_test_old.py new file mode 100755 index 0000000..055e254 --- /dev/null +++ b/communication_simulator_test_old.py @@ -0,0 +1,1994 @@ +#!/usr/bin/env python3 +""" +Communication Simulator Test for Gemini MCP Server + +This script provides comprehensive end-to-end testing of the Gemini MCP server +by simulating real Claude CLI communications and validating conversation +continuity, file handling, deduplication features, and clarification scenarios. + +Test Flow: +1. Setup fresh Docker environment with clean containers +2. Simulate Claude CLI tool calls via docker exec +3. Test conversation threading with file handling +4. Validate file deduplication in conversation history +5. Test requires_clarification scenarios and continuation flows +6. Validate edge cases like partial file provision and clarification loops +7. Check Docker logs for proper behavior +8. Cleanup and report results + +New Clarification Testing Features: +- Debug tool clarification scenarios +- Analyze tool clarification flows +- Clarification with file deduplication across turns +- Multiple round clarification loops +- Partial file provision edge cases +- Real clarification flows with ambiguous prompts + +Usage: + python communication_simulator_test.py [--verbose] [--keep-logs] [--tests TEST_NAME...] 
[--individual TEST_NAME] [--skip-docker] + + --tests: Run specific tests only (space-separated) + --list-tests: List all available tests + --individual: Run a single test individually + --skip-docker: Skip Docker setup (assumes containers are already running) + +Available tests: + basic_conversation - Basic conversation flow with chat tool + per_tool_deduplication - File deduplication for individual tools + cross_tool_continuation - Cross-tool conversation continuation scenarios + state_isolation - State isolation and contamination detection + conversation_boundaries - Conversation boundaries and reset behavior + clarification_scenarios - Requires clarification scenarios + content_validation - Content validation and duplicate detection + logs_validation - Docker logs validation + redis_validation - Redis conversation memory validation + +Examples: + # Run all tests + python communication_simulator_test.py + + # Run only basic conversation and content validation tests + python communication_simulator_test.py --tests basic_conversation content_validation + + # Run a single test individually (with full Docker setup) + python communication_simulator_test.py --individual content_validation + + # Run a single test individually (assuming Docker is already running) + python communication_simulator_test.py --individual content_validation --skip-docker + + # List available tests + python communication_simulator_test.py --list-tests +""" + +import argparse +import json +import logging +import os +import shutil +import subprocess +import sys +import tempfile +import time +from typing import Optional + + +class CommunicationSimulator: + """Simulates real-world Claude CLI communication with MCP Gemini server""" + + def __init__(self, verbose: bool = False, keep_logs: bool = False, selected_tests: list[str] = None): + self.verbose = verbose + self.keep_logs = keep_logs + self.selected_tests = selected_tests or [] + self.temp_dir = None + self.container_name = "gemini-mcp-server" + self.redis_container = "gemini-mcp-redis" + + # Import test registry + from simulator_tests import TEST_REGISTRY + self.test_registry = TEST_REGISTRY + + # Available test methods mapping + self.available_tests = { + name: self._create_test_runner(test_class) + for name, test_class in self.test_registry.items() + } + + # Test result tracking + self.test_results = { + "basic_conversation": False, + "per_tool_tests": {}, + "cross_tool_scenarios": {}, + "clarification_scenarios": {}, + "content_validation": {}, + "logs_validation": False, + "redis_validation": False, + } + + # Configure logging + log_level = logging.DEBUG if verbose else logging.INFO + logging.basicConfig(level=log_level, format="%(asctime)s - %(levelname)s - %(message)s") + self.logger = logging.getLogger(__name__) + + def _create_test_runner(self, test_class): + """Create a test runner function for a test class""" + def run_test(): + test_instance = test_class(verbose=self.verbose) + return test_instance.run_test() + return run_test + + def setup_test_environment(self) -> bool: + """Setup fresh Docker environment and test files""" + try: + self.logger.info("๐Ÿš€ Setting up test environment...") + + # Create temporary directory for test files + self.temp_dir = tempfile.mkdtemp(prefix="mcp_test_") + self.logger.debug(f"Created temp directory: {self.temp_dir}") + + # Create test files + self._create_test_files() + + # Setup Docker environment + return self._setup_docker() + + except Exception as e: + self.logger.error(f"Failed to setup test environment: {e}") + 
return False + + def _create_test_files(self): + """Create test files for the simulation in a location accessible by Docker""" + # Test Python file + python_content = '''""" +Sample Python module for testing MCP conversation continuity +""" + +def fibonacci(n): + """Calculate fibonacci number recursively""" + if n <= 1: + return n + return fibonacci(n-1) + fibonacci(n-2) + +def factorial(n): + """Calculate factorial iteratively""" + result = 1 + for i in range(1, n + 1): + result *= i + return result + +class Calculator: + """Simple calculator class""" + + def __init__(self): + self.history = [] + + def add(self, a, b): + result = a + b + self.history.append(f"{a} + {b} = {result}") + return result + + def multiply(self, a, b): + result = a * b + self.history.append(f"{a} * {b} = {result}") + return result +''' + + # Test configuration file + config_content = """{ + "database": { + "host": "localhost", + "port": 5432, + "name": "testdb", + "ssl": true + }, + "cache": { + "redis_url": "redis://localhost:6379", + "ttl": 3600 + }, + "logging": { + "level": "INFO", + "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + } +}""" + + # Create files in the current project directory so they're accessible to MCP tools + # MCP tools can access files with absolute paths within the project + current_dir = os.getcwd() + test_dir = os.path.join(current_dir, "test_simulation_files") + os.makedirs(test_dir, exist_ok=True) + + test_py = os.path.join(test_dir, "test_module.py") + test_config = os.path.join(test_dir, "config.json") + + with open(test_py, "w") as f: + f.write(python_content) + with open(test_config, "w") as f: + f.write(config_content) + + self.test_files = {"python": test_py, "config": test_config} + + # Store test directory for cleanup + self.test_dir = test_dir + + self.logger.debug(f"Created test files: {list(self.test_files.values())}") + + def _setup_docker(self) -> bool: + """Setup fresh Docker environment""" + try: + self.logger.info("๐Ÿณ Setting up Docker environment...") + + # Stop and remove existing containers + self._run_command(["docker", "compose", "down", "--remove-orphans"], check=False, capture_output=True) + + # Clean up any old containers/images + old_containers = [self.container_name, self.redis_container] + for container in old_containers: + self._run_command(["docker", "stop", container], check=False, capture_output=True) + self._run_command(["docker", "rm", container], check=False, capture_output=True) + + # Build and start services + self.logger.info("๐Ÿ“ฆ Building Docker images...") + result = self._run_command(["docker", "compose", "build", "--no-cache"], capture_output=True) + if result.returncode != 0: + self.logger.error(f"Docker build failed: {result.stderr}") + return False + + self.logger.info("๐Ÿš€ Starting Docker services...") + result = self._run_command(["docker", "compose", "up", "-d"], capture_output=True) + if result.returncode != 0: + self.logger.error(f"Docker startup failed: {result.stderr}") + return False + + # Wait for services to be ready + self.logger.info("โณ Waiting for services to be ready...") + time.sleep(10) # Give services time to initialize + + # Verify containers are running + if not self._verify_containers(): + return False + + self.logger.info("โœ… Docker environment ready") + return True + + except Exception as e: + self.logger.error(f"Docker setup failed: {e}") + return False + + def _verify_containers(self) -> bool: + """Verify that required containers are running""" + try: + result = self._run_command(["docker", 
"ps", "--format", "{{.Names}}"], capture_output=True) + running_containers = result.stdout.decode().strip().split("\n") + + required = [self.container_name, self.redis_container] + for container in required: + if container not in running_containers: + self.logger.error(f"Container not running: {container}") + return False + + self.logger.debug(f"Verified containers running: {required}") + return True + + except Exception as e: + self.logger.error(f"Container verification failed: {e}") + return False + + def simulate_claude_cli_session(self) -> bool: + """Simulate a complete Claude CLI session with conversation continuity""" + try: + self.logger.info("๐Ÿค– Starting Claude CLI simulation...") + + # If specific tests are selected, run only those + if self.selected_tests: + return self._run_selected_tests() + + # Otherwise run all tests in order + test_sequence = [ + "basic_conversation", + "per_tool_deduplication", + "cross_tool_continuation", + "state_isolation", + "conversation_boundaries", + "clarification_scenarios", + "content_validation" + ] + + for test_name in test_sequence: + if not self._run_single_test(test_name): + return False + + self.logger.info("โœ… All conversation continuity, clarification, and content validation tests passed") + return True + + except Exception as e: + self.logger.error(f"Claude CLI simulation failed: {e}") + return False + + def _run_selected_tests(self) -> bool: + """Run only the selected tests""" + try: + self.logger.info(f"๐ŸŽฏ Running selected tests: {', '.join(self.selected_tests)}") + + for test_name in self.selected_tests: + if not self._run_single_test(test_name): + return False + + self.logger.info("โœ… All selected tests passed") + return True + + except Exception as e: + self.logger.error(f"Selected tests failed: {e}") + return False + + def _run_single_test(self, test_name: str) -> bool: + """Run a single test by name""" + try: + if test_name not in self.available_tests: + self.logger.error(f"Unknown test: {test_name}") + self.logger.info(f"Available tests: {', '.join(self.available_tests.keys())}") + return False + + self.logger.info(f"๐Ÿงช Running test: {test_name}") + test_function = self.available_tests[test_name] + result = test_function() + + if result: + self.logger.info(f"โœ… Test {test_name} passed") + else: + self.logger.error(f"โŒ Test {test_name} failed") + + return result + + except Exception as e: + self.logger.error(f"Test {test_name} failed with exception: {e}") + return False + + def get_available_tests(self) -> dict[str, str]: + """Get available tests with descriptions""" + descriptions = {} + for name, test_class in self.test_registry.items(): + # Create temporary instance to get description + temp_instance = test_class(verbose=False) + descriptions[name] = temp_instance.test_description + return descriptions + + def _test_basic_conversation_flow(self) -> bool: + """Test basic conversation flow with chat tool""" + try: + self.logger.info("๐Ÿ“ Test 1: Basic conversation flow") + + # Initial chat tool call with file + self.logger.info(" 1.1: Initial chat with file analysis") + response1, continuation_id = self._call_mcp_tool( + "chat", + {"prompt": "Analyze this Python code and explain what it does", "files": [self.test_files["python"]]}, + ) + + if not response1 or not continuation_id: + self.logger.error("Failed to get initial response with continuation_id") + return False + + self.logger.info(f" โœ… Got continuation_id: {continuation_id}") + + # Continue conversation with same file (should be deduplicated) + 
self.logger.info(" 1.2: Continue conversation with same file") + response2, _ = self._call_mcp_tool( + "chat", + { + "prompt": "Now focus on the Calculator class specifically. Are there any improvements you'd suggest?", + "files": [self.test_files["python"]], # Same file - should be deduplicated + "continuation_id": continuation_id, + }, + ) + + if not response2: + self.logger.error("Failed to continue conversation") + return False + + # Continue with additional file + self.logger.info(" 1.3: Continue conversation with additional file") + response3, _ = self._call_mcp_tool( + "chat", + { + "prompt": "Now also analyze this configuration file and see how it might relate to the Python code", + "files": [self.test_files["python"], self.test_files["config"]], + "continuation_id": continuation_id, + }, + ) + + if not response3: + self.logger.error("Failed to continue with additional file") + return False + + self.logger.info(" โœ… Basic conversation flow working") + self.test_results["basic_conversation"] = True + return True + + except Exception as e: + self.logger.error(f"Basic conversation flow test failed: {e}") + return False + + def _test_per_tool_file_deduplication(self) -> bool: + """Test file deduplication for each individual tool""" + try: + self.logger.info("๐Ÿ“„ Test 2: Per-tool file deduplication") + + tools_to_test = [ + ( + "thinkdeep", + { + "prompt": "Think deeply about this Python code and identify potential architectural improvements", + "files": [self.test_files["python"]], + }, + ), + ("analyze", {"files": [self.test_files["python"]], "analysis_type": "architecture"}), + ( + "debug", + { + "files": [self.test_files["python"]], + "issue_description": "The fibonacci function seems slow for large numbers", + }, + ), + ( + "codereview", + { + "files": [self.test_files["python"]], + "context": "General code review for quality and best practices", + }, + ), + ] + + for tool_name, initial_params in tools_to_test: + self.logger.info(f" 2.{tool_name}: Testing {tool_name} tool file deduplication") + + # Initial call + response1, continuation_id = self._call_mcp_tool(tool_name, initial_params) + if not response1: + self.logger.warning(f" โš ๏ธ {tool_name} tool initial call failed, skipping") + continue + + if not continuation_id: + self.logger.warning(f" โš ๏ธ {tool_name} tool didn't provide continuation_id, skipping") + continue + + # Continue with same file - should be deduplicated + continue_params = initial_params.copy() + continue_params["continuation_id"] = continuation_id + + if tool_name == "thinkdeep": + continue_params["prompt"] = "Now focus specifically on the recursive fibonacci implementation" + elif tool_name == "analyze": + continue_params["analysis_type"] = "performance" + elif tool_name == "debug": + continue_params["issue_description"] = "How can we optimize the fibonacci function?" 
+ elif tool_name == "codereview": + continue_params["context"] = "Focus on the Calculator class implementation" + + response2, _ = self._call_mcp_tool(tool_name, continue_params) + if response2: + self.logger.info(f" โœ… {tool_name} tool file deduplication working") + self.test_results["per_tool_tests"][tool_name] = True + else: + self.logger.warning(f" โš ๏ธ {tool_name} tool continuation failed") + self.test_results["per_tool_tests"][tool_name] = False + + self.logger.info(" โœ… Per-tool file deduplication tests completed") + return True + + except Exception as e: + self.logger.error(f"Per-tool file deduplication test failed: {e}") + return False + + def _test_cross_tool_continuation(self) -> bool: + """Test comprehensive cross-tool continuation scenarios""" + try: + self.logger.info("๐Ÿ”ง Test 3: Cross-tool continuation scenarios") + + # Scenario 1: chat -> thinkdeep -> codereview + self.logger.info(" 3.1: Testing chat -> thinkdeep -> codereview") + + # Start with chat + chat_response, chat_id = self._call_mcp_tool( + "chat", + { + "prompt": "Look at this Python code and tell me what you think about it", + "files": [self.test_files["python"]], + }, + ) + + if not chat_response or not chat_id: + self.logger.error("Failed to start chat conversation") + return False + + # Continue with thinkdeep + thinkdeep_response, _ = self._call_mcp_tool( + "thinkdeep", + { + "prompt": "Think deeply about potential performance issues in this code", + "files": [self.test_files["python"]], # Same file should be deduplicated + "continuation_id": chat_id, + }, + ) + + if not thinkdeep_response: + self.logger.error("Failed chat -> thinkdeep continuation") + return False + + # Continue with codereview + codereview_response, _ = self._call_mcp_tool( + "codereview", + { + "files": [self.test_files["python"]], # Same file should be deduplicated + "context": "Building on our previous analysis, provide a comprehensive code review", + "continuation_id": chat_id, + }, + ) + + if not codereview_response: + self.logger.error("Failed thinkdeep -> codereview continuation") + return False + + self.logger.info(" โœ… chat -> thinkdeep -> codereview working") + self.test_results["cross_tool_scenarios"]["chat_thinkdeep_codereview"] = True + + # Scenario 2: analyze -> debug -> thinkdeep + self.logger.info(" 3.2: Testing analyze -> debug -> thinkdeep") + + # Start with analyze + analyze_response, analyze_id = self._call_mcp_tool( + "analyze", {"files": [self.test_files["python"]], "analysis_type": "code_quality"} + ) + + if not analyze_response or not analyze_id: + self.logger.warning("Failed to start analyze conversation, skipping scenario 2") + else: + # Continue with debug + debug_response, _ = self._call_mcp_tool( + "debug", + { + "files": [self.test_files["python"]], # Same file should be deduplicated + "issue_description": "Based on our analysis, help debug the performance issue in fibonacci", + "continuation_id": analyze_id, + }, + ) + + if debug_response: + # Continue with thinkdeep + final_response, _ = self._call_mcp_tool( + "thinkdeep", + { + "prompt": "Think deeply about the architectural implications of the issues we've found", + "files": [self.test_files["python"]], # Same file should be deduplicated + "continuation_id": analyze_id, + }, + ) + + if final_response: + self.logger.info(" โœ… analyze -> debug -> thinkdeep working") + self.test_results["cross_tool_scenarios"]["analyze_debug_thinkdeep"] = True + else: + self.logger.warning(" โš ๏ธ debug -> thinkdeep continuation failed") + 
self.test_results["cross_tool_scenarios"]["analyze_debug_thinkdeep"] = False + else: + self.logger.warning(" โš ๏ธ analyze -> debug continuation failed") + self.test_results["cross_tool_scenarios"]["analyze_debug_thinkdeep"] = False + + # Scenario 3: Multi-file cross-tool continuation + self.logger.info(" 3.3: Testing multi-file cross-tool continuation") + + # Start with both files + multi_response, multi_id = self._call_mcp_tool( + "chat", + { + "prompt": "Analyze both the Python code and configuration file", + "files": [self.test_files["python"], self.test_files["config"]], + }, + ) + + if not multi_response or not multi_id: + self.logger.warning("Failed to start multi-file conversation, skipping scenario 3") + else: + # Switch to codereview with same files (should use conversation history) + multi_review, _ = self._call_mcp_tool( + "codereview", + { + "files": [self.test_files["python"], self.test_files["config"]], # Same files + "context": "Review both files in the context of our previous discussion", + "continuation_id": multi_id, + }, + ) + + if multi_review: + self.logger.info(" โœ… Multi-file cross-tool continuation working") + self.test_results["cross_tool_scenarios"]["multi_file_continuation"] = True + else: + self.logger.warning(" โš ๏ธ Multi-file cross-tool continuation failed") + self.test_results["cross_tool_scenarios"]["multi_file_continuation"] = False + + self.logger.info(" โœ… Cross-tool continuation scenarios completed") + return True + + except Exception as e: + self.logger.error(f"Cross-tool continuation test failed: {e}") + return False + + def _test_state_isolation(self) -> bool: + """Test that different conversation threads don't contaminate each other""" + try: + self.logger.info("๐Ÿ”’ Test 4: State isolation and contamination detection") + + # Create a test file specifically for this test + isolation_content = '''""" +Test file for state isolation testing +""" + +def isolated_function(): + """This function should only appear in isolation tests""" + return "ISOLATION_TEST_MARKER" + +class IsolationTestClass: + """Class that should not leak between conversations""" + def __init__(self): + self.marker = "ISOLATION_BOUNDARY" +''' + + isolation_file = os.path.join(self.test_dir, "isolation_test.py") + with open(isolation_file, "w") as f: + f.write(isolation_content) + + # Test 1: Start two separate conversation threads + self.logger.info(" 4.1: Creating separate conversation threads") + + # Thread A: Chat about original Python file + response_a1, thread_a = self._call_mcp_tool( + "chat", {"prompt": "Analyze this Python module", "files": [self.test_files["python"]]} + ) + + if not response_a1 or not thread_a: + self.logger.error("Failed to create thread A") + return False + + # Thread B: Chat about isolation test file + response_b1, thread_b = self._call_mcp_tool( + "chat", {"prompt": "Analyze this isolation test file", "files": [isolation_file]} + ) + + if not response_b1 or not thread_b: + self.logger.error("Failed to create thread B") + return False + + # Verify threads are different + if thread_a == thread_b: + self.logger.error("Threads are not isolated - same continuation_id returned") + return False + + self.logger.info(f" โœ… Created isolated threads: {thread_a[:8]}... 
and {thread_b[:8]}...") + + # Test 2: Continue both threads and check for contamination + self.logger.info(" 4.2: Testing cross-thread contamination") + + # Continue thread A - should only know about original Python file + response_a2, _ = self._call_mcp_tool( + "chat", {"prompt": "What functions did we discuss in the previous file?", "continuation_id": thread_a} + ) + + # Continue thread B - should only know about isolation file + response_b2, _ = self._call_mcp_tool( + "chat", {"prompt": "What functions did we discuss in the previous file?", "continuation_id": thread_b} + ) + + if not response_a2 or not response_b2: + self.logger.error("Failed to continue isolated threads") + return False + + # Parse responses to check for contamination + response_a2_data = json.loads(response_a2) + response_b2_data = json.loads(response_b2) + + content_a = response_a2_data.get("content", "") + content_b = response_b2_data.get("content", "") + + # Thread A should mention fibonacci/factorial, not isolation functions + # Thread B should mention isolation functions, not fibonacci/factorial + contamination_detected = False + + if "isolated_function" in content_a or "IsolationTestClass" in content_a: + self.logger.error("Thread A contaminated with Thread B content") + contamination_detected = True + + if "fibonacci" in content_b or "factorial" in content_b or "Calculator" in content_b: + self.logger.error("Thread B contaminated with Thread A content") + contamination_detected = True + + if contamination_detected: + self.test_results["cross_tool_scenarios"]["state_isolation"] = False + return False + + self.logger.info(" โœ… No cross-thread contamination detected") + + # Test 3: Cross-tool switching with isolation + self.logger.info(" 4.3: Testing cross-tool state isolation") + + # Switch thread A to codereview + response_a3, _ = self._call_mcp_tool( + "codereview", + { + "files": [self.test_files["python"]], + "context": "Review the code we discussed", + "continuation_id": thread_a, + }, + ) + + # Switch thread B to codereview + response_b3, _ = self._call_mcp_tool( + "codereview", + {"files": [isolation_file], "context": "Review the isolation test code", "continuation_id": thread_b}, + ) + + if response_a3 and response_b3: + self.logger.info(" โœ… Cross-tool isolation maintained") + self.test_results["cross_tool_scenarios"]["state_isolation"] = True + else: + self.logger.warning(" โš ๏ธ Cross-tool isolation test incomplete") + self.test_results["cross_tool_scenarios"]["state_isolation"] = False + + # Cleanup isolation test file + os.remove(isolation_file) + + self.logger.info(" โœ… State isolation tests completed") + return True + + except Exception as e: + self.logger.error(f"State isolation test failed: {e}") + return False + + def _test_conversation_boundaries(self) -> bool: + """Test conversation boundaries and proper reset behavior""" + try: + self.logger.info("๐Ÿšง Test 5: Conversation boundaries and reset behavior") + + # Test 1: Tool-to-tool-to-tool with fresh start + self.logger.info(" 5.1: Testing A->B->A pattern with fresh conversations") + + # Start with chat + response1, thread1 = self._call_mcp_tool( + "chat", {"prompt": "Analyze the fibonacci function in this code", "files": [self.test_files["python"]]} + ) + + if not response1 or not thread1: + self.logger.warning("Failed to start boundary test, skipping") + return True + + # Switch to codereview (continue conversation) + response2, _ = self._call_mcp_tool( + "codereview", + { + "files": [self.test_files["python"]], + "context": "Building on 
our fibonacci discussion", + "continuation_id": thread1, + }, + ) + + if not response2: + self.logger.warning("Failed codereview continuation") + return True + + # Switch back to chat but start FRESH conversation (no continuation_id) + self.logger.info(" 5.2: Testing fresh conversation after previous context") + response3, thread3 = self._call_mcp_tool( + "chat", + { + "prompt": "Tell me about the Calculator class in this file", # Different focus + "files": [self.test_files["python"]], # Same file but fresh context + }, + ) + + if not response3 or not thread3: + self.logger.warning("Failed fresh conversation test") + return True + + # Verify it's a truly fresh conversation + if thread1 == thread3: + self.logger.error("Fresh conversation got same thread ID - boundary violation!") + self.test_results["cross_tool_scenarios"]["conversation_boundaries"] = False + return False + + self.logger.info(f" โœ… Fresh conversation created: {thread3[:8]}... (vs {thread1[:8]}...)") + + # Test 2: Verify fresh conversation doesn't have stale context + self.logger.info(" 5.3: Testing stale context isolation") + + # Continue the fresh conversation - should not reference fibonacci discussion + response4, _ = self._call_mcp_tool( + "chat", {"prompt": "What did we just discuss about this code?", "continuation_id": thread3} + ) + + if response4: + response4_data = json.loads(response4) + content4 = response4_data.get("content", "") + + # Should reference Calculator class, not fibonacci from previous thread + if "fibonacci" in content4.lower() and "calculator" not in content4.lower(): + self.logger.error("Fresh conversation contaminated with stale context!") + self.test_results["cross_tool_scenarios"]["conversation_boundaries"] = False + return False + else: + self.logger.info(" โœ… Fresh conversation properly isolated from previous context") + + # Test 3: File access without continuation should work + self.logger.info(" 5.4: Testing file access in fresh conversations") + + # New conversation with same files - should read files fresh + response5, thread5 = self._call_mcp_tool( + "chat", + {"prompt": "What's the purpose of this configuration file?", "files": [self.test_files["config"]]}, + ) + + if response5 and thread5: + # Verify it can access the file content + response5_data = json.loads(response5) + content5 = response5_data.get("content", "") + + if "database" in content5.lower() or "redis" in content5.lower(): + self.logger.info(" โœ… Fresh conversation can access files correctly") + self.test_results["cross_tool_scenarios"]["conversation_boundaries"] = True + else: + self.logger.warning(" โš ๏ธ Fresh conversation may not be reading files properly") + self.test_results["cross_tool_scenarios"]["conversation_boundaries"] = False + else: + self.logger.warning(" โš ๏ธ Fresh conversation with config file failed") + self.test_results["cross_tool_scenarios"]["conversation_boundaries"] = False + + self.logger.info(" โœ… Conversation boundary tests completed") + return True + + except Exception as e: + self.logger.error(f"Conversation boundary test failed: {e}") + return False + + def _test_content_validation(self) -> bool: + """Test that tools don't duplicate file content in their responses""" + try: + self.logger.info("๐Ÿ“„ Test 7: Content validation and duplicate detection") + + # Create a test file with distinctive content for validation + validation_content = '''""" +Configuration file for content validation testing +This content should appear only ONCE in any tool response +""" + +# Configuration constants 
+MAX_CONTENT_TOKENS = 800_000 # This line should appear exactly once +TEMPERATURE_ANALYTICAL = 0.2 # This should also appear exactly once +UNIQUE_VALIDATION_MARKER = "CONTENT_VALIDATION_TEST_12345" + +# Database settings +DATABASE_CONFIG = { + "host": "localhost", + "port": 5432, + "name": "validation_test_db" +} +''' + + validation_file = os.path.join(self.test_dir, "validation_config.py") + with open(validation_file, "w") as f: + f.write(validation_content) + + # Test 1: Precommit tool with files parameter (where the bug occurred) + self.logger.info(" 7.1: Testing precommit tool content duplication") + + # Call precommit tool with the validation file + response1, thread_id = self._call_mcp_tool( + "precommit", + { + "path": os.getcwd(), + "files": [validation_file], + "original_request": "Test for content duplication in precommit tool" + } + ) + + if response1: + # Parse response and check for content duplication + try: + response_data = json.loads(response1) + content = response_data.get("content", "") + + # Count occurrences of distinctive markers + max_content_count = content.count("MAX_CONTENT_TOKENS = 800_000") + temp_analytical_count = content.count("TEMPERATURE_ANALYTICAL = 0.2") + unique_marker_count = content.count("UNIQUE_VALIDATION_MARKER") + + # Validate no duplication + duplication_detected = False + issues = [] + + if max_content_count > 1: + issues.append(f"MAX_CONTENT_TOKENS appears {max_content_count} times") + duplication_detected = True + + if temp_analytical_count > 1: + issues.append(f"TEMPERATURE_ANALYTICAL appears {temp_analytical_count} times") + duplication_detected = True + + if unique_marker_count > 1: + issues.append(f"UNIQUE_VALIDATION_MARKER appears {unique_marker_count} times") + duplication_detected = True + + if duplication_detected: + self.logger.error(f" โŒ Content duplication detected in precommit tool: {'; '.join(issues)}") + self.test_results["content_validation"]["precommit_duplication"] = False + return False + else: + self.logger.info(" โœ… No content duplication in precommit tool") + self.test_results["content_validation"]["precommit_duplication"] = True + + except json.JSONDecodeError: + self.logger.warning(" โš ๏ธ Could not parse precommit response as JSON") + + else: + self.logger.warning(" โš ๏ธ Precommit tool failed to respond") + + # Test 2: Other tools that use files parameter + tools_to_test = [ + ("chat", {"prompt": "Analyze this config file", "files": [validation_file]}), + ("codereview", {"files": [validation_file], "context": "Review this configuration"}), + ("analyze", {"files": [validation_file], "analysis_type": "code_quality"}) + ] + + for tool_name, params in tools_to_test: + self.logger.info(f" 7.{tool_name}: Testing {tool_name} tool content duplication") + + response, _ = self._call_mcp_tool(tool_name, params) + if response: + try: + response_data = json.loads(response) + content = response_data.get("content", "") + + # Check for duplication + marker_count = content.count("UNIQUE_VALIDATION_MARKER") + if marker_count > 1: + self.logger.error(f" โŒ Content duplication in {tool_name}: marker appears {marker_count} times") + self.test_results["content_validation"][f"{tool_name}_duplication"] = False + else: + self.logger.info(f" โœ… No content duplication in {tool_name}") + self.test_results["content_validation"][f"{tool_name}_duplication"] = True + + except json.JSONDecodeError: + self.logger.warning(f" โš ๏ธ Could not parse {tool_name} response") + else: + self.logger.warning(f" โš ๏ธ {tool_name} tool failed to respond") + 
+ # Test 3: Cross-tool content validation with file deduplication + self.logger.info(" 7.cross: Testing cross-tool content consistency") + + if thread_id: + # Continue conversation with same file - content should be deduplicated in conversation history + response2, _ = self._call_mcp_tool( + "chat", + { + "prompt": "Continue analyzing this configuration file", + "files": [validation_file], # Same file should be deduplicated + "continuation_id": thread_id, + }, + ) + + if response2: + try: + response_data = json.loads(response2) + content = response_data.get("content", "") + + # In continuation, the file content shouldn't be duplicated either + marker_count = content.count("UNIQUE_VALIDATION_MARKER") + if marker_count > 1: + self.logger.error(f" โŒ Content duplication in cross-tool continuation: marker appears {marker_count} times") + self.test_results["content_validation"]["cross_tool_duplication"] = False + else: + self.logger.info(" โœ… No content duplication in cross-tool continuation") + self.test_results["content_validation"]["cross_tool_duplication"] = True + + except json.JSONDecodeError: + self.logger.warning(" โš ๏ธ Could not parse continuation response") + + # Cleanup + os.remove(validation_file) + + # Check if all content validation tests passed + validation_results = self.test_results["content_validation"] + all_passed = all(result for result in validation_results.values() if isinstance(result, bool)) + + if all_passed: + self.logger.info(" โœ… All content validation tests passed") + else: + self.logger.error(" โŒ Some content validation tests failed") + return False + + return True + + except Exception as e: + self.logger.error(f"Content validation test failed: {e}") + return False + + def _test_clarification_scenarios(self) -> bool: + """Test requires_clarification scenarios and continuation with additional files""" + try: + self.logger.info("๐Ÿ” Test 6: Requires clarification scenarios") + + # Test 1: Debug tool asking for missing files + if not self._test_debug_clarification(): + return False + + # Test 2: Analyze tool asking for related files + if not self._test_analyze_clarification(): + return False + + # Test 3: Clarification with file deduplication + if not self._test_clarification_with_deduplication(): + return False + + # Test 4: Multiple round clarification (clarification loop) + if not self._test_clarification_loop(): + return False + + # Test 5: Partial file provision edge case + if not self._test_partial_file_provision(): + return False + + # Test 6: Real clarification flow (might actually trigger requires_clarification) + if not self._test_real_clarification_flow(): + return False + + self.logger.info(" โœ… Clarification scenario tests completed") + return True + + except Exception as e: + self.logger.error(f"Clarification scenario test failed: {e}") + return False + + def _test_debug_clarification(self) -> bool: + """Test debug tool requesting clarification for missing files""" + try: + self.logger.info(" 6.1: Testing debug tool clarification flow") + + # Create a problematic file that imports from utils.py + problematic_content = '''""" +Main module with a bug that requires utils.py to debug +""" + +import utils + +def main(): + result = utils.calculate_something("hello") + print(f"Result: {result}") + +if __name__ == "__main__": + main() +''' + + # Create the problematic file + problem_file = os.path.join(self.test_dir, "bug_main.py") + with open(problem_file, "w") as f: + f.write(problematic_content) + + # Step 1: Call debug tool with only the main file 
(should trigger clarification) + # We'll simulate clarification by creating a mock response + response1 = self._simulate_clarification_request( + "debug", + { + "files": [problem_file], + "error_description": "The application crashes with TypeError when running main()", + }, + ) + + if not response1: + self.logger.warning(" โš ๏ธ Debug clarification simulation failed") + return True # Don't fail entire test suite for simulation issues + + # For real testing, we would need the server to actually return requires_clarification + # This is a proof of concept showing how to structure the test + self.test_results["clarification_scenarios"]["debug_clarification"] = True + self.logger.info(" โœ… Debug clarification flow structure verified") + + # Cleanup + os.remove(problem_file) + return True + + except Exception as e: + self.logger.error(f"Debug clarification test failed: {e}") + return False + + def _test_analyze_clarification(self) -> bool: + """Test analyze tool requesting clarification for architecture analysis""" + try: + self.logger.info(" 6.2: Testing analyze tool clarification flow") + + # Create an incomplete file structure that would need more context + partial_model = '''""" +Partial model file that references other components +""" + +from .base import BaseModel +from .validators import validate_user_data + +class User(BaseModel): + def __init__(self, username: str, email: str): + self.username = username + self.email = validate_user_data(email) + super().__init__() +''' + + partial_file = os.path.join(self.test_dir, "partial_model.py") + with open(partial_file, "w") as f: + f.write(partial_model) + + # Simulate analyze tool clarification + response1 = self._simulate_clarification_request( + "analyze", + { + "files": [partial_file], + "question": "Analyze the architecture and dependencies of this model", + "analysis_type": "architecture", + }, + ) + + if response1: + self.test_results["clarification_scenarios"]["analyze_clarification"] = True + self.logger.info(" โœ… Analyze clarification flow structure verified") + + # Cleanup + os.remove(partial_file) + return True + + except Exception as e: + self.logger.error(f"Analyze clarification test failed: {e}") + return False + + def _test_clarification_with_deduplication(self) -> bool: + """Test that clarification preserves file deduplication across turns""" + try: + self.logger.info(" 6.3: Testing clarification with file deduplication") + + # Start conversation with file A + response1, thread_id = self._call_mcp_tool( + "chat", {"prompt": "Analyze this Python code", "files": [self.test_files["python"]]} + ) + + if not response1 or not thread_id: + self.logger.warning(" โš ๏ธ Initial conversation failed") + return True + + # Continue conversation asking for additional analysis with same file + new file + # This should deduplicate the original file + response2, _ = self._call_mcp_tool( + "chat", + { + "prompt": "Now also analyze this config file in relation to the Python code", + "files": [ + self.test_files["python"], + self.test_files["config"], + ], # python file should be deduplicated + "continuation_id": thread_id, + }, + ) + + if response2: + self.test_results["clarification_scenarios"]["clarification_deduplication"] = True + self.logger.info(" โœ… Clarification with file deduplication working") + + return True + + except Exception as e: + self.logger.error(f"Clarification deduplication test failed: {e}") + return False + + def _test_clarification_loop(self) -> bool: + """Test multiple rounds of clarification in a single 
conversation""" + try: + self.logger.info(" 6.4: Testing clarification loop scenarios") + + # Create a complex file that would need multiple clarifications + complex_content = '''""" +Complex module with multiple dependencies and configurations +""" + +import config +import database +import cache +from external_api import APIClient + +def process_data(data): + # Complex processing that would need clarification on each component + conn = database.get_connection(config.DB_CONFIG) + cached_result = cache.get(data.id) + api_result = APIClient().fetch_additional_data(data.external_id) + + return combine_results(cached_result, api_result) +''' + + complex_file = os.path.join(self.test_dir, "complex_module.py") + with open(complex_file, "w") as f: + f.write(complex_content) + + # Simulate multiple clarification rounds + # This is a structure test - in real implementation, each round would provide more files + responses = [] + + # Round 1: Initial request + response1 = self._simulate_clarification_request( + "debug", {"files": [complex_file], "error_description": "Complex error in data processing pipeline"} + ) + responses.append(response1) + + # Round 2: Provide config.py but still need database.py + if response1: + response2 = self._simulate_clarification_request( + "debug", + { + "files": [complex_file, self.test_files["config"]], + "error_description": "Still need database configuration", + "continuation_id": "mock_thread_id", + }, + ) + responses.append(response2) + + if all(responses): + self.test_results["clarification_scenarios"]["clarification_loop"] = True + self.logger.info(" โœ… Clarification loop structure verified") + + # Cleanup + os.remove(complex_file) + return True + + except Exception as e: + self.logger.error(f"Clarification loop test failed: {e}") + return False + + def _test_partial_file_provision(self) -> bool: + """Test edge case where user provides only some of requested files""" + try: + self.logger.info(" 6.5: Testing partial file provision edge case") + + # This test would verify that when a tool asks for multiple files + # but user only provides some, the conversation can continue gracefully + + # Create multiple related files + file1_content = '''"""File 1 - main module""" +def main_function(): + return "main" +''' + + file2_content = '''"""File 2 - utility module""" +def utility_function(): + return "utility" +''' + + file1_path = os.path.join(self.test_dir, "file1.py") + file2_path = os.path.join(self.test_dir, "file2.py") + + with open(file1_path, "w") as f: + f.write(file1_content) + with open(file2_path, "w") as f: + f.write(file2_content) + + # Simulate tool asking for both files + + # Simulate user providing only file1.py (partial provision) + # In real implementation, this should trigger another clarification for file2.py + partial_response = self._simulate_partial_file_response([file1_path]) + + if partial_response: + self.test_results["clarification_scenarios"]["partial_file_provision"] = True + self.logger.info(" โœ… Partial file provision edge case structure verified") + + # Cleanup + os.remove(file1_path) + os.remove(file2_path) + return True + + except Exception as e: + self.logger.error(f"Partial file provision test failed: {e}") + return False + + def _simulate_clarification_request(self, tool_name: str, params: dict) -> Optional[str]: + """ + Simulate a tool call that would trigger requires_clarification. + In real implementation, this would intercept the actual Gemini response. 
+ """ + try: + # This is a mock implementation showing the structure + # In a real test, we would: + # 1. Mock the Gemini API response to return requires_clarification + # 2. Call the actual MCP tool + # 3. Verify the response format and conversation ID preservation + + mock_response = { + "status": "requires_clarification", + "question": f"Mock clarification from {tool_name} tool", + "files_needed": ["additional_file.py"], + "conversation_id": f"mock_thread_{tool_name}", + } + + self.logger.debug(f" ๐Ÿ“ Simulated {tool_name} clarification: {mock_response}") + return json.dumps(mock_response) + + except Exception as e: + self.logger.error(f"Clarification simulation failed: {e}") + return None + + def _simulate_partial_file_response(self, provided_files: list[str]) -> Optional[str]: + """Simulate user providing only some of the requested files""" + try: + # This would test the server's handling of incomplete file provision + mock_response = { + "status": "partial_provision", + "provided_files": provided_files, + "still_needed": ["missing_file.py"], + } + + self.logger.debug(f" ๐Ÿ“ Simulated partial file provision: {mock_response}") + return json.dumps(mock_response) + + except Exception as e: + self.logger.error(f"Partial file response simulation failed: {e}") + return None + + def _test_real_clarification_flow(self) -> bool: + """Test a real clarification flow that might trigger requires_clarification from Gemini""" + try: + self.logger.info(" 6.6: Testing real clarification flow with ambiguous prompts") + + # Create an intentionally ambiguous debugging scenario + ambiguous_content = '''""" +Ambiguous code that would be hard to debug without context +""" + +def mysterious_function(data): + result = process_data(data) # Where is process_data defined? + return result.transform() # What is the structure of result? + +class DataProcessor: + def __init__(self): + self.config = load_config() # Where is load_config from? + + def run(self): + return mysterious_function(self.get_data()) # Where is get_data? 
+''' + + ambiguous_file = os.path.join(self.test_dir, "ambiguous.py") + with open(ambiguous_file, "w") as f: + f.write(ambiguous_content) + + # Try debug tool with minimal context - this might trigger clarification + response1, thread_id = self._call_mcp_tool( + "debug", {"files": [ambiguous_file], "error_description": "Code crashes with AttributeError"} + ) + + if response1: + try: + response_data = json.loads(response1) + if response_data.get("status") == "requires_clarification": + self.logger.info(" ๐ŸŽฏ Real clarification response received!") + self.test_results["clarification_scenarios"]["real_clarification_flow"] = True + + # Test continuation with additional context + if thread_id: + # Provide additional files + continuation_response, _ = self._call_mcp_tool( + "debug", + { + "files": [ambiguous_file, self.test_files["python"]], + "error_description": "Additional context provided", + "continuation_id": thread_id, + }, + ) + + if continuation_response: + self.logger.info(" โœ… Clarification continuation working") + + else: + self.logger.info(" โ„น๏ธ No clarification triggered (Gemini provided analysis directly)") + self.test_results["clarification_scenarios"]["real_clarification_flow"] = True + + except json.JSONDecodeError: + self.logger.warning(" โš ๏ธ Could not parse response as JSON") + + # Cleanup + os.remove(ambiguous_file) + return True + + except Exception as e: + self.logger.error(f"Real clarification flow test failed: {e}") + return False + + def _call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]: + """Simulate calling an MCP tool via Claude CLI (docker exec)""" + try: + # Prepare the MCP initialization and tool call sequence + init_request = { + "jsonrpc": "2.0", + "id": 1, + "method": "initialize", + "params": { + "protocolVersion": "2024-11-05", + "capabilities": {"tools": {}}, + "clientInfo": {"name": "communication-simulator", "version": "1.0.0"}, + }, + } + + # Send initialized notification + initialized_notification = {"jsonrpc": "2.0", "method": "notifications/initialized"} + + # Prepare the tool call request + tool_request = { + "jsonrpc": "2.0", + "id": 2, + "method": "tools/call", + "params": {"name": tool_name, "arguments": params}, + } + + # Combine all messages + messages = [json.dumps(init_request), json.dumps(initialized_notification), json.dumps(tool_request)] + + # Join with newlines as MCP expects + input_data = "\n".join(messages) + "\n" + + # Simulate Claude CLI calling the MCP server via docker exec + docker_cmd = ["docker", "exec", "-i", self.container_name, "python", "server.py"] + + self.logger.debug(f"Calling MCP tool {tool_name} with proper initialization") + + # Execute the command + result = subprocess.run( + docker_cmd, input=input_data, text=True, capture_output=True, timeout=120 # 2 minute timeout + ) + + if result.returncode != 0: + self.logger.error(f"Docker exec failed: {result.stderr}") + return None, None + + # Parse the response - look for the tool call response + response_data = self._parse_mcp_response(result.stdout, expected_id=2) + if not response_data: + return None, None + + # Extract continuation_id if present + continuation_id = self._extract_continuation_id(response_data) + + return response_data, continuation_id + + except subprocess.TimeoutExpired: + self.logger.error(f"MCP tool call timed out: {tool_name}") + return None, None + except Exception as e: + self.logger.error(f"MCP tool call failed: {e}") + return None, None + + def _parse_mcp_response(self, stdout: str, expected_id: int 
= 2) -> Optional[str]: + """Parse MCP JSON-RPC response from stdout""" + try: + lines = stdout.strip().split("\n") + for line in lines: + if line.strip() and line.startswith("{"): + response = json.loads(line) + # Look for the tool call response with the expected ID + if response.get("id") == expected_id and "result" in response: + # Extract the actual content from the response + result = response["result"] + # Handle new response format with 'content' array + if isinstance(result, dict) and "content" in result: + content_array = result["content"] + if isinstance(content_array, list) and len(content_array) > 0: + return content_array[0].get("text", "") + # Handle legacy format + elif isinstance(result, list) and len(result) > 0: + return result[0].get("text", "") + elif response.get("id") == expected_id and "error" in response: + self.logger.error(f"MCP error: {response['error']}") + return None + + # If we get here, log all responses for debugging + self.logger.warning(f"No valid tool call response found for ID {expected_id}") + self.logger.debug(f"Full stdout: {stdout}") + return None + + except json.JSONDecodeError as e: + self.logger.error(f"Failed to parse MCP response: {e}") + self.logger.debug(f"Stdout that failed to parse: {stdout}") + return None + + def _extract_continuation_id(self, response_text: str) -> Optional[str]: + """Extract continuation_id from response metadata""" + try: + # Parse the response text as JSON to look for continuation metadata + response_data = json.loads(response_text) + + # Look for continuation_id in various places + if isinstance(response_data, dict): + # Check metadata + metadata = response_data.get("metadata", {}) + if "thread_id" in metadata: + return metadata["thread_id"] + + # Check follow_up_request + follow_up = response_data.get("follow_up_request", {}) + if follow_up and "continuation_id" in follow_up: + return follow_up["continuation_id"] + + # Check continuation_offer + continuation_offer = response_data.get("continuation_offer", {}) + if continuation_offer and "continuation_id" in continuation_offer: + return continuation_offer["continuation_id"] + + self.logger.debug(f"No continuation_id found in response: {response_data}") + return None + + except json.JSONDecodeError as e: + self.logger.debug(f"Failed to parse response for continuation_id: {e}") + return None + + def validate_docker_logs(self) -> bool: + """Validate Docker logs to confirm file deduplication behavior""" + try: + self.logger.info("๐Ÿ“‹ Validating Docker logs for file deduplication...") + + # Get server logs from both main container and activity logs + result = self._run_command(["docker", "logs", self.container_name], capture_output=True) + + if result.returncode != 0: + self.logger.error(f"Failed to get Docker logs: {result.stderr}") + return False + + main_logs = result.stdout.decode() + result.stderr.decode() + + # Also get activity logs for more detailed conversation tracking + activity_result = self._run_command( + ["docker", "exec", self.container_name, "cat", "/tmp/mcp_activity.log"], capture_output=True + ) + + activity_logs = "" + if activity_result.returncode == 0: + activity_logs = activity_result.stdout.decode() + + logs = main_logs + "\n" + activity_logs + + # Look for conversation threading patterns that indicate the system is working + conversation_patterns = [ + "CONVERSATION_RESUME", + "CONVERSATION_CONTEXT", + "previous turns loaded", + "tool embedding", + "files included", + "files truncated", + "already in conversation history", + ] + + 
conversation_lines = [] + for line in logs.split("\n"): + for pattern in conversation_patterns: + if pattern.lower() in line.lower(): + conversation_lines.append(line.strip()) + break + + # Look for evidence of conversation threading and file handling + conversation_threading_found = False + multi_turn_conversations = False + + for line in conversation_lines: + lower_line = line.lower() + if "conversation_resume" in lower_line: + conversation_threading_found = True + self.logger.debug(f"๐Ÿ“„ Conversation threading: {line}") + elif "previous turns loaded" in lower_line: + multi_turn_conversations = True + self.logger.debug(f"๐Ÿ“„ Multi-turn conversation: {line}") + elif "already in conversation" in lower_line: + self.logger.info(f"โœ… Found explicit deduplication: {line}") + return True + + # Conversation threading with multiple turns is evidence of file deduplication working + if conversation_threading_found and multi_turn_conversations: + self.logger.info("โœ… Conversation threading with multi-turn context working") + self.logger.info( + "โœ… File deduplication working implicitly (files embedded once in conversation history)" + ) + self.test_results["logs_validation"] = True + return True + elif conversation_threading_found: + self.logger.info("โœ… Conversation threading detected") + return True + else: + self.logger.warning("โš ๏ธ No clear evidence of conversation threading in logs") + self.logger.debug(f"Found {len(conversation_lines)} conversation-related log lines") + return False + + except Exception as e: + self.logger.error(f"Log validation failed: {e}") + return False + + def validate_conversation_memory(self) -> bool: + """Validate that conversation memory is working via Redis""" + try: + self.logger.info("๐Ÿ’พ Validating conversation memory via Redis...") + + # Check Redis for stored conversations + result = self._run_command( + ["docker", "exec", self.redis_container, "redis-cli", "KEYS", "thread:*"], capture_output=True + ) + + if result.returncode != 0: + self.logger.error("Failed to query Redis") + return False + + keys = result.stdout.decode().strip().split("\n") + thread_keys = [k for k in keys if k.startswith("thread:")] + + if thread_keys: + self.logger.info(f"โœ… Found {len(thread_keys)} conversation threads in Redis") + + # Get details of first thread + if thread_keys: + thread_key = thread_keys[0] + result = self._run_command( + ["docker", "exec", self.redis_container, "redis-cli", "GET", thread_key], capture_output=True + ) + + if result.returncode == 0: + thread_data = result.stdout.decode() + try: + parsed = json.loads(thread_data) + turns = parsed.get("turns", []) + self.logger.info(f"โœ… Thread has {len(turns)} turns") + self.test_results["redis_validation"] = True + return True + except json.JSONDecodeError: + self.logger.warning("Could not parse thread data") + + self.test_results["redis_validation"] = True + return True + else: + self.logger.warning("โš ๏ธ No conversation threads found in Redis") + return False + + except Exception as e: + self.logger.error(f"Conversation memory validation failed: {e}") + return False + + def cleanup(self): + """Cleanup test environment""" + try: + self.logger.info("๐Ÿงน Cleaning up test environment...") + + if not self.keep_logs: + # Stop Docker services + self._run_command(["docker", "compose", "down", "--remove-orphans"], check=False, capture_output=True) + else: + self.logger.info("๐Ÿ“‹ Keeping Docker services running for log inspection") + + # Remove temp directory + if self.temp_dir and os.path.exists(self.temp_dir): + 
shutil.rmtree(self.temp_dir) + self.logger.debug(f"Removed temp directory: {self.temp_dir}") + + # Remove test files directory + if hasattr(self, "test_dir") and self.test_dir and os.path.exists(self.test_dir): + shutil.rmtree(self.test_dir) + self.logger.debug(f"Removed test files directory: {self.test_dir}") + + except Exception as e: + self.logger.error(f"Cleanup failed: {e}") + + def _run_command(self, cmd: list[str], check: bool = True, capture_output: bool = False, **kwargs): + """Run a shell command with logging""" + if self.verbose: + self.logger.debug(f"Running: {' '.join(cmd)}") + + return subprocess.run(cmd, check=check, capture_output=capture_output, **kwargs) + + def print_test_summary(self): + """Print comprehensive test results summary""" + print("\n" + "=" * 70) + print("๐Ÿงช GEMINI MCP COMMUNICATION SIMULATOR - TEST RESULTS SUMMARY") + print("=" * 70) + + # Basic conversation flow + status = "โœ… PASS" if self.test_results["basic_conversation"] else "โŒ FAIL" + print(f"๐Ÿ“ Basic Conversation Flow: {status}") + + # Per-tool tests + print("\n๐Ÿ“„ Per-Tool File Deduplication Tests:") + tools_tested = len(self.test_results["per_tool_tests"]) + tools_passed = sum(1 for passed in self.test_results["per_tool_tests"].values() if passed) + + if tools_tested > 0: + for tool, passed in self.test_results["per_tool_tests"].items(): + status = "โœ… PASS" if passed else "โŒ FAIL" + print(f" โ€ข {tool}: {status}") + print(f" โ†’ Summary: {tools_passed}/{tools_tested} tools passed") + else: + print(" โ†’ No tools tested") + + # Cross-tool scenarios + print("\n๐Ÿ”ง Cross-Tool Continuation Scenarios:") + scenarios_tested = len(self.test_results["cross_tool_scenarios"]) + scenarios_passed = sum(1 for passed in self.test_results["cross_tool_scenarios"].values() if passed is True) + + if scenarios_tested > 0: + scenario_names = { + "chat_thinkdeep_codereview": "chat โ†’ thinkdeep โ†’ codereview", + "analyze_debug_thinkdeep": "analyze โ†’ debug โ†’ thinkdeep", + "multi_file_continuation": "Multi-file continuation", + "state_isolation": "State isolation (contamination detection)", + "conversation_boundaries": "Conversation boundaries & reset behavior", + } + + for scenario, passed in self.test_results["cross_tool_scenarios"].items(): + name = scenario_names.get(scenario, scenario) + if passed is True: + status = "โœ… PASS" + elif passed is False: + status = "โŒ FAIL" + else: + status = "โธ๏ธ SKIP" + print(f" โ€ข {name}: {status}") + print(f" โ†’ Summary: {scenarios_passed}/{scenarios_tested} scenarios passed") + else: + print(" โ†’ No scenarios tested") + + # Content validation + print("\n๐Ÿ“„ Content Validation (Duplicate Detection):") + content_validation_tested = len(self.test_results["content_validation"]) + content_validation_passed = sum( + 1 for passed in self.test_results["content_validation"].values() if passed is True + ) + + if content_validation_tested > 0: + content_validation_names = { + "precommit_duplication": "Precommit tool content duplication", + "chat_duplication": "Chat tool content duplication", + "codereview_duplication": "Code review tool content duplication", + "analyze_duplication": "Analyze tool content duplication", + "cross_tool_duplication": "Cross-tool content duplication", + } + + for test, passed in self.test_results["content_validation"].items(): + name = content_validation_names.get(test, test) + if passed is True: + status = "โœ… PASS" + elif passed is False: + status = "โŒ FAIL" + else: + status = "โธ๏ธ SKIP" + print(f" โ€ข {name}: {status}") + 
print(f" โ†’ Summary: {content_validation_passed}/{content_validation_tested} content validation tests passed") + else: + print(" โ†’ No content validation tests run") + + # Clarification scenarios + print("\n๐Ÿ” Requires Clarification Scenarios:") + clarification_tested = len(self.test_results["clarification_scenarios"]) + clarification_passed = sum( + 1 for passed in self.test_results["clarification_scenarios"].values() if passed is True + ) + + if clarification_tested > 0: + clarification_names = { + "debug_clarification": "Debug tool clarification flow", + "analyze_clarification": "Analyze tool clarification flow", + "clarification_deduplication": "Clarification with file deduplication", + "clarification_loop": "Multiple round clarification (loop)", + "partial_file_provision": "Partial file provision edge case", + "real_clarification_flow": "Real clarification flow with ambiguous prompts", + } + + for scenario, passed in self.test_results["clarification_scenarios"].items(): + name = clarification_names.get(scenario, scenario) + if passed is True: + status = "โœ… PASS" + elif passed is False: + status = "โŒ FAIL" + else: + status = "โธ๏ธ SKIP" + print(f" โ€ข {name}: {status}") + print(f" โ†’ Summary: {clarification_passed}/{clarification_tested} clarification scenarios passed") + else: + print(" โ†’ No clarification scenarios tested") + + # System validation + print("\n๐Ÿ’พ System Validation:") + logs_status = "โœ… PASS" if self.test_results["logs_validation"] else "โŒ FAIL" + redis_status = "โœ… PASS" if self.test_results["redis_validation"] else "โŒ FAIL" + print(f" โ€ข Docker logs (conversation threading): {logs_status}") + print(f" โ€ข Redis memory (conversation persistence): {redis_status}") + + # Overall result + all_core_tests = [ + self.test_results["basic_conversation"], + self.test_results["logs_validation"], + self.test_results["redis_validation"], + ] + + tool_tests_ok = tools_tested == 0 or tools_passed > 0 + scenario_tests_ok = scenarios_tested == 0 or scenarios_passed > 0 + clarification_tests_ok = clarification_tested == 0 or clarification_passed > 0 + content_validation_ok = content_validation_tested == 0 or content_validation_passed > 0 + + overall_success = all(all_core_tests) and tool_tests_ok and scenario_tests_ok and clarification_tests_ok and content_validation_ok + + print(f"\n๐ŸŽฏ OVERALL RESULT: {'๐ŸŽ‰ SUCCESS' if overall_success else 'โŒ FAILURE'}") + + if overall_success: + print("โœ… MCP server conversation continuity and file deduplication working correctly!") + print("โœ… All core systems validated") + if tools_passed > 0: + print(f"โœ… {tools_passed} tools working with file deduplication") + if scenarios_passed > 0: + print(f"โœ… {scenarios_passed} cross-tool scenarios working") + if clarification_passed > 0: + print(f"โœ… {clarification_passed} clarification scenarios verified") + if content_validation_passed > 0: + print(f"โœ… {content_validation_passed} content validation tests passed") + else: + print("โš ๏ธ Some tests failed - check individual results above") + + print("=" * 70) + return overall_success + + def run_individual_test(self, test_name: str, skip_docker_setup: bool = False) -> bool: + """Run a single test individually with optional Docker setup skip""" + try: + if test_name not in self.available_tests: + self.logger.error(f"Unknown test: {test_name}") + self.logger.info(f"Available tests: {', '.join(self.available_tests.keys())}") + return False + + self.logger.info(f"๐Ÿงช Running individual test: {test_name}") + + # Setup 
environment unless skipped + if not skip_docker_setup: + if not self.setup_test_environment(): + self.logger.error("โŒ Environment setup failed") + return False + + # Run the single test + test_function = self.available_tests[test_name] + result = test_function() + + if result: + self.logger.info(f"โœ… Individual test {test_name} passed") + else: + self.logger.error(f"โŒ Individual test {test_name} failed") + + return result + + except Exception as e: + self.logger.error(f"Individual test {test_name} failed with exception: {e}") + return False + finally: + if not skip_docker_setup and not self.keep_logs: + self.cleanup() + + def run_full_test_suite(self) -> bool: + """Run the complete test suite""" + try: + self.logger.info("๐Ÿš€ Starting Gemini MCP Communication Simulator Test Suite") + + # Setup + if not self.setup_test_environment(): + self.logger.error("โŒ Environment setup failed") + return False + + # Main simulation + if not self.simulate_claude_cli_session(): + self.logger.error("โŒ Claude CLI simulation failed") + return False + + # Validation + self.validate_docker_logs() + self.validate_conversation_memory() + + # Print comprehensive summary + overall_success = self.print_test_summary() + + return overall_success + + except Exception as e: + self.logger.error(f"Test suite failed: {e}") + return False + finally: + if not self.keep_logs: + self.cleanup() + + +def parse_arguments(): + """Parse and validate command line arguments""" + parser = argparse.ArgumentParser(description="Gemini MCP Communication Simulator Test") + parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging") + parser.add_argument("--keep-logs", action="store_true", help="Keep Docker services running for log inspection") + parser.add_argument("--tests", "-t", nargs="+", help="Specific tests to run (space-separated)") + parser.add_argument("--list-tests", action="store_true", help="List available tests and exit") + parser.add_argument("--individual", "-i", help="Run a single test individually") + parser.add_argument("--skip-docker", action="store_true", help="Skip Docker setup (assumes containers are already running)") + + return parser.parse_args() + + +def list_available_tests(): + """List all available tests and exit""" + simulator = CommunicationSimulator() + print("Available tests:") + for test_name, description in simulator.get_available_tests().items(): + print(f" {test_name:<25} - {description}") + + +def run_individual_test(simulator, test_name, skip_docker): + """Run a single test individually""" + try: + success = simulator.run_individual_test(test_name, skip_docker_setup=skip_docker) + + if success: + print(f"\n๐ŸŽ‰ INDIVIDUAL TEST {test_name.upper()}: PASSED") + return 0 + else: + print(f"\nโŒ INDIVIDUAL TEST {test_name.upper()}: FAILED") + return 1 + + except KeyboardInterrupt: + print(f"\n๐Ÿ›‘ Individual test {test_name} interrupted by user") + if not skip_docker: + simulator.cleanup() + return 130 + except Exception as e: + print(f"\n๐Ÿ’ฅ Individual test {test_name} failed with error: {e}") + if not skip_docker: + simulator.cleanup() + return 1 + + +def run_test_suite(simulator): + """Run the full test suite or selected tests""" + try: + success = simulator.run_full_test_suite() + + if success: + print("\n๐ŸŽ‰ COMPREHENSIVE MCP COMMUNICATION TEST: PASSED") + return 0 + else: + print("\nโŒ COMPREHENSIVE MCP COMMUNICATION TEST: FAILED") + print("โš ๏ธ Check detailed results above") + return 1 + + except KeyboardInterrupt: + print("\n๐Ÿ›‘ Test interrupted by 
user") + simulator.cleanup() + return 130 + except Exception as e: + print(f"\n๐Ÿ’ฅ Unexpected error: {e}") + simulator.cleanup() + return 1 + + +def main(): + """Main entry point""" + args = parse_arguments() + + # Handle list tests request + if args.list_tests: + list_available_tests() + return + + # Initialize simulator consistently for all use cases + simulator = CommunicationSimulator( + verbose=args.verbose, + keep_logs=args.keep_logs, + selected_tests=args.tests + ) + + # Determine execution mode and run + if args.individual: + exit_code = run_individual_test(simulator, args.individual, args.skip_docker) + else: + exit_code = run_test_suite(simulator) + + sys.exit(exit_code) + + +if __name__ == "__main__": + main() diff --git a/simulator_tests/__init__.py b/simulator_tests/__init__.py new file mode 100644 index 0000000..8dfff9c --- /dev/null +++ b/simulator_tests/__init__.py @@ -0,0 +1,35 @@ +""" +Communication Simulator Tests Package + +This package contains individual test modules for the Gemini MCP Communication Simulator. +Each test is in its own file for better organization and maintainability. +""" + +from .base_test import BaseSimulatorTest +from .test_basic_conversation import BasicConversationTest +from .test_content_validation import ContentValidationTest +from .test_per_tool_deduplication import PerToolDeduplicationTest +from .test_cross_tool_continuation import CrossToolContinuationTest +from .test_logs_validation import LogsValidationTest +from .test_redis_validation import RedisValidationTest + +# Test registry for dynamic loading +TEST_REGISTRY = { + "basic_conversation": BasicConversationTest, + "content_validation": ContentValidationTest, + "per_tool_deduplication": PerToolDeduplicationTest, + "cross_tool_continuation": CrossToolContinuationTest, + "logs_validation": LogsValidationTest, + "redis_validation": RedisValidationTest, +} + +__all__ = [ + 'BaseSimulatorTest', + 'BasicConversationTest', + 'ContentValidationTest', + 'PerToolDeduplicationTest', + 'CrossToolContinuationTest', + 'LogsValidationTest', + 'RedisValidationTest', + 'TEST_REGISTRY' +] \ No newline at end of file diff --git a/simulator_tests/base_test.py b/simulator_tests/base_test.py new file mode 100644 index 0000000..d6d724b --- /dev/null +++ b/simulator_tests/base_test.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python3 +""" +Base Test Class for Communication Simulator Tests + +Provides common functionality and utilities for all simulator tests. 
+""" + +import json +import logging +import os +import subprocess +import tempfile +import time +from typing import Optional, Tuple + + +class BaseSimulatorTest: + """Base class for all communication simulator tests""" + + def __init__(self, verbose: bool = False): + self.verbose = verbose + self.test_files = {} + self.test_dir = None + self.container_name = "gemini-mcp-server" + self.redis_container = "gemini-mcp-redis" + + # Configure logging + log_level = logging.DEBUG if verbose else logging.INFO + logging.basicConfig(level=log_level, format="%(asctime)s - %(levelname)s - %(message)s") + self.logger = logging.getLogger(self.__class__.__name__) + + def setup_test_files(self): + """Create test files for the simulation""" + # Test Python file + python_content = '''""" +Sample Python module for testing MCP conversation continuity +""" + +def fibonacci(n): + """Calculate fibonacci number recursively""" + if n <= 1: + return n + return fibonacci(n-1) + fibonacci(n-2) + +def factorial(n): + """Calculate factorial iteratively""" + result = 1 + for i in range(1, n + 1): + result *= i + return result + +class Calculator: + """Simple calculator class""" + + def __init__(self): + self.history = [] + + def add(self, a, b): + result = a + b + self.history.append(f"{a} + {b} = {result}") + return result + + def multiply(self, a, b): + result = a * b + self.history.append(f"{a} * {b} = {result}") + return result +''' + + # Test configuration file + config_content = """{ + "database": { + "host": "localhost", + "port": 5432, + "name": "testdb", + "ssl": true + }, + "cache": { + "redis_url": "redis://localhost:6379", + "ttl": 3600 + }, + "logging": { + "level": "INFO", + "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + } +}""" + + # Create files in the current project directory + current_dir = os.getcwd() + self.test_dir = os.path.join(current_dir, "test_simulation_files") + os.makedirs(self.test_dir, exist_ok=True) + + test_py = os.path.join(self.test_dir, "test_module.py") + test_config = os.path.join(self.test_dir, "config.json") + + with open(test_py, "w") as f: + f.write(python_content) + with open(test_config, "w") as f: + f.write(config_content) + + self.test_files = {"python": test_py, "config": test_config} + self.logger.debug(f"Created test files: {list(self.test_files.values())}") + + def call_mcp_tool(self, tool_name: str, params: dict) -> Tuple[Optional[str], Optional[str]]: + """Call an MCP tool via Claude CLI (docker exec)""" + try: + # Prepare the MCP initialization and tool call sequence + init_request = { + "jsonrpc": "2.0", + "id": 1, + "method": "initialize", + "params": { + "protocolVersion": "2024-11-05", + "capabilities": {"tools": {}}, + "clientInfo": {"name": "communication-simulator", "version": "1.0.0"}, + }, + } + + # Send initialized notification + initialized_notification = {"jsonrpc": "2.0", "method": "notifications/initialized"} + + # Prepare the tool call request + tool_request = { + "jsonrpc": "2.0", + "id": 2, + "method": "tools/call", + "params": {"name": tool_name, "arguments": params}, + } + + # Combine all messages + messages = [json.dumps(init_request), json.dumps(initialized_notification), json.dumps(tool_request)] + + # Join with newlines as MCP expects + input_data = "\n".join(messages) + "\n" + + # Simulate Claude CLI calling the MCP server via docker exec + docker_cmd = ["docker", "exec", "-i", self.container_name, "python", "server.py"] + + self.logger.debug(f"Calling MCP tool {tool_name} with proper initialization") + + # Execute the 
command + result = subprocess.run( + docker_cmd, input=input_data, text=True, capture_output=True, timeout=300 # 5 minute timeout + ) + + if result.returncode != 0: + self.logger.error(f"Docker exec failed: {result.stderr}") + return None, None + + # Parse the response - look for the tool call response + response_data = self._parse_mcp_response(result.stdout, expected_id=2) + if not response_data: + return None, None + + # Extract continuation_id if present + continuation_id = self._extract_continuation_id(response_data) + + return response_data, continuation_id + + except subprocess.TimeoutExpired: + self.logger.error(f"MCP tool call timed out: {tool_name}") + return None, None + except Exception as e: + self.logger.error(f"MCP tool call failed: {e}") + return None, None + + def _parse_mcp_response(self, stdout: str, expected_id: int = 2) -> Optional[str]: + """Parse MCP JSON-RPC response from stdout""" + try: + lines = stdout.strip().split("\n") + for line in lines: + if line.strip() and line.startswith("{"): + response = json.loads(line) + # Look for the tool call response with the expected ID + if response.get("id") == expected_id and "result" in response: + # Extract the actual content from the response + result = response["result"] + # Handle new response format with 'content' array + if isinstance(result, dict) and "content" in result: + content_array = result["content"] + if isinstance(content_array, list) and len(content_array) > 0: + return content_array[0].get("text", "") + # Handle legacy format + elif isinstance(result, list) and len(result) > 0: + return result[0].get("text", "") + elif response.get("id") == expected_id and "error" in response: + self.logger.error(f"MCP error: {response['error']}") + return None + + # If we get here, log all responses for debugging + self.logger.warning(f"No valid tool call response found for ID {expected_id}") + self.logger.debug(f"Full stdout: {stdout}") + return None + + except json.JSONDecodeError as e: + self.logger.error(f"Failed to parse MCP response: {e}") + self.logger.debug(f"Stdout that failed to parse: {stdout}") + return None + + def _extract_continuation_id(self, response_text: str) -> Optional[str]: + """Extract continuation_id from response metadata""" + try: + # Parse the response text as JSON to look for continuation metadata + response_data = json.loads(response_text) + + # Look for continuation_id in various places + if isinstance(response_data, dict): + # Check metadata + metadata = response_data.get("metadata", {}) + if "thread_id" in metadata: + return metadata["thread_id"] + + # Check follow_up_request + follow_up = response_data.get("follow_up_request", {}) + if follow_up and "continuation_id" in follow_up: + return follow_up["continuation_id"] + + # Check continuation_offer + continuation_offer = response_data.get("continuation_offer", {}) + if continuation_offer and "continuation_id" in continuation_offer: + return continuation_offer["continuation_id"] + + self.logger.debug(f"No continuation_id found in response: {response_data}") + return None + + except json.JSONDecodeError as e: + self.logger.debug(f"Failed to parse response for continuation_id: {e}") + return None + + def run_command(self, cmd: list[str], check: bool = True, capture_output: bool = False, **kwargs): + """Run a shell command with logging""" + if self.verbose: + self.logger.debug(f"Running: {' '.join(cmd)}") + + return subprocess.run(cmd, check=check, capture_output=capture_output, **kwargs) + + def cleanup_test_files(self): + """Clean up test 
files""" + if hasattr(self, "test_dir") and self.test_dir and os.path.exists(self.test_dir): + import shutil + shutil.rmtree(self.test_dir) + self.logger.debug(f"Removed test files directory: {self.test_dir}") + + def run_test(self) -> bool: + """Run the test - to be implemented by subclasses""" + raise NotImplementedError("Subclasses must implement run_test()") + + @property + def test_name(self) -> str: + """Get the test name - to be implemented by subclasses""" + raise NotImplementedError("Subclasses must implement test_name property") + + @property + def test_description(self) -> str: + """Get the test description - to be implemented by subclasses""" + raise NotImplementedError("Subclasses must implement test_description property") \ No newline at end of file diff --git a/simulator_tests/test_basic_conversation.py b/simulator_tests/test_basic_conversation.py new file mode 100644 index 0000000..5c8c550 --- /dev/null +++ b/simulator_tests/test_basic_conversation.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +""" +Basic Conversation Flow Test + +Tests basic conversation continuity with the chat tool, including: +- Initial chat with file analysis +- Continuing conversation with same file (deduplication) +- Adding additional files to ongoing conversation +""" + +from .base_test import BaseSimulatorTest + + +class BasicConversationTest(BaseSimulatorTest): + """Test basic conversation flow with chat tool""" + + @property + def test_name(self) -> str: + return "basic_conversation" + + @property + def test_description(self) -> str: + return "Basic conversation flow with chat tool" + + def run_test(self) -> bool: + """Test basic conversation flow with chat tool""" + try: + self.logger.info("๐Ÿ“ Test: Basic conversation flow") + + # Setup test files + self.setup_test_files() + + # Initial chat tool call with file + self.logger.info(" 1.1: Initial chat with file analysis") + response1, continuation_id = self.call_mcp_tool( + "chat", + {"prompt": "Please use low thinking mode. Analyze this Python code and explain what it does", "files": [self.test_files["python"]]}, + ) + + if not response1 or not continuation_id: + self.logger.error("Failed to get initial response with continuation_id") + return False + + self.logger.info(f" โœ… Got continuation_id: {continuation_id}") + + # Continue conversation with same file (should be deduplicated) + self.logger.info(" 1.2: Continue conversation with same file") + response2, _ = self.call_mcp_tool( + "chat", + { + "prompt": "Please use low thinking mode. Now focus on the Calculator class specifically. Are there any improvements you'd suggest?", + "files": [self.test_files["python"]], # Same file - should be deduplicated + "continuation_id": continuation_id, + }, + ) + + if not response2: + self.logger.error("Failed to continue conversation") + return False + + # Continue with additional file + self.logger.info(" 1.3: Continue conversation with additional file") + response3, _ = self.call_mcp_tool( + "chat", + { + "prompt": "Please use low thinking mode. 
Now also analyze this configuration file and see how it might relate to the Python code", + "files": [self.test_files["python"], self.test_files["config"]], + "continuation_id": continuation_id, + }, + ) + + if not response3: + self.logger.error("Failed to continue with additional file") + return False + + self.logger.info(" โœ… Basic conversation flow working") + return True + + except Exception as e: + self.logger.error(f"Basic conversation flow test failed: {e}") + return False + finally: + self.cleanup_test_files() \ No newline at end of file diff --git a/simulator_tests/test_content_validation.py b/simulator_tests/test_content_validation.py new file mode 100644 index 0000000..5b98327 --- /dev/null +++ b/simulator_tests/test_content_validation.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +""" +Content Validation Test + +Tests that tools don't duplicate file content in their responses. +This test is specifically designed to catch content duplication bugs. +""" + +import json +import os +from .base_test import BaseSimulatorTest + + +class ContentValidationTest(BaseSimulatorTest): + """Test that tools don't duplicate file content in their responses""" + + @property + def test_name(self) -> str: + return "content_validation" + + @property + def test_description(self) -> str: + return "Content validation and duplicate detection" + + def run_test(self) -> bool: + """Test that tools don't duplicate file content in their responses""" + try: + self.logger.info("๐Ÿ“„ Test: Content validation and duplicate detection") + + # Setup test files first + self.setup_test_files() + + # Create a test file with distinctive content for validation + validation_content = '''""" +Configuration file for content validation testing +This content should appear only ONCE in any tool response +""" + +# Configuration constants +MAX_CONTENT_TOKENS = 800_000 # This line should appear exactly once +TEMPERATURE_ANALYTICAL = 0.2 # This should also appear exactly once +UNIQUE_VALIDATION_MARKER = "CONTENT_VALIDATION_TEST_12345" + +# Database settings +DATABASE_CONFIG = { + "host": "localhost", + "port": 5432, + "name": "validation_test_db" +} +''' + + validation_file = os.path.join(self.test_dir, "validation_config.py") + with open(validation_file, "w") as f: + f.write(validation_content) + + # Test 1: Precommit tool with files parameter (where the bug occurred) + self.logger.info(" 1: Testing precommit tool content duplication") + + # Call precommit tool with the validation file + response1, thread_id = self.call_mcp_tool( + "precommit", + { + "path": os.getcwd(), + "files": [validation_file], + "original_request": "Test for content duplication in precommit tool" + } + ) + + if response1: + # Parse response and check for content duplication + try: + response_data = json.loads(response1) + content = response_data.get("content", "") + + # Count occurrences of distinctive markers + max_content_count = content.count("MAX_CONTENT_TOKENS = 800_000") + temp_analytical_count = content.count("TEMPERATURE_ANALYTICAL = 0.2") + unique_marker_count = content.count("UNIQUE_VALIDATION_MARKER") + + # Validate no duplication + duplication_detected = False + issues = [] + + if max_content_count > 1: + issues.append(f"MAX_CONTENT_TOKENS appears {max_content_count} times") + duplication_detected = True + + if temp_analytical_count > 1: + issues.append(f"TEMPERATURE_ANALYTICAL appears {temp_analytical_count} times") + duplication_detected = True + + if unique_marker_count > 1: + issues.append(f"UNIQUE_VALIDATION_MARKER appears 
{unique_marker_count} times") + duplication_detected = True + + if duplication_detected: + self.logger.error(f" โŒ Content duplication detected in precommit tool: {'; '.join(issues)}") + return False + else: + self.logger.info(" โœ… No content duplication in precommit tool") + + except json.JSONDecodeError: + self.logger.warning(" โš ๏ธ Could not parse precommit response as JSON") + + else: + self.logger.warning(" โš ๏ธ Precommit tool failed to respond") + + # Test 2: Other tools that use files parameter + tools_to_test = [ + ("chat", {"prompt": "Please use low thinking mode. Analyze this config file", "files": [validation_file]}), + ("codereview", {"files": [validation_file], "context": "Please use low thinking mode. Review this configuration"}), + ("analyze", {"files": [validation_file], "analysis_type": "code_quality"}) + ] + + for tool_name, params in tools_to_test: + self.logger.info(f" 2.{tool_name}: Testing {tool_name} tool content duplication") + + response, _ = self.call_mcp_tool(tool_name, params) + if response: + try: + response_data = json.loads(response) + content = response_data.get("content", "") + + # Check for duplication + marker_count = content.count("UNIQUE_VALIDATION_MARKER") + if marker_count > 1: + self.logger.error(f" โŒ Content duplication in {tool_name}: marker appears {marker_count} times") + return False + else: + self.logger.info(f" โœ… No content duplication in {tool_name}") + + except json.JSONDecodeError: + self.logger.warning(f" โš ๏ธ Could not parse {tool_name} response") + else: + self.logger.warning(f" โš ๏ธ {tool_name} tool failed to respond") + + # Test 3: Cross-tool content validation with file deduplication + self.logger.info(" 3: Testing cross-tool content consistency") + + if thread_id: + # Continue conversation with same file - content should be deduplicated in conversation history + response2, _ = self.call_mcp_tool( + "chat", + { + "prompt": "Please use low thinking mode. Continue analyzing this configuration file", + "files": [validation_file], # Same file should be deduplicated + "continuation_id": thread_id, + }, + ) + + if response2: + try: + response_data = json.loads(response2) + content = response_data.get("content", "") + + # In continuation, the file content shouldn't be duplicated either + marker_count = content.count("UNIQUE_VALIDATION_MARKER") + if marker_count > 1: + self.logger.error(f" โŒ Content duplication in cross-tool continuation: marker appears {marker_count} times") + return False + else: + self.logger.info(" โœ… No content duplication in cross-tool continuation") + + except json.JSONDecodeError: + self.logger.warning(" โš ๏ธ Could not parse continuation response") + + # Cleanup + os.remove(validation_file) + + self.logger.info(" โœ… All content validation tests passed") + return True + + except Exception as e: + self.logger.error(f"Content validation test failed: {e}") + return False + finally: + self.cleanup_test_files() \ No newline at end of file diff --git a/simulator_tests/test_cross_tool_continuation.py b/simulator_tests/test_cross_tool_continuation.py new file mode 100644 index 0000000..ae05688 --- /dev/null +++ b/simulator_tests/test_cross_tool_continuation.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python3 +""" +Cross-Tool Continuation Test + +Tests comprehensive cross-tool continuation scenarios to ensure +conversation context is maintained when switching between different tools. 
+""" + +from .base_test import BaseSimulatorTest + + +class CrossToolContinuationTest(BaseSimulatorTest): + """Test comprehensive cross-tool continuation scenarios""" + + @property + def test_name(self) -> str: + return "cross_tool_continuation" + + @property + def test_description(self) -> str: + return "Cross-tool conversation continuation scenarios" + + def run_test(self) -> bool: + """Test comprehensive cross-tool continuation scenarios""" + try: + self.logger.info("๐Ÿ”ง Test: Cross-tool continuation scenarios") + + # Setup test files + self.setup_test_files() + + success_count = 0 + total_scenarios = 3 + + # Scenario 1: chat -> thinkdeep -> codereview + if self._test_chat_thinkdeep_codereview(): + success_count += 1 + + # Scenario 2: analyze -> debug -> thinkdeep + if self._test_analyze_debug_thinkdeep(): + success_count += 1 + + # Scenario 3: Multi-file cross-tool continuation + if self._test_multi_file_continuation(): + success_count += 1 + + self.logger.info(f" โœ… Cross-tool continuation scenarios completed: {success_count}/{total_scenarios} scenarios passed") + + # Consider successful if at least one scenario worked + return success_count > 0 + + except Exception as e: + self.logger.error(f"Cross-tool continuation test failed: {e}") + return False + finally: + self.cleanup_test_files() + + def _test_chat_thinkdeep_codereview(self) -> bool: + """Test chat -> thinkdeep -> codereview scenario""" + try: + self.logger.info(" 1: Testing chat -> thinkdeep -> codereview") + + # Start with chat + chat_response, chat_id = self.call_mcp_tool( + "chat", + { + "prompt": "Please use low thinking mode. Look at this Python code and tell me what you think about it", + "files": [self.test_files["python"]], + }, + ) + + if not chat_response or not chat_id: + self.logger.error("Failed to start chat conversation") + return False + + # Continue with thinkdeep + thinkdeep_response, _ = self.call_mcp_tool( + "thinkdeep", + { + "prompt": "Please use low thinking mode. 
Think deeply about potential performance issues in this code", + "files": [self.test_files["python"]], # Same file should be deduplicated + "continuation_id": chat_id, + }, + ) + + if not thinkdeep_response: + self.logger.error("Failed chat -> thinkdeep continuation") + return False + + # Continue with codereview + codereview_response, _ = self.call_mcp_tool( + "codereview", + { + "files": [self.test_files["python"]], # Same file should be deduplicated + "context": "Building on our previous analysis, provide a comprehensive code review", + "continuation_id": chat_id, + }, + ) + + if not codereview_response: + self.logger.error("Failed thinkdeep -> codereview continuation") + return False + + self.logger.info(" โœ… chat -> thinkdeep -> codereview working") + return True + + except Exception as e: + self.logger.error(f"Chat -> thinkdeep -> codereview scenario failed: {e}") + return False + + def _test_analyze_debug_thinkdeep(self) -> bool: + """Test analyze -> debug -> thinkdeep scenario""" + try: + self.logger.info(" 2: Testing analyze -> debug -> thinkdeep") + + # Start with analyze + analyze_response, analyze_id = self.call_mcp_tool( + "analyze", {"files": [self.test_files["python"]], "analysis_type": "code_quality"} + ) + + if not analyze_response or not analyze_id: + self.logger.warning("Failed to start analyze conversation, skipping scenario 2") + return False + + # Continue with debug + debug_response, _ = self.call_mcp_tool( + "debug", + { + "files": [self.test_files["python"]], # Same file should be deduplicated + "issue_description": "Based on our analysis, help debug the performance issue in fibonacci", + "continuation_id": analyze_id, + }, + ) + + if not debug_response: + self.logger.warning(" โš ๏ธ analyze -> debug continuation failed") + return False + + # Continue with thinkdeep + final_response, _ = self.call_mcp_tool( + "thinkdeep", + { + "prompt": "Please use low thinking mode. Think deeply about the architectural implications of the issues we've found", + "files": [self.test_files["python"]], # Same file should be deduplicated + "continuation_id": analyze_id, + }, + ) + + if not final_response: + self.logger.warning(" โš ๏ธ debug -> thinkdeep continuation failed") + return False + + self.logger.info(" โœ… analyze -> debug -> thinkdeep working") + return True + + except Exception as e: + self.logger.error(f"Analyze -> debug -> thinkdeep scenario failed: {e}") + return False + + def _test_multi_file_continuation(self) -> bool: + """Test multi-file cross-tool continuation""" + try: + self.logger.info(" 3: Testing multi-file cross-tool continuation") + + # Start with both files + multi_response, multi_id = self.call_mcp_tool( + "chat", + { + "prompt": "Please use low thinking mode. 
Analyze both the Python code and configuration file", + "files": [self.test_files["python"], self.test_files["config"]], + }, + ) + + if not multi_response or not multi_id: + self.logger.warning("Failed to start multi-file conversation, skipping scenario 3") + return False + + # Switch to codereview with same files (should use conversation history) + multi_review, _ = self.call_mcp_tool( + "codereview", + { + "files": [self.test_files["python"], self.test_files["config"]], # Same files + "context": "Review both files in the context of our previous discussion", + "continuation_id": multi_id, + }, + ) + + if not multi_review: + self.logger.warning(" โš ๏ธ Multi-file cross-tool continuation failed") + return False + + self.logger.info(" โœ… Multi-file cross-tool continuation working") + return True + + except Exception as e: + self.logger.error(f"Multi-file continuation scenario failed: {e}") + return False \ No newline at end of file diff --git a/simulator_tests/test_logs_validation.py b/simulator_tests/test_logs_validation.py new file mode 100644 index 0000000..bbb90cc --- /dev/null +++ b/simulator_tests/test_logs_validation.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 +""" +Docker Logs Validation Test + +Validates Docker logs to confirm file deduplication behavior and +conversation threading is working properly. +""" + +from .base_test import BaseSimulatorTest + + +class LogsValidationTest(BaseSimulatorTest): + """Validate Docker logs to confirm file deduplication behavior""" + + @property + def test_name(self) -> str: + return "logs_validation" + + @property + def test_description(self) -> str: + return "Docker logs validation" + + def run_test(self) -> bool: + """Validate Docker logs to confirm file deduplication behavior""" + try: + self.logger.info("๐Ÿ“‹ Test: Validating Docker logs for file deduplication...") + + # Get server logs from both main container and activity logs + result = self.run_command(["docker", "logs", self.container_name], capture_output=True) + + if result.returncode != 0: + self.logger.error(f"Failed to get Docker logs: {result.stderr}") + return False + + main_logs = result.stdout.decode() + result.stderr.decode() + + # Also get activity logs for more detailed conversation tracking + activity_result = self.run_command( + ["docker", "exec", self.container_name, "cat", "/tmp/mcp_activity.log"], capture_output=True + ) + + activity_logs = "" + if activity_result.returncode == 0: + activity_logs = activity_result.stdout.decode() + + logs = main_logs + "\n" + activity_logs + + # Look for conversation threading patterns that indicate the system is working + conversation_patterns = [ + "CONVERSATION_RESUME", + "CONVERSATION_CONTEXT", + "previous turns loaded", + "tool embedding", + "files included", + "files truncated", + "already in conversation history", + ] + + conversation_lines = [] + for line in logs.split("\n"): + for pattern in conversation_patterns: + if pattern.lower() in line.lower(): + conversation_lines.append(line.strip()) + break + + # Look for evidence of conversation threading and file handling + conversation_threading_found = False + multi_turn_conversations = False + + for line in conversation_lines: + lower_line = line.lower() + if "conversation_resume" in lower_line: + conversation_threading_found = True + self.logger.debug(f"๐Ÿ“„ Conversation threading: {line}") + elif "previous turns loaded" in lower_line: + multi_turn_conversations = True + self.logger.debug(f"๐Ÿ“„ Multi-turn conversation: {line}") + elif "already in conversation" in lower_line: + 
self.logger.info(f"โœ… Found explicit deduplication: {line}") + return True + + # Conversation threading with multiple turns is evidence of file deduplication working + if conversation_threading_found and multi_turn_conversations: + self.logger.info("โœ… Conversation threading with multi-turn context working") + self.logger.info( + "โœ… File deduplication working implicitly (files embedded once in conversation history)" + ) + return True + elif conversation_threading_found: + self.logger.info("โœ… Conversation threading detected") + return True + else: + self.logger.warning("โš ๏ธ No clear evidence of conversation threading in logs") + self.logger.debug(f"Found {len(conversation_lines)} conversation-related log lines") + return False + + except Exception as e: + self.logger.error(f"Log validation failed: {e}") + return False \ No newline at end of file diff --git a/simulator_tests/test_per_tool_deduplication.py b/simulator_tests/test_per_tool_deduplication.py new file mode 100644 index 0000000..74937b0 --- /dev/null +++ b/simulator_tests/test_per_tool_deduplication.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +""" +Per-Tool File Deduplication Test + +Tests file deduplication for each individual MCP tool to ensure +that files are properly deduplicated within single-tool conversations. +""" + +from .base_test import BaseSimulatorTest + + +class PerToolDeduplicationTest(BaseSimulatorTest): + """Test file deduplication for each individual tool""" + + @property + def test_name(self) -> str: + return "per_tool_deduplication" + + @property + def test_description(self) -> str: + return "File deduplication for individual tools" + + def run_test(self) -> bool: + """Test file deduplication for each individual tool""" + try: + self.logger.info("๐Ÿ“„ Test: Per-tool file deduplication") + + # Setup test files + self.setup_test_files() + + tools_to_test = [ + ( + "thinkdeep", + { + "prompt": "Please use low thinking mode. Think deeply about this Python code and identify potential architectural improvements", + "files": [self.test_files["python"]], + }, + ), + ("analyze", {"files": [self.test_files["python"]], "analysis_type": "architecture"}), + ( + "debug", + { + "files": [self.test_files["python"]], + "issue_description": "The fibonacci function seems slow for large numbers", + }, + ), + ( + "codereview", + { + "files": [self.test_files["python"]], + "context": "General code review for quality and best practices", + }, + ), + ] + + successful_tests = 0 + total_tests = len(tools_to_test) + + for tool_name, initial_params in tools_to_test: + self.logger.info(f" {tool_name}: Testing {tool_name} tool file deduplication") + + # Initial call + response1, continuation_id = self.call_mcp_tool(tool_name, initial_params) + if not response1: + self.logger.warning(f" โš ๏ธ {tool_name} tool initial call failed, skipping") + continue + + if not continuation_id: + self.logger.warning(f" โš ๏ธ {tool_name} tool didn't provide continuation_id, skipping") + continue + + # Continue with same file - should be deduplicated + continue_params = initial_params.copy() + continue_params["continuation_id"] = continuation_id + + if tool_name == "thinkdeep": + continue_params["prompt"] = "Please use low thinking mode. Now focus specifically on the recursive fibonacci implementation" + elif tool_name == "analyze": + continue_params["analysis_type"] = "performance" + elif tool_name == "debug": + continue_params["issue_description"] = "How can we optimize the fibonacci function?" 
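+                # Each follow-up reuses the same file and continuation_id while only narrowing the question, so the second call should rely on conversation history instead of re-embedding the file content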
+ elif tool_name == "codereview": + continue_params["context"] = "Focus on the Calculator class implementation" + + response2, _ = self.call_mcp_tool(tool_name, continue_params) + if response2: + self.logger.info(f" โœ… {tool_name} tool file deduplication working") + successful_tests += 1 + else: + self.logger.warning(f" โš ๏ธ {tool_name} tool continuation failed") + + self.logger.info(f" โœ… Per-tool file deduplication tests completed: {successful_tests}/{total_tests} tools passed") + + # Consider test successful if at least one tool worked + return successful_tests > 0 + + except Exception as e: + self.logger.error(f"Per-tool file deduplication test failed: {e}") + return False + finally: + self.cleanup_test_files() \ No newline at end of file diff --git a/simulator_tests/test_redis_validation.py b/simulator_tests/test_redis_validation.py new file mode 100644 index 0000000..aeda1a9 --- /dev/null +++ b/simulator_tests/test_redis_validation.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +""" +Redis Conversation Memory Validation Test + +Validates that conversation memory is working via Redis by checking +for stored conversation threads and their content. +""" + +import json +from .base_test import BaseSimulatorTest + + +class RedisValidationTest(BaseSimulatorTest): + """Validate that conversation memory is working via Redis""" + + @property + def test_name(self) -> str: + return "redis_validation" + + @property + def test_description(self) -> str: + return "Redis conversation memory validation" + + def run_test(self) -> bool: + """Validate that conversation memory is working via Redis""" + try: + self.logger.info("๐Ÿ’พ Test: Validating conversation memory via Redis...") + + # First, test Redis connectivity + ping_result = self.run_command( + ["docker", "exec", self.redis_container, "redis-cli", "ping"], capture_output=True + ) + + if ping_result.returncode != 0: + self.logger.error("Failed to connect to Redis") + return False + + if "PONG" not in ping_result.stdout.decode(): + self.logger.error("Redis ping failed") + return False + + self.logger.info("โœ… Redis connectivity confirmed") + + # Check Redis for stored conversations + result = self.run_command( + ["docker", "exec", self.redis_container, "redis-cli", "KEYS", "thread:*"], capture_output=True + ) + + if result.returncode != 0: + self.logger.error("Failed to query Redis") + return False + + keys = result.stdout.decode().strip().split("\n") + thread_keys = [k for k in keys if k.startswith("thread:") and k != "thread:*"] + + if thread_keys: + self.logger.info(f"โœ… Found {len(thread_keys)} conversation threads in Redis") + + # Get details of first thread + thread_key = thread_keys[0] + result = self.run_command( + ["docker", "exec", self.redis_container, "redis-cli", "GET", thread_key], capture_output=True + ) + + if result.returncode == 0: + thread_data = result.stdout.decode() + try: + parsed = json.loads(thread_data) + turns = parsed.get("turns", []) + self.logger.info(f"โœ… Thread has {len(turns)} turns") + return True + except json.JSONDecodeError: + self.logger.warning("Could not parse thread data") + + return True + else: + # If no existing threads, create a test thread to validate Redis functionality + self.logger.info("๐Ÿ“ No existing threads found, creating test thread to validate Redis...") + + test_thread_id = "test_thread_validation" + test_data = { + "thread_id": test_thread_id, + "turns": [ + { + "tool": "chat", + "timestamp": "2025-06-11T16:30:00Z", + "prompt": "Test validation prompt" + } + ] + } + + # Store test 
data + store_result = self.run_command([ + "docker", "exec", self.redis_container, "redis-cli", + "SET", f"thread:{test_thread_id}", json.dumps(test_data) + ], capture_output=True) + + if store_result.returncode != 0: + self.logger.error("Failed to store test data in Redis") + return False + + # Retrieve test data + retrieve_result = self.run_command([ + "docker", "exec", self.redis_container, "redis-cli", + "GET", f"thread:{test_thread_id}" + ], capture_output=True) + + if retrieve_result.returncode != 0: + self.logger.error("Failed to retrieve test data from Redis") + return False + + retrieved_data = retrieve_result.stdout.decode() + try: + parsed = json.loads(retrieved_data) + if parsed.get("thread_id") == test_thread_id: + self.logger.info("โœ… Redis read/write validation successful") + + # Clean up test data + self.run_command([ + "docker", "exec", self.redis_container, "redis-cli", + "DEL", f"thread:{test_thread_id}" + ], capture_output=True) + + return True + else: + self.logger.error("Retrieved data doesn't match stored data") + return False + except json.JSONDecodeError: + self.logger.error("Could not parse retrieved test data") + return False + + except Exception as e: + self.logger.error(f"Conversation memory validation failed: {e}") + return False \ No newline at end of file diff --git a/test_simulation_files/test_module.py b/test_simulation_files/test_module.py index b6397dc..5defb99 100644 --- a/test_simulation_files/test_module.py +++ b/test_simulation_files/test_module.py @@ -2,13 +2,11 @@ Sample Python module for testing MCP conversation continuity """ - def fibonacci(n): """Calculate fibonacci number recursively""" if n <= 1: return n - return fibonacci(n - 1) + fibonacci(n - 2) - + return fibonacci(n-1) + fibonacci(n-2) def factorial(n): """Calculate factorial iteratively""" @@ -17,7 +15,6 @@ def factorial(n): result *= i return result - class Calculator: """Simple calculator class""" diff --git a/test_simulation_files/validation_config.py b/test_simulation_files/validation_config.py new file mode 100644 index 0000000..4f234a8 --- /dev/null +++ b/test_simulation_files/validation_config.py @@ -0,0 +1,16 @@ +""" +Configuration file for content validation testing +This content should appear only ONCE in any tool response +""" + +# Configuration constants +MAX_CONTENT_TOKENS = 800_000 # This line should appear exactly once +TEMPERATURE_ANALYTICAL = 0.2 # This should also appear exactly once +UNIQUE_VALIDATION_MARKER = "CONTENT_VALIDATION_TEST_12345" + +# Database settings +DATABASE_CONFIG = { + "host": "localhost", + "port": 5432, + "name": "validation_test_db" +} diff --git a/tests/test_precommit_with_mock_store.py b/tests/test_precommit_with_mock_store.py new file mode 100644 index 0000000..044f7f5 --- /dev/null +++ b/tests/test_precommit_with_mock_store.py @@ -0,0 +1,261 @@ +""" +Enhanced tests for precommit tool using mock storage to test real logic +""" + +import json +import tempfile +import os +from unittest.mock import Mock, patch, MagicMock +from typing import Dict, Any, Optional + +import pytest + +from tools.precommit import Precommit, PrecommitRequest + + +class MockRedisClient: + """Mock Redis client that uses in-memory dictionary storage""" + + def __init__(self): + self.data: Dict[str, str] = {} + self.ttl_data: Dict[str, int] = {} + + def get(self, key: str) -> Optional[str]: + return self.data.get(key) + + def set(self, key: str, value: str, ex: Optional[int] = None) -> bool: + self.data[key] = value + if ex: + self.ttl_data[key] = ex + return True + + def 
delete(self, key: str) -> int: + if key in self.data: + del self.data[key] + self.ttl_data.pop(key, None) + return 1 + return 0 + + def exists(self, key: str) -> int: + return 1 if key in self.data else 0 + + +class TestPrecommitToolWithMockStore: + """Test precommit tool with mock storage to validate actual logic""" + + @pytest.fixture + def mock_redis(self): + """Create mock Redis client""" + return MockRedisClient() + + @pytest.fixture + def tool(self, mock_redis): + """Create tool instance with mocked Redis""" + tool = Precommit() + + # Mock the Redis client getter to return our mock + with patch('utils.conversation_memory.get_redis_client', return_value=mock_redis): + yield tool + + @pytest.fixture + def temp_repo(self): + """Create a temporary git repository with test files""" + import subprocess + + temp_dir = tempfile.mkdtemp() + + # Initialize git repo + subprocess.run(['git', 'init'], cwd=temp_dir, capture_output=True) + subprocess.run(['git', 'config', 'user.name', 'Test'], cwd=temp_dir, capture_output=True) + subprocess.run(['git', 'config', 'user.email', 'test@example.com'], cwd=temp_dir, capture_output=True) + + # Create test config file + config_content = '''"""Test configuration file""" + +# Version and metadata +__version__ = "1.0.0" +__author__ = "Test" + +# Configuration +MAX_CONTENT_TOKENS = 800_000 # 800K tokens for content +TEMPERATURE_ANALYTICAL = 0.2 # For code review, debugging +''' + + config_path = os.path.join(temp_dir, 'config.py') + with open(config_path, 'w') as f: + f.write(config_content) + + # Add and commit initial version + subprocess.run(['git', 'add', '.'], cwd=temp_dir, capture_output=True) + subprocess.run(['git', 'commit', '-m', 'Initial commit'], cwd=temp_dir, capture_output=True) + + # Modify config to create a diff + modified_content = config_content + '\nNEW_SETTING = "test" # Added setting\n' + with open(config_path, 'w') as f: + f.write(modified_content) + + yield temp_dir, config_path + + # Cleanup + import shutil + shutil.rmtree(temp_dir) + + @pytest.mark.asyncio + async def test_no_duplicate_file_content_in_prompt(self, tool, temp_repo, mock_redis): + """Test that file content doesn't appear twice in the generated prompt""" + temp_dir, config_path = temp_repo + + # Create request with files parameter + request = PrecommitRequest( + path=temp_dir, + files=[config_path], + original_request="Test configuration changes" + ) + + # Generate the prompt + prompt = await tool.prepare_prompt(request) + + # Test that MAX_CONTENT_TOKENS only appears once in the entire prompt + max_content_count = prompt.count('MAX_CONTENT_TOKENS = 800_000') + assert max_content_count == 1, f"MAX_CONTENT_TOKENS appears {max_content_count} times (should be 1)" + + # Test that the config file content only appears once + config_content_count = prompt.count('# Configuration') + assert config_content_count == 1, f"Config file content appears {config_content_count} times (should be 1)" + + # Verify expected sections are present + assert "## Original Request" in prompt + assert "Test configuration changes" in prompt + assert "## Additional Context Files" in prompt + assert "## Git Diffs" in prompt + + @pytest.mark.asyncio + async def test_conversation_memory_integration(self, tool, temp_repo, mock_redis): + """Test that conversation memory works with mock storage""" + temp_dir, config_path = temp_repo + + # Mock conversation memory functions to use our mock redis + with patch('utils.conversation_memory.get_redis_client', return_value=mock_redis): + # First request - should 
embed file content + request1 = PrecommitRequest( + path=temp_dir, + files=[config_path], + original_request="First review" + ) + + # Simulate conversation thread creation + from utils.conversation_memory import create_thread, add_turn + thread_id = create_thread("precommit", {"files": [config_path]}) + + # Test that file embedding works + files_to_embed = tool.filter_new_files([config_path], None) + assert config_path in files_to_embed, "New conversation should embed all files" + + # Add a turn to the conversation + add_turn(thread_id, "assistant", "First response", files=[config_path], tool_name="precommit") + + # Second request with continuation - should skip already embedded files + request2 = PrecommitRequest( + path=temp_dir, + files=[config_path], + continuation_id=thread_id, + original_request="Follow-up review" + ) + + files_to_embed_2 = tool.filter_new_files([config_path], thread_id) + assert len(files_to_embed_2) == 0, "Continuation should skip already embedded files" + + @pytest.mark.asyncio + async def test_prompt_structure_integrity(self, tool, temp_repo, mock_redis): + """Test that the prompt structure is well-formed and doesn't have content duplication""" + temp_dir, config_path = temp_repo + + request = PrecommitRequest( + path=temp_dir, + files=[config_path], + original_request="Validate prompt structure", + review_type="full", + severity_filter="high" + ) + + prompt = await tool.prepare_prompt(request) + + # Split prompt into sections + sections = { + "original_request": "## Original Request", + "review_parameters": "## Review Parameters", + "repo_summary": "## Repository Changes Summary", + "context_files_summary": "## Context Files Summary", + "git_diffs": "## Git Diffs", + "additional_context": "## Additional Context Files", + "review_instructions": "## Review Instructions" + } + + section_indices = {} + for name, header in sections.items(): + index = prompt.find(header) + if index != -1: + section_indices[name] = index + + # Verify sections appear in logical order + assert section_indices["original_request"] < section_indices["review_parameters"] + assert section_indices["review_parameters"] < section_indices["repo_summary"] + assert section_indices["git_diffs"] < section_indices["additional_context"] + assert section_indices["additional_context"] < section_indices["review_instructions"] + + # Test that file content only appears in Additional Context section + file_content_start = section_indices["additional_context"] + file_content_end = section_indices["review_instructions"] + + file_section = prompt[file_content_start:file_content_end] + before_file_section = prompt[:file_content_start] + after_file_section = prompt[file_content_end:] + + # MAX_CONTENT_TOKENS should only appear in the file section + assert 'MAX_CONTENT_TOKENS' in file_section + assert 'MAX_CONTENT_TOKENS' not in before_file_section + assert 'MAX_CONTENT_TOKENS' not in after_file_section + + @pytest.mark.asyncio + async def test_file_content_formatting(self, tool, temp_repo, mock_redis): + """Test that file content is properly formatted without duplication""" + temp_dir, config_path = temp_repo + + # Test the centralized file preparation method directly + file_content = tool._prepare_file_content_for_prompt( + [config_path], + None, # No continuation + "Test files", + max_tokens=100000, + reserve_tokens=1000 + ) + + # Should contain file markers + assert "--- BEGIN FILE:" in file_content + assert "--- END FILE:" in file_content + assert "config.py" in file_content + + # Should contain actual file 
content + assert "MAX_CONTENT_TOKENS = 800_000" in file_content + assert "__version__ = \"1.0.0\"" in file_content + + # Content should appear only once + assert file_content.count("MAX_CONTENT_TOKENS = 800_000") == 1 + assert file_content.count("__version__ = \"1.0.0\"") == 1 + + +def test_mock_redis_basic_operations(): + """Test that our mock Redis implementation works correctly""" + mock_redis = MockRedisClient() + + # Test basic operations + assert mock_redis.get("nonexistent") is None + assert mock_redis.exists("nonexistent") == 0 + + mock_redis.set("test_key", "test_value") + assert mock_redis.get("test_key") == "test_value" + assert mock_redis.exists("test_key") == 1 + + assert mock_redis.delete("test_key") == 1 + assert mock_redis.get("test_key") is None + assert mock_redis.delete("test_key") == 0 # Already deleted \ No newline at end of file diff --git a/tools/precommit.py b/tools/precommit.py index 1fd1498..050b6d3 100644 --- a/tools/precommit.py +++ b/tools/precommit.py @@ -10,7 +10,7 @@ from pydantic import Field from config import MAX_CONTEXT_TOKENS from prompts.tool_prompts import PRECOMMIT_PROMPT -from utils.file_utils import translate_file_paths, translate_path_for_environment +from utils.file_utils import read_files, translate_file_paths, translate_path_for_environment from utils.git_utils import find_git_repositories, get_git_status, run_git_command from utils.token_utils import estimate_tokens @@ -300,11 +300,11 @@ class Precommit(BaseTool): # Use centralized file handling with filtering for duplicate prevention file_content = self._prepare_file_content_for_prompt( - translated_files, - request.continuation_id, + translated_files, + request.continuation_id, "Context files", max_tokens=remaining_tokens + 1000, # Add back the reserve that was calculated - reserve_tokens=1000, # Small reserve for formatting + reserve_tokens=1000 # Small reserve for formatting ) if file_content: