Add live-simulation tests to validate that conversation continuation / preservation works across requests

Fahad
2025-06-11 17:16:05 +04:00
parent c90ac7561e
commit 780000f9c9
15 changed files with 272 additions and 2296 deletions
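
For orientation, the cross-request flow these simulator tests exercise looks roughly like the sketch below: call a tool once, capture the continuation_id it returns, then pass that id to a later call (possibly from a different tool) so the server can rebuild the conversation history. This is only a sketch built on the BaseSimulatorTest helpers shown in the diffs that follow; the class name, prompts, and run_sketch() entry point are illustrative assumptions, not code from the commit.

```python
# Minimal sketch (not part of the commit) of the continuation round trip the
# real tests assert on. Only BaseSimulatorTest helpers visible in the diffs
# below are used; everything else here is an illustrative assumption.
from .base_test import BaseSimulatorTest


class ContinuationSketch(BaseSimulatorTest):
    """Illustrative only: one thread continued across two tool calls."""

    def run_sketch(self) -> bool:
        self.setup_test_files()
        try:
            # Request 1: chat returns both the response and a continuation_id
            response1, continuation_id = self.call_mcp_tool(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Analyze this Python code",
                    "files": [self.test_files["python"]],
                },
            )
            if not response1 or not continuation_id:
                return False

            # Request 2: a different tool continues the same thread; the server
            # should restore the earlier context and avoid re-embedding the file
            response2, _ = self.call_mcp_tool(
                "codereview",
                {
                    "files": [self.test_files["python"]],
                    "context": "Please use low thinking mode. Review the code we just discussed",
                    "continuation_id": continuation_id,
                },
            )
            return bool(response2)
        finally:
            self.cleanup_test_files()
```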

View File: __init__.py

@@ -8,9 +8,9 @@ Each test is in its own file for better organization and maintainability.
from .base_test import BaseSimulatorTest
from .test_basic_conversation import BasicConversationTest
from .test_content_validation import ContentValidationTest
from .test_per_tool_deduplication import PerToolDeduplicationTest
from .test_cross_tool_continuation import CrossToolContinuationTest
from .test_logs_validation import LogsValidationTest
from .test_per_tool_deduplication import PerToolDeduplicationTest
from .test_redis_validation import RedisValidationTest
# Test registry for dynamic loading
@@ -24,12 +24,12 @@ TEST_REGISTRY = {
}
__all__ = [
'BaseSimulatorTest',
'BasicConversationTest',
'ContentValidationTest',
'PerToolDeduplicationTest',
'CrossToolContinuationTest',
'LogsValidationTest',
'RedisValidationTest',
'TEST_REGISTRY'
]
"BaseSimulatorTest",
"BasicConversationTest",
"ContentValidationTest",
"PerToolDeduplicationTest",
"CrossToolContinuationTest",
"LogsValidationTest",
"RedisValidationTest",
"TEST_REGISTRY",
]
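
The TEST_REGISTRY entries themselves are collapsed in the hunk above, but the registry is what lets a runner load tests by name. A hypothetical runner, placed alongside these modules in the same package, could look like the sketch below; the registry keys, the verbose keyword, and the run_test() entry point are assumptions, since none of them are visible in this diff.

```python
# Hypothetical runner sketch: assumes TEST_REGISTRY maps short names to test
# classes and that each class exposes run_test() -> bool; neither detail is
# shown in the hunk above.
from . import TEST_REGISTRY


def run_selected(names: list[str], verbose: bool = False) -> dict[str, bool]:
    results: dict[str, bool] = {}
    for name in names:
        test_cls = TEST_REGISTRY[name]  # dynamic lookup by registry key
        results[name] = test_cls(verbose=verbose).run_test()
    return results
```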

View File: base_test.py

@@ -9,9 +9,7 @@ import json
import logging
import os
import subprocess
import tempfile
import time
from typing import Optional, Tuple
from typing import Optional
class BaseSimulatorTest:
@@ -23,7 +21,7 @@ class BaseSimulatorTest:
self.test_dir = None
self.container_name = "gemini-mcp-server"
self.redis_container = "gemini-mcp-redis"
# Configure logging
log_level = logging.DEBUG if verbose else logging.INFO
logging.basicConfig(level=log_level, format="%(asctime)s - %(levelname)s - %(message)s")
@@ -100,7 +98,7 @@ class Calculator:
self.test_files = {"python": test_py, "config": test_config}
self.logger.debug(f"Created test files: {list(self.test_files.values())}")
def call_mcp_tool(self, tool_name: str, params: dict) -> Tuple[Optional[str], Optional[str]]:
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
"""Call an MCP tool via Claude CLI (docker exec)"""
try:
# Prepare the MCP initialization and tool call sequence
@@ -237,6 +235,7 @@ class Calculator:
"""Clean up test files"""
if hasattr(self, "test_dir") and self.test_dir and os.path.exists(self.test_dir):
import shutil
shutil.rmtree(self.test_dir)
self.logger.debug(f"Removed test files directory: {self.test_dir}")
@@ -252,4 +251,4 @@ class Calculator:
@property
def test_description(self) -> str:
"""Get the test description - to be implemented by subclasses"""
raise NotImplementedError("Subclasses must implement test_description property")

View File: test_basic_conversation.py

@@ -34,7 +34,10 @@ class BasicConversationTest(BaseSimulatorTest):
self.logger.info(" 1.1: Initial chat with file analysis")
response1, continuation_id = self.call_mcp_tool(
"chat",
{"prompt": "Please use low thinking mode. Analyze this Python code and explain what it does", "files": [self.test_files["python"]]},
{
"prompt": "Please use low thinking mode. Analyze this Python code and explain what it does",
"files": [self.test_files["python"]],
},
)
if not response1 or not continuation_id:
@@ -80,4 +83,4 @@ class BasicConversationTest(BaseSimulatorTest):
self.logger.error(f"Basic conversation flow test failed: {e}")
return False
finally:
self.cleanup_test_files()

View File: test_content_validation.py

@@ -8,6 +8,7 @@ This test is specifically designed to catch content duplication bugs.
import json
import os
from .base_test import BaseSimulatorTest
@@ -26,10 +27,10 @@ class ContentValidationTest(BaseSimulatorTest):
"""Test that tools don't duplicate file content in their responses"""
try:
self.logger.info("📄 Test: Content validation and duplicate detection")
# Setup test files first
self.setup_test_files()
# Create a test file with distinctive content for validation
validation_content = '''"""
Configuration file for content validation testing
@@ -41,102 +42,110 @@ MAX_CONTENT_TOKENS = 800_000 # This line should appear exactly once
TEMPERATURE_ANALYTICAL = 0.2 # This should also appear exactly once
UNIQUE_VALIDATION_MARKER = "CONTENT_VALIDATION_TEST_12345"
# Database settings
DATABASE_CONFIG = {
"host": "localhost",
"port": 5432,
"name": "validation_test_db"
}
'''
validation_file = os.path.join(self.test_dir, "validation_config.py")
with open(validation_file, "w") as f:
f.write(validation_content)
# Test 1: Precommit tool with files parameter (where the bug occurred)
self.logger.info(" 1: Testing precommit tool content duplication")
# Call precommit tool with the validation file
response1, thread_id = self.call_mcp_tool(
"precommit",
"precommit",
{
"path": os.getcwd(),
"files": [validation_file],
"original_request": "Test for content duplication in precommit tool"
}
"original_request": "Test for content duplication in precommit tool",
},
)
if response1:
# Parse response and check for content duplication
try:
response_data = json.loads(response1)
content = response_data.get("content", "")
# Count occurrences of distinctive markers
max_content_count = content.count("MAX_CONTENT_TOKENS = 800_000")
temp_analytical_count = content.count("TEMPERATURE_ANALYTICAL = 0.2")
unique_marker_count = content.count("UNIQUE_VALIDATION_MARKER")
# Validate no duplication
duplication_detected = False
issues = []
if max_content_count > 1:
issues.append(f"MAX_CONTENT_TOKENS appears {max_content_count} times")
duplication_detected = True
if temp_analytical_count > 1:
issues.append(f"TEMPERATURE_ANALYTICAL appears {temp_analytical_count} times")
duplication_detected = True
if unique_marker_count > 1:
issues.append(f"UNIQUE_VALIDATION_MARKER appears {unique_marker_count} times")
duplication_detected = True
if duplication_detected:
self.logger.error(f" ❌ Content duplication detected in precommit tool: {'; '.join(issues)}")
return False
else:
self.logger.info(" ✅ No content duplication in precommit tool")
except json.JSONDecodeError:
self.logger.warning(" ⚠️ Could not parse precommit response as JSON")
else:
self.logger.warning(" ⚠️ Precommit tool failed to respond")
# Test 2: Other tools that use files parameter
tools_to_test = [
("chat", {"prompt": "Please use low thinking mode. Analyze this config file", "files": [validation_file]}),
("codereview", {"files": [validation_file], "context": "Please use low thinking mode. Review this configuration"}),
("analyze", {"files": [validation_file], "analysis_type": "code_quality"})
(
"chat",
{"prompt": "Please use low thinking mode. Analyze this config file", "files": [validation_file]},
),
(
"codereview",
{"files": [validation_file], "context": "Please use low thinking mode. Review this configuration"},
),
("analyze", {"files": [validation_file], "analysis_type": "code_quality"}),
]
for tool_name, params in tools_to_test:
self.logger.info(f" 2.{tool_name}: Testing {tool_name} tool content duplication")
response, _ = self.call_mcp_tool(tool_name, params)
if response:
try:
response_data = json.loads(response)
content = response_data.get("content", "")
# Check for duplication
marker_count = content.count("UNIQUE_VALIDATION_MARKER")
if marker_count > 1:
self.logger.error(f" ❌ Content duplication in {tool_name}: marker appears {marker_count} times")
self.logger.error(
f" ❌ Content duplication in {tool_name}: marker appears {marker_count} times"
)
return False
else:
self.logger.info(f" ✅ No content duplication in {tool_name}")
except json.JSONDecodeError:
self.logger.warning(f" ⚠️ Could not parse {tool_name} response")
else:
self.logger.warning(f" ⚠️ {tool_name} tool failed to respond")
# Test 3: Cross-tool content validation with file deduplication
self.logger.info(" 3: Testing cross-tool content consistency")
if thread_id:
# Continue conversation with same file - content should be deduplicated in conversation history
response2, _ = self.call_mcp_tool(
@@ -147,31 +156,33 @@ DATABASE_CONFIG = {
"continuation_id": thread_id,
},
)
if response2:
try:
response_data = json.loads(response2)
content = response_data.get("content", "")
# In continuation, the file content shouldn't be duplicated either
marker_count = content.count("UNIQUE_VALIDATION_MARKER")
if marker_count > 1:
self.logger.error(f" ❌ Content duplication in cross-tool continuation: marker appears {marker_count} times")
self.logger.error(
f" ❌ Content duplication in cross-tool continuation: marker appears {marker_count} times"
)
return False
else:
self.logger.info(" ✅ No content duplication in cross-tool continuation")
except json.JSONDecodeError:
self.logger.warning(" ⚠️ Could not parse continuation response")
# Cleanup
os.remove(validation_file)
self.logger.info(" ✅ All content validation tests passed")
return True
except Exception as e:
self.logger.error(f"Content validation test failed: {e}")
return False
finally:
self.cleanup_test_files()
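
Distilled from the checks above, a small standalone helper that applies the same duplicate-marker heuristic to any tool response; the function name is ours, but the JSON parsing and marker counting mirror ContentValidationTest.

```python
import json


def marker_count(response_text: str, marker: str = "UNIQUE_VALIDATION_MARKER") -> int:
    """Count occurrences of a distinctive marker in a tool response's content.

    A count above 1 is the duplication signal ContentValidationTest looks for:
    it means the file content was embedded in the response more than once.
    """
    try:
        content = json.loads(response_text).get("content", "")
    except json.JSONDecodeError:
        return 0
    return content.count(marker)


# Usage sketch: duplicated = marker_count(response1) > 1
```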

View File: test_cross_tool_continuation.py

@@ -43,8 +43,10 @@ class CrossToolContinuationTest(BaseSimulatorTest):
if self._test_multi_file_continuation():
success_count += 1
self.logger.info(f" ✅ Cross-tool continuation scenarios completed: {success_count}/{total_scenarios} scenarios passed")
self.logger.info(
f" ✅ Cross-tool continuation scenarios completed: {success_count}/{total_scenarios} scenarios passed"
)
# Consider successful if at least one scenario worked
return success_count > 0
@@ -193,4 +195,4 @@ class CrossToolContinuationTest(BaseSimulatorTest):
except Exception as e:
self.logger.error(f"Multi-file continuation scenario failed: {e}")
return False

View File: test_logs_validation.py

@@ -96,4 +96,4 @@ class LogsValidationTest(BaseSimulatorTest):
except Exception as e:
self.logger.error(f"Log validation failed: {e}")
return False

View File: test_per_tool_deduplication.py

@@ -32,16 +32,22 @@ class PerToolDeduplicationTest(BaseSimulatorTest):
(
"thinkdeep",
{
"prompt": "Please use low thinking mode. Think deeply about this Python code and identify potential architectural improvements",
"current_analysis": "Please use low thinking mode. I'm analyzing this Python code to identify potential architectural improvements",
"files": [self.test_files["python"]],
},
),
("analyze", {"files": [self.test_files["python"]], "analysis_type": "architecture"}),
(
"analyze",
{
"files": [self.test_files["python"]],
"question": "Please use low thinking mode. What are the architectural patterns in this code?",
},
),
(
"debug",
{
"files": [self.test_files["python"]],
"issue_description": "The fibonacci function seems slow for large numbers",
"error_description": "Please use low thinking mode. The fibonacci function seems slow for large numbers",
},
),
(
@@ -74,11 +80,17 @@ class PerToolDeduplicationTest(BaseSimulatorTest):
continue_params["continuation_id"] = continuation_id
if tool_name == "thinkdeep":
continue_params["prompt"] = "Please use low thinking mode. Now focus specifically on the recursive fibonacci implementation"
continue_params["current_analysis"] = (
"Please use low thinking mode. Now focus specifically on the recursive fibonacci implementation"
)
elif tool_name == "analyze":
continue_params["analysis_type"] = "performance"
continue_params["question"] = (
"Please use low thinking mode. What are the performance characteristics of this code?"
)
elif tool_name == "debug":
continue_params["issue_description"] = "How can we optimize the fibonacci function?"
continue_params["error_description"] = (
"Please use low thinking mode. How can we optimize the fibonacci function?"
)
elif tool_name == "codereview":
continue_params["context"] = "Focus on the Calculator class implementation"
@@ -89,8 +101,10 @@ class PerToolDeduplicationTest(BaseSimulatorTest):
else:
self.logger.warning(f" ⚠️ {tool_name} tool continuation failed")
self.logger.info(f" ✅ Per-tool file deduplication tests completed: {successful_tests}/{total_tests} tools passed")
self.logger.info(
f" ✅ Per-tool file deduplication tests completed: {successful_tests}/{total_tests} tools passed"
)
# Consider test successful if at least one tool worked
return successful_tests > 0
@@ -98,4 +112,4 @@ class PerToolDeduplicationTest(BaseSimulatorTest):
self.logger.error(f"Per-tool file deduplication test failed: {e}")
return False
finally:
self.cleanup_test_files()

View File: test_redis_validation.py

@@ -7,6 +7,7 @@ for stored conversation threads and their content.
"""
import json
from .base_test import BaseSimulatorTest
@@ -30,15 +31,15 @@ class RedisValidationTest(BaseSimulatorTest):
ping_result = self.run_command(
["docker", "exec", self.redis_container, "redis-cli", "ping"], capture_output=True
)
if ping_result.returncode != 0:
self.logger.error("Failed to connect to Redis")
return False
if "PONG" not in ping_result.stdout.decode():
self.logger.error("Redis ping failed")
return False
self.logger.info("✅ Redis connectivity confirmed")
# Check Redis for stored conversations
@@ -76,51 +77,55 @@ class RedisValidationTest(BaseSimulatorTest):
else:
# If no existing threads, create a test thread to validate Redis functionality
self.logger.info("📝 No existing threads found, creating test thread to validate Redis...")
test_thread_id = "test_thread_validation"
test_data = {
"thread_id": test_thread_id,
"turns": [
{
"tool": "chat",
"timestamp": "2025-06-11T16:30:00Z",
"prompt": "Test validation prompt"
}
]
{"tool": "chat", "timestamp": "2025-06-11T16:30:00Z", "prompt": "Test validation prompt"}
],
}
# Store test data
store_result = self.run_command([
"docker", "exec", self.redis_container, "redis-cli",
"SET", f"thread:{test_thread_id}", json.dumps(test_data)
], capture_output=True)
store_result = self.run_command(
[
"docker",
"exec",
self.redis_container,
"redis-cli",
"SET",
f"thread:{test_thread_id}",
json.dumps(test_data),
],
capture_output=True,
)
if store_result.returncode != 0:
self.logger.error("Failed to store test data in Redis")
return False
# Retrieve test data
retrieve_result = self.run_command([
"docker", "exec", self.redis_container, "redis-cli",
"GET", f"thread:{test_thread_id}"
], capture_output=True)
retrieve_result = self.run_command(
["docker", "exec", self.redis_container, "redis-cli", "GET", f"thread:{test_thread_id}"],
capture_output=True,
)
if retrieve_result.returncode != 0:
self.logger.error("Failed to retrieve test data from Redis")
return False
retrieved_data = retrieve_result.stdout.decode()
try:
parsed = json.loads(retrieved_data)
if parsed.get("thread_id") == test_thread_id:
self.logger.info("✅ Redis read/write validation successful")
# Clean up test data
self.run_command([
"docker", "exec", self.redis_container, "redis-cli",
"DEL", f"thread:{test_thread_id}"
], capture_output=True)
self.run_command(
["docker", "exec", self.redis_container, "redis-cli", "DEL", f"thread:{test_thread_id}"],
capture_output=True,
)
return True
else:
self.logger.error("Retrieved data doesn't match stored data")
@@ -131,4 +136,4 @@ class RedisValidationTest(BaseSimulatorTest):
except Exception as e:
self.logger.error(f"Conversation memory validation failed: {e}")
return False
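
As a follow-up to the Redis round trip above, a quick way to eyeball what a simulator run actually stored is to list the thread keys through the same run_command helper. The sketch below assumes the thread:{id} key scheme used in this test and the gemini-mcp-redis container name from BaseSimulatorTest; on a busy Redis instance SCAN would be gentler than KEYS, but for a local test container KEYS is fine.

```python
# Sketch only: lists stored conversation threads after a simulator run.
# "test" is any BaseSimulatorTest instance; the thread:* pattern assumes the
# key scheme used in RedisValidationTest above.
def list_thread_keys(test) -> list[str]:
    result = test.run_command(
        ["docker", "exec", test.redis_container, "redis-cli", "KEYS", "thread:*"],
        capture_output=True,
    )
    if result.returncode != 0:
        return []
    return [line for line in result.stdout.decode().splitlines() if line]
```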