Add live-simulation tests to validate that conversation continuation / preservation works across requests

Fahad
2025-06-11 17:16:05 +04:00
parent c90ac7561e
commit 780000f9c9
15 changed files with 272 additions and 2296 deletions
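
For orientation, the cross-request flow these simulator tests exercise looks roughly like the sketch below: call a tool once, capture the continuation_id it returns, then pass that id to a later call (possibly from a different tool) so the server can rebuild the conversation history. This is only a sketch built on the BaseSimulatorTest helpers shown in the diffs that follow; the class name, prompts, and run_sketch() entry point are illustrative assumptions, not code from the commit.

```python
# Minimal sketch (not part of the commit) of the continuation round trip the
# real tests assert on. Only BaseSimulatorTest helpers visible in the diffs
# below are used; everything else here is an illustrative assumption.
from .base_test import BaseSimulatorTest


class ContinuationSketch(BaseSimulatorTest):
    """Illustrative only: one thread continued across two tool calls."""

    def run_sketch(self) -> bool:
        self.setup_test_files()
        try:
            # Request 1: chat returns both the response and a continuation_id
            response1, continuation_id = self.call_mcp_tool(
                "chat",
                {
                    "prompt": "Please use low thinking mode. Analyze this Python code",
                    "files": [self.test_files["python"]],
                },
            )
            if not response1 or not continuation_id:
                return False

            # Request 2: a different tool continues the same thread; the server
            # should restore the earlier context and avoid re-embedding the file
            response2, _ = self.call_mcp_tool(
                "codereview",
                {
                    "files": [self.test_files["python"]],
                    "context": "Please use low thinking mode. Review the code we just discussed",
                    "continuation_id": continuation_id,
                },
            )
            return bool(response2)
        finally:
            self.cleanup_test_files()
```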

View File: __init__.py

@@ -8,9 +8,9 @@ Each test is in its own file for better organization and maintainability.
from .base_test import BaseSimulatorTest
from .test_basic_conversation import BasicConversationTest
from .test_content_validation import ContentValidationTest
from .test_per_tool_deduplication import PerToolDeduplicationTest
from .test_cross_tool_continuation import CrossToolContinuationTest
from .test_logs_validation import LogsValidationTest
from .test_per_tool_deduplication import PerToolDeduplicationTest
from .test_redis_validation import RedisValidationTest
# Test registry for dynamic loading
@@ -24,12 +24,12 @@ TEST_REGISTRY = {
}
__all__ = [
'BaseSimulatorTest',
'BasicConversationTest',
'ContentValidationTest',
'PerToolDeduplicationTest',
'CrossToolContinuationTest',
'LogsValidationTest',
'RedisValidationTest',
'TEST_REGISTRY'
]
"BaseSimulatorTest",
"BasicConversationTest",
"ContentValidationTest",
"PerToolDeduplicationTest",
"CrossToolContinuationTest",
"LogsValidationTest",
"RedisValidationTest",
"TEST_REGISTRY",
]
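
The TEST_REGISTRY entries themselves are collapsed in the hunk above, but the registry is what lets a runner load tests by name. A hypothetical runner, placed alongside these modules in the same package, could look like the sketch below; the registry keys, the verbose keyword, and the run_test() entry point are assumptions, since none of them are visible in this diff.

```python
# Hypothetical runner sketch: assumes TEST_REGISTRY maps short names to test
# classes and that each class exposes run_test() -> bool; neither detail is
# shown in the hunk above.
from . import TEST_REGISTRY


def run_selected(names: list[str], verbose: bool = False) -> dict[str, bool]:
    results: dict[str, bool] = {}
    for name in names:
        test_cls = TEST_REGISTRY[name]  # dynamic lookup by registry key
        results[name] = test_cls(verbose=verbose).run_test()
    return results
```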

View File: base_test.py

@@ -9,9 +9,7 @@ import json
import logging
import os
import subprocess
import tempfile
import time
from typing import Optional, Tuple
from typing import Optional
class BaseSimulatorTest:
@@ -23,7 +21,7 @@ class BaseSimulatorTest:
self.test_dir = None
self.container_name = "gemini-mcp-server"
self.redis_container = "gemini-mcp-redis"
# Configure logging
log_level = logging.DEBUG if verbose else logging.INFO
logging.basicConfig(level=log_level, format="%(asctime)s - %(levelname)s - %(message)s")
@@ -100,7 +98,7 @@ class Calculator:
self.test_files = {"python": test_py, "config": test_config}
self.logger.debug(f"Created test files: {list(self.test_files.values())}")
def call_mcp_tool(self, tool_name: str, params: dict) -> Tuple[Optional[str], Optional[str]]:
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
"""Call an MCP tool via Claude CLI (docker exec)"""
try:
# Prepare the MCP initialization and tool call sequence
@@ -237,6 +235,7 @@ class Calculator:
"""Clean up test files"""
if hasattr(self, "test_dir") and self.test_dir and os.path.exists(self.test_dir):
import shutil
shutil.rmtree(self.test_dir)
self.logger.debug(f"Removed test files directory: {self.test_dir}")
@@ -252,4 +251,4 @@ class Calculator:
@property
def test_description(self) -> str:
"""Get the test description - to be implemented by subclasses"""
raise NotImplementedError("Subclasses must implement test_description property")

View File: test_basic_conversation.py

@@ -34,7 +34,10 @@ class BasicConversationTest(BaseSimulatorTest):
self.logger.info(" 1.1: Initial chat with file analysis")
response1, continuation_id = self.call_mcp_tool(
"chat",
{"prompt": "Please use low thinking mode. Analyze this Python code and explain what it does", "files": [self.test_files["python"]]},
{
"prompt": "Please use low thinking mode. Analyze this Python code and explain what it does",
"files": [self.test_files["python"]],
},
)
if not response1 or not continuation_id:
@@ -80,4 +83,4 @@ class BasicConversationTest(BaseSimulatorTest):
self.logger.error(f"Basic conversation flow test failed: {e}")
return False
finally:
self.cleanup_test_files()

View File: test_content_validation.py

@@ -8,6 +8,7 @@ This test is specifically designed to catch content duplication bugs.
import json
import os
from .base_test import BaseSimulatorTest
@@ -26,10 +27,10 @@ class ContentValidationTest(BaseSimulatorTest):
"""Test that tools don't duplicate file content in their responses"""
try:
self.logger.info("📄 Test: Content validation and duplicate detection")
# Setup test files first
self.setup_test_files()
# Create a test file with distinctive content for validation
validation_content = '''"""
Configuration file for content validation testing
@@ -41,102 +42,110 @@ MAX_CONTENT_TOKENS = 800_000 # This line should appear exactly once
TEMPERATURE_ANALYTICAL = 0.2 # This should also appear exactly once
UNIQUE_VALIDATION_MARKER = "CONTENT_VALIDATION_TEST_12345"
# Database settings
DATABASE_CONFIG = {
"host": "localhost",
"port": 5432,
"name": "validation_test_db"
}
'''
validation_file = os.path.join(self.test_dir, "validation_config.py")
with open(validation_file, "w") as f:
f.write(validation_content)
# Test 1: Precommit tool with files parameter (where the bug occurred)
self.logger.info(" 1: Testing precommit tool content duplication")
# Call precommit tool with the validation file
response1, thread_id = self.call_mcp_tool(
"precommit",
"precommit",
{
"path": os.getcwd(),
"files": [validation_file],
"original_request": "Test for content duplication in precommit tool"
}
"original_request": "Test for content duplication in precommit tool",
},
)
if response1:
# Parse response and check for content duplication
try:
response_data = json.loads(response1)
content = response_data.get("content", "")
# Count occurrences of distinctive markers
max_content_count = content.count("MAX_CONTENT_TOKENS = 800_000")
temp_analytical_count = content.count("TEMPERATURE_ANALYTICAL = 0.2")
unique_marker_count = content.count("UNIQUE_VALIDATION_MARKER")
# Validate no duplication
duplication_detected = False
issues = []
if max_content_count > 1:
issues.append(f"MAX_CONTENT_TOKENS appears {max_content_count} times")
duplication_detected = True
if temp_analytical_count > 1:
issues.append(f"TEMPERATURE_ANALYTICAL appears {temp_analytical_count} times")
duplication_detected = True
if unique_marker_count > 1:
issues.append(f"UNIQUE_VALIDATION_MARKER appears {unique_marker_count} times")
duplication_detected = True
if duplication_detected:
self.logger.error(f" ❌ Content duplication detected in precommit tool: {'; '.join(issues)}")
return False
else:
self.logger.info(" ✅ No content duplication in precommit tool")
except json.JSONDecodeError:
self.logger.warning(" ⚠️ Could not parse precommit response as JSON")
else:
self.logger.warning(" ⚠️ Precommit tool failed to respond")
# Test 2: Other tools that use files parameter
tools_to_test = [
("chat", {"prompt": "Please use low thinking mode. Analyze this config file", "files": [validation_file]}),
("codereview", {"files": [validation_file], "context": "Please use low thinking mode. Review this configuration"}),
("analyze", {"files": [validation_file], "analysis_type": "code_quality"})
(
"chat",
{"prompt": "Please use low thinking mode. Analyze this config file", "files": [validation_file]},
),
(
"codereview",
{"files": [validation_file], "context": "Please use low thinking mode. Review this configuration"},
),
("analyze", {"files": [validation_file], "analysis_type": "code_quality"}),
]
for tool_name, params in tools_to_test:
self.logger.info(f" 2.{tool_name}: Testing {tool_name} tool content duplication")
response, _ = self.call_mcp_tool(tool_name, params)
if response:
try:
response_data = json.loads(response)
content = response_data.get("content", "")
# Check for duplication
marker_count = content.count("UNIQUE_VALIDATION_MARKER")
if marker_count > 1:
self.logger.error(f" ❌ Content duplication in {tool_name}: marker appears {marker_count} times")
self.logger.error(
f" ❌ Content duplication in {tool_name}: marker appears {marker_count} times"
)
return False
else:
self.logger.info(f" ✅ No content duplication in {tool_name}")
except json.JSONDecodeError:
self.logger.warning(f" ⚠️ Could not parse {tool_name} response")
else:
self.logger.warning(f" ⚠️ {tool_name} tool failed to respond")
# Test 3: Cross-tool content validation with file deduplication
self.logger.info(" 3: Testing cross-tool content consistency")
if thread_id:
# Continue conversation with same file - content should be deduplicated in conversation history
response2, _ = self.call_mcp_tool(
@@ -147,31 +156,33 @@ DATABASE_CONFIG = {
"continuation_id": thread_id,
},
)
if response2:
try:
response_data = json.loads(response2)
content = response_data.get("content", "")
# In continuation, the file content shouldn't be duplicated either
marker_count = content.count("UNIQUE_VALIDATION_MARKER")
if marker_count > 1:
self.logger.error(f" ❌ Content duplication in cross-tool continuation: marker appears {marker_count} times")
self.logger.error(
f" ❌ Content duplication in cross-tool continuation: marker appears {marker_count} times"
)
return False
else:
self.logger.info(" ✅ No content duplication in cross-tool continuation")
except json.JSONDecodeError:
self.logger.warning(" ⚠️ Could not parse continuation response")
# Cleanup
os.remove(validation_file)
self.logger.info(" ✅ All content validation tests passed")
return True
except Exception as e:
self.logger.error(f"Content validation test failed: {e}")
return False
finally:
self.cleanup_test_files()
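
Distilled from the checks above, a small standalone helper that applies the same duplicate-marker heuristic to any tool response; the function name is ours, but the JSON parsing and marker counting mirror ContentValidationTest.

```python
import json


def marker_count(response_text: str, marker: str = "UNIQUE_VALIDATION_MARKER") -> int:
    """Count occurrences of a distinctive marker in a tool response's content.

    A count above 1 is the duplication signal ContentValidationTest looks for:
    it means the file content was embedded in the response more than once.
    """
    try:
        content = json.loads(response_text).get("content", "")
    except json.JSONDecodeError:
        return 0
    return content.count(marker)


# Usage sketch: duplicated = marker_count(response1) > 1
```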

View File: test_cross_tool_continuation.py

@@ -43,8 +43,10 @@ class CrossToolContinuationTest(BaseSimulatorTest):
if self._test_multi_file_continuation():
success_count += 1
self.logger.info(f" ✅ Cross-tool continuation scenarios completed: {success_count}/{total_scenarios} scenarios passed")
self.logger.info(
f" ✅ Cross-tool continuation scenarios completed: {success_count}/{total_scenarios} scenarios passed"
)
# Consider successful if at least one scenario worked
return success_count > 0
@@ -193,4 +195,4 @@ class CrossToolContinuationTest(BaseSimulatorTest):
except Exception as e:
self.logger.error(f"Multi-file continuation scenario failed: {e}")
return False

View File: test_logs_validation.py

@@ -96,4 +96,4 @@ class LogsValidationTest(BaseSimulatorTest):
except Exception as e:
self.logger.error(f"Log validation failed: {e}")
return False

View File: test_per_tool_deduplication.py

@@ -32,16 +32,22 @@ class PerToolDeduplicationTest(BaseSimulatorTest):
(
"thinkdeep",
{
"prompt": "Please use low thinking mode. Think deeply about this Python code and identify potential architectural improvements",
"current_analysis": "Please use low thinking mode. I'm analyzing this Python code to identify potential architectural improvements",
"files": [self.test_files["python"]],
},
),
("analyze", {"files": [self.test_files["python"]], "analysis_type": "architecture"}),
(
"analyze",
{
"files": [self.test_files["python"]],
"question": "Please use low thinking mode. What are the architectural patterns in this code?",
},
),
(
"debug",
{
"files": [self.test_files["python"]],
"issue_description": "The fibonacci function seems slow for large numbers",
"error_description": "Please use low thinking mode. The fibonacci function seems slow for large numbers",
},
),
(
@@ -74,11 +80,17 @@ class PerToolDeduplicationTest(BaseSimulatorTest):
continue_params["continuation_id"] = continuation_id
if tool_name == "thinkdeep":
continue_params["prompt"] = "Please use low thinking mode. Now focus specifically on the recursive fibonacci implementation"
continue_params["current_analysis"] = (
"Please use low thinking mode. Now focus specifically on the recursive fibonacci implementation"
)
elif tool_name == "analyze":
continue_params["analysis_type"] = "performance"
continue_params["question"] = (
"Please use low thinking mode. What are the performance characteristics of this code?"
)
elif tool_name == "debug":
continue_params["issue_description"] = "How can we optimize the fibonacci function?"
continue_params["error_description"] = (
"Please use low thinking mode. How can we optimize the fibonacci function?"
)
elif tool_name == "codereview":
continue_params["context"] = "Focus on the Calculator class implementation"
@@ -89,8 +101,10 @@ class PerToolDeduplicationTest(BaseSimulatorTest):
else:
self.logger.warning(f" ⚠️ {tool_name} tool continuation failed")
self.logger.info(f" ✅ Per-tool file deduplication tests completed: {successful_tests}/{total_tests} tools passed")
self.logger.info(
f" ✅ Per-tool file deduplication tests completed: {successful_tests}/{total_tests} tools passed"
)
# Consider test successful if at least one tool worked
return successful_tests > 0
@@ -98,4 +112,4 @@ class PerToolDeduplicationTest(BaseSimulatorTest):
self.logger.error(f"Per-tool file deduplication test failed: {e}")
return False
finally:
self.cleanup_test_files()

View File: test_redis_validation.py

@@ -7,6 +7,7 @@ for stored conversation threads and their content.
"""
import json
from .base_test import BaseSimulatorTest
@@ -30,15 +31,15 @@ class RedisValidationTest(BaseSimulatorTest):
ping_result = self.run_command(
["docker", "exec", self.redis_container, "redis-cli", "ping"], capture_output=True
)
if ping_result.returncode != 0:
self.logger.error("Failed to connect to Redis")
return False
if "PONG" not in ping_result.stdout.decode():
self.logger.error("Redis ping failed")
return False
self.logger.info("✅ Redis connectivity confirmed")
# Check Redis for stored conversations
@@ -76,51 +77,55 @@ class RedisValidationTest(BaseSimulatorTest):
else:
# If no existing threads, create a test thread to validate Redis functionality
self.logger.info("📝 No existing threads found, creating test thread to validate Redis...")
test_thread_id = "test_thread_validation"
test_data = {
"thread_id": test_thread_id,
"turns": [
{
"tool": "chat",
"timestamp": "2025-06-11T16:30:00Z",
"prompt": "Test validation prompt"
}
]
{"tool": "chat", "timestamp": "2025-06-11T16:30:00Z", "prompt": "Test validation prompt"}
],
}
# Store test data
store_result = self.run_command([
"docker", "exec", self.redis_container, "redis-cli",
"SET", f"thread:{test_thread_id}", json.dumps(test_data)
], capture_output=True)
store_result = self.run_command(
[
"docker",
"exec",
self.redis_container,
"redis-cli",
"SET",
f"thread:{test_thread_id}",
json.dumps(test_data),
],
capture_output=True,
)
if store_result.returncode != 0:
self.logger.error("Failed to store test data in Redis")
return False
# Retrieve test data
retrieve_result = self.run_command([
"docker", "exec", self.redis_container, "redis-cli",
"GET", f"thread:{test_thread_id}"
], capture_output=True)
retrieve_result = self.run_command(
["docker", "exec", self.redis_container, "redis-cli", "GET", f"thread:{test_thread_id}"],
capture_output=True,
)
if retrieve_result.returncode != 0:
self.logger.error("Failed to retrieve test data from Redis")
return False
retrieved_data = retrieve_result.stdout.decode()
try:
parsed = json.loads(retrieved_data)
if parsed.get("thread_id") == test_thread_id:
self.logger.info("✅ Redis read/write validation successful")
# Clean up test data
self.run_command([
"docker", "exec", self.redis_container, "redis-cli",
"DEL", f"thread:{test_thread_id}"
], capture_output=True)
self.run_command(
["docker", "exec", self.redis_container, "redis-cli", "DEL", f"thread:{test_thread_id}"],
capture_output=True,
)
return True
else:
self.logger.error("Retrieved data doesn't match stored data")
@@ -131,4 +136,4 @@ class RedisValidationTest(BaseSimulatorTest):
except Exception as e:
self.logger.error(f"Conversation memory validation failed: {e}")
return False
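
As a follow-up to the Redis round trip above, a quick way to eyeball what a simulator run actually stored is to list the thread keys through the same run_command helper. The sketch below assumes the thread:{id} key scheme used in this test and the gemini-mcp-redis container name from BaseSimulatorTest; on a busy Redis instance SCAN would be gentler than KEYS, but for a local test container KEYS is fine.

```python
# Sketch only: lists stored conversation threads after a simulator run.
# "test" is any BaseSimulatorTest instance; the thread:* pattern assumes the
# key scheme used in RedisValidationTest above.
def list_thread_keys(test) -> list[str]:
    result = test.run_command(
        ["docker", "exec", test.redis_container, "redis-cli", "KEYS", "thread:*"],
        capture_output=True,
    )
    if result.returncode != 0:
        return []
    return [line for line in result.stdout.decode().splitlines() if line]
```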