Extra logging and more tests
@@ -4,8 +4,17 @@ Per-Tool File Deduplication Test

Tests file deduplication for each individual MCP tool to ensure
that files are properly deduplicated within single-tool conversations.
Validates that:
1. Files are embedded only once in conversation history
2. Continuation calls don't re-read existing files
3. New files are still properly embedded
4. Docker logs show deduplication behavior
"""

import json
import os
import subprocess
import tempfile

from .base_test import BaseSimulatorTest

@@ -20,96 +29,195 @@ class PerToolDeduplicationTest(BaseSimulatorTest):

    def test_description(self) -> str:
        return "File deduplication for individual tools"

    def run_test(self) -> bool:
        """Test file deduplication for each individual tool"""
        self.logger.info("📄 Test: Per-tool file deduplication")

    def get_docker_logs_since(self, since_time: str) -> str:
        """Get docker logs since a specific timestamp"""
        try:
            # Check both main server and log monitor for comprehensive logs
            cmd_server = ["docker", "logs", "--since", since_time, self.container_name]
            cmd_monitor = ["docker", "logs", "--since", since_time, "gemini-mcp-log-monitor"]

            result_server = subprocess.run(cmd_server, capture_output=True, text=True)
            result_monitor = subprocess.run(cmd_monitor, capture_output=True, text=True)

            # Combine logs from both containers
            combined_logs = result_server.stdout + "\n" + result_monitor.stdout
            return combined_logs
        except Exception as e:
            self.logger.error(f"Failed to get docker logs: {e}")
            return ""

    # create_additional_test_file method now inherited from base class

    def validate_file_deduplication_in_logs(self, logs: str, tool_name: str, test_file: str) -> bool:
        """Validate that logs show file deduplication behavior"""
        # Look for file embedding messages
        embedding_messages = [line for line in logs.split('\n') if '📁' in line and 'embedding' in line and tool_name in line]

        # Look for deduplication/filtering messages
        filtering_messages = [line for line in logs.split('\n') if '📁' in line and 'Filtering' in line and tool_name in line]
        skipping_messages = [line for line in logs.split('\n') if '📁' in line and 'skipping' in line and tool_name in line]

        deduplication_found = len(filtering_messages) > 0 or len(skipping_messages) > 0

        if deduplication_found:
            self.logger.info(f"   ✅ {tool_name}: Found deduplication evidence in logs")
            for msg in filtering_messages + skipping_messages:
                self.logger.debug(f"      📁 {msg.strip()}")
        else:
            self.logger.warning(f"   ⚠️ {tool_name}: No deduplication evidence found in logs")
            self.logger.debug(f"   📁 All embedding messages: {embedding_messages}")

        return deduplication_found

    def run_test(self) -> bool:
        """Test file deduplication with realistic precommit/codereview workflow"""
        try:
            self.logger.info("📄 Test: Simplified file deduplication with precommit/codereview workflow")

            # Setup test files
            self.setup_test_files()

            # Create a dummy file for precommit testing
            dummy_content = '''def hello_world():
    """A simple hello world function with a bug"""
    print("Hello world!")
    return "hello"

# TODO: Fix the inconsistent return type
def calculate_sum(a, b):
    return a + b  # Missing type hints
'''

            tools_to_test = [
                (
                    "thinkdeep",
                    {
                        "current_analysis": "Please use low thinking mode. I'm analyzing this Python code to identify potential architectural improvements",
                        "files": [self.test_files["python"]],
                    },
                ),
                (
                    "analyze",
                    {
                        "files": [self.test_files["python"]],
                        "question": "Please use low thinking mode. What are the architectural patterns in this code?",
                    },
                ),
                (
                    "debug",
                    {
                        "files": [self.test_files["python"]],
                        "error_description": "Please use low thinking mode. The fibonacci function seems slow for large numbers",
                    },
                ),
                (
                    "codereview",
                    {
                        "files": [self.test_files["python"]],
                        "context": "General code review for quality and best practices",
                    },
                ),
            ]

            dummy_file_path = self.create_additional_test_file("dummy_code.py", dummy_content)

            # Get timestamp for log filtering
            import datetime
            start_time = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")

            # Step 1: precommit tool with dummy file (low thinking mode)
            self.logger.info("  Step 1: precommit tool with dummy file")
            precommit_params = {
                "path": self.test_dir,  # Required path parameter
                "files": [dummy_file_path],
                "original_request": "Please use low thinking mode. Review this code for commit readiness",
                "thinking_mode": "low"
            }

            response1, continuation_id = self.call_mcp_tool("precommit", precommit_params)
            if not response1:
                self.logger.error("  ❌ Step 1: precommit tool failed")
                return False

            if not continuation_id:
                self.logger.error("  ❌ Step 1: precommit tool didn't provide continuation_id")
                return False

            self.logger.info(f"  ✅ Step 1: precommit completed with continuation_id: {continuation_id[:8]}...")

            # Step 2: codereview tool with same file (NO continuation - fresh conversation)
            self.logger.info("  Step 2: codereview tool with same file (fresh conversation)")
            codereview_params = {
                "files": [dummy_file_path],
                "context": "Please use low thinking mode. General code review for quality and best practices"
            }

            response2, _ = self.call_mcp_tool("codereview", codereview_params)
            if not response2:
                self.logger.error("  ❌ Step 2: codereview tool failed")
                return False

            self.logger.info("  ✅ Step 2: codereview completed (fresh conversation)")

            # Step 3: Create new file and continue with precommit
            self.logger.info("  Step 3: precommit continuation with old + new file")
            new_file_content = '''def new_feature():
    """A new feature function"""
    return {"status": "implemented", "version": "1.0"}

class NewUtility:
    """A new utility class"""

    def __init__(self):
        self.initialized = True

    def process_data(self, data):
        return f"Processed: {data}"
'''
            new_file_path = self.create_additional_test_file("new_feature.py", new_file_content)

            # Continue precommit with both files
            continue_params = {
                "continuation_id": continuation_id,
                "path": self.test_dir,  # Required path parameter
                "files": [dummy_file_path, new_file_path],  # Old + new file
                "original_request": "Please use low thinking mode. Now also review the new feature file along with the previous one",
                "thinking_mode": "low"
            }

            response3, _ = self.call_mcp_tool("precommit", continue_params)
            if not response3:
                self.logger.error("  ❌ Step 3: precommit continuation failed")
                return False

            self.logger.info("  ✅ Step 3: precommit continuation completed")

            # Validate results in docker logs
            self.logger.info("  📋 Validating conversation history and file deduplication...")
            logs = self.get_docker_logs_since(start_time)

            # Check for conversation history building
            conversation_logs = [line for line in logs.split('\n') if 'conversation' in line.lower() or 'history' in line.lower()]

            # Check for file embedding/deduplication
            embedding_logs = [line for line in logs.split('\n') if '📁' in line or 'embedding' in line.lower() or 'file' in line.lower()]

            # Check for continuation evidence
            continuation_logs = [line for line in logs.split('\n') if 'continuation' in line.lower() or continuation_id[:8] in line]

            # Check for both files mentioned
            dummy_file_mentioned = any("dummy_code.py" in line for line in logs.split('\n'))
            new_file_mentioned = any("new_feature.py" in line for line in logs.split('\n'))

            # Print diagnostic information
            self.logger.info(f"  📊 Conversation logs found: {len(conversation_logs)}")
            self.logger.info(f"  📊 File embedding logs found: {len(embedding_logs)}")
            self.logger.info(f"  📊 Continuation logs found: {len(continuation_logs)}")
            self.logger.info(f"  📊 Dummy file mentioned: {dummy_file_mentioned}")
            self.logger.info(f"  📊 New file mentioned: {new_file_mentioned}")

            if self.verbose:
                self.logger.debug("  📋 Sample embedding logs:")
                for log in embedding_logs[:5]:  # Show first 5
                    if log.strip():
                        self.logger.debug(f"    {log.strip()}")

                self.logger.debug("  📋 Sample continuation logs:")
                for log in continuation_logs[:3]:  # Show first 3
                    if log.strip():
                        self.logger.debug(f"    {log.strip()}")

            # Determine success criteria
            success_criteria = [
                len(embedding_logs) > 0,  # File embedding occurred
                len(continuation_logs) > 0,  # Continuation worked
                dummy_file_mentioned,  # Original file processed
                new_file_mentioned  # New file processed
            ]

            successful_tests = 0
            total_tests = len(tools_to_test)

            for tool_name, initial_params in tools_to_test:
                self.logger.info(f"  {tool_name}: Testing {tool_name} tool file deduplication")

                # Initial call
                response1, continuation_id = self.call_mcp_tool(tool_name, initial_params)
                if not response1:
                    self.logger.warning(f"  ⚠️ {tool_name} tool initial call failed, skipping")
                    continue

                if not continuation_id:
                    self.logger.warning(f"  ⚠️ {tool_name} tool didn't provide continuation_id, skipping")
                    continue

                # Continue with same file - should be deduplicated
                continue_params = initial_params.copy()
                continue_params["continuation_id"] = continuation_id

                if tool_name == "thinkdeep":
                    continue_params["current_analysis"] = (
                        "Please use low thinking mode. Now focus specifically on the recursive fibonacci implementation"
                    )
                elif tool_name == "analyze":
                    continue_params["question"] = (
                        "Please use low thinking mode. What are the performance characteristics of this code?"
                    )
                elif tool_name == "debug":
                    continue_params["error_description"] = (
                        "Please use low thinking mode. How can we optimize the fibonacci function?"
                    )
                elif tool_name == "codereview":
                    continue_params["context"] = "Focus on the Calculator class implementation"

                response2, _ = self.call_mcp_tool(tool_name, continue_params)
                if response2:
                    self.logger.info(f"  ✅ {tool_name} tool file deduplication working")
                    successful_tests += 1
                else:
                    self.logger.warning(f"  ⚠️ {tool_name} tool continuation failed")

            self.logger.info(
                f"  ✅ Per-tool file deduplication tests completed: {successful_tests}/{total_tests} tools passed"
            )

            # Consider test successful if at least one tool worked
            return successful_tests > 0

            passed_criteria = sum(success_criteria)
            total_criteria = len(success_criteria)

            self.logger.info(f"  📊 Success criteria met: {passed_criteria}/{total_criteria}")

            if passed_criteria >= 3:  # At least 3 out of 4 criteria
                self.logger.info("  ✅ File deduplication workflow test: PASSED")
                return True
            else:
                self.logger.warning("  ⚠️ File deduplication workflow test: FAILED")
                self.logger.warning("  💡 Check docker logs for detailed file embedding and continuation activity")
                return False

        except Exception as e:
            self.logger.error(f"Per-tool file deduplication test failed: {e}")
            self.logger.error(f"File deduplication workflow test failed: {e}")
            return False
        finally:
            self.cleanup_test_files()
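
The validation above boils down to scraping `docker logs --since` output from the server and log-monitor containers and looking for 📁-tagged "Filtering"/"skipping" lines. A minimal standalone sketch of that same check, independent of the test class, is shown below; the default server container name ("gemini-mcp-server") is a placeholder assumption, since the test itself reads it from self.container_name.

import subprocess

def deduplication_evidence(since: str, server_container: str = "gemini-mcp-server") -> bool:
    """Return True if the combined docker logs since `since` contain deduplication markers."""
    # NOTE: "gemini-mcp-server" is a placeholder; the simulator test uses self.container_name.
    logs = ""
    for name in (server_container, "gemini-mcp-log-monitor"):
        result = subprocess.run(["docker", "logs", "--since", since, name],
                                capture_output=True, text=True)
        logs += result.stdout + "\n"
    # Same markers the test greps for: 📁 lines reporting "Filtering" or "skipping" of files
    dedup_lines = [line for line in logs.split("\n")
                   if "📁" in line and ("Filtering" in line or "skipping" in line)]
    return len(dedup_lines) > 0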