More tests

Author: Fahad
Date: 2025-06-11 18:44:34 +04:00
parent ee3b9fdcd8
commit 898373bc22
10 changed files with 455 additions and 105 deletions

View File

@@ -366,7 +366,10 @@ def parse_arguments():
     parser.add_argument("--list-tests", action="store_true", help="List available tests and exit")
     parser.add_argument("--individual", "-i", help="Run a single test individually")
     parser.add_argument(
-        "--skip-docker", action="store_true", default=True, help="Skip Docker setup (assumes containers are already running) - DEFAULT"
+        "--skip-docker",
+        action="store_true",
+        default=True,
+        help="Skip Docker setup (assumes containers are already running) - DEFAULT",
     )
     parser.add_argument(
         "--rebuild-docker", action="store_true", help="Force rebuild Docker environment (overrides --skip-docker)"

View File

@@ -22,6 +22,7 @@ import asyncio
 import logging
 import os
 import sys
+import time
 from datetime import datetime
 from typing import Any
@@ -52,7 +53,8 @@ from tools.models import ToolOutput
 log_level = os.getenv("LOG_LEVEL", "INFO").upper()

 # Create timezone-aware formatter
-import time

 class LocalTimeFormatter(logging.Formatter):
     def formatTime(self, record, datefmt=None):
         """Override to use local timezone instead of UTC"""
@@ -61,9 +63,10 @@ class LocalTimeFormatter(logging.Formatter):
             s = time.strftime(datefmt, ct)
         else:
             t = time.strftime("%Y-%m-%d %H:%M:%S", ct)
-            s = "%s,%03d" % (t, record.msecs)
+            s = f"{t},{record.msecs:03.0f}"
         return s

 # Configure both console and file logging
 log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 logging.basicConfig(
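For context, LocalTimeFormatter only overrides formatTime, so it plugs into standard logging setup. A minimal sketch of attaching it by hand (the handler wiring here is illustrative; the diff only shows logging.basicConfig being configured with log_format):

handler = logging.StreamHandler()
handler.setFormatter(LocalTimeFormatter(log_format))
logging.getLogger().addHandler(handler)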
@@ -213,7 +216,9 @@ async def handle_call_tool(name: str, arguments: dict[str, Any]) -> list[TextCon
     if "continuation_id" in arguments and arguments["continuation_id"]:
         continuation_id = arguments["continuation_id"]
         logger.debug(f"Resuming conversation thread: {continuation_id}")
-        logger.debug(f"[CONVERSATION_DEBUG] Tool '{name}' resuming thread {continuation_id} with {len(arguments)} arguments")
+        logger.debug(
+            f"[CONVERSATION_DEBUG] Tool '{name}' resuming thread {continuation_id} with {len(arguments)} arguments"
+        )
         logger.debug(f"[CONVERSATION_DEBUG] Original arguments keys: {list(arguments.keys())}")

         # Log to activity file for monitoring
@@ -225,7 +230,7 @@ async def handle_call_tool(name: str, arguments: dict[str, Any]) -> list[TextCon
         arguments = await reconstruct_thread_context(arguments)
         logger.debug(f"[CONVERSATION_DEBUG] After thread reconstruction, arguments keys: {list(arguments.keys())}")
-        if '_remaining_tokens' in arguments:
+        if "_remaining_tokens" in arguments:
             logger.debug(f"[CONVERSATION_DEBUG] Remaining token budget: {arguments['_remaining_tokens']:,}")

     # Route to AI-powered tools that require Gemini API calls
@@ -354,7 +359,7 @@ async def reconstruct_thread_context(arguments: dict[str, Any]) -> dict[str, Any
     success = add_turn(continuation_id, "user", user_prompt, files=user_files)
     if not success:
         logger.warning(f"Failed to add user turn to thread {continuation_id}")
-        logger.debug(f"[CONVERSATION_DEBUG] Failed to add user turn - thread may be at turn limit or expired")
+        logger.debug("[CONVERSATION_DEBUG] Failed to add user turn - thread may be at turn limit or expired")
     else:
         logger.debug(f"[CONVERSATION_DEBUG] Successfully added user turn to thread {continuation_id}")
@@ -387,7 +392,7 @@ async def reconstruct_thread_context(arguments: dict[str, Any]) -> dict[str, Any
     remaining_tokens = MAX_CONTENT_TOKENS - conversation_tokens
     enhanced_arguments["_remaining_tokens"] = max(0, remaining_tokens)  # Ensure non-negative
-    logger.debug(f"[CONVERSATION_DEBUG] Token budget calculation:")
+    logger.debug("[CONVERSATION_DEBUG] Token budget calculation:")
     logger.debug(f"[CONVERSATION_DEBUG] MAX_CONTENT_TOKENS: {MAX_CONTENT_TOKENS:,}")
     logger.debug(f"[CONVERSATION_DEBUG] Conversation tokens: {conversation_tokens:,}")
     logger.debug(f"[CONVERSATION_DEBUG] Remaining tokens: {remaining_tokens:,}")
@@ -404,7 +409,7 @@ async def reconstruct_thread_context(arguments: dict[str, Any]) -> dict[str, Any
     logger.debug(f"[CONVERSATION_DEBUG] Final enhanced arguments keys: {list(enhanced_arguments.keys())}")

     # Debug log files in the enhanced arguments for file tracking
-    if 'files' in enhanced_arguments:
+    if "files" in enhanced_arguments:
         logger.debug(f"[CONVERSATION_DEBUG] Final files in enhanced arguments: {enhanced_arguments['files']}")

     # Log to activity file for monitoring

View File

@@ -8,6 +8,7 @@ Each test is in its own file for better organization and maintainability.
 from .base_test import BaseSimulatorTest
 from .test_basic_conversation import BasicConversationTest
 from .test_content_validation import ContentValidationTest
+from .test_cross_tool_comprehensive import CrossToolComprehensiveTest
 from .test_cross_tool_continuation import CrossToolContinuationTest
 from .test_logs_validation import LogsValidationTest
 from .test_per_tool_deduplication import PerToolDeduplicationTest
@@ -19,6 +20,7 @@ TEST_REGISTRY = {
     "content_validation": ContentValidationTest,
     "per_tool_deduplication": PerToolDeduplicationTest,
     "cross_tool_continuation": CrossToolContinuationTest,
+    "cross_tool_comprehensive": CrossToolComprehensiveTest,
     "logs_validation": LogsValidationTest,
     "redis_validation": RedisValidationTest,
 }
@@ -29,6 +31,7 @@ __all__ = [
     "ContentValidationTest",
     "PerToolDeduplicationTest",
     "CrossToolContinuationTest",
+    "CrossToolComprehensiveTest",
     "LogsValidationTest",
     "RedisValidationTest",
     "TEST_REGISTRY",

View File

@@ -96,10 +96,7 @@ class Calculator:
             f.write(config_content)

         # Ensure absolute paths for MCP server compatibility
-        self.test_files = {
-            "python": os.path.abspath(test_py),
-            "config": os.path.abspath(test_config)
-        }
+        self.test_files = {"python": os.path.abspath(test_py), "config": os.path.abspath(test_config)}

         self.logger.debug(f"Created test files with absolute paths: {list(self.test_files.values())}")

     def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
@@ -237,7 +234,7 @@ class Calculator:
     def create_additional_test_file(self, filename: str, content: str) -> str:
         """Create an additional test file for mixed scenario testing"""
-        if not hasattr(self, 'test_dir') or not self.test_dir:
+        if not hasattr(self, "test_dir") or not self.test_dir:
             raise RuntimeError("Test directory not initialized. Call setup_test_files() first.")

         file_path = os.path.join(self.test_dir, filename)

View File

@@ -113,11 +113,17 @@ DATABASE_CONFIG = {
         tools_to_test = [
             (
                 "chat",
-                {"prompt": "Please use low thinking mode. Analyze this config file", "files": [validation_file]},  # Using absolute path
+                {
+                    "prompt": "Please use low thinking mode. Analyze this config file",
+                    "files": [validation_file],
+                },  # Using absolute path
             ),
             (
                 "codereview",
-                {"files": [validation_file], "context": "Please use low thinking mode. Review this configuration"},  # Using absolute path
+                {
+                    "files": [validation_file],
+                    "context": "Please use low thinking mode. Review this configuration",
+                },  # Using absolute path
             ),
             ("analyze", {"files": [validation_file], "analysis_type": "code_quality"}),  # Using absolute path
         ]

View File

@@ -0,0 +1,306 @@
+#!/usr/bin/env python3
+"""
+Comprehensive Cross-Tool Test
+
+Tests file deduplication, conversation continuation, and file handling
+across all available MCP tools using realistic workflows with low thinking mode.
+
+Validates:
+1. Cross-tool conversation continuation
+2. File deduplication across different tools
+3. Mixed file scenarios (old + new files)
+4. Conversation history preservation
+5. Proper tool chaining with context
+"""
+
+import subprocess
+
+from .base_test import BaseSimulatorTest
+
+
+class CrossToolComprehensiveTest(BaseSimulatorTest):
+    """Comprehensive test across all MCP tools"""
+
+    @property
+    def test_name(self) -> str:
+        return "cross_tool_comprehensive"
+
+    @property
+    def test_description(self) -> str:
+        return "Comprehensive cross-tool file deduplication and continuation"
+
+    def get_docker_logs_since(self, since_time: str) -> str:
+        """Get docker logs since a specific timestamp"""
+        try:
+            # Check both main server and log monitor for comprehensive logs
+            cmd_server = ["docker", "logs", "--since", since_time, self.container_name]
+            cmd_monitor = ["docker", "logs", "--since", since_time, "gemini-mcp-log-monitor"]
+
+            result_server = subprocess.run(cmd_server, capture_output=True, text=True)
+            result_monitor = subprocess.run(cmd_monitor, capture_output=True, text=True)
+
+            # Combine logs from both containers
+            combined_logs = result_server.stdout + "\n" + result_monitor.stdout
+            return combined_logs
+        except Exception as e:
+            self.logger.error(f"Failed to get docker logs: {e}")
+            return ""
+
+    def run_test(self) -> bool:
+        """Comprehensive cross-tool test with all MCP tools"""
+        try:
+            self.logger.info("📄 Test: Comprehensive cross-tool file deduplication and continuation")
+
+            # Setup test files
+            self.setup_test_files()
+
+            # Create short test files for quick testing
+            python_code = '''def login(user, pwd):
+    # Security issue: plain text password
+    if user == "admin" and pwd == "123":
+        return True
+    return False
+
+def hash_pwd(pwd):
+    # Weak hashing
+    return str(hash(pwd))
+'''
+
+            config_file = """{
+    "db_password": "weak123",
+    "debug": true,
+    "secret_key": "test"
+}"""
+
+            auth_file = self.create_additional_test_file("auth.py", python_code)
+            config_file_path = self.create_additional_test_file("config.json", config_file)
+
+            # Get timestamp for log filtering
+            import datetime
+
+            start_time = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
+
+            # Tool chain: chat → analyze → debug → codereview → precommit
+            # Each step builds on the previous with cross-tool continuation
+            current_continuation_id = None
+            responses = []
+
+            # Step 1: Start with chat tool to understand the codebase
+            self.logger.info(" Step 1: chat tool - Initial codebase exploration")
+            chat_params = {
+                "prompt": "Please give me a quick one line reply. I have an authentication module that needs review. Can you help me understand potential issues?",
+                "files": [auth_file],
+                "thinking_mode": "low"
+            }
+
+            response1, continuation_id1 = self.call_mcp_tool("chat", chat_params)
+            if not response1 or not continuation_id1:
+                self.logger.error(" ❌ Step 1: chat tool failed")
+                return False
+
+            self.logger.info(f" ✅ Step 1: chat completed with continuation_id: {continuation_id1[:8]}...")
+            responses.append(("chat", response1, continuation_id1))
+            current_continuation_id = continuation_id1
+
+            # Step 2: Use analyze tool to do deeper analysis (fresh conversation)
+            self.logger.info(" Step 2: analyze tool - Deep code analysis (fresh)")
+            analyze_params = {
+                "files": [auth_file],
+                "question": "Please give me a quick one line reply. What are the security vulnerabilities and architectural issues in this authentication code?",
+                "thinking_mode": "low"
+            }
+
+            response2, continuation_id2 = self.call_mcp_tool("analyze", analyze_params)
+            if not response2:
+                self.logger.error(" ❌ Step 2: analyze tool failed")
+                return False
+
+            self.logger.info(
+                f" ✅ Step 2: analyze completed with continuation_id: {continuation_id2[:8] if continuation_id2 else 'None'}..."
+            )
+            responses.append(("analyze", response2, continuation_id2))
+
+            # Step 3: Continue chat conversation with config file
+            self.logger.info(" Step 3: chat continuation - Add config file context")
+            chat_continue_params = {
+                "continuation_id": current_continuation_id,
+                "prompt": "Please give me a quick one line reply. I also have this configuration file. Can you analyze it alongside the authentication code?",
+                "files": [auth_file, config_file_path],  # Old + new file
+                "thinking_mode": "low"
+            }
+
+            response3, _ = self.call_mcp_tool("chat", chat_continue_params)
+            if not response3:
+                self.logger.error(" ❌ Step 3: chat continuation failed")
+                return False
+
+            self.logger.info(" ✅ Step 3: chat continuation completed")
+            responses.append(("chat_continue", response3, current_continuation_id))
+
+            # Step 4: Use debug tool to identify specific issues
+            self.logger.info(" Step 4: debug tool - Identify specific problems")
+            debug_params = {
+                "files": [auth_file, config_file_path],
+                "error_description": "Please give me a quick one line reply. The authentication system has security vulnerabilities. Help me identify and fix the main issues.",
+                "thinking_mode": "low"
+            }
+
+            response4, continuation_id4 = self.call_mcp_tool("debug", debug_params)
+            if not response4:
+                self.logger.error(" ❌ Step 4: debug tool failed")
+                return False
+
+            self.logger.info(
+                f" ✅ Step 4: debug completed with continuation_id: {continuation_id4[:8] if continuation_id4 else 'None'}..."
+            )
+            responses.append(("debug", response4, continuation_id4))
+
+            # Step 5: Cross-tool continuation - continue debug with chat context
+            if continuation_id4:
+                self.logger.info(" Step 5: debug continuation - Additional analysis")
+                debug_continue_params = {
+                    "continuation_id": continuation_id4,
+                    "files": [auth_file, config_file_path],
+                    "error_description": "Please give me a quick one line reply. What specific code changes would you recommend to fix the password hashing vulnerability?",
+                    "thinking_mode": "low"
+                }
+
+                response5, _ = self.call_mcp_tool("debug", debug_continue_params)
+                if response5:
+                    self.logger.info(" ✅ Step 5: debug continuation completed")
+                    responses.append(("debug_continue", response5, continuation_id4))
+
+            # Step 6: Use codereview for comprehensive review
+            self.logger.info(" Step 6: codereview tool - Comprehensive code review")
+            codereview_params = {
+                "files": [auth_file, config_file_path],
+                "context": "Please give me a quick one line reply. Comprehensive security-focused code review for production readiness",
+                "thinking_mode": "low"
+            }
+
+            response6, continuation_id6 = self.call_mcp_tool("codereview", codereview_params)
+            if not response6:
+                self.logger.error(" ❌ Step 6: codereview tool failed")
+                return False
+
+            self.logger.info(
+                f" ✅ Step 6: codereview completed with continuation_id: {continuation_id6[:8] if continuation_id6 else 'None'}..."
+            )
+            responses.append(("codereview", response6, continuation_id6))
+
+            # Step 7: Create improved version and use precommit
+            self.logger.info(" Step 7: precommit tool - Pre-commit validation")
+
+            # Create a short improved version
+            improved_code = '''import hashlib
+
+def secure_login(user, pwd):
+    # Better: hashed password check
+    hashed = hashlib.sha256(pwd.encode()).hexdigest()
+    if user == "admin" and hashed == "expected_hash":
+        return True
+    return False
+'''
+
+            improved_file = self.create_additional_test_file("auth_improved.py", improved_code)
+
+            precommit_params = {
+                "path": self.test_dir,
+                "files": [auth_file, config_file_path, improved_file],
+                "original_request": "Please give me a quick one line reply. Ready to commit security improvements to authentication module",
+                "thinking_mode": "low",
+            }
+
+            response7, continuation_id7 = self.call_mcp_tool("precommit", precommit_params)
+            if not response7:
+                self.logger.error(" ❌ Step 7: precommit tool failed")
+                return False
+
+            self.logger.info(
+                f" ✅ Step 7: precommit completed with continuation_id: {continuation_id7[:8] if continuation_id7 else 'None'}..."
+            )
+            responses.append(("precommit", response7, continuation_id7))
+
+            # Validate comprehensive results
+            self.logger.info(" 📋 Validating comprehensive cross-tool results...")
+            logs = self.get_docker_logs_since(start_time)
+
+            # Validation criteria
+            tools_used = [r[0] for r in responses]
+            continuation_ids_created = [r[2] for r in responses if r[2]]
+
+            # Check for various log patterns
+            conversation_logs = [
+                line for line in logs.split("\n") if "conversation" in line.lower() or "history" in line.lower()
+            ]
+            embedding_logs = [
+                line
+                for line in logs.split("\n")
+                if "📁" in line or "embedding" in line.lower() or "file" in line.lower()
+            ]
+            continuation_logs = [
+                line for line in logs.split("\n") if "continuation" in line.lower() or "resuming" in line.lower()
+            ]
+            cross_tool_logs = [
+                line
+                for line in logs.split("\n")
+                if any(tool in line.lower() for tool in ["chat", "analyze", "debug", "codereview", "precommit"])
+            ]
+
+            # File mentions
+            auth_file_mentioned = any("auth.py" in line for line in logs.split("\n"))
+            config_file_mentioned = any("config.json" in line for line in logs.split("\n"))
+            improved_file_mentioned = any("auth_improved.py" in line for line in logs.split("\n"))
+
+            # Print comprehensive diagnostics
+            self.logger.info(f" 📊 Tools used: {len(tools_used)} ({', '.join(tools_used)})")
+            self.logger.info(f" 📊 Continuation IDs created: {len(continuation_ids_created)}")
+            self.logger.info(f" 📊 Conversation logs found: {len(conversation_logs)}")
+            self.logger.info(f" 📊 File embedding logs found: {len(embedding_logs)}")
+            self.logger.info(f" 📊 Continuation logs found: {len(continuation_logs)}")
+            self.logger.info(f" 📊 Cross-tool activity logs: {len(cross_tool_logs)}")
+            self.logger.info(f" 📊 Auth file mentioned: {auth_file_mentioned}")
+            self.logger.info(f" 📊 Config file mentioned: {config_file_mentioned}")
+            self.logger.info(f" 📊 Improved file mentioned: {improved_file_mentioned}")
+
+            if self.verbose:
+                self.logger.debug(" 📋 Sample tool activity logs:")
+                for log in cross_tool_logs[:10]:  # Show first 10
+                    if log.strip():
+                        self.logger.debug(f" {log.strip()}")
+
+                self.logger.debug(" 📋 Sample continuation logs:")
+                for log in continuation_logs[:5]:  # Show first 5
+                    if log.strip():
+                        self.logger.debug(f" {log.strip()}")
+
+            # Comprehensive success criteria
+            success_criteria = [
+                len(tools_used) >= 5,  # Used multiple tools
+                len(continuation_ids_created) >= 3,  # Created multiple continuation threads
+                len(embedding_logs) > 10,  # Significant file embedding activity
+                len(continuation_logs) > 0,  # Evidence of continuation
+                auth_file_mentioned,  # Original file processed
+                config_file_mentioned,  # Additional file processed
+                improved_file_mentioned,  # New file processed
+                len(conversation_logs) > 5,  # Conversation history activity
+            ]
+
+            passed_criteria = sum(success_criteria)
+            total_criteria = len(success_criteria)
+            self.logger.info(f" 📊 Success criteria met: {passed_criteria}/{total_criteria}")
+
+            if passed_criteria >= 6:  # At least 6 out of 8 criteria
+                self.logger.info(" ✅ Comprehensive cross-tool test: PASSED")
+                return True
+            else:
+                self.logger.warning(" ⚠️ Comprehensive cross-tool test: FAILED")
+                self.logger.warning(" 💡 Check logs for detailed cross-tool activity")
+                return False
+
+        except Exception as e:
+            self.logger.error(f"Comprehensive cross-tool test failed: {e}")
+            return False
+        finally:
+            self.cleanup_test_files()
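With the registry entry above in place, the new test should be runnable on its own through the --individual flag added in the first file. A hedged invocation example (the runner script name is an assumption, not shown in this commit):

python communication_simulator_test.py --individual cross_tool_comprehensive  # script name assumed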

View File

@@ -11,10 +11,8 @@ Validates that:
 4. Docker logs show deduplication behavior
 """

-import json
-import os
 import subprocess
-import tempfile

 from .base_test import BaseSimulatorTest
@@ -51,11 +49,17 @@ class PerToolDeduplicationTest(BaseSimulatorTest):
     def validate_file_deduplication_in_logs(self, logs: str, tool_name: str, test_file: str) -> bool:
         """Validate that logs show file deduplication behavior"""
         # Look for file embedding messages
-        embedding_messages = [line for line in logs.split('\n') if '📁' in line and 'embedding' in line and tool_name in line]
+        embedding_messages = [
+            line for line in logs.split("\n") if "📁" in line and "embedding" in line and tool_name in line
+        ]

         # Look for deduplication/filtering messages
-        filtering_messages = [line for line in logs.split('\n') if '📁' in line and 'Filtering' in line and tool_name in line]
-        skipping_messages = [line for line in logs.split('\n') if '📁' in line and 'skipping' in line and tool_name in line]
+        filtering_messages = [
+            line for line in logs.split("\n") if "📁" in line and "Filtering" in line and tool_name in line
+        ]
+        skipping_messages = [
+            line for line in logs.split("\n") if "📁" in line and "skipping" in line and tool_name in line
+        ]

         deduplication_found = len(filtering_messages) > 0 or len(skipping_messages) > 0
@@ -77,20 +81,18 @@ class PerToolDeduplicationTest(BaseSimulatorTest):
         # Setup test files
         self.setup_test_files()

-        # Create a dummy file for precommit testing
-        dummy_content = '''def hello_world():
-    """A simple hello world function with a bug"""
-    print("Hello world!")
-    return "hello"
-
-# TODO: Fix the inconsistent return type
-def calculate_sum(a, b):
+        # Create a short dummy file for quick testing
+        dummy_content = '''def add(a, b):
     return a + b  # Missing type hints
+
+def divide(x, y):
+    return x / y  # No zero check
 '''
         dummy_file_path = self.create_additional_test_file("dummy_code.py", dummy_content)

         # Get timestamp for log filtering
         import datetime

         start_time = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")

         # Step 1: precommit tool with dummy file (low thinking mode)
@@ -98,8 +100,8 @@ def calculate_sum(a, b):
         precommit_params = {
             "path": self.test_dir,  # Required path parameter
             "files": [dummy_file_path],
-            "original_request": "Please use low thinking mode. Review this code for commit readiness",
-            "thinking_mode": "low"
+            "original_request": "Please give me a quick one line reply. Review this code for commit readiness",
+            "thinking_mode": "low",
         }

         response1, continuation_id = self.call_mcp_tool("precommit", precommit_params)
@@ -111,13 +113,19 @@ def calculate_sum(a, b):
             self.logger.error(" ❌ Step 1: precommit tool didn't provide continuation_id")
             return False

+        # Validate continuation_id format (should be UUID)
+        if len(continuation_id) < 32:
+            self.logger.error(f" ❌ Step 1: Invalid continuation_id format: {continuation_id}")
+            return False
+
         self.logger.info(f" ✅ Step 1: precommit completed with continuation_id: {continuation_id[:8]}...")

         # Step 2: codereview tool with same file (NO continuation - fresh conversation)
         self.logger.info(" Step 2: codereview tool with same file (fresh conversation)")
         codereview_params = {
             "files": [dummy_file_path],
-            "context": "Please use low thinking mode. General code review for quality and best practices"
+            "context": "Please give me a quick one line reply. General code review for quality and best practices",
+            "thinking_mode": "low"
         }

         response2, _ = self.call_mcp_tool("codereview", codereview_params)
@@ -129,18 +137,11 @@ def calculate_sum(a, b):
         # Step 3: Create new file and continue with precommit
         self.logger.info(" Step 3: precommit continuation with old + new file")

-        new_file_content = '''def new_feature():
-    """A new feature function"""
-    return {"status": "implemented", "version": "1.0"}
-
-class NewUtility:
-    """A new utility class"""
-    def __init__(self):
-        self.initialized = True
-
-    def process_data(self, data):
-        return f"Processed: {data}"
+        new_file_content = '''def multiply(x, y):
+    return x * y
+
+def subtract(a, b):
+    return a - b
 '''
         new_file_path = self.create_additional_test_file("new_feature.py", new_file_content)
@@ -149,8 +150,8 @@ class NewUtility:
             "continuation_id": continuation_id,
             "path": self.test_dir,  # Required path parameter
             "files": [dummy_file_path, new_file_path],  # Old + new file
-            "original_request": "Please use low thinking mode. Now also review the new feature file along with the previous one",
-            "thinking_mode": "low"
+            "original_request": "Please give me a quick one line reply. Now also review the new feature file along with the previous one",
+            "thinking_mode": "low",
         }

         response3, _ = self.call_mcp_tool("precommit", continue_params)
@@ -165,17 +166,25 @@ class NewUtility:
         logs = self.get_docker_logs_since(start_time)

         # Check for conversation history building
-        conversation_logs = [line for line in logs.split('\n') if 'conversation' in line.lower() or 'history' in line.lower()]
+        conversation_logs = [
+            line for line in logs.split("\n") if "conversation" in line.lower() or "history" in line.lower()
+        ]

         # Check for file embedding/deduplication
-        embedding_logs = [line for line in logs.split('\n') if '📁' in line or 'embedding' in line.lower() or 'file' in line.lower()]
+        embedding_logs = [
+            line
+            for line in logs.split("\n")
+            if "📁" in line or "embedding" in line.lower() or "file" in line.lower()
+        ]

         # Check for continuation evidence
-        continuation_logs = [line for line in logs.split('\n') if 'continuation' in line.lower() or continuation_id[:8] in line]
+        continuation_logs = [
+            line for line in logs.split("\n") if "continuation" in line.lower() or continuation_id[:8] in line
+        ]

         # Check for both files mentioned
-        dummy_file_mentioned = any("dummy_code.py" in line for line in logs.split('\n'))
-        new_file_mentioned = any("new_feature.py" in line for line in logs.split('\n'))
+        dummy_file_mentioned = any("dummy_code.py" in line for line in logs.split("\n"))
+        new_file_mentioned = any("new_feature.py" in line for line in logs.split("\n"))

         # Print diagnostic information
         self.logger.info(f" 📊 Conversation logs found: {len(conversation_logs)}")
@@ -200,7 +209,7 @@ class NewUtility:
             len(embedding_logs) > 0,  # File embedding occurred
             len(continuation_logs) > 0,  # Continuation worked
             dummy_file_mentioned,  # Original file processed
-            new_file_mentioned  # New file processed
+            new_file_mentioned,  # New file processed
         ]

         passed_criteria = sum(success_criteria)

View File

@@ -226,12 +226,16 @@ class BaseTool(ABC):
                 logger.debug(
                     f"📁 {self.name} tool: No files found in conversation history for thread {continuation_id}"
                 )
-                logger.debug(f"[FILES] {self.name}: No embedded files found, returning all {len(requested_files)} requested files")
+                logger.debug(
+                    f"[FILES] {self.name}: No embedded files found, returning all {len(requested_files)} requested files"
+                )
                 return requested_files

             # Return only files that haven't been embedded yet
             new_files = [f for f in requested_files if f not in embedded_files]
-            logger.debug(f"[FILES] {self.name}: After filtering: {len(new_files)} new files, {len(requested_files) - len(new_files)} already embedded")
+            logger.debug(
+                f"[FILES] {self.name}: After filtering: {len(new_files)} new files, {len(requested_files) - len(new_files)} already embedded"
+            )
             logger.debug(f"[FILES] {self.name}: New files to embed: {new_files}")

             # Log filtering results for debugging
@@ -249,7 +253,9 @@ class BaseTool(ABC):
             # and include all files rather than risk losing access to needed files
             logger.warning(f"📁 {self.name} tool: Error checking conversation history for {continuation_id}: {e}")
             logger.warning(f"📁 {self.name} tool: Including all requested files as fallback")
-            logger.debug(f"[FILES] {self.name}: Exception in filter_new_files, returning all {len(requested_files)} files as fallback")
+            logger.debug(
+                f"[FILES] {self.name}: Exception in filter_new_files, returning all {len(requested_files)} files as fallback"
+            )
             return requested_files

     def _prepare_file_content_for_prompt(
@@ -312,7 +318,9 @@ class BaseTool(ABC):
         # Read content of new files only
         if files_to_embed:
             logger.debug(f"📁 {self.name} tool embedding {len(files_to_embed)} new files: {', '.join(files_to_embed)}")
-            logger.debug(f"[FILES] {self.name}: Starting file embedding with token budget {effective_max_tokens + reserve_tokens:,}")
+            logger.debug(
+                f"[FILES] {self.name}: Starting file embedding with token budget {effective_max_tokens + reserve_tokens:,}"
+            )
             try:
                 file_content = read_files(
                     files_to_embed, max_tokens=effective_max_tokens + reserve_tokens, reserve_tokens=reserve_tokens
@@ -916,6 +924,7 @@ If any of these would strengthen your analysis, specify what Claude should searc
         if continuation_id:
             # Check remaining turns in existing thread
             from utils.conversation_memory import get_thread
+
             context = get_thread(continuation_id)
             if context:
                 current_turns = len(context.turns)

View File

@@ -301,7 +301,7 @@ def get_conversation_file_list(context: ThreadContext) -> list[str]:
         list[str]: Deduplicated list of file paths referenced in the conversation
     """
     if not context.turns:
-        logger.debug(f"[FILES] No turns found, returning empty file list")
+        logger.debug("[FILES] No turns found, returning empty file list")
         return []

     # Collect all unique files from all turns, preserving order of first appearance
@@ -409,13 +409,17 @@ def build_conversation_history(context: ThreadContext, read_files_func=None) ->
                     logger.debug(
                         f"📄 File embedded in conversation history: {file_path} ({content_tokens:,} tokens)"
                     )
-                    logger.debug(f"[FILES] Successfully embedded {file_path} - {content_tokens:,} tokens (total: {total_tokens:,})")
+                    logger.debug(
+                        f"[FILES] Successfully embedded {file_path} - {content_tokens:,} tokens (total: {total_tokens:,})"
+                    )
                 else:
                     files_truncated += 1
                     logger.debug(
                         f"📄 File truncated due to token limit: {file_path} ({content_tokens:,} tokens, would exceed {MAX_CONTENT_TOKENS:,} limit)"
                     )
-                    logger.debug(f"[FILES] File {file_path} would exceed token limit - skipping (would be {total_tokens + content_tokens:,} tokens)")
+                    logger.debug(
+                        f"[FILES] File {file_path} would exceed token limit - skipping (would be {total_tokens + content_tokens:,} tokens)"
+                    )
                     # Stop processing more files
                     break
             else:
@@ -439,7 +443,9 @@ def build_conversation_history(context: ThreadContext, read_files_func=None) ->
             logger.debug(
                 f"📄 Conversation history file embedding complete: {files_included} files embedded, {files_truncated} truncated, {total_tokens:,} total tokens"
             )
-            logger.debug(f"[FILES] File embedding summary - {files_included} embedded, {files_truncated} truncated, {total_tokens:,} tokens total")
+            logger.debug(
+                f"[FILES] File embedding summary - {files_included} embedded, {files_truncated} truncated, {total_tokens:,} tokens total"
+            )
         else:
             history_parts.append("(No accessible files found)")
             logger.debug(
@@ -509,7 +515,9 @@ def build_conversation_history(context: ThreadContext, read_files_func=None) ->
     # Summary log of what was built
     user_turns = len([t for t in context.turns if t.role == "user"])
    assistant_turns = len([t for t in context.turns if t.role == "assistant"])
-    logger.debug(f"[FLOW] Built conversation history: {user_turns} user + {assistant_turns} assistant turns, {len(all_files)} files, {total_conversation_tokens:,} tokens")
+    logger.debug(
+        f"[FLOW] Built conversation history: {user_turns} user + {assistant_turns} assistant turns, {len(all_files)} files, {total_conversation_tokens:,} tokens"
+    )

     return complete_history, total_conversation_tokens

View File

@@ -518,7 +518,9 @@ def read_files(
         max_tokens = MAX_CONTEXT_TOKENS

     logger.debug(f"[FILES] read_files called with {len(file_paths)} paths")
-    logger.debug(f"[FILES] Token budget: max={max_tokens:,}, reserve={reserve_tokens:,}, available={max_tokens - reserve_tokens:,}")
+    logger.debug(
+        f"[FILES] Token budget: max={max_tokens:,}, reserve={reserve_tokens:,}, available={max_tokens - reserve_tokens:,}"
+    )

     content_parts = []
     total_tokens = 0
@@ -546,7 +548,7 @@ def read_files(
     if not all_files and file_paths:
         # No files found but paths were provided
-        logger.debug(f"[FILES] No files found from provided paths")
+        logger.debug("[FILES] No files found from provided paths")
         content_parts.append(f"\n--- NO FILES FOUND ---\nProvided paths: {', '.join(file_paths)}\n--- END ---\n")
     else:
         # Read files sequentially until token limit is reached
@@ -567,7 +569,9 @@ def read_files(
                 logger.debug(f"[FILES] Added file {file_path}, total tokens: {total_tokens:,}")
             else:
                 # File too large for remaining budget
-                logger.debug(f"[FILES] File {file_path} too large for remaining budget ({file_tokens:,} tokens, {available_tokens - total_tokens:,} remaining)")
+                logger.debug(
+                    f"[FILES] File {file_path} too large for remaining budget ({file_tokens:,} tokens, {available_tokens - total_tokens:,} remaining)"
+                )
                 files_skipped.append(file_path)

         # Add informative note about skipped files to help users understand