Migration from Docker to Standalone Python Server (#73)

* Migration from docker to standalone server
Migration handling
Fixed tests
Use simpler in-memory storage
Support for concurrent logging to disk
Simplified direct connections to localhost

* Migration from docker / redis to standalone script
Updated tests
Updated run script
Fixed requirements
Use dotenv
Ask once whether the user would like to install the MCP server in Claude Desktop
Updated docs

* More cleanup and references to docker removed

* Cleanup

* Comments

* Fixed tests

* Fix GitHub Actions workflow for standalone Python architecture

- Install requirements-dev.txt for pytest and testing dependencies
- Remove Docker setup from simulation tests (now standalone)
- Simplify linting job to use requirements-dev.txt
- Update simulation tests to run directly without Docker

Fixes unit test failures in CI due to missing pytest dependency.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

* Remove simulation tests from GitHub Actions

- Removed simulation-tests job that makes real API calls
- Keep only unit tests (mocked, no API costs) and linting
- Simulation tests should be run manually with real API keys
- Reduces CI costs and complexity

GitHub Actions now only runs:
- Unit tests (569 tests, all mocked)
- Code quality checks (ruff, black)

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

* Fixed tests

* Fixed tests

---------

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Beehive Innovations
2025-06-18 23:41:22 +04:00
committed by GitHub
parent 9d72545ecd
commit 4151c3c3a5
121 changed files with 2842 additions and 3168 deletions

View File

@@ -26,7 +26,8 @@ from .test_openrouter_models import OpenRouterModelsTest
from .test_per_tool_deduplication import PerToolDeduplicationTest
from .test_planner_continuation_history import PlannerContinuationHistoryTest
from .test_planner_validation import PlannerValidationTest
from .test_redis_validation import RedisValidationTest
# Redis validation test removed - no longer needed for standalone server
from .test_refactor_validation import RefactorValidationTest
from .test_testgen_validation import TestGenValidationTest
from .test_token_allocation_validation import TokenAllocationValidationTest
@@ -42,7 +43,7 @@ TEST_REGISTRY = {
"cross_tool_comprehensive": CrossToolComprehensiveTest,
"line_number_validation": LineNumberValidationTest,
"logs_validation": LogsValidationTest,
"redis_validation": RedisValidationTest,
# "redis_validation": RedisValidationTest, # Removed - no longer needed for standalone server
"model_thinking_config": TestModelThinkingConfig,
"o3_model_selection": O3ModelSelectionTest,
"ollama_custom_url": OllamaCustomUrlTest,
@@ -72,7 +73,7 @@ __all__ = [
"CrossToolComprehensiveTest",
"LineNumberValidationTest",
"LogsValidationTest",
"RedisValidationTest",
# "RedisValidationTest", # Removed - no longer needed for standalone server
"TestModelThinkingConfig",
"O3ModelSelectionTest",
"O3ProExpensiveTest",

View File

@@ -11,6 +11,8 @@ import os
import subprocess
from typing import Optional
from .log_utils import LogUtils
class BaseSimulatorTest:
"""Base class for all communication simulator tests"""
@@ -19,14 +21,25 @@ class BaseSimulatorTest:
self.verbose = verbose
self.test_files = {}
self.test_dir = None
self.container_name = "zen-mcp-server"
self.redis_container = "zen-mcp-redis"
self.python_path = self._get_python_path()
# Configure logging
log_level = logging.DEBUG if verbose else logging.INFO
logging.basicConfig(level=log_level, format="%(asctime)s - %(levelname)s - %(message)s")
self.logger = logging.getLogger(self.__class__.__name__)
def _get_python_path(self) -> str:
"""Get the Python path for the virtual environment"""
current_dir = os.getcwd()
venv_python = os.path.join(current_dir, ".zen_venv", "bin", "python")
if os.path.exists(venv_python):
return venv_python
# Fallback to system python if venv doesn't exist
self.logger.warning("Virtual environment not found, using system python")
return "python"
def setup_test_files(self):
"""Create test files for the simulation"""
# Test Python file
@@ -100,7 +113,7 @@ class Calculator:
self.logger.debug(f"Created test files with absolute paths: {list(self.test_files.values())}")
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
"""Call an MCP tool via Claude CLI (docker exec)"""
"""Call an MCP tool via standalone server"""
try:
# Prepare the MCP initialization and tool call sequence
init_request = {
@@ -131,8 +144,8 @@ class Calculator:
# Join with newlines as MCP expects
input_data = "\n".join(messages) + "\n"
# Simulate Claude CLI calling the MCP server via docker exec
docker_cmd = ["docker", "exec", "-i", self.container_name, "python", "server.py"]
# Call the standalone MCP server directly
server_cmd = [self.python_path, "server.py"]
self.logger.debug(f"Calling MCP tool {tool_name} with proper initialization")
@@ -140,7 +153,7 @@ class Calculator:
# For consensus tool and other long-running tools, we need to ensure
# the subprocess doesn't close prematurely
result = subprocess.run(
docker_cmd,
server_cmd,
input=input_data,
text=True,
capture_output=True,
@@ -149,7 +162,7 @@ class Calculator:
)
if result.returncode != 0:
self.logger.error(f"Docker exec failed with return code {result.returncode}")
self.logger.error(f"Standalone server failed with return code {result.returncode}")
self.logger.error(f"Stderr: {result.stderr}")
# Still try to parse stdout as the response might have been written before the error
self.logger.debug(f"Attempting to parse stdout despite error: {result.stdout[:500]}")
@@ -263,6 +276,56 @@ class Calculator:
shutil.rmtree(self.test_dir)
self.logger.debug(f"Removed test files directory: {self.test_dir}")
# ============================================================================
# Log Utility Methods (delegate to LogUtils)
# ============================================================================
def get_server_logs_since(self, since_time: Optional[str] = None) -> str:
"""Get server logs from both main and activity log files."""
return LogUtils.get_server_logs_since(since_time)
def get_recent_server_logs(self, lines: int = 500) -> str:
"""Get recent server logs from the main log file."""
return LogUtils.get_recent_server_logs(lines)
def get_server_logs_subprocess(self, lines: int = 500) -> str:
"""Get server logs using subprocess (alternative method)."""
return LogUtils.get_server_logs_subprocess(lines)
def check_server_logs_for_errors(self, lines: int = 500) -> list[str]:
"""Check server logs for error messages."""
return LogUtils.check_server_logs_for_errors(lines)
def extract_conversation_usage_logs(self, logs: str) -> list[dict[str, int]]:
"""Extract token budget calculation information from logs."""
return LogUtils.extract_conversation_usage_logs(logs)
def extract_conversation_token_usage(self, logs: str) -> list[int]:
"""Extract conversation token usage values from logs."""
return LogUtils.extract_conversation_token_usage(logs)
def extract_thread_creation_logs(self, logs: str) -> list[dict[str, str]]:
"""Extract thread creation logs with parent relationships."""
return LogUtils.extract_thread_creation_logs(logs)
def extract_history_traversal_logs(self, logs: str) -> list[dict[str, any]]:
"""Extract conversation history traversal logs."""
return LogUtils.extract_history_traversal_logs(logs)
def validate_file_deduplication_in_logs(self, logs: str, tool_name: str, test_file: str) -> bool:
"""Validate that logs show file deduplication behavior."""
return LogUtils.validate_file_deduplication_in_logs(logs, tool_name, test_file)
def search_logs_for_pattern(
self, pattern: str, logs: Optional[str] = None, case_sensitive: bool = False
) -> list[str]:
"""Search logs for a specific pattern."""
return LogUtils.search_logs_for_pattern(pattern, logs, case_sensitive)
def get_log_file_info(self) -> dict[str, dict[str, any]]:
"""Get information about log files."""
return LogUtils.get_log_file_info()
def run_test(self) -> bool:
"""Run the test - to be implemented by subclasses"""
raise NotImplementedError("Subclasses must implement run_test()")

View File

@@ -0,0 +1,216 @@
#!/usr/bin/env python3
"""
Conversation Base Test Class for In-Process MCP Tool Testing
This class enables testing MCP tools within the same process to maintain conversation
memory state across tool calls. Unlike BaseSimulatorTest which runs each tool call
as a separate subprocess (losing memory state), this class calls tools directly
in-process, allowing conversation functionality to work correctly.
USAGE:
- Inherit from ConversationBaseTest instead of BaseSimulatorTest for conversation tests
- Use call_mcp_tool_direct() to call tools in-process
- Conversation memory persists across tool calls within the same test
- setUp() clears memory between test methods for proper isolation
EXAMPLE:
class TestConversationFeature(ConversationBaseTest):
def test_cross_tool_continuation(self):
# Step 1: Call precommit tool
result1, continuation_id = self.call_mcp_tool_direct("precommit", {
"path": "/path/to/repo",
"prompt": "Review these changes"
})
# Step 2: Continue with codereview tool - memory is preserved!
result2, _ = self.call_mcp_tool_direct("codereview", {
"files": ["/path/to/file.py"],
"prompt": "Focus on security issues",
"continuation_id": continuation_id
})
"""
import asyncio
import json
from typing import Optional
from .base_test import BaseSimulatorTest
class ConversationBaseTest(BaseSimulatorTest):
    """Base class for conversation tests that require in-process tool calling.

    Unlike BaseSimulatorTest, which runs each tool call as a separate server
    subprocess (losing in-memory conversation state between calls), this class
    executes tools directly in the current process so conversation memory
    persists across call_mcp_tool_direct() calls within a single test.
    """

    def __init__(self, verbose: bool = False):
        super().__init__(verbose)
        self._tools = None  # lazily imported from server.py in setUp()
        self._loop = None  # cached asyncio event loop for tool execution

    def setUp(self):
        """Set up the test environment; clears conversation memory between tests."""
        super().setup_test_files()

        # Clear conversation memory for test isolation
        self._clear_conversation_memory()

        # Import tools from server.py for in-process calling (once per instance)
        if self._tools is None:
            self._import_tools()

    def _clear_conversation_memory(self):
        """Clear all conversation memory to ensure test isolation.

        Best-effort: failures are logged as warnings rather than raised, so a
        missing or alternate storage backend does not abort the test.
        """
        try:
            from utils.storage_backend import get_storage_backend

            storage = get_storage_backend()
            # Clear all stored conversation threads.
            # NOTE(review): reaches into the backend's private _lock/_store;
            # assumes the in-memory storage implementation -- confirm if the
            # backend ever changes.
            with storage._lock:
                storage._store.clear()
            self.logger.debug("Cleared conversation memory for test isolation")
        except Exception as e:
            self.logger.warning(f"Could not clear conversation memory: {e}")

    def _import_tools(self):
        """Import tools from server.py for direct calling.

        Raises:
            RuntimeError: if server.py (or one of its dependencies) cannot be
                imported; the original ImportError is chained as the cause.
        """
        try:
            import os
            import sys

            # Add project root to Python path if not already there
            project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            if project_root not in sys.path:
                sys.path.insert(0, project_root)

            # Import tools from server
            from server import TOOLS

            self._tools = TOOLS
            self.logger.debug(f"Imported {len(self._tools)} tools for in-process testing")
        except ImportError as e:
            # Chain the cause so the underlying import failure stays visible.
            raise RuntimeError(f"Could not import tools from server.py: {e}") from e

    def _get_event_loop(self):
        """Get or create the event loop used for async tool execution.

        The loop is cached so every tool call in a test shares one loop.
        """
        if self._loop is None:
            try:
                self._loop = asyncio.get_event_loop()
            except RuntimeError:
                # No loop in this thread: create one and install it.
                self._loop = asyncio.new_event_loop()
                asyncio.set_event_loop(self._loop)
        return self._loop

    def call_mcp_tool_direct(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
        """Call an MCP tool directly in-process without subprocess isolation.

        This method maintains conversation memory across calls, enabling proper
        testing of conversation functionality.

        Args:
            tool_name: Name of the tool to call (e.g., "precommit", "codereview")
            params: Parameters to pass to the tool

        Returns:
            tuple: (response_content, continuation_id) where continuation_id
                can be used for follow-up calls; (None, None) on failure.
        """
        if self._tools is None:
            raise RuntimeError("Tools not imported. Call setUp() first.")

        if tool_name not in self._tools:
            raise ValueError(f"Tool '{tool_name}' not found. Available: {list(self._tools.keys())}")

        try:
            tool = self._tools[tool_name]
            self.logger.debug(f"Calling tool '{tool_name}' directly in-process")

            # Set up minimal model context if not provided
            if "model" not in params:
                params["model"] = "flash"  # Use fast model for testing

            # Execute tool directly using asyncio
            loop = self._get_event_loop()

            # Import required modules for model resolution (similar to server.py)
            from config import DEFAULT_MODEL
            from providers.registry import ModelProviderRegistry
            from utils.model_context import ModelContext

            # Resolve model (simplified version of server.py logic)
            model_name = params.get("model", DEFAULT_MODEL)
            provider = ModelProviderRegistry.get_provider_for_model(model_name)
            if not provider:
                # Fallback to any available model so tests run regardless of config
                available_models = list(ModelProviderRegistry.get_available_models(respect_restrictions=True).keys())
                if available_models:
                    model_name = available_models[0]
                    params["model"] = model_name
                    self.logger.debug(f"Using fallback model for testing: {model_name}")

            # Create model context and pass it the way server.py would
            model_context = ModelContext(model_name)
            params["_model_context"] = model_context
            params["_resolved_model_name"] = model_name

            # Execute tool asynchronously
            result = loop.run_until_complete(tool.execute(params))

            if not result or len(result) == 0:
                return None, None

            # Extract response content
            response_text = result[0].text if hasattr(result[0], "text") else str(result[0])

            # Parse response to extract continuation_id
            continuation_id = self._extract_continuation_id_from_response(response_text)

            self.logger.debug(f"Tool '{tool_name}' completed successfully in-process")
            return response_text, continuation_id

        except Exception as e:
            # Best-effort: report failure via the (None, None) contract
            self.logger.error(f"Direct tool call failed for '{tool_name}': {e}")
            return None, None

    def _extract_continuation_id_from_response(self, response_text: str) -> Optional[str]:
        """Extract a continuation_id from a tool's JSON response, if present.

        Returns None when the response is not JSON or carries no continuation
        metadata.
        """
        try:
            # Parse the response as JSON to look for continuation metadata
            response_data = json.loads(response_text)

            # Look for continuation_id in various places
            if isinstance(response_data, dict):
                # Check metadata
                metadata = response_data.get("metadata", {})
                if "thread_id" in metadata:
                    return metadata["thread_id"]

                # Check continuation_offer
                continuation_offer = response_data.get("continuation_offer", {})
                if continuation_offer and "continuation_id" in continuation_offer:
                    return continuation_offer["continuation_id"]

                # Check follow_up_request
                follow_up = response_data.get("follow_up_request", {})
                if follow_up and "continuation_id" in follow_up:
                    return follow_up["continuation_id"]

            return None

        except (json.JSONDecodeError, AttributeError):
            # If response is not JSON or doesn't have expected structure, return None
            return None

    def tearDown(self):
        """Clean up after the test: remove test files and clear memory again."""
        super().cleanup_test_files()
        # Clear memory again for good measure
        self._clear_conversation_memory()

    @property
    def test_name(self) -> str:
        """Get the test name."""
        return self.__class__.__name__

    @property
    def test_description(self) -> str:
        """Get the test description."""
        return "In-process conversation test"

View File

@@ -0,0 +1,316 @@
"""
Centralized log utility for simulator tests.
This module provides common log reading and parsing functionality
used across multiple simulator test files to reduce code duplication.
"""
import logging
import re
import subprocess
from typing import Optional, Union
class LogUtils:
    """Centralized logging utilities for simulator tests.

    All methods are classmethods operating either on log text passed in or on
    the log files named below (paths relative to the repo root tests run from).
    """

    # Log file paths
    MAIN_LOG_FILE = "logs/mcp_server.log"
    ACTIVITY_LOG_FILE = "logs/mcp_activity.log"

    @classmethod
    def get_server_logs_since(cls, since_time: Optional[str] = None) -> str:
        """
        Get server logs from both main and activity log files.

        Args:
            since_time: Currently ignored, returns all available logs

        Returns:
            Combined logs from both log files (empty string on failure)
        """
        try:
            main_logs = ""
            activity_logs = ""

            # Read main server log (a missing file is not an error)
            try:
                with open(cls.MAIN_LOG_FILE) as f:
                    main_logs = f.read()
            except FileNotFoundError:
                pass

            # Read activity log
            try:
                with open(cls.ACTIVITY_LOG_FILE) as f:
                    activity_logs = f.read()
            except FileNotFoundError:
                pass

            return main_logs + "\n" + activity_logs
        except Exception as e:
            logging.warning(f"Failed to read server logs: {e}")
            return ""

    @classmethod
    def get_recent_server_logs(cls, lines: int = 500) -> str:
        """
        Get recent server logs from the main log file.

        Args:
            lines: Number of recent lines to retrieve (default: 500)

        Returns:
            Recent log content as string (empty string if unavailable)
        """
        try:
            with open(cls.MAIN_LOG_FILE) as f:
                all_lines = f.readlines()
                recent_lines = all_lines[-lines:] if len(all_lines) > lines else all_lines
                return "".join(recent_lines)
        except FileNotFoundError:
            logging.warning(f"Log file {cls.MAIN_LOG_FILE} not found")
            return ""
        except Exception as e:
            logging.warning(f"Failed to read recent server logs: {e}")
            return ""

    @classmethod
    def get_server_logs_subprocess(cls, lines: int = 500) -> str:
        """
        Get server logs using subprocess (alternative method using `tail`).

        Args:
            lines: Number of recent lines to retrieve

        Returns:
            Recent log content as string (stdout + stderr of `tail`)
        """
        try:
            result = subprocess.run(
                ["tail", "-n", str(lines), cls.MAIN_LOG_FILE], capture_output=True, text=True, timeout=10
            )
            return result.stdout + result.stderr
        except Exception as e:
            logging.warning(f"Failed to get server logs via subprocess: {e}")
            return ""

    @classmethod
    def check_server_logs_for_errors(cls, lines: int = 500) -> list[str]:
        """
        Check server logs for error messages.

        Args:
            lines: Number of recent lines to check

        Returns:
            List of log lines that match any error pattern
        """
        logs = cls.get_recent_server_logs(lines)
        error_patterns = [r"ERROR.*", r"CRITICAL.*", r"Failed.*", r"Exception.*", r"Error:.*"]

        errors = []
        for line in logs.split("\n"):
            for pattern in error_patterns:
                if re.search(pattern, line, re.IGNORECASE):
                    errors.append(line.strip())
                    break  # one match per line is enough

        return errors

    @classmethod
    def extract_conversation_usage_logs(cls, logs: str) -> list[dict[str, int]]:
        """
        Extract token budget calculation information from logs.

        Args:
            logs: Log content to parse

        Returns:
            List of dictionaries containing token usage data (keys present
            only when the corresponding value appears on the log line)
        """
        usage_data = []
        pattern = r"\[CONVERSATION_DEBUG\] Token budget calculation:"

        for line in logs.split("\n"):
            if re.search(pattern, line):
                # Parse the token usage information from the same line;
                # numbers may contain thousands separators (e.g. "1,234")
                usage_info = {}

                # Extract total capacity
                capacity_match = re.search(r"Total capacity: ([\d,]+)", line)
                if capacity_match:
                    usage_info["total_capacity"] = int(capacity_match.group(1).replace(",", ""))

                # Extract content allocation
                content_match = re.search(r"Content allocation: ([\d,]+)", line)
                if content_match:
                    usage_info["content_allocation"] = int(content_match.group(1).replace(",", ""))

                # Extract conversation tokens
                conv_match = re.search(r"Conversation tokens: ([\d,]+)", line)
                if conv_match:
                    usage_info["conversation_tokens"] = int(conv_match.group(1).replace(",", ""))

                # Extract remaining tokens
                remaining_match = re.search(r"Remaining tokens: ([\d,]+)", line)
                if remaining_match:
                    usage_info["remaining_tokens"] = int(remaining_match.group(1).replace(",", ""))

                if usage_info:
                    usage_data.append(usage_info)

        return usage_data

    @classmethod
    def extract_conversation_token_usage(cls, logs: str) -> list[int]:
        """
        Extract conversation token usage values from logs.

        Args:
            logs: Log content to parse

        Returns:
            List of token usage values (thousands separators stripped)
        """
        pattern = r"Conversation history token usage:\s*([\d,]+)"
        usage_values = []

        for match in re.finditer(pattern, logs):
            usage_value = int(match.group(1).replace(",", ""))
            usage_values.append(usage_value)

        return usage_values

    @classmethod
    def extract_thread_creation_logs(cls, logs: str) -> list[dict[str, str]]:
        """
        Extract thread creation logs with parent relationships.

        Args:
            logs: Log content to parse

        Returns:
            List of dicts with "thread_id" and "parent_id" (None if no parent)
        """
        thread_data = []
        # Thread IDs are UUIDs containing hyphens, so [\w-]+ is required;
        # \w+ alone would truncate the ID at the first hyphen and miss the
        # "with parent" clause entirely.
        pattern = r"\[THREAD\] Created new thread ([\w-]+?)(?: with parent ([\w-]+))?(?=\s|$)"

        for match in re.finditer(pattern, logs):
            thread_info = {"thread_id": match.group(1), "parent_id": match.group(2) if match.group(2) else None}
            thread_data.append(thread_info)

        return thread_data

    @classmethod
    def extract_history_traversal_logs(cls, logs: str) -> list[dict[str, Union[str, int]]]:
        """
        Extract conversation history traversal logs.

        Args:
            logs: Log content to parse

        Returns:
            List of dicts with "chain_length" (int) and "thread_id" (str)
        """
        traversal_data = []
        # [\w-]+ so hyphenated UUID thread IDs are captured in full
        pattern = r"\[THREAD\] Retrieved chain of (\d+) messages for thread ([\w-]+)"

        for match in re.finditer(pattern, logs):
            traversal_info = {"chain_length": int(match.group(1)), "thread_id": match.group(2)}
            traversal_data.append(traversal_info)

        return traversal_data

    @classmethod
    def validate_file_deduplication_in_logs(cls, logs: str, tool_name: str, test_file: str) -> bool:
        """
        Validate that logs show file deduplication behavior.

        Args:
            logs: Log content to parse
            tool_name: Name of the tool being tested
            test_file: Name of the test file to check for deduplication

        Returns:
            True if deduplication evidence is found, False otherwise
        """
        # Escape the interpolated values: filenames contain "." (regex "any
        # char"), so unescaped patterns would also match unrelated names.
        escaped_file = re.escape(test_file)

        # Look for embedding calculation
        embedding_pattern = f"Calculating embeddings for {escaped_file}"
        has_embedding = bool(re.search(embedding_pattern, logs))

        # Look for filtering message
        filtering_pattern = f"Filtering {escaped_file} to prevent duplication"
        has_filtering = bool(re.search(filtering_pattern, logs))

        # Look for skip message
        skip_pattern = f"Skipping {escaped_file} \\(already processed"
        has_skip = bool(re.search(skip_pattern, logs))

        # Look for tool-specific processing
        tool_pattern = f"\\[{re.escape(tool_name.upper())}\\].*{escaped_file}"
        has_tool_processing = bool(re.search(tool_pattern, logs, re.IGNORECASE))

        # Deduplication is confirmed if we see evidence of processing and filtering/skipping
        return has_embedding and (has_filtering or has_skip) and has_tool_processing

    @classmethod
    def search_logs_for_pattern(
        cls, pattern: str, logs: Optional[str] = None, case_sensitive: bool = False
    ) -> list[str]:
        """
        Search logs for a specific pattern.

        Args:
            pattern: Regex pattern to search for
            logs: Log content to search (if None, reads recent logs)
            case_sensitive: Whether the search should be case sensitive

        Returns:
            List of matching lines (stripped)
        """
        if logs is None:
            logs = cls.get_recent_server_logs()

        flags = 0 if case_sensitive else re.IGNORECASE
        matches = []

        for line in logs.split("\n"):
            if re.search(pattern, line, flags):
                matches.append(line.strip())

        return matches

    @classmethod
    def get_log_file_info(cls) -> dict[str, dict[str, Union[str, int, bool]]]:
        """
        Get information about log files.

        Returns:
            Dictionary keyed by log file path with existence/size/mtime info
        """
        import os  # local import: os is not used elsewhere in this module

        file_info = {}

        for log_file in [cls.MAIN_LOG_FILE, cls.ACTIVITY_LOG_FILE]:
            if os.path.exists(log_file):
                stat = os.stat(log_file)
                file_info[log_file] = {
                    "exists": True,
                    "size_bytes": stat.st_size,
                    "size_mb": round(stat.st_size / (1024 * 1024), 2),
                    "last_modified": stat.st_mtime,
                    "readable": os.access(log_file, os.R_OK),
                }
            else:
                file_info[log_file] = {
                    "exists": False,
                    "size_bytes": 0,
                    "size_mb": 0,
                    "last_modified": 0,
                    "readable": False,
                }

        return file_info

View File

@@ -7,7 +7,6 @@ and builds conversation context correctly when using continuation_id.
"""
import json
import subprocess
from .base_test import BaseSimulatorTest
@@ -23,19 +22,16 @@ class TestConsensusConversation(BaseSimulatorTest):
def test_description(self) -> str:
return "Test consensus tool conversation building and continuation"
def get_docker_logs(self):
"""Get Docker container logs"""
def get_server_logs(self):
"""Get server logs from local log file"""
try:
result = subprocess.run(
["docker", "logs", "--tail", "100", self.container_name], capture_output=True, text=True, timeout=30
)
if result.returncode == 0:
return result.stdout.split("\n")
else:
self.logger.warning(f"Failed to get Docker logs: {result.stderr}")
return []
log_file_path = "logs/mcp_server.log"
with open(log_file_path) as f:
lines = f.readlines()
# Return last 100 lines
return [line.strip() for line in lines[-100:]]
except Exception as e:
self.logger.warning(f"Exception getting Docker logs: {e}")
self.logger.warning(f"Exception getting server logs: {e}")
return []
def run_test(self) -> bool:
@@ -121,9 +117,9 @@ class TestConsensusConversation(BaseSimulatorTest):
self.logger.info("Phase 3: Checking server logs for conversation building")
# Check for conversation-related log entries
logs = self.get_docker_logs()
logs = self.get_server_logs()
if not logs:
self.logger.warning("Could not retrieve Docker logs for verification")
self.logger.warning("Could not retrieve server logs for verification")
else:
# Look for conversation building indicators
conversation_logs = [

View File

@@ -22,42 +22,6 @@ class ContentValidationTest(BaseSimulatorTest):
def test_description(self) -> str:
return "Content validation and duplicate detection"
def get_docker_logs_since(self, since_time: str) -> str:
"""Get docker logs since a specific timestamp"""
try:
# Check both main server and log monitor for comprehensive logs
cmd_server = ["docker", "logs", "--since", since_time, self.container_name]
cmd_monitor = ["docker", "logs", "--since", since_time, "zen-mcp-log-monitor"]
import subprocess
result_server = subprocess.run(cmd_server, capture_output=True, text=True)
result_monitor = subprocess.run(cmd_monitor, capture_output=True, text=True)
# Get the internal log files which have more detailed logging
server_log_result = subprocess.run(
["docker", "exec", self.container_name, "cat", "/tmp/mcp_server.log"], capture_output=True, text=True
)
activity_log_result = subprocess.run(
["docker", "exec", self.container_name, "cat", "/tmp/mcp_activity.log"], capture_output=True, text=True
)
# Combine all logs
combined_logs = (
result_server.stdout
+ "\n"
+ result_monitor.stdout
+ "\n"
+ server_log_result.stdout
+ "\n"
+ activity_log_result.stdout
)
return combined_logs
except Exception as e:
self.logger.error(f"Failed to get docker logs: {e}")
return ""
def run_test(self) -> bool:
"""Test that file processing system properly handles file deduplication"""
try:
@@ -151,9 +115,9 @@ DATABASE_CONFIG = {
else:
self.logger.warning(" ⚠️ Different tool failed")
# Validate file processing behavior from Docker logs
# Validate file processing behavior from server logs
self.logger.info(" 4: Validating file processing logs")
logs = self.get_docker_logs_since(start_time)
logs = self.get_server_logs_since(start_time)
# Check for proper file embedding logs
embedding_logs = [

View File

@@ -21,8 +21,6 @@ This validates the conversation threading system's ability to:
- Properly traverse parent relationships for history reconstruction
"""
import re
import subprocess
from .base_test import BaseSimulatorTest
@@ -38,53 +36,6 @@ class ConversationChainValidationTest(BaseSimulatorTest):
def test_description(self) -> str:
return "Conversation chain and threading validation"
def get_recent_server_logs(self) -> str:
"""Get recent server logs from the log file directly"""
try:
cmd = ["docker", "exec", self.container_name, "tail", "-n", "500", "/tmp/mcp_server.log"]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode == 0:
return result.stdout
else:
self.logger.warning(f"Failed to read server logs: {result.stderr}")
return ""
except Exception as e:
self.logger.error(f"Failed to get server logs: {e}")
return ""
def extract_thread_creation_logs(self, logs: str) -> list[dict[str, str]]:
"""Extract thread creation logs with parent relationships"""
thread_logs = []
lines = logs.split("\n")
for line in lines:
if "[THREAD] Created new thread" in line:
# Parse: [THREAD] Created new thread 9dc779eb-645f-4850-9659-34c0e6978d73 with parent a0ce754d-c995-4b3e-9103-88af429455aa
match = re.search(r"\[THREAD\] Created new thread ([a-f0-9-]+) with parent ([a-f0-9-]+|None)", line)
if match:
thread_id = match.group(1)
parent_id = match.group(2) if match.group(2) != "None" else None
thread_logs.append({"thread_id": thread_id, "parent_id": parent_id, "log_line": line})
return thread_logs
def extract_history_traversal_logs(self, logs: str) -> list[dict[str, str]]:
"""Extract conversation history traversal logs"""
traversal_logs = []
lines = logs.split("\n")
for line in lines:
if "[THREAD] Retrieved chain of" in line:
# Parse: [THREAD] Retrieved chain of 3 threads for 9dc779eb-645f-4850-9659-34c0e6978d73
match = re.search(r"\[THREAD\] Retrieved chain of (\d+) threads for ([a-f0-9-]+)", line)
if match:
chain_length = int(match.group(1))
thread_id = match.group(2)
traversal_logs.append({"thread_id": thread_id, "chain_length": chain_length, "log_line": line})
return traversal_logs
def run_test(self) -> bool:
"""Test conversation chain and threading functionality"""
try:

View File

@@ -12,7 +12,6 @@ Validates:
5. Proper tool chaining with context
"""
import subprocess
from .base_test import BaseSimulatorTest
@@ -28,40 +27,6 @@ class CrossToolComprehensiveTest(BaseSimulatorTest):
def test_description(self) -> str:
return "Comprehensive cross-tool file deduplication and continuation"
def get_docker_logs_since(self, since_time: str) -> str:
"""Get docker logs since a specific timestamp"""
try:
# Check both main server and log monitor for comprehensive logs
cmd_server = ["docker", "logs", "--since", since_time, self.container_name]
cmd_monitor = ["docker", "logs", "--since", since_time, "zen-mcp-log-monitor"]
result_server = subprocess.run(cmd_server, capture_output=True, text=True)
result_monitor = subprocess.run(cmd_monitor, capture_output=True, text=True)
# Get the internal log files which have more detailed logging
server_log_result = subprocess.run(
["docker", "exec", self.container_name, "cat", "/tmp/mcp_server.log"], capture_output=True, text=True
)
activity_log_result = subprocess.run(
["docker", "exec", self.container_name, "cat", "/tmp/mcp_activity.log"], capture_output=True, text=True
)
# Combine all logs
combined_logs = (
result_server.stdout
+ "\n"
+ result_monitor.stdout
+ "\n"
+ server_log_result.stdout
+ "\n"
+ activity_log_result.stdout
)
return combined_logs
except Exception as e:
self.logger.error(f"Failed to get docker logs: {e}")
return ""
def run_test(self) -> bool:
"""Comprehensive cross-tool test with all MCP tools"""
try:
@@ -247,7 +212,7 @@ def secure_login(user, pwd):
# Validate comprehensive results
self.logger.info(" 📋 Validating comprehensive cross-tool results...")
logs = self.get_docker_logs_since(start_time)
logs = self.get_server_logs_since(start_time)
# Validation criteria
tools_used = [r[0] for r in responses]

View File

@@ -6,10 +6,10 @@ Tests comprehensive cross-tool continuation scenarios to ensure
conversation context is maintained when switching between different tools.
"""
from .base_test import BaseSimulatorTest
from .conversation_base_test import ConversationBaseTest
class CrossToolContinuationTest(BaseSimulatorTest):
class CrossToolContinuationTest(ConversationBaseTest):
"""Test comprehensive cross-tool continuation scenarios"""
@property
@@ -25,8 +25,8 @@ class CrossToolContinuationTest(BaseSimulatorTest):
try:
self.logger.info("🔧 Test: Cross-tool continuation scenarios")
# Setup test files
self.setup_test_files()
# Setup test environment for conversation testing
self.setUp()
success_count = 0
total_scenarios = 3
@@ -62,7 +62,7 @@ class CrossToolContinuationTest(BaseSimulatorTest):
self.logger.info(" 1: Testing chat -> thinkdeep -> codereview")
# Start with chat
chat_response, chat_id = self.call_mcp_tool(
chat_response, chat_id = self.call_mcp_tool_direct(
"chat",
{
"prompt": "Please use low thinking mode. Look at this Python code and tell me what you think about it",
@@ -76,7 +76,7 @@ class CrossToolContinuationTest(BaseSimulatorTest):
return False
# Continue with thinkdeep
thinkdeep_response, _ = self.call_mcp_tool(
thinkdeep_response, _ = self.call_mcp_tool_direct(
"thinkdeep",
{
"prompt": "Please use low thinking mode. Think deeply about potential performance issues in this code",
@@ -91,7 +91,7 @@ class CrossToolContinuationTest(BaseSimulatorTest):
return False
# Continue with codereview
codereview_response, _ = self.call_mcp_tool(
codereview_response, _ = self.call_mcp_tool_direct(
"codereview",
{
"files": [self.test_files["python"]], # Same file should be deduplicated
@@ -118,8 +118,13 @@ class CrossToolContinuationTest(BaseSimulatorTest):
self.logger.info(" 2: Testing analyze -> debug -> thinkdeep")
# Start with analyze
analyze_response, analyze_id = self.call_mcp_tool(
"analyze", {"files": [self.test_files["python"]], "analysis_type": "code_quality", "model": "flash"}
analyze_response, analyze_id = self.call_mcp_tool_direct(
"analyze",
{
"files": [self.test_files["python"]],
"prompt": "Analyze this code for quality and performance issues",
"model": "flash",
},
)
if not analyze_response or not analyze_id:
@@ -127,7 +132,7 @@ class CrossToolContinuationTest(BaseSimulatorTest):
return False
# Continue with debug
debug_response, _ = self.call_mcp_tool(
debug_response, _ = self.call_mcp_tool_direct(
"debug",
{
"files": [self.test_files["python"]], # Same file should be deduplicated
@@ -142,7 +147,7 @@ class CrossToolContinuationTest(BaseSimulatorTest):
return False
# Continue with thinkdeep
final_response, _ = self.call_mcp_tool(
final_response, _ = self.call_mcp_tool_direct(
"thinkdeep",
{
"prompt": "Please use low thinking mode. Think deeply about the architectural implications of the issues we've found",
@@ -169,7 +174,7 @@ class CrossToolContinuationTest(BaseSimulatorTest):
self.logger.info(" 3: Testing multi-file cross-tool continuation")
# Start with both files
multi_response, multi_id = self.call_mcp_tool(
multi_response, multi_id = self.call_mcp_tool_direct(
"chat",
{
"prompt": "Please use low thinking mode. Analyze both the Python code and configuration file",
@@ -183,7 +188,7 @@ class CrossToolContinuationTest(BaseSimulatorTest):
return False
# Switch to codereview with same files (should use conversation history)
multi_review, _ = self.call_mcp_tool(
multi_review, _ = self.call_mcp_tool_direct(
"codereview",
{
"files": [self.test_files["python"], self.test_files["config"]], # Same files

View File

@@ -378,35 +378,28 @@ The code looks correct to me, but something is causing valid sessions to be trea
# Validate logs
self.logger.info(" 📋 Validating execution logs...")
# Get server logs from the actual log file inside the container
result = self.run_command(
["docker", "exec", self.container_name, "tail", "-500", "/tmp/mcp_server.log"], capture_output=True
)
# Get server logs using inherited method
logs = self.get_recent_server_logs(500)
if result.returncode == 0:
logs = result.stdout.decode() + result.stderr.decode()
# Look for debug tool execution patterns
debug_patterns = [
"debug tool",
"[DEBUG]",
"systematic investigation",
"Token budget",
"Essential files for debugging",
]
# Look for debug tool execution patterns
debug_patterns = [
"debug tool",
"[DEBUG]",
"systematic investigation",
"Token budget",
"Essential files for debugging",
]
patterns_found = 0
for pattern in debug_patterns:
if pattern in logs:
patterns_found += 1
self.logger.debug(f" ✅ Found log pattern: {pattern}")
patterns_found = 0
for pattern in debug_patterns:
if pattern in logs:
patterns_found += 1
self.logger.debug(f" ✅ Found log pattern: {pattern}")
if patterns_found >= 3:
self.logger.info(f" ✅ Log validation passed ({patterns_found}/{len(debug_patterns)} patterns)")
else:
self.logger.warning(f" ⚠️ Only found {patterns_found}/{len(debug_patterns)} log patterns")
if patterns_found >= 3:
self.logger.info(f" ✅ Log validation passed ({patterns_found}/{len(debug_patterns)} patterns)")
else:
self.logger.warning(" ⚠️ Could not retrieve Docker logs")
self.logger.warning(f" ⚠️ Only found {patterns_found}/{len(debug_patterns)} log patterns")
# Test continuation if available
if continuation_id:

View File

@@ -145,14 +145,16 @@ def validate_data(data):
# Test 4: Validate log patterns
self.logger.info(" 1.4: Validating line number processing in logs")
# Get logs from container
result = self.run_command(
["docker", "exec", self.container_name, "tail", "-500", "/tmp/mcp_server.log"], capture_output=True
)
logs = ""
if result.returncode == 0:
logs = result.stdout.decode()
# Get logs from server
try:
log_file_path = "logs/mcp_server.log"
with open(log_file_path) as f:
lines = f.readlines()
logs = "".join(lines[-500:])
except Exception as e:
self.logger.error(f"Failed to read server logs: {e}")
logs = ""
pass
# Check for line number formatting patterns
line_number_patterns = ["Line numbers for", "enabled", "", "line number"] # The line number separator

View File

@@ -1,8 +1,8 @@
#!/usr/bin/env python3
"""
Docker Logs Validation Test
Server Logs Validation Test
Validates Docker logs to confirm file deduplication behavior and
Validates server logs to confirm file deduplication behavior and
conversation threading is working properly.
"""
@@ -10,7 +10,7 @@ from .base_test import BaseSimulatorTest
class LogsValidationTest(BaseSimulatorTest):
"""Validate Docker logs to confirm file deduplication behavior"""
"""Validate server logs to confirm file deduplication behavior"""
@property
def test_name(self) -> str:
@@ -18,39 +18,35 @@ class LogsValidationTest(BaseSimulatorTest):
@property
def test_description(self) -> str:
return "Docker logs validation"
return "Server logs validation"
def run_test(self) -> bool:
"""Validate Docker logs to confirm file deduplication behavior"""
"""Validate server logs to confirm file deduplication behavior"""
try:
self.logger.info("📋 Test: Validating Docker logs for file deduplication...")
self.logger.info("📋 Test: Validating server logs for file deduplication...")
# Get server logs from main container
result = self.run_command(["docker", "logs", self.container_name], capture_output=True)
# Get server logs from log files
import os
if result.returncode != 0:
self.logger.error(f"Failed to get Docker logs: {result.stderr}")
logs = ""
log_files = ["logs/mcp_server.log", "logs/mcp_activity.log"]
for log_file in log_files:
if os.path.exists(log_file):
try:
with open(log_file) as f:
file_content = f.read()
logs += f"\n=== {log_file} ===\n{file_content}\n"
self.logger.debug(f"Read {len(file_content)} characters from {log_file}")
except Exception as e:
self.logger.warning(f"Could not read {log_file}: {e}")
else:
self.logger.warning(f"Log file not found: {log_file}")
if not logs.strip():
self.logger.warning("No log content found - server may not have processed any requests yet")
return False
main_logs = result.stdout.decode() + result.stderr.decode()
# Get logs from log monitor container (where detailed activity is logged)
monitor_result = self.run_command(["docker", "logs", "zen-mcp-log-monitor"], capture_output=True)
monitor_logs = ""
if monitor_result.returncode == 0:
monitor_logs = monitor_result.stdout.decode() + monitor_result.stderr.decode()
# Also get activity logs for more detailed conversation tracking
activity_result = self.run_command(
["docker", "exec", self.container_name, "cat", "/tmp/mcp_activity.log"], capture_output=True
)
activity_logs = ""
if activity_result.returncode == 0:
activity_logs = activity_result.stdout.decode()
logs = main_logs + "\n" + monitor_logs + "\n" + activity_logs
# Look for conversation threading patterns that indicate the system is working
conversation_patterns = [
"CONVERSATION_RESUME",

View File

@@ -4,11 +4,10 @@ O3 Model Selection Test
Tests that O3 models are properly selected and used when explicitly specified,
regardless of the default model configuration (even when set to auto).
Validates model selection via Docker logs.
Validates model selection via server logs.
"""
import datetime
import subprocess
from .base_test import BaseSimulatorTest
@@ -24,47 +23,16 @@ class O3ModelSelectionTest(BaseSimulatorTest):
def test_description(self) -> str:
return "O3 model selection and usage validation"
def get_recent_server_logs(self) -> str:
"""Get recent server logs from the log file directly"""
try:
# Read logs directly from the log file - use more lines to ensure we get all test-related logs
cmd = ["docker", "exec", self.container_name, "tail", "-n", "500", "/tmp/mcp_server.log"]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode == 0:
return result.stdout
else:
self.logger.warning(f"Failed to read server logs: {result.stderr}")
return ""
except Exception as e:
self.logger.error(f"Failed to get server logs: {e}")
return ""
def run_test(self) -> bool:
"""Test O3 model selection and usage"""
try:
self.logger.info(" Test: O3 model selection and usage validation")
# Check which API keys are configured
check_cmd = [
"docker",
"exec",
self.container_name,
"python",
"-c",
'import os; print(f\'OPENAI_KEY:{bool(os.environ.get("OPENAI_API_KEY"))}|OPENROUTER_KEY:{bool(os.environ.get("OPENROUTER_API_KEY"))}\')',
]
result = subprocess.run(check_cmd, capture_output=True, text=True)
import os
has_openai = False
has_openrouter = False
if result.returncode == 0:
output = result.stdout.strip()
if "OPENAI_KEY:True" in output:
has_openai = True
if "OPENROUTER_KEY:True" in output:
has_openrouter = True
has_openai = bool(os.environ.get("OPENAI_API_KEY"))
has_openrouter = bool(os.environ.get("OPENROUTER_API_KEY"))
# If only OpenRouter is configured, adjust test expectations
if has_openrouter and not has_openai:

View File

@@ -9,7 +9,6 @@ Tests custom API endpoint functionality with Ollama-style local models, includin
- Model alias resolution for local models
"""
import subprocess
from .base_test import BaseSimulatorTest
@@ -30,14 +29,15 @@ class OllamaCustomUrlTest(BaseSimulatorTest):
try:
self.logger.info("Test: Ollama custom URL functionality")
# Check if custom URL is configured in the Docker container
custom_url = self._check_docker_custom_url()
# Check if custom URL is configured
import os
custom_url = os.environ.get("CUSTOM_API_URL")
if not custom_url:
self.logger.warning("CUSTOM_API_URL not set in Docker container, skipping Ollama test")
self.logger.warning("CUSTOM_API_URL not set, skipping Ollama test")
self.logger.info("To enable this test, add to .env file:")
self.logger.info("CUSTOM_API_URL=http://host.docker.internal:11434/v1")
self.logger.info("CUSTOM_API_URL=http://localhost:11434/v1")
self.logger.info("CUSTOM_API_KEY=")
self.logger.info("Then restart docker-compose")
return True # Skip gracefully
self.logger.info(f"Testing with custom URL: {custom_url}")
@@ -172,25 +172,6 @@ if __name__ == "__main__":
finally:
self.cleanup_test_files()
def _check_docker_custom_url(self) -> str:
"""Check if CUSTOM_API_URL is set in the Docker container"""
try:
result = subprocess.run(
["docker", "exec", self.container_name, "printenv", "CUSTOM_API_URL"],
capture_output=True,
text=True,
timeout=10,
)
if result.returncode == 0 and result.stdout.strip():
return result.stdout.strip()
return ""
except Exception as e:
self.logger.debug(f"Failed to check Docker CUSTOM_API_URL: {e}")
return ""
def validate_successful_response(self, response: str, test_name: str, files_provided: bool = False) -> bool:
"""Validate that the response indicates success, not an error
@@ -201,7 +182,7 @@ if __name__ == "__main__":
"""
if not response:
self.logger.error(f"No response received for {test_name}")
self._check_docker_logs_for_errors()
self._check_server_logs_for_errors()
return False
# Check for common error indicators
@@ -227,7 +208,7 @@ if __name__ == "__main__":
]
# Special handling for clarification requests from local models
if "clarification_required" in response.lower():
if "files_required_to_continue" in response.lower():
if files_provided:
# If we provided actual files, clarification request is a FAILURE
self.logger.error(
@@ -243,7 +224,7 @@ if __name__ == "__main__":
self.logger.debug(f"Clarification response: {response[:200]}...")
return True
# Check for SSRF security restriction - this is expected for local URLs from Docker
# Check for SSRF security restriction - this is expected for local URLs
if "restricted IP address" in response and "security risk (SSRF)" in response:
self.logger.info(
f"✅ Custom URL routing working - {test_name} correctly attempted to connect to custom API"
@@ -256,19 +237,19 @@ if __name__ == "__main__":
if error.lower() in response_lower:
self.logger.error(f"Error detected in {test_name}: {error}")
self.logger.debug(f"Full response: {response}")
self._check_docker_logs_for_errors()
self._check_server_logs_for_errors()
return False
# Response should be substantial (more than just a few words)
if len(response.strip()) < 10:
self.logger.error(f"Response too short for {test_name}: {response}")
self._check_docker_logs_for_errors()
self._check_server_logs_for_errors()
return False
# Verify this looks like a real AI response, not just an error message
if not self._validate_ai_response_content(response):
self.logger.error(f"Response doesn't look like valid AI output for {test_name}")
self._check_docker_logs_for_errors()
self._check_server_logs_for_errors()
return False
self.logger.debug(f"Successful response for {test_name}: {response[:100]}...")
@@ -329,24 +310,23 @@ if __name__ == "__main__":
return True
def _check_docker_logs_for_errors(self):
"""Check Docker logs for any error messages that might explain failures"""
def _check_server_logs_for_errors(self):
"""Check server logs for any error messages that might explain failures"""
try:
# Get recent logs from the container
result = subprocess.run(
["docker", "logs", "--tail", "50", self.container_name], capture_output=True, text=True, timeout=10
)
# Get recent logs from the log file
log_file_path = "logs/mcp_server.log"
with open(log_file_path) as f:
lines = f.readlines()
recent_logs = lines[-50:] # Last 50 lines
if result.returncode == 0 and result.stderr:
recent_logs = result.stderr.strip()
if recent_logs:
self.logger.info("Recent container logs:")
for line in recent_logs.split("\n")[-10:]: # Last 10 lines
if line.strip():
self.logger.info(f" {line}")
if recent_logs:
self.logger.info("Recent server logs:")
for line in recent_logs[-10:]: # Last 10 lines
if line.strip():
self.logger.info(f" {line.strip()}")
except Exception as e:
self.logger.debug(f"Failed to check Docker logs: {e}")
self.logger.debug(f"Failed to check server logs: {e}")
def validate_local_model_response(self, response: str) -> bool:
"""Validate that response appears to come from a local model"""

View File

@@ -8,7 +8,6 @@ Tests that verify the system correctly falls back to OpenRouter when:
- Auto mode correctly selects OpenRouter models
"""
import subprocess
from .base_test import BaseSimulatorTest
@@ -24,53 +23,28 @@ class OpenRouterFallbackTest(BaseSimulatorTest):
def test_description(self) -> str:
return "OpenRouter fallback behavior when only provider"
def get_recent_server_logs(self) -> str:
"""Get recent server logs from the log file directly"""
try:
cmd = ["docker", "exec", self.container_name, "tail", "-n", "300", "/tmp/mcp_server.log"]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode == 0:
return result.stdout
else:
self.logger.warning(f"Failed to read server logs: {result.stderr}")
return ""
except Exception as e:
self.logger.error(f"Failed to get server logs: {e}")
return ""
def run_test(self) -> bool:
"""Test OpenRouter fallback behavior"""
try:
self.logger.info("Test: OpenRouter fallback behavior when only provider available")
# Check if ONLY OpenRouter API key is configured (this is a fallback test)
check_cmd = [
"docker",
"exec",
self.container_name,
"python",
"-c",
'import os; print("OPENROUTER_KEY:" + str(bool(os.environ.get("OPENROUTER_API_KEY"))) + "|GEMINI_KEY:" + str(bool(os.environ.get("GEMINI_API_KEY"))) + "|OPENAI_KEY:" + str(bool(os.environ.get("OPENAI_API_KEY"))))',
]
result = subprocess.run(check_cmd, capture_output=True, text=True)
import os
if result.returncode == 0:
output = result.stdout.strip()
has_openrouter = "OPENROUTER_KEY:True" in output
has_gemini = "GEMINI_KEY:True" in output
has_openai = "OPENAI_KEY:True" in output
has_openrouter = bool(os.environ.get("OPENROUTER_API_KEY"))
has_gemini = bool(os.environ.get("GEMINI_API_KEY"))
has_openai = bool(os.environ.get("OPENAI_API_KEY"))
if not has_openrouter:
self.logger.info(" ⚠️ OpenRouter API key not configured - skipping test")
self.logger.info(" This test requires OPENROUTER_API_KEY to be set in .env")
return True # Return True to indicate test is skipped, not failed
if not has_openrouter:
self.logger.info(" ⚠️ OpenRouter API key not configured - skipping test")
self.logger.info(" This test requires OPENROUTER_API_KEY to be set in .env")
return True # Return True to indicate test is skipped, not failed
if has_gemini or has_openai:
self.logger.info(" ⚠️ Other API keys configured - this is not a fallback scenario")
self.logger.info(" This test requires ONLY OpenRouter to be configured (no Gemini/OpenAI keys)")
self.logger.info(" Current setup has multiple providers, so fallback behavior doesn't apply")
return True # Return True to indicate test is skipped, not failed
if has_gemini or has_openai:
self.logger.info(" ⚠️ Other API keys configured - this is not a fallback scenario")
self.logger.info(" This test requires ONLY OpenRouter to be configured (no Gemini/OpenAI keys)")
self.logger.info(" Current setup has multiple providers, so fallback behavior doesn't apply")
return True # Return True to indicate test is skipped, not failed
# Setup test files
self.setup_test_files()

View File

@@ -9,7 +9,6 @@ Tests that verify OpenRouter functionality including:
- Error handling when models are not available
"""
import subprocess
from .base_test import BaseSimulatorTest
@@ -25,39 +24,17 @@ class OpenRouterModelsTest(BaseSimulatorTest):
def test_description(self) -> str:
return "OpenRouter model functionality and alias mapping"
def get_recent_server_logs(self) -> str:
"""Get recent server logs from the log file directly"""
try:
# Read logs directly from the log file
cmd = ["docker", "exec", self.container_name, "tail", "-n", "500", "/tmp/mcp_server.log"]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode == 0:
return result.stdout
else:
self.logger.warning(f"Failed to read server logs: {result.stderr}")
return ""
except Exception as e:
self.logger.error(f"Failed to get server logs: {e}")
return ""
def run_test(self) -> bool:
"""Test OpenRouter model functionality"""
try:
self.logger.info("Test: OpenRouter model functionality and alias mapping")
# Check if OpenRouter API key is configured
check_cmd = [
"docker",
"exec",
self.container_name,
"python",
"-c",
'import os; print("OPENROUTER_KEY:" + str(bool(os.environ.get("OPENROUTER_API_KEY"))))',
]
result = subprocess.run(check_cmd, capture_output=True, text=True)
import os
if result.returncode == 0 and "OPENROUTER_KEY:False" in result.stdout:
has_openrouter = bool(os.environ.get("OPENROUTER_API_KEY"))
if not has_openrouter:
self.logger.info(" ⚠️ OpenRouter API key not configured - skipping test")
self.logger.info(" This test requires OPENROUTER_API_KEY to be set in .env")
return True # Return True to indicate test is skipped, not failed

View File

@@ -8,16 +8,15 @@ Validates that:
1. Files are embedded only once in conversation history
2. Continuation calls don't re-read existing files
3. New files are still properly embedded
4. Docker logs show deduplication behavior
4. Server logs show deduplication behavior
"""
import os
import subprocess
from .base_test import BaseSimulatorTest
from .conversation_base_test import ConversationBaseTest
class PerToolDeduplicationTest(BaseSimulatorTest):
class PerToolDeduplicationTest(ConversationBaseTest):
"""Test file deduplication for each individual tool"""
@property
@@ -28,74 +27,16 @@ class PerToolDeduplicationTest(BaseSimulatorTest):
def test_description(self) -> str:
return "File deduplication for individual tools"
def get_docker_logs_since(self, since_time: str) -> str:
"""Get docker logs since a specific timestamp"""
try:
# Check both main server and log monitor for comprehensive logs
cmd_server = ["docker", "logs", "--since", since_time, self.container_name]
cmd_monitor = ["docker", "logs", "--since", since_time, "zen-mcp-log-monitor"]
result_server = subprocess.run(cmd_server, capture_output=True, text=True)
result_monitor = subprocess.run(cmd_monitor, capture_output=True, text=True)
# Get the internal log files which have more detailed logging
server_log_result = subprocess.run(
["docker", "exec", self.container_name, "cat", "/tmp/mcp_server.log"], capture_output=True, text=True
)
activity_log_result = subprocess.run(
["docker", "exec", self.container_name, "cat", "/tmp/mcp_activity.log"], capture_output=True, text=True
)
# Combine all logs
combined_logs = (
result_server.stdout
+ "\n"
+ result_monitor.stdout
+ "\n"
+ server_log_result.stdout
+ "\n"
+ activity_log_result.stdout
)
return combined_logs
except Exception as e:
self.logger.error(f"Failed to get docker logs: {e}")
return ""
# create_additional_test_file method now inherited from base class
def validate_file_deduplication_in_logs(self, logs: str, tool_name: str, test_file: str) -> bool:
"""Validate that logs show file deduplication behavior"""
# Look for file embedding messages
embedding_messages = [
line for line in logs.split("\n") if "📁" in line and "embedding" in line and tool_name in line
]
# Look for deduplication/filtering messages
filtering_messages = [
line for line in logs.split("\n") if "📁" in line and "Filtering" in line and tool_name in line
]
skipping_messages = [
line for line in logs.split("\n") if "📁" in line and "skipping" in line and tool_name in line
]
deduplication_found = len(filtering_messages) > 0 or len(skipping_messages) > 0
if deduplication_found:
self.logger.info(f"{tool_name}: Found deduplication evidence in logs")
for msg in filtering_messages + skipping_messages:
self.logger.debug(f" 📁 {msg.strip()}")
else:
self.logger.warning(f" ⚠️ {tool_name}: No deduplication evidence found in logs")
self.logger.debug(f" 📁 All embedding messages: {embedding_messages}")
return deduplication_found
def run_test(self) -> bool:
"""Test file deduplication with realistic precommit/codereview workflow"""
try:
self.logger.info("📄 Test: Simplified file deduplication with precommit/codereview workflow")
# Setup test environment for conversation testing
self.setUp()
# Setup test files
self.setup_test_files()
@@ -126,7 +67,7 @@ def divide(x, y):
"model": "flash",
}
response1, continuation_id = self.call_mcp_tool("precommit", precommit_params)
response1, continuation_id = self.call_mcp_tool_direct("precommit", precommit_params)
if not response1:
self.logger.error(" ❌ Step 1: precommit tool failed")
return False
@@ -151,7 +92,7 @@ def divide(x, y):
"model": "flash",
}
response2, _ = self.call_mcp_tool("codereview", codereview_params)
response2, _ = self.call_mcp_tool_direct("codereview", codereview_params)
if not response2:
self.logger.error(" ❌ Step 2: codereview tool failed")
return False
@@ -181,16 +122,16 @@ def subtract(a, b):
"model": "flash",
}
response3, _ = self.call_mcp_tool("precommit", continue_params)
response3, _ = self.call_mcp_tool_direct("precommit", continue_params)
if not response3:
self.logger.error(" ❌ Step 3: precommit continuation failed")
return False
self.logger.info(" ✅ Step 3: precommit continuation completed")
# Validate results in docker logs
# Validate results in server logs
self.logger.info(" 📋 Validating conversation history and file deduplication...")
logs = self.get_docker_logs_since(start_time)
logs = self.get_server_logs_since(start_time)
# Check for conversation history building
conversation_logs = [
@@ -249,7 +190,7 @@ def subtract(a, b):
return True
else:
self.logger.warning(" ⚠️ File deduplication workflow test: FAILED")
self.logger.warning(" 💡 Check docker logs for detailed file embedding and continuation activity")
self.logger.warning(" 💡 Check server logs for detailed file embedding and continuation activity")
return False
except Exception as e:

View File

@@ -244,7 +244,7 @@ class PlannerContinuationHistoryTest(BaseSimulatorTest):
response2, _ = self.call_mcp_tool(
"planner",
{
"step": "Deployment strategy: Use Kubernetes for container orchestration with Helm charts. Implement CI/CD pipeline with GitOps. Use service mesh (Istio) for traffic management, monitoring, and security. Deploy databases in separate namespaces with backup automation.",
"step": "Deployment strategy: Use Kubernetes for orchestration with Helm charts. Implement CI/CD pipeline with GitOps. Use service mesh (Istio) for traffic management, monitoring, and security. Deploy databases in separate namespaces with backup automation.",
"step_number": 2,
"total_steps": 2,
"next_step_required": False, # Complete the session
@@ -326,7 +326,7 @@ class PlannerContinuationHistoryTest(BaseSimulatorTest):
return False
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
"""Call an MCP tool via Claude CLI (docker exec) - override for planner-specific response handling"""
"""Call an MCP tool via standalone server - override for planner-specific response handling"""
# Use parent implementation to get the raw response
response_text, _ = super().call_mcp_tool(tool_name, params)

View File

@@ -275,7 +275,7 @@ class PlannerValidationTest(BaseSimulatorTest):
response3, _ = self.call_mcp_tool(
"planner",
{
"step": "Revision: Actually, let me revise the Kubernetes approach. I'll use a simpler Docker Swarm deployment initially, then migrate to Kubernetes later.",
"step": "Revision: Actually, let me revise the Kubernetes approach. I'll use a simpler deployment initially, then migrate to Kubernetes later.",
"step_number": 3,
"total_steps": 4,
"next_step_required": True,
@@ -311,7 +311,7 @@ class PlannerValidationTest(BaseSimulatorTest):
return False
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
"""Call an MCP tool via Claude CLI (docker exec) - override for planner-specific response handling"""
"""Call an MCP tool via standalone server - override for planner-specific response handling"""
# Use parent implementation to get the raw response
response_text, _ = super().call_mcp_tool(tool_name, params)

View File

@@ -1,139 +0,0 @@
#!/usr/bin/env python3
"""
Redis Conversation Memory Validation Test
Validates that conversation memory is working via Redis by checking
for stored conversation threads and their content.
"""
import json
from .base_test import BaseSimulatorTest
class RedisValidationTest(BaseSimulatorTest):
"""Validate that conversation memory is working via Redis"""
@property
def test_name(self) -> str:
return "redis_validation"
@property
def test_description(self) -> str:
return "Redis conversation memory validation"
def run_test(self) -> bool:
"""Validate that conversation memory is working via Redis"""
try:
self.logger.info("💾 Test: Validating conversation memory via Redis...")
# First, test Redis connectivity
ping_result = self.run_command(
["docker", "exec", self.redis_container, "redis-cli", "ping"], capture_output=True
)
if ping_result.returncode != 0:
self.logger.error("Failed to connect to Redis")
return False
if "PONG" not in ping_result.stdout.decode():
self.logger.error("Redis ping failed")
return False
self.logger.info("✅ Redis connectivity confirmed")
# Check Redis for stored conversations
result = self.run_command(
["docker", "exec", self.redis_container, "redis-cli", "KEYS", "thread:*"], capture_output=True
)
if result.returncode != 0:
self.logger.error("Failed to query Redis")
return False
keys = result.stdout.decode().strip().split("\n")
thread_keys = [k for k in keys if k.startswith("thread:") and k != "thread:*"]
if thread_keys:
self.logger.info(f"✅ Found {len(thread_keys)} conversation threads in Redis")
# Get details of first thread
thread_key = thread_keys[0]
result = self.run_command(
["docker", "exec", self.redis_container, "redis-cli", "GET", thread_key], capture_output=True
)
if result.returncode == 0:
thread_data = result.stdout.decode()
try:
parsed = json.loads(thread_data)
turns = parsed.get("turns", [])
self.logger.info(f"✅ Thread has {len(turns)} turns")
return True
except json.JSONDecodeError:
self.logger.warning("Could not parse thread data")
return True
else:
# If no existing threads, create a test thread to validate Redis functionality
self.logger.info(" No existing threads found, creating test thread to validate Redis...")
test_thread_id = "test_thread_validation"
test_data = {
"thread_id": test_thread_id,
"turns": [
{"tool": "chat", "timestamp": "2025-06-11T16:30:00Z", "prompt": "Test validation prompt"}
],
}
# Store test data
store_result = self.run_command(
[
"docker",
"exec",
self.redis_container,
"redis-cli",
"SET",
f"thread:{test_thread_id}",
json.dumps(test_data),
],
capture_output=True,
)
if store_result.returncode != 0:
self.logger.error("Failed to store test data in Redis")
return False
# Retrieve test data
retrieve_result = self.run_command(
["docker", "exec", self.redis_container, "redis-cli", "GET", f"thread:{test_thread_id}"],
capture_output=True,
)
if retrieve_result.returncode != 0:
self.logger.error("Failed to retrieve test data from Redis")
return False
retrieved_data = retrieve_result.stdout.decode()
try:
parsed = json.loads(retrieved_data)
if parsed.get("thread_id") == test_thread_id:
self.logger.info("✅ Redis read/write validation successful")
# Clean up test data
self.run_command(
["docker", "exec", self.redis_container, "redis-cli", "DEL", f"thread:{test_thread_id}"],
capture_output=True,
)
return True
else:
self.logger.error("Retrieved data doesn't match stored data")
return False
except json.JSONDecodeError:
self.logger.error("Could not parse retrieved test data")
return False
except Exception as e:
self.logger.error(f"Conversation memory validation failed: {e}")
return False

View File

@@ -241,35 +241,28 @@ def handle_everything(user_input, config, database):
# Validate logs
self.logger.info(" 📋 Validating execution logs...")
# Get server logs from the actual log file inside the container
result = self.run_command(
["docker", "exec", self.container_name, "tail", "-500", "/tmp/mcp_server.log"], capture_output=True
)
# Get server logs using inherited method
logs = self.get_recent_server_logs(500)
if result.returncode == 0:
logs = result.stdout.decode() + result.stderr.decode()
# Look for refactor tool execution patterns
refactor_patterns = [
"[REFACTOR]",
"refactor tool",
"codesmells",
"Token budget",
"Code files embedded successfully",
]
# Look for refactor tool execution patterns
refactor_patterns = [
"[REFACTOR]",
"refactor tool",
"codesmells",
"Token budget",
"Code files embedded successfully",
]
patterns_found = 0
for pattern in refactor_patterns:
if pattern in logs:
patterns_found += 1
self.logger.debug(f" ✅ Found log pattern: {pattern}")
patterns_found = 0
for pattern in refactor_patterns:
if pattern in logs:
patterns_found += 1
self.logger.debug(f" ✅ Found log pattern: {pattern}")
if patterns_found >= 3:
self.logger.info(f" ✅ Log validation passed ({patterns_found}/{len(refactor_patterns)} patterns)")
else:
self.logger.warning(f" ⚠️ Only found {patterns_found}/{len(refactor_patterns)} log patterns")
if patterns_found >= 3:
self.logger.info(f" ✅ Log validation passed ({patterns_found}/{len(refactor_patterns)} patterns)")
else:
self.logger.warning(" ⚠️ Could not retrieve Docker logs")
self.logger.warning(f" ⚠️ Only found {patterns_found}/{len(refactor_patterns)} log patterns")
self.logger.info(" ✅ Refactor tool validation completed successfully")
return True

View File

@@ -11,7 +11,6 @@ This test validates that:
import datetime
import re
import subprocess
from .base_test import BaseSimulatorTest
@@ -27,78 +26,6 @@ class TokenAllocationValidationTest(BaseSimulatorTest):
def test_description(self) -> str:
return "Token allocation and conversation history validation"
def get_recent_server_logs(self) -> str:
"""Get recent server logs from the log file directly"""
try:
cmd = ["docker", "exec", self.container_name, "tail", "-n", "300", "/tmp/mcp_server.log"]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode == 0:
return result.stdout
else:
self.logger.warning(f"Failed to read server logs: {result.stderr}")
return ""
except Exception as e:
self.logger.error(f"Failed to get server logs: {e}")
return ""
def extract_conversation_usage_logs(self, logs: str) -> list[dict[str, int]]:
    """Extract actual conversation token usage from server logs.

    Scans for "[CONVERSATION_DEBUG] Token budget calculation:" markers and
    parses the comma-formatted integers from the seven lines that follow
    each marker.

    Args:
        logs: Raw server log text (newline-separated).

    Returns:
        One dict per budget-calculation block found, with any of the keys
        "total_capacity", "content_allocation", "conversation_tokens",
        "remaining_tokens" that could be parsed. Blocks with no parseable
        fields are dropped.
    """
    # Table-driven replacement for the old copy-pasted if/elif branches;
    # patterns are compiled once instead of per detail line. Order matters:
    # the first pattern that matches a line wins, mirroring the old chain.
    field_patterns = {
        "total_capacity": re.compile(r"Total capacity:\s*([\d,]+)"),
        "content_allocation": re.compile(r"Content allocation:\s*([\d,]+)"),
        "conversation_tokens": re.compile(r"Conversation tokens:\s*([\d,]+)"),
        "remaining_tokens": re.compile(r"Remaining tokens:\s*([\d,]+)"),
    }
    usage_logs = []
    lines = logs.split("\n")
    for i, line in enumerate(lines):
        if "[CONVERSATION_DEBUG] Token budget calculation:" not in line:
            continue
        # Found start of a token budget log; the next 7 lines carry the
        # usage details (slice is safely clipped at end-of-log).
        usage: dict[str, int] = {}
        for detail_line in lines[i + 1 : i + 8]:
            for key, pattern in field_patterns.items():
                match = pattern.search(detail_line)
                if match:
                    # Values are formatted like "1,048,576" — strip commas.
                    usage[key] = int(match.group(1).replace(",", ""))
                    break
        if usage:  # Only add if we found some usage data
            usage_logs.append(usage)
    return usage_logs
def extract_conversation_token_usage(self, logs: str) -> list[int]:
    """Extract conversation token usage from logs"""
    # Pull every "Conversation history token usage: N" value, stripping the
    # thousands separators before converting to int.
    token_pattern = r"Conversation history token usage:\s*([\d,]+)"
    return [int(raw.replace(",", "")) for raw in re.findall(token_pattern, logs)]
def run_test(self) -> bool:
"""Test token allocation and conversation history functionality"""
try:

View File

@@ -81,7 +81,7 @@ class VisionCapabilityTest(BaseSimulatorTest):
"don't have access",
"cannot see",
"no image",
"clarification_required",
"files_required_to_continue",
"image you're referring to",
"supply the image",
"error",
@@ -122,7 +122,7 @@ class VisionCapabilityTest(BaseSimulatorTest):
"don't have access",
"cannot see",
"no image",
"clarification_required",
"files_required_to_continue",
"image you're referring to",
"supply the image",
"error",

View File

@@ -9,7 +9,6 @@ Tests that verify X.AI GROK functionality including:
- API integration and response validation
"""
import subprocess
from .base_test import BaseSimulatorTest
@@ -25,44 +24,18 @@ class XAIModelsTest(BaseSimulatorTest):
def test_description(self) -> str:
return "X.AI GROK model functionality and integration"
def get_recent_server_logs(self) -> str:
    """Get recent server logs from the log file directly"""
    # Read logs directly from the log file; any failure yields "" so the
    # calling test can degrade gracefully instead of crashing.
    log_tail_cmd = [
        "docker", "exec", self.container_name,
        "tail", "-n", "500", "/tmp/mcp_server.log",
    ]
    try:
        proc = subprocess.run(log_tail_cmd, capture_output=True, text=True)
        if proc.returncode == 0:
            return proc.stdout
        self.logger.warning(f"Failed to read server logs: {proc.stderr}")
    except Exception as e:
        self.logger.error(f"Failed to get server logs: {e}")
    return ""
def run_test(self) -> bool:
"""Test X.AI GROK model functionality"""
try:
self.logger.info("Test: X.AI GROK model functionality and integration")
# Check if X.AI API key is configured and not empty
check_cmd = [
"docker",
"exec",
self.container_name,
"python",
"-c",
"""
import os
xai_key = os.environ.get("XAI_API_KEY", "")
is_valid = bool(xai_key and xai_key != "your_xai_api_key_here" and xai_key.strip())
print(f"XAI_KEY_VALID:{is_valid}")
""".strip(),
]
result = subprocess.run(check_cmd, capture_output=True, text=True)
import os
if result.returncode == 0 and "XAI_KEY_VALID:False" in result.stdout:
xai_key = os.environ.get("XAI_API_KEY", "")
is_valid = bool(xai_key and xai_key != "your_xai_api_key_here" and xai_key.strip())
if not is_valid:
self.logger.info(" ⚠️ X.AI API key not configured or empty - skipping test")
self.logger.info(" This test requires XAI_API_KEY to be set in .env with a valid key")
return True # Return True to indicate test is skipped, not failed