Migration from Docker to Standalone Python Server (#73)
* Migration from docker to standalone server

  Migration handling
  Fixed tests
  Use simpler in-memory storage
  Support for concurrent logging to disk
  Simplified direct connections to localhost

* Migration from docker / redis to standalone script

  Updated tests
  Updated run script
  Fixed requirements
  Use dotenv
  Ask if user would like to install MCP in Claude Desktop once
  Updated docs

* More cleanup and references to docker removed

* Cleanup

* Comments

* Fixed tests

* Fix GitHub Actions workflow for standalone Python architecture

  - Install requirements-dev.txt for pytest and testing dependencies
  - Remove Docker setup from simulation tests (now standalone)
  - Simplify linting job to use requirements-dev.txt
  - Update simulation tests to run directly without Docker

  Fixes unit test failures in CI due to missing pytest dependency.

  🤖 Generated with [Claude Code](https://claude.ai/code)

  Co-Authored-By: Claude <noreply@anthropic.com>

* Remove simulation tests from GitHub Actions

  - Removed simulation-tests job that makes real API calls
  - Keep only unit tests (mocked, no API costs) and linting
  - Simulation tests should be run manually with real API keys
  - Reduces CI costs and complexity

  GitHub Actions now only runs:
  - Unit tests (569 tests, all mocked)
  - Code quality checks (ruff, black)

  🤖 Generated with [Claude Code](https://claude.ai/code)

  Co-Authored-By: Claude <noreply@anthropic.com>

* Fixed tests

* Fixed tests

---------

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
committed by
GitHub
parent
9d72545ecd
commit
4151c3c3a5
@@ -26,7 +26,8 @@ from .test_openrouter_models import OpenRouterModelsTest
|
||||
from .test_per_tool_deduplication import PerToolDeduplicationTest
|
||||
from .test_planner_continuation_history import PlannerContinuationHistoryTest
|
||||
from .test_planner_validation import PlannerValidationTest
|
||||
from .test_redis_validation import RedisValidationTest
|
||||
|
||||
# Redis validation test removed - no longer needed for standalone server
|
||||
from .test_refactor_validation import RefactorValidationTest
|
||||
from .test_testgen_validation import TestGenValidationTest
|
||||
from .test_token_allocation_validation import TokenAllocationValidationTest
|
||||
@@ -42,7 +43,7 @@ TEST_REGISTRY = {
|
||||
"cross_tool_comprehensive": CrossToolComprehensiveTest,
|
||||
"line_number_validation": LineNumberValidationTest,
|
||||
"logs_validation": LogsValidationTest,
|
||||
"redis_validation": RedisValidationTest,
|
||||
# "redis_validation": RedisValidationTest, # Removed - no longer needed for standalone server
|
||||
"model_thinking_config": TestModelThinkingConfig,
|
||||
"o3_model_selection": O3ModelSelectionTest,
|
||||
"ollama_custom_url": OllamaCustomUrlTest,
|
||||
@@ -72,7 +73,7 @@ __all__ = [
|
||||
"CrossToolComprehensiveTest",
|
||||
"LineNumberValidationTest",
|
||||
"LogsValidationTest",
|
||||
"RedisValidationTest",
|
||||
# "RedisValidationTest", # Removed - no longer needed for standalone server
|
||||
"TestModelThinkingConfig",
|
||||
"O3ModelSelectionTest",
|
||||
"O3ProExpensiveTest",
|
||||
|
||||
@@ -11,6 +11,8 @@ import os
|
||||
import subprocess
|
||||
from typing import Any, Optional
|
||||
|
||||
from .log_utils import LogUtils
|
||||
|
||||
|
||||
class BaseSimulatorTest:
|
||||
"""Base class for all communication simulator tests"""
|
||||
@@ -19,14 +21,25 @@ class BaseSimulatorTest:
|
||||
self.verbose = verbose
|
||||
self.test_files = {}
|
||||
self.test_dir = None
|
||||
self.container_name = "zen-mcp-server"
|
||||
self.redis_container = "zen-mcp-redis"
|
||||
self.python_path = self._get_python_path()
|
||||
|
||||
# Configure logging
|
||||
log_level = logging.DEBUG if verbose else logging.INFO
|
||||
logging.basicConfig(level=log_level, format="%(asctime)s - %(levelname)s - %(message)s")
|
||||
self.logger = logging.getLogger(self.__class__.__name__)
|
||||
|
||||
def _get_python_path(self) -> str:
|
||||
"""Get the Python path for the virtual environment"""
|
||||
current_dir = os.getcwd()
|
||||
venv_python = os.path.join(current_dir, ".zen_venv", "bin", "python")
|
||||
|
||||
if os.path.exists(venv_python):
|
||||
return venv_python
|
||||
|
||||
# Fallback to system python if venv doesn't exist
|
||||
self.logger.warning("Virtual environment not found, using system python")
|
||||
return "python"
|
||||
|
||||
def setup_test_files(self):
|
||||
"""Create test files for the simulation"""
|
||||
# Test Python file
|
||||
@@ -100,7 +113,7 @@ class Calculator:
|
||||
self.logger.debug(f"Created test files with absolute paths: {list(self.test_files.values())}")
|
||||
|
||||
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
|
||||
"""Call an MCP tool via Claude CLI (docker exec)"""
|
||||
"""Call an MCP tool via standalone server"""
|
||||
try:
|
||||
# Prepare the MCP initialization and tool call sequence
|
||||
init_request = {
|
||||
@@ -131,8 +144,8 @@ class Calculator:
|
||||
# Join with newlines as MCP expects
|
||||
input_data = "\n".join(messages) + "\n"
|
||||
|
||||
# Simulate Claude CLI calling the MCP server via docker exec
|
||||
docker_cmd = ["docker", "exec", "-i", self.container_name, "python", "server.py"]
|
||||
# Call the standalone MCP server directly
|
||||
server_cmd = [self.python_path, "server.py"]
|
||||
|
||||
self.logger.debug(f"Calling MCP tool {tool_name} with proper initialization")
|
||||
|
||||
@@ -140,7 +153,7 @@ class Calculator:
|
||||
# For consensus tool and other long-running tools, we need to ensure
|
||||
# the subprocess doesn't close prematurely
|
||||
result = subprocess.run(
|
||||
docker_cmd,
|
||||
server_cmd,
|
||||
input=input_data,
|
||||
text=True,
|
||||
capture_output=True,
|
||||
@@ -149,7 +162,7 @@ class Calculator:
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
self.logger.error(f"Docker exec failed with return code {result.returncode}")
|
||||
self.logger.error(f"Standalone server failed with return code {result.returncode}")
|
||||
self.logger.error(f"Stderr: {result.stderr}")
|
||||
# Still try to parse stdout as the response might have been written before the error
|
||||
self.logger.debug(f"Attempting to parse stdout despite error: {result.stdout[:500]}")
|
||||
@@ -263,6 +276,56 @@ class Calculator:
|
||||
shutil.rmtree(self.test_dir)
|
||||
self.logger.debug(f"Removed test files directory: {self.test_dir}")
|
||||
|
||||
# ============================================================================
|
||||
# Log Utility Methods (delegate to LogUtils)
|
||||
# ============================================================================
|
||||
|
||||
def get_server_logs_since(self, since_time: Optional[str] = None) -> str:
    """Return the combined server logs; thin wrapper around ``LogUtils.get_server_logs_since``."""
    combined_logs = LogUtils.get_server_logs_since(since_time)
    return combined_logs
|
||||
|
||||
def get_recent_server_logs(self, lines: int = 500) -> str:
    """Return the last ``lines`` lines of the main server log via ``LogUtils``."""
    recent_logs = LogUtils.get_recent_server_logs(lines)
    return recent_logs
|
||||
|
||||
def get_server_logs_subprocess(self, lines: int = 500) -> str:
    """Return recent server logs using the subprocess-based reader in ``LogUtils``."""
    tail_output = LogUtils.get_server_logs_subprocess(lines)
    return tail_output
|
||||
|
||||
def check_server_logs_for_errors(self, lines: int = 500) -> list[str]:
    """Return error-looking lines from the recent server log via ``LogUtils``."""
    error_lines = LogUtils.check_server_logs_for_errors(lines)
    return error_lines
|
||||
|
||||
def extract_conversation_usage_logs(self, logs: str) -> list[dict[str, int]]:
    """Parse token budget calculation entries out of ``logs`` via ``LogUtils``."""
    usage_entries = LogUtils.extract_conversation_usage_logs(logs)
    return usage_entries
|
||||
|
||||
def extract_conversation_token_usage(self, logs: str) -> list[int]:
    """Parse conversation token usage values out of ``logs`` via ``LogUtils``."""
    token_counts = LogUtils.extract_conversation_token_usage(logs)
    return token_counts
|
||||
|
||||
def extract_thread_creation_logs(self, logs: str) -> list[dict[str, str]]:
    """Parse thread creation entries (thread id and parent id) out of ``logs`` via ``LogUtils``."""
    thread_entries = LogUtils.extract_thread_creation_logs(logs)
    return thread_entries
|
||||
|
||||
def extract_history_traversal_logs(self, logs: str) -> list[dict[str, Any]]:
    """Extract conversation history traversal logs.

    Delegates to ``LogUtils.extract_history_traversal_logs``.

    Args:
        logs: Log content to parse.

    Returns:
        Whatever ``LogUtils.extract_history_traversal_logs`` returns for ``logs``.
    """
    # The annotation uses typing.Any; the builtin function ``any`` is not a type.
    return LogUtils.extract_history_traversal_logs(logs)
|
||||
|
||||
def validate_file_deduplication_in_logs(self, logs: str, tool_name: str, test_file: str) -> bool:
    """Report whether ``logs`` show file deduplication for ``test_file``; delegates to ``LogUtils``."""
    deduplicated = LogUtils.validate_file_deduplication_in_logs(logs, tool_name, test_file)
    return deduplicated
|
||||
|
||||
def search_logs_for_pattern(
    self, pattern: str, logs: Optional[str] = None, case_sensitive: bool = False
) -> list[str]:
    """Return the log lines matching ``pattern``; delegates to ``LogUtils.search_logs_for_pattern``."""
    matching_lines = LogUtils.search_logs_for_pattern(pattern, logs, case_sensitive)
    return matching_lines
|
||||
|
||||
def get_log_file_info(self) -> dict[str, dict[str, Any]]:
    """Get information about log files.

    Delegates to ``LogUtils.get_log_file_info``.

    Returns:
        Whatever ``LogUtils.get_log_file_info`` returns.
    """
    # The annotation uses typing.Any; the builtin function ``any`` is not a type.
    return LogUtils.get_log_file_info()
|
||||
|
||||
def run_test(self) -> bool:
    """Abstract entry point: concrete simulator tests must override this method."""
    message = "Subclasses must implement run_test()"
    raise NotImplementedError(message)
|
||||
|
||||
216
simulator_tests/conversation_base_test.py
Normal file
216
simulator_tests/conversation_base_test.py
Normal file
@@ -0,0 +1,216 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Conversation Base Test Class for In-Process MCP Tool Testing
|
||||
|
||||
This class enables testing MCP tools within the same process to maintain conversation
|
||||
memory state across tool calls. Unlike BaseSimulatorTest which runs each tool call
|
||||
as a separate subprocess (losing memory state), this class calls tools directly
|
||||
in-process, allowing conversation functionality to work correctly.
|
||||
|
||||
USAGE:
|
||||
- Inherit from ConversationBaseTest instead of BaseSimulatorTest for conversation tests
|
||||
- Use call_mcp_tool_direct() to call tools in-process
|
||||
- Conversation memory persists across tool calls within the same test
|
||||
- setUp() clears memory between test methods for proper isolation
|
||||
|
||||
EXAMPLE:
|
||||
class TestConversationFeature(ConversationBaseTest):
|
||||
def test_cross_tool_continuation(self):
|
||||
# Step 1: Call precommit tool
|
||||
result1, continuation_id = self.call_mcp_tool_direct("precommit", {
|
||||
"path": "/path/to/repo",
|
||||
"prompt": "Review these changes"
|
||||
})
|
||||
|
||||
# Step 2: Continue with codereview tool - memory is preserved!
|
||||
result2, _ = self.call_mcp_tool_direct("codereview", {
|
||||
"files": ["/path/to/file.py"],
|
||||
"prompt": "Focus on security issues",
|
||||
"continuation_id": continuation_id
|
||||
})
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
|
||||
|
||||
class ConversationBaseTest(BaseSimulatorTest):
    """Base class for conversation tests that require in-process tool calling.

    Tools are imported from ``server.TOOLS`` and executed on an asyncio event
    loop inside the test process, so conversation memory written by one call
    is still visible to the next call.
    """

    def __init__(self, verbose: bool = False):
        """Initialise the base test; tools and event loop are set up lazily."""
        super().__init__(verbose)
        self._tools = None  # populated by _import_tools()
        self._loop = None  # populated by _get_event_loop()

    def setUp(self):
        """Set up test environment - clears conversation memory between tests"""
        super().setup_test_files()

        # Clear conversation memory for test isolation
        self._clear_conversation_memory()

        # Import tools from server.py for in-process calling (only once)
        if self._tools is None:
            self._import_tools()

    def _clear_conversation_memory(self):
        """Clear all conversation memory to ensure test isolation.

        Best effort: any failure is logged as a warning and otherwise ignored.
        """
        try:
            from utils.storage_backend import get_storage_backend

            storage = get_storage_backend()
            # Clear all stored conversation threads.
            # NOTE(review): this reaches into the backend's private _lock/_store
            # attributes - confirm they exist on every storage backend.
            with storage._lock:
                storage._store.clear()
            self.logger.debug("Cleared conversation memory for test isolation")
        except Exception as e:
            self.logger.warning(f"Could not clear conversation memory: {e}")

    def _import_tools(self):
        """Import ``TOOLS`` from server.py for direct calling.

        Raises:
            RuntimeError: If ``server`` (or ``TOOLS``) cannot be imported; the
                original ImportError is attached as the cause.
        """
        try:
            import os
            import sys

            # Add project root (parent of this file's directory) to the Python
            # path if it is not already there, so ``import server`` resolves.
            project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            if project_root not in sys.path:
                sys.path.insert(0, project_root)

            from server import TOOLS

            self._tools = TOOLS
            self.logger.debug(f"Imported {len(self._tools)} tools for in-process testing")
        except ImportError as e:
            # Chain the cause so the real import failure stays in the traceback.
            raise RuntimeError(f"Could not import tools from server.py: {e}") from e

    def _get_event_loop(self):
        """Get or create the event loop used for async tool execution.

        The loop is cached on the instance. A cached or current loop that has
        been closed is replaced by a fresh one, because a closed loop cannot
        run another coroutine.
        """
        if self._loop is None or self._loop.is_closed():
            try:
                loop = asyncio.get_event_loop()
            except RuntimeError:
                loop = None
            if loop is None or loop.is_closed():
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)
            self._loop = loop
        return self._loop

    def call_mcp_tool_direct(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
        """
        Call an MCP tool directly in-process without subprocess isolation.

        This method maintains conversation memory across calls, enabling proper
        testing of conversation functionality.

        Args:
            tool_name: Name of the tool to call (e.g., "precommit", "codereview")
            params: Parameters to pass to the tool. The dict is copied before
                model information is added, so the caller's dict is not modified
                by this method.

        Returns:
            tuple: (response_content, continuation_id) where continuation_id
                   can be used for follow-up calls. ``(None, None)`` is returned
                   when the tool yields no result or the call raises.

        Raises:
            RuntimeError: If setUp() has not imported the tools yet.
            ValueError: If ``tool_name`` is not a known tool.
        """
        if self._tools is None:
            raise RuntimeError("Tools not imported. Call setUp() first.")

        if tool_name not in self._tools:
            raise ValueError(f"Tool '{tool_name}' not found. Available: {list(self._tools.keys())}")

        try:
            tool = self._tools[tool_name]
            self.logger.debug(f"Calling tool '{tool_name}' directly in-process")

            # Work on a shallow copy: the model name and the internal
            # _model_context/_resolved_model_name entries added below must not
            # leak into a dict the caller may reuse for another call.
            call_params = dict(params)

            # Set up minimal model context if not provided
            if "model" not in call_params:
                call_params["model"] = "flash"  # Use fast model for testing

            # Execute tool directly using asyncio
            loop = self._get_event_loop()

            # Import required modules for model resolution (similar to server.py)
            from config import DEFAULT_MODEL
            from providers.registry import ModelProviderRegistry
            from utils.model_context import ModelContext

            # Resolve model (simplified version of server.py logic)
            model_name = call_params.get("model", DEFAULT_MODEL)
            provider = ModelProviderRegistry.get_provider_for_model(model_name)
            if not provider:
                # Fallback to available model for testing
                available_models = list(ModelProviderRegistry.get_available_models(respect_restrictions=True).keys())
                if available_models:
                    model_name = available_models[0]
                    call_params["model"] = model_name
                    self.logger.debug(f"Using fallback model for testing: {model_name}")

            # Create model context
            call_params["_model_context"] = ModelContext(model_name)
            call_params["_resolved_model_name"] = model_name

            # Execute tool asynchronously
            result = loop.run_until_complete(tool.execute(call_params))

            if not result:
                return None, None

            # Extract response content
            first_item = result[0]
            response_text = first_item.text if hasattr(first_item, "text") else str(first_item)

            # Parse response to extract continuation_id
            continuation_id = self._extract_continuation_id_from_response(response_text)

            self.logger.debug(f"Tool '{tool_name}' completed successfully in-process")
            return response_text, continuation_id

        except Exception as e:
            # Deliberate best effort: failures are reported as (None, None).
            self.logger.error(f"Direct tool call failed for '{tool_name}': {e}")
            return None, None

    def _extract_continuation_id_from_response(self, response_text: str) -> Optional[str]:
        """Extract continuation_id from tool response.

        The response is parsed as JSON and searched, in order, for
        ``metadata.thread_id``, ``continuation_offer.continuation_id`` and
        ``follow_up_request.continuation_id``.

        Returns:
            The first id found, or None when the response is not a JSON object
            or none of those sections carries an id.
        """
        try:
            response_data = json.loads(response_text)
        except (json.JSONDecodeError, TypeError):
            # Not JSON (or not a string at all, e.g. None): no continuation id.
            return None

        if not isinstance(response_data, dict):
            return None

        # A section may be present but null or of an unexpected type; only
        # dict sections are inspected so such a response cannot raise here.
        metadata = response_data.get("metadata")
        if isinstance(metadata, dict) and "thread_id" in metadata:
            return metadata["thread_id"]

        for section_name in ("continuation_offer", "follow_up_request"):
            section = response_data.get(section_name)
            if isinstance(section, dict) and "continuation_id" in section:
                return section["continuation_id"]

        return None

    def tearDown(self):
        """Clean up after test"""
        super().cleanup_test_files()
        # Clear memory again for good measure
        self._clear_conversation_memory()

    @property
    def test_name(self) -> str:
        """Get the test name (the concrete class name)."""
        return self.__class__.__name__

    @property
    def test_description(self) -> str:
        """Get the test description"""
        return "In-process conversation test"
|
||||
316
simulator_tests/log_utils.py
Normal file
316
simulator_tests/log_utils.py
Normal file
@@ -0,0 +1,316 @@
|
||||
"""
|
||||
Centralized log utility for simulator tests.
|
||||
|
||||
This module provides common log reading and parsing functionality
|
||||
used across multiple simulator test files to reduce code duplication.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import subprocess
|
||||
from typing import Optional, Union
|
||||
|
||||
|
||||
class LogUtils:
    """Centralized logging utilities for simulator tests.

    Every helper is a classmethod. File-based helpers read the paths in
    ``MAIN_LOG_FILE`` / ``ACTIVITY_LOG_FILE``; the parsing helpers work on a
    log string passed in by the caller.
    """

    # Log file paths
    MAIN_LOG_FILE = "logs/mcp_server.log"
    ACTIVITY_LOG_FILE = "logs/mcp_activity.log"

    # Precompiled patterns, hoisted so they are not rebuilt for every log line.
    _ERROR_LINE_RE = re.compile(r"ERROR|CRITICAL|Failed|Exception|Error:", re.IGNORECASE)
    _USAGE_MARKER_RE = re.compile(r"\[CONVERSATION_DEBUG\] Token budget calculation:")
    # A number must start with a digit so a stray "," can never reach int().
    _USAGE_FIELD_RES = (
        ("total_capacity", re.compile(r"Total capacity: (\d[\d,]*)")),
        ("content_allocation", re.compile(r"Content allocation: (\d[\d,]*)")),
        ("conversation_tokens", re.compile(r"Conversation tokens: (\d[\d,]*)")),
        ("remaining_tokens", re.compile(r"Remaining tokens: (\d[\d,]*)")),
    )
    _TOKEN_USAGE_RE = re.compile(r"Conversation history token usage:\s*(\d[\d,]*)")
    # Thread ids may be hyphenated UUIDs, so "-" has to be part of the id class;
    # plain \w+ would stop at the first hyphen and lose the parent relationship.
    _THREAD_CREATED_RE = re.compile(r"\[THREAD\] Created new thread ([\w-]+)(?: with parent ([\w-]+))?")
    _CHAIN_RETRIEVED_RE = re.compile(r"\[THREAD\] Retrieved chain of (\d+) messages for thread ([\w-]+)")

    @staticmethod
    def _parse_count(text: str) -> int:
        """Convert a digit string with optional thousands separators to int."""
        return int(text.replace(",", ""))

    @staticmethod
    def _read_text_or_empty(path: str) -> str:
        """Return the content of ``path``, or "" when the file does not exist."""
        try:
            with open(path) as f:
                return f.read()
        except FileNotFoundError:
            return ""

    @classmethod
    def get_server_logs_since(cls, since_time: Optional[str] = None) -> str:
        """
        Get server logs from both main and activity log files.

        Args:
            since_time: Currently ignored, returns all available logs

        Returns:
            Combined logs from both log files (main log, newline, activity
            log). A missing file contributes an empty string; any other read
            failure is logged and "" is returned.
        """
        try:
            main_logs = cls._read_text_or_empty(cls.MAIN_LOG_FILE)
            activity_logs = cls._read_text_or_empty(cls.ACTIVITY_LOG_FILE)
        except Exception as e:
            logging.warning(f"Failed to read server logs: {e}")
            return ""

        return main_logs + "\n" + activity_logs

    @classmethod
    def get_recent_server_logs(cls, lines: int = 500) -> str:
        """
        Get recent server logs from the main log file.

        Args:
            lines: Number of recent lines to retrieve (default: 500)

        Returns:
            Recent log content as string; "" when ``lines`` is not positive,
            the file is missing, or reading fails.
        """
        # Guard explicitly: all_lines[-0:] is the whole list, so asking for
        # zero lines would otherwise return the entire file.
        if lines <= 0:
            return ""

        try:
            with open(cls.MAIN_LOG_FILE) as f:
                all_lines = f.readlines()
        except FileNotFoundError:
            logging.warning(f"Log file {cls.MAIN_LOG_FILE} not found")
            return ""
        except Exception as e:
            logging.warning(f"Failed to read recent server logs: {e}")
            return ""

        # Slicing clamps automatically when the file has fewer lines.
        return "".join(all_lines[-lines:])

    @classmethod
    def get_server_logs_subprocess(cls, lines: int = 500) -> str:
        """
        Get server logs using subprocess (alternative method).

        Runs ``tail -n <lines>`` on the main log file.

        Args:
            lines: Number of recent lines to retrieve

        Returns:
            The command's stdout followed by its stderr, or "" when the
            command cannot be run or times out.
        """
        try:
            result = subprocess.run(
                ["tail", "-n", str(lines), cls.MAIN_LOG_FILE], capture_output=True, text=True, timeout=10
            )
            return result.stdout + result.stderr
        except Exception as e:
            logging.warning(f"Failed to get server logs via subprocess: {e}")
            return ""

    @classmethod
    def check_server_logs_for_errors(cls, lines: int = 500) -> list[str]:
        """
        Check server logs for error messages.

        A line counts as an error when it contains, case-insensitively, one of
        "ERROR", "CRITICAL", "Failed", "Exception" or "Error:".

        Args:
            lines: Number of recent lines to check

        Returns:
            List of error messages found (stripped lines, in log order)
        """
        logs = cls.get_recent_server_logs(lines)
        return [line.strip() for line in logs.split("\n") if cls._ERROR_LINE_RE.search(line)]

    @classmethod
    def extract_conversation_usage_logs(cls, logs: str) -> list[dict[str, int]]:
        """
        Extract token budget calculation information from logs.

        Only lines containing the "[CONVERSATION_DEBUG] Token budget
        calculation:" marker are inspected; each figure is read from that
        same line.

        Args:
            logs: Log content to parse

        Returns:
            List of dictionaries containing token usage data. Possible keys:
            total_capacity, content_allocation, conversation_tokens,
            remaining_tokens. Marker lines without any figure are skipped.
        """
        usage_data = []

        for line in logs.split("\n"):
            if not cls._USAGE_MARKER_RE.search(line):
                continue

            usage_info = {}
            for key, field_re in cls._USAGE_FIELD_RES:
                found = field_re.search(line)
                if found:
                    usage_info[key] = cls._parse_count(found.group(1))

            if usage_info:
                usage_data.append(usage_info)

        return usage_data

    @classmethod
    def extract_conversation_token_usage(cls, logs: str) -> list[int]:
        """
        Extract conversation token usage values from logs.

        Args:
            logs: Log content to parse

        Returns:
            List of token usage values, in order of appearance
        """
        return [cls._parse_count(match.group(1)) for match in cls._TOKEN_USAGE_RE.finditer(logs)]

    @classmethod
    def extract_thread_creation_logs(cls, logs: str) -> list[dict[str, Optional[str]]]:
        """
        Extract thread creation logs with parent relationships.

        Args:
            logs: Log content to parse

        Returns:
            List of dictionaries with thread relationship data: "thread_id"
            and "parent_id". "parent_id" is None when the entry has no
            "with parent" part or when the logged parent is the literal
            text "None".
        """
        thread_data = []

        for match in cls._THREAD_CREATED_RE.finditer(logs):
            parent_id = match.group(2)
            # A parentless thread may be logged as "with parent None".
            if not parent_id or parent_id == "None":
                parent_id = None
            thread_data.append({"thread_id": match.group(1), "parent_id": parent_id})

        return thread_data

    @classmethod
    def extract_history_traversal_logs(cls, logs: str) -> list[dict[str, Union[str, int]]]:
        """
        Extract conversation history traversal logs.

        Args:
            logs: Log content to parse

        Returns:
            List of dictionaries with traversal data: "chain_length" (int)
            and "thread_id" (str)
        """
        return [
            {"chain_length": int(match.group(1)), "thread_id": match.group(2)}
            for match in cls._CHAIN_RETRIEVED_RE.finditer(logs)
        ]

    @classmethod
    def validate_file_deduplication_in_logs(cls, logs: str, tool_name: str, test_file: str) -> bool:
        """
        Validate that logs show file deduplication behavior.

        ``test_file`` and ``tool_name`` are matched literally, so regex
        metacharacters in a file name (".", "+", "(" ...) are neither
        misinterpreted nor able to raise ``re.error``.

        Args:
            logs: Log content to parse
            tool_name: Name of the tool being tested
            test_file: Name of the test file to check for deduplication

        Returns:
            True if deduplication evidence is found, False otherwise
        """
        # Look for embedding calculation
        has_embedding = f"Calculating embeddings for {test_file}" in logs

        # Look for filtering or skip messages
        has_filtering = f"Filtering {test_file} to prevent duplication" in logs
        has_skip = f"Skipping {test_file} (already processed" in logs

        # Look for tool-specific processing: "[TOOL] ... file" on a single line
        tool_pattern = f"\\[{re.escape(tool_name.upper())}\\].*{re.escape(test_file)}"
        has_tool_processing = re.search(tool_pattern, logs, re.IGNORECASE) is not None

        # Deduplication is confirmed if we see evidence of processing and filtering/skipping
        return has_embedding and (has_filtering or has_skip) and has_tool_processing

    @classmethod
    def search_logs_for_pattern(
        cls, pattern: str, logs: Optional[str] = None, case_sensitive: bool = False
    ) -> list[str]:
        """
        Search logs for a specific pattern.

        Args:
            pattern: Regex pattern to search for
            logs: Log content to search (if None, reads recent logs)
            case_sensitive: Whether the search should be case sensitive

        Returns:
            List of matching lines (stripped, in log order)
        """
        if logs is None:
            logs = cls.get_recent_server_logs()

        # Compile once instead of handing the raw pattern to re.search per line.
        compiled = re.compile(pattern, 0 if case_sensitive else re.IGNORECASE)
        return [line.strip() for line in logs.split("\n") if compiled.search(line)]

    @classmethod
    def get_log_file_info(cls) -> dict[str, dict[str, Union[str, int, bool]]]:
        """
        Get information about log files.

        Returns:
            Dictionary with file information for each log file, keyed by path:
            exists, size_bytes, size_mb, last_modified, readable. Files that
            cannot be stat'ed are reported as non-existent with zeroed fields.
        """
        import os

        file_info = {}

        for log_file in (cls.MAIN_LOG_FILE, cls.ACTIVITY_LOG_FILE):
            try:
                # Stat directly instead of exists()+stat() so a file removed
                # between the two calls cannot raise.
                stat = os.stat(log_file)
            except OSError:
                file_info[log_file] = {
                    "exists": False,
                    "size_bytes": 0,
                    "size_mb": 0,
                    "last_modified": 0,
                    "readable": False,
                }
                continue

            file_info[log_file] = {
                "exists": True,
                "size_bytes": stat.st_size,
                "size_mb": round(stat.st_size / (1024 * 1024), 2),
                "last_modified": stat.st_mtime,
                "readable": os.access(log_file, os.R_OK),
            }

        return file_info
|
||||
@@ -7,7 +7,6 @@ and builds conversation context correctly when using continuation_id.
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
|
||||
@@ -23,19 +22,16 @@ class TestConsensusConversation(BaseSimulatorTest):
|
||||
def test_description(self) -> str:
|
||||
return "Test consensus tool conversation building and continuation"
|
||||
|
||||
def get_docker_logs(self):
|
||||
"""Get Docker container logs"""
|
||||
def get_server_logs(self):
|
||||
"""Get server logs from local log file"""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["docker", "logs", "--tail", "100", self.container_name], capture_output=True, text=True, timeout=30
|
||||
)
|
||||
if result.returncode == 0:
|
||||
return result.stdout.split("\n")
|
||||
else:
|
||||
self.logger.warning(f"Failed to get Docker logs: {result.stderr}")
|
||||
return []
|
||||
log_file_path = "logs/mcp_server.log"
|
||||
with open(log_file_path) as f:
|
||||
lines = f.readlines()
|
||||
# Return last 100 lines
|
||||
return [line.strip() for line in lines[-100:]]
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Exception getting Docker logs: {e}")
|
||||
self.logger.warning(f"Exception getting server logs: {e}")
|
||||
return []
|
||||
|
||||
def run_test(self) -> bool:
|
||||
@@ -121,9 +117,9 @@ class TestConsensusConversation(BaseSimulatorTest):
|
||||
self.logger.info("Phase 3: Checking server logs for conversation building")
|
||||
|
||||
# Check for conversation-related log entries
|
||||
logs = self.get_docker_logs()
|
||||
logs = self.get_server_logs()
|
||||
if not logs:
|
||||
self.logger.warning("Could not retrieve Docker logs for verification")
|
||||
self.logger.warning("Could not retrieve server logs for verification")
|
||||
else:
|
||||
# Look for conversation building indicators
|
||||
conversation_logs = [
|
||||
|
||||
@@ -22,42 +22,6 @@ class ContentValidationTest(BaseSimulatorTest):
|
||||
def test_description(self) -> str:
|
||||
return "Content validation and duplicate detection"
|
||||
|
||||
def get_docker_logs_since(self, since_time: str) -> str:
|
||||
"""Get docker logs since a specific timestamp"""
|
||||
try:
|
||||
# Check both main server and log monitor for comprehensive logs
|
||||
cmd_server = ["docker", "logs", "--since", since_time, self.container_name]
|
||||
cmd_monitor = ["docker", "logs", "--since", since_time, "zen-mcp-log-monitor"]
|
||||
|
||||
import subprocess
|
||||
|
||||
result_server = subprocess.run(cmd_server, capture_output=True, text=True)
|
||||
result_monitor = subprocess.run(cmd_monitor, capture_output=True, text=True)
|
||||
|
||||
# Get the internal log files which have more detailed logging
|
||||
server_log_result = subprocess.run(
|
||||
["docker", "exec", self.container_name, "cat", "/tmp/mcp_server.log"], capture_output=True, text=True
|
||||
)
|
||||
|
||||
activity_log_result = subprocess.run(
|
||||
["docker", "exec", self.container_name, "cat", "/tmp/mcp_activity.log"], capture_output=True, text=True
|
||||
)
|
||||
|
||||
# Combine all logs
|
||||
combined_logs = (
|
||||
result_server.stdout
|
||||
+ "\n"
|
||||
+ result_monitor.stdout
|
||||
+ "\n"
|
||||
+ server_log_result.stdout
|
||||
+ "\n"
|
||||
+ activity_log_result.stdout
|
||||
)
|
||||
return combined_logs
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to get docker logs: {e}")
|
||||
return ""
|
||||
|
||||
def run_test(self) -> bool:
|
||||
"""Test that file processing system properly handles file deduplication"""
|
||||
try:
|
||||
@@ -151,9 +115,9 @@ DATABASE_CONFIG = {
|
||||
else:
|
||||
self.logger.warning(" ⚠️ Different tool failed")
|
||||
|
||||
# Validate file processing behavior from Docker logs
|
||||
# Validate file processing behavior from server logs
|
||||
self.logger.info(" 4: Validating file processing logs")
|
||||
logs = self.get_docker_logs_since(start_time)
|
||||
logs = self.get_server_logs_since(start_time)
|
||||
|
||||
# Check for proper file embedding logs
|
||||
embedding_logs = [
|
||||
|
||||
@@ -21,8 +21,6 @@ This validates the conversation threading system's ability to:
|
||||
- Properly traverse parent relationships for history reconstruction
|
||||
"""
|
||||
|
||||
import re
|
||||
import subprocess
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
|
||||
@@ -38,53 +36,6 @@ class ConversationChainValidationTest(BaseSimulatorTest):
|
||||
def test_description(self) -> str:
|
||||
return "Conversation chain and threading validation"
|
||||
|
||||
def get_recent_server_logs(self) -> str:
    """Return the tail of the server log file inside the container.

    Runs ``tail -n 500 /tmp/mcp_server.log`` via ``docker exec``.

    Returns:
        The command's stdout on a zero exit code. On a non-zero exit code a
        warning with the command's stderr is logged and ``""`` is returned;
        on any exception the error is logged and ``""`` is returned.
    """
    try:
        completed = subprocess.run(
            ["docker", "exec", self.container_name, "tail", "-n", "500", "/tmp/mcp_server.log"],
            capture_output=True,
            text=True,
        )

        # Guard clause: bail out early when tail could not read the file.
        if completed.returncode != 0:
            self.logger.warning(f"Failed to read server logs: {completed.stderr}")
            return ""

        return completed.stdout
    except Exception as e:
        self.logger.error(f"Failed to get server logs: {e}")
        return ""
|
||||
|
||||
def extract_thread_creation_logs(self, logs: str) -> list[dict[str, str]]:
    """Collect every "[THREAD] Created new thread" entry found in ``logs``.

    Expected line shape:
    ``[THREAD] Created new thread <thread-id> with parent <parent-id|None>``

    Args:
        logs: Raw log text; it is split on ``"\\n"`` and scanned line by line.

    Returns:
        One dict per matching line with keys ``thread_id``, ``parent_id``
        (``None`` when the log says ``None``) and ``log_line`` (the full line).
    """
    creation_pattern = re.compile(
        r"\[THREAD\] Created new thread ([a-f0-9-]+) with parent ([a-f0-9-]+|None)"
    )
    entries = []

    for raw_line in logs.split("\n"):
        # Cheap substring filter before running the regex.
        if "[THREAD] Created new thread" not in raw_line:
            continue

        found = creation_pattern.search(raw_line)
        if found is None:
            continue

        child, parent = found.groups()
        entries.append(
            {
                "thread_id": child,
                "parent_id": None if parent == "None" else parent,
                "log_line": raw_line,
            }
        )

    return entries
|
||||
|
||||
def extract_history_traversal_logs(self, logs: str) -> list[dict[str, str]]:
    """Collect every "[THREAD] Retrieved chain of" entry found in ``logs``.

    Expected line shape:
    ``[THREAD] Retrieved chain of <N> threads for <thread-id>``

    Args:
        logs: Raw log text; it is split on ``"\\n"`` and scanned line by line.

    Returns:
        One dict per matching line with keys ``thread_id``, ``chain_length``
        (parsed as ``int``) and ``log_line`` (the full line).
    """
    chain_pattern = re.compile(r"\[THREAD\] Retrieved chain of (\d+) threads for ([a-f0-9-]+)")
    entries = []

    for raw_line in logs.split("\n"):
        # Cheap substring filter before running the regex.
        if "[THREAD] Retrieved chain of" not in raw_line:
            continue

        found = chain_pattern.search(raw_line)
        if found is None:
            continue

        count_text, owner = found.groups()
        entries.append({"thread_id": owner, "chain_length": int(count_text), "log_line": raw_line})

    return entries
|
||||
|
||||
def run_test(self) -> bool:
|
||||
"""Test conversation chain and threading functionality"""
|
||||
try:
|
||||
|
||||
@@ -12,7 +12,6 @@ Validates:
|
||||
5. Proper tool chaining with context
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
|
||||
@@ -28,40 +27,6 @@ class CrossToolComprehensiveTest(BaseSimulatorTest):
|
||||
def test_description(self) -> str:
|
||||
return "Comprehensive cross-tool file deduplication and continuation"
|
||||
|
||||
def get_docker_logs_since(self, since_time: str) -> str:
    """Gather docker output and in-container log files for this test run.

    Four commands are executed in order: ``docker logs --since`` for the main
    container, ``docker logs --since`` for ``zen-mcp-log-monitor``, and a
    ``cat`` of ``/tmp/mcp_server.log`` and ``/tmp/mcp_activity.log`` inside
    the main container. Only the captured stdout of each command is used.

    Args:
        since_time: Value passed verbatim to ``docker logs --since``.

    Returns:
        The four stdout captures joined by newlines, or ``""`` if anything
        raised (the error is logged).
    """
    try:
        main_container = self.container_name

        # Order matters: server output, monitor output, server log file, activity log file.
        commands = [
            ["docker", "logs", "--since", since_time, main_container],
            ["docker", "logs", "--since", since_time, "zen-mcp-log-monitor"],
            ["docker", "exec", main_container, "cat", "/tmp/mcp_server.log"],
            ["docker", "exec", main_container, "cat", "/tmp/mcp_activity.log"],
        ]

        captured = []
        for command in commands:
            completed = subprocess.run(command, capture_output=True, text=True)
            captured.append(completed.stdout)

        return "\n".join(captured)
    except Exception as e:
        self.logger.error(f"Failed to get docker logs: {e}")
        return ""
|
||||
|
||||
def run_test(self) -> bool:
|
||||
"""Comprehensive cross-tool test with all MCP tools"""
|
||||
try:
|
||||
@@ -247,7 +212,7 @@ def secure_login(user, pwd):
|
||||
|
||||
# Validate comprehensive results
|
||||
self.logger.info(" 📋 Validating comprehensive cross-tool results...")
|
||||
logs = self.get_docker_logs_since(start_time)
|
||||
logs = self.get_server_logs_since(start_time)
|
||||
|
||||
# Validation criteria
|
||||
tools_used = [r[0] for r in responses]
|
||||
|
||||
@@ -6,10 +6,10 @@ Tests comprehensive cross-tool continuation scenarios to ensure
|
||||
conversation context is maintained when switching between different tools.
|
||||
"""
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
from .conversation_base_test import ConversationBaseTest
|
||||
|
||||
|
||||
class CrossToolContinuationTest(BaseSimulatorTest):
|
||||
class CrossToolContinuationTest(ConversationBaseTest):
|
||||
"""Test comprehensive cross-tool continuation scenarios"""
|
||||
|
||||
@property
|
||||
@@ -25,8 +25,8 @@ class CrossToolContinuationTest(BaseSimulatorTest):
|
||||
try:
|
||||
self.logger.info("🔧 Test: Cross-tool continuation scenarios")
|
||||
|
||||
# Setup test files
|
||||
self.setup_test_files()
|
||||
# Setup test environment for conversation testing
|
||||
self.setUp()
|
||||
|
||||
success_count = 0
|
||||
total_scenarios = 3
|
||||
@@ -62,7 +62,7 @@ class CrossToolContinuationTest(BaseSimulatorTest):
|
||||
self.logger.info(" 1: Testing chat -> thinkdeep -> codereview")
|
||||
|
||||
# Start with chat
|
||||
chat_response, chat_id = self.call_mcp_tool(
|
||||
chat_response, chat_id = self.call_mcp_tool_direct(
|
||||
"chat",
|
||||
{
|
||||
"prompt": "Please use low thinking mode. Look at this Python code and tell me what you think about it",
|
||||
@@ -76,7 +76,7 @@ class CrossToolContinuationTest(BaseSimulatorTest):
|
||||
return False
|
||||
|
||||
# Continue with thinkdeep
|
||||
thinkdeep_response, _ = self.call_mcp_tool(
|
||||
thinkdeep_response, _ = self.call_mcp_tool_direct(
|
||||
"thinkdeep",
|
||||
{
|
||||
"prompt": "Please use low thinking mode. Think deeply about potential performance issues in this code",
|
||||
@@ -91,7 +91,7 @@ class CrossToolContinuationTest(BaseSimulatorTest):
|
||||
return False
|
||||
|
||||
# Continue with codereview
|
||||
codereview_response, _ = self.call_mcp_tool(
|
||||
codereview_response, _ = self.call_mcp_tool_direct(
|
||||
"codereview",
|
||||
{
|
||||
"files": [self.test_files["python"]], # Same file should be deduplicated
|
||||
@@ -118,8 +118,13 @@ class CrossToolContinuationTest(BaseSimulatorTest):
|
||||
self.logger.info(" 2: Testing analyze -> debug -> thinkdeep")
|
||||
|
||||
# Start with analyze
|
||||
analyze_response, analyze_id = self.call_mcp_tool(
|
||||
"analyze", {"files": [self.test_files["python"]], "analysis_type": "code_quality", "model": "flash"}
|
||||
analyze_response, analyze_id = self.call_mcp_tool_direct(
|
||||
"analyze",
|
||||
{
|
||||
"files": [self.test_files["python"]],
|
||||
"prompt": "Analyze this code for quality and performance issues",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not analyze_response or not analyze_id:
|
||||
@@ -127,7 +132,7 @@ class CrossToolContinuationTest(BaseSimulatorTest):
|
||||
return False
|
||||
|
||||
# Continue with debug
|
||||
debug_response, _ = self.call_mcp_tool(
|
||||
debug_response, _ = self.call_mcp_tool_direct(
|
||||
"debug",
|
||||
{
|
||||
"files": [self.test_files["python"]], # Same file should be deduplicated
|
||||
@@ -142,7 +147,7 @@ class CrossToolContinuationTest(BaseSimulatorTest):
|
||||
return False
|
||||
|
||||
# Continue with thinkdeep
|
||||
final_response, _ = self.call_mcp_tool(
|
||||
final_response, _ = self.call_mcp_tool_direct(
|
||||
"thinkdeep",
|
||||
{
|
||||
"prompt": "Please use low thinking mode. Think deeply about the architectural implications of the issues we've found",
|
||||
@@ -169,7 +174,7 @@ class CrossToolContinuationTest(BaseSimulatorTest):
|
||||
self.logger.info(" 3: Testing multi-file cross-tool continuation")
|
||||
|
||||
# Start with both files
|
||||
multi_response, multi_id = self.call_mcp_tool(
|
||||
multi_response, multi_id = self.call_mcp_tool_direct(
|
||||
"chat",
|
||||
{
|
||||
"prompt": "Please use low thinking mode. Analyze both the Python code and configuration file",
|
||||
@@ -183,7 +188,7 @@ class CrossToolContinuationTest(BaseSimulatorTest):
|
||||
return False
|
||||
|
||||
# Switch to codereview with same files (should use conversation history)
|
||||
multi_review, _ = self.call_mcp_tool(
|
||||
multi_review, _ = self.call_mcp_tool_direct(
|
||||
"codereview",
|
||||
{
|
||||
"files": [self.test_files["python"], self.test_files["config"]], # Same files
|
||||
|
||||
@@ -378,35 +378,28 @@ The code looks correct to me, but something is causing valid sessions to be trea
|
||||
# Validate logs
|
||||
self.logger.info(" 📋 Validating execution logs...")
|
||||
|
||||
# Get server logs from the actual log file inside the container
|
||||
result = self.run_command(
|
||||
["docker", "exec", self.container_name, "tail", "-500", "/tmp/mcp_server.log"], capture_output=True
|
||||
)
|
||||
# Get server logs using inherited method
|
||||
logs = self.get_recent_server_logs(500)
|
||||
|
||||
if result.returncode == 0:
|
||||
logs = result.stdout.decode() + result.stderr.decode()
|
||||
# Look for debug tool execution patterns
|
||||
debug_patterns = [
|
||||
"debug tool",
|
||||
"[DEBUG]",
|
||||
"systematic investigation",
|
||||
"Token budget",
|
||||
"Essential files for debugging",
|
||||
]
|
||||
|
||||
# Look for debug tool execution patterns
|
||||
debug_patterns = [
|
||||
"debug tool",
|
||||
"[DEBUG]",
|
||||
"systematic investigation",
|
||||
"Token budget",
|
||||
"Essential files for debugging",
|
||||
]
|
||||
patterns_found = 0
|
||||
for pattern in debug_patterns:
|
||||
if pattern in logs:
|
||||
patterns_found += 1
|
||||
self.logger.debug(f" ✅ Found log pattern: {pattern}")
|
||||
|
||||
patterns_found = 0
|
||||
for pattern in debug_patterns:
|
||||
if pattern in logs:
|
||||
patterns_found += 1
|
||||
self.logger.debug(f" ✅ Found log pattern: {pattern}")
|
||||
|
||||
if patterns_found >= 3:
|
||||
self.logger.info(f" ✅ Log validation passed ({patterns_found}/{len(debug_patterns)} patterns)")
|
||||
else:
|
||||
self.logger.warning(f" ⚠️ Only found {patterns_found}/{len(debug_patterns)} log patterns")
|
||||
if patterns_found >= 3:
|
||||
self.logger.info(f" ✅ Log validation passed ({patterns_found}/{len(debug_patterns)} patterns)")
|
||||
else:
|
||||
self.logger.warning(" ⚠️ Could not retrieve Docker logs")
|
||||
self.logger.warning(f" ⚠️ Only found {patterns_found}/{len(debug_patterns)} log patterns")
|
||||
|
||||
# Test continuation if available
|
||||
if continuation_id:
|
||||
|
||||
@@ -145,14 +145,16 @@ def validate_data(data):
|
||||
# Test 4: Validate log patterns
|
||||
self.logger.info(" 1.4: Validating line number processing in logs")
|
||||
|
||||
# Get logs from container
|
||||
result = self.run_command(
|
||||
["docker", "exec", self.container_name, "tail", "-500", "/tmp/mcp_server.log"], capture_output=True
|
||||
)
|
||||
|
||||
logs = ""
|
||||
if result.returncode == 0:
|
||||
logs = result.stdout.decode()
|
||||
# Get logs from server
|
||||
try:
|
||||
log_file_path = "logs/mcp_server.log"
|
||||
with open(log_file_path) as f:
|
||||
lines = f.readlines()
|
||||
logs = "".join(lines[-500:])
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to read server logs: {e}")
|
||||
logs = ""
|
||||
pass
|
||||
|
||||
# Check for line number formatting patterns
|
||||
line_number_patterns = ["Line numbers for", "enabled", "│", "line number"] # The line number separator
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Docker Logs Validation Test
|
||||
Server Logs Validation Test
|
||||
|
||||
Validates Docker logs to confirm file deduplication behavior and
|
||||
Validates server logs to confirm file deduplication behavior and
|
||||
conversation threading is working properly.
|
||||
"""
|
||||
|
||||
@@ -10,7 +10,7 @@ from .base_test import BaseSimulatorTest
|
||||
|
||||
|
||||
class LogsValidationTest(BaseSimulatorTest):
|
||||
"""Validate Docker logs to confirm file deduplication behavior"""
|
||||
"""Validate server logs to confirm file deduplication behavior"""
|
||||
|
||||
@property
|
||||
def test_name(self) -> str:
|
||||
@@ -18,39 +18,35 @@ class LogsValidationTest(BaseSimulatorTest):
|
||||
|
||||
@property
|
||||
def test_description(self) -> str:
|
||||
return "Docker logs validation"
|
||||
return "Server logs validation"
|
||||
|
||||
def run_test(self) -> bool:
|
||||
"""Validate Docker logs to confirm file deduplication behavior"""
|
||||
"""Validate server logs to confirm file deduplication behavior"""
|
||||
try:
|
||||
self.logger.info("📋 Test: Validating Docker logs for file deduplication...")
|
||||
self.logger.info("📋 Test: Validating server logs for file deduplication...")
|
||||
|
||||
# Get server logs from main container
|
||||
result = self.run_command(["docker", "logs", self.container_name], capture_output=True)
|
||||
# Get server logs from log files
|
||||
import os
|
||||
|
||||
if result.returncode != 0:
|
||||
self.logger.error(f"Failed to get Docker logs: {result.stderr}")
|
||||
logs = ""
|
||||
log_files = ["logs/mcp_server.log", "logs/mcp_activity.log"]
|
||||
|
||||
for log_file in log_files:
|
||||
if os.path.exists(log_file):
|
||||
try:
|
||||
with open(log_file) as f:
|
||||
file_content = f.read()
|
||||
logs += f"\n=== {log_file} ===\n{file_content}\n"
|
||||
self.logger.debug(f"Read {len(file_content)} characters from {log_file}")
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Could not read {log_file}: {e}")
|
||||
else:
|
||||
self.logger.warning(f"Log file not found: {log_file}")
|
||||
|
||||
if not logs.strip():
|
||||
self.logger.warning("No log content found - server may not have processed any requests yet")
|
||||
return False
|
||||
|
||||
main_logs = result.stdout.decode() + result.stderr.decode()
|
||||
|
||||
# Get logs from log monitor container (where detailed activity is logged)
|
||||
monitor_result = self.run_command(["docker", "logs", "zen-mcp-log-monitor"], capture_output=True)
|
||||
monitor_logs = ""
|
||||
if monitor_result.returncode == 0:
|
||||
monitor_logs = monitor_result.stdout.decode() + monitor_result.stderr.decode()
|
||||
|
||||
# Also get activity logs for more detailed conversation tracking
|
||||
activity_result = self.run_command(
|
||||
["docker", "exec", self.container_name, "cat", "/tmp/mcp_activity.log"], capture_output=True
|
||||
)
|
||||
|
||||
activity_logs = ""
|
||||
if activity_result.returncode == 0:
|
||||
activity_logs = activity_result.stdout.decode()
|
||||
|
||||
logs = main_logs + "\n" + monitor_logs + "\n" + activity_logs
|
||||
|
||||
# Look for conversation threading patterns that indicate the system is working
|
||||
conversation_patterns = [
|
||||
"CONVERSATION_RESUME",
|
||||
|
||||
@@ -4,11 +4,10 @@ O3 Model Selection Test
|
||||
|
||||
Tests that O3 models are properly selected and used when explicitly specified,
|
||||
regardless of the default model configuration (even when set to auto).
|
||||
Validates model selection via Docker logs.
|
||||
Validates model selection via server logs.
|
||||
"""
|
||||
|
||||
import datetime
|
||||
import subprocess
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
|
||||
@@ -24,47 +23,16 @@ class O3ModelSelectionTest(BaseSimulatorTest):
|
||||
def test_description(self) -> str:
|
||||
return "O3 model selection and usage validation"
|
||||
|
||||
def get_recent_server_logs(self) -> str:
    """Return the tail of the server log file inside the container.

    Runs ``tail -n 500 /tmp/mcp_server.log`` via ``docker exec``; the
    original comment notes the line count is kept large so that all
    test-related log lines are included.

    Returns:
        The command's stdout on a zero exit code. On a non-zero exit code a
        warning with the command's stderr is logged and ``""`` is returned;
        on any exception the error is logged and ``""`` is returned.
    """
    try:
        completed = subprocess.run(
            ["docker", "exec", self.container_name, "tail", "-n", "500", "/tmp/mcp_server.log"],
            capture_output=True,
            text=True,
        )

        # Guard clause: bail out early when tail could not read the file.
        if completed.returncode != 0:
            self.logger.warning(f"Failed to read server logs: {completed.stderr}")
            return ""

        return completed.stdout
    except Exception as e:
        self.logger.error(f"Failed to get server logs: {e}")
        return ""
|
||||
|
||||
def run_test(self) -> bool:
|
||||
"""Test O3 model selection and usage"""
|
||||
try:
|
||||
self.logger.info(" Test: O3 model selection and usage validation")
|
||||
|
||||
# Check which API keys are configured
|
||||
check_cmd = [
|
||||
"docker",
|
||||
"exec",
|
||||
self.container_name,
|
||||
"python",
|
||||
"-c",
|
||||
'import os; print(f\'OPENAI_KEY:{bool(os.environ.get("OPENAI_API_KEY"))}|OPENROUTER_KEY:{bool(os.environ.get("OPENROUTER_API_KEY"))}\')',
|
||||
]
|
||||
result = subprocess.run(check_cmd, capture_output=True, text=True)
|
||||
import os
|
||||
|
||||
has_openai = False
|
||||
has_openrouter = False
|
||||
|
||||
if result.returncode == 0:
|
||||
output = result.stdout.strip()
|
||||
if "OPENAI_KEY:True" in output:
|
||||
has_openai = True
|
||||
if "OPENROUTER_KEY:True" in output:
|
||||
has_openrouter = True
|
||||
has_openai = bool(os.environ.get("OPENAI_API_KEY"))
|
||||
has_openrouter = bool(os.environ.get("OPENROUTER_API_KEY"))
|
||||
|
||||
# If only OpenRouter is configured, adjust test expectations
|
||||
if has_openrouter and not has_openai:
|
||||
|
||||
@@ -9,7 +9,6 @@ Tests custom API endpoint functionality with Ollama-style local models, includin
|
||||
- Model alias resolution for local models
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
|
||||
@@ -30,14 +29,15 @@ class OllamaCustomUrlTest(BaseSimulatorTest):
|
||||
try:
|
||||
self.logger.info("Test: Ollama custom URL functionality")
|
||||
|
||||
# Check if custom URL is configured in the Docker container
|
||||
custom_url = self._check_docker_custom_url()
|
||||
# Check if custom URL is configured
|
||||
import os
|
||||
|
||||
custom_url = os.environ.get("CUSTOM_API_URL")
|
||||
if not custom_url:
|
||||
self.logger.warning("CUSTOM_API_URL not set in Docker container, skipping Ollama test")
|
||||
self.logger.warning("CUSTOM_API_URL not set, skipping Ollama test")
|
||||
self.logger.info("To enable this test, add to .env file:")
|
||||
self.logger.info("CUSTOM_API_URL=http://host.docker.internal:11434/v1")
|
||||
self.logger.info("CUSTOM_API_URL=http://localhost:11434/v1")
|
||||
self.logger.info("CUSTOM_API_KEY=")
|
||||
self.logger.info("Then restart docker-compose")
|
||||
return True # Skip gracefully
|
||||
|
||||
self.logger.info(f"Testing with custom URL: {custom_url}")
|
||||
@@ -172,25 +172,6 @@ if __name__ == "__main__":
|
||||
finally:
|
||||
self.cleanup_test_files()
|
||||
|
||||
def _check_docker_custom_url(self) -> str:
    """Read ``CUSTOM_API_URL`` from the container's environment.

    Runs ``printenv CUSTOM_API_URL`` via ``docker exec`` with a 10 second
    timeout.

    Returns:
        The whitespace-stripped value when the command exits with 0 and
        printed something non-blank; otherwise ``""``. Any exception is
        logged at debug level and also yields ``""``.
    """
    try:
        completed = subprocess.run(
            ["docker", "exec", self.container_name, "printenv", "CUSTOM_API_URL"],
            capture_output=True,
            text=True,
            timeout=10,
        )

        value = completed.stdout.strip()
        return value if completed.returncode == 0 and value else ""

    except Exception as e:
        self.logger.debug(f"Failed to check Docker CUSTOM_API_URL: {e}")
        return ""
|
||||
|
||||
def validate_successful_response(self, response: str, test_name: str, files_provided: bool = False) -> bool:
|
||||
"""Validate that the response indicates success, not an error
|
||||
|
||||
@@ -201,7 +182,7 @@ if __name__ == "__main__":
|
||||
"""
|
||||
if not response:
|
||||
self.logger.error(f"No response received for {test_name}")
|
||||
self._check_docker_logs_for_errors()
|
||||
self._check_server_logs_for_errors()
|
||||
return False
|
||||
|
||||
# Check for common error indicators
|
||||
@@ -227,7 +208,7 @@ if __name__ == "__main__":
|
||||
]
|
||||
|
||||
# Special handling for clarification requests from local models
|
||||
if "clarification_required" in response.lower():
|
||||
if "files_required_to_continue" in response.lower():
|
||||
if files_provided:
|
||||
# If we provided actual files, clarification request is a FAILURE
|
||||
self.logger.error(
|
||||
@@ -243,7 +224,7 @@ if __name__ == "__main__":
|
||||
self.logger.debug(f"Clarification response: {response[:200]}...")
|
||||
return True
|
||||
|
||||
# Check for SSRF security restriction - this is expected for local URLs from Docker
|
||||
# Check for SSRF security restriction - this is expected for local URLs
|
||||
if "restricted IP address" in response and "security risk (SSRF)" in response:
|
||||
self.logger.info(
|
||||
f"✅ Custom URL routing working - {test_name} correctly attempted to connect to custom API"
|
||||
@@ -256,19 +237,19 @@ if __name__ == "__main__":
|
||||
if error.lower() in response_lower:
|
||||
self.logger.error(f"Error detected in {test_name}: {error}")
|
||||
self.logger.debug(f"Full response: {response}")
|
||||
self._check_docker_logs_for_errors()
|
||||
self._check_server_logs_for_errors()
|
||||
return False
|
||||
|
||||
# Response should be substantial (more than just a few words)
|
||||
if len(response.strip()) < 10:
|
||||
self.logger.error(f"Response too short for {test_name}: {response}")
|
||||
self._check_docker_logs_for_errors()
|
||||
self._check_server_logs_for_errors()
|
||||
return False
|
||||
|
||||
# Verify this looks like a real AI response, not just an error message
|
||||
if not self._validate_ai_response_content(response):
|
||||
self.logger.error(f"Response doesn't look like valid AI output for {test_name}")
|
||||
self._check_docker_logs_for_errors()
|
||||
self._check_server_logs_for_errors()
|
||||
return False
|
||||
|
||||
self.logger.debug(f"Successful response for {test_name}: {response[:100]}...")
|
||||
@@ -329,24 +310,23 @@ if __name__ == "__main__":
|
||||
|
||||
return True
|
||||
|
||||
def _check_docker_logs_for_errors(self):
|
||||
"""Check Docker logs for any error messages that might explain failures"""
|
||||
def _check_server_logs_for_errors(self):
|
||||
"""Check server logs for any error messages that might explain failures"""
|
||||
try:
|
||||
# Get recent logs from the container
|
||||
result = subprocess.run(
|
||||
["docker", "logs", "--tail", "50", self.container_name], capture_output=True, text=True, timeout=10
|
||||
)
|
||||
# Get recent logs from the log file
|
||||
log_file_path = "logs/mcp_server.log"
|
||||
with open(log_file_path) as f:
|
||||
lines = f.readlines()
|
||||
recent_logs = lines[-50:] # Last 50 lines
|
||||
|
||||
if result.returncode == 0 and result.stderr:
|
||||
recent_logs = result.stderr.strip()
|
||||
if recent_logs:
|
||||
self.logger.info("Recent container logs:")
|
||||
for line in recent_logs.split("\n")[-10:]: # Last 10 lines
|
||||
if line.strip():
|
||||
self.logger.info(f" {line}")
|
||||
if recent_logs:
|
||||
self.logger.info("Recent server logs:")
|
||||
for line in recent_logs[-10:]: # Last 10 lines
|
||||
if line.strip():
|
||||
self.logger.info(f" {line.strip()}")
|
||||
|
||||
except Exception as e:
|
||||
self.logger.debug(f"Failed to check Docker logs: {e}")
|
||||
self.logger.debug(f"Failed to check server logs: {e}")
|
||||
|
||||
def validate_local_model_response(self, response: str) -> bool:
|
||||
"""Validate that response appears to come from a local model"""
|
||||
|
||||
@@ -8,7 +8,6 @@ Tests that verify the system correctly falls back to OpenRouter when:
|
||||
- Auto mode correctly selects OpenRouter models
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
|
||||
@@ -24,53 +23,28 @@ class OpenRouterFallbackTest(BaseSimulatorTest):
|
||||
def test_description(self) -> str:
|
||||
return "OpenRouter fallback behavior when only provider"
|
||||
|
||||
def get_recent_server_logs(self) -> str:
    """Return the tail of the server log file inside the container.

    Runs ``tail -n 300 /tmp/mcp_server.log`` via ``docker exec``.

    Returns:
        The command's stdout on a zero exit code. On a non-zero exit code a
        warning with the command's stderr is logged and ``""`` is returned;
        on any exception the error is logged and ``""`` is returned.
    """
    try:
        completed = subprocess.run(
            ["docker", "exec", self.container_name, "tail", "-n", "300", "/tmp/mcp_server.log"],
            capture_output=True,
            text=True,
        )

        # Guard clause: bail out early when tail could not read the file.
        if completed.returncode != 0:
            self.logger.warning(f"Failed to read server logs: {completed.stderr}")
            return ""

        return completed.stdout
    except Exception as e:
        self.logger.error(f"Failed to get server logs: {e}")
        return ""
|
||||
|
||||
def run_test(self) -> bool:
|
||||
"""Test OpenRouter fallback behavior"""
|
||||
try:
|
||||
self.logger.info("Test: OpenRouter fallback behavior when only provider available")
|
||||
|
||||
# Check if ONLY OpenRouter API key is configured (this is a fallback test)
|
||||
check_cmd = [
|
||||
"docker",
|
||||
"exec",
|
||||
self.container_name,
|
||||
"python",
|
||||
"-c",
|
||||
'import os; print("OPENROUTER_KEY:" + str(bool(os.environ.get("OPENROUTER_API_KEY"))) + "|GEMINI_KEY:" + str(bool(os.environ.get("GEMINI_API_KEY"))) + "|OPENAI_KEY:" + str(bool(os.environ.get("OPENAI_API_KEY"))))',
|
||||
]
|
||||
result = subprocess.run(check_cmd, capture_output=True, text=True)
|
||||
import os
|
||||
|
||||
if result.returncode == 0:
|
||||
output = result.stdout.strip()
|
||||
has_openrouter = "OPENROUTER_KEY:True" in output
|
||||
has_gemini = "GEMINI_KEY:True" in output
|
||||
has_openai = "OPENAI_KEY:True" in output
|
||||
has_openrouter = bool(os.environ.get("OPENROUTER_API_KEY"))
|
||||
has_gemini = bool(os.environ.get("GEMINI_API_KEY"))
|
||||
has_openai = bool(os.environ.get("OPENAI_API_KEY"))
|
||||
|
||||
if not has_openrouter:
|
||||
self.logger.info(" ⚠️ OpenRouter API key not configured - skipping test")
|
||||
self.logger.info(" ℹ️ This test requires OPENROUTER_API_KEY to be set in .env")
|
||||
return True # Return True to indicate test is skipped, not failed
|
||||
if not has_openrouter:
|
||||
self.logger.info(" ⚠️ OpenRouter API key not configured - skipping test")
|
||||
self.logger.info(" ℹ️ This test requires OPENROUTER_API_KEY to be set in .env")
|
||||
return True # Return True to indicate test is skipped, not failed
|
||||
|
||||
if has_gemini or has_openai:
|
||||
self.logger.info(" ⚠️ Other API keys configured - this is not a fallback scenario")
|
||||
self.logger.info(" ℹ️ This test requires ONLY OpenRouter to be configured (no Gemini/OpenAI keys)")
|
||||
self.logger.info(" ℹ️ Current setup has multiple providers, so fallback behavior doesn't apply")
|
||||
return True # Return True to indicate test is skipped, not failed
|
||||
if has_gemini or has_openai:
|
||||
self.logger.info(" ⚠️ Other API keys configured - this is not a fallback scenario")
|
||||
self.logger.info(" ℹ️ This test requires ONLY OpenRouter to be configured (no Gemini/OpenAI keys)")
|
||||
self.logger.info(" ℹ️ Current setup has multiple providers, so fallback behavior doesn't apply")
|
||||
return True # Return True to indicate test is skipped, not failed
|
||||
|
||||
# Setup test files
|
||||
self.setup_test_files()
|
||||
|
||||
@@ -9,7 +9,6 @@ Tests that verify OpenRouter functionality including:
|
||||
- Error handling when models are not available
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
|
||||
@@ -25,39 +24,17 @@ class OpenRouterModelsTest(BaseSimulatorTest):
|
||||
def test_description(self) -> str:
|
||||
return "OpenRouter model functionality and alias mapping"
|
||||
|
||||
def get_recent_server_logs(self) -> str:
    """Return the tail of the server log file inside the container.

    Runs ``tail -n 500 /tmp/mcp_server.log`` via ``docker exec``.

    Returns:
        The command's stdout on a zero exit code. On a non-zero exit code a
        warning with the command's stderr is logged and ``""`` is returned;
        on any exception the error is logged and ``""`` is returned.
    """
    try:
        completed = subprocess.run(
            ["docker", "exec", self.container_name, "tail", "-n", "500", "/tmp/mcp_server.log"],
            capture_output=True,
            text=True,
        )

        # Guard clause: bail out early when tail could not read the file.
        if completed.returncode != 0:
            self.logger.warning(f"Failed to read server logs: {completed.stderr}")
            return ""

        return completed.stdout
    except Exception as e:
        self.logger.error(f"Failed to get server logs: {e}")
        return ""
|
||||
|
||||
def run_test(self) -> bool:
|
||||
"""Test OpenRouter model functionality"""
|
||||
try:
|
||||
self.logger.info("Test: OpenRouter model functionality and alias mapping")
|
||||
|
||||
# Check if OpenRouter API key is configured
|
||||
check_cmd = [
|
||||
"docker",
|
||||
"exec",
|
||||
self.container_name,
|
||||
"python",
|
||||
"-c",
|
||||
'import os; print("OPENROUTER_KEY:" + str(bool(os.environ.get("OPENROUTER_API_KEY"))))',
|
||||
]
|
||||
result = subprocess.run(check_cmd, capture_output=True, text=True)
|
||||
import os
|
||||
|
||||
if result.returncode == 0 and "OPENROUTER_KEY:False" in result.stdout:
|
||||
has_openrouter = bool(os.environ.get("OPENROUTER_API_KEY"))
|
||||
|
||||
if not has_openrouter:
|
||||
self.logger.info(" ⚠️ OpenRouter API key not configured - skipping test")
|
||||
self.logger.info(" ℹ️ This test requires OPENROUTER_API_KEY to be set in .env")
|
||||
return True # Return True to indicate test is skipped, not failed
|
||||
|
||||
@@ -8,16 +8,15 @@ Validates that:
|
||||
1. Files are embedded only once in conversation history
|
||||
2. Continuation calls don't re-read existing files
|
||||
3. New files are still properly embedded
|
||||
4. Docker logs show deduplication behavior
|
||||
4. Server logs show deduplication behavior
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
from .conversation_base_test import ConversationBaseTest
|
||||
|
||||
|
||||
class PerToolDeduplicationTest(BaseSimulatorTest):
|
||||
class PerToolDeduplicationTest(ConversationBaseTest):
|
||||
"""Test file deduplication for each individual tool"""
|
||||
|
||||
@property
|
||||
@@ -28,74 +27,16 @@ class PerToolDeduplicationTest(BaseSimulatorTest):
|
||||
def test_description(self) -> str:
|
||||
return "File deduplication for individual tools"
|
||||
|
||||
def get_docker_logs_since(self, since_time: str) -> str:
    """Gather docker output and in-container log files for this test run.

    Four commands are executed in order: ``docker logs --since`` for the main
    container, ``docker logs --since`` for ``zen-mcp-log-monitor``, and a
    ``cat`` of ``/tmp/mcp_server.log`` and ``/tmp/mcp_activity.log`` inside
    the main container. Only the captured stdout of each command is used.

    Args:
        since_time: Value passed verbatim to ``docker logs --since``.

    Returns:
        The four stdout captures joined by newlines, or ``""`` if anything
        raised (the error is logged).
    """
    try:
        main_container = self.container_name

        # Order matters: server output, monitor output, server log file, activity log file.
        commands = [
            ["docker", "logs", "--since", since_time, main_container],
            ["docker", "logs", "--since", since_time, "zen-mcp-log-monitor"],
            ["docker", "exec", main_container, "cat", "/tmp/mcp_server.log"],
            ["docker", "exec", main_container, "cat", "/tmp/mcp_activity.log"],
        ]

        captured = []
        for command in commands:
            completed = subprocess.run(command, capture_output=True, text=True)
            captured.append(completed.stdout)

        return "\n".join(captured)
    except Exception as e:
        self.logger.error(f"Failed to get docker logs: {e}")
        return ""
|
||||
|
||||
# create_additional_test_file method now inherited from base class
|
||||
|
||||
def validate_file_deduplication_in_logs(self, logs: str, tool_name: str, test_file: str) -> bool:
    """Report whether ``logs`` contain deduplication evidence for ``tool_name``.

    Only lines containing both the 📁 marker and ``tool_name`` are considered.
    Evidence is any such line that also contains ``Filtering`` or ``skipping``.
    The outcome is logged through ``self.logger``. ``test_file`` is accepted
    but not consulted by this check.

    Returns:
        True when at least one filtering/skipping line was found, else False.
    """
    # Narrow down once to the lines every category shares.
    relevant = [entry for entry in logs.split("\n") if "📁" in entry and tool_name in entry]

    def mentioning(keyword: str) -> list:
        return [entry for entry in relevant if keyword in entry]

    # Filtering lines are reported before skipping lines.
    evidence = mentioning("Filtering") + mentioning("skipping")

    if not evidence:
        embedding_messages = mentioning("embedding")
        self.logger.warning(f" ⚠️ {tool_name}: No deduplication evidence found in logs")
        self.logger.debug(f" 📁 All embedding messages: {embedding_messages}")
        return False

    self.logger.info(f" ✅ {tool_name}: Found deduplication evidence in logs")
    for entry in evidence:
        self.logger.debug(f" 📁 {entry.strip()}")
    return True
|
||||
|
||||
def run_test(self) -> bool:
|
||||
"""Test file deduplication with realistic precommit/codereview workflow"""
|
||||
try:
|
||||
self.logger.info("📄 Test: Simplified file deduplication with precommit/codereview workflow")
|
||||
|
||||
# Setup test environment for conversation testing
|
||||
self.setUp()
|
||||
|
||||
# Setup test files
|
||||
self.setup_test_files()
|
||||
|
||||
@@ -126,7 +67,7 @@ def divide(x, y):
|
||||
"model": "flash",
|
||||
}
|
||||
|
||||
response1, continuation_id = self.call_mcp_tool("precommit", precommit_params)
|
||||
response1, continuation_id = self.call_mcp_tool_direct("precommit", precommit_params)
|
||||
if not response1:
|
||||
self.logger.error(" ❌ Step 1: precommit tool failed")
|
||||
return False
|
||||
@@ -151,7 +92,7 @@ def divide(x, y):
|
||||
"model": "flash",
|
||||
}
|
||||
|
||||
response2, _ = self.call_mcp_tool("codereview", codereview_params)
|
||||
response2, _ = self.call_mcp_tool_direct("codereview", codereview_params)
|
||||
if not response2:
|
||||
self.logger.error(" ❌ Step 2: codereview tool failed")
|
||||
return False
|
||||
@@ -181,16 +122,16 @@ def subtract(a, b):
|
||||
"model": "flash",
|
||||
}
|
||||
|
||||
response3, _ = self.call_mcp_tool("precommit", continue_params)
|
||||
response3, _ = self.call_mcp_tool_direct("precommit", continue_params)
|
||||
if not response3:
|
||||
self.logger.error(" ❌ Step 3: precommit continuation failed")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 3: precommit continuation completed")
|
||||
|
||||
# Validate results in docker logs
|
||||
# Validate results in server logs
|
||||
self.logger.info(" 📋 Validating conversation history and file deduplication...")
|
||||
logs = self.get_docker_logs_since(start_time)
|
||||
logs = self.get_server_logs_since(start_time)
|
||||
|
||||
# Check for conversation history building
|
||||
conversation_logs = [
|
||||
@@ -249,7 +190,7 @@ def subtract(a, b):
|
||||
return True
|
||||
else:
|
||||
self.logger.warning(" ⚠️ File deduplication workflow test: FAILED")
|
||||
self.logger.warning(" 💡 Check docker logs for detailed file embedding and continuation activity")
|
||||
self.logger.warning(" 💡 Check server logs for detailed file embedding and continuation activity")
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
|
||||
@@ -244,7 +244,7 @@ class PlannerContinuationHistoryTest(BaseSimulatorTest):
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Deployment strategy: Use Kubernetes for container orchestration with Helm charts. Implement CI/CD pipeline with GitOps. Use service mesh (Istio) for traffic management, monitoring, and security. Deploy databases in separate namespaces with backup automation.",
|
||||
"step": "Deployment strategy: Use Kubernetes for orchestration with Helm charts. Implement CI/CD pipeline with GitOps. Use service mesh (Istio) for traffic management, monitoring, and security. Deploy databases in separate namespaces with backup automation.",
|
||||
"step_number": 2,
|
||||
"total_steps": 2,
|
||||
"next_step_required": False, # Complete the session
|
||||
@@ -326,7 +326,7 @@ class PlannerContinuationHistoryTest(BaseSimulatorTest):
|
||||
return False
|
||||
|
||||
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
|
||||
"""Call an MCP tool via Claude CLI (docker exec) - override for planner-specific response handling"""
|
||||
"""Call an MCP tool via standalone server - override for planner-specific response handling"""
|
||||
# Use parent implementation to get the raw response
|
||||
response_text, _ = super().call_mcp_tool(tool_name, params)
|
||||
|
||||
|
||||
@@ -275,7 +275,7 @@ class PlannerValidationTest(BaseSimulatorTest):
|
||||
response3, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Revision: Actually, let me revise the Kubernetes approach. I'll use a simpler Docker Swarm deployment initially, then migrate to Kubernetes later.",
|
||||
"step": "Revision: Actually, let me revise the Kubernetes approach. I'll use a simpler deployment initially, then migrate to Kubernetes later.",
|
||||
"step_number": 3,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
@@ -311,7 +311,7 @@ class PlannerValidationTest(BaseSimulatorTest):
|
||||
return False
|
||||
|
||||
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
|
||||
"""Call an MCP tool via Claude CLI (docker exec) - override for planner-specific response handling"""
|
||||
"""Call an MCP tool via standalone server - override for planner-specific response handling"""
|
||||
# Use parent implementation to get the raw response
|
||||
response_text, _ = super().call_mcp_tool(tool_name, params)
|
||||
|
||||
|
||||
@@ -1,139 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Redis Conversation Memory Validation Test
|
||||
|
||||
Validates that conversation memory is working via Redis by checking
|
||||
for stored conversation threads and their content.
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
|
||||
|
||||
class RedisValidationTest(BaseSimulatorTest):
    """Validate that conversation memory is working via Redis.

    The test pings Redis through ``redis-cli`` inside the Redis container,
    then either inspects an existing ``thread:*`` key or performs a
    write/read round trip with a temporary probe key.
    """

    @property
    def test_name(self) -> str:
        return "redis_validation"

    @property
    def test_description(self) -> str:
        return "Redis conversation memory validation"

    def _redis_cli(self, *args: str):
        """Run ``redis-cli`` with ``args`` inside ``self.redis_container``.

        Returns whatever ``self.run_command`` returns; callers read its
        ``returncode`` and byte ``stdout`` attributes.
        """
        return self.run_command(
            ["docker", "exec", self.redis_container, "redis-cli", *args], capture_output=True
        )

    def _inspect_existing_thread(self, thread_key: str) -> bool:
        """Log the turn count of an already stored thread.

        The presence of the thread is the evidence being validated, so an
        unreadable or unparsable payload only produces a warning.
        """
        result = self._redis_cli("GET", thread_key)
        if result.returncode == 0:
            try:
                parsed = json.loads(result.stdout.decode())
                turns = parsed.get("turns", [])
                self.logger.info(f"✅ Thread has {len(turns)} turns")
            except json.JSONDecodeError:
                self.logger.warning("Could not parse thread data")
        return True

    def _validate_round_trip(self) -> bool:
        """Store a probe thread, read it back, and always delete it afterwards."""
        test_thread_id = "test_thread_validation"
        redis_key = f"thread:{test_thread_id}"
        test_data = {
            "thread_id": test_thread_id,
            "turns": [
                {"tool": "chat", "timestamp": "2025-06-11T16:30:00Z", "prompt": "Test validation prompt"}
            ],
        }

        store_result = self._redis_cli("SET", redis_key, json.dumps(test_data))
        if store_result.returncode != 0:
            self.logger.error("Failed to store test data in Redis")
            return False

        try:
            retrieve_result = self._redis_cli("GET", redis_key)
            if retrieve_result.returncode != 0:
                self.logger.error("Failed to retrieve test data from Redis")
                return False

            try:
                parsed = json.loads(retrieve_result.stdout.decode())
            except json.JSONDecodeError:
                self.logger.error("Could not parse retrieved test data")
                return False

            if parsed.get("thread_id") == test_thread_id:
                self.logger.info("✅ Redis read/write validation successful")
                return True

            self.logger.error("Retrieved data doesn't match stored data")
            return False
        finally:
            # Remove the probe key on every path. A leftover key would be
            # picked up as an "existing thread" by the next run and skip the
            # round-trip validation entirely.
            self._redis_cli("DEL", redis_key)

    def run_test(self) -> bool:
        """Validate that conversation memory is working via Redis.

        Returns:
            True when Redis answers the ping and either an existing thread is
            found or the probe round trip succeeds; False on any failure,
            including unexpected exceptions (which are logged).
        """
        try:
            self.logger.info("💾 Test: Validating conversation memory via Redis...")

            # First, test Redis connectivity
            ping_result = self._redis_cli("ping")
            if ping_result.returncode != 0:
                self.logger.error("Failed to connect to Redis")
                return False
            if "PONG" not in ping_result.stdout.decode():
                self.logger.error("Redis ping failed")
                return False
            self.logger.info("✅ Redis connectivity confirmed")

            # Check Redis for stored conversations
            result = self._redis_cli("KEYS", "thread:*")
            if result.returncode != 0:
                self.logger.error("Failed to query Redis")
                return False

            keys = result.stdout.decode().strip().split("\n")
            # Drop empty output and the literal pattern itself.
            thread_keys = [k for k in keys if k.startswith("thread:") and k != "thread:*"]

            if thread_keys:
                self.logger.info(f"✅ Found {len(thread_keys)} conversation threads in Redis")
                # Get details of first thread
                return self._inspect_existing_thread(thread_keys[0])

            # If no existing threads, create a test thread to validate Redis functionality
            self.logger.info(" No existing threads found, creating test thread to validate Redis...")
            return self._validate_round_trip()

        except Exception as e:
            self.logger.error(f"Conversation memory validation failed: {e}")
            return False
|
||||
@@ -241,35 +241,28 @@ def handle_everything(user_input, config, database):
|
||||
# Validate logs
|
||||
self.logger.info(" 📋 Validating execution logs...")
|
||||
|
||||
# Get server logs from the actual log file inside the container
|
||||
result = self.run_command(
|
||||
["docker", "exec", self.container_name, "tail", "-500", "/tmp/mcp_server.log"], capture_output=True
|
||||
)
|
||||
# Get server logs using inherited method
|
||||
logs = self.get_recent_server_logs(500)
|
||||
|
||||
if result.returncode == 0:
|
||||
logs = result.stdout.decode() + result.stderr.decode()
|
||||
# Look for refactor tool execution patterns
|
||||
refactor_patterns = [
|
||||
"[REFACTOR]",
|
||||
"refactor tool",
|
||||
"codesmells",
|
||||
"Token budget",
|
||||
"Code files embedded successfully",
|
||||
]
|
||||
|
||||
# Look for refactor tool execution patterns
|
||||
refactor_patterns = [
|
||||
"[REFACTOR]",
|
||||
"refactor tool",
|
||||
"codesmells",
|
||||
"Token budget",
|
||||
"Code files embedded successfully",
|
||||
]
|
||||
patterns_found = 0
|
||||
for pattern in refactor_patterns:
|
||||
if pattern in logs:
|
||||
patterns_found += 1
|
||||
self.logger.debug(f" ✅ Found log pattern: {pattern}")
|
||||
|
||||
patterns_found = 0
|
||||
for pattern in refactor_patterns:
|
||||
if pattern in logs:
|
||||
patterns_found += 1
|
||||
self.logger.debug(f" ✅ Found log pattern: {pattern}")
|
||||
|
||||
if patterns_found >= 3:
|
||||
self.logger.info(f" ✅ Log validation passed ({patterns_found}/{len(refactor_patterns)} patterns)")
|
||||
else:
|
||||
self.logger.warning(f" ⚠️ Only found {patterns_found}/{len(refactor_patterns)} log patterns")
|
||||
if patterns_found >= 3:
|
||||
self.logger.info(f" ✅ Log validation passed ({patterns_found}/{len(refactor_patterns)} patterns)")
|
||||
else:
|
||||
self.logger.warning(" ⚠️ Could not retrieve Docker logs")
|
||||
self.logger.warning(f" ⚠️ Only found {patterns_found}/{len(refactor_patterns)} log patterns")
|
||||
|
||||
self.logger.info(" ✅ Refactor tool validation completed successfully")
|
||||
return True
|
||||
|
||||
@@ -11,7 +11,6 @@ This test validates that:
|
||||
|
||||
import datetime
|
||||
import re
|
||||
import subprocess
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
|
||||
@@ -27,78 +26,6 @@ class TokenAllocationValidationTest(BaseSimulatorTest):
|
||||
def test_description(self) -> str:
|
||||
return "Token allocation and conversation history validation"
|
||||
|
||||
def get_recent_server_logs(self) -> str:
    """Return the last 300 lines of ``/tmp/mcp_server.log`` from the container.

    The file is read with ``tail`` via ``docker exec`` on
    ``self.container_name``. A non-zero exit status is logged as a warning
    and any exception as an error; both cases yield an empty string.
    """
    tail_command = ["docker", "exec", self.container_name, "tail", "-n", "300", "/tmp/mcp_server.log"]
    try:
        completed = subprocess.run(tail_command, capture_output=True, text=True)
    except Exception as exc:
        self.logger.error(f"Failed to get server logs: {exc}")
        return ""

    if completed.returncode != 0:
        self.logger.warning(f"Failed to read server logs: {completed.stderr}")
        return ""
    return completed.stdout
|
||||
|
||||
def extract_conversation_usage_logs(self, logs: str) -> list[dict[str, int]]:
    """Extract token-budget figures from conversation debug log output.

    Every line containing ``[CONVERSATION_DEBUG] Token budget calculation:``
    starts a record. The seven lines that follow are scanned for the labels
    ``Total capacity:``, ``Content allocation:``, ``Conversation tokens:`` and
    ``Remaining tokens:``; the number after a label (thousands separators
    removed) is stored under the matching snake_case key.

    Returns:
        One dict per header line that yielded at least one value, in log order.
    """
    header = "[CONVERSATION_DEBUG] Token budget calculation:"
    # (label to look for, regex extracting its number, key in the result dict)
    fields = (
        ("Total capacity:", r"Total capacity:\s*([\d,]+)", "total_capacity"),
        ("Content allocation:", r"Content allocation:\s*([\d,]+)", "content_allocation"),
        ("Conversation tokens:", r"Conversation tokens:\s*([\d,]+)", "conversation_tokens"),
        ("Remaining tokens:", r"Remaining tokens:\s*([\d,]+)", "remaining_tokens"),
    )

    all_lines = logs.split("\n")
    records = []

    for position, text in enumerate(all_lines):
        if header not in text:
            continue

        record = {}
        # The usage details live in the 7 lines right after the header.
        for detail in all_lines[position + 1 : position + 8]:
            for label, pattern, key in fields:
                if label in detail:
                    found = re.search(pattern, detail)
                    if found:
                        record[key] = int(found.group(1).replace(",", ""))
                    # Only the first label present on a line is considered.
                    break

        if record:
            records.append(record)

    return records
|
||||
|
||||
def extract_conversation_token_usage(self, logs: str) -> list[int]:
    """Return every ``Conversation history token usage`` figure found in ``logs``.

    Values are returned as ints, in order of appearance, with thousands
    separators removed.
    """
    raw_values = re.findall(r"Conversation history token usage:\s*([\d,]+)", logs)
    return [int(raw.replace(",", "")) for raw in raw_values]
|
||||
|
||||
def run_test(self) -> bool:
|
||||
"""Test token allocation and conversation history functionality"""
|
||||
try:
|
||||
|
||||
@@ -81,7 +81,7 @@ class VisionCapabilityTest(BaseSimulatorTest):
|
||||
"don't have access",
|
||||
"cannot see",
|
||||
"no image",
|
||||
"clarification_required",
|
||||
"files_required_to_continue",
|
||||
"image you're referring to",
|
||||
"supply the image",
|
||||
"error",
|
||||
@@ -122,7 +122,7 @@ class VisionCapabilityTest(BaseSimulatorTest):
|
||||
"don't have access",
|
||||
"cannot see",
|
||||
"no image",
|
||||
"clarification_required",
|
||||
"files_required_to_continue",
|
||||
"image you're referring to",
|
||||
"supply the image",
|
||||
"error",
|
||||
|
||||
@@ -9,7 +9,6 @@ Tests that verify X.AI GROK functionality including:
|
||||
- API integration and response validation
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
|
||||
@@ -25,44 +24,18 @@ class XAIModelsTest(BaseSimulatorTest):
|
||||
def test_description(self) -> str:
|
||||
return "X.AI GROK model functionality and integration"
|
||||
|
||||
def get_recent_server_logs(self) -> str:
    """Return the last 500 lines of ``/tmp/mcp_server.log`` from the container.

    The log file is read directly with ``tail`` through ``docker exec`` on
    ``self.container_name``. A failing command is logged as a warning and an
    exception as an error; in both cases the result is an empty string.
    """
    try:
        # Read logs directly from the log file
        outcome = subprocess.run(
            ["docker", "exec", self.container_name, "tail", "-n", "500", "/tmp/mcp_server.log"],
            capture_output=True,
            text=True,
        )
        succeeded = outcome.returncode == 0
        if not succeeded:
            self.logger.warning(f"Failed to read server logs: {outcome.stderr}")
        return outcome.stdout if succeeded else ""
    except Exception as exc:
        self.logger.error(f"Failed to get server logs: {exc}")
        return ""
|
||||
|
||||
def run_test(self) -> bool:
|
||||
"""Test X.AI GROK model functionality"""
|
||||
try:
|
||||
self.logger.info("Test: X.AI GROK model functionality and integration")
|
||||
|
||||
# Check if X.AI API key is configured and not empty
|
||||
check_cmd = [
|
||||
"docker",
|
||||
"exec",
|
||||
self.container_name,
|
||||
"python",
|
||||
"-c",
|
||||
"""
|
||||
import os
|
||||
xai_key = os.environ.get("XAI_API_KEY", "")
|
||||
is_valid = bool(xai_key and xai_key != "your_xai_api_key_here" and xai_key.strip())
|
||||
print(f"XAI_KEY_VALID:{is_valid}")
|
||||
""".strip(),
|
||||
]
|
||||
result = subprocess.run(check_cmd, capture_output=True, text=True)
|
||||
import os
|
||||
|
||||
if result.returncode == 0 and "XAI_KEY_VALID:False" in result.stdout:
|
||||
xai_key = os.environ.get("XAI_API_KEY", "")
|
||||
is_valid = bool(xai_key and xai_key != "your_xai_api_key_here" and xai_key.strip())
|
||||
|
||||
if not is_valid:
|
||||
self.logger.info(" ⚠️ X.AI API key not configured or empty - skipping test")
|
||||
self.logger.info(" ℹ️ This test requires XAI_API_KEY to be set in .env with a valid key")
|
||||
return True # Return True to indicate test is skipped, not failed
|
||||
|
||||
Reference in New Issue
Block a user