Migration from Docker to Standalone Python Server (#73)

* Migration from docker to standalone server
Migration handling
Fixed tests
Use simpler in-memory storage
Support for concurrent logging to disk
Simplified direct connections to localhost

* Migration from docker / redis to standalone script
Updated tests
Updated run script
Fixed requirements
Use dotenv
Ask once whether the user would like to install the MCP server in Claude Desktop
Updated docs

* More cleanup and references to docker removed

* Cleanup

* Comments

* Fixed tests

* Fix GitHub Actions workflow for standalone Python architecture

- Install requirements-dev.txt for pytest and testing dependencies
- Remove Docker setup from simulation tests (now standalone)
- Simplify linting job to use requirements-dev.txt
- Update simulation tests to run directly without Docker

Fixes unit test failures in CI due to missing pytest dependency.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

* Remove simulation tests from GitHub Actions

- Removed simulation-tests job that makes real API calls
- Keep only unit tests (mocked, no API costs) and linting
- Simulation tests should be run manually with real API keys
- Reduces CI costs and complexity

GitHub Actions now only runs:
- Unit tests (569 tests, all mocked)
- Code quality checks (ruff, black)

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

* Fixed tests

* Fixed tests

---------

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Beehive Innovations
2025-06-18 23:41:22 +04:00
committed by GitHub
parent 9d72545ecd
commit 4151c3c3a5
121 changed files with 2842 additions and 3168 deletions

View File

@@ -26,7 +26,8 @@ from .test_openrouter_models import OpenRouterModelsTest
from .test_per_tool_deduplication import PerToolDeduplicationTest
from .test_planner_continuation_history import PlannerContinuationHistoryTest
from .test_planner_validation import PlannerValidationTest
from .test_redis_validation import RedisValidationTest
# Redis validation test removed - no longer needed for standalone server
from .test_refactor_validation import RefactorValidationTest
from .test_testgen_validation import TestGenValidationTest
from .test_token_allocation_validation import TokenAllocationValidationTest
@@ -42,7 +43,7 @@ TEST_REGISTRY = {
"cross_tool_comprehensive": CrossToolComprehensiveTest,
"line_number_validation": LineNumberValidationTest,
"logs_validation": LogsValidationTest,
"redis_validation": RedisValidationTest,
# "redis_validation": RedisValidationTest, # Removed - no longer needed for standalone server
"model_thinking_config": TestModelThinkingConfig,
"o3_model_selection": O3ModelSelectionTest,
"ollama_custom_url": OllamaCustomUrlTest,
@@ -72,7 +73,7 @@ __all__ = [
"CrossToolComprehensiveTest",
"LineNumberValidationTest",
"LogsValidationTest",
"RedisValidationTest",
# "RedisValidationTest", # Removed - no longer needed for standalone server
"TestModelThinkingConfig",
"O3ModelSelectionTest",
"O3ProExpensiveTest",

View File

@@ -11,6 +11,8 @@ import os
import subprocess
from typing import Optional
from .log_utils import LogUtils
class BaseSimulatorTest:
"""Base class for all communication simulator tests"""
@@ -19,14 +21,25 @@ class BaseSimulatorTest:
self.verbose = verbose
self.test_files = {}
self.test_dir = None
self.container_name = "zen-mcp-server"
self.redis_container = "zen-mcp-redis"
self.python_path = self._get_python_path()
# Configure logging
log_level = logging.DEBUG if verbose else logging.INFO
logging.basicConfig(level=log_level, format="%(asctime)s - %(levelname)s - %(message)s")
self.logger = logging.getLogger(self.__class__.__name__)
def _get_python_path(self) -> str:
"""Get the Python path for the virtual environment"""
current_dir = os.getcwd()
venv_python = os.path.join(current_dir, ".zen_venv", "bin", "python")
if os.path.exists(venv_python):
return venv_python
# Fallback to system python if venv doesn't exist
self.logger.warning("Virtual environment not found, using system python")
return "python"
def setup_test_files(self):
"""Create test files for the simulation"""
# Test Python file
@@ -100,7 +113,7 @@ class Calculator:
self.logger.debug(f"Created test files with absolute paths: {list(self.test_files.values())}")
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
"""Call an MCP tool via Claude CLI (docker exec)"""
"""Call an MCP tool via standalone server"""
try:
# Prepare the MCP initialization and tool call sequence
init_request = {
@@ -131,8 +144,8 @@ class Calculator:
# Join with newlines as MCP expects
input_data = "\n".join(messages) + "\n"
# Simulate Claude CLI calling the MCP server via docker exec
docker_cmd = ["docker", "exec", "-i", self.container_name, "python", "server.py"]
# Call the standalone MCP server directly
server_cmd = [self.python_path, "server.py"]
self.logger.debug(f"Calling MCP tool {tool_name} with proper initialization")
@@ -140,7 +153,7 @@ class Calculator:
# For consensus tool and other long-running tools, we need to ensure
# the subprocess doesn't close prematurely
result = subprocess.run(
docker_cmd,
server_cmd,
input=input_data,
text=True,
capture_output=True,
@@ -149,7 +162,7 @@ class Calculator:
)
if result.returncode != 0:
self.logger.error(f"Docker exec failed with return code {result.returncode}")
self.logger.error(f"Standalone server failed with return code {result.returncode}")
self.logger.error(f"Stderr: {result.stderr}")
# Still try to parse stdout as the response might have been written before the error
self.logger.debug(f"Attempting to parse stdout despite error: {result.stdout[:500]}")
@@ -263,6 +276,56 @@ class Calculator:
shutil.rmtree(self.test_dir)
self.logger.debug(f"Removed test files directory: {self.test_dir}")
# ============================================================================
# Log Utility Methods (delegate to LogUtils)
# ============================================================================
def get_server_logs_since(self, since_time: Optional[str] = None) -> str:
"""Get server logs from both main and activity log files."""
return LogUtils.get_server_logs_since(since_time)
def get_recent_server_logs(self, lines: int = 500) -> str:
"""Get recent server logs from the main log file."""
return LogUtils.get_recent_server_logs(lines)
def get_server_logs_subprocess(self, lines: int = 500) -> str:
"""Get server logs using subprocess (alternative method)."""
return LogUtils.get_server_logs_subprocess(lines)
def check_server_logs_for_errors(self, lines: int = 500) -> list[str]:
"""Check server logs for error messages."""
return LogUtils.check_server_logs_for_errors(lines)
def extract_conversation_usage_logs(self, logs: str) -> list[dict[str, int]]:
"""Extract token budget calculation information from logs."""
return LogUtils.extract_conversation_usage_logs(logs)
def extract_conversation_token_usage(self, logs: str) -> list[int]:
"""Extract conversation token usage values from logs."""
return LogUtils.extract_conversation_token_usage(logs)
def extract_thread_creation_logs(self, logs: str) -> list[dict[str, str]]:
"""Extract thread creation logs with parent relationships."""
return LogUtils.extract_thread_creation_logs(logs)
def extract_history_traversal_logs(self, logs: str) -> list[dict[str, any]]:
"""Extract conversation history traversal logs."""
return LogUtils.extract_history_traversal_logs(logs)
def validate_file_deduplication_in_logs(self, logs: str, tool_name: str, test_file: str) -> bool:
"""Validate that logs show file deduplication behavior."""
return LogUtils.validate_file_deduplication_in_logs(logs, tool_name, test_file)
def search_logs_for_pattern(
self, pattern: str, logs: Optional[str] = None, case_sensitive: bool = False
) -> list[str]:
"""Search logs for a specific pattern."""
return LogUtils.search_logs_for_pattern(pattern, logs, case_sensitive)
def get_log_file_info(self) -> dict[str, dict[str, any]]:
"""Get information about log files."""
return LogUtils.get_log_file_info()
def run_test(self) -> bool:
"""Run the test - to be implemented by subclasses"""
raise NotImplementedError("Subclasses must implement run_test()")

View File

@@ -0,0 +1,216 @@
#!/usr/bin/env python3
"""
Conversation Base Test Class for In-Process MCP Tool Testing
This class enables testing MCP tools within the same process to maintain conversation
memory state across tool calls. Unlike BaseSimulatorTest which runs each tool call
as a separate subprocess (losing memory state), this class calls tools directly
in-process, allowing conversation functionality to work correctly.
USAGE:
- Inherit from ConversationBaseTest instead of BaseSimulatorTest for conversation tests
- Use call_mcp_tool_direct() to call tools in-process
- Conversation memory persists across tool calls within the same test
- setUp() clears memory between test methods for proper isolation
EXAMPLE:
class TestConversationFeature(ConversationBaseTest):
def test_cross_tool_continuation(self):
# Step 1: Call precommit tool
result1, continuation_id = self.call_mcp_tool_direct("precommit", {
"path": "/path/to/repo",
"prompt": "Review these changes"
})
# Step 2: Continue with codereview tool - memory is preserved!
result2, _ = self.call_mcp_tool_direct("codereview", {
"files": ["/path/to/file.py"],
"prompt": "Focus on security issues",
"continuation_id": continuation_id
})
"""
import asyncio
import json
from typing import Optional
from .base_test import BaseSimulatorTest
class ConversationBaseTest(BaseSimulatorTest):
    """Base class for conversation tests that require in-process tool calling.

    Unlike BaseSimulatorTest, which runs each tool call as a separate server
    subprocess (losing in-memory conversation state between calls), this class
    executes tools directly in the current process so conversation memory
    persists across call_mcp_tool_direct() calls within a single test.
    """

    def __init__(self, verbose: bool = False):
        super().__init__(verbose)
        self._tools = None  # lazily imported from server.py in setUp()
        self._loop = None  # cached asyncio event loop for tool execution

    def setUp(self):
        """Set up the test environment; clears conversation memory between tests."""
        super().setup_test_files()

        # Clear conversation memory for test isolation
        self._clear_conversation_memory()

        # Import tools from server.py for in-process calling (once per instance)
        if self._tools is None:
            self._import_tools()

    def _clear_conversation_memory(self):
        """Clear all conversation memory to ensure test isolation.

        Best-effort: failures are logged as warnings rather than raised, so a
        missing or alternate storage backend does not abort the test.
        """
        try:
            from utils.storage_backend import get_storage_backend

            storage = get_storage_backend()
            # Clear all stored conversation threads.
            # NOTE(review): reaches into the backend's private _lock/_store;
            # assumes the in-memory storage implementation -- confirm if the
            # backend ever changes.
            with storage._lock:
                storage._store.clear()
            self.logger.debug("Cleared conversation memory for test isolation")
        except Exception as e:
            self.logger.warning(f"Could not clear conversation memory: {e}")

    def _import_tools(self):
        """Import tools from server.py for direct calling.

        Raises:
            RuntimeError: if server.py (or one of its dependencies) cannot be
                imported; the original ImportError is chained as the cause.
        """
        try:
            import os
            import sys

            # Add project root to Python path if not already there
            project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            if project_root not in sys.path:
                sys.path.insert(0, project_root)

            # Import tools from server
            from server import TOOLS

            self._tools = TOOLS
            self.logger.debug(f"Imported {len(self._tools)} tools for in-process testing")
        except ImportError as e:
            # Chain the cause so the underlying import failure stays visible.
            raise RuntimeError(f"Could not import tools from server.py: {e}") from e

    def _get_event_loop(self):
        """Get or create the event loop used for async tool execution.

        The loop is cached so every tool call in a test shares one loop.
        """
        if self._loop is None:
            try:
                self._loop = asyncio.get_event_loop()
            except RuntimeError:
                # No loop in this thread: create one and install it.
                self._loop = asyncio.new_event_loop()
                asyncio.set_event_loop(self._loop)
        return self._loop

    def call_mcp_tool_direct(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
        """Call an MCP tool directly in-process without subprocess isolation.

        This method maintains conversation memory across calls, enabling proper
        testing of conversation functionality.

        Args:
            tool_name: Name of the tool to call (e.g., "precommit", "codereview")
            params: Parameters to pass to the tool

        Returns:
            tuple: (response_content, continuation_id) where continuation_id
                can be used for follow-up calls; (None, None) on failure.
        """
        if self._tools is None:
            raise RuntimeError("Tools not imported. Call setUp() first.")

        if tool_name not in self._tools:
            raise ValueError(f"Tool '{tool_name}' not found. Available: {list(self._tools.keys())}")

        try:
            tool = self._tools[tool_name]
            self.logger.debug(f"Calling tool '{tool_name}' directly in-process")

            # Set up minimal model context if not provided
            if "model" not in params:
                params["model"] = "flash"  # Use fast model for testing

            # Execute tool directly using asyncio
            loop = self._get_event_loop()

            # Import required modules for model resolution (similar to server.py)
            from config import DEFAULT_MODEL
            from providers.registry import ModelProviderRegistry
            from utils.model_context import ModelContext

            # Resolve model (simplified version of server.py logic)
            model_name = params.get("model", DEFAULT_MODEL)
            provider = ModelProviderRegistry.get_provider_for_model(model_name)
            if not provider:
                # Fallback to any available model so tests run regardless of config
                available_models = list(ModelProviderRegistry.get_available_models(respect_restrictions=True).keys())
                if available_models:
                    model_name = available_models[0]
                    params["model"] = model_name
                    self.logger.debug(f"Using fallback model for testing: {model_name}")

            # Create model context and pass it the way server.py would
            model_context = ModelContext(model_name)
            params["_model_context"] = model_context
            params["_resolved_model_name"] = model_name

            # Execute tool asynchronously
            result = loop.run_until_complete(tool.execute(params))

            if not result or len(result) == 0:
                return None, None

            # Extract response content
            response_text = result[0].text if hasattr(result[0], "text") else str(result[0])

            # Parse response to extract continuation_id
            continuation_id = self._extract_continuation_id_from_response(response_text)

            self.logger.debug(f"Tool '{tool_name}' completed successfully in-process")
            return response_text, continuation_id

        except Exception as e:
            # Best-effort: report failure via the (None, None) contract
            self.logger.error(f"Direct tool call failed for '{tool_name}': {e}")
            return None, None

    def _extract_continuation_id_from_response(self, response_text: str) -> Optional[str]:
        """Extract a continuation_id from a tool's JSON response, if present.

        Returns None when the response is not JSON or carries no continuation
        metadata.
        """
        try:
            # Parse the response as JSON to look for continuation metadata
            response_data = json.loads(response_text)

            # Look for continuation_id in various places
            if isinstance(response_data, dict):
                # Check metadata
                metadata = response_data.get("metadata", {})
                if "thread_id" in metadata:
                    return metadata["thread_id"]

                # Check continuation_offer
                continuation_offer = response_data.get("continuation_offer", {})
                if continuation_offer and "continuation_id" in continuation_offer:
                    return continuation_offer["continuation_id"]

                # Check follow_up_request
                follow_up = response_data.get("follow_up_request", {})
                if follow_up and "continuation_id" in follow_up:
                    return follow_up["continuation_id"]

            return None

        except (json.JSONDecodeError, AttributeError):
            # If response is not JSON or doesn't have expected structure, return None
            return None

    def tearDown(self):
        """Clean up after the test: remove test files and clear memory again."""
        super().cleanup_test_files()
        # Clear memory again for good measure
        self._clear_conversation_memory()

    @property
    def test_name(self) -> str:
        """Get the test name."""
        return self.__class__.__name__

    @property
    def test_description(self) -> str:
        """Get the test description."""
        return "In-process conversation test"

View File

@@ -0,0 +1,316 @@
"""
Centralized log utility for simulator tests.
This module provides common log reading and parsing functionality
used across multiple simulator test files to reduce code duplication.
"""
import logging
import re
import subprocess
from typing import Optional, Union
class LogUtils:
    """Centralized logging utilities for simulator tests.

    All methods are classmethods operating either on log text passed in or on
    the log files named below (paths relative to the repo root tests run from).
    """

    # Log file paths
    MAIN_LOG_FILE = "logs/mcp_server.log"
    ACTIVITY_LOG_FILE = "logs/mcp_activity.log"

    @classmethod
    def get_server_logs_since(cls, since_time: Optional[str] = None) -> str:
        """
        Get server logs from both main and activity log files.

        Args:
            since_time: Currently ignored, returns all available logs

        Returns:
            Combined logs from both log files (empty string on failure)
        """
        try:
            main_logs = ""
            activity_logs = ""

            # Read main server log (a missing file is not an error)
            try:
                with open(cls.MAIN_LOG_FILE) as f:
                    main_logs = f.read()
            except FileNotFoundError:
                pass

            # Read activity log
            try:
                with open(cls.ACTIVITY_LOG_FILE) as f:
                    activity_logs = f.read()
            except FileNotFoundError:
                pass

            return main_logs + "\n" + activity_logs
        except Exception as e:
            logging.warning(f"Failed to read server logs: {e}")
            return ""

    @classmethod
    def get_recent_server_logs(cls, lines: int = 500) -> str:
        """
        Get recent server logs from the main log file.

        Args:
            lines: Number of recent lines to retrieve (default: 500)

        Returns:
            Recent log content as string (empty string if unavailable)
        """
        try:
            with open(cls.MAIN_LOG_FILE) as f:
                all_lines = f.readlines()
                recent_lines = all_lines[-lines:] if len(all_lines) > lines else all_lines
                return "".join(recent_lines)
        except FileNotFoundError:
            logging.warning(f"Log file {cls.MAIN_LOG_FILE} not found")
            return ""
        except Exception as e:
            logging.warning(f"Failed to read recent server logs: {e}")
            return ""

    @classmethod
    def get_server_logs_subprocess(cls, lines: int = 500) -> str:
        """
        Get server logs using subprocess (alternative method using `tail`).

        Args:
            lines: Number of recent lines to retrieve

        Returns:
            Recent log content as string (stdout + stderr of `tail`)
        """
        try:
            result = subprocess.run(
                ["tail", "-n", str(lines), cls.MAIN_LOG_FILE], capture_output=True, text=True, timeout=10
            )
            return result.stdout + result.stderr
        except Exception as e:
            logging.warning(f"Failed to get server logs via subprocess: {e}")
            return ""

    @classmethod
    def check_server_logs_for_errors(cls, lines: int = 500) -> list[str]:
        """
        Check server logs for error messages.

        Args:
            lines: Number of recent lines to check

        Returns:
            List of log lines that match any error pattern
        """
        logs = cls.get_recent_server_logs(lines)
        error_patterns = [r"ERROR.*", r"CRITICAL.*", r"Failed.*", r"Exception.*", r"Error:.*"]

        errors = []
        for line in logs.split("\n"):
            for pattern in error_patterns:
                if re.search(pattern, line, re.IGNORECASE):
                    errors.append(line.strip())
                    break  # one match per line is enough

        return errors

    @classmethod
    def extract_conversation_usage_logs(cls, logs: str) -> list[dict[str, int]]:
        """
        Extract token budget calculation information from logs.

        Args:
            logs: Log content to parse

        Returns:
            List of dictionaries containing token usage data (keys present
            only when the corresponding value appears on the log line)
        """
        usage_data = []
        pattern = r"\[CONVERSATION_DEBUG\] Token budget calculation:"

        for line in logs.split("\n"):
            if re.search(pattern, line):
                # Parse the token usage information from the same line;
                # numbers may contain thousands separators (e.g. "1,234")
                usage_info = {}

                # Extract total capacity
                capacity_match = re.search(r"Total capacity: ([\d,]+)", line)
                if capacity_match:
                    usage_info["total_capacity"] = int(capacity_match.group(1).replace(",", ""))

                # Extract content allocation
                content_match = re.search(r"Content allocation: ([\d,]+)", line)
                if content_match:
                    usage_info["content_allocation"] = int(content_match.group(1).replace(",", ""))

                # Extract conversation tokens
                conv_match = re.search(r"Conversation tokens: ([\d,]+)", line)
                if conv_match:
                    usage_info["conversation_tokens"] = int(conv_match.group(1).replace(",", ""))

                # Extract remaining tokens
                remaining_match = re.search(r"Remaining tokens: ([\d,]+)", line)
                if remaining_match:
                    usage_info["remaining_tokens"] = int(remaining_match.group(1).replace(",", ""))

                if usage_info:
                    usage_data.append(usage_info)

        return usage_data

    @classmethod
    def extract_conversation_token_usage(cls, logs: str) -> list[int]:
        """
        Extract conversation token usage values from logs.

        Args:
            logs: Log content to parse

        Returns:
            List of token usage values (thousands separators stripped)
        """
        pattern = r"Conversation history token usage:\s*([\d,]+)"
        usage_values = []

        for match in re.finditer(pattern, logs):
            usage_value = int(match.group(1).replace(",", ""))
            usage_values.append(usage_value)

        return usage_values

    @classmethod
    def extract_thread_creation_logs(cls, logs: str) -> list[dict[str, str]]:
        """
        Extract thread creation logs with parent relationships.

        Args:
            logs: Log content to parse

        Returns:
            List of dicts with "thread_id" and "parent_id" (None if no parent)
        """
        thread_data = []
        # Thread IDs are UUIDs containing hyphens, so [\w-]+ is required;
        # \w+ alone would truncate the ID at the first hyphen and miss the
        # "with parent" clause entirely.
        pattern = r"\[THREAD\] Created new thread ([\w-]+?)(?: with parent ([\w-]+))?(?=\s|$)"

        for match in re.finditer(pattern, logs):
            thread_info = {"thread_id": match.group(1), "parent_id": match.group(2) if match.group(2) else None}
            thread_data.append(thread_info)

        return thread_data

    @classmethod
    def extract_history_traversal_logs(cls, logs: str) -> list[dict[str, Union[str, int]]]:
        """
        Extract conversation history traversal logs.

        Args:
            logs: Log content to parse

        Returns:
            List of dicts with "chain_length" (int) and "thread_id" (str)
        """
        traversal_data = []
        # [\w-]+ so hyphenated UUID thread IDs are captured in full
        pattern = r"\[THREAD\] Retrieved chain of (\d+) messages for thread ([\w-]+)"

        for match in re.finditer(pattern, logs):
            traversal_info = {"chain_length": int(match.group(1)), "thread_id": match.group(2)}
            traversal_data.append(traversal_info)

        return traversal_data

    @classmethod
    def validate_file_deduplication_in_logs(cls, logs: str, tool_name: str, test_file: str) -> bool:
        """
        Validate that logs show file deduplication behavior.

        Args:
            logs: Log content to parse
            tool_name: Name of the tool being tested
            test_file: Name of the test file to check for deduplication

        Returns:
            True if deduplication evidence is found, False otherwise
        """
        # Escape the interpolated values: filenames contain "." (regex "any
        # char"), so unescaped patterns would also match unrelated names.
        escaped_file = re.escape(test_file)

        # Look for embedding calculation
        embedding_pattern = f"Calculating embeddings for {escaped_file}"
        has_embedding = bool(re.search(embedding_pattern, logs))

        # Look for filtering message
        filtering_pattern = f"Filtering {escaped_file} to prevent duplication"
        has_filtering = bool(re.search(filtering_pattern, logs))

        # Look for skip message
        skip_pattern = f"Skipping {escaped_file} \\(already processed"
        has_skip = bool(re.search(skip_pattern, logs))

        # Look for tool-specific processing
        tool_pattern = f"\\[{re.escape(tool_name.upper())}\\].*{escaped_file}"
        has_tool_processing = bool(re.search(tool_pattern, logs, re.IGNORECASE))

        # Deduplication is confirmed if we see evidence of processing and filtering/skipping
        return has_embedding and (has_filtering or has_skip) and has_tool_processing

    @classmethod
    def search_logs_for_pattern(
        cls, pattern: str, logs: Optional[str] = None, case_sensitive: bool = False
    ) -> list[str]:
        """
        Search logs for a specific pattern.

        Args:
            pattern: Regex pattern to search for
            logs: Log content to search (if None, reads recent logs)
            case_sensitive: Whether the search should be case sensitive

        Returns:
            List of matching lines (stripped)
        """
        if logs is None:
            logs = cls.get_recent_server_logs()

        flags = 0 if case_sensitive else re.IGNORECASE
        matches = []

        for line in logs.split("\n"):
            if re.search(pattern, line, flags):
                matches.append(line.strip())

        return matches

    @classmethod
    def get_log_file_info(cls) -> dict[str, dict[str, Union[str, int, bool]]]:
        """
        Get information about log files.

        Returns:
            Dictionary keyed by log file path with existence/size/mtime info
        """
        import os  # local import: os is not used elsewhere in this module

        file_info = {}

        for log_file in [cls.MAIN_LOG_FILE, cls.ACTIVITY_LOG_FILE]:
            if os.path.exists(log_file):
                stat = os.stat(log_file)
                file_info[log_file] = {
                    "exists": True,
                    "size_bytes": stat.st_size,
                    "size_mb": round(stat.st_size / (1024 * 1024), 2),
                    "last_modified": stat.st_mtime,
                    "readable": os.access(log_file, os.R_OK),
                }
            else:
                file_info[log_file] = {
                    "exists": False,
                    "size_bytes": 0,
                    "size_mb": 0,
                    "last_modified": 0,
                    "readable": False,
                }

        return file_info

View File

@@ -7,7 +7,6 @@ and builds conversation context correctly when using continuation_id.
"""
import json
import subprocess
from .base_test import BaseSimulatorTest
@@ -23,19 +22,16 @@ class TestConsensusConversation(BaseSimulatorTest):
def test_description(self) -> str:
return "Test consensus tool conversation building and continuation"
def get_docker_logs(self):
"""Get Docker container logs"""
def get_server_logs(self):
"""Get server logs from local log file"""
try:
result = subprocess.run(
["docker", "logs", "--tail", "100", self.container_name], capture_output=True, text=True, timeout=30
)
if result.returncode == 0:
return result.stdout.split("\n")
else:
self.logger.warning(f"Failed to get Docker logs: {result.stderr}")
return []
log_file_path = "logs/mcp_server.log"
with open(log_file_path) as f:
lines = f.readlines()
# Return last 100 lines
return [line.strip() for line in lines[-100:]]
except Exception as e:
self.logger.warning(f"Exception getting Docker logs: {e}")
self.logger.warning(f"Exception getting server logs: {e}")
return []
def run_test(self) -> bool:
@@ -121,9 +117,9 @@ class TestConsensusConversation(BaseSimulatorTest):
self.logger.info("Phase 3: Checking server logs for conversation building")
# Check for conversation-related log entries
logs = self.get_docker_logs()
logs = self.get_server_logs()
if not logs:
self.logger.warning("Could not retrieve Docker logs for verification")
self.logger.warning("Could not retrieve server logs for verification")
else:
# Look for conversation building indicators
conversation_logs = [

View File

@@ -22,42 +22,6 @@ class ContentValidationTest(BaseSimulatorTest):
def test_description(self) -> str:
return "Content validation and duplicate detection"
def get_docker_logs_since(self, since_time: str) -> str:
"""Get docker logs since a specific timestamp"""
try:
# Check both main server and log monitor for comprehensive logs
cmd_server = ["docker", "logs", "--since", since_time, self.container_name]
cmd_monitor = ["docker", "logs", "--since", since_time, "zen-mcp-log-monitor"]
import subprocess
result_server = subprocess.run(cmd_server, capture_output=True, text=True)
result_monitor = subprocess.run(cmd_monitor, capture_output=True, text=True)
# Get the internal log files which have more detailed logging
server_log_result = subprocess.run(
["docker", "exec", self.container_name, "cat", "/tmp/mcp_server.log"], capture_output=True, text=True
)
activity_log_result = subprocess.run(
["docker", "exec", self.container_name, "cat", "/tmp/mcp_activity.log"], capture_output=True, text=True
)
# Combine all logs
combined_logs = (
result_server.stdout
+ "\n"
+ result_monitor.stdout
+ "\n"
+ server_log_result.stdout
+ "\n"
+ activity_log_result.stdout
)
return combined_logs
except Exception as e:
self.logger.error(f"Failed to get docker logs: {e}")
return ""
def run_test(self) -> bool:
"""Test that file processing system properly handles file deduplication"""
try:
@@ -151,9 +115,9 @@ DATABASE_CONFIG = {
else:
self.logger.warning(" ⚠️ Different tool failed")
# Validate file processing behavior from Docker logs
# Validate file processing behavior from server logs
self.logger.info(" 4: Validating file processing logs")
logs = self.get_docker_logs_since(start_time)
logs = self.get_server_logs_since(start_time)
# Check for proper file embedding logs
embedding_logs = [

View File

@@ -21,8 +21,6 @@ This validates the conversation threading system's ability to:
- Properly traverse parent relationships for history reconstruction
"""
import re
import subprocess
from .base_test import BaseSimulatorTest
@@ -38,53 +36,6 @@ class ConversationChainValidationTest(BaseSimulatorTest):
def test_description(self) -> str:
return "Conversation chain and threading validation"
def get_recent_server_logs(self) -> str:
"""Get recent server logs from the log file directly"""
try:
cmd = ["docker", "exec", self.container_name, "tail", "-n", "500", "/tmp/mcp_server.log"]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode == 0:
return result.stdout
else:
self.logger.warning(f"Failed to read server logs: {result.stderr}")
return ""
except Exception as e:
self.logger.error(f"Failed to get server logs: {e}")
return ""
def extract_thread_creation_logs(self, logs: str) -> list[dict[str, str]]:
"""Extract thread creation logs with parent relationships"""
thread_logs = []
lines = logs.split("\n")
for line in lines:
if "[THREAD] Created new thread" in line:
# Parse: [THREAD] Created new thread 9dc779eb-645f-4850-9659-34c0e6978d73 with parent a0ce754d-c995-4b3e-9103-88af429455aa
match = re.search(r"\[THREAD\] Created new thread ([a-f0-9-]+) with parent ([a-f0-9-]+|None)", line)
if match:
thread_id = match.group(1)
parent_id = match.group(2) if match.group(2) != "None" else None
thread_logs.append({"thread_id": thread_id, "parent_id": parent_id, "log_line": line})
return thread_logs
def extract_history_traversal_logs(self, logs: str) -> list[dict[str, str]]:
"""Extract conversation history traversal logs"""
traversal_logs = []
lines = logs.split("\n")
for line in lines:
if "[THREAD] Retrieved chain of" in line:
# Parse: [THREAD] Retrieved chain of 3 threads for 9dc779eb-645f-4850-9659-34c0e6978d73
match = re.search(r"\[THREAD\] Retrieved chain of (\d+) threads for ([a-f0-9-]+)", line)
if match:
chain_length = int(match.group(1))
thread_id = match.group(2)
traversal_logs.append({"thread_id": thread_id, "chain_length": chain_length, "log_line": line})
return traversal_logs
def run_test(self) -> bool:
"""Test conversation chain and threading functionality"""
try:

View File

@@ -12,7 +12,6 @@ Validates:
5. Proper tool chaining with context
"""
import subprocess
from .base_test import BaseSimulatorTest
@@ -28,40 +27,6 @@ class CrossToolComprehensiveTest(BaseSimulatorTest):
def test_description(self) -> str:
return "Comprehensive cross-tool file deduplication and continuation"
def get_docker_logs_since(self, since_time: str) -> str:
"""Get docker logs since a specific timestamp"""
try:
# Check both main server and log monitor for comprehensive logs
cmd_server = ["docker", "logs", "--since", since_time, self.container_name]
cmd_monitor = ["docker", "logs", "--since", since_time, "zen-mcp-log-monitor"]
result_server = subprocess.run(cmd_server, capture_output=True, text=True)
result_monitor = subprocess.run(cmd_monitor, capture_output=True, text=True)
# Get the internal log files which have more detailed logging
server_log_result = subprocess.run(
["docker", "exec", self.container_name, "cat", "/tmp/mcp_server.log"], capture_output=True, text=True
)
activity_log_result = subprocess.run(
["docker", "exec", self.container_name, "cat", "/tmp/mcp_activity.log"], capture_output=True, text=True
)
# Combine all logs
combined_logs = (
result_server.stdout
+ "\n"
+ result_monitor.stdout
+ "\n"
+ server_log_result.stdout
+ "\n"
+ activity_log_result.stdout
)
return combined_logs
except Exception as e:
self.logger.error(f"Failed to get docker logs: {e}")
return ""
def run_test(self) -> bool:
"""Comprehensive cross-tool test with all MCP tools"""
try:
@@ -247,7 +212,7 @@ def secure_login(user, pwd):
# Validate comprehensive results
self.logger.info(" 📋 Validating comprehensive cross-tool results...")
logs = self.get_docker_logs_since(start_time)
logs = self.get_server_logs_since(start_time)
# Validation criteria
tools_used = [r[0] for r in responses]

View File

@@ -6,10 +6,10 @@ Tests comprehensive cross-tool continuation scenarios to ensure
conversation context is maintained when switching between different tools.
"""
from .base_test import BaseSimulatorTest
from .conversation_base_test import ConversationBaseTest
class CrossToolContinuationTest(BaseSimulatorTest):
class CrossToolContinuationTest(ConversationBaseTest):
"""Test comprehensive cross-tool continuation scenarios"""
@property
@@ -25,8 +25,8 @@ class CrossToolContinuationTest(BaseSimulatorTest):
try:
self.logger.info("🔧 Test: Cross-tool continuation scenarios")
# Setup test files
self.setup_test_files()
# Setup test environment for conversation testing
self.setUp()
success_count = 0
total_scenarios = 3
@@ -62,7 +62,7 @@ class CrossToolContinuationTest(BaseSimulatorTest):
self.logger.info(" 1: Testing chat -> thinkdeep -> codereview")
# Start with chat
chat_response, chat_id = self.call_mcp_tool(
chat_response, chat_id = self.call_mcp_tool_direct(
"chat",
{
"prompt": "Please use low thinking mode. Look at this Python code and tell me what you think about it",
@@ -76,7 +76,7 @@ class CrossToolContinuationTest(BaseSimulatorTest):
return False
# Continue with thinkdeep
thinkdeep_response, _ = self.call_mcp_tool(
thinkdeep_response, _ = self.call_mcp_tool_direct(
"thinkdeep",
{
"prompt": "Please use low thinking mode. Think deeply about potential performance issues in this code",
@@ -91,7 +91,7 @@ class CrossToolContinuationTest(BaseSimulatorTest):
return False
# Continue with codereview
codereview_response, _ = self.call_mcp_tool(
codereview_response, _ = self.call_mcp_tool_direct(
"codereview",
{
"files": [self.test_files["python"]], # Same file should be deduplicated
@@ -118,8 +118,13 @@ class CrossToolContinuationTest(BaseSimulatorTest):
self.logger.info(" 2: Testing analyze -> debug -> thinkdeep")
# Start with analyze
analyze_response, analyze_id = self.call_mcp_tool(
"analyze", {"files": [self.test_files["python"]], "analysis_type": "code_quality", "model": "flash"}
analyze_response, analyze_id = self.call_mcp_tool_direct(
"analyze",
{
"files": [self.test_files["python"]],
"prompt": "Analyze this code for quality and performance issues",
"model": "flash",
},
)
if not analyze_response or not analyze_id:
@@ -127,7 +132,7 @@ class CrossToolContinuationTest(BaseSimulatorTest):
return False
# Continue with debug
debug_response, _ = self.call_mcp_tool(
debug_response, _ = self.call_mcp_tool_direct(
"debug",
{
"files": [self.test_files["python"]], # Same file should be deduplicated
@@ -142,7 +147,7 @@ class CrossToolContinuationTest(BaseSimulatorTest):
return False
# Continue with thinkdeep
final_response, _ = self.call_mcp_tool(
final_response, _ = self.call_mcp_tool_direct(
"thinkdeep",
{
"prompt": "Please use low thinking mode. Think deeply about the architectural implications of the issues we've found",
@@ -169,7 +174,7 @@ class CrossToolContinuationTest(BaseSimulatorTest):
self.logger.info(" 3: Testing multi-file cross-tool continuation")
# Start with both files
multi_response, multi_id = self.call_mcp_tool(
multi_response, multi_id = self.call_mcp_tool_direct(
"chat",
{
"prompt": "Please use low thinking mode. Analyze both the Python code and configuration file",
@@ -183,7 +188,7 @@ class CrossToolContinuationTest(BaseSimulatorTest):
return False
# Switch to codereview with same files (should use conversation history)
multi_review, _ = self.call_mcp_tool(
multi_review, _ = self.call_mcp_tool_direct(
"codereview",
{
"files": [self.test_files["python"], self.test_files["config"]], # Same files

View File

@@ -378,35 +378,28 @@ The code looks correct to me, but something is causing valid sessions to be trea
# Validate logs
self.logger.info(" 📋 Validating execution logs...")
# Get server logs from the actual log file inside the container
result = self.run_command(
["docker", "exec", self.container_name, "tail", "-500", "/tmp/mcp_server.log"], capture_output=True
)
# Get server logs using inherited method
logs = self.get_recent_server_logs(500)
if result.returncode == 0:
logs = result.stdout.decode() + result.stderr.decode()
# Look for debug tool execution patterns
debug_patterns = [
"debug tool",
"[DEBUG]",
"systematic investigation",
"Token budget",
"Essential files for debugging",
]
# Look for debug tool execution patterns
debug_patterns = [
"debug tool",
"[DEBUG]",
"systematic investigation",
"Token budget",
"Essential files for debugging",
]
patterns_found = 0
for pattern in debug_patterns:
if pattern in logs:
patterns_found += 1
self.logger.debug(f" ✅ Found log pattern: {pattern}")
patterns_found = 0
for pattern in debug_patterns:
if pattern in logs:
patterns_found += 1
self.logger.debug(f" ✅ Found log pattern: {pattern}")
if patterns_found >= 3:
self.logger.info(f" ✅ Log validation passed ({patterns_found}/{len(debug_patterns)} patterns)")
else:
self.logger.warning(f" ⚠️ Only found {patterns_found}/{len(debug_patterns)} log patterns")
if patterns_found >= 3:
self.logger.info(f" ✅ Log validation passed ({patterns_found}/{len(debug_patterns)} patterns)")
else:
self.logger.warning(" ⚠️ Could not retrieve Docker logs")
self.logger.warning(f" ⚠️ Only found {patterns_found}/{len(debug_patterns)} log patterns")
# Test continuation if available
if continuation_id:

View File

@@ -145,14 +145,16 @@ def validate_data(data):
# Test 4: Validate log patterns
self.logger.info(" 1.4: Validating line number processing in logs")
# Get logs from container
result = self.run_command(
["docker", "exec", self.container_name, "tail", "-500", "/tmp/mcp_server.log"], capture_output=True
)
logs = ""
if result.returncode == 0:
logs = result.stdout.decode()
# Get logs from server
try:
log_file_path = "logs/mcp_server.log"
with open(log_file_path) as f:
lines = f.readlines()
logs = "".join(lines[-500:])
except Exception as e:
self.logger.error(f"Failed to read server logs: {e}")
logs = ""
pass
# Check for line number formatting patterns
line_number_patterns = ["Line numbers for", "enabled", "", "line number"] # The line number separator

View File

@@ -1,8 +1,8 @@
#!/usr/bin/env python3
"""
Docker Logs Validation Test
Server Logs Validation Test
Validates Docker logs to confirm file deduplication behavior and
Validates server logs to confirm file deduplication behavior and
conversation threading is working properly.
"""
@@ -10,7 +10,7 @@ from .base_test import BaseSimulatorTest
class LogsValidationTest(BaseSimulatorTest):
"""Validate Docker logs to confirm file deduplication behavior"""
"""Validate server logs to confirm file deduplication behavior"""
@property
def test_name(self) -> str:
@@ -18,39 +18,35 @@ class LogsValidationTest(BaseSimulatorTest):
@property
def test_description(self) -> str:
return "Docker logs validation"
return "Server logs validation"
def run_test(self) -> bool:
"""Validate Docker logs to confirm file deduplication behavior"""
"""Validate server logs to confirm file deduplication behavior"""
try:
self.logger.info("📋 Test: Validating Docker logs for file deduplication...")
self.logger.info("📋 Test: Validating server logs for file deduplication...")
# Get server logs from main container
result = self.run_command(["docker", "logs", self.container_name], capture_output=True)
# Get server logs from log files
import os
if result.returncode != 0:
self.logger.error(f"Failed to get Docker logs: {result.stderr}")
logs = ""
log_files = ["logs/mcp_server.log", "logs/mcp_activity.log"]
for log_file in log_files:
if os.path.exists(log_file):
try:
with open(log_file) as f:
file_content = f.read()
logs += f"\n=== {log_file} ===\n{file_content}\n"
self.logger.debug(f"Read {len(file_content)} characters from {log_file}")
except Exception as e:
self.logger.warning(f"Could not read {log_file}: {e}")
else:
self.logger.warning(f"Log file not found: {log_file}")
if not logs.strip():
self.logger.warning("No log content found - server may not have processed any requests yet")
return False
main_logs = result.stdout.decode() + result.stderr.decode()
# Get logs from log monitor container (where detailed activity is logged)
monitor_result = self.run_command(["docker", "logs", "zen-mcp-log-monitor"], capture_output=True)
monitor_logs = ""
if monitor_result.returncode == 0:
monitor_logs = monitor_result.stdout.decode() + monitor_result.stderr.decode()
# Also get activity logs for more detailed conversation tracking
activity_result = self.run_command(
["docker", "exec", self.container_name, "cat", "/tmp/mcp_activity.log"], capture_output=True
)
activity_logs = ""
if activity_result.returncode == 0:
activity_logs = activity_result.stdout.decode()
logs = main_logs + "\n" + monitor_logs + "\n" + activity_logs
# Look for conversation threading patterns that indicate the system is working
conversation_patterns = [
"CONVERSATION_RESUME",

View File

@@ -4,11 +4,10 @@ O3 Model Selection Test
Tests that O3 models are properly selected and used when explicitly specified,
regardless of the default model configuration (even when set to auto).
Validates model selection via Docker logs.
Validates model selection via server logs.
"""
import datetime
import subprocess
from .base_test import BaseSimulatorTest
@@ -24,47 +23,16 @@ class O3ModelSelectionTest(BaseSimulatorTest):
def test_description(self) -> str:
return "O3 model selection and usage validation"
def get_recent_server_logs(self) -> str:
"""Get recent server logs from the log file directly"""
try:
# Read logs directly from the log file - use more lines to ensure we get all test-related logs
cmd = ["docker", "exec", self.container_name, "tail", "-n", "500", "/tmp/mcp_server.log"]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode == 0:
return result.stdout
else:
self.logger.warning(f"Failed to read server logs: {result.stderr}")
return ""
except Exception as e:
self.logger.error(f"Failed to get server logs: {e}")
return ""
def run_test(self) -> bool:
"""Test O3 model selection and usage"""
try:
self.logger.info(" Test: O3 model selection and usage validation")
# Check which API keys are configured
check_cmd = [
"docker",
"exec",
self.container_name,
"python",
"-c",
'import os; print(f\'OPENAI_KEY:{bool(os.environ.get("OPENAI_API_KEY"))}|OPENROUTER_KEY:{bool(os.environ.get("OPENROUTER_API_KEY"))}\')',
]
result = subprocess.run(check_cmd, capture_output=True, text=True)
import os
has_openai = False
has_openrouter = False
if result.returncode == 0:
output = result.stdout.strip()
if "OPENAI_KEY:True" in output:
has_openai = True
if "OPENROUTER_KEY:True" in output:
has_openrouter = True
has_openai = bool(os.environ.get("OPENAI_API_KEY"))
has_openrouter = bool(os.environ.get("OPENROUTER_API_KEY"))
# If only OpenRouter is configured, adjust test expectations
if has_openrouter and not has_openai:

View File

@@ -9,7 +9,6 @@ Tests custom API endpoint functionality with Ollama-style local models, includin
- Model alias resolution for local models
"""
import subprocess
from .base_test import BaseSimulatorTest
@@ -30,14 +29,15 @@ class OllamaCustomUrlTest(BaseSimulatorTest):
try:
self.logger.info("Test: Ollama custom URL functionality")
# Check if custom URL is configured in the Docker container
custom_url = self._check_docker_custom_url()
# Check if custom URL is configured
import os
custom_url = os.environ.get("CUSTOM_API_URL")
if not custom_url:
self.logger.warning("CUSTOM_API_URL not set in Docker container, skipping Ollama test")
self.logger.warning("CUSTOM_API_URL not set, skipping Ollama test")
self.logger.info("To enable this test, add to .env file:")
self.logger.info("CUSTOM_API_URL=http://host.docker.internal:11434/v1")
self.logger.info("CUSTOM_API_URL=http://localhost:11434/v1")
self.logger.info("CUSTOM_API_KEY=")
self.logger.info("Then restart docker-compose")
return True # Skip gracefully
self.logger.info(f"Testing with custom URL: {custom_url}")
@@ -172,25 +172,6 @@ if __name__ == "__main__":
finally:
self.cleanup_test_files()
def _check_docker_custom_url(self) -> str:
"""Check if CUSTOM_API_URL is set in the Docker container"""
try:
result = subprocess.run(
["docker", "exec", self.container_name, "printenv", "CUSTOM_API_URL"],
capture_output=True,
text=True,
timeout=10,
)
if result.returncode == 0 and result.stdout.strip():
return result.stdout.strip()
return ""
except Exception as e:
self.logger.debug(f"Failed to check Docker CUSTOM_API_URL: {e}")
return ""
def validate_successful_response(self, response: str, test_name: str, files_provided: bool = False) -> bool:
"""Validate that the response indicates success, not an error
@@ -201,7 +182,7 @@ if __name__ == "__main__":
"""
if not response:
self.logger.error(f"No response received for {test_name}")
self._check_docker_logs_for_errors()
self._check_server_logs_for_errors()
return False
# Check for common error indicators
@@ -227,7 +208,7 @@ if __name__ == "__main__":
]
# Special handling for clarification requests from local models
if "clarification_required" in response.lower():
if "files_required_to_continue" in response.lower():
if files_provided:
# If we provided actual files, clarification request is a FAILURE
self.logger.error(
@@ -243,7 +224,7 @@ if __name__ == "__main__":
self.logger.debug(f"Clarification response: {response[:200]}...")
return True
# Check for SSRF security restriction - this is expected for local URLs from Docker
# Check for SSRF security restriction - this is expected for local URLs
if "restricted IP address" in response and "security risk (SSRF)" in response:
self.logger.info(
f"✅ Custom URL routing working - {test_name} correctly attempted to connect to custom API"
@@ -256,19 +237,19 @@ if __name__ == "__main__":
if error.lower() in response_lower:
self.logger.error(f"Error detected in {test_name}: {error}")
self.logger.debug(f"Full response: {response}")
self._check_docker_logs_for_errors()
self._check_server_logs_for_errors()
return False
# Response should be substantial (more than just a few words)
if len(response.strip()) < 10:
self.logger.error(f"Response too short for {test_name}: {response}")
self._check_docker_logs_for_errors()
self._check_server_logs_for_errors()
return False
# Verify this looks like a real AI response, not just an error message
if not self._validate_ai_response_content(response):
self.logger.error(f"Response doesn't look like valid AI output for {test_name}")
self._check_docker_logs_for_errors()
self._check_server_logs_for_errors()
return False
self.logger.debug(f"Successful response for {test_name}: {response[:100]}...")
@@ -329,24 +310,23 @@ if __name__ == "__main__":
return True
def _check_docker_logs_for_errors(self):
"""Check Docker logs for any error messages that might explain failures"""
def _check_server_logs_for_errors(self):
"""Check server logs for any error messages that might explain failures"""
try:
# Get recent logs from the container
result = subprocess.run(
["docker", "logs", "--tail", "50", self.container_name], capture_output=True, text=True, timeout=10
)
# Get recent logs from the log file
log_file_path = "logs/mcp_server.log"
with open(log_file_path) as f:
lines = f.readlines()
recent_logs = lines[-50:] # Last 50 lines
if result.returncode == 0 and result.stderr:
recent_logs = result.stderr.strip()
if recent_logs:
self.logger.info("Recent container logs:")
for line in recent_logs.split("\n")[-10:]: # Last 10 lines
if line.strip():
self.logger.info(f" {line}")
if recent_logs:
self.logger.info("Recent server logs:")
for line in recent_logs[-10:]: # Last 10 lines
if line.strip():
self.logger.info(f" {line.strip()}")
except Exception as e:
self.logger.debug(f"Failed to check Docker logs: {e}")
self.logger.debug(f"Failed to check server logs: {e}")
def validate_local_model_response(self, response: str) -> bool:
"""Validate that response appears to come from a local model"""

View File

@@ -8,7 +8,6 @@ Tests that verify the system correctly falls back to OpenRouter when:
- Auto mode correctly selects OpenRouter models
"""
import subprocess
from .base_test import BaseSimulatorTest
@@ -24,53 +23,28 @@ class OpenRouterFallbackTest(BaseSimulatorTest):
def test_description(self) -> str:
return "OpenRouter fallback behavior when only provider"
def get_recent_server_logs(self) -> str:
"""Get recent server logs from the log file directly"""
try:
cmd = ["docker", "exec", self.container_name, "tail", "-n", "300", "/tmp/mcp_server.log"]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode == 0:
return result.stdout
else:
self.logger.warning(f"Failed to read server logs: {result.stderr}")
return ""
except Exception as e:
self.logger.error(f"Failed to get server logs: {e}")
return ""
def run_test(self) -> bool:
"""Test OpenRouter fallback behavior"""
try:
self.logger.info("Test: OpenRouter fallback behavior when only provider available")
# Check if ONLY OpenRouter API key is configured (this is a fallback test)
check_cmd = [
"docker",
"exec",
self.container_name,
"python",
"-c",
'import os; print("OPENROUTER_KEY:" + str(bool(os.environ.get("OPENROUTER_API_KEY"))) + "|GEMINI_KEY:" + str(bool(os.environ.get("GEMINI_API_KEY"))) + "|OPENAI_KEY:" + str(bool(os.environ.get("OPENAI_API_KEY"))))',
]
result = subprocess.run(check_cmd, capture_output=True, text=True)
import os
if result.returncode == 0:
output = result.stdout.strip()
has_openrouter = "OPENROUTER_KEY:True" in output
has_gemini = "GEMINI_KEY:True" in output
has_openai = "OPENAI_KEY:True" in output
has_openrouter = bool(os.environ.get("OPENROUTER_API_KEY"))
has_gemini = bool(os.environ.get("GEMINI_API_KEY"))
has_openai = bool(os.environ.get("OPENAI_API_KEY"))
if not has_openrouter:
self.logger.info(" ⚠️ OpenRouter API key not configured - skipping test")
self.logger.info(" This test requires OPENROUTER_API_KEY to be set in .env")
return True # Return True to indicate test is skipped, not failed
if not has_openrouter:
self.logger.info(" ⚠️ OpenRouter API key not configured - skipping test")
self.logger.info(" This test requires OPENROUTER_API_KEY to be set in .env")
return True # Return True to indicate test is skipped, not failed
if has_gemini or has_openai:
self.logger.info(" ⚠️ Other API keys configured - this is not a fallback scenario")
self.logger.info(" This test requires ONLY OpenRouter to be configured (no Gemini/OpenAI keys)")
self.logger.info(" Current setup has multiple providers, so fallback behavior doesn't apply")
return True # Return True to indicate test is skipped, not failed
if has_gemini or has_openai:
self.logger.info(" ⚠️ Other API keys configured - this is not a fallback scenario")
self.logger.info(" This test requires ONLY OpenRouter to be configured (no Gemini/OpenAI keys)")
self.logger.info(" Current setup has multiple providers, so fallback behavior doesn't apply")
return True # Return True to indicate test is skipped, not failed
# Setup test files
self.setup_test_files()

View File

@@ -9,7 +9,6 @@ Tests that verify OpenRouter functionality including:
- Error handling when models are not available
"""
import subprocess
from .base_test import BaseSimulatorTest
@@ -25,39 +24,17 @@ class OpenRouterModelsTest(BaseSimulatorTest):
def test_description(self) -> str:
return "OpenRouter model functionality and alias mapping"
def get_recent_server_logs(self) -> str:
"""Get recent server logs from the log file directly"""
try:
# Read logs directly from the log file
cmd = ["docker", "exec", self.container_name, "tail", "-n", "500", "/tmp/mcp_server.log"]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode == 0:
return result.stdout
else:
self.logger.warning(f"Failed to read server logs: {result.stderr}")
return ""
except Exception as e:
self.logger.error(f"Failed to get server logs: {e}")
return ""
def run_test(self) -> bool:
"""Test OpenRouter model functionality"""
try:
self.logger.info("Test: OpenRouter model functionality and alias mapping")
# Check if OpenRouter API key is configured
check_cmd = [
"docker",
"exec",
self.container_name,
"python",
"-c",
'import os; print("OPENROUTER_KEY:" + str(bool(os.environ.get("OPENROUTER_API_KEY"))))',
]
result = subprocess.run(check_cmd, capture_output=True, text=True)
import os
if result.returncode == 0 and "OPENROUTER_KEY:False" in result.stdout:
has_openrouter = bool(os.environ.get("OPENROUTER_API_KEY"))
if not has_openrouter:
self.logger.info(" ⚠️ OpenRouter API key not configured - skipping test")
self.logger.info(" This test requires OPENROUTER_API_KEY to be set in .env")
return True # Return True to indicate test is skipped, not failed

View File

@@ -8,16 +8,15 @@ Validates that:
1. Files are embedded only once in conversation history
2. Continuation calls don't re-read existing files
3. New files are still properly embedded
4. Docker logs show deduplication behavior
4. Server logs show deduplication behavior
"""
import os
import subprocess
from .base_test import BaseSimulatorTest
from .conversation_base_test import ConversationBaseTest
class PerToolDeduplicationTest(BaseSimulatorTest):
class PerToolDeduplicationTest(ConversationBaseTest):
"""Test file deduplication for each individual tool"""
@property
@@ -28,74 +27,16 @@ class PerToolDeduplicationTest(BaseSimulatorTest):
def test_description(self) -> str:
return "File deduplication for individual tools"
def get_docker_logs_since(self, since_time: str) -> str:
"""Get docker logs since a specific timestamp"""
try:
# Check both main server and log monitor for comprehensive logs
cmd_server = ["docker", "logs", "--since", since_time, self.container_name]
cmd_monitor = ["docker", "logs", "--since", since_time, "zen-mcp-log-monitor"]
result_server = subprocess.run(cmd_server, capture_output=True, text=True)
result_monitor = subprocess.run(cmd_monitor, capture_output=True, text=True)
# Get the internal log files which have more detailed logging
server_log_result = subprocess.run(
["docker", "exec", self.container_name, "cat", "/tmp/mcp_server.log"], capture_output=True, text=True
)
activity_log_result = subprocess.run(
["docker", "exec", self.container_name, "cat", "/tmp/mcp_activity.log"], capture_output=True, text=True
)
# Combine all logs
combined_logs = (
result_server.stdout
+ "\n"
+ result_monitor.stdout
+ "\n"
+ server_log_result.stdout
+ "\n"
+ activity_log_result.stdout
)
return combined_logs
except Exception as e:
self.logger.error(f"Failed to get docker logs: {e}")
return ""
# create_additional_test_file method now inherited from base class
def validate_file_deduplication_in_logs(self, logs: str, tool_name: str, test_file: str) -> bool:
"""Validate that logs show file deduplication behavior"""
# Look for file embedding messages
embedding_messages = [
line for line in logs.split("\n") if "📁" in line and "embedding" in line and tool_name in line
]
# Look for deduplication/filtering messages
filtering_messages = [
line for line in logs.split("\n") if "📁" in line and "Filtering" in line and tool_name in line
]
skipping_messages = [
line for line in logs.split("\n") if "📁" in line and "skipping" in line and tool_name in line
]
deduplication_found = len(filtering_messages) > 0 or len(skipping_messages) > 0
if deduplication_found:
self.logger.info(f"{tool_name}: Found deduplication evidence in logs")
for msg in filtering_messages + skipping_messages:
self.logger.debug(f" 📁 {msg.strip()}")
else:
self.logger.warning(f" ⚠️ {tool_name}: No deduplication evidence found in logs")
self.logger.debug(f" 📁 All embedding messages: {embedding_messages}")
return deduplication_found
def run_test(self) -> bool:
"""Test file deduplication with realistic precommit/codereview workflow"""
try:
self.logger.info("📄 Test: Simplified file deduplication with precommit/codereview workflow")
# Setup test environment for conversation testing
self.setUp()
# Setup test files
self.setup_test_files()
@@ -126,7 +67,7 @@ def divide(x, y):
"model": "flash",
}
response1, continuation_id = self.call_mcp_tool("precommit", precommit_params)
response1, continuation_id = self.call_mcp_tool_direct("precommit", precommit_params)
if not response1:
self.logger.error(" ❌ Step 1: precommit tool failed")
return False
@@ -151,7 +92,7 @@ def divide(x, y):
"model": "flash",
}
response2, _ = self.call_mcp_tool("codereview", codereview_params)
response2, _ = self.call_mcp_tool_direct("codereview", codereview_params)
if not response2:
self.logger.error(" ❌ Step 2: codereview tool failed")
return False
@@ -181,16 +122,16 @@ def subtract(a, b):
"model": "flash",
}
response3, _ = self.call_mcp_tool("precommit", continue_params)
response3, _ = self.call_mcp_tool_direct("precommit", continue_params)
if not response3:
self.logger.error(" ❌ Step 3: precommit continuation failed")
return False
self.logger.info(" ✅ Step 3: precommit continuation completed")
# Validate results in docker logs
# Validate results in server logs
self.logger.info(" 📋 Validating conversation history and file deduplication...")
logs = self.get_docker_logs_since(start_time)
logs = self.get_server_logs_since(start_time)
# Check for conversation history building
conversation_logs = [
@@ -249,7 +190,7 @@ def subtract(a, b):
return True
else:
self.logger.warning(" ⚠️ File deduplication workflow test: FAILED")
self.logger.warning(" 💡 Check docker logs for detailed file embedding and continuation activity")
self.logger.warning(" 💡 Check server logs for detailed file embedding and continuation activity")
return False
except Exception as e:

View File

@@ -244,7 +244,7 @@ class PlannerContinuationHistoryTest(BaseSimulatorTest):
response2, _ = self.call_mcp_tool(
"planner",
{
"step": "Deployment strategy: Use Kubernetes for container orchestration with Helm charts. Implement CI/CD pipeline with GitOps. Use service mesh (Istio) for traffic management, monitoring, and security. Deploy databases in separate namespaces with backup automation.",
"step": "Deployment strategy: Use Kubernetes for orchestration with Helm charts. Implement CI/CD pipeline with GitOps. Use service mesh (Istio) for traffic management, monitoring, and security. Deploy databases in separate namespaces with backup automation.",
"step_number": 2,
"total_steps": 2,
"next_step_required": False, # Complete the session
@@ -326,7 +326,7 @@ class PlannerContinuationHistoryTest(BaseSimulatorTest):
return False
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
"""Call an MCP tool via Claude CLI (docker exec) - override for planner-specific response handling"""
"""Call an MCP tool via standalone server - override for planner-specific response handling"""
# Use parent implementation to get the raw response
response_text, _ = super().call_mcp_tool(tool_name, params)

View File

@@ -275,7 +275,7 @@ class PlannerValidationTest(BaseSimulatorTest):
response3, _ = self.call_mcp_tool(
"planner",
{
"step": "Revision: Actually, let me revise the Kubernetes approach. I'll use a simpler Docker Swarm deployment initially, then migrate to Kubernetes later.",
"step": "Revision: Actually, let me revise the Kubernetes approach. I'll use a simpler deployment initially, then migrate to Kubernetes later.",
"step_number": 3,
"total_steps": 4,
"next_step_required": True,
@@ -311,7 +311,7 @@ class PlannerValidationTest(BaseSimulatorTest):
return False
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
"""Call an MCP tool via Claude CLI (docker exec) - override for planner-specific response handling"""
"""Call an MCP tool via standalone server - override for planner-specific response handling"""
# Use parent implementation to get the raw response
response_text, _ = super().call_mcp_tool(tool_name, params)

View File

@@ -1,139 +0,0 @@
#!/usr/bin/env python3
"""
Redis Conversation Memory Validation Test
Validates that conversation memory is working via Redis by checking
for stored conversation threads and their content.
"""
import json
from .base_test import BaseSimulatorTest
class RedisValidationTest(BaseSimulatorTest):
"""Validate that conversation memory is working via Redis"""
@property
def test_name(self) -> str:
return "redis_validation"
@property
def test_description(self) -> str:
return "Redis conversation memory validation"
def run_test(self) -> bool:
"""Validate that conversation memory is working via Redis"""
try:
self.logger.info("💾 Test: Validating conversation memory via Redis...")
# First, test Redis connectivity
ping_result = self.run_command(
["docker", "exec", self.redis_container, "redis-cli", "ping"], capture_output=True
)
if ping_result.returncode != 0:
self.logger.error("Failed to connect to Redis")
return False
if "PONG" not in ping_result.stdout.decode():
self.logger.error("Redis ping failed")
return False
self.logger.info("✅ Redis connectivity confirmed")
# Check Redis for stored conversations
result = self.run_command(
["docker", "exec", self.redis_container, "redis-cli", "KEYS", "thread:*"], capture_output=True
)
if result.returncode != 0:
self.logger.error("Failed to query Redis")
return False
keys = result.stdout.decode().strip().split("\n")
thread_keys = [k for k in keys if k.startswith("thread:") and k != "thread:*"]
if thread_keys:
self.logger.info(f"✅ Found {len(thread_keys)} conversation threads in Redis")
# Get details of first thread
thread_key = thread_keys[0]
result = self.run_command(
["docker", "exec", self.redis_container, "redis-cli", "GET", thread_key], capture_output=True
)
if result.returncode == 0:
thread_data = result.stdout.decode()
try:
parsed = json.loads(thread_data)
turns = parsed.get("turns", [])
self.logger.info(f"✅ Thread has {len(turns)} turns")
return True
except json.JSONDecodeError:
self.logger.warning("Could not parse thread data")
return True
else:
# If no existing threads, create a test thread to validate Redis functionality
self.logger.info(" No existing threads found, creating test thread to validate Redis...")
test_thread_id = "test_thread_validation"
test_data = {
"thread_id": test_thread_id,
"turns": [
{"tool": "chat", "timestamp": "2025-06-11T16:30:00Z", "prompt": "Test validation prompt"}
],
}
# Store test data
store_result = self.run_command(
[
"docker",
"exec",
self.redis_container,
"redis-cli",
"SET",
f"thread:{test_thread_id}",
json.dumps(test_data),
],
capture_output=True,
)
if store_result.returncode != 0:
self.logger.error("Failed to store test data in Redis")
return False
# Retrieve test data
retrieve_result = self.run_command(
["docker", "exec", self.redis_container, "redis-cli", "GET", f"thread:{test_thread_id}"],
capture_output=True,
)
if retrieve_result.returncode != 0:
self.logger.error("Failed to retrieve test data from Redis")
return False
retrieved_data = retrieve_result.stdout.decode()
try:
parsed = json.loads(retrieved_data)
if parsed.get("thread_id") == test_thread_id:
self.logger.info("✅ Redis read/write validation successful")
# Clean up test data
self.run_command(
["docker", "exec", self.redis_container, "redis-cli", "DEL", f"thread:{test_thread_id}"],
capture_output=True,
)
return True
else:
self.logger.error("Retrieved data doesn't match stored data")
return False
except json.JSONDecodeError:
self.logger.error("Could not parse retrieved test data")
return False
except Exception as e:
self.logger.error(f"Conversation memory validation failed: {e}")
return False

View File

@@ -241,35 +241,28 @@ def handle_everything(user_input, config, database):
# Validate logs
self.logger.info(" 📋 Validating execution logs...")
# Get server logs from the actual log file inside the container
result = self.run_command(
["docker", "exec", self.container_name, "tail", "-500", "/tmp/mcp_server.log"], capture_output=True
)
# Get server logs using inherited method
logs = self.get_recent_server_logs(500)
if result.returncode == 0:
logs = result.stdout.decode() + result.stderr.decode()
# Look for refactor tool execution patterns
refactor_patterns = [
"[REFACTOR]",
"refactor tool",
"codesmells",
"Token budget",
"Code files embedded successfully",
]
# Look for refactor tool execution patterns
refactor_patterns = [
"[REFACTOR]",
"refactor tool",
"codesmells",
"Token budget",
"Code files embedded successfully",
]
patterns_found = 0
for pattern in refactor_patterns:
if pattern in logs:
patterns_found += 1
self.logger.debug(f" ✅ Found log pattern: {pattern}")
patterns_found = 0
for pattern in refactor_patterns:
if pattern in logs:
patterns_found += 1
self.logger.debug(f" ✅ Found log pattern: {pattern}")
if patterns_found >= 3:
self.logger.info(f" ✅ Log validation passed ({patterns_found}/{len(refactor_patterns)} patterns)")
else:
self.logger.warning(f" ⚠️ Only found {patterns_found}/{len(refactor_patterns)} log patterns")
if patterns_found >= 3:
self.logger.info(f" ✅ Log validation passed ({patterns_found}/{len(refactor_patterns)} patterns)")
else:
self.logger.warning(" ⚠️ Could not retrieve Docker logs")
self.logger.warning(f" ⚠️ Only found {patterns_found}/{len(refactor_patterns)} log patterns")
self.logger.info(" ✅ Refactor tool validation completed successfully")
return True

View File

@@ -11,7 +11,6 @@ This test validates that:
import datetime
import re
import subprocess
from .base_test import BaseSimulatorTest
@@ -27,78 +26,6 @@ class TokenAllocationValidationTest(BaseSimulatorTest):
def test_description(self) -> str:
return "Token allocation and conversation history validation"
def get_recent_server_logs(self) -> str:
"""Get recent server logs from the log file directly"""
try:
cmd = ["docker", "exec", self.container_name, "tail", "-n", "300", "/tmp/mcp_server.log"]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode == 0:
return result.stdout
else:
self.logger.warning(f"Failed to read server logs: {result.stderr}")
return ""
except Exception as e:
self.logger.error(f"Failed to get server logs: {e}")
return ""
def extract_conversation_usage_logs(self, logs: str) -> list[dict[str, int]]:
    """Extract actual conversation token usage from server logs.

    Scans for "[CONVERSATION_DEBUG] Token budget calculation:" markers and
    parses the comma-formatted integers from the seven lines that follow
    each marker.

    Args:
        logs: Raw server log text (newline-separated).

    Returns:
        One dict per budget-calculation block found, with any of the keys
        "total_capacity", "content_allocation", "conversation_tokens",
        "remaining_tokens" that could be parsed. Blocks with no parseable
        fields are dropped.
    """
    # Table-driven replacement for the old copy-pasted if/elif branches;
    # patterns are compiled once instead of per detail line. Order matters:
    # the first pattern that matches a line wins, mirroring the old chain.
    field_patterns = {
        "total_capacity": re.compile(r"Total capacity:\s*([\d,]+)"),
        "content_allocation": re.compile(r"Content allocation:\s*([\d,]+)"),
        "conversation_tokens": re.compile(r"Conversation tokens:\s*([\d,]+)"),
        "remaining_tokens": re.compile(r"Remaining tokens:\s*([\d,]+)"),
    }
    usage_logs = []
    lines = logs.split("\n")
    for i, line in enumerate(lines):
        if "[CONVERSATION_DEBUG] Token budget calculation:" not in line:
            continue
        # Found start of a token budget log; the next 7 lines carry the
        # usage details (slice is safely clipped at end-of-log).
        usage: dict[str, int] = {}
        for detail_line in lines[i + 1 : i + 8]:
            for key, pattern in field_patterns.items():
                match = pattern.search(detail_line)
                if match:
                    # Values are formatted like "1,048,576" — strip commas.
                    usage[key] = int(match.group(1).replace(",", ""))
                    break
        if usage:  # Only add if we found some usage data
            usage_logs.append(usage)
    return usage_logs
def extract_conversation_token_usage(self, logs: str) -> list[int]:
    """Extract conversation token usage from logs"""
    # Pull every "Conversation history token usage: N" value, stripping the
    # thousands separators before converting to int.
    token_pattern = r"Conversation history token usage:\s*([\d,]+)"
    return [int(raw.replace(",", "")) for raw in re.findall(token_pattern, logs)]
def run_test(self) -> bool:
"""Test token allocation and conversation history functionality"""
try:

View File

@@ -81,7 +81,7 @@ class VisionCapabilityTest(BaseSimulatorTest):
"don't have access",
"cannot see",
"no image",
"clarification_required",
"files_required_to_continue",
"image you're referring to",
"supply the image",
"error",
@@ -122,7 +122,7 @@ class VisionCapabilityTest(BaseSimulatorTest):
"don't have access",
"cannot see",
"no image",
"clarification_required",
"files_required_to_continue",
"image you're referring to",
"supply the image",
"error",

View File

@@ -9,7 +9,6 @@ Tests that verify X.AI GROK functionality including:
- API integration and response validation
"""
import subprocess
from .base_test import BaseSimulatorTest
@@ -25,44 +24,18 @@ class XAIModelsTest(BaseSimulatorTest):
def test_description(self) -> str:
return "X.AI GROK model functionality and integration"
def get_recent_server_logs(self) -> str:
    """Get recent server logs from the log file directly"""
    # Read logs directly from the log file; any failure yields "" so the
    # calling test can degrade gracefully instead of crashing.
    log_tail_cmd = [
        "docker", "exec", self.container_name,
        "tail", "-n", "500", "/tmp/mcp_server.log",
    ]
    try:
        proc = subprocess.run(log_tail_cmd, capture_output=True, text=True)
        if proc.returncode == 0:
            return proc.stdout
        self.logger.warning(f"Failed to read server logs: {proc.stderr}")
    except Exception as e:
        self.logger.error(f"Failed to get server logs: {e}")
    return ""
def run_test(self) -> bool:
"""Test X.AI GROK model functionality"""
try:
self.logger.info("Test: X.AI GROK model functionality and integration")
# Check if X.AI API key is configured and not empty
check_cmd = [
"docker",
"exec",
self.container_name,
"python",
"-c",
"""
import os
xai_key = os.environ.get("XAI_API_KEY", "")
is_valid = bool(xai_key and xai_key != "your_xai_api_key_here" and xai_key.strip())
print(f"XAI_KEY_VALID:{is_valid}")
""".strip(),
]
result = subprocess.run(check_cmd, capture_output=True, text=True)
import os
if result.returncode == 0 and "XAI_KEY_VALID:False" in result.stdout:
xai_key = os.environ.get("XAI_API_KEY", "")
is_valid = bool(xai_key and xai_key != "your_xai_api_key_here" and xai_key.strip())
if not is_valid:
self.logger.info(" ⚠️ X.AI API key not configured or empty - skipping test")
self.logger.info(" This test requires XAI_API_KEY to be set in .env with a valid key")
return True # Return True to indicate test is skipped, not failed