Rebranding, refactoring, renaming, cleanup, updated docs

2025-06-12 10:40:43 +04:00
parent 9a55ca8898
commit fb66825bf6
55 changed files with 1048 additions and 1474 deletions
--- a/simulator_tests/init.py
+++ b/simulator_tests/init.py
@@ -1,13 +1,14 @@
 """
 Communication Simulator Tests Package

-This package contains individual test modules for the Gemini MCP Communication Simulator.
+This package contains individual test modules for the Zen MCP Communication Simulator.
 Each test is in its own file for better organization and maintainability.
 """

 from .base_test import BaseSimulatorTest
 from .test_basic_conversation import BasicConversationTest
 from .test_content_validation import ContentValidationTest
+from .test_conversation_chain_validation import ConversationChainValidationTest
 from .test_cross_tool_comprehensive import CrossToolComprehensiveTest
 from .test_cross_tool_continuation import CrossToolContinuationTest
 from .test_logs_validation import LogsValidationTest
@@ -16,7 +17,6 @@ from .test_o3_model_selection import O3ModelSelectionTest
 from .test_per_tool_deduplication import PerToolDeduplicationTest
 from .test_redis_validation import RedisValidationTest
 from .test_token_allocation_validation import TokenAllocationValidationTest
-from .test_conversation_chain_validation import ConversationChainValidationTest

 # Test registry for dynamic loading
 TEST_REGISTRY = {
--- a/simulator_tests/base_test.py
+++ b/simulator_tests/base_test.py
@@ -19,8 +19,8 @@ class BaseSimulatorTest:
        self.verbose = verbose
        self.test_files = {}
        self.test_dir = None
-        self.container_name = "gemini-mcp-server"
-        self.redis_container = "gemini-mcp-redis"
+        self.container_name = "zen-mcp-server"
+        self.redis_container = "zen-mcp-redis"

        # Configure logging
        log_level = logging.DEBUG if verbose else logging.INFO
--- a/simulator_tests/test_content_validation.py
+++ b/simulator_tests/test_content_validation.py
@@ -6,7 +6,6 @@ Tests that tools don't duplicate file content in their responses.
 This test is specifically designed to catch content duplication bugs.
 """

-import json
 import os

 from .base_test import BaseSimulatorTest
@@ -31,6 +30,7 @@ class ContentValidationTest(BaseSimulatorTest):
            cmd_monitor = ["docker", "logs", "--since", since_time, "gemini-mcp-log-monitor"]

            import subprocess
+
            result_server = subprocess.run(cmd_server, capture_output=True, text=True)
            result_monitor = subprocess.run(cmd_monitor, capture_output=True, text=True)

@@ -76,6 +76,7 @@ DATABASE_CONFIG = {

            # Get timestamp for log filtering
            import datetime
+
            start_time = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")

            # Test 1: Initial tool call with validation file
@@ -139,26 +140,25 @@ DATABASE_CONFIG = {

            # Check for proper file embedding logs
            embedding_logs = [
-                line for line in logs.split("\n")
-                if "📁" in line or "embedding" in line.lower() or "[FILES]" in line
+                line for line in logs.split("\n") if "📁" in line or "embedding" in line.lower() or "[FILES]" in line
            ]

            # Check for deduplication evidence
            deduplication_logs = [
-                line for line in logs.split("\n")
+                line
+                for line in logs.split("\n")
                if "skipping" in line.lower() and "already in conversation" in line.lower()
            ]

            # Check for file processing patterns
            new_file_logs = [
-                line for line in logs.split("\n")
-                if "all 1 files are new" in line or "New conversation" in line
+                line for line in logs.split("\n") if "all 1 files are new" in line or "New conversation" in line
            ]

            # Validation criteria
            validation_file_mentioned = any("validation_config.py" in line for line in logs.split("\n"))
            embedding_found = len(embedding_logs) > 0
-            proper_deduplication = len(deduplication_logs) > 0 or len(new_file_logs) >= 2  # Should see new conversation patterns
+            # NOTE: deduplication evidence is only logged below; it is not one of the success criteria

            self.logger.info(f"  📊 Embedding logs found: {len(embedding_logs)}")
            self.logger.info(f"  📊 Deduplication evidence: {len(deduplication_logs)}")
@@ -175,7 +175,7 @@ DATABASE_CONFIG = {
            success_criteria = [
                ("Embedding logs found", embedding_found),
                ("File processing evidence", validation_file_mentioned),
-                ("Multiple tool calls", len(new_file_logs) >= 2)
+                ("Multiple tool calls", len(new_file_logs) >= 2),
            ]

            passed_criteria = sum(1 for _, passed in success_criteria if passed)
--- a/simulator_tests/test_conversation_chain_validation.py
+++ b/simulator_tests/test_conversation_chain_validation.py
@@ -4,14 +4,14 @@ Conversation Chain and Threading Validation Test

 This test validates that:
 1. Multiple tool invocations create proper parent->parent->parent chains
-2. New conversations can be started independently 
+2. New conversations can be started independently
 3. Original conversation chains can be resumed from any point
 4. History traversal works correctly for all scenarios
 5. Thread relationships are properly maintained in Redis

 Test Flow:
 Chain A: chat -> analyze -> debug (3 linked threads)
-Chain B: chat -> analyze (2 linked threads, independent)  
+Chain B: chat -> analyze (2 linked threads, independent)
 Chain A Branch: debug (continue from original chat, creating branch)

 This validates the conversation threading system's ability to:
@@ -21,10 +21,8 @@ This validates the conversation threading system's ability to:
 - Properly traverse parent relationships for history reconstruction
 """

-import datetime
-import subprocess
 import re
-from typing import Dict, List, Tuple, Optional
+import subprocess

 from .base_test import BaseSimulatorTest

@@ -45,7 +43,7 @@ class ConversationChainValidationTest(BaseSimulatorTest):
        try:
            cmd = ["docker", "exec", self.container_name, "tail", "-n", "500", "/tmp/mcp_server.log"]
            result = subprocess.run(cmd, capture_output=True, text=True)
-            
+
            if result.returncode == 0:
                return result.stdout
            else:
@@ -55,44 +53,36 @@ class ConversationChainValidationTest(BaseSimulatorTest):
            self.logger.error(f"Failed to get server logs: {e}")
            return ""

-    def extract_thread_creation_logs(self, logs: str) -> List[Dict[str, str]]:
+    def extract_thread_creation_logs(self, logs: str) -> list[dict[str, str]]:
        """Extract thread creation logs with parent relationships"""
        thread_logs = []
-        
-        lines = logs.split('\n')
+
+        lines = logs.split("\n")
        for line in lines:
            if "[THREAD] Created new thread" in line:
                # Parse: [THREAD] Created new thread 9dc779eb-645f-4850-9659-34c0e6978d73 with parent a0ce754d-c995-4b3e-9103-88af429455aa
-                match = re.search(r'\[THREAD\] Created new thread ([a-f0-9-]+) with parent ([a-f0-9-]+|None)', line)
+                match = re.search(r"\[THREAD\] Created new thread ([a-f0-9-]+) with parent ([a-f0-9-]+|None)", line)
                if match:
                    thread_id = match.group(1)
                    parent_id = match.group(2) if match.group(2) != "None" else None
-                    thread_logs.append({
-                        "thread_id": thread_id,
-                        "parent_id": parent_id,
-                        "log_line": line
-                    })
-        
+                    thread_logs.append({"thread_id": thread_id, "parent_id": parent_id, "log_line": line})
+
        return thread_logs

-    def extract_history_traversal_logs(self, logs: str) -> List[Dict[str, str]]:
+    def extract_history_traversal_logs(self, logs: str) -> list[dict[str, str]]:
        """Extract conversation history traversal logs"""
        traversal_logs = []
-        
-        lines = logs.split('\n')
+
+        lines = logs.split("\n")
        for line in lines:
            if "[THREAD] Retrieved chain of" in line:
                # Parse: [THREAD] Retrieved chain of 3 threads for 9dc779eb-645f-4850-9659-34c0e6978d73
-                match = re.search(r'\[THREAD\] Retrieved chain of (\d+) threads for ([a-f0-9-]+)', line)
+                match = re.search(r"\[THREAD\] Retrieved chain of (\d+) threads for ([a-f0-9-]+)", line)
                if match:
                    chain_length = int(match.group(1))
                    thread_id = match.group(2)
-                    traversal_logs.append({
-                        "thread_id": thread_id,
-                        "chain_length": chain_length,
-                        "log_line": line
-                    })
-        
+                    traversal_logs.append({"thread_id": thread_id, "chain_length": chain_length, "log_line": line})
+
        return traversal_logs

    def run_test(self) -> bool:
@@ -113,16 +103,16 @@ class TestClass:
        return "Method in test class"
 """
            test_file_path = self.create_additional_test_file("chain_test.py", test_file_content)
-            
+
            # Track all continuation IDs and their relationships
            conversation_chains = {}
-            
+
            # === CHAIN A: Build linear conversation chain ===
            self.logger.info("  🔗 Chain A: Building linear conversation chain")
-            
+
            # Step A1: Start with chat tool (creates thread_id_1)
            self.logger.info("    Step A1: Chat tool - start new conversation")
-            
+
            response_a1, continuation_id_a1 = self.call_mcp_tool(
                "chat",
                {
@@ -138,11 +128,11 @@ class TestClass:
                return False

            self.logger.info(f"    ✅ Step A1 completed - thread_id: {continuation_id_a1[:8]}...")
-            conversation_chains['A1'] = continuation_id_a1
+            conversation_chains["A1"] = continuation_id_a1

            # Step A2: Continue with analyze tool (creates thread_id_2 with parent=thread_id_1)
            self.logger.info("    Step A2: Analyze tool - continue Chain A")
-            
+
            response_a2, continuation_id_a2 = self.call_mcp_tool(
                "analyze",
                {
@@ -159,11 +149,11 @@ class TestClass:
                return False

            self.logger.info(f"    ✅ Step A2 completed - thread_id: {continuation_id_a2[:8]}...")
-            conversation_chains['A2'] = continuation_id_a2
+            conversation_chains["A2"] = continuation_id_a2

-            # Step A3: Continue with debug tool (creates thread_id_3 with parent=thread_id_2)  
+            # Step A3: Continue with debug tool (creates thread_id_3 with parent=thread_id_2)
            self.logger.info("    Step A3: Debug tool - continue Chain A")
-            
+
            response_a3, continuation_id_a3 = self.call_mcp_tool(
                "debug",
                {
@@ -180,14 +170,14 @@ class TestClass:
                return False

            self.logger.info(f"    ✅ Step A3 completed - thread_id: {continuation_id_a3[:8]}...")
-            conversation_chains['A3'] = continuation_id_a3
+            conversation_chains["A3"] = continuation_id_a3

            # === CHAIN B: Start independent conversation ===
            self.logger.info("  🔗 Chain B: Starting independent conversation")
-            
+
            # Step B1: Start new chat conversation (creates thread_id_4, no parent)
            self.logger.info("    Step B1: Chat tool - start NEW independent conversation")
-            
+
            response_b1, continuation_id_b1 = self.call_mcp_tool(
                "chat",
                {
@@ -202,11 +192,11 @@ class TestClass:
                return False

            self.logger.info(f"    ✅ Step B1 completed - thread_id: {continuation_id_b1[:8]}...")
-            conversation_chains['B1'] = continuation_id_b1
+            conversation_chains["B1"] = continuation_id_b1

            # Step B2: Continue the new conversation (creates thread_id_5 with parent=thread_id_4)
            self.logger.info("    Step B2: Analyze tool - continue Chain B")
-            
+
            response_b2, continuation_id_b2 = self.call_mcp_tool(
                "analyze",
                {
@@ -222,14 +212,14 @@ class TestClass:
                return False

            self.logger.info(f"    ✅ Step B2 completed - thread_id: {continuation_id_b2[:8]}...")
-            conversation_chains['B2'] = continuation_id_b2
+            conversation_chains["B2"] = continuation_id_b2

            # === CHAIN A BRANCH: Go back to original conversation ===
            self.logger.info("  🔗 Chain A Branch: Resume original conversation from A1")
-            
+
            # Step A1-Branch: Use original continuation_id_a1 to branch (creates thread_id_6 with parent=thread_id_1)
            self.logger.info("    Step A1-Branch: Debug tool - branch from original Chain A")
-            
+
            response_a1_branch, continuation_id_a1_branch = self.call_mcp_tool(
                "debug",
                {
@@ -246,73 +236,79 @@ class TestClass:
                return False

            self.logger.info(f"    ✅ Step A1-Branch completed - thread_id: {continuation_id_a1_branch[:8]}...")
-            conversation_chains['A1_Branch'] = continuation_id_a1_branch
+            conversation_chains["A1_Branch"] = continuation_id_a1_branch

            # === ANALYSIS: Validate thread relationships and history traversal ===
            self.logger.info("  📊 Analyzing conversation chain structure...")
-            
+
            # Get logs and extract thread relationships
            logs = self.get_recent_server_logs()
            thread_creation_logs = self.extract_thread_creation_logs(logs)
            history_traversal_logs = self.extract_history_traversal_logs(logs)
-            
+
            self.logger.info(f"    Found {len(thread_creation_logs)} thread creation logs")
            self.logger.info(f"    Found {len(history_traversal_logs)} history traversal logs")
-            
+
            # Debug: Show what we found
            if self.verbose:
                self.logger.debug("    Thread creation logs found:")
                for log in thread_creation_logs:
-                    self.logger.debug(f"      {log['thread_id'][:8]}... parent: {log['parent_id'][:8] if log['parent_id'] else 'None'}...")
+                    self.logger.debug(
+                        f"      {log['thread_id'][:8]}... parent: {log['parent_id'][:8] if log['parent_id'] else 'None'}..."
+                    )
                self.logger.debug("    History traversal logs found:")
                for log in history_traversal_logs:
                    self.logger.debug(f"      {log['thread_id'][:8]}... chain length: {log['chain_length']}")
-            
+
            # Build expected thread relationships
            expected_relationships = []
-            
+
            # Note: A1 and B1 won't appear in thread creation logs because they're new conversations (no parent)
            # Only continuation threads (A2, A3, B2, A1-Branch) will appear in creation logs
-            
+
            # Find logs for each continuation thread
-            a2_log = next((log for log in thread_creation_logs if log['thread_id'] == continuation_id_a2), None)
-            a3_log = next((log for log in thread_creation_logs if log['thread_id'] == continuation_id_a3), None)
-            b2_log = next((log for log in thread_creation_logs if log['thread_id'] == continuation_id_b2), None)
-            a1_branch_log = next((log for log in thread_creation_logs if log['thread_id'] == continuation_id_a1_branch), None)
-            
+            a2_log = next((log for log in thread_creation_logs if log["thread_id"] == continuation_id_a2), None)
+            a3_log = next((log for log in thread_creation_logs if log["thread_id"] == continuation_id_a3), None)
+            b2_log = next((log for log in thread_creation_logs if log["thread_id"] == continuation_id_b2), None)
+            a1_branch_log = next(
+                (log for log in thread_creation_logs if log["thread_id"] == continuation_id_a1_branch), None
+            )
+
            # A2 should have A1 as parent
            if a2_log:
-                expected_relationships.append(("A2 has A1 as parent", a2_log['parent_id'] == continuation_id_a1))
-            
+                expected_relationships.append(("A2 has A1 as parent", a2_log["parent_id"] == continuation_id_a1))
+
            # A3 should have A2 as parent
            if a3_log:
-                expected_relationships.append(("A3 has A2 as parent", a3_log['parent_id'] == continuation_id_a2))
-            
+                expected_relationships.append(("A3 has A2 as parent", a3_log["parent_id"] == continuation_id_a2))
+
            # B2 should have B1 as parent (independent chain)
            if b2_log:
-                expected_relationships.append(("B2 has B1 as parent", b2_log['parent_id'] == continuation_id_b1))
-            
+                expected_relationships.append(("B2 has B1 as parent", b2_log["parent_id"] == continuation_id_b1))
+
            # A1-Branch should have A1 as parent (branching)
            if a1_branch_log:
-                expected_relationships.append(("A1-Branch has A1 as parent", a1_branch_log['parent_id'] == continuation_id_a1))
-            
+                expected_relationships.append(
+                    ("A1-Branch has A1 as parent", a1_branch_log["parent_id"] == continuation_id_a1)
+                )
+
            # Validate history traversal
            traversal_validations = []
-            
+
            # History traversal logs are only generated when conversation history is built from scratch
            # (not when history is already embedded in the prompt by server.py)
            # So we should expect at least 1 traversal log, but not necessarily for every continuation
-            
+
            if len(history_traversal_logs) > 0:
                # Validate that any traversal logs we find have reasonable chain lengths
                for log in history_traversal_logs:
-                    thread_id = log['thread_id']
-                    chain_length = log['chain_length']
-                    
+                    thread_id = log["thread_id"]
+                    chain_length = log["chain_length"]
+
                    # Chain length should be at least 2 for any continuation thread
                    # (original thread + continuation thread)
                    is_valid_length = chain_length >= 2
-                    
+
                    # Try to identify which thread this is for better validation
                    thread_description = "Unknown thread"
                    if thread_id == continuation_id_a2:
@@ -327,12 +323,16 @@ class TestClass:
                    elif thread_id == continuation_id_a1_branch:
                        thread_description = "A1-Branch (should be 2-thread chain)"
                        is_valid_length = chain_length == 2
-                    
-                    traversal_validations.append((f"{thread_description[:8]}... has valid chain length", is_valid_length))
-                    
+
+                    traversal_validations.append(
+                        (f"{thread_description[:8]}... has valid chain length", is_valid_length)
+                    )
+
                # Also validate we found at least one traversal (shows the system is working)
-                traversal_validations.append(("At least one history traversal occurred", len(history_traversal_logs) >= 1))
-            
+                traversal_validations.append(
+                    ("At least one history traversal occurred", len(history_traversal_logs) >= 1)
+                )
+
            # === VALIDATION RESULTS ===
            self.logger.info("  📊 Thread Relationship Validation:")
            relationship_passed = 0
@@ -341,7 +341,7 @@ class TestClass:
                self.logger.info(f"    {status} {desc}")
                if passed:
                    relationship_passed += 1
-            
+
            self.logger.info("  📊 History Traversal Validation:")
            traversal_passed = 0
            for desc, passed in traversal_validations:
@@ -349,31 +349,35 @@ class TestClass:
                self.logger.info(f"    {status} {desc}")
                if passed:
                    traversal_passed += 1
-            
+
            # === SUCCESS CRITERIA ===
            total_relationship_checks = len(expected_relationships)
            total_traversal_checks = len(traversal_validations)
-            
-            self.logger.info(f"  📊 Validation Summary:")
+
+            self.logger.info("  📊 Validation Summary:")
            self.logger.info(f"    Thread relationships: {relationship_passed}/{total_relationship_checks}")
            self.logger.info(f"    History traversal: {traversal_passed}/{total_traversal_checks}")
-            
+
            # Success requires at least 80% of validations to pass
            relationship_success = relationship_passed >= (total_relationship_checks * 0.8)
-            
+
            # If no traversal checks were possible, it means no traversal logs were found
            # This could indicate an issue since we expect at least some history building
            if total_traversal_checks == 0:
-                self.logger.warning("    No history traversal logs found - this may indicate conversation history is always pre-embedded")
+                self.logger.warning(
+                    "    No history traversal logs found - this may indicate conversation history is always pre-embedded"
+                )
                # Still consider it successful since the thread relationships are what matter most
                traversal_success = True
            else:
                traversal_success = traversal_passed >= (total_traversal_checks * 0.8)
-            
+
            overall_success = relationship_success and traversal_success
-            
-            self.logger.info(f"  📊 Conversation Chain Structure:")
-            self.logger.info(f"    Chain A: {continuation_id_a1[:8]} → {continuation_id_a2[:8]} → {continuation_id_a3[:8]}")
+
+            self.logger.info("  📊 Conversation Chain Structure:")
+            self.logger.info(
+                f"    Chain A: {continuation_id_a1[:8]} → {continuation_id_a2[:8]} → {continuation_id_a3[:8]}"
+            )
            self.logger.info(f"    Chain B: {continuation_id_b1[:8]} → {continuation_id_b2[:8]}")
            self.logger.info(f"    Branch:  {continuation_id_a1[:8]} → {continuation_id_a1_branch[:8]}")

@@ -394,13 +398,13 @@ class TestClass:
 def main():
    """Run the conversation chain validation test"""
    import sys
-    
+
    verbose = "--verbose" in sys.argv or "-v" in sys.argv
    test = ConversationChainValidationTest(verbose=verbose)
-    
+
    success = test.run_test()
    sys.exit(0 if success else 1)


 if __name__ == "__main__":
-    main()
+    main()
--- a/simulator_tests/test_o3_model_selection.py
+++ b/simulator_tests/test_o3_model_selection.py
@@ -30,7 +30,7 @@ class O3ModelSelectionTest(BaseSimulatorTest):
            # Read logs directly from the log file - more reliable than docker logs --since
            cmd = ["docker", "exec", self.container_name, "tail", "-n", "200", "/tmp/mcp_server.log"]
            result = subprocess.run(cmd, capture_output=True, text=True)
-            
+
            if result.returncode == 0:
                return result.stdout
            else:
@@ -49,7 +49,7 @@ class O3ModelSelectionTest(BaseSimulatorTest):
            self.setup_test_files()

            # Get timestamp for log filtering
-            start_time = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
+            # NOTE: no timestamp is kept here; logs are read by tailing the log file rather than via --since

            # Test 1: Explicit O3 model selection
            self.logger.info("  1: Testing explicit O3 model selection")
@@ -115,37 +115,26 @@ def multiply(x, y):

            self.logger.info("  ✅ O3 with codereview tool completed")

-            # Validate model usage from server logs  
+            # Validate model usage from server logs
            self.logger.info("  4: Validating model usage in logs")
            logs = self.get_recent_server_logs()

            # Check for OpenAI API calls (this proves O3 models are being used)
-            openai_api_logs = [
-                line for line in logs.split("\n")
-                if "Sending request to openai API" in line
-            ]
+            openai_api_logs = [line for line in logs.split("\n") if "Sending request to openai API" in line]

            # Check for OpenAI HTTP responses (confirms successful O3 calls)
            openai_http_logs = [
-                line for line in logs.split("\n")
-                if "HTTP Request: POST https://api.openai.com" in line
+                line for line in logs.split("\n") if "HTTP Request: POST https://api.openai.com" in line
            ]

            # Check for received responses from OpenAI
-            openai_response_logs = [
-                line for line in logs.split("\n")
-                if "Received response from openai API" in line
-            ]
+            openai_response_logs = [line for line in logs.split("\n") if "Received response from openai API" in line]

            # Check that we have both chat and codereview tool calls to OpenAI
-            chat_openai_logs = [
-                line for line in logs.split("\n")
-                if "Sending request to openai API for chat" in line
-            ]
+            chat_openai_logs = [line for line in logs.split("\n") if "Sending request to openai API for chat" in line]

            codereview_openai_logs = [
-                line for line in logs.split("\n")
-                if "Sending request to openai API for codereview" in line
+                line for line in logs.split("\n") if "Sending request to openai API for codereview" in line
            ]

            # Validation criteria - we expect 3 OpenAI calls (2 chat + 1 codereview)
@@ -178,7 +167,7 @@ def multiply(x, y):
                ("OpenAI HTTP requests successful", openai_http_success),
                ("OpenAI responses received", openai_responses_received),
                ("Chat tool used OpenAI", chat_calls_to_openai),
-                ("Codereview tool used OpenAI", codereview_calls_to_openai)
+                ("Codereview tool used OpenAI", codereview_calls_to_openai),
            ]

            passed_criteria = sum(1 for _, passed in success_criteria if passed)
@@ -214,4 +203,4 @@ def main():


 if __name__ == "__main__":
-    main()
+    main()
--- a/simulator_tests/test_token_allocation_validation.py
+++ b/simulator_tests/test_token_allocation_validation.py
@@ -10,9 +10,8 @@ This test validates that:
 """

 import datetime
-import subprocess
 import re
-from typing import Dict, List, Tuple
+import subprocess

 from .base_test import BaseSimulatorTest

@@ -33,7 +32,7 @@ class TokenAllocationValidationTest(BaseSimulatorTest):
        try:
            cmd = ["docker", "exec", self.container_name, "tail", "-n", "300", "/tmp/mcp_server.log"]
            result = subprocess.run(cmd, capture_output=True, text=True)
-            
+
            if result.returncode == 0:
                return result.stdout
            else:
@@ -43,13 +42,13 @@ class TokenAllocationValidationTest(BaseSimulatorTest):
            self.logger.error(f"Failed to get server logs: {e}")
            return ""

-    def extract_conversation_usage_logs(self, logs: str) -> List[Dict[str, int]]:
+    def extract_conversation_usage_logs(self, logs: str) -> list[dict[str, int]]:
        """Extract actual conversation token usage from server logs"""
        usage_logs = []
-        
+
        # Look for conversation debug logs that show actual usage
-        lines = logs.split('\n')
-        
+        lines = logs.split("\n")
+
        for i, line in enumerate(lines):
            if "[CONVERSATION_DEBUG] Token budget calculation:" in line:
                # Found start of token budget log, extract the following lines
@@ -57,47 +56,47 @@ class TokenAllocationValidationTest(BaseSimulatorTest):
                for j in range(1, 8):  # Next 7 lines contain the usage details
                    if i + j < len(lines):
                        detail_line = lines[i + j]
-                        
+
                        # Parse Total capacity: 1,048,576
                        if "Total capacity:" in detail_line:
-                            match = re.search(r'Total capacity:\s*([\d,]+)', detail_line)
+                            match = re.search(r"Total capacity:\s*([\d,]+)", detail_line)
                            if match:
-                                usage['total_capacity'] = int(match.group(1).replace(',', ''))
-                        
+                                usage["total_capacity"] = int(match.group(1).replace(",", ""))
+
                        # Parse Content allocation: 838,860
                        elif "Content allocation:" in detail_line:
-                            match = re.search(r'Content allocation:\s*([\d,]+)', detail_line)
+                            match = re.search(r"Content allocation:\s*([\d,]+)", detail_line)
                            if match:
-                                usage['content_allocation'] = int(match.group(1).replace(',', ''))
-                        
-                        # Parse Conversation tokens: 12,345  
+                                usage["content_allocation"] = int(match.group(1).replace(",", ""))
+
+                        # Parse Conversation tokens: 12,345
                        elif "Conversation tokens:" in detail_line:
-                            match = re.search(r'Conversation tokens:\s*([\d,]+)', detail_line)
+                            match = re.search(r"Conversation tokens:\s*([\d,]+)", detail_line)
                            if match:
-                                usage['conversation_tokens'] = int(match.group(1).replace(',', ''))
-                        
+                                usage["conversation_tokens"] = int(match.group(1).replace(",", ""))
+
                        # Parse Remaining tokens: 825,515
                        elif "Remaining tokens:" in detail_line:
-                            match = re.search(r'Remaining tokens:\s*([\d,]+)', detail_line)
+                            match = re.search(r"Remaining tokens:\s*([\d,]+)", detail_line)
                            if match:
-                                usage['remaining_tokens'] = int(match.group(1).replace(',', ''))
-                
+                                usage["remaining_tokens"] = int(match.group(1).replace(",", ""))
+
                if usage:  # Only add if we found some usage data
                    usage_logs.append(usage)
-        
+
        return usage_logs

-    def extract_conversation_token_usage(self, logs: str) -> List[int]:
+    def extract_conversation_token_usage(self, logs: str) -> list[int]:
        """Extract conversation token usage from logs"""
        usage_values = []
-        
+
        # Look for conversation token usage logs
-        pattern = r'Conversation history token usage:\s*([\d,]+)'
+        pattern = r"Conversation history token usage:\s*([\d,]+)"
        matches = re.findall(pattern, logs)
-        
+
        for match in matches:
-            usage_values.append(int(match.replace(',', '')))
-        
+            usage_values.append(int(match.replace(",", "")))
+
        return usage_values

    def run_test(self) -> bool:
@@ -111,11 +110,11 @@ class TokenAllocationValidationTest(BaseSimulatorTest):
            # Create additional test files for this test - make them substantial enough to see token differences
            file1_content = """def fibonacci(n):
    '''Calculate fibonacci number recursively
-    
+
    This is a classic recursive algorithm that demonstrates
    the exponential time complexity of naive recursion.
    For large values of n, this becomes very slow.
-    
+
    Time complexity: O(2^n)
    Space complexity: O(n) due to call stack
    '''
@@ -125,10 +124,10 @@ class TokenAllocationValidationTest(BaseSimulatorTest):

 def factorial(n):
    '''Calculate factorial using recursion
-    
+
    More efficient than fibonacci as each value
    is calculated only once.
-    
+
    Time complexity: O(n)
    Space complexity: O(n) due to call stack
    '''
@@ -157,14 +156,14 @@ if __name__ == "__main__":
    for i in range(10):
        print(f"  F({i}) = {fibonacci(i)}")
 """
-            
+
            file2_content = """class Calculator:
    '''Advanced calculator class with error handling and logging'''
-    
+
    def __init__(self):
        self.history = []
        self.last_result = 0
-    
+
    def add(self, a, b):
        '''Addition with history tracking'''
        result = a + b
@@ -172,7 +171,7 @@ if __name__ == "__main__":
        self.history.append(operation)
        self.last_result = result
        return result
-    
+
    def multiply(self, a, b):
        '''Multiplication with history tracking'''
        result = a * b
@@ -180,20 +179,20 @@ if __name__ == "__main__":
        self.history.append(operation)
        self.last_result = result
        return result
-    
+
    def divide(self, a, b):
        '''Division with error handling and history tracking'''
        if b == 0:
            error_msg = f"Division by zero error: {a} / {b}"
            self.history.append(error_msg)
            raise ValueError("Cannot divide by zero")
-        
+
        result = a / b
        operation = f"{a} / {b} = {result}"
        self.history.append(operation)
        self.last_result = result
        return result
-    
+
    def power(self, base, exponent):
        '''Exponentiation with history tracking'''
        result = base ** exponent
@@ -201,11 +200,11 @@ if __name__ == "__main__":
        self.history.append(operation)
        self.last_result = result
        return result
-    
+
    def get_history(self):
        '''Return calculation history'''
        return self.history.copy()
-    
+
    def clear_history(self):
        '''Clear calculation history'''
        self.history.clear()
@@ -215,32 +214,32 @@ if __name__ == "__main__":
 if __name__ == "__main__":
    calc = Calculator()
    print("=== Calculator Demo ===")
-    
+
    # Perform various calculations
    print(f"Addition: {calc.add(10, 20)}")
    print(f"Multiplication: {calc.multiply(5, 8)}")
    print(f"Division: {calc.divide(100, 4)}")
    print(f"Power: {calc.power(2, 8)}")
-    
+
    print("\\nCalculation History:")
    for operation in calc.get_history():
        print(f"  {operation}")
-    
+
    print(f"\\nLast result: {calc.last_result}")
 """

            # Create test files
            file1_path = self.create_additional_test_file("math_functions.py", file1_content)
            file2_path = self.create_additional_test_file("calculator.py", file2_content)
-            
+
            # Track continuation IDs to validate each step generates new ones
            continuation_ids = []

            # Step 1: Initial chat with first file
            self.logger.info("  Step 1: Initial chat with file1 - checking token allocation")
-            
-            step1_start_time = datetime.datetime.now()
-            
+
+            datetime.datetime.now()
+
            response1, continuation_id1 = self.call_mcp_tool(
                "chat",
                {
@@ -260,31 +259,33 @@ if __name__ == "__main__":

            # Get logs and analyze file processing (Step 1 is new conversation, no conversation debug logs expected)
            logs_step1 = self.get_recent_server_logs()
-            
+
            # For Step 1, check for file embedding logs instead of conversation usage
            file_embedding_logs_step1 = [
-                line for line in logs_step1.split('\n')
-                if 'successfully embedded' in line and 'files' in line and 'tokens' in line
+                line
+                for line in logs_step1.split("\n")
+                if "successfully embedded" in line and "files" in line and "tokens" in line
            ]
-            
+
            if not file_embedding_logs_step1:
                self.logger.error("  ❌ Step 1: No file embedding logs found")
                return False
-            
+
            # Extract file token count from embedding logs
            step1_file_tokens = 0
            for log in file_embedding_logs_step1:
                # Look for pattern like "successfully embedded 1 files (146 tokens)"
                import re
-                match = re.search(r'\((\d+) tokens\)', log)
+
+                match = re.search(r"\((\d+) tokens\)", log)
                if match:
                    step1_file_tokens = int(match.group(1))
                    break
-            
+
            self.logger.info(f"  📊 Step 1 File Processing - Embedded files: {step1_file_tokens:,} tokens")
-            
+
            # Validate that file1 is actually mentioned in the embedding logs (check for actual filename)
-            file1_mentioned = any('math_functions.py' in log for log in file_embedding_logs_step1)
+            file1_mentioned = any("math_functions.py" in log for log in file_embedding_logs_step1)
            if not file1_mentioned:
                # Debug: show what files were actually found in the logs
                self.logger.debug("  📋 Files found in embedding logs:")
@@ -300,8 +301,10 @@ if __name__ == "__main__":
                    # Continue test - the important thing is that files were processed

            # Step 2: Different tool continuing same conversation - should build conversation history
-            self.logger.info("  Step 2: Analyze tool continuing chat conversation - checking conversation history buildup")
-            
+            self.logger.info(
+                "  Step 2: Analyze tool continuing chat conversation - checking conversation history buildup"
+            )
+
            response2, continuation_id2 = self.call_mcp_tool(
                "analyze",
                {
@@ -314,12 +317,12 @@ if __name__ == "__main__":
            )

            if not response2 or not continuation_id2:
-                self.logger.error("  ❌ Step 2 failed - no response or continuation ID") 
+                self.logger.error("  ❌ Step 2 failed - no response or continuation ID")
                return False

            self.logger.info(f"  ✅ Step 2 completed with continuation_id: {continuation_id2[:8]}...")
            continuation_ids.append(continuation_id2)
-            
+
            # Validate that we got a different continuation ID
            if continuation_id2 == continuation_id1:
                self.logger.error("  ❌ Step 2: Got same continuation ID as Step 1 - continuation not working")
@@ -328,33 +331,37 @@ if __name__ == "__main__":
            # Get logs and analyze token usage
            logs_step2 = self.get_recent_server_logs()
            usage_step2 = self.extract_conversation_usage_logs(logs_step2)
-            
+
            if len(usage_step2) < 2:
-                self.logger.warning(f"  ⚠️ Step 2: Only found {len(usage_step2)} conversation usage logs, expected at least 2")
-                # Debug: Look for any CONVERSATION_DEBUG logs 
-                conversation_debug_lines = [line for line in logs_step2.split('\n') if 'CONVERSATION_DEBUG' in line]
+                self.logger.warning(
+                    f"  ⚠️ Step 2: Only found {len(usage_step2)} conversation usage logs, expected at least 2"
+                )
+                # Debug: Look for any CONVERSATION_DEBUG logs
+                conversation_debug_lines = [line for line in logs_step2.split("\n") if "CONVERSATION_DEBUG" in line]
                self.logger.debug(f"  📋 Found {len(conversation_debug_lines)} CONVERSATION_DEBUG lines in step 2")
-                
+
                if conversation_debug_lines:
                    self.logger.debug("  📋 Recent CONVERSATION_DEBUG lines:")
                    for line in conversation_debug_lines[-10:]:  # Show last 10
                        self.logger.debug(f"    {line}")
-                
+
                # If we have at least 1 usage log, continue with adjusted expectations
                if len(usage_step2) >= 1:
                    self.logger.info("  📋 Continuing with single usage log for analysis")
                else:
                    self.logger.error("  ❌ No conversation usage logs found at all")
                    return False
-            
+
            latest_usage_step2 = usage_step2[-1]  # Get most recent usage
-            self.logger.info(f"  📊 Step 2 Token Usage - Total Capacity: {latest_usage_step2.get('total_capacity', 0):,}, "
-                            f"Conversation: {latest_usage_step2.get('conversation_tokens', 0):,}, "
-                            f"Remaining: {latest_usage_step2.get('remaining_tokens', 0):,}")
+            self.logger.info(
+                f"  📊 Step 2 Token Usage - Total Capacity: {latest_usage_step2.get('total_capacity', 0):,}, "
+                f"Conversation: {latest_usage_step2.get('conversation_tokens', 0):,}, "
+                f"Remaining: {latest_usage_step2.get('remaining_tokens', 0):,}"
+            )

            # Step 3: Continue conversation with additional file - should show increased token usage
            self.logger.info("  Step 3: Continue conversation with file1 + file2 - checking token growth")
-            
+
            response3, continuation_id3 = self.call_mcp_tool(
                "chat",
                {
@@ -376,26 +383,30 @@ if __name__ == "__main__":
            # Get logs and analyze final token usage
            logs_step3 = self.get_recent_server_logs()
            usage_step3 = self.extract_conversation_usage_logs(logs_step3)
-            
+
            self.logger.info(f"  📋 Found {len(usage_step3)} total conversation usage logs")
-            
+
            if len(usage_step3) < 3:
-                self.logger.warning(f"  ⚠️ Step 3: Only found {len(usage_step3)} conversation usage logs, expected at least 3")
+                self.logger.warning(
+                    f"  ⚠️ Step 3: Only found {len(usage_step3)} conversation usage logs, expected at least 3"
+                )
                # Let's check if we have at least some logs to work with
                if len(usage_step3) == 0:
                    self.logger.error("  ❌ No conversation usage logs found at all")
                    # Debug: show some recent logs
-                    recent_lines = logs_step3.split('\n')[-50:]
+                    recent_lines = logs_step3.split("\n")[-50:]
                    self.logger.debug("  📋 Recent log lines:")
                    for line in recent_lines:
                        if line.strip() and "CONVERSATION_DEBUG" in line:
                            self.logger.debug(f"    {line}")
                    return False
-            
+
            latest_usage_step3 = usage_step3[-1]  # Get most recent usage
-            self.logger.info(f"  📊 Step 3 Token Usage - Total Capacity: {latest_usage_step3.get('total_capacity', 0):,}, "
-                            f"Conversation: {latest_usage_step3.get('conversation_tokens', 0):,}, "
-                            f"Remaining: {latest_usage_step3.get('remaining_tokens', 0):,}")
+            self.logger.info(
+                f"  📊 Step 3 Token Usage - Total Capacity: {latest_usage_step3.get('total_capacity', 0):,}, "
+                f"Conversation: {latest_usage_step3.get('conversation_tokens', 0):,}, "
+                f"Remaining: {latest_usage_step3.get('remaining_tokens', 0):,}"
+            )

            # Validation: Check token processing and conversation history
            self.logger.info("  📋 Validating token processing and conversation history...")
@@ -405,14 +416,14 @@ if __name__ == "__main__":
            step2_remaining = 0
            step3_conversation = 0
            step3_remaining = 0
-            
+
            if len(usage_step2) > 0:
-                step2_conversation = latest_usage_step2.get('conversation_tokens', 0)
-                step2_remaining = latest_usage_step2.get('remaining_tokens', 0)
-            
+                step2_conversation = latest_usage_step2.get("conversation_tokens", 0)
+                step2_remaining = latest_usage_step2.get("remaining_tokens", 0)
+
            if len(usage_step3) >= len(usage_step2) + 1:  # Should have one more log than step2
-                step3_conversation = latest_usage_step3.get('conversation_tokens', 0) 
-                step3_remaining = latest_usage_step3.get('remaining_tokens', 0)
+                step3_conversation = latest_usage_step3.get("conversation_tokens", 0)
+                step3_remaining = latest_usage_step3.get("remaining_tokens", 0)
            else:
                # Use step2 values as fallback
                step3_conversation = step2_conversation
@@ -421,62 +432,78 @@ if __name__ == "__main__":

            # Validation criteria
            criteria = []
-            
+
            # 1. Step 1 should have processed files successfully
            step1_processed_files = step1_file_tokens > 0
            criteria.append(("Step 1 processed files successfully", step1_processed_files))
-            
+
            # 2. Step 2 should have conversation history (if continuation worked)
-            step2_has_conversation = step2_conversation > 0 if len(usage_step2) > 0 else True  # Pass if no logs (might be different issue)
+            step2_has_conversation = (
+                step2_conversation > 0 if len(usage_step2) > 0 else True
+            )  # Pass if no logs (might be different issue)
            step2_has_remaining = step2_remaining > 0 if len(usage_step2) > 0 else True
            criteria.append(("Step 2 has conversation history", step2_has_conversation))
            criteria.append(("Step 2 has remaining tokens", step2_has_remaining))
-            
+
            # 3. Step 3 should show conversation growth
-            step3_has_conversation = step3_conversation >= step2_conversation if len(usage_step3) > len(usage_step2) else True
+            step3_has_conversation = (
+                step3_conversation >= step2_conversation if len(usage_step3) > len(usage_step2) else True
+            )
            criteria.append(("Step 3 maintains conversation history", step3_has_conversation))
-            
+
            # 4. Check that we got some conversation usage logs for continuation calls
            has_conversation_logs = len(usage_step3) > 0
            criteria.append(("Found conversation usage logs", has_conversation_logs))
-            
+
            # 5. Validate unique continuation IDs per response
            unique_continuation_ids = len(set(continuation_ids)) == len(continuation_ids)
            criteria.append(("Each response generated unique continuation ID", unique_continuation_ids))
-            
+
            # 6. Validate continuation IDs were different from each step
-            step_ids_different = len(continuation_ids) == 3 and continuation_ids[0] != continuation_ids[1] and continuation_ids[1] != continuation_ids[2]
+            step_ids_different = (
+                len(continuation_ids) == 3
+                and continuation_ids[0] != continuation_ids[1]
+                and continuation_ids[1] != continuation_ids[2]
+            )
            criteria.append(("All continuation IDs are different", step_ids_different))

            # Log detailed analysis
-            self.logger.info(f"  📊 Token Processing Analysis:")
+            self.logger.info("  📊 Token Processing Analysis:")
            self.logger.info(f"    Step 1 - File tokens: {step1_file_tokens:,} (new conversation)")
            self.logger.info(f"    Step 2 - Conversation: {step2_conversation:,}, Remaining: {step2_remaining:,}")
            self.logger.info(f"    Step 3 - Conversation: {step3_conversation:,}, Remaining: {step3_remaining:,}")
-            
+
            # Log continuation ID analysis
-            self.logger.info(f"  📊 Continuation ID Analysis:")
+            self.logger.info("  📊 Continuation ID Analysis:")
            self.logger.info(f"    Step 1 ID: {continuation_ids[0][:8]}... (generated)")
            self.logger.info(f"    Step 2 ID: {continuation_ids[1][:8]}... (generated from Step 1)")
            self.logger.info(f"    Step 3 ID: {continuation_ids[2][:8]}... (generated from Step 2)")
-            
+
            # Check for file mentions in step 3 (should include both files)
            # Look for file processing in conversation memory logs and tool embedding logs
-            file2_mentioned_step3 = any('calculator.py' in log for log in logs_step3.split('\n') if ('embedded' in log.lower() and ('conversation' in log.lower() or 'tool' in log.lower())))
-            file1_still_mentioned_step3 = any('math_functions.py' in log for log in logs_step3.split('\n') if ('embedded' in log.lower() and ('conversation' in log.lower() or 'tool' in log.lower())))
-            
-            self.logger.info(f"  📊 File Processing in Step 3:")
+            file2_mentioned_step3 = any(
+                "calculator.py" in log
+                for log in logs_step3.split("\n")
+                if ("embedded" in log.lower() and ("conversation" in log.lower() or "tool" in log.lower()))
+            )
+            file1_still_mentioned_step3 = any(
+                "math_functions.py" in log
+                for log in logs_step3.split("\n")
+                if ("embedded" in log.lower() and ("conversation" in log.lower() or "tool" in log.lower()))
+            )
+
+            self.logger.info("  📊 File Processing in Step 3:")
            self.logger.info(f"    File1 (math_functions.py) mentioned: {file1_still_mentioned_step3}")
            self.logger.info(f"    File2 (calculator.py) mentioned: {file2_mentioned_step3}")
-            
-            # Add file increase validation 
+
+            # Add file increase validation
            step3_file_increase = file2_mentioned_step3  # New file should be visible
            criteria.append(("Step 3 shows new file being processed", step3_file_increase))

            # Check validation criteria
            passed_criteria = sum(1 for _, passed in criteria if passed)
            total_criteria = len(criteria)
-            
+
            self.logger.info(f"  📊 Validation criteria: {passed_criteria}/{total_criteria}")
            for criterion, passed in criteria:
                status = "✅" if passed else "❌"
@@ -484,15 +511,11 @@ if __name__ == "__main__":

            # Check for file embedding logs
            file_embedding_logs = [
-                line for line in logs_step3.split('\n')
-                if 'tool embedding' in line and 'files' in line
-            ]
-            
-            conversation_logs = [
-                line for line in logs_step3.split('\n') 
-                if 'conversation history' in line.lower()
+                line for line in logs_step3.split("\n") if "tool embedding" in line and "files" in line
            ]

+            conversation_logs = [line for line in logs_step3.split("\n") if "conversation history" in line.lower()]
+
            self.logger.info(f"  📊 File embedding logs: {len(file_embedding_logs)}")
            self.logger.info(f"  📊 Conversation history logs: {len(conversation_logs)}")

@@ -516,13 +539,13 @@ if __name__ == "__main__":
 def main():
    """Run the token allocation validation test"""
    import sys
-    
+
    verbose = "--verbose" in sys.argv or "-v" in sys.argv
    test = TokenAllocationValidationTest(verbose=verbose)
-    
+
    success = test.run_test()
    sys.exit(0 if success else 1)


 if __name__ == "__main__":
-    main()
+    main()