Rebranding, refactoring, renaming, cleanup, updated docs
This commit is contained in:
@@ -1,13 +1,14 @@
|
||||
"""
|
||||
Communication Simulator Tests Package
|
||||
|
||||
This package contains individual test modules for the Gemini MCP Communication Simulator.
|
||||
This package contains individual test modules for the Zen MCP Communication Simulator.
|
||||
Each test is in its own file for better organization and maintainability.
|
||||
"""
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
from .test_basic_conversation import BasicConversationTest
|
||||
from .test_content_validation import ContentValidationTest
|
||||
from .test_conversation_chain_validation import ConversationChainValidationTest
|
||||
from .test_cross_tool_comprehensive import CrossToolComprehensiveTest
|
||||
from .test_cross_tool_continuation import CrossToolContinuationTest
|
||||
from .test_logs_validation import LogsValidationTest
|
||||
@@ -16,7 +17,6 @@ from .test_o3_model_selection import O3ModelSelectionTest
|
||||
from .test_per_tool_deduplication import PerToolDeduplicationTest
|
||||
from .test_redis_validation import RedisValidationTest
|
||||
from .test_token_allocation_validation import TokenAllocationValidationTest
|
||||
from .test_conversation_chain_validation import ConversationChainValidationTest
|
||||
|
||||
# Test registry for dynamic loading
|
||||
TEST_REGISTRY = {
|
||||
|
||||
@@ -19,8 +19,8 @@ class BaseSimulatorTest:
|
||||
self.verbose = verbose
|
||||
self.test_files = {}
|
||||
self.test_dir = None
|
||||
self.container_name = "gemini-mcp-server"
|
||||
self.redis_container = "gemini-mcp-redis"
|
||||
self.container_name = "zen-mcp-server"
|
||||
self.redis_container = "zen-mcp-redis"
|
||||
|
||||
# Configure logging
|
||||
log_level = logging.DEBUG if verbose else logging.INFO
|
||||
|
||||
@@ -6,7 +6,6 @@ Tests that tools don't duplicate file content in their responses.
|
||||
This test is specifically designed to catch content duplication bugs.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
@@ -31,6 +30,7 @@ class ContentValidationTest(BaseSimulatorTest):
|
||||
cmd_monitor = ["docker", "logs", "--since", since_time, "gemini-mcp-log-monitor"]
|
||||
|
||||
import subprocess
|
||||
|
||||
result_server = subprocess.run(cmd_server, capture_output=True, text=True)
|
||||
result_monitor = subprocess.run(cmd_monitor, capture_output=True, text=True)
|
||||
|
||||
@@ -76,6 +76,7 @@ DATABASE_CONFIG = {
|
||||
|
||||
# Get timestamp for log filtering
|
||||
import datetime
|
||||
|
||||
start_time = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
|
||||
|
||||
# Test 1: Initial tool call with validation file
|
||||
@@ -139,26 +140,25 @@ DATABASE_CONFIG = {
|
||||
|
||||
# Check for proper file embedding logs
|
||||
embedding_logs = [
|
||||
line for line in logs.split("\n")
|
||||
if "📁" in line or "embedding" in line.lower() or "[FILES]" in line
|
||||
line for line in logs.split("\n") if "📁" in line or "embedding" in line.lower() or "[FILES]" in line
|
||||
]
|
||||
|
||||
# Check for deduplication evidence
|
||||
deduplication_logs = [
|
||||
line for line in logs.split("\n")
|
||||
line
|
||||
for line in logs.split("\n")
|
||||
if "skipping" in line.lower() and "already in conversation" in line.lower()
|
||||
]
|
||||
|
||||
# Check for file processing patterns
|
||||
new_file_logs = [
|
||||
line for line in logs.split("\n")
|
||||
if "all 1 files are new" in line or "New conversation" in line
|
||||
line for line in logs.split("\n") if "all 1 files are new" in line or "New conversation" in line
|
||||
]
|
||||
|
||||
# Validation criteria
|
||||
validation_file_mentioned = any("validation_config.py" in line for line in logs.split("\n"))
|
||||
embedding_found = len(embedding_logs) > 0
|
||||
proper_deduplication = len(deduplication_logs) > 0 or len(new_file_logs) >= 2 # Should see new conversation patterns
|
||||
(len(deduplication_logs) > 0 or len(new_file_logs) >= 2) # Should see new conversation patterns
|
||||
|
||||
self.logger.info(f" 📊 Embedding logs found: {len(embedding_logs)}")
|
||||
self.logger.info(f" 📊 Deduplication evidence: {len(deduplication_logs)}")
|
||||
@@ -175,7 +175,7 @@ DATABASE_CONFIG = {
|
||||
success_criteria = [
|
||||
("Embedding logs found", embedding_found),
|
||||
("File processing evidence", validation_file_mentioned),
|
||||
("Multiple tool calls", len(new_file_logs) >= 2)
|
||||
("Multiple tool calls", len(new_file_logs) >= 2),
|
||||
]
|
||||
|
||||
passed_criteria = sum(1 for _, passed in success_criteria if passed)
|
||||
|
||||
@@ -4,14 +4,14 @@ Conversation Chain and Threading Validation Test
|
||||
|
||||
This test validates that:
|
||||
1. Multiple tool invocations create proper parent->parent->parent chains
|
||||
2. New conversations can be started independently
|
||||
2. New conversations can be started independently
|
||||
3. Original conversation chains can be resumed from any point
|
||||
4. History traversal works correctly for all scenarios
|
||||
5. Thread relationships are properly maintained in Redis
|
||||
|
||||
Test Flow:
|
||||
Chain A: chat -> analyze -> debug (3 linked threads)
|
||||
Chain B: chat -> analyze (2 linked threads, independent)
|
||||
Chain B: chat -> analyze (2 linked threads, independent)
|
||||
Chain A Branch: debug (continue from original chat, creating branch)
|
||||
|
||||
This validates the conversation threading system's ability to:
|
||||
@@ -21,10 +21,8 @@ This validates the conversation threading system's ability to:
|
||||
- Properly traverse parent relationships for history reconstruction
|
||||
"""
|
||||
|
||||
import datetime
|
||||
import subprocess
|
||||
import re
|
||||
from typing import Dict, List, Tuple, Optional
|
||||
import subprocess
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
|
||||
@@ -45,7 +43,7 @@ class ConversationChainValidationTest(BaseSimulatorTest):
|
||||
try:
|
||||
cmd = ["docker", "exec", self.container_name, "tail", "-n", "500", "/tmp/mcp_server.log"]
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
|
||||
|
||||
if result.returncode == 0:
|
||||
return result.stdout
|
||||
else:
|
||||
@@ -55,44 +53,36 @@ class ConversationChainValidationTest(BaseSimulatorTest):
|
||||
self.logger.error(f"Failed to get server logs: {e}")
|
||||
return ""
|
||||
|
||||
def extract_thread_creation_logs(self, logs: str) -> List[Dict[str, str]]:
|
||||
def extract_thread_creation_logs(self, logs: str) -> list[dict[str, str]]:
|
||||
"""Extract thread creation logs with parent relationships"""
|
||||
thread_logs = []
|
||||
|
||||
lines = logs.split('\n')
|
||||
|
||||
lines = logs.split("\n")
|
||||
for line in lines:
|
||||
if "[THREAD] Created new thread" in line:
|
||||
# Parse: [THREAD] Created new thread 9dc779eb-645f-4850-9659-34c0e6978d73 with parent a0ce754d-c995-4b3e-9103-88af429455aa
|
||||
match = re.search(r'\[THREAD\] Created new thread ([a-f0-9-]+) with parent ([a-f0-9-]+|None)', line)
|
||||
match = re.search(r"\[THREAD\] Created new thread ([a-f0-9-]+) with parent ([a-f0-9-]+|None)", line)
|
||||
if match:
|
||||
thread_id = match.group(1)
|
||||
parent_id = match.group(2) if match.group(2) != "None" else None
|
||||
thread_logs.append({
|
||||
"thread_id": thread_id,
|
||||
"parent_id": parent_id,
|
||||
"log_line": line
|
||||
})
|
||||
|
||||
thread_logs.append({"thread_id": thread_id, "parent_id": parent_id, "log_line": line})
|
||||
|
||||
return thread_logs
|
||||
|
||||
def extract_history_traversal_logs(self, logs: str) -> List[Dict[str, str]]:
|
||||
def extract_history_traversal_logs(self, logs: str) -> list[dict[str, str]]:
|
||||
"""Extract conversation history traversal logs"""
|
||||
traversal_logs = []
|
||||
|
||||
lines = logs.split('\n')
|
||||
|
||||
lines = logs.split("\n")
|
||||
for line in lines:
|
||||
if "[THREAD] Retrieved chain of" in line:
|
||||
# Parse: [THREAD] Retrieved chain of 3 threads for 9dc779eb-645f-4850-9659-34c0e6978d73
|
||||
match = re.search(r'\[THREAD\] Retrieved chain of (\d+) threads for ([a-f0-9-]+)', line)
|
||||
match = re.search(r"\[THREAD\] Retrieved chain of (\d+) threads for ([a-f0-9-]+)", line)
|
||||
if match:
|
||||
chain_length = int(match.group(1))
|
||||
thread_id = match.group(2)
|
||||
traversal_logs.append({
|
||||
"thread_id": thread_id,
|
||||
"chain_length": chain_length,
|
||||
"log_line": line
|
||||
})
|
||||
|
||||
traversal_logs.append({"thread_id": thread_id, "chain_length": chain_length, "log_line": line})
|
||||
|
||||
return traversal_logs
|
||||
|
||||
def run_test(self) -> bool:
|
||||
@@ -113,16 +103,16 @@ class TestClass:
|
||||
return "Method in test class"
|
||||
"""
|
||||
test_file_path = self.create_additional_test_file("chain_test.py", test_file_content)
|
||||
|
||||
|
||||
# Track all continuation IDs and their relationships
|
||||
conversation_chains = {}
|
||||
|
||||
|
||||
# === CHAIN A: Build linear conversation chain ===
|
||||
self.logger.info(" 🔗 Chain A: Building linear conversation chain")
|
||||
|
||||
|
||||
# Step A1: Start with chat tool (creates thread_id_1)
|
||||
self.logger.info(" Step A1: Chat tool - start new conversation")
|
||||
|
||||
|
||||
response_a1, continuation_id_a1 = self.call_mcp_tool(
|
||||
"chat",
|
||||
{
|
||||
@@ -138,11 +128,11 @@ class TestClass:
|
||||
return False
|
||||
|
||||
self.logger.info(f" ✅ Step A1 completed - thread_id: {continuation_id_a1[:8]}...")
|
||||
conversation_chains['A1'] = continuation_id_a1
|
||||
conversation_chains["A1"] = continuation_id_a1
|
||||
|
||||
# Step A2: Continue with analyze tool (creates thread_id_2 with parent=thread_id_1)
|
||||
self.logger.info(" Step A2: Analyze tool - continue Chain A")
|
||||
|
||||
|
||||
response_a2, continuation_id_a2 = self.call_mcp_tool(
|
||||
"analyze",
|
||||
{
|
||||
@@ -159,11 +149,11 @@ class TestClass:
|
||||
return False
|
||||
|
||||
self.logger.info(f" ✅ Step A2 completed - thread_id: {continuation_id_a2[:8]}...")
|
||||
conversation_chains['A2'] = continuation_id_a2
|
||||
conversation_chains["A2"] = continuation_id_a2
|
||||
|
||||
# Step A3: Continue with debug tool (creates thread_id_3 with parent=thread_id_2)
|
||||
# Step A3: Continue with debug tool (creates thread_id_3 with parent=thread_id_2)
|
||||
self.logger.info(" Step A3: Debug tool - continue Chain A")
|
||||
|
||||
|
||||
response_a3, continuation_id_a3 = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
@@ -180,14 +170,14 @@ class TestClass:
|
||||
return False
|
||||
|
||||
self.logger.info(f" ✅ Step A3 completed - thread_id: {continuation_id_a3[:8]}...")
|
||||
conversation_chains['A3'] = continuation_id_a3
|
||||
conversation_chains["A3"] = continuation_id_a3
|
||||
|
||||
# === CHAIN B: Start independent conversation ===
|
||||
self.logger.info(" 🔗 Chain B: Starting independent conversation")
|
||||
|
||||
|
||||
# Step B1: Start new chat conversation (creates thread_id_4, no parent)
|
||||
self.logger.info(" Step B1: Chat tool - start NEW independent conversation")
|
||||
|
||||
|
||||
response_b1, continuation_id_b1 = self.call_mcp_tool(
|
||||
"chat",
|
||||
{
|
||||
@@ -202,11 +192,11 @@ class TestClass:
|
||||
return False
|
||||
|
||||
self.logger.info(f" ✅ Step B1 completed - thread_id: {continuation_id_b1[:8]}...")
|
||||
conversation_chains['B1'] = continuation_id_b1
|
||||
conversation_chains["B1"] = continuation_id_b1
|
||||
|
||||
# Step B2: Continue the new conversation (creates thread_id_5 with parent=thread_id_4)
|
||||
self.logger.info(" Step B2: Analyze tool - continue Chain B")
|
||||
|
||||
|
||||
response_b2, continuation_id_b2 = self.call_mcp_tool(
|
||||
"analyze",
|
||||
{
|
||||
@@ -222,14 +212,14 @@ class TestClass:
|
||||
return False
|
||||
|
||||
self.logger.info(f" ✅ Step B2 completed - thread_id: {continuation_id_b2[:8]}...")
|
||||
conversation_chains['B2'] = continuation_id_b2
|
||||
conversation_chains["B2"] = continuation_id_b2
|
||||
|
||||
# === CHAIN A BRANCH: Go back to original conversation ===
|
||||
self.logger.info(" 🔗 Chain A Branch: Resume original conversation from A1")
|
||||
|
||||
|
||||
# Step A1-Branch: Use original continuation_id_a1 to branch (creates thread_id_6 with parent=thread_id_1)
|
||||
self.logger.info(" Step A1-Branch: Debug tool - branch from original Chain A")
|
||||
|
||||
|
||||
response_a1_branch, continuation_id_a1_branch = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
@@ -246,73 +236,79 @@ class TestClass:
|
||||
return False
|
||||
|
||||
self.logger.info(f" ✅ Step A1-Branch completed - thread_id: {continuation_id_a1_branch[:8]}...")
|
||||
conversation_chains['A1_Branch'] = continuation_id_a1_branch
|
||||
conversation_chains["A1_Branch"] = continuation_id_a1_branch
|
||||
|
||||
# === ANALYSIS: Validate thread relationships and history traversal ===
|
||||
self.logger.info(" 📊 Analyzing conversation chain structure...")
|
||||
|
||||
|
||||
# Get logs and extract thread relationships
|
||||
logs = self.get_recent_server_logs()
|
||||
thread_creation_logs = self.extract_thread_creation_logs(logs)
|
||||
history_traversal_logs = self.extract_history_traversal_logs(logs)
|
||||
|
||||
|
||||
self.logger.info(f" Found {len(thread_creation_logs)} thread creation logs")
|
||||
self.logger.info(f" Found {len(history_traversal_logs)} history traversal logs")
|
||||
|
||||
|
||||
# Debug: Show what we found
|
||||
if self.verbose:
|
||||
self.logger.debug(" Thread creation logs found:")
|
||||
for log in thread_creation_logs:
|
||||
self.logger.debug(f" {log['thread_id'][:8]}... parent: {log['parent_id'][:8] if log['parent_id'] else 'None'}...")
|
||||
self.logger.debug(
|
||||
f" {log['thread_id'][:8]}... parent: {log['parent_id'][:8] if log['parent_id'] else 'None'}..."
|
||||
)
|
||||
self.logger.debug(" History traversal logs found:")
|
||||
for log in history_traversal_logs:
|
||||
self.logger.debug(f" {log['thread_id'][:8]}... chain length: {log['chain_length']}")
|
||||
|
||||
|
||||
# Build expected thread relationships
|
||||
expected_relationships = []
|
||||
|
||||
|
||||
# Note: A1 and B1 won't appear in thread creation logs because they're new conversations (no parent)
|
||||
# Only continuation threads (A2, A3, B2, A1-Branch) will appear in creation logs
|
||||
|
||||
|
||||
# Find logs for each continuation thread
|
||||
a2_log = next((log for log in thread_creation_logs if log['thread_id'] == continuation_id_a2), None)
|
||||
a3_log = next((log for log in thread_creation_logs if log['thread_id'] == continuation_id_a3), None)
|
||||
b2_log = next((log for log in thread_creation_logs if log['thread_id'] == continuation_id_b2), None)
|
||||
a1_branch_log = next((log for log in thread_creation_logs if log['thread_id'] == continuation_id_a1_branch), None)
|
||||
|
||||
a2_log = next((log for log in thread_creation_logs if log["thread_id"] == continuation_id_a2), None)
|
||||
a3_log = next((log for log in thread_creation_logs if log["thread_id"] == continuation_id_a3), None)
|
||||
b2_log = next((log for log in thread_creation_logs if log["thread_id"] == continuation_id_b2), None)
|
||||
a1_branch_log = next(
|
||||
(log for log in thread_creation_logs if log["thread_id"] == continuation_id_a1_branch), None
|
||||
)
|
||||
|
||||
# A2 should have A1 as parent
|
||||
if a2_log:
|
||||
expected_relationships.append(("A2 has A1 as parent", a2_log['parent_id'] == continuation_id_a1))
|
||||
|
||||
expected_relationships.append(("A2 has A1 as parent", a2_log["parent_id"] == continuation_id_a1))
|
||||
|
||||
# A3 should have A2 as parent
|
||||
if a3_log:
|
||||
expected_relationships.append(("A3 has A2 as parent", a3_log['parent_id'] == continuation_id_a2))
|
||||
|
||||
expected_relationships.append(("A3 has A2 as parent", a3_log["parent_id"] == continuation_id_a2))
|
||||
|
||||
# B2 should have B1 as parent (independent chain)
|
||||
if b2_log:
|
||||
expected_relationships.append(("B2 has B1 as parent", b2_log['parent_id'] == continuation_id_b1))
|
||||
|
||||
expected_relationships.append(("B2 has B1 as parent", b2_log["parent_id"] == continuation_id_b1))
|
||||
|
||||
# A1-Branch should have A1 as parent (branching)
|
||||
if a1_branch_log:
|
||||
expected_relationships.append(("A1-Branch has A1 as parent", a1_branch_log['parent_id'] == continuation_id_a1))
|
||||
|
||||
expected_relationships.append(
|
||||
("A1-Branch has A1 as parent", a1_branch_log["parent_id"] == continuation_id_a1)
|
||||
)
|
||||
|
||||
# Validate history traversal
|
||||
traversal_validations = []
|
||||
|
||||
|
||||
# History traversal logs are only generated when conversation history is built from scratch
|
||||
# (not when history is already embedded in the prompt by server.py)
|
||||
# So we should expect at least 1 traversal log, but not necessarily for every continuation
|
||||
|
||||
|
||||
if len(history_traversal_logs) > 0:
|
||||
# Validate that any traversal logs we find have reasonable chain lengths
|
||||
for log in history_traversal_logs:
|
||||
thread_id = log['thread_id']
|
||||
chain_length = log['chain_length']
|
||||
|
||||
thread_id = log["thread_id"]
|
||||
chain_length = log["chain_length"]
|
||||
|
||||
# Chain length should be at least 2 for any continuation thread
|
||||
# (original thread + continuation thread)
|
||||
is_valid_length = chain_length >= 2
|
||||
|
||||
|
||||
# Try to identify which thread this is for better validation
|
||||
thread_description = "Unknown thread"
|
||||
if thread_id == continuation_id_a2:
|
||||
@@ -327,12 +323,16 @@ class TestClass:
|
||||
elif thread_id == continuation_id_a1_branch:
|
||||
thread_description = "A1-Branch (should be 2-thread chain)"
|
||||
is_valid_length = chain_length == 2
|
||||
|
||||
traversal_validations.append((f"{thread_description[:8]}... has valid chain length", is_valid_length))
|
||||
|
||||
|
||||
traversal_validations.append(
|
||||
(f"{thread_description[:8]}... has valid chain length", is_valid_length)
|
||||
)
|
||||
|
||||
# Also validate we found at least one traversal (shows the system is working)
|
||||
traversal_validations.append(("At least one history traversal occurred", len(history_traversal_logs) >= 1))
|
||||
|
||||
traversal_validations.append(
|
||||
("At least one history traversal occurred", len(history_traversal_logs) >= 1)
|
||||
)
|
||||
|
||||
# === VALIDATION RESULTS ===
|
||||
self.logger.info(" 📊 Thread Relationship Validation:")
|
||||
relationship_passed = 0
|
||||
@@ -341,7 +341,7 @@ class TestClass:
|
||||
self.logger.info(f" {status} {desc}")
|
||||
if passed:
|
||||
relationship_passed += 1
|
||||
|
||||
|
||||
self.logger.info(" 📊 History Traversal Validation:")
|
||||
traversal_passed = 0
|
||||
for desc, passed in traversal_validations:
|
||||
@@ -349,31 +349,35 @@ class TestClass:
|
||||
self.logger.info(f" {status} {desc}")
|
||||
if passed:
|
||||
traversal_passed += 1
|
||||
|
||||
|
||||
# === SUCCESS CRITERIA ===
|
||||
total_relationship_checks = len(expected_relationships)
|
||||
total_traversal_checks = len(traversal_validations)
|
||||
|
||||
self.logger.info(f" 📊 Validation Summary:")
|
||||
|
||||
self.logger.info(" 📊 Validation Summary:")
|
||||
self.logger.info(f" Thread relationships: {relationship_passed}/{total_relationship_checks}")
|
||||
self.logger.info(f" History traversal: {traversal_passed}/{total_traversal_checks}")
|
||||
|
||||
|
||||
# Success requires at least 80% of validations to pass
|
||||
relationship_success = relationship_passed >= (total_relationship_checks * 0.8)
|
||||
|
||||
|
||||
# If no traversal checks were possible, it means no traversal logs were found
|
||||
# This could indicate an issue since we expect at least some history building
|
||||
if total_traversal_checks == 0:
|
||||
self.logger.warning(" No history traversal logs found - this may indicate conversation history is always pre-embedded")
|
||||
self.logger.warning(
|
||||
" No history traversal logs found - this may indicate conversation history is always pre-embedded"
|
||||
)
|
||||
# Still consider it successful since the thread relationships are what matter most
|
||||
traversal_success = True
|
||||
else:
|
||||
traversal_success = traversal_passed >= (total_traversal_checks * 0.8)
|
||||
|
||||
|
||||
overall_success = relationship_success and traversal_success
|
||||
|
||||
self.logger.info(f" 📊 Conversation Chain Structure:")
|
||||
self.logger.info(f" Chain A: {continuation_id_a1[:8]} → {continuation_id_a2[:8]} → {continuation_id_a3[:8]}")
|
||||
|
||||
self.logger.info(" 📊 Conversation Chain Structure:")
|
||||
self.logger.info(
|
||||
f" Chain A: {continuation_id_a1[:8]} → {continuation_id_a2[:8]} → {continuation_id_a3[:8]}"
|
||||
)
|
||||
self.logger.info(f" Chain B: {continuation_id_b1[:8]} → {continuation_id_b2[:8]}")
|
||||
self.logger.info(f" Branch: {continuation_id_a1[:8]} → {continuation_id_a1_branch[:8]}")
|
||||
|
||||
@@ -394,13 +398,13 @@ class TestClass:
|
||||
def main():
|
||||
"""Run the conversation chain validation test"""
|
||||
import sys
|
||||
|
||||
|
||||
verbose = "--verbose" in sys.argv or "-v" in sys.argv
|
||||
test = ConversationChainValidationTest(verbose=verbose)
|
||||
|
||||
|
||||
success = test.run_test()
|
||||
sys.exit(0 if success else 1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
|
||||
@@ -30,7 +30,7 @@ class O3ModelSelectionTest(BaseSimulatorTest):
|
||||
# Read logs directly from the log file - more reliable than docker logs --since
|
||||
cmd = ["docker", "exec", self.container_name, "tail", "-n", "200", "/tmp/mcp_server.log"]
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
|
||||
|
||||
if result.returncode == 0:
|
||||
return result.stdout
|
||||
else:
|
||||
@@ -49,7 +49,7 @@ class O3ModelSelectionTest(BaseSimulatorTest):
|
||||
self.setup_test_files()
|
||||
|
||||
# Get timestamp for log filtering
|
||||
start_time = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
|
||||
datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
|
||||
|
||||
# Test 1: Explicit O3 model selection
|
||||
self.logger.info(" 1: Testing explicit O3 model selection")
|
||||
@@ -115,37 +115,26 @@ def multiply(x, y):
|
||||
|
||||
self.logger.info(" ✅ O3 with codereview tool completed")
|
||||
|
||||
# Validate model usage from server logs
|
||||
# Validate model usage from server logs
|
||||
self.logger.info(" 4: Validating model usage in logs")
|
||||
logs = self.get_recent_server_logs()
|
||||
|
||||
# Check for OpenAI API calls (this proves O3 models are being used)
|
||||
openai_api_logs = [
|
||||
line for line in logs.split("\n")
|
||||
if "Sending request to openai API" in line
|
||||
]
|
||||
openai_api_logs = [line for line in logs.split("\n") if "Sending request to openai API" in line]
|
||||
|
||||
# Check for OpenAI HTTP responses (confirms successful O3 calls)
|
||||
openai_http_logs = [
|
||||
line for line in logs.split("\n")
|
||||
if "HTTP Request: POST https://api.openai.com" in line
|
||||
line for line in logs.split("\n") if "HTTP Request: POST https://api.openai.com" in line
|
||||
]
|
||||
|
||||
# Check for received responses from OpenAI
|
||||
openai_response_logs = [
|
||||
line for line in logs.split("\n")
|
||||
if "Received response from openai API" in line
|
||||
]
|
||||
openai_response_logs = [line for line in logs.split("\n") if "Received response from openai API" in line]
|
||||
|
||||
# Check that we have both chat and codereview tool calls to OpenAI
|
||||
chat_openai_logs = [
|
||||
line for line in logs.split("\n")
|
||||
if "Sending request to openai API for chat" in line
|
||||
]
|
||||
chat_openai_logs = [line for line in logs.split("\n") if "Sending request to openai API for chat" in line]
|
||||
|
||||
codereview_openai_logs = [
|
||||
line for line in logs.split("\n")
|
||||
if "Sending request to openai API for codereview" in line
|
||||
line for line in logs.split("\n") if "Sending request to openai API for codereview" in line
|
||||
]
|
||||
|
||||
# Validation criteria - we expect 3 OpenAI calls (2 chat + 1 codereview)
|
||||
@@ -178,7 +167,7 @@ def multiply(x, y):
|
||||
("OpenAI HTTP requests successful", openai_http_success),
|
||||
("OpenAI responses received", openai_responses_received),
|
||||
("Chat tool used OpenAI", chat_calls_to_openai),
|
||||
("Codereview tool used OpenAI", codereview_calls_to_openai)
|
||||
("Codereview tool used OpenAI", codereview_calls_to_openai),
|
||||
]
|
||||
|
||||
passed_criteria = sum(1 for _, passed in success_criteria if passed)
|
||||
@@ -214,4 +203,4 @@ def main():
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
|
||||
@@ -10,9 +10,8 @@ This test validates that:
|
||||
"""
|
||||
|
||||
import datetime
|
||||
import subprocess
|
||||
import re
|
||||
from typing import Dict, List, Tuple
|
||||
import subprocess
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
|
||||
@@ -33,7 +32,7 @@ class TokenAllocationValidationTest(BaseSimulatorTest):
|
||||
try:
|
||||
cmd = ["docker", "exec", self.container_name, "tail", "-n", "300", "/tmp/mcp_server.log"]
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
|
||||
|
||||
if result.returncode == 0:
|
||||
return result.stdout
|
||||
else:
|
||||
@@ -43,13 +42,13 @@ class TokenAllocationValidationTest(BaseSimulatorTest):
|
||||
self.logger.error(f"Failed to get server logs: {e}")
|
||||
return ""
|
||||
|
||||
def extract_conversation_usage_logs(self, logs: str) -> List[Dict[str, int]]:
|
||||
def extract_conversation_usage_logs(self, logs: str) -> list[dict[str, int]]:
|
||||
"""Extract actual conversation token usage from server logs"""
|
||||
usage_logs = []
|
||||
|
||||
|
||||
# Look for conversation debug logs that show actual usage
|
||||
lines = logs.split('\n')
|
||||
|
||||
lines = logs.split("\n")
|
||||
|
||||
for i, line in enumerate(lines):
|
||||
if "[CONVERSATION_DEBUG] Token budget calculation:" in line:
|
||||
# Found start of token budget log, extract the following lines
|
||||
@@ -57,47 +56,47 @@ class TokenAllocationValidationTest(BaseSimulatorTest):
|
||||
for j in range(1, 8): # Next 7 lines contain the usage details
|
||||
if i + j < len(lines):
|
||||
detail_line = lines[i + j]
|
||||
|
||||
|
||||
# Parse Total capacity: 1,048,576
|
||||
if "Total capacity:" in detail_line:
|
||||
match = re.search(r'Total capacity:\s*([\d,]+)', detail_line)
|
||||
match = re.search(r"Total capacity:\s*([\d,]+)", detail_line)
|
||||
if match:
|
||||
usage['total_capacity'] = int(match.group(1).replace(',', ''))
|
||||
|
||||
usage["total_capacity"] = int(match.group(1).replace(",", ""))
|
||||
|
||||
# Parse Content allocation: 838,860
|
||||
elif "Content allocation:" in detail_line:
|
||||
match = re.search(r'Content allocation:\s*([\d,]+)', detail_line)
|
||||
match = re.search(r"Content allocation:\s*([\d,]+)", detail_line)
|
||||
if match:
|
||||
usage['content_allocation'] = int(match.group(1).replace(',', ''))
|
||||
|
||||
# Parse Conversation tokens: 12,345
|
||||
usage["content_allocation"] = int(match.group(1).replace(",", ""))
|
||||
|
||||
# Parse Conversation tokens: 12,345
|
||||
elif "Conversation tokens:" in detail_line:
|
||||
match = re.search(r'Conversation tokens:\s*([\d,]+)', detail_line)
|
||||
match = re.search(r"Conversation tokens:\s*([\d,]+)", detail_line)
|
||||
if match:
|
||||
usage['conversation_tokens'] = int(match.group(1).replace(',', ''))
|
||||
|
||||
usage["conversation_tokens"] = int(match.group(1).replace(",", ""))
|
||||
|
||||
# Parse Remaining tokens: 825,515
|
||||
elif "Remaining tokens:" in detail_line:
|
||||
match = re.search(r'Remaining tokens:\s*([\d,]+)', detail_line)
|
||||
match = re.search(r"Remaining tokens:\s*([\d,]+)", detail_line)
|
||||
if match:
|
||||
usage['remaining_tokens'] = int(match.group(1).replace(',', ''))
|
||||
|
||||
usage["remaining_tokens"] = int(match.group(1).replace(",", ""))
|
||||
|
||||
if usage: # Only add if we found some usage data
|
||||
usage_logs.append(usage)
|
||||
|
||||
|
||||
return usage_logs
|
||||
|
||||
def extract_conversation_token_usage(self, logs: str) -> List[int]:
|
||||
def extract_conversation_token_usage(self, logs: str) -> list[int]:
|
||||
"""Extract conversation token usage from logs"""
|
||||
usage_values = []
|
||||
|
||||
|
||||
# Look for conversation token usage logs
|
||||
pattern = r'Conversation history token usage:\s*([\d,]+)'
|
||||
pattern = r"Conversation history token usage:\s*([\d,]+)"
|
||||
matches = re.findall(pattern, logs)
|
||||
|
||||
|
||||
for match in matches:
|
||||
usage_values.append(int(match.replace(',', '')))
|
||||
|
||||
usage_values.append(int(match.replace(",", "")))
|
||||
|
||||
return usage_values
|
||||
|
||||
def run_test(self) -> bool:
|
||||
@@ -111,11 +110,11 @@ class TokenAllocationValidationTest(BaseSimulatorTest):
|
||||
# Create additional test files for this test - make them substantial enough to see token differences
|
||||
file1_content = """def fibonacci(n):
|
||||
'''Calculate fibonacci number recursively
|
||||
|
||||
|
||||
This is a classic recursive algorithm that demonstrates
|
||||
the exponential time complexity of naive recursion.
|
||||
For large values of n, this becomes very slow.
|
||||
|
||||
|
||||
Time complexity: O(2^n)
|
||||
Space complexity: O(n) due to call stack
|
||||
'''
|
||||
@@ -125,10 +124,10 @@ class TokenAllocationValidationTest(BaseSimulatorTest):
|
||||
|
||||
def factorial(n):
|
||||
'''Calculate factorial using recursion
|
||||
|
||||
|
||||
More efficient than fibonacci as each value
|
||||
is calculated only once.
|
||||
|
||||
|
||||
Time complexity: O(n)
|
||||
Space complexity: O(n) due to call stack
|
||||
'''
|
||||
@@ -157,14 +156,14 @@ if __name__ == "__main__":
|
||||
for i in range(10):
|
||||
print(f" F({i}) = {fibonacci(i)}")
|
||||
"""
|
||||
|
||||
|
||||
file2_content = """class Calculator:
|
||||
'''Advanced calculator class with error handling and logging'''
|
||||
|
||||
|
||||
def __init__(self):
|
||||
self.history = []
|
||||
self.last_result = 0
|
||||
|
||||
|
||||
def add(self, a, b):
|
||||
'''Addition with history tracking'''
|
||||
result = a + b
|
||||
@@ -172,7 +171,7 @@ if __name__ == "__main__":
|
||||
self.history.append(operation)
|
||||
self.last_result = result
|
||||
return result
|
||||
|
||||
|
||||
def multiply(self, a, b):
|
||||
'''Multiplication with history tracking'''
|
||||
result = a * b
|
||||
@@ -180,20 +179,20 @@ if __name__ == "__main__":
|
||||
self.history.append(operation)
|
||||
self.last_result = result
|
||||
return result
|
||||
|
||||
|
||||
def divide(self, a, b):
|
||||
'''Division with error handling and history tracking'''
|
||||
if b == 0:
|
||||
error_msg = f"Division by zero error: {a} / {b}"
|
||||
self.history.append(error_msg)
|
||||
raise ValueError("Cannot divide by zero")
|
||||
|
||||
|
||||
result = a / b
|
||||
operation = f"{a} / {b} = {result}"
|
||||
self.history.append(operation)
|
||||
self.last_result = result
|
||||
return result
|
||||
|
||||
|
||||
def power(self, base, exponent):
|
||||
'''Exponentiation with history tracking'''
|
||||
result = base ** exponent
|
||||
@@ -201,11 +200,11 @@ if __name__ == "__main__":
|
||||
self.history.append(operation)
|
||||
self.last_result = result
|
||||
return result
|
||||
|
||||
|
||||
def get_history(self):
|
||||
'''Return calculation history'''
|
||||
return self.history.copy()
|
||||
|
||||
|
||||
def clear_history(self):
|
||||
'''Clear calculation history'''
|
||||
self.history.clear()
|
||||
@@ -215,32 +214,32 @@ if __name__ == "__main__":
|
||||
if __name__ == "__main__":
|
||||
calc = Calculator()
|
||||
print("=== Calculator Demo ===")
|
||||
|
||||
|
||||
# Perform various calculations
|
||||
print(f"Addition: {calc.add(10, 20)}")
|
||||
print(f"Multiplication: {calc.multiply(5, 8)}")
|
||||
print(f"Division: {calc.divide(100, 4)}")
|
||||
print(f"Power: {calc.power(2, 8)}")
|
||||
|
||||
|
||||
print("\\nCalculation History:")
|
||||
for operation in calc.get_history():
|
||||
print(f" {operation}")
|
||||
|
||||
|
||||
print(f"\\nLast result: {calc.last_result}")
|
||||
"""
|
||||
|
||||
# Create test files
|
||||
file1_path = self.create_additional_test_file("math_functions.py", file1_content)
|
||||
file2_path = self.create_additional_test_file("calculator.py", file2_content)
|
||||
|
||||
|
||||
# Track continuation IDs to validate each step generates new ones
|
||||
continuation_ids = []
|
||||
|
||||
# Step 1: Initial chat with first file
|
||||
self.logger.info(" Step 1: Initial chat with file1 - checking token allocation")
|
||||
|
||||
step1_start_time = datetime.datetime.now()
|
||||
|
||||
|
||||
datetime.datetime.now()
|
||||
|
||||
response1, continuation_id1 = self.call_mcp_tool(
|
||||
"chat",
|
||||
{
|
||||
@@ -260,31 +259,33 @@ if __name__ == "__main__":
|
||||
|
||||
# Get logs and analyze file processing (Step 1 is new conversation, no conversation debug logs expected)
|
||||
logs_step1 = self.get_recent_server_logs()
|
||||
|
||||
|
||||
# For Step 1, check for file embedding logs instead of conversation usage
|
||||
file_embedding_logs_step1 = [
|
||||
line for line in logs_step1.split('\n')
|
||||
if 'successfully embedded' in line and 'files' in line and 'tokens' in line
|
||||
line
|
||||
for line in logs_step1.split("\n")
|
||||
if "successfully embedded" in line and "files" in line and "tokens" in line
|
||||
]
|
||||
|
||||
|
||||
if not file_embedding_logs_step1:
|
||||
self.logger.error(" ❌ Step 1: No file embedding logs found")
|
||||
return False
|
||||
|
||||
|
||||
# Extract file token count from embedding logs
|
||||
step1_file_tokens = 0
|
||||
for log in file_embedding_logs_step1:
|
||||
# Look for pattern like "successfully embedded 1 files (146 tokens)"
|
||||
import re
|
||||
match = re.search(r'\((\d+) tokens\)', log)
|
||||
|
||||
match = re.search(r"\((\d+) tokens\)", log)
|
||||
if match:
|
||||
step1_file_tokens = int(match.group(1))
|
||||
break
|
||||
|
||||
|
||||
self.logger.info(f" 📊 Step 1 File Processing - Embedded files: {step1_file_tokens:,} tokens")
|
||||
|
||||
|
||||
# Validate that file1 is actually mentioned in the embedding logs (check for actual filename)
|
||||
file1_mentioned = any('math_functions.py' in log for log in file_embedding_logs_step1)
|
||||
file1_mentioned = any("math_functions.py" in log for log in file_embedding_logs_step1)
|
||||
if not file1_mentioned:
|
||||
# Debug: show what files were actually found in the logs
|
||||
self.logger.debug(" 📋 Files found in embedding logs:")
|
||||
@@ -300,8 +301,10 @@ if __name__ == "__main__":
|
||||
# Continue test - the important thing is that files were processed
|
||||
|
||||
# Step 2: Different tool continuing same conversation - should build conversation history
|
||||
self.logger.info(" Step 2: Analyze tool continuing chat conversation - checking conversation history buildup")
|
||||
|
||||
self.logger.info(
|
||||
" Step 2: Analyze tool continuing chat conversation - checking conversation history buildup"
|
||||
)
|
||||
|
||||
response2, continuation_id2 = self.call_mcp_tool(
|
||||
"analyze",
|
||||
{
|
||||
@@ -314,12 +317,12 @@ if __name__ == "__main__":
|
||||
)
|
||||
|
||||
if not response2 or not continuation_id2:
|
||||
self.logger.error(" ❌ Step 2 failed - no response or continuation ID")
|
||||
self.logger.error(" ❌ Step 2 failed - no response or continuation ID")
|
||||
return False
|
||||
|
||||
self.logger.info(f" ✅ Step 2 completed with continuation_id: {continuation_id2[:8]}...")
|
||||
continuation_ids.append(continuation_id2)
|
||||
|
||||
|
||||
# Validate that we got a different continuation ID
|
||||
if continuation_id2 == continuation_id1:
|
||||
self.logger.error(" ❌ Step 2: Got same continuation ID as Step 1 - continuation not working")
|
||||
@@ -328,33 +331,37 @@ if __name__ == "__main__":
|
||||
# Get logs and analyze token usage
|
||||
logs_step2 = self.get_recent_server_logs()
|
||||
usage_step2 = self.extract_conversation_usage_logs(logs_step2)
|
||||
|
||||
|
||||
if len(usage_step2) < 2:
|
||||
self.logger.warning(f" ⚠️ Step 2: Only found {len(usage_step2)} conversation usage logs, expected at least 2")
|
||||
# Debug: Look for any CONVERSATION_DEBUG logs
|
||||
conversation_debug_lines = [line for line in logs_step2.split('\n') if 'CONVERSATION_DEBUG' in line]
|
||||
self.logger.warning(
|
||||
f" ⚠️ Step 2: Only found {len(usage_step2)} conversation usage logs, expected at least 2"
|
||||
)
|
||||
# Debug: Look for any CONVERSATION_DEBUG logs
|
||||
conversation_debug_lines = [line for line in logs_step2.split("\n") if "CONVERSATION_DEBUG" in line]
|
||||
self.logger.debug(f" 📋 Found {len(conversation_debug_lines)} CONVERSATION_DEBUG lines in step 2")
|
||||
|
||||
|
||||
if conversation_debug_lines:
|
||||
self.logger.debug(" 📋 Recent CONVERSATION_DEBUG lines:")
|
||||
for line in conversation_debug_lines[-10:]: # Show last 10
|
||||
self.logger.debug(f" {line}")
|
||||
|
||||
|
||||
# If we have at least 1 usage log, continue with adjusted expectations
|
||||
if len(usage_step2) >= 1:
|
||||
self.logger.info(" 📋 Continuing with single usage log for analysis")
|
||||
else:
|
||||
self.logger.error(" ❌ No conversation usage logs found at all")
|
||||
return False
|
||||
|
||||
|
||||
latest_usage_step2 = usage_step2[-1] # Get most recent usage
|
||||
self.logger.info(f" 📊 Step 2 Token Usage - Total Capacity: {latest_usage_step2.get('total_capacity', 0):,}, "
|
||||
f"Conversation: {latest_usage_step2.get('conversation_tokens', 0):,}, "
|
||||
f"Remaining: {latest_usage_step2.get('remaining_tokens', 0):,}")
|
||||
self.logger.info(
|
||||
f" 📊 Step 2 Token Usage - Total Capacity: {latest_usage_step2.get('total_capacity', 0):,}, "
|
||||
f"Conversation: {latest_usage_step2.get('conversation_tokens', 0):,}, "
|
||||
f"Remaining: {latest_usage_step2.get('remaining_tokens', 0):,}"
|
||||
)
|
||||
|
||||
# Step 3: Continue conversation with additional file - should show increased token usage
|
||||
self.logger.info(" Step 3: Continue conversation with file1 + file2 - checking token growth")
|
||||
|
||||
|
||||
response3, continuation_id3 = self.call_mcp_tool(
|
||||
"chat",
|
||||
{
|
||||
@@ -376,26 +383,30 @@ if __name__ == "__main__":
|
||||
# Get logs and analyze final token usage
|
||||
logs_step3 = self.get_recent_server_logs()
|
||||
usage_step3 = self.extract_conversation_usage_logs(logs_step3)
|
||||
|
||||
|
||||
self.logger.info(f" 📋 Found {len(usage_step3)} total conversation usage logs")
|
||||
|
||||
|
||||
if len(usage_step3) < 3:
|
||||
self.logger.warning(f" ⚠️ Step 3: Only found {len(usage_step3)} conversation usage logs, expected at least 3")
|
||||
self.logger.warning(
|
||||
f" ⚠️ Step 3: Only found {len(usage_step3)} conversation usage logs, expected at least 3"
|
||||
)
|
||||
# Let's check if we have at least some logs to work with
|
||||
if len(usage_step3) == 0:
|
||||
self.logger.error(" ❌ No conversation usage logs found at all")
|
||||
# Debug: show some recent logs
|
||||
recent_lines = logs_step3.split('\n')[-50:]
|
||||
recent_lines = logs_step3.split("\n")[-50:]
|
||||
self.logger.debug(" 📋 Recent log lines:")
|
||||
for line in recent_lines:
|
||||
if line.strip() and "CONVERSATION_DEBUG" in line:
|
||||
self.logger.debug(f" {line}")
|
||||
return False
|
||||
|
||||
|
||||
latest_usage_step3 = usage_step3[-1] # Get most recent usage
|
||||
self.logger.info(f" 📊 Step 3 Token Usage - Total Capacity: {latest_usage_step3.get('total_capacity', 0):,}, "
|
||||
f"Conversation: {latest_usage_step3.get('conversation_tokens', 0):,}, "
|
||||
f"Remaining: {latest_usage_step3.get('remaining_tokens', 0):,}")
|
||||
self.logger.info(
|
||||
f" 📊 Step 3 Token Usage - Total Capacity: {latest_usage_step3.get('total_capacity', 0):,}, "
|
||||
f"Conversation: {latest_usage_step3.get('conversation_tokens', 0):,}, "
|
||||
f"Remaining: {latest_usage_step3.get('remaining_tokens', 0):,}"
|
||||
)
|
||||
|
||||
# Validation: Check token processing and conversation history
|
||||
self.logger.info(" 📋 Validating token processing and conversation history...")
|
||||
@@ -405,14 +416,14 @@ if __name__ == "__main__":
|
||||
step2_remaining = 0
|
||||
step3_conversation = 0
|
||||
step3_remaining = 0
|
||||
|
||||
|
||||
if len(usage_step2) > 0:
|
||||
step2_conversation = latest_usage_step2.get('conversation_tokens', 0)
|
||||
step2_remaining = latest_usage_step2.get('remaining_tokens', 0)
|
||||
|
||||
step2_conversation = latest_usage_step2.get("conversation_tokens", 0)
|
||||
step2_remaining = latest_usage_step2.get("remaining_tokens", 0)
|
||||
|
||||
if len(usage_step3) >= len(usage_step2) + 1: # Should have one more log than step2
|
||||
step3_conversation = latest_usage_step3.get('conversation_tokens', 0)
|
||||
step3_remaining = latest_usage_step3.get('remaining_tokens', 0)
|
||||
step3_conversation = latest_usage_step3.get("conversation_tokens", 0)
|
||||
step3_remaining = latest_usage_step3.get("remaining_tokens", 0)
|
||||
else:
|
||||
# Use step2 values as fallback
|
||||
step3_conversation = step2_conversation
|
||||
@@ -421,62 +432,78 @@ if __name__ == "__main__":
|
||||
|
||||
# Validation criteria
|
||||
criteria = []
|
||||
|
||||
|
||||
# 1. Step 1 should have processed files successfully
|
||||
step1_processed_files = step1_file_tokens > 0
|
||||
criteria.append(("Step 1 processed files successfully", step1_processed_files))
|
||||
|
||||
|
||||
# 2. Step 2 should have conversation history (if continuation worked)
|
||||
step2_has_conversation = step2_conversation > 0 if len(usage_step2) > 0 else True # Pass if no logs (might be different issue)
|
||||
step2_has_conversation = (
|
||||
step2_conversation > 0 if len(usage_step2) > 0 else True
|
||||
) # Pass if no logs (might be different issue)
|
||||
step2_has_remaining = step2_remaining > 0 if len(usage_step2) > 0 else True
|
||||
criteria.append(("Step 2 has conversation history", step2_has_conversation))
|
||||
criteria.append(("Step 2 has remaining tokens", step2_has_remaining))
|
||||
|
||||
|
||||
# 3. Step 3 should show conversation growth
|
||||
step3_has_conversation = step3_conversation >= step2_conversation if len(usage_step3) > len(usage_step2) else True
|
||||
step3_has_conversation = (
|
||||
step3_conversation >= step2_conversation if len(usage_step3) > len(usage_step2) else True
|
||||
)
|
||||
criteria.append(("Step 3 maintains conversation history", step3_has_conversation))
|
||||
|
||||
|
||||
# 4. Check that we got some conversation usage logs for continuation calls
|
||||
has_conversation_logs = len(usage_step3) > 0
|
||||
criteria.append(("Found conversation usage logs", has_conversation_logs))
|
||||
|
||||
|
||||
# 5. Validate unique continuation IDs per response
|
||||
unique_continuation_ids = len(set(continuation_ids)) == len(continuation_ids)
|
||||
criteria.append(("Each response generated unique continuation ID", unique_continuation_ids))
|
||||
|
||||
|
||||
# 6. Validate continuation IDs were different from each step
|
||||
step_ids_different = len(continuation_ids) == 3 and continuation_ids[0] != continuation_ids[1] and continuation_ids[1] != continuation_ids[2]
|
||||
step_ids_different = (
|
||||
len(continuation_ids) == 3
|
||||
and continuation_ids[0] != continuation_ids[1]
|
||||
and continuation_ids[1] != continuation_ids[2]
|
||||
)
|
||||
criteria.append(("All continuation IDs are different", step_ids_different))
|
||||
|
||||
# Log detailed analysis
|
||||
self.logger.info(f" 📊 Token Processing Analysis:")
|
||||
self.logger.info(" 📊 Token Processing Analysis:")
|
||||
self.logger.info(f" Step 1 - File tokens: {step1_file_tokens:,} (new conversation)")
|
||||
self.logger.info(f" Step 2 - Conversation: {step2_conversation:,}, Remaining: {step2_remaining:,}")
|
||||
self.logger.info(f" Step 3 - Conversation: {step3_conversation:,}, Remaining: {step3_remaining:,}")
|
||||
|
||||
|
||||
# Log continuation ID analysis
|
||||
self.logger.info(f" 📊 Continuation ID Analysis:")
|
||||
self.logger.info(" 📊 Continuation ID Analysis:")
|
||||
self.logger.info(f" Step 1 ID: {continuation_ids[0][:8]}... (generated)")
|
||||
self.logger.info(f" Step 2 ID: {continuation_ids[1][:8]}... (generated from Step 1)")
|
||||
self.logger.info(f" Step 3 ID: {continuation_ids[2][:8]}... (generated from Step 2)")
|
||||
|
||||
|
||||
# Check for file mentions in step 3 (should include both files)
|
||||
# Look for file processing in conversation memory logs and tool embedding logs
|
||||
file2_mentioned_step3 = any('calculator.py' in log for log in logs_step3.split('\n') if ('embedded' in log.lower() and ('conversation' in log.lower() or 'tool' in log.lower())))
|
||||
file1_still_mentioned_step3 = any('math_functions.py' in log for log in logs_step3.split('\n') if ('embedded' in log.lower() and ('conversation' in log.lower() or 'tool' in log.lower())))
|
||||
|
||||
self.logger.info(f" 📊 File Processing in Step 3:")
|
||||
file2_mentioned_step3 = any(
|
||||
"calculator.py" in log
|
||||
for log in logs_step3.split("\n")
|
||||
if ("embedded" in log.lower() and ("conversation" in log.lower() or "tool" in log.lower()))
|
||||
)
|
||||
file1_still_mentioned_step3 = any(
|
||||
"math_functions.py" in log
|
||||
for log in logs_step3.split("\n")
|
||||
if ("embedded" in log.lower() and ("conversation" in log.lower() or "tool" in log.lower()))
|
||||
)
|
||||
|
||||
self.logger.info(" 📊 File Processing in Step 3:")
|
||||
self.logger.info(f" File1 (math_functions.py) mentioned: {file1_still_mentioned_step3}")
|
||||
self.logger.info(f" File2 (calculator.py) mentioned: {file2_mentioned_step3}")
|
||||
|
||||
# Add file increase validation
|
||||
|
||||
# Add file increase validation
|
||||
step3_file_increase = file2_mentioned_step3 # New file should be visible
|
||||
criteria.append(("Step 3 shows new file being processed", step3_file_increase))
|
||||
|
||||
# Check validation criteria
|
||||
passed_criteria = sum(1 for _, passed in criteria if passed)
|
||||
total_criteria = len(criteria)
|
||||
|
||||
|
||||
self.logger.info(f" 📊 Validation criteria: {passed_criteria}/{total_criteria}")
|
||||
for criterion, passed in criteria:
|
||||
status = "✅" if passed else "❌"
|
||||
@@ -484,15 +511,11 @@ if __name__ == "__main__":
|
||||
|
||||
# Check for file embedding logs
|
||||
file_embedding_logs = [
|
||||
line for line in logs_step3.split('\n')
|
||||
if 'tool embedding' in line and 'files' in line
|
||||
]
|
||||
|
||||
conversation_logs = [
|
||||
line for line in logs_step3.split('\n')
|
||||
if 'conversation history' in line.lower()
|
||||
line for line in logs_step3.split("\n") if "tool embedding" in line and "files" in line
|
||||
]
|
||||
|
||||
conversation_logs = [line for line in logs_step3.split("\n") if "conversation history" in line.lower()]
|
||||
|
||||
self.logger.info(f" 📊 File embedding logs: {len(file_embedding_logs)}")
|
||||
self.logger.info(f" 📊 Conversation history logs: {len(conversation_logs)}")
|
||||
|
||||
@@ -516,13 +539,13 @@ if __name__ == "__main__":
|
||||
def main():
    """Command-line entry point for the token allocation validation test.

    Verbose mode is switched on when ``--verbose`` or ``-v`` is present in
    ``sys.argv``. The process exits with status 0 when ``run_test()`` reports
    success and with status 1 otherwise.
    """
    import sys

    wants_verbose = any(flag in sys.argv for flag in ("--verbose", "-v"))
    runner = TokenAllocationValidationTest(verbose=wants_verbose)

    # Translate the test outcome into a conventional process exit status.
    if runner.run_test():
        exit_code = 0
    else:
        exit_code = 1
    sys.exit(exit_code)
|
||||
|
||||
|
||||
# Script entry point: run the validation test only when this file is executed
# directly. main() terminates the process via sys.exit, so a single call is
# all that can ever run.
if __name__ == "__main__":
    main()
|
||||
|
||||
Reference in New Issue
Block a user