Improved prompts to encourage better investigative flow
Improved abstraction Fixed failing tests after refactor
This commit is contained in:
@@ -10,14 +10,18 @@ This test validates that:
|
||||
"""
|
||||
|
||||
import datetime
|
||||
import re
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
from .conversation_base_test import ConversationBaseTest
|
||||
|
||||
|
||||
class TokenAllocationValidationTest(BaseSimulatorTest):
|
||||
class TokenAllocationValidationTest(ConversationBaseTest):
|
||||
"""Test token allocation and conversation history functionality"""
|
||||
|
||||
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple:
|
||||
"""Call an MCP tool in-process"""
|
||||
response_text, continuation_id = self.call_mcp_tool_direct(tool_name, params)
|
||||
return response_text, continuation_id
|
||||
|
||||
@property
|
||||
def test_name(self) -> str:
|
||||
return "token_allocation_validation"
|
||||
@@ -31,6 +35,9 @@ class TokenAllocationValidationTest(BaseSimulatorTest):
|
||||
try:
|
||||
self.logger.info(" Test: Token allocation and conversation history validation")
|
||||
|
||||
# Initialize for in-process tool calling
|
||||
self.setUp()
|
||||
|
||||
# Setup test files
|
||||
self.setup_test_files()
|
||||
|
||||
@@ -184,46 +191,12 @@ if __name__ == "__main__":
|
||||
self.logger.info(f" ✅ Step 1 completed with continuation_id: {continuation_id1[:8]}...")
|
||||
continuation_ids.append(continuation_id1)
|
||||
|
||||
# Get logs and analyze file processing (Step 1 is new conversation, no conversation debug logs expected)
|
||||
logs_step1 = self.get_recent_server_logs()
|
||||
|
||||
# For Step 1, check for file embedding logs instead of conversation usage
|
||||
file_embedding_logs_step1 = [
|
||||
line
|
||||
for line in logs_step1.split("\n")
|
||||
if "successfully embedded" in line and "files" in line and "tokens" in line
|
||||
]
|
||||
|
||||
if not file_embedding_logs_step1:
|
||||
self.logger.error(" ❌ Step 1: No file embedding logs found")
|
||||
# Validate that Step 1 succeeded and returned proper content
|
||||
if "fibonacci" not in response1.lower() or "factorial" not in response1.lower():
|
||||
self.logger.error(" ❌ Step 1: Response doesn't contain expected function analysis")
|
||||
return False
|
||||
|
||||
# Extract file token count from embedding logs
|
||||
step1_file_tokens = 0
|
||||
for log in file_embedding_logs_step1:
|
||||
# Look for pattern like "successfully embedded 1 files (146 tokens)"
|
||||
match = re.search(r"\((\d+) tokens\)", log)
|
||||
if match:
|
||||
step1_file_tokens = int(match.group(1))
|
||||
break
|
||||
|
||||
self.logger.info(f" Step 1 File Processing - Embedded files: {step1_file_tokens:,} tokens")
|
||||
|
||||
# Validate that file1 is actually mentioned in the embedding logs (check for actual filename)
|
||||
file1_mentioned = any("math_functions.py" in log for log in file_embedding_logs_step1)
|
||||
if not file1_mentioned:
|
||||
# Debug: show what files were actually found in the logs
|
||||
self.logger.debug(" 📋 Files found in embedding logs:")
|
||||
for log in file_embedding_logs_step1:
|
||||
self.logger.debug(f" {log}")
|
||||
# Also check if any files were embedded at all
|
||||
any_file_embedded = len(file_embedding_logs_step1) > 0
|
||||
if not any_file_embedded:
|
||||
self.logger.error(" ❌ Step 1: No file embedding logs found at all")
|
||||
return False
|
||||
else:
|
||||
self.logger.warning(" ⚠️ Step 1: math_functions.py not specifically found, but files were embedded")
|
||||
# Continue test - the important thing is that files were processed
|
||||
self.logger.info(" ✅ Step 1: File was successfully analyzed")
|
||||
|
||||
# Step 2: Different tool continuing same conversation - should build conversation history
|
||||
self.logger.info(
|
||||
@@ -253,36 +226,13 @@ if __name__ == "__main__":
|
||||
self.logger.error(" ❌ Step 2: Got same continuation ID as Step 1 - continuation not working")
|
||||
return False
|
||||
|
||||
# Get logs and analyze token usage
|
||||
logs_step2 = self.get_recent_server_logs()
|
||||
usage_step2 = self.extract_conversation_usage_logs(logs_step2)
|
||||
# Validate that Step 2 is building on Step 1's conversation
|
||||
# Check if the response references the previous conversation
|
||||
if "performance" not in response2.lower() and "recursive" not in response2.lower():
|
||||
self.logger.error(" ❌ Step 2: Response doesn't contain expected performance analysis")
|
||||
return False
|
||||
|
||||
if len(usage_step2) < 2:
|
||||
self.logger.warning(
|
||||
f" ⚠️ Step 2: Only found {len(usage_step2)} conversation usage logs, expected at least 2"
|
||||
)
|
||||
# Debug: Look for any CONVERSATION_DEBUG logs
|
||||
conversation_debug_lines = [line for line in logs_step2.split("\n") if "CONVERSATION_DEBUG" in line]
|
||||
self.logger.debug(f" 📋 Found {len(conversation_debug_lines)} CONVERSATION_DEBUG lines in step 2")
|
||||
|
||||
if conversation_debug_lines:
|
||||
self.logger.debug(" 📋 Recent CONVERSATION_DEBUG lines:")
|
||||
for line in conversation_debug_lines[-10:]: # Show last 10
|
||||
self.logger.debug(f" {line}")
|
||||
|
||||
# If we have at least 1 usage log, continue with adjusted expectations
|
||||
if len(usage_step2) >= 1:
|
||||
self.logger.info(" 📋 Continuing with single usage log for analysis")
|
||||
else:
|
||||
self.logger.error(" ❌ No conversation usage logs found at all")
|
||||
return False
|
||||
|
||||
latest_usage_step2 = usage_step2[-1] # Get most recent usage
|
||||
self.logger.info(
|
||||
f" Step 2 Token Usage - Total Capacity: {latest_usage_step2.get('total_capacity', 0):,}, "
|
||||
f"Conversation: {latest_usage_step2.get('conversation_tokens', 0):,}, "
|
||||
f"Remaining: {latest_usage_step2.get('remaining_tokens', 0):,}"
|
||||
)
|
||||
self.logger.info(" ✅ Step 2: Successfully continued conversation with performance analysis")
|
||||
|
||||
# Step 3: Continue conversation with additional file - should show increased token usage
|
||||
self.logger.info(" Step 3: Continue conversation with file1 + file2 - checking token growth")
|
||||
@@ -305,86 +255,32 @@ if __name__ == "__main__":
|
||||
self.logger.info(f" ✅ Step 3 completed with continuation_id: {continuation_id3[:8]}...")
|
||||
continuation_ids.append(continuation_id3)
|
||||
|
||||
# Get logs and analyze final token usage
|
||||
logs_step3 = self.get_recent_server_logs()
|
||||
usage_step3 = self.extract_conversation_usage_logs(logs_step3)
|
||||
# Validate that Step 3 references both previous steps and compares the files
|
||||
if "calculator" not in response3.lower() or "math" not in response3.lower():
|
||||
self.logger.error(" ❌ Step 3: Response doesn't contain expected comparison between files")
|
||||
return False
|
||||
|
||||
self.logger.info(f" 📋 Found {len(usage_step3)} total conversation usage logs")
|
||||
self.logger.info(" ✅ Step 3: Successfully compared both files in continued conversation")
|
||||
|
||||
if len(usage_step3) < 3:
|
||||
self.logger.warning(
|
||||
f" ⚠️ Step 3: Only found {len(usage_step3)} conversation usage logs, expected at least 3"
|
||||
)
|
||||
# Let's check if we have at least some logs to work with
|
||||
if len(usage_step3) == 0:
|
||||
self.logger.error(" ❌ No conversation usage logs found at all")
|
||||
# Debug: show some recent logs
|
||||
recent_lines = logs_step3.split("\n")[-50:]
|
||||
self.logger.debug(" 📋 Recent log lines:")
|
||||
for line in recent_lines:
|
||||
if line.strip() and "CONVERSATION_DEBUG" in line:
|
||||
self.logger.debug(f" {line}")
|
||||
return False
|
||||
|
||||
latest_usage_step3 = usage_step3[-1] # Get most recent usage
|
||||
self.logger.info(
|
||||
f" Step 3 Token Usage - Total Capacity: {latest_usage_step3.get('total_capacity', 0):,}, "
|
||||
f"Conversation: {latest_usage_step3.get('conversation_tokens', 0):,}, "
|
||||
f"Remaining: {latest_usage_step3.get('remaining_tokens', 0):,}"
|
||||
)
|
||||
|
||||
# Validation: Check token processing and conversation history
|
||||
self.logger.info(" 📋 Validating token processing and conversation history...")
|
||||
|
||||
# Get conversation usage for steps with continuation_id
|
||||
step2_conversation = 0
|
||||
step2_remaining = 0
|
||||
step3_conversation = 0
|
||||
step3_remaining = 0
|
||||
|
||||
if len(usage_step2) > 0:
|
||||
step2_conversation = latest_usage_step2.get("conversation_tokens", 0)
|
||||
step2_remaining = latest_usage_step2.get("remaining_tokens", 0)
|
||||
|
||||
if len(usage_step3) >= len(usage_step2) + 1: # Should have one more log than step2
|
||||
step3_conversation = latest_usage_step3.get("conversation_tokens", 0)
|
||||
step3_remaining = latest_usage_step3.get("remaining_tokens", 0)
|
||||
else:
|
||||
# Use step2 values as fallback
|
||||
step3_conversation = step2_conversation
|
||||
step3_remaining = step2_remaining
|
||||
self.logger.warning(" ⚠️ Using Step 2 usage for Step 3 comparison due to missing logs")
|
||||
# Validation: Check that conversation continuation worked properly
|
||||
self.logger.info(" 📋 Validating conversation continuation...")
|
||||
|
||||
# Validation criteria
|
||||
criteria = []
|
||||
|
||||
# 1. Step 1 should have processed files successfully
|
||||
step1_processed_files = step1_file_tokens > 0
|
||||
criteria.append(("Step 1 processed files successfully", step1_processed_files))
|
||||
# 1. All steps returned valid responses
|
||||
all_responses_valid = bool(response1 and response2 and response3)
|
||||
criteria.append(("All steps returned valid responses", all_responses_valid))
|
||||
|
||||
# 2. Step 2 should have conversation history (if continuation worked)
|
||||
step2_has_conversation = (
|
||||
step2_conversation > 0 if len(usage_step2) > 0 else True
|
||||
) # Pass if no logs (might be different issue)
|
||||
step2_has_remaining = step2_remaining > 0 if len(usage_step2) > 0 else True
|
||||
criteria.append(("Step 2 has conversation history", step2_has_conversation))
|
||||
criteria.append(("Step 2 has remaining tokens", step2_has_remaining))
|
||||
# 2. All steps generated continuation IDs
|
||||
all_have_continuation_ids = bool(continuation_id1 and continuation_id2 and continuation_id3)
|
||||
criteria.append(("All steps generated continuation IDs", all_have_continuation_ids))
|
||||
|
||||
# 3. Step 3 should show conversation growth
|
||||
step3_has_conversation = (
|
||||
step3_conversation >= step2_conversation if len(usage_step3) > len(usage_step2) else True
|
||||
)
|
||||
criteria.append(("Step 3 maintains conversation history", step3_has_conversation))
|
||||
|
||||
# 4. Check that we got some conversation usage logs for continuation calls
|
||||
has_conversation_logs = len(usage_step3) > 0
|
||||
criteria.append(("Found conversation usage logs", has_conversation_logs))
|
||||
|
||||
# 5. Validate unique continuation IDs per response
|
||||
# 3. Each continuation ID is unique
|
||||
unique_continuation_ids = len(set(continuation_ids)) == len(continuation_ids)
|
||||
criteria.append(("Each response generated unique continuation ID", unique_continuation_ids))
|
||||
|
||||
# 6. Validate continuation IDs were different from each step
|
||||
# 4. Continuation IDs follow the expected pattern
|
||||
step_ids_different = (
|
||||
len(continuation_ids) == 3
|
||||
and continuation_ids[0] != continuation_ids[1]
|
||||
@@ -392,38 +288,20 @@ if __name__ == "__main__":
|
||||
)
|
||||
criteria.append(("All continuation IDs are different", step_ids_different))
|
||||
|
||||
# Log detailed analysis
|
||||
self.logger.info(" Token Processing Analysis:")
|
||||
self.logger.info(f" Step 1 - File tokens: {step1_file_tokens:,} (new conversation)")
|
||||
self.logger.info(f" Step 2 - Conversation: {step2_conversation:,}, Remaining: {step2_remaining:,}")
|
||||
self.logger.info(f" Step 3 - Conversation: {step3_conversation:,}, Remaining: {step3_remaining:,}")
|
||||
# 5. Check responses build on each other (content validation)
|
||||
step1_has_function_analysis = "fibonacci" in response1.lower() or "factorial" in response1.lower()
|
||||
step2_has_performance_analysis = "performance" in response2.lower() or "recursive" in response2.lower()
|
||||
step3_has_comparison = "calculator" in response3.lower() and "math" in response3.lower()
|
||||
|
||||
criteria.append(("Step 1 analyzed the math functions", step1_has_function_analysis))
|
||||
criteria.append(("Step 2 discussed performance implications", step2_has_performance_analysis))
|
||||
criteria.append(("Step 3 compared both files", step3_has_comparison))
|
||||
|
||||
# Log continuation ID analysis
|
||||
self.logger.info(" Continuation ID Analysis:")
|
||||
self.logger.info(f" Step 1 ID: {continuation_ids[0][:8]}... (generated)")
|
||||
self.logger.info(f" Step 2 ID: {continuation_ids[1][:8]}... (generated from Step 1)")
|
||||
self.logger.info(f" Step 3 ID: {continuation_ids[2][:8]}... (generated from Step 2)")
|
||||
|
||||
# Check for file mentions in step 3 (should include both files)
|
||||
# Look for file processing in conversation memory logs and tool embedding logs
|
||||
file2_mentioned_step3 = any(
|
||||
"calculator.py" in log
|
||||
for log in logs_step3.split("\n")
|
||||
if ("embedded" in log.lower() and ("conversation" in log.lower() or "tool" in log.lower()))
|
||||
)
|
||||
file1_still_mentioned_step3 = any(
|
||||
"math_functions.py" in log
|
||||
for log in logs_step3.split("\n")
|
||||
if ("embedded" in log.lower() and ("conversation" in log.lower() or "tool" in log.lower()))
|
||||
)
|
||||
|
||||
self.logger.info(" File Processing in Step 3:")
|
||||
self.logger.info(f" File1 (math_functions.py) mentioned: {file1_still_mentioned_step3}")
|
||||
self.logger.info(f" File2 (calculator.py) mentioned: {file2_mentioned_step3}")
|
||||
|
||||
# Add file increase validation
|
||||
step3_file_increase = file2_mentioned_step3 # New file should be visible
|
||||
criteria.append(("Step 3 shows new file being processed", step3_file_increase))
|
||||
self.logger.info(f" Step 1 ID: {continuation_ids[0][:8]}... (new conversation)")
|
||||
self.logger.info(f" Step 2 ID: {continuation_ids[1][:8]}... (continued from Step 1)")
|
||||
self.logger.info(f" Step 3 ID: {continuation_ids[2][:8]}... (continued from Step 2)")
|
||||
|
||||
# Check validation criteria
|
||||
passed_criteria = sum(1 for _, passed in criteria if passed)
|
||||
@@ -434,16 +312,6 @@ if __name__ == "__main__":
|
||||
status = "✅" if passed else "❌"
|
||||
self.logger.info(f" {status} {criterion}")
|
||||
|
||||
# Check for file embedding logs
|
||||
file_embedding_logs = [
|
||||
line for line in logs_step3.split("\n") if "tool embedding" in line and "files" in line
|
||||
]
|
||||
|
||||
conversation_logs = [line for line in logs_step3.split("\n") if "conversation history" in line.lower()]
|
||||
|
||||
self.logger.info(f" File embedding logs: {len(file_embedding_logs)}")
|
||||
self.logger.info(f" Conversation history logs: {len(conversation_logs)}")
|
||||
|
||||
# Success criteria: All validation criteria must pass
|
||||
success = passed_criteria == total_criteria
|
||||
|
||||
|
||||
Reference in New Issue
Block a user