Fixed o4-mini name, OpenAI removed o4-mini-high Add max_output_tokens property to ModelCapabilities Fixed tests after refactor
381 lines
17 KiB
Python
381 lines
17 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Conversation Chain and Threading Validation Test
|
|
|
|
This test validates that:
|
|
1. Multiple tool invocations create proper parent->parent->parent chains
|
|
2. New conversations can be started independently
|
|
3. Original conversation chains can be resumed from any point
|
|
4. History traversal works correctly for all scenarios
|
|
5. Thread relationships are properly maintained in Redis
|
|
|
|
Test Flow:
|
|
Chain A: chat -> analyze -> debug (3 linked threads)
|
|
Chain B: chat -> analyze (2 linked threads, independent)
|
|
Chain A Branch: debug (continue from original chat, creating branch)
|
|
|
|
This validates the conversation threading system's ability to:
|
|
- Build linear chains
|
|
- Create independent conversation threads
|
|
- Branch from earlier points in existing chains
|
|
- Properly traverse parent relationships for history reconstruction
|
|
"""
|
|
|
|
|
|
from .conversation_base_test import ConversationBaseTest
|
|
|
|
|
|
class ConversationChainValidationTest(ConversationBaseTest):
|
|
"""Test conversation chain and threading functionality"""
|
|
|
|
@property
|
|
def test_name(self) -> str:
|
|
return "conversation_chain_validation"
|
|
|
|
@property
|
|
def test_description(self) -> str:
|
|
return "Conversation chain and threading validation"
|
|
|
|
def run_test(self) -> bool:
|
|
"""Test conversation chain and threading functionality"""
|
|
# Set up the test environment
|
|
self.setUp()
|
|
|
|
try:
|
|
self.logger.info("Test: Conversation chain and threading validation")
|
|
|
|
# Create test file for consistent context
|
|
test_file_content = """def example_function():
|
|
'''Simple test function for conversation continuity testing'''
|
|
return "Hello from conversation chain test"
|
|
|
|
def buggy_function(x, y):
|
|
'''Function with a bug - incorrect operator'''
|
|
return x - y # BUG: Should be x + y for addition
|
|
|
|
class TestClass:
|
|
def method(self):
|
|
return "Method in test class"
|
|
"""
|
|
test_file_path = self.create_additional_test_file("chain_test.py", test_file_content)
|
|
|
|
# Track all continuation IDs and their relationships
|
|
conversation_chains = {}
|
|
|
|
# === CHAIN A: Build linear conversation chain ===
|
|
self.logger.info(" Chain A: Building linear conversation chain")
|
|
|
|
# Step A1: Start with chat tool (creates thread_id_1)
|
|
self.logger.info(" Step A1: Chat tool - start new conversation")
|
|
|
|
response_a1, continuation_id_a1 = self.call_mcp_tool(
|
|
"chat",
|
|
{
|
|
"prompt": "Analyze this test file and explain what it does.",
|
|
"files": [test_file_path],
|
|
"model": "flash",
|
|
"temperature": 0.7,
|
|
},
|
|
)
|
|
|
|
if not response_a1 or not continuation_id_a1:
|
|
self.logger.error(" ❌ Step A1 failed - no response or continuation ID")
|
|
return False
|
|
|
|
self.logger.info(f" ✅ Step A1 completed - thread_id: {continuation_id_a1[:8]}...")
|
|
conversation_chains["A1"] = continuation_id_a1
|
|
|
|
# Step A2: Continue with analyze tool (creates thread_id_2 with parent=thread_id_1)
|
|
self.logger.info(" Step A2: Analyze tool - continue Chain A")
|
|
|
|
response_a2, continuation_id_a2 = self.call_mcp_tool(
|
|
"analyze",
|
|
{
|
|
"step": "Now analyze the code quality and suggest improvements.",
|
|
"step_number": 1,
|
|
"total_steps": 2,
|
|
"next_step_required": False,
|
|
"findings": "Continuing analysis from previous chat conversation to analyze code quality.",
|
|
"relevant_files": [test_file_path],
|
|
"continuation_id": continuation_id_a1,
|
|
"model": "flash",
|
|
},
|
|
)
|
|
|
|
if not response_a2 or not continuation_id_a2:
|
|
self.logger.error(" ❌ Step A2 failed - no response or continuation ID")
|
|
return False
|
|
|
|
self.logger.info(f" ✅ Step A2 completed - thread_id: {continuation_id_a2[:8]}...")
|
|
conversation_chains["A2"] = continuation_id_a2
|
|
|
|
# Step A3: Continue with chat tool (creates thread_id_3 with parent=thread_id_2)
|
|
self.logger.info(" Step A3: Chat tool - continue Chain A")
|
|
|
|
response_a3, continuation_id_a3 = self.call_mcp_tool(
|
|
"chat",
|
|
{
|
|
"prompt": "Thank you for the analysis. Can you summarize the key points?",
|
|
"continuation_id": continuation_id_a2,
|
|
"model": "flash",
|
|
"temperature": 0.7,
|
|
},
|
|
)
|
|
|
|
if not response_a3 or not continuation_id_a3:
|
|
self.logger.error(" ❌ Step A3 failed - no response or continuation ID")
|
|
return False
|
|
|
|
self.logger.info(f" ✅ Step A3 completed - thread_id: {continuation_id_a3[:8]}...")
|
|
conversation_chains["A3"] = continuation_id_a3
|
|
|
|
# === CHAIN B: Start independent conversation ===
|
|
self.logger.info(" Chain B: Starting independent conversation")
|
|
|
|
# Step B1: Start new chat conversation (creates thread_id_4, no parent)
|
|
self.logger.info(" Step B1: Chat tool - start NEW independent conversation")
|
|
|
|
response_b1, continuation_id_b1 = self.call_mcp_tool(
|
|
"chat",
|
|
{
|
|
"prompt": "This is a completely new conversation. Please greet me.",
|
|
"model": "flash",
|
|
"temperature": 0.7,
|
|
},
|
|
)
|
|
|
|
if not response_b1 or not continuation_id_b1:
|
|
self.logger.error(" ❌ Step B1 failed - no response or continuation ID")
|
|
return False
|
|
|
|
self.logger.info(f" ✅ Step B1 completed - thread_id: {continuation_id_b1[:8]}...")
|
|
conversation_chains["B1"] = continuation_id_b1
|
|
|
|
# Step B2: Continue the new conversation (creates thread_id_5 with parent=thread_id_4)
|
|
self.logger.info(" Step B2: Analyze tool - continue Chain B")
|
|
|
|
response_b2, continuation_id_b2 = self.call_mcp_tool(
|
|
"analyze",
|
|
{
|
|
"step": "Analyze the previous greeting and suggest improvements.",
|
|
"step_number": 1,
|
|
"total_steps": 1,
|
|
"next_step_required": False,
|
|
"findings": "Analyzing the greeting from previous conversation and suggesting improvements.",
|
|
"relevant_files": [test_file_path],
|
|
"continuation_id": continuation_id_b1,
|
|
"model": "flash",
|
|
},
|
|
)
|
|
|
|
if not response_b2 or not continuation_id_b2:
|
|
self.logger.error(" ❌ Step B2 failed - no response or continuation ID")
|
|
return False
|
|
|
|
self.logger.info(f" ✅ Step B2 completed - thread_id: {continuation_id_b2[:8]}...")
|
|
conversation_chains["B2"] = continuation_id_b2
|
|
|
|
# === CHAIN A BRANCH: Go back to original conversation ===
|
|
self.logger.info(" Chain A Branch: Resume original conversation from A1")
|
|
|
|
# Step A1-Branch: Use original continuation_id_a1 to branch (creates thread_id_6 with parent=thread_id_1)
|
|
self.logger.info(" Step A1-Branch: Chat tool - branch from original Chain A")
|
|
|
|
response_a1_branch, continuation_id_a1_branch = self.call_mcp_tool(
|
|
"chat",
|
|
{
|
|
"prompt": "Going back to our original discussion, I have another question about the code structure.",
|
|
"continuation_id": continuation_id_a1, # Go back to original!
|
|
"model": "flash",
|
|
"temperature": 0.7,
|
|
},
|
|
)
|
|
|
|
if not response_a1_branch or not continuation_id_a1_branch:
|
|
self.logger.error(" ❌ Step A1-Branch failed - no response or continuation ID")
|
|
return False
|
|
|
|
self.logger.info(f" ✅ Step A1-Branch completed - thread_id: {continuation_id_a1_branch[:8]}...")
|
|
conversation_chains["A1_Branch"] = continuation_id_a1_branch
|
|
|
|
# === ANALYSIS: Validate thread relationships and history traversal ===
|
|
self.logger.info(" Analyzing conversation chain structure...")
|
|
|
|
# Get logs and extract thread relationships
|
|
logs = self.get_recent_server_logs()
|
|
thread_creation_logs = self.extract_thread_creation_logs(logs)
|
|
history_traversal_logs = self.extract_history_traversal_logs(logs)
|
|
|
|
self.logger.info(f" Found {len(thread_creation_logs)} thread creation logs")
|
|
self.logger.info(f" Found {len(history_traversal_logs)} history traversal logs")
|
|
|
|
# Debug: Show what we found
|
|
if self.verbose:
|
|
self.logger.debug(" Thread creation logs found:")
|
|
for log in thread_creation_logs:
|
|
self.logger.debug(
|
|
f" {log['thread_id'][:8]}... parent: {log['parent_id'][:8] if log['parent_id'] else 'None'}..."
|
|
)
|
|
self.logger.debug(" History traversal logs found:")
|
|
for log in history_traversal_logs:
|
|
self.logger.debug(f" {log['thread_id'][:8]}... chain length: {log['chain_length']}")
|
|
|
|
# Build expected thread relationships
|
|
expected_relationships = []
|
|
|
|
# Note: A1 and B1 won't appear in thread creation logs because they're new conversations (no parent)
|
|
# Only continuation threads (A2, A3, B2, A1-Branch) will appear in creation logs
|
|
|
|
# Find logs for each continuation thread
|
|
a2_log = next((log for log in thread_creation_logs if log["thread_id"] == continuation_id_a2), None)
|
|
a3_log = next((log for log in thread_creation_logs if log["thread_id"] == continuation_id_a3), None)
|
|
b2_log = next((log for log in thread_creation_logs if log["thread_id"] == continuation_id_b2), None)
|
|
a1_branch_log = next(
|
|
(log for log in thread_creation_logs if log["thread_id"] == continuation_id_a1_branch), None
|
|
)
|
|
|
|
# A2 should have A1 as parent
|
|
if a2_log:
|
|
expected_relationships.append(("A2 has A1 as parent", a2_log["parent_id"] == continuation_id_a1))
|
|
|
|
# A3 should have A2 as parent
|
|
if a3_log:
|
|
expected_relationships.append(("A3 has A2 as parent", a3_log["parent_id"] == continuation_id_a2))
|
|
|
|
# B2 should have B1 as parent (independent chain)
|
|
if b2_log:
|
|
expected_relationships.append(("B2 has B1 as parent", b2_log["parent_id"] == continuation_id_b1))
|
|
|
|
# A1-Branch should have A1 as parent (branching)
|
|
if a1_branch_log:
|
|
expected_relationships.append(
|
|
("A1-Branch has A1 as parent", a1_branch_log["parent_id"] == continuation_id_a1)
|
|
)
|
|
|
|
# Validate history traversal
|
|
traversal_validations = []
|
|
|
|
# History traversal logs are only generated when conversation history is built from scratch
|
|
# (not when history is already embedded in the prompt by server.py)
|
|
# So we should expect at least 1 traversal log, but not necessarily for every continuation
|
|
|
|
if len(history_traversal_logs) > 0:
|
|
# Validate that any traversal logs we find have reasonable chain lengths
|
|
for log in history_traversal_logs:
|
|
thread_id = log["thread_id"]
|
|
chain_length = log["chain_length"]
|
|
|
|
# Chain length should be at least 2 for any continuation thread
|
|
# (original thread + continuation thread)
|
|
is_valid_length = chain_length >= 2
|
|
|
|
# Try to identify which thread this is for better validation
|
|
thread_description = f"Thread {thread_id[:8]}"
|
|
if thread_id == continuation_id_a1:
|
|
thread_description = "A1 (original thread)"
|
|
is_valid_length = chain_length == 1
|
|
elif thread_id == continuation_id_a2:
|
|
thread_description = "A2 (2-thread chain)"
|
|
is_valid_length = chain_length == 2
|
|
elif thread_id == continuation_id_a3:
|
|
thread_description = "A3 (3-thread chain)"
|
|
is_valid_length = chain_length == 3
|
|
elif thread_id == continuation_id_b1:
|
|
thread_description = "B1 (original thread)"
|
|
is_valid_length = chain_length == 1
|
|
elif thread_id == continuation_id_b2:
|
|
thread_description = "B2 (2-thread chain)"
|
|
is_valid_length = chain_length == 2
|
|
elif thread_id == continuation_id_a1_branch:
|
|
thread_description = "A1-Branch (2-thread chain)"
|
|
is_valid_length = chain_length == 2
|
|
|
|
traversal_validations.append((f"{thread_description} has valid chain length", is_valid_length))
|
|
|
|
# Also validate we found at least one traversal (shows the system is working)
|
|
traversal_validations.append(
|
|
("At least one history traversal occurred", len(history_traversal_logs) >= 1)
|
|
)
|
|
|
|
# === VALIDATION RESULTS ===
|
|
self.logger.info(" Thread Relationship Validation:")
|
|
relationship_passed = 0
|
|
for desc, passed in expected_relationships:
|
|
status = "✅" if passed else "❌"
|
|
self.logger.info(f" {status} {desc}")
|
|
if passed:
|
|
relationship_passed += 1
|
|
|
|
self.logger.info(" History Traversal Validation:")
|
|
traversal_passed = 0
|
|
for desc, passed in traversal_validations:
|
|
status = "✅" if passed else "❌"
|
|
self.logger.info(f" {status} {desc}")
|
|
if passed:
|
|
traversal_passed += 1
|
|
|
|
# === SUCCESS CRITERIA ===
|
|
total_relationship_checks = len(expected_relationships)
|
|
total_traversal_checks = len(traversal_validations)
|
|
|
|
self.logger.info(" Validation Summary:")
|
|
self.logger.info(f" Thread relationships: {relationship_passed}/{total_relationship_checks}")
|
|
self.logger.info(f" History traversal: {traversal_passed}/{total_traversal_checks}")
|
|
|
|
# Success requires at least 80% of validations to pass
|
|
relationship_success = relationship_passed >= (total_relationship_checks * 0.8)
|
|
|
|
# If no traversal checks were possible, it means no traversal logs were found
|
|
# This could indicate an issue since we expect at least some history building
|
|
if total_traversal_checks == 0:
|
|
self.logger.warning(
|
|
" No history traversal logs found - this may indicate conversation history is always pre-embedded"
|
|
)
|
|
# Still consider it successful since the thread relationships are what matter most
|
|
traversal_success = True
|
|
else:
|
|
# For traversal success, we need at least 50% to pass since chain lengths can vary
|
|
# The important thing is that traversal is happening and relationships are correct
|
|
traversal_success = traversal_passed >= (total_traversal_checks * 0.5)
|
|
|
|
overall_success = relationship_success and traversal_success
|
|
|
|
self.logger.info(" Conversation Chain Structure:")
|
|
self.logger.info(
|
|
f" Chain A: {continuation_id_a1[:8]} → {continuation_id_a2[:8]} → {continuation_id_a3[:8]}"
|
|
)
|
|
self.logger.info(f" Chain B: {continuation_id_b1[:8]} → {continuation_id_b2[:8]}")
|
|
self.logger.info(f" Branch: {continuation_id_a1[:8]} → {continuation_id_a1_branch[:8]}")
|
|
|
|
if overall_success:
|
|
self.logger.info(" ✅ Conversation chain validation test PASSED")
|
|
return True
|
|
else:
|
|
self.logger.error(" ❌ Conversation chain validation test FAILED")
|
|
return False
|
|
|
|
except Exception as e:
|
|
self.logger.error(f"Conversation chain validation test failed: {e}")
|
|
return False
|
|
|
|
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple:
|
|
"""Call an MCP tool in-process"""
|
|
# Use in-process implementation to maintain conversation memory
|
|
response_text, continuation_id = self.call_mcp_tool_direct(tool_name, params)
|
|
return response_text, continuation_id
|
|
|
|
|
|
def main():
|
|
"""Run the conversation chain validation test"""
|
|
import sys
|
|
|
|
verbose = "--verbose" in sys.argv or "-v" in sys.argv
|
|
test = ConversationChainValidationTest(verbose=verbose)
|
|
|
|
success = test.run_test()
|
|
sys.exit(0 if success else 1)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|