🚀 Major Enhancement: Workflow-Based Tool Architecture v5.5.0 (#95)
* WIP: new workflow architecture
* WIP: further improvements and cleanup
* WIP: cleanup and docs, replace old tool with new
* WIP: cleanup and docs, replace old tool with new
* WIP: new planner implementation using workflow
* WIP: precommit tool working as a workflow instead of a basic tool; adds support for passing False to use_assistant_model to skip external models completely and use Claude only
* WIP: precommit workflow version swapped with old
* WIP: codereview
* WIP: replaced codereview
* WIP: replaced codereview
* WIP: replaced refactor
* WIP: workflow for thinkdeep
* WIP: ensure files get embedded correctly
* WIP: thinkdeep replaced with workflow version
* WIP: improved messaging when an external model's response is received
* WIP: analyze tool swapped
* WIP: updated tests
* Extract only the content when building history
* Use "relevant_files" for workflow tools only
* WIP: updated tests
* Extract only the content when building history
* Use "relevant_files" for workflow tools only
* WIP: fixed get_completion_next_steps_message missing param
* Fixed tests; request files consistently
* Fixed tests; request files consistently
* Fixed tests
* New testgen workflow tool; updated docs
* Swap testgen workflow
* Fix CI test failures by excluding API-dependent tests
  - Update GitHub Actions workflow to exclude simulation tests that require API keys
  - Fix collaboration tests to properly mock workflow tool expert analysis calls
  - Update test assertions to handle new workflow tool response format
  - Ensure unit tests run without external API dependencies in CI

  🤖 Generated with [Claude Code](https://claude.ai/code)

  Co-Authored-By: Claude <noreply@anthropic.com>
* WIP - Update tests to match new tools
* WIP - Update tests to match new tools

---------

Co-authored-by: Claude <noreply@anthropic.com>
Committed by: GitHub
Parent: 4dae6e457e
Commit: 69a3121452
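The commit message above describes step-based workflow tools and an option to skip the external assistant model entirely. As a rough, non-authoritative sketch, a workflow tool call in this architecture carries the step fields used throughout the tests in this diff; the dict literal below is illustrative, and the exact placement of use_assistant_model is an assumption based on the commit message:

```python
# Illustrative request shape for the new workflow tools (field names taken from
# the test calls in this diff; the schema itself is not defined here).
workflow_step_request = {
    "step": "Investigate the reported failure and record findings",
    "step_number": 1,            # 1-based index of the current step
    "total_steps": 3,            # planned number of steps
    "next_step_required": True,  # False on the final step hands off to expert analysis
    "findings": "Initial observations recorded by the client model",
    "relevant_files": ["/absolute/path/to/file.py"],
    "model": "flash",
    # Per the commit message, passing False skips external models completely
    # and lets Claude work alone (assumed field placement).
    "use_assistant_model": False,
}
```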
@@ -6,7 +6,9 @@ Each test is in its own file for better organization and maintainability.
"""

from .base_test import BaseSimulatorTest
from .test_analyze_validation import AnalyzeValidationTest
from .test_basic_conversation import BasicConversationTest
from .test_codereview_validation import CodeReviewValidationTest
from .test_consensus_conversation import TestConsensusConversation
from .test_consensus_stance import TestConsensusStance
from .test_consensus_three_models import TestConsensusThreeModels
@@ -27,10 +29,12 @@ from .test_openrouter_models import OpenRouterModelsTest
from .test_per_tool_deduplication import PerToolDeduplicationTest
from .test_planner_continuation_history import PlannerContinuationHistoryTest
from .test_planner_validation import PlannerValidationTest
from .test_precommitworkflow_validation import PrecommitWorkflowValidationTest

# Redis validation test removed - no longer needed for standalone server
from .test_refactor_validation import RefactorValidationTest
from .test_testgen_validation import TestGenValidationTest
from .test_thinkdeep_validation import ThinkDeepWorkflowValidationTest
from .test_token_allocation_validation import TokenAllocationValidationTest
from .test_vision_capability import VisionCapabilityTest
from .test_xai_models import XAIModelsTest
@@ -38,6 +42,7 @@ from .test_xai_models import XAIModelsTest

# Test registry for dynamic loading
TEST_REGISTRY = {
    "basic_conversation": BasicConversationTest,
    "codereview_validation": CodeReviewValidationTest,
    "content_validation": ContentValidationTest,
    "per_tool_deduplication": PerToolDeduplicationTest,
    "cross_tool_continuation": CrossToolContinuationTest,
@@ -52,8 +57,10 @@ TEST_REGISTRY = {
    "openrouter_models": OpenRouterModelsTest,
    "planner_validation": PlannerValidationTest,
    "planner_continuation_history": PlannerContinuationHistoryTest,
    "precommit_validation": PrecommitWorkflowValidationTest,
    "token_allocation_validation": TokenAllocationValidationTest,
    "testgen_validation": TestGenValidationTest,
    "thinkdeep_validation": ThinkDeepWorkflowValidationTest,
    "refactor_validation": RefactorValidationTest,
    "debug_validation": DebugValidationTest,
    "debug_certain_confidence": DebugCertainConfidenceTest,
@@ -63,19 +70,20 @@ TEST_REGISTRY = {
    "consensus_conversation": TestConsensusConversation,
    "consensus_stance": TestConsensusStance,
    "consensus_three_models": TestConsensusThreeModels,
    "analyze_validation": AnalyzeValidationTest,
    # "o3_pro_expensive": O3ProExpensiveTest,  # COMMENTED OUT - too expensive to run by default
}

__all__ = [
    "BaseSimulatorTest",
    "BasicConversationTest",
    "CodeReviewValidationTest",
    "ContentValidationTest",
    "PerToolDeduplicationTest",
    "CrossToolContinuationTest",
    "CrossToolComprehensiveTest",
    "LineNumberValidationTest",
    "LogsValidationTest",
    # "RedisValidationTest",  # Removed - no longer needed for standalone server
    "TestModelThinkingConfig",
    "O3ModelSelectionTest",
    "O3ProExpensiveTest",
@@ -84,8 +92,10 @@ __all__ = [
    "OpenRouterModelsTest",
    "PlannerValidationTest",
    "PlannerContinuationHistoryTest",
    "PrecommitWorkflowValidationTest",
    "TokenAllocationValidationTest",
    "TestGenValidationTest",
    "ThinkDeepWorkflowValidationTest",
    "RefactorValidationTest",
    "DebugValidationTest",
    "DebugCertainConfidenceTest",
@@ -95,5 +105,6 @@ __all__ = [
    "TestConsensusConversation",
    "TestConsensusStance",
    "TestConsensusThreeModels",
    "AnalyzeValidationTest",
    "TEST_REGISTRY",
]
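The registry above exists so the simulator runner can load tests dynamically by name. A minimal sketch of how that lookup could be driven, assuming a no-argument constructor and relying only on the run_test() -> bool interface the tests in this diff define:

```python
from simulator_tests import TEST_REGISTRY


def run_selected(names: list[str]) -> bool:
    """Instantiate and run registered simulator tests by name (sketch only)."""
    all_passed = True
    for name in names:
        test_cls = TEST_REGISTRY[name]  # e.g. DebugValidationTest
        test = test_cls()               # assumed no-argument constructor
        all_passed = test.run_test() and all_passed
    return all_passed


# Example: run the two workflow-tool suites touched by this change
run_selected(["debug_validation", "planner_validation"])
```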
@@ -228,6 +228,10 @@ class Calculator:

        # Look for continuation_id in various places
        if isinstance(response_data, dict):
            # Check for direct continuation_id field (new workflow tools)
            if "continuation_id" in response_data:
                return response_data["continuation_id"]

            # Check metadata
            metadata = response_data.get("metadata", {})
            if "thread_id" in metadata:

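For reference, the full lookup order the hunk above implements can be written as a self-contained helper; the function name and the JSON-parsing wrapper are illustrative assumptions rather than code from this change:

```python
import json
from typing import Optional


def extract_continuation_id(response_text: str) -> Optional[str]:
    """Return a continuation id from a tool response, if one is present (sketch)."""
    try:
        response_data = json.loads(response_text)
    except (json.JSONDecodeError, TypeError):
        return None
    if isinstance(response_data, dict):
        # New workflow tools expose continuation_id at the top level
        if "continuation_id" in response_data:
            return response_data["continuation_id"]
        # Fall back to the thread_id carried in metadata
        metadata = response_data.get("metadata", {})
        if "thread_id" in metadata:
            return metadata["thread_id"]
    return None
```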
@@ -80,8 +80,10 @@ class ConversationBaseTest(BaseSimulatorTest):
        if project_root not in sys.path:
            sys.path.insert(0, project_root)

        # Import tools from server
        from server import TOOLS
        # Import and configure providers first (this is what main() does)
        from server import TOOLS, configure_providers

        configure_providers()

        self._tools = TOOLS
        self.logger.debug(f"Imported {len(self._tools)} tools for in-process testing")

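With configure_providers() called and TOOLS imported, the base class can dispatch tool calls in-process instead of talking to a running MCP server. A rough sketch of what such a dispatch might look like; the async execute() entry point and its return type are assumptions about the tool interface, not something shown in this diff:

```python
import asyncio


def call_tool_in_process(tools: dict, name: str, arguments: dict):
    """Dispatch a tool call against the in-memory TOOLS mapping (illustrative only)."""
    tool = tools[name]  # e.g. tools["debug"] or tools["planner"]
    # Assumed interface: each tool object exposes an async execute(arguments)
    # entry point that returns the serialized tool output.
    return asyncio.run(tool.execute(arguments))
```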
simulator_tests/test_analyze_validation.py (new file, 1079 lines): diff suppressed because it is too large
simulator_tests/test_codereview_validation.py (new file, 1027 lines): diff suppressed because it is too large
@@ -62,7 +62,7 @@ class CrossToolContinuationTest(ConversationBaseTest):
        self.logger.info(" 1: Testing chat -> thinkdeep -> codereview")

        # Start with chat
        chat_response, chat_id = self.call_mcp_tool_direct(
        chat_response, chat_id = self.call_mcp_tool(
            "chat",
            {
                "prompt": "Please use low thinking mode. Look at this Python code and tell me what you think about it",
@@ -76,11 +76,15 @@ class CrossToolContinuationTest(ConversationBaseTest):
            return False

        # Continue with thinkdeep
        thinkdeep_response, _ = self.call_mcp_tool_direct(
        thinkdeep_response, _ = self.call_mcp_tool(
            "thinkdeep",
            {
                "prompt": "Please use low thinking mode. Think deeply about potential performance issues in this code",
                "files": [self.test_files["python"]],  # Same file should be deduplicated
                "step": "Think deeply about potential performance issues in this code. Please use low thinking mode.",
                "step_number": 1,
                "total_steps": 1,
                "next_step_required": False,
                "findings": "Building on previous chat analysis to examine performance issues",
                "relevant_files": [self.test_files["python"]],  # Same file should be deduplicated
                "continuation_id": chat_id,
                "model": "flash",
            },
@@ -91,11 +95,15 @@ class CrossToolContinuationTest(ConversationBaseTest):
            return False

        # Continue with codereview
        codereview_response, _ = self.call_mcp_tool_direct(
        codereview_response, _ = self.call_mcp_tool(
            "codereview",
            {
                "files": [self.test_files["python"]],  # Same file should be deduplicated
                "prompt": "Building on our previous analysis, provide a comprehensive code review",
                "step": "Building on our previous analysis, provide a comprehensive code review",
                "step_number": 1,
                "total_steps": 1,
                "next_step_required": False,
                "findings": "Continuing from previous chat and thinkdeep analysis for comprehensive review",
                "relevant_files": [self.test_files["python"]],  # Same file should be deduplicated
                "continuation_id": chat_id,
                "model": "flash",
            },
@@ -118,11 +126,15 @@ class CrossToolContinuationTest(ConversationBaseTest):
        self.logger.info(" 2: Testing analyze -> debug -> thinkdeep")

        # Start with analyze
        analyze_response, analyze_id = self.call_mcp_tool_direct(
        analyze_response, analyze_id = self.call_mcp_tool(
            "analyze",
            {
                "files": [self.test_files["python"]],
                "prompt": "Analyze this code for quality and performance issues",
                "step": "Analyze this code for quality and performance issues",
                "step_number": 1,
                "total_steps": 1,
                "next_step_required": False,
                "findings": "Starting analysis of Python code for quality and performance issues",
                "relevant_files": [self.test_files["python"]],
                "model": "flash",
            },
        )
@@ -132,11 +144,15 @@ class CrossToolContinuationTest(ConversationBaseTest):
            return False

        # Continue with debug
        debug_response, _ = self.call_mcp_tool_direct(
        debug_response, _ = self.call_mcp_tool(
            "debug",
            {
                "files": [self.test_files["python"]],  # Same file should be deduplicated
                "prompt": "Based on our analysis, help debug the performance issue in fibonacci",
                "step": "Based on our analysis, help debug the performance issue in fibonacci",
                "step_number": 1,
                "total_steps": 1,
                "next_step_required": False,
                "findings": "Building on previous analysis to debug specific performance issue",
                "relevant_files": [self.test_files["python"]],  # Same file should be deduplicated
                "continuation_id": analyze_id,
                "model": "flash",
            },
@@ -147,11 +163,15 @@ class CrossToolContinuationTest(ConversationBaseTest):
            return False

        # Continue with thinkdeep
        final_response, _ = self.call_mcp_tool_direct(
        final_response, _ = self.call_mcp_tool(
            "thinkdeep",
            {
                "prompt": "Please use low thinking mode. Think deeply about the architectural implications of the issues we've found",
                "files": [self.test_files["python"]],  # Same file should be deduplicated
                "step": "Think deeply about the architectural implications of the issues we've found. Please use low thinking mode.",
                "step_number": 1,
                "total_steps": 1,
                "next_step_required": False,
                "findings": "Building on analysis and debug findings to explore architectural implications",
                "relevant_files": [self.test_files["python"]],  # Same file should be deduplicated
                "continuation_id": analyze_id,
                "model": "flash",
            },
@@ -174,7 +194,7 @@ class CrossToolContinuationTest(ConversationBaseTest):
        self.logger.info(" 3: Testing multi-file cross-tool continuation")

        # Start with both files
        multi_response, multi_id = self.call_mcp_tool_direct(
        multi_response, multi_id = self.call_mcp_tool(
            "chat",
            {
                "prompt": "Please use low thinking mode. Analyze both the Python code and configuration file",
@@ -188,11 +208,15 @@ class CrossToolContinuationTest(ConversationBaseTest):
            return False

        # Switch to codereview with same files (should use conversation history)
        multi_review, _ = self.call_mcp_tool_direct(
        multi_review, _ = self.call_mcp_tool(
            "codereview",
            {
                "files": [self.test_files["python"], self.test_files["config"]],  # Same files
                "prompt": "Review both files in the context of our previous discussion",
                "step": "Review both files in the context of our previous discussion",
                "step_number": 1,
                "total_steps": 1,
                "next_step_required": False,
                "findings": "Continuing multi-file analysis with code review perspective",
                "relevant_files": [self.test_files["python"], self.test_files["config"]],  # Same files
                "continuation_id": multi_id,
                "model": "flash",
            },

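All three scenarios above follow the same cross-tool continuation pattern: the continuation_id returned by the first call is threaded into every follow-up call, whichever tool handles it, while relevant_files repeats the same paths so deduplication can kick in. A condensed, illustrative sketch of that pattern (assuming a ConversationBaseTest-style instance is passed in):

```python
def run_cross_tool_continuation(test) -> bool:
    """Illustrative only: thread one continuation_id through two different tools."""
    # First tool starts the conversation and returns an id
    chat_response, chat_id = test.call_mcp_tool(
        "chat",
        {"prompt": "Please use low thinking mode. Look at this Python code", "model": "flash"},
    )
    if not chat_response or not chat_id:
        return False
    # Second tool reuses the same conversation via continuation_id
    review_response, _ = test.call_mcp_tool(
        "codereview",
        {
            "step": "Review the file discussed in the chat above",
            "step_number": 1,
            "total_steps": 1,
            "next_step_required": False,
            "findings": "Follow-up review building on the chat",
            "relevant_files": [test.test_files["python"]],
            "continuation_id": chat_id,  # reuse the conversation started by chat
            "model": "flash",
        },
    )
    return bool(review_response)
```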
@@ -1,13 +1,10 @@
#!/usr/bin/env python3
"""
Debug Tool Self-Investigation Validation Test
DebugWorkflow Tool Validation Test

Tests the debug tool's systematic self-investigation capabilities including:
- Step-by-step investigation with proper JSON responses
- Progressive tracking of findings, files, and methods
- Hypothesis formation and confidence tracking
- Backtracking and revision capabilities
- Final expert analysis after investigation completion
Tests the debug tool's capabilities using the new workflow architecture.
This validates that the new workflow-based implementation maintains
all the functionality of the original debug tool.
"""

import json
@@ -17,7 +14,7 @@ from .conversation_base_test import ConversationBaseTest


class DebugValidationTest(ConversationBaseTest):
    """Test debug tool's self-investigation and expert analysis features"""
    """Test debug tool with new workflow architecture"""

    @property
    def test_name(self) -> str:
@@ -25,15 +22,15 @@ class DebugValidationTest(ConversationBaseTest):

    @property
    def test_description(self) -> str:
        return "Debug tool self-investigation pattern validation"
        return "Debug tool validation with new workflow architecture"

    def run_test(self) -> bool:
        """Test debug tool self-investigation capabilities"""
        """Test debug tool capabilities"""
        # Set up the test environment
        self.setUp()

        try:
            self.logger.info("Test: Debug tool self-investigation validation")
            self.logger.info("Test: DebugWorkflow tool validation (new architecture)")

            # Create a Python file with a subtle but realistic bug
            self._create_buggy_code()
@@ -50,11 +47,23 @@ class DebugValidationTest(ConversationBaseTest):
            if not self._test_complete_investigation_with_analysis():
                return False

            # Test 4: Certain confidence behavior
            if not self._test_certain_confidence():
                return False

            # Test 5: Context-aware file embedding
            if not self._test_context_aware_file_embedding():
                return False

            # Test 6: Multi-step file context optimization
            if not self._test_multi_step_file_context():
                return False

            self.logger.info(" ✅ All debug validation tests passed")
            return True

        except Exception as e:
            self.logger.error(f"Debug validation test failed: {e}")
            self.logger.error(f"DebugWorkflow validation test failed: {e}")
            return False

    def _create_buggy_code(self):
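The added tests 4 through 6 hinge on the status values the debug workflow reports at each stage: intermediate steps pause the investigation, the final step normally calls expert analysis, and a final step declared with certain confidence skips the external model. A small sketch of that mapping as asserted by the validation steps further down; the helper itself is illustrative, not part of this change:

```python
def expected_debug_status(next_step_required: bool, confidence: str = "low") -> str:
    """Status the debug workflow test expects for a given step (sketch only)."""
    if next_step_required:
        # Intermediate steps pause so the client can keep investigating
        return "pause_for_investigation"
    if confidence == "certain":
        # Certain confidence skips the external expert model entirely
        return "certain_confidence_proceed_with_fix"
    # A normal final step hands the findings to expert analysis
    return "calling_expert_analysis"
```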
@@ -164,8 +173,8 @@ RuntimeError: dictionary changed size during iteration
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Validate step 1 response structure
|
||||
if not self._validate_step_response(response1_data, 1, 4, True, "investigation_in_progress"):
|
||||
# Validate step 1 response structure - expect pause_for_investigation for next_step_required=True
|
||||
if not self._validate_step_response(response1_data, 1, 4, True, "pause_for_investigation"):
|
||||
return False
|
||||
|
||||
self.logger.info(f" ✅ Step 1 successful, continuation_id: {continuation_id}")
|
||||
@@ -194,7 +203,7 @@ RuntimeError: dictionary changed size during iteration
|
||||
return False
|
||||
|
||||
response2_data = self._parse_debug_response(response2)
|
||||
if not self._validate_step_response(response2_data, 2, 4, True, "investigation_in_progress"):
|
||||
if not self._validate_step_response(response2_data, 2, 4, True, "pause_for_investigation"):
|
||||
return False
|
||||
|
||||
# Check investigation status tracking
|
||||
@@ -213,35 +222,6 @@ RuntimeError: dictionary changed size during iteration
|
||||
|
||||
self.logger.info(" ✅ Step 2 successful with proper tracking")
|
||||
|
||||
# Step 3: Validate hypothesis
|
||||
self.logger.info(" 1.1.3: Step 3 - Hypothesis validation")
|
||||
response3, _ = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "Confirming the bug pattern: the for loop iterates over self.active_sessions.items() while del self.active_sessions[session_id] modifies the dictionary inside the loop.",
|
||||
"step_number": 3,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Confirmed: Line 44-47 shows classic dictionary modification during iteration bug. The fix would be to collect expired session IDs first, then delete them after iteration completes.",
|
||||
"files_checked": [self.buggy_file],
|
||||
"relevant_files": [self.buggy_file],
|
||||
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
|
||||
"hypothesis": "Dictionary modification during iteration in cleanup_expired_sessions causes RuntimeError",
|
||||
"confidence": "high",
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
if not response3:
|
||||
self.logger.error("Failed to continue investigation to step 3")
|
||||
return False
|
||||
|
||||
response3_data = self._parse_debug_response(response3)
|
||||
if not self._validate_step_response(response3_data, 3, 4, True, "investigation_in_progress"):
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Investigation session progressing successfully")
|
||||
|
||||
# Store continuation_id for next test
|
||||
self.investigation_continuation_id = continuation_id
|
||||
return True
|
||||
@@ -321,7 +301,7 @@ RuntimeError: dictionary changed size during iteration
|
||||
return False
|
||||
|
||||
response3_data = self._parse_debug_response(response3)
|
||||
if not self._validate_step_response(response3_data, 3, 4, True, "investigation_in_progress"):
|
||||
if not self._validate_step_response(response3_data, 3, 4, True, "pause_for_investigation"):
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Backtracking working correctly")
|
||||
@@ -386,7 +366,7 @@ RuntimeError: dictionary changed size during iteration
|
||||
if not response_final_data:
|
||||
return False
|
||||
|
||||
# Validate final response structure
|
||||
# Validate final response structure - expect calling_expert_analysis for next_step_required=False
|
||||
if response_final_data.get("status") != "calling_expert_analysis":
|
||||
self.logger.error(
|
||||
f"Expected status 'calling_expert_analysis', got '{response_final_data.get('status')}'"
|
||||
@@ -433,38 +413,67 @@ RuntimeError: dictionary changed size during iteration
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Complete investigation with expert analysis successful")
|
||||
|
||||
# Validate logs
|
||||
self.logger.info(" 📋 Validating execution logs...")
|
||||
|
||||
# Get server logs
|
||||
logs = self.get_recent_server_logs(500)
|
||||
|
||||
# Look for debug tool execution patterns
|
||||
debug_patterns = [
|
||||
"debug tool",
|
||||
"investigation",
|
||||
"Expert analysis",
|
||||
"calling_expert_analysis",
|
||||
]
|
||||
|
||||
patterns_found = 0
|
||||
for pattern in debug_patterns:
|
||||
if pattern in logs:
|
||||
patterns_found += 1
|
||||
self.logger.debug(f" ✅ Found log pattern: {pattern}")
|
||||
|
||||
if patterns_found >= 2:
|
||||
self.logger.info(f" ✅ Log validation passed ({patterns_found}/{len(debug_patterns)} patterns)")
|
||||
else:
|
||||
self.logger.warning(f" ⚠️ Only found {patterns_found}/{len(debug_patterns)} log patterns")
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Complete investigation test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_certain_confidence(self) -> bool:
|
||||
"""Test certain confidence behavior - should skip expert analysis"""
|
||||
try:
|
||||
self.logger.info(" 1.4: Testing certain confidence behavior")
|
||||
|
||||
# Test certain confidence - should skip expert analysis
|
||||
self.logger.info(" 1.4.1: Certain confidence investigation")
|
||||
response_certain, _ = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "I have confirmed the exact root cause with 100% certainty: dictionary modification during iteration.",
|
||||
"step_number": 1,
|
||||
"total_steps": 1,
|
||||
"next_step_required": False, # Final step
|
||||
"findings": "The bug is on line 44-47: for loop iterates over dict.items() while del modifies the dict inside the loop. Fix is simple: collect expired IDs first, then delete after iteration.",
|
||||
"files_checked": [self.buggy_file],
|
||||
"relevant_files": [self.buggy_file],
|
||||
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
|
||||
"hypothesis": "Dictionary modification during iteration causes RuntimeError - fix is straightforward",
|
||||
"confidence": "certain", # This should skip expert analysis
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response_certain:
|
||||
self.logger.error("Failed to test certain confidence")
|
||||
return False
|
||||
|
||||
response_certain_data = self._parse_debug_response(response_certain)
|
||||
if not response_certain_data:
|
||||
return False
|
||||
|
||||
# Validate certain confidence response - should skip expert analysis
|
||||
if response_certain_data.get("status") != "certain_confidence_proceed_with_fix":
|
||||
self.logger.error(
|
||||
f"Expected status 'certain_confidence_proceed_with_fix', got '{response_certain_data.get('status')}'"
|
||||
)
|
||||
return False
|
||||
|
||||
if not response_certain_data.get("skip_expert_analysis"):
|
||||
self.logger.error("Expected skip_expert_analysis=true for certain confidence")
|
||||
return False
|
||||
|
||||
expert_analysis = response_certain_data.get("expert_analysis", {})
|
||||
if expert_analysis.get("status") != "skipped_due_to_certain_confidence":
|
||||
self.logger.error("Expert analysis should be skipped for certain confidence")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Certain confidence behavior working correctly")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Certain confidence test failed: {e}")
|
||||
return False
|
||||
|
||||
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
|
||||
"""Call an MCP tool in-process - override for debug-specific response handling"""
|
||||
# Use in-process implementation to maintain conversation memory
|
||||
@@ -537,9 +546,6 @@ RuntimeError: dictionary changed size during iteration
|
||||
self.logger.error("Missing investigation_status in response")
|
||||
return False
|
||||
|
||||
# Output field removed in favor of contextual next_steps
|
||||
# No longer checking for "output" field as it was redundant
|
||||
|
||||
# Check next_steps guidance
|
||||
if not response_data.get("next_steps"):
|
||||
self.logger.error("Missing next_steps guidance in response")
|
||||
@@ -550,3 +556,406 @@ RuntimeError: dictionary changed size during iteration
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error validating step response: {e}")
|
||||
return False
|
||||
|
||||
def _test_context_aware_file_embedding(self) -> bool:
|
||||
"""Test context-aware file embedding optimization"""
|
||||
try:
|
||||
self.logger.info(" 1.5: Testing context-aware file embedding")
|
||||
|
||||
# Create multiple test files for context testing
|
||||
file1_content = """#!/usr/bin/env python3
|
||||
def process_data(data):
|
||||
\"\"\"Process incoming data\"\"\"
|
||||
result = []
|
||||
for item in data:
|
||||
if item.get('valid'):
|
||||
result.append(item['value'])
|
||||
return result
|
||||
"""
|
||||
|
||||
file2_content = """#!/usr/bin/env python3
|
||||
def validate_input(data):
|
||||
\"\"\"Validate input data\"\"\"
|
||||
if not isinstance(data, list):
|
||||
raise ValueError("Data must be a list")
|
||||
|
||||
for item in data:
|
||||
if not isinstance(item, dict):
|
||||
raise ValueError("Items must be dictionaries")
|
||||
if 'value' not in item:
|
||||
raise ValueError("Items must have 'value' key")
|
||||
|
||||
return True
|
||||
"""
|
||||
|
||||
# Create test files
|
||||
file1 = self.create_additional_test_file("data_processor.py", file1_content)
|
||||
file2 = self.create_additional_test_file("validator.py", file2_content)
|
||||
|
||||
# Test 1: New conversation, intermediate step - should only reference files
|
||||
self.logger.info(" 1.5.1: New conversation intermediate step (should reference only)")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "Starting investigation of data processing pipeline",
|
||||
"step_number": 1,
|
||||
"total_steps": 3,
|
||||
"next_step_required": True, # Intermediate step
|
||||
"findings": "Initial analysis of data processing components",
|
||||
"files_checked": [file1, file2],
|
||||
"relevant_files": [file1], # This should be referenced, not embedded
|
||||
"relevant_methods": ["process_data"],
|
||||
"hypothesis": "Investigating data flow",
|
||||
"confidence": "low",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start context-aware file embedding test")
|
||||
return False
|
||||
|
||||
response1_data = self._parse_debug_response(response1)
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Check file context - should be reference_only for intermediate step
|
||||
file_context = response1_data.get("file_context", {})
|
||||
if file_context.get("type") != "reference_only":
|
||||
self.logger.error(f"Expected reference_only file context, got: {file_context.get('type')}")
|
||||
return False
|
||||
|
||||
if "Files referenced but not embedded" not in file_context.get("context_optimization", ""):
|
||||
self.logger.error("Expected context optimization message for reference_only")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Intermediate step correctly uses reference_only file context")
|
||||
|
||||
# Test 2: Intermediate step with continuation - should still only reference
|
||||
self.logger.info(" 1.5.2: Intermediate step with continuation (should reference only)")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "Continuing investigation with more detailed analysis",
|
||||
"step_number": 2,
|
||||
"total_steps": 3,
|
||||
"next_step_required": True, # Still intermediate
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Found potential issues in validation logic",
|
||||
"files_checked": [file1, file2],
|
||||
"relevant_files": [file1, file2], # Both files referenced
|
||||
"relevant_methods": ["process_data", "validate_input"],
|
||||
"hypothesis": "Validation might be too strict",
|
||||
"confidence": "medium",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue to step 2")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_debug_response(response2)
|
||||
if not response2_data:
|
||||
return False
|
||||
|
||||
# Check file context - should still be reference_only
|
||||
file_context2 = response2_data.get("file_context", {})
|
||||
if file_context2.get("type") != "reference_only":
|
||||
self.logger.error(f"Expected reference_only file context for step 2, got: {file_context2.get('type')}")
|
||||
return False
|
||||
|
||||
# Should include reference note
|
||||
if not file_context2.get("note"):
|
||||
self.logger.error("Expected file reference note for intermediate step")
|
||||
return False
|
||||
|
||||
reference_note = file_context2.get("note", "")
|
||||
if "data_processor.py" not in reference_note or "validator.py" not in reference_note:
|
||||
self.logger.error("File reference note should mention both files")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Intermediate step with continuation correctly uses reference_only")
|
||||
|
||||
# Test 3: Final step - should embed files for expert analysis
|
||||
self.logger.info(" 1.5.3: Final step (should embed files)")
|
||||
response3, _ = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "Investigation complete - identified the root cause",
|
||||
"step_number": 3,
|
||||
"total_steps": 3,
|
||||
"next_step_required": False, # Final step - should embed files
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Root cause: validator is rejecting valid data due to strict type checking",
|
||||
"files_checked": [file1, file2],
|
||||
"relevant_files": [file1, file2], # Should be fully embedded
|
||||
"relevant_methods": ["process_data", "validate_input"],
|
||||
"hypothesis": "Validation logic is too restrictive for valid edge cases",
|
||||
"confidence": "high",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response3:
|
||||
self.logger.error("Failed to complete to final step")
|
||||
return False
|
||||
|
||||
response3_data = self._parse_debug_response(response3)
|
||||
if not response3_data:
|
||||
return False
|
||||
|
||||
# Check file context - should be fully_embedded for final step
|
||||
file_context3 = response3_data.get("file_context", {})
|
||||
if file_context3.get("type") != "fully_embedded":
|
||||
self.logger.error(
|
||||
f"Expected fully_embedded file context for final step, got: {file_context3.get('type')}"
|
||||
)
|
||||
return False
|
||||
|
||||
if "Full file content embedded for expert analysis" not in file_context3.get("context_optimization", ""):
|
||||
self.logger.error("Expected expert analysis optimization message for fully_embedded")
|
||||
return False
|
||||
|
||||
# Should show files embedded count
|
||||
files_embedded = file_context3.get("files_embedded", 0)
|
||||
if files_embedded == 0:
|
||||
# This is OK - files might already be in conversation history
|
||||
self.logger.info(
|
||||
" ℹ️ Files embedded count is 0 - files already in conversation history (smart deduplication)"
|
||||
)
|
||||
else:
|
||||
self.logger.info(f" ✅ Files embedded count: {files_embedded}")
|
||||
|
||||
self.logger.info(" ✅ Final step correctly uses fully_embedded file context")
|
||||
|
||||
# Verify expert analysis was called for final step
|
||||
if response3_data.get("status") != "calling_expert_analysis":
|
||||
self.logger.error("Final step should trigger expert analysis")
|
||||
return False
|
||||
|
||||
if "expert_analysis" not in response3_data:
|
||||
self.logger.error("Expert analysis should be present in final step")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Context-aware file embedding test completed successfully")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Context-aware file embedding test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_multi_step_file_context(self) -> bool:
|
||||
"""Test multi-step workflow with proper file context transitions"""
|
||||
try:
|
||||
self.logger.info(" 1.6: Testing multi-step file context optimization")
|
||||
|
||||
# Create a complex scenario with multiple files
|
||||
config_content = """#!/usr/bin/env python3
|
||||
import os
|
||||
|
||||
DATABASE_URL = os.getenv('DATABASE_URL', 'sqlite:///app.db')
|
||||
DEBUG_MODE = os.getenv('DEBUG', 'False').lower() == 'true'
|
||||
MAX_CONNECTIONS = int(os.getenv('MAX_CONNECTIONS', '10'))
|
||||
|
||||
# Bug: This will cause issues when MAX_CONNECTIONS is not a valid integer
|
||||
CACHE_SIZE = MAX_CONNECTIONS * 2 # Problematic if MAX_CONNECTIONS is invalid
|
||||
"""
|
||||
|
||||
server_content = """#!/usr/bin/env python3
|
||||
from config import DATABASE_URL, DEBUG_MODE, CACHE_SIZE
|
||||
import sqlite3
|
||||
|
||||
class DatabaseServer:
|
||||
def __init__(self):
|
||||
self.connection_pool = []
|
||||
self.cache_size = CACHE_SIZE # This will fail if CACHE_SIZE is invalid
|
||||
|
||||
def connect(self):
|
||||
try:
|
||||
conn = sqlite3.connect(DATABASE_URL)
|
||||
self.connection_pool.append(conn)
|
||||
return conn
|
||||
except Exception as e:
|
||||
print(f"Connection failed: {e}")
|
||||
return None
|
||||
"""
|
||||
|
||||
# Create test files
|
||||
config_file = self.create_additional_test_file("config.py", config_content)
|
||||
server_file = self.create_additional_test_file("database_server.py", server_content)
|
||||
|
||||
# Step 1: Start investigation (new conversation)
|
||||
self.logger.info(" 1.6.1: Step 1 - Start investigation")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "Investigating application startup failures in production environment",
|
||||
"step_number": 1,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Application fails to start with configuration errors",
|
||||
"files_checked": [config_file],
|
||||
"relevant_files": [config_file],
|
||||
"relevant_methods": [],
|
||||
"hypothesis": "Configuration issue causing startup failure",
|
||||
"confidence": "low",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start multi-step file context test")
|
||||
return False
|
||||
|
||||
response1_data = self._parse_debug_response(response1)
|
||||
|
||||
# Validate step 1 - should use reference_only
|
||||
file_context1 = response1_data.get("file_context", {})
|
||||
if file_context1.get("type") != "reference_only":
|
||||
self.logger.error("Step 1 should use reference_only file context")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 1: reference_only file context")
|
||||
|
||||
# Step 2: Expand investigation
|
||||
self.logger.info(" 1.6.2: Step 2 - Expand investigation")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "Found configuration issue - investigating database server initialization",
|
||||
"step_number": 2,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "MAX_CONNECTIONS environment variable contains invalid value, causing CACHE_SIZE calculation to fail",
|
||||
"files_checked": [config_file, server_file],
|
||||
"relevant_files": [config_file, server_file],
|
||||
"relevant_methods": ["DatabaseServer.__init__"],
|
||||
"hypothesis": "Invalid environment variable causing integer conversion error",
|
||||
"confidence": "medium",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue to step 2")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_debug_response(response2)
|
||||
|
||||
# Validate step 2 - should still use reference_only
|
||||
file_context2 = response2_data.get("file_context", {})
|
||||
if file_context2.get("type") != "reference_only":
|
||||
self.logger.error("Step 2 should use reference_only file context")
|
||||
return False
|
||||
|
||||
# Should reference both files
|
||||
reference_note = file_context2.get("note", "")
|
||||
if "config.py" not in reference_note or "database_server.py" not in reference_note:
|
||||
self.logger.error("Step 2 should reference both files in note")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 2: reference_only file context with multiple files")
|
||||
|
||||
# Step 3: Deep analysis
|
||||
self.logger.info(" 1.6.3: Step 3 - Deep analysis")
|
||||
response3, _ = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "Analyzing the exact error propagation path and impact",
|
||||
"step_number": 3,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Error occurs in config.py line 8 when MAX_CONNECTIONS is not numeric, then propagates to DatabaseServer.__init__",
|
||||
"files_checked": [config_file, server_file],
|
||||
"relevant_files": [config_file, server_file],
|
||||
"relevant_methods": ["DatabaseServer.__init__"],
|
||||
"hypothesis": "Need proper error handling and validation for environment variables",
|
||||
"confidence": "high",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response3:
|
||||
self.logger.error("Failed to continue to step 3")
|
||||
return False
|
||||
|
||||
response3_data = self._parse_debug_response(response3)
|
||||
|
||||
# Validate step 3 - should still use reference_only
|
||||
file_context3 = response3_data.get("file_context", {})
|
||||
if file_context3.get("type") != "reference_only":
|
||||
self.logger.error("Step 3 should use reference_only file context")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 3: reference_only file context")
|
||||
|
||||
# Step 4: Final analysis with expert consultation
|
||||
self.logger.info(" 1.6.4: Step 4 - Final step with expert analysis")
|
||||
response4, _ = self.call_mcp_tool(
|
||||
"debug",
|
||||
{
|
||||
"step": "Investigation complete - root cause identified with solution",
|
||||
"step_number": 4,
|
||||
"total_steps": 4,
|
||||
"next_step_required": False, # Final step - should embed files
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Root cause: config.py assumes MAX_CONNECTIONS env var is always a valid integer. Fix: add try/except with default value and proper validation.",
|
||||
"files_checked": [config_file, server_file],
|
||||
"relevant_files": [config_file, server_file],
|
||||
"relevant_methods": ["DatabaseServer.__init__"],
|
||||
"hypothesis": "Environment variable validation needed with proper error handling",
|
||||
"confidence": "high",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response4:
|
||||
self.logger.error("Failed to complete to final step")
|
||||
return False
|
||||
|
||||
response4_data = self._parse_debug_response(response4)
|
||||
|
||||
# Validate step 4 - should use fully_embedded for expert analysis
|
||||
file_context4 = response4_data.get("file_context", {})
|
||||
if file_context4.get("type") != "fully_embedded":
|
||||
self.logger.error("Step 4 (final) should use fully_embedded file context")
|
||||
return False
|
||||
|
||||
if "expert analysis" not in file_context4.get("context_optimization", "").lower():
|
||||
self.logger.error("Final step should mention expert analysis in context optimization")
|
||||
return False
|
||||
|
||||
# Verify expert analysis was triggered
|
||||
if response4_data.get("status") != "calling_expert_analysis":
|
||||
self.logger.error("Final step should trigger expert analysis")
|
||||
return False
|
||||
|
||||
# Check that expert analysis has file context
|
||||
expert_analysis = response4_data.get("expert_analysis", {})
|
||||
if not expert_analysis:
|
||||
self.logger.error("Expert analysis should be present in final step")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 4: fully_embedded file context with expert analysis")
|
||||
|
||||
# Validate the complete workflow progression
|
||||
progression_summary = {
|
||||
"step_1": "reference_only (new conversation, intermediate)",
|
||||
"step_2": "reference_only (continuation, intermediate)",
|
||||
"step_3": "reference_only (continuation, intermediate)",
|
||||
"step_4": "fully_embedded (continuation, final)",
|
||||
}
|
||||
|
||||
self.logger.info(" 📋 File context progression:")
|
||||
for step, context_type in progression_summary.items():
|
||||
self.logger.info(f" {step}: {context_type}")
|
||||
|
||||
self.logger.info(" ✅ Multi-step file context optimization test completed successfully")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Multi-step file context test failed: {e}")
|
||||
return False
|
||||
|
||||
@@ -60,14 +60,18 @@ def divide(x, y):
        # Step 1: precommit tool with dummy file (low thinking mode)
        self.logger.info(" Step 1: precommit tool with dummy file")
        precommit_params = {
            "step": "Initial analysis of dummy_code.py for commit readiness. Please give me a quick one line reply.",
            "step_number": 1,
            "total_steps": 2,
            "next_step_required": True,
            "findings": "Starting pre-commit validation of dummy_code.py",
            "path": os.getcwd(),  # Use current working directory as the git repo path
            "files": [dummy_file_path],
            "prompt": "Please give me a quick one line reply. Review this code for commit readiness",
            "relevant_files": [dummy_file_path],
            "thinking_mode": "low",
            "model": "flash",
        }

        response1, continuation_id = self.call_mcp_tool_direct("precommit", precommit_params)
        response1, continuation_id = self.call_mcp_tool("precommit", precommit_params)
        if not response1:
            self.logger.error(" ❌ Step 1: precommit tool failed")
            return False
@@ -86,13 +90,17 @@ def divide(x, y):
        # Step 2: codereview tool with same file (NO continuation - fresh conversation)
        self.logger.info(" Step 2: codereview tool with same file (fresh conversation)")
        codereview_params = {
            "files": [dummy_file_path],
            "prompt": "Please give me a quick one line reply. General code review for quality and best practices",
            "step": "Initial code review of dummy_code.py for quality and best practices. Please give me a quick one line reply.",
            "step_number": 1,
            "total_steps": 1,
            "next_step_required": False,
            "findings": "Starting code review of dummy_code.py",
            "relevant_files": [dummy_file_path],
            "thinking_mode": "low",
            "model": "flash",
        }

        response2, _ = self.call_mcp_tool_direct("codereview", codereview_params)
        response2, _ = self.call_mcp_tool("codereview", codereview_params)
        if not response2:
            self.logger.error(" ❌ Step 2: codereview tool failed")
            return False
@@ -115,14 +123,18 @@ def subtract(a, b):
        # Continue precommit with both files
        continue_params = {
            "continuation_id": continuation_id,
            "step": "Continue analysis with new_feature.py added. Please give me a quick one line reply about both files.",
            "step_number": 2,
            "total_steps": 2,
            "next_step_required": False,
            "findings": "Continuing pre-commit validation with both dummy_code.py and new_feature.py",
            "path": os.getcwd(),  # Use current working directory as the git repo path
            "files": [dummy_file_path, new_file_path],  # Old + new file
            "prompt": "Please give me a quick one line reply. Now also review the new feature file along with the previous one",
            "relevant_files": [dummy_file_path, new_file_path],  # Old + new file
            "thinking_mode": "low",
            "model": "flash",
        }

        response3, _ = self.call_mcp_tool_direct("precommit", continue_params)
        response3, _ = self.call_mcp_tool("precommit", continue_params)
        if not response3:
            self.logger.error(" ❌ Step 3: precommit continuation failed")
            return False

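The hunk above mixes two conversation modes: the codereview call deliberately omits continuation_id and therefore starts a fresh conversation, while the second precommit step reuses the id returned by step 1. A compressed, illustrative sketch of that flow; parameter values are placeholders and the helper is not part of this change:

```python
import os


def run_precommit_then_review(test, dummy_file_path: str, new_file_path: str) -> bool:
    """Illustrative only: continued vs. fresh conversations across workflow tools."""
    # Step 1: open a precommit workflow and keep its continuation_id
    _, continuation_id = test.call_mcp_tool(
        "precommit",
        {"step": "Initial analysis", "step_number": 1, "total_steps": 2,
         "next_step_required": True, "findings": "Starting validation",
         "path": os.getcwd(), "relevant_files": [dummy_file_path], "model": "flash"},
    )
    # Fresh conversation: no continuation_id, so codereview starts from scratch
    test.call_mcp_tool(
        "codereview",
        {"step": "Initial review", "step_number": 1, "total_steps": 1,
         "next_step_required": False, "findings": "Starting review",
         "relevant_files": [dummy_file_path], "model": "flash"},
    )
    # Continued conversation: the second precommit step reuses the stored id
    response3, _ = test.call_mcp_tool(
        "precommit",
        {"continuation_id": continuation_id, "step": "Add the new file",
         "step_number": 2, "total_steps": 2, "next_step_required": False,
         "findings": "Validating both files", "path": os.getcwd(),
         "relevant_files": [dummy_file_path, new_file_path], "model": "flash"},
    )
    return bool(response3)
```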
@@ -1,13 +1,11 @@
#!/usr/bin/env python3
"""
Planner Tool Validation Test
PlannerWorkflow Tool Validation Test

Tests the planner tool's sequential planning capabilities including:
- Step-by-step planning with proper JSON responses
- Continuation logic across planning sessions
- Branching and revision capabilities
- Previous plan context loading
- Plan completion and summary storage
Tests the planner tool's capabilities using the new workflow architecture.
This validates that the new workflow-based implementation maintains all the
functionality of the original planner tool while using the workflow pattern
like the debug tool.
"""

import json
@@ -17,7 +15,7 @@ from .conversation_base_test import ConversationBaseTest


class PlannerValidationTest(ConversationBaseTest):
    """Test planner tool's sequential planning and continuation features"""
    """Test planner tool with new workflow architecture"""

    @property
    def test_name(self) -> str:
@@ -25,49 +23,62 @@ class PlannerValidationTest(ConversationBaseTest):

    @property
    def test_description(self) -> str:
        return "Planner tool sequential planning and continuation validation"
        return "PlannerWorkflow tool validation with new workflow architecture"

    def run_test(self) -> bool:
        """Test planner tool sequential planning capabilities"""
        """Test planner tool capabilities"""
        # Set up the test environment
        self.setUp()

        try:
            self.logger.info("Test: Planner tool validation")
            self.logger.info("Test: PlannerWorkflow tool validation (new architecture)")

            # Test 1: Single planning session with multiple steps
            # Test 1: Single planning session with workflow architecture
            if not self._test_single_planning_session():
                return False

            # Test 2: Plan completion and continuation to new planning session
            if not self._test_plan_continuation():
            # Test 2: Planning with continuation using workflow
            if not self._test_planning_with_continuation():
                return False

            # Test 3: Branching and revision capabilities
            # Test 3: Complex plan with deep thinking pauses
            if not self._test_complex_plan_deep_thinking():
                return False

            # Test 4: Self-contained completion (no expert analysis)
            if not self._test_self_contained_completion():
                return False

            # Test 5: Branching and revision with workflow
            if not self._test_branching_and_revision():
                return False

            # Test 6: Workflow file context behavior
            if not self._test_workflow_file_context():
                return False

            self.logger.info(" ✅ All planner validation tests passed")
            return True

        except Exception as e:
            self.logger.error(f"Planner validation test failed: {e}")
            self.logger.error(f"PlannerWorkflow validation test failed: {e}")
            return False

    def _test_single_planning_session(self) -> bool:
        """Test a complete planning session with multiple steps"""
        """Test a complete planning session with workflow architecture"""
        try:
            self.logger.info(" 1.1: Testing single planning session")
            self.logger.info(" 1.1: Testing single planning session with workflow")

            # Step 1: Start planning
            self.logger.info(" 1.1.1: Step 1 - Initial planning step")
            response1, continuation_id = self.call_mcp_tool(
                "planner",
                {
                    "step": "I need to plan a microservices migration for our monolithic e-commerce platform. Let me start by understanding the current architecture and identifying the key business domains.",
                    "step": "I need to plan a comprehensive API redesign for our legacy system. Let me start by analyzing the current state and identifying key requirements for the new API architecture.",
                    "step_number": 1,
                    "total_steps": 5,
                    "total_steps": 4,
                    "next_step_required": True,
                    "model": "flash",
                },
            )

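The workflow responses validated in the hunks below carry their status under a tool-specific key ending in "_status" alongside a required_actions list, and a final planner step reports status "planner_complete" with planning_complete set and a plan_summary. A tiny sketch of the flexible status lookup the test performs; the helper name is illustrative:

```python
def find_workflow_status(response_data: dict) -> dict:
    """Return the tool-specific *_status block from a workflow response (sketch)."""
    for key, value in response_data.items():
        if key.endswith("_status"):  # e.g. a planner-specific status key
            return value if isinstance(value, dict) else {}
    return {}
```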
@@ -80,22 +91,44 @@ class PlannerValidationTest(ConversationBaseTest):
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Validate step 1 response structure
|
||||
if not self._validate_step_response(response1_data, 1, 5, True, "planning_success"):
|
||||
# Validate step 1 response structure - expect pause_for_planner for next_step_required=True
|
||||
if not self._validate_step_response(response1_data, 1, 4, True, "pause_for_planner"):
|
||||
return False
|
||||
|
||||
self.logger.info(f" ✅ Step 1 successful, continuation_id: {continuation_id}")
|
||||
# Debug: Log the actual response structure to see what we're getting
|
||||
self.logger.debug(f"Response structure: {list(response1_data.keys())}")
|
||||
|
||||
# Check workflow-specific response structure (more flexible)
|
||||
status_key = None
|
||||
for key in response1_data.keys():
|
||||
if key.endswith("_status"):
|
||||
status_key = key
|
||||
break
|
||||
|
||||
if not status_key:
|
||||
self.logger.error(f"Missing workflow status field in response: {list(response1_data.keys())}")
|
||||
return False
|
||||
|
||||
self.logger.debug(f"Found status field: {status_key}")
|
||||
|
||||
# Check required_actions for workflow guidance
|
||||
if not response1_data.get("required_actions"):
|
||||
self.logger.error("Missing required_actions in workflow response")
|
||||
return False
|
||||
|
||||
self.logger.info(f" ✅ Step 1 successful with workflow, continuation_id: {continuation_id}")
|
||||
|
||||
# Step 2: Continue planning
|
||||
self.logger.info(" 1.1.2: Step 2 - Domain identification")
|
||||
self.logger.info(" 1.1.2: Step 2 - API domain analysis")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Based on my analysis, I can identify the main business domains: User Management, Product Catalog, Order Processing, Payment, and Inventory. Let me plan how to extract these into separate services.",
|
||||
"step": "After analyzing the current API, I can identify three main domains: User Management, Content Management, and Analytics. Let me design the new API structure with RESTful endpoints and proper versioning.",
|
||||
"step_number": 2,
|
||||
"total_steps": 5,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
@@ -104,21 +137,39 @@ class PlannerValidationTest(ConversationBaseTest):
|
||||
return False
|
||||
|
||||
response2_data = self._parse_planner_response(response2)
|
||||
if not self._validate_step_response(response2_data, 2, 5, True, "planning_success"):
|
||||
if not self._validate_step_response(response2_data, 2, 4, True, "pause_for_planner"):
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 2 successful")
|
||||
# Check step history tracking in workflow (more flexible)
|
||||
status_key = None
|
||||
for key in response2_data.keys():
|
||||
if key.endswith("_status"):
|
||||
status_key = key
|
||||
break
|
||||
|
||||
# Step 3: Final step
|
||||
if status_key:
|
||||
workflow_status = response2_data.get(status_key, {})
|
||||
step_history_length = workflow_status.get("step_history_length", 0)
|
||||
if step_history_length < 2:
|
||||
self.logger.error(f"Step history not properly tracked in workflow: {step_history_length}")
|
||||
return False
|
||||
self.logger.debug(f"Step history length: {step_history_length}")
|
||||
else:
|
||||
self.logger.warning("No workflow status found, skipping step history check")
|
||||
|
||||
self.logger.info(" ✅ Step 2 successful with workflow tracking")
|
||||
|
||||
# Step 3: Final step - should trigger completion
|
||||
self.logger.info(" 1.1.3: Step 3 - Final planning step")
|
||||
response3, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Now I'll create a phased migration strategy: Phase 1 - Extract User Management, Phase 2 - Product Catalog and Inventory, Phase 3 - Order Processing and Payment services. This completes the initial migration plan.",
|
||||
"step": "API redesign plan complete: Phase 1 - User Management API, Phase 2 - Content Management API, Phase 3 - Analytics API. Each phase includes proper authentication, rate limiting, and comprehensive documentation.",
|
||||
"step_number": 3,
|
||||
"total_steps": 3, # Adjusted total
|
||||
"next_step_required": False, # Final step
|
||||
"next_step_required": False, # Final step - should complete without expert analysis
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
@@ -127,125 +178,329 @@ class PlannerValidationTest(ConversationBaseTest):
|
||||
return False
|
||||
|
||||
response3_data = self._parse_planner_response(response3)
|
||||
if not self._validate_final_step_response(response3_data, 3, 3):
|
||||
if not response3_data:
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Planning session completed successfully")
|
||||
# Validate final response structure - should be self-contained completion
|
||||
if response3_data.get("status") != "planner_complete":
|
||||
self.logger.error(f"Expected status 'planner_complete', got '{response3_data.get('status')}'")
|
||||
return False
|
||||
|
||||
if not response3_data.get("planning_complete"):
|
||||
self.logger.error("Expected planning_complete=true for final step")
|
||||
return False
|
||||
|
||||
# Should NOT have expert_analysis (self-contained)
|
||||
if "expert_analysis" in response3_data:
|
||||
self.logger.error("PlannerWorkflow should be self-contained without expert analysis")
|
||||
return False
|
||||
|
||||
# Check plan_summary exists
|
||||
if not response3_data.get("plan_summary"):
|
||||
self.logger.error("Missing plan_summary in final step")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Planning session completed successfully with workflow architecture")
|
||||
|
||||
# Store continuation_id for next test
|
||||
self.migration_continuation_id = continuation_id
|
||||
self.api_continuation_id = continuation_id
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Single planning session test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_plan_continuation(self) -> bool:
|
||||
"""Test continuing from a previous completed plan"""
|
||||
def _test_planning_with_continuation(self) -> bool:
|
||||
"""Test planning continuation with workflow architecture"""
|
||||
try:
|
||||
self.logger.info(" 1.2: Testing plan continuation with previous context")
|
||||
self.logger.info(" 1.2: Testing planning continuation with workflow")
|
||||
|
||||
# Start a new planning session using the continuation_id from previous completed plan
|
||||
self.logger.info(" 1.2.1: New planning session with previous plan context")
|
||||
response1, new_continuation_id = self.call_mcp_tool(
|
||||
# Use continuation from previous test if available
|
||||
continuation_id = getattr(self, "api_continuation_id", None)
|
||||
if not continuation_id:
|
||||
# Start fresh if no continuation available
|
||||
self.logger.info(" 1.2.0: Starting fresh planning session")
|
||||
response0, continuation_id = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Planning API security strategy",
|
||||
"step_number": 1,
|
||||
"total_steps": 2,
|
||||
"next_step_required": True,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
if not response0 or not continuation_id:
|
||||
self.logger.error("Failed to start fresh planning session")
|
||||
return False
|
||||
|
||||
# Test continuation step
|
||||
self.logger.info(" 1.2.1: Continue planning session")
|
||||
response1, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Now that I have the microservices migration plan, let me plan the database strategy. I need to decide how to handle data consistency across the new services.",
|
||||
"step_number": 1, # New planning session starts at step 1
|
||||
"total_steps": 4,
|
||||
"step": "Building on the API redesign, let me now plan the security implementation with OAuth 2.0, API keys, and rate limiting strategies.",
|
||||
"step_number": 2,
|
||||
"total_steps": 2,
|
||||
"next_step_required": True,
|
||||
"continuation_id": self.migration_continuation_id, # Use previous plan's continuation_id
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not new_continuation_id:
|
||||
self.logger.error("Failed to start new planning session with context")
|
||||
if not response1:
|
||||
self.logger.error("Failed to continue planning")
|
||||
return False
|
||||
|
||||
response1_data = self._parse_planner_response(response1)
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Should have previous plan context
|
||||
if "previous_plan_context" not in response1_data:
|
||||
self.logger.error("Expected previous_plan_context in new planning session")
|
||||
# Validate continuation behavior
|
||||
if not self._validate_step_response(response1_data, 2, 2, True, "pause_for_planner"):
|
||||
return False
|
||||
|
||||
# Check for key terms from the previous plan
|
||||
context = response1_data["previous_plan_context"].lower()
|
||||
if "migration" not in context and "plan" not in context:
|
||||
self.logger.error("Previous plan context doesn't contain expected content")
|
||||
# Check that continuation_id is preserved
|
||||
if response1_data.get("continuation_id") != continuation_id:
|
||||
self.logger.error("Continuation ID not preserved in workflow")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ New planning session loaded previous plan context")
|
||||
self.logger.info(" ✅ Planning continuation working with workflow")
|
||||
return True
|
||||
|
||||
# Continue the new planning session (step 2+ should NOT load context)
|
||||
self.logger.info(" 1.2.2: Continue new planning session (no context loading)")
|
||||
except Exception as e:
|
||||
self.logger.error(f"Planning continuation test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_complex_plan_deep_thinking(self) -> bool:
|
||||
"""Test complex plan with deep thinking pauses"""
|
||||
try:
|
||||
self.logger.info(" 1.3: Testing complex plan with deep thinking pauses")
|
||||
|
||||
# Start complex plan (≥5 steps) - should trigger deep thinking
|
||||
self.logger.info(" 1.3.1: Step 1 of complex plan (should trigger deep thinking)")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "I need to plan a complete digital transformation for our enterprise organization, including cloud migration, process automation, and cultural change management.",
|
||||
"step_number": 1,
|
||||
"total_steps": 8, # Complex plan ≥5 steps
|
||||
"next_step_required": True,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start complex planning")
|
||||
return False
|
||||
|
||||
response1_data = self._parse_planner_response(response1)
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Should trigger deep thinking pause for complex plan
|
||||
if response1_data.get("status") != "pause_for_deep_thinking":
|
||||
self.logger.error("Expected deep thinking pause for complex plan step 1")
|
||||
return False
|
||||
|
||||
if not response1_data.get("thinking_required"):
|
||||
self.logger.error("Expected thinking_required=true for complex plan")
|
||||
return False
|
||||
|
||||
# Check required thinking actions
|
||||
required_thinking = response1_data.get("required_thinking", [])
|
||||
if len(required_thinking) < 4:
|
||||
self.logger.error("Expected comprehensive thinking requirements for complex plan")
|
||||
return False
|
||||
|
||||
# Check for deep thinking guidance in next_steps
|
||||
next_steps = response1_data.get("next_steps", "")
|
||||
if "MANDATORY" not in next_steps or "deep thinking" not in next_steps.lower():
|
||||
self.logger.error("Expected mandatory deep thinking guidance")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Complex plan step 1 correctly triggered deep thinking pause")
|
||||
|
||||
# Step 2 of complex plan - should also trigger deep thinking
|
||||
self.logger.info(" 1.3.2: Step 2 of complex plan (should trigger deep thinking)")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "I'll implement a database-per-service pattern with eventual consistency using event sourcing for cross-service communication.",
|
||||
"step": "After deep analysis, I can see this transformation requires three parallel tracks: Technical Infrastructure, Business Process, and Human Capital. Let me design the coordination strategy.",
|
||||
"step_number": 2,
|
||||
"total_steps": 4,
|
||||
"total_steps": 8,
|
||||
"next_step_required": True,
|
||||
"continuation_id": new_continuation_id, # Same continuation, step 2
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue new planning session")
|
||||
self.logger.error("Failed to continue complex planning")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_planner_response(response2)
|
||||
if not response2_data:
|
||||
return False
|
||||
|
||||
# Step 2+ should NOT have previous_plan_context (only step 1 with continuation_id gets context)
|
||||
if "previous_plan_context" in response2_data:
|
||||
self.logger.error("Step 2 should NOT have previous_plan_context")
|
||||
# Step 2 should also trigger deep thinking for complex plans
|
||||
if response2_data.get("status") != "pause_for_deep_thinking":
|
||||
self.logger.error("Expected deep thinking pause for complex plan step 2")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 2 correctly has no previous context (as expected)")
|
||||
self.logger.info(" ✅ Complex plan step 2 correctly triggered deep thinking pause")
|
||||
|
||||
# Step 4 of complex plan - should use normal flow (after step 3)
|
||||
self.logger.info(" 1.3.3: Step 4 of complex plan (should use normal flow)")
|
||||
response4, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Now moving to tactical planning: Phase 1 execution details with specific timelines and resource allocation for the technical infrastructure track.",
|
||||
"step_number": 4,
|
||||
"total_steps": 8,
|
||||
"next_step_required": True,
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response4:
|
||||
self.logger.error("Failed to continue to step 4")
|
||||
return False
|
||||
|
||||
response4_data = self._parse_planner_response(response4)
|
||||
if not response4_data:
|
||||
return False
|
||||
|
||||
# Step 4 should use normal flow (no more deep thinking pauses)
|
||||
if response4_data.get("status") != "pause_for_planner":
|
||||
self.logger.error("Expected normal planning flow for step 4")
|
||||
return False
|
||||
|
||||
if response4_data.get("thinking_required"):
|
||||
self.logger.error("Step 4 should not require special thinking pause")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Complex plan transitions to normal flow after step 3")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Plan continuation test failed: {e}")
|
||||
self.logger.error(f"Complex plan deep thinking test failed: {e}")
|
||||
return False
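# Illustrative sketch only: a step-1 payload for a complex plan (total_steps >= 5)
# that would pass the deep-thinking checks above. Field names come from the
# assertions; the variable name and list entries are placeholder examples.
_EXAMPLE_DEEP_THINKING_PAUSE = {
    "status": "pause_for_deep_thinking",
    "thinking_required": True,
    "required_thinking": ["scope", "dependencies", "risks", "sequencing"],  # >= 4 items
    "next_steps": "MANDATORY: pause for deep thinking before the next step ...",
}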
|
||||
|
||||
def _test_branching_and_revision(self) -> bool:
|
||||
"""Test branching and revision capabilities"""
|
||||
def _test_self_contained_completion(self) -> bool:
|
||||
"""Test self-contained completion without expert analysis"""
|
||||
try:
|
||||
self.logger.info(" 1.3: Testing branching and revision capabilities")
|
||||
self.logger.info(" 1.4: Testing self-contained completion")
|
||||
|
||||
# Start a new planning session for testing branching
|
||||
self.logger.info(" 1.3.1: Start planning session for branching test")
|
||||
# Simple planning session that should complete without expert analysis
|
||||
self.logger.info(" 1.4.1: Simple planning session")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Let me plan the deployment strategy for the microservices. I'll consider different deployment options.",
|
||||
"step": "Planning a simple website redesign with new color scheme and improved navigation.",
|
||||
"step_number": 1,
|
||||
"total_steps": 4,
|
||||
"total_steps": 2,
|
||||
"next_step_required": True,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start branching test planning session")
|
||||
self.logger.error("Failed to start simple planning")
|
||||
return False
|
||||
|
||||
# Test branching
|
||||
self.logger.info(" 1.3.2: Create a branch from step 1")
|
||||
# Final step - should complete without expert analysis
|
||||
self.logger.info(" 1.4.2: Final step - self-contained completion")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Branch A: I'll explore Kubernetes deployment with service mesh (Istio) for advanced traffic management and observability.",
|
||||
"step": "Website redesign plan complete: Phase 1 - Update color palette and typography, Phase 2 - Redesign navigation structure and user flows.",
|
||||
"step_number": 2,
|
||||
"total_steps": 2,
|
||||
"next_step_required": False, # Final step
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to complete simple planning")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_planner_response(response2)
|
||||
if not response2_data:
|
||||
return False
|
||||
|
||||
# Validate self-contained completion
|
||||
if response2_data.get("status") != "planner_complete":
|
||||
self.logger.error("Expected self-contained completion status")
|
||||
return False
|
||||
|
||||
# Should NOT call expert analysis
|
||||
if "expert_analysis" in response2_data:
|
||||
self.logger.error("PlannerWorkflow should not call expert analysis")
|
||||
return False
|
||||
|
||||
# Should have planning_complete flag
|
||||
if not response2_data.get("planning_complete"):
|
||||
self.logger.error("Expected planning_complete=true")
|
||||
return False
|
||||
|
||||
# Should have plan_summary
|
||||
if not response2_data.get("plan_summary"):
|
||||
self.logger.error("Expected plan_summary in completion")
|
||||
return False
|
||||
|
||||
# Check completion instructions
|
||||
output = response2_data.get("output", {})
|
||||
if not output.get("instructions"):
|
||||
self.logger.error("Missing output instructions for plan presentation")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Self-contained completion working correctly")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Self-contained completion test failed: {e}")
|
||||
return False
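# Illustrative sketch only: the completion "output" block read by the check above.
# Only the asserted "instructions" key is shown; the value is a placeholder.
_EXAMPLE_COMPLETION_OUTPUT = {"instructions": "Present the complete plan to the user ..."}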
|
||||
|
||||
def _test_branching_and_revision(self) -> bool:
|
||||
"""Test branching and revision with workflow architecture"""
|
||||
try:
|
||||
self.logger.info(" 1.5: Testing branching and revision with workflow")
|
||||
|
||||
# Start planning session for branching test
|
||||
self.logger.info(" 1.5.1: Start planning for branching test")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Planning mobile app development strategy with different technology options to evaluate.",
|
||||
"step_number": 1,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start branching test")
|
||||
return False
|
||||
|
||||
# Create branch
|
||||
self.logger.info(" 1.5.2: Create branch for React Native approach")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Branch A: React Native approach - cross-platform development with shared codebase, faster development cycle, and consistent UI across platforms.",
|
||||
"step_number": 2,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"is_branch_point": True,
|
||||
"branch_from_step": 1,
|
||||
"branch_id": "kubernetes-istio",
|
||||
"branch_id": "react-native",
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
@@ -257,34 +512,35 @@ class PlannerValidationTest(ConversationBaseTest):
|
||||
if not response2_data:
|
||||
return False
|
||||
|
||||
# Validate branching metadata
|
||||
# Validate branching in workflow
|
||||
metadata = response2_data.get("metadata", {})
|
||||
if not metadata.get("is_branch_point"):
|
||||
self.logger.error("Branch point not properly recorded in metadata")
|
||||
self.logger.error("Branch point not recorded in workflow")
|
||||
return False
|
||||
|
||||
if metadata.get("branch_id") != "kubernetes-istio":
|
||||
if metadata.get("branch_id") != "react-native":
|
||||
self.logger.error("Branch ID not properly recorded")
|
||||
return False
|
||||
|
||||
if "kubernetes-istio" not in metadata.get("branches", []):
|
||||
self.logger.error("Branch not recorded in branches list")
|
||||
if "react-native" not in metadata.get("branches", []):
|
||||
self.logger.error("Branch not added to branches list")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Branching working correctly")
|
||||
self.logger.info(" ✅ Branching working with workflow architecture")
|
||||
|
||||
# Test revision
|
||||
self.logger.info(" 1.3.3: Revise step 2")
|
||||
self.logger.info(" 1.5.3: Test revision capability")
|
||||
response3, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Revision: Actually, let me revise the Kubernetes approach. I'll use a simpler deployment initially, then migrate to Kubernetes later.",
|
||||
"step": "Revision of step 2: After consideration, let me revise the React Native approach to include performance optimizations and native module integration for critical features.",
|
||||
"step_number": 3,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"is_step_revision": True,
|
||||
"revises_step_number": 2,
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
@@ -296,23 +552,87 @@ class PlannerValidationTest(ConversationBaseTest):
|
||||
if not response3_data:
|
||||
return False
|
||||
|
||||
# Validate revision metadata
|
||||
# Validate revision in workflow
|
||||
metadata = response3_data.get("metadata", {})
|
||||
if not metadata.get("is_step_revision"):
|
||||
self.logger.error("Step revision not properly recorded in metadata")
|
||||
self.logger.error("Step revision not recorded in workflow")
|
||||
return False
|
||||
|
||||
if metadata.get("revises_step_number") != 2:
|
||||
self.logger.error("Revised step number not properly recorded")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Revision working correctly")
|
||||
self.logger.info(" ✅ Revision working with workflow architecture")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Branching and revision test failed: {e}")
|
||||
return False
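# Illustrative sketch only: the metadata fields the branching and revision checks
# above look for. Values mirror the request parameters used in this test; the
# variable names are placeholders.
_EXAMPLE_BRANCH_METADATA = {
    "is_branch_point": True,
    "branch_id": "react-native",
    "branches": ["react-native"],
}
_EXAMPLE_REVISION_METADATA = {
    "is_step_revision": True,
    "revises_step_number": 2,
}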
|
||||
|
||||
def _test_workflow_file_context(self) -> bool:
|
||||
"""Test workflow file context behavior (should be minimal for planner)"""
|
||||
try:
|
||||
self.logger.info(" 1.6: Testing workflow file context behavior")
|
||||
|
||||
# Planner typically doesn't use files, but test the workflow handles this correctly
|
||||
self.logger.info(" 1.6.1: Planning step with no files (normal case)")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Planning data architecture for analytics platform.",
|
||||
"step_number": 1,
|
||||
"total_steps": 2,
|
||||
"next_step_required": True,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start workflow file context test")
|
||||
return False
|
||||
|
||||
response1_data = self._parse_planner_response(response1)
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Planner workflow should not have file_context since it doesn't use files
|
||||
if "file_context" in response1_data:
|
||||
self.logger.info(" ℹ️ Workflow file context present but should be minimal for planner")
|
||||
|
||||
# Final step
|
||||
self.logger.info(" 1.6.2: Final step (should complete without file embedding)")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Data architecture plan complete with data lakes, processing pipelines, and analytics layers.",
|
||||
"step_number": 2,
|
||||
"total_steps": 2,
|
||||
"next_step_required": False,
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to complete workflow file context test")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_planner_response(response2)
|
||||
if not response2_data:
|
||||
return False
|
||||
|
||||
# Final step should complete self-contained
|
||||
if response2_data.get("status") != "planner_complete":
|
||||
self.logger.error("Expected self-contained completion for planner workflow")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Workflow file context behavior appropriate for planner")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Workflow file context test failed: {e}")
|
||||
return False
|
||||
|
||||
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
|
||||
"""Call an MCP tool in-process - override for planner-specific response handling"""
|
||||
# Use in-process implementation to maintain conversation memory
|
||||
@@ -329,7 +649,7 @@ class PlannerValidationTest(ConversationBaseTest):
|
||||
def _extract_planner_continuation_id(self, response_text: str) -> Optional[str]:
|
||||
"""Extract continuation_id from planner response"""
|
||||
try:
|
||||
# Parse the response - it's now direct JSON, not wrapped
|
||||
# Parse the response
|
||||
response_data = json.loads(response_text)
|
||||
return response_data.get("continuation_id")
|
||||
|
||||
@@ -340,7 +660,7 @@ class PlannerValidationTest(ConversationBaseTest):
|
||||
def _parse_planner_response(self, response_text: str) -> dict:
|
||||
"""Parse planner tool JSON response"""
|
||||
try:
|
||||
# Parse the response - it's now direct JSON, not wrapped
|
||||
# Parse the response - it should be direct JSON
|
||||
return json.loads(response_text)
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
@@ -356,7 +676,7 @@ class PlannerValidationTest(ConversationBaseTest):
|
||||
expected_next_required: bool,
|
||||
expected_status: str,
|
||||
) -> bool:
|
||||
"""Validate a planning step response structure"""
|
||||
"""Validate a planner step response structure"""
|
||||
try:
|
||||
# Check status
|
||||
if response_data.get("status") != expected_status:
|
||||
@@ -380,16 +700,11 @@ class PlannerValidationTest(ConversationBaseTest):
|
||||
)
|
||||
return False
|
||||
|
||||
# Check that step_content exists
|
||||
# Check step_content exists
|
||||
if not response_data.get("step_content"):
|
||||
self.logger.error("Missing step_content in response")
|
||||
return False
|
||||
|
||||
# Check metadata exists
|
||||
if "metadata" not in response_data:
|
||||
self.logger.error("Missing metadata in response")
|
||||
return False
|
||||
|
||||
# Check next_steps guidance
|
||||
if not response_data.get("next_steps"):
|
||||
self.logger.error("Missing next_steps guidance in response")
|
||||
@@ -400,40 +715,3 @@ class PlannerValidationTest(ConversationBaseTest):
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error validating step response: {e}")
|
||||
return False
|
||||
|
||||
def _validate_final_step_response(self, response_data: dict, expected_step: int, expected_total: int) -> bool:
|
||||
"""Validate a final planning step response"""
|
||||
try:
|
||||
# Basic step validation
|
||||
if not self._validate_step_response(
|
||||
response_data, expected_step, expected_total, False, "planning_success"
|
||||
):
|
||||
return False
|
||||
|
||||
# Check planning_complete flag
|
||||
if not response_data.get("planning_complete"):
|
||||
self.logger.error("Expected planning_complete=true for final step")
|
||||
return False
|
||||
|
||||
# Check plan_summary exists
|
||||
if not response_data.get("plan_summary"):
|
||||
self.logger.error("Missing plan_summary in final step")
|
||||
return False
|
||||
|
||||
# Check plan_summary contains expected content
|
||||
plan_summary = response_data.get("plan_summary", "")
|
||||
if "COMPLETE PLAN:" not in plan_summary:
|
||||
self.logger.error("plan_summary doesn't contain 'COMPLETE PLAN:' marker")
|
||||
return False
|
||||
|
||||
# Check next_steps mentions completion
|
||||
next_steps = response_data.get("next_steps", "")
|
||||
if "complete" not in next_steps.lower():
|
||||
self.logger.error("next_steps doesn't indicate planning completion")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error validating final step response: {e}")
|
||||
return False
simulator_tests/test_planner_validation_old.py (new file, 439 lines)
@@ -0,0 +1,439 @@
#!/usr/bin/env python3
|
||||
"""
|
||||
Planner Tool Validation Test
|
||||
|
||||
Tests the planner tool's sequential planning capabilities including:
|
||||
- Step-by-step planning with proper JSON responses
|
||||
- Continuation logic across planning sessions
|
||||
- Branching and revision capabilities
|
||||
- Previous plan context loading
|
||||
- Plan completion and summary storage
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
from .conversation_base_test import ConversationBaseTest
|
||||
|
||||
|
||||
class PlannerValidationTest(ConversationBaseTest):
|
||||
"""Test planner tool's sequential planning and continuation features"""
|
||||
|
||||
@property
|
||||
def test_name(self) -> str:
|
||||
return "planner_validation"
|
||||
|
||||
@property
|
||||
def test_description(self) -> str:
|
||||
return "Planner tool sequential planning and continuation validation"
|
||||
|
||||
def run_test(self) -> bool:
|
||||
"""Test planner tool sequential planning capabilities"""
|
||||
# Set up the test environment
|
||||
self.setUp()
|
||||
|
||||
try:
|
||||
self.logger.info("Test: Planner tool validation")
|
||||
|
||||
# Test 1: Single planning session with multiple steps
|
||||
if not self._test_single_planning_session():
|
||||
return False
|
||||
|
||||
# Test 2: Plan completion and continuation to new planning session
|
||||
if not self._test_plan_continuation():
|
||||
return False
|
||||
|
||||
# Test 3: Branching and revision capabilities
|
||||
if not self._test_branching_and_revision():
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ All planner validation tests passed")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Planner validation test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_single_planning_session(self) -> bool:
|
||||
"""Test a complete planning session with multiple steps"""
|
||||
try:
|
||||
self.logger.info(" 1.1: Testing single planning session")
|
||||
|
||||
# Step 1: Start planning
|
||||
self.logger.info(" 1.1.1: Step 1 - Initial planning step")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "I need to plan a microservices migration for our monolithic e-commerce platform. Let me start by understanding the current architecture and identifying the key business domains.",
|
||||
"step_number": 1,
|
||||
"total_steps": 5,
|
||||
"next_step_required": True,
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to get initial planning response")
|
||||
return False
|
||||
|
||||
# Parse and validate JSON response
|
||||
response1_data = self._parse_planner_response(response1)
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Validate step 1 response structure
|
||||
if not self._validate_step_response(response1_data, 1, 5, True, "planning_success"):
|
||||
return False
|
||||
|
||||
self.logger.info(f" ✅ Step 1 successful, continuation_id: {continuation_id}")
|
||||
|
||||
# Step 2: Continue planning
|
||||
self.logger.info(" 1.1.2: Step 2 - Domain identification")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Based on my analysis, I can identify the main business domains: User Management, Product Catalog, Order Processing, Payment, and Inventory. Let me plan how to extract these into separate services.",
|
||||
"step_number": 2,
|
||||
"total_steps": 5,
|
||||
"next_step_required": True,
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue planning to step 2")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_planner_response(response2)
|
||||
if not self._validate_step_response(response2_data, 2, 5, True, "planning_success"):
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 2 successful")
|
||||
|
||||
# Step 3: Final step
|
||||
self.logger.info(" 1.1.3: Step 3 - Final planning step")
|
||||
response3, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Now I'll create a phased migration strategy: Phase 1 - Extract User Management, Phase 2 - Product Catalog and Inventory, Phase 3 - Order Processing and Payment services. This completes the initial migration plan.",
|
||||
"step_number": 3,
|
||||
"total_steps": 3, # Adjusted total
|
||||
"next_step_required": False, # Final step
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
if not response3:
|
||||
self.logger.error("Failed to complete planning session")
|
||||
return False
|
||||
|
||||
response3_data = self._parse_planner_response(response3)
|
||||
if not self._validate_final_step_response(response3_data, 3, 3):
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Planning session completed successfully")
|
||||
|
||||
# Store continuation_id for next test
|
||||
self.migration_continuation_id = continuation_id
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Single planning session test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_plan_continuation(self) -> bool:
|
||||
"""Test continuing from a previous completed plan"""
|
||||
try:
|
||||
self.logger.info(" 1.2: Testing plan continuation with previous context")
|
||||
|
||||
# Start a new planning session using the continuation_id from previous completed plan
|
||||
self.logger.info(" 1.2.1: New planning session with previous plan context")
|
||||
response1, new_continuation_id = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Now that I have the microservices migration plan, let me plan the database strategy. I need to decide how to handle data consistency across the new services.",
|
||||
"step_number": 1, # New planning session starts at step 1
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"continuation_id": self.migration_continuation_id, # Use previous plan's continuation_id
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not new_continuation_id:
|
||||
self.logger.error("Failed to start new planning session with context")
|
||||
return False
|
||||
|
||||
response1_data = self._parse_planner_response(response1)
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Should have previous plan context
|
||||
if "previous_plan_context" not in response1_data:
|
||||
self.logger.error("Expected previous_plan_context in new planning session")
|
||||
return False
|
||||
|
||||
# Check for key terms from the previous plan
|
||||
context = response1_data["previous_plan_context"].lower()
|
||||
if "migration" not in context and "plan" not in context:
|
||||
self.logger.error("Previous plan context doesn't contain expected content")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ New planning session loaded previous plan context")
|
||||
|
||||
# Continue the new planning session (step 2+ should NOT load context)
|
||||
self.logger.info(" 1.2.2: Continue new planning session (no context loading)")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "I'll implement a database-per-service pattern with eventual consistency using event sourcing for cross-service communication.",
|
||||
"step_number": 2,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"continuation_id": new_continuation_id, # Same continuation, step 2
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue new planning session")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_planner_response(response2)
|
||||
if not response2_data:
|
||||
return False
|
||||
|
||||
# Step 2+ should NOT have previous_plan_context (only step 1 with continuation_id gets context)
|
||||
if "previous_plan_context" in response2_data:
|
||||
self.logger.error("Step 2 should NOT have previous_plan_context")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 2 correctly has no previous context (as expected)")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Plan continuation test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_branching_and_revision(self) -> bool:
|
||||
"""Test branching and revision capabilities"""
|
||||
try:
|
||||
self.logger.info(" 1.3: Testing branching and revision capabilities")
|
||||
|
||||
# Start a new planning session for testing branching
|
||||
self.logger.info(" 1.3.1: Start planning session for branching test")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Let me plan the deployment strategy for the microservices. I'll consider different deployment options.",
|
||||
"step_number": 1,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start branching test planning session")
|
||||
return False
|
||||
|
||||
# Test branching
|
||||
self.logger.info(" 1.3.2: Create a branch from step 1")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Branch A: I'll explore Kubernetes deployment with service mesh (Istio) for advanced traffic management and observability.",
|
||||
"step_number": 2,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"is_branch_point": True,
|
||||
"branch_from_step": 1,
|
||||
"branch_id": "kubernetes-istio",
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to create branch")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_planner_response(response2)
|
||||
if not response2_data:
|
||||
return False
|
||||
|
||||
# Validate branching metadata
|
||||
metadata = response2_data.get("metadata", {})
|
||||
if not metadata.get("is_branch_point"):
|
||||
self.logger.error("Branch point not properly recorded in metadata")
|
||||
return False
|
||||
|
||||
if metadata.get("branch_id") != "kubernetes-istio":
|
||||
self.logger.error("Branch ID not properly recorded")
|
||||
return False
|
||||
|
||||
if "kubernetes-istio" not in metadata.get("branches", []):
|
||||
self.logger.error("Branch not recorded in branches list")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Branching working correctly")
|
||||
|
||||
# Test revision
|
||||
self.logger.info(" 1.3.3: Revise step 2")
|
||||
response3, _ = self.call_mcp_tool(
|
||||
"planner",
|
||||
{
|
||||
"step": "Revision: Actually, let me revise the Kubernetes approach. I'll use a simpler deployment initially, then migrate to Kubernetes later.",
|
||||
"step_number": 3,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"is_step_revision": True,
|
||||
"revises_step_number": 2,
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
if not response3:
|
||||
self.logger.error("Failed to create revision")
|
||||
return False
|
||||
|
||||
response3_data = self._parse_planner_response(response3)
|
||||
if not response3_data:
|
||||
return False
|
||||
|
||||
# Validate revision metadata
|
||||
metadata = response3_data.get("metadata", {})
|
||||
if not metadata.get("is_step_revision"):
|
||||
self.logger.error("Step revision not properly recorded in metadata")
|
||||
return False
|
||||
|
||||
if metadata.get("revises_step_number") != 2:
|
||||
self.logger.error("Revised step number not properly recorded")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Revision working correctly")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Branching and revision test failed: {e}")
|
||||
return False
|
||||
|
||||
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
|
||||
"""Call an MCP tool in-process - override for planner-specific response handling"""
|
||||
# Use in-process implementation to maintain conversation memory
|
||||
response_text, _ = self.call_mcp_tool_direct(tool_name, params)
|
||||
|
||||
if not response_text:
|
||||
return None, None
|
||||
|
||||
# Extract continuation_id from planner response specifically
|
||||
continuation_id = self._extract_planner_continuation_id(response_text)
|
||||
|
||||
return response_text, continuation_id
|
||||
|
||||
def _extract_planner_continuation_id(self, response_text: str) -> Optional[str]:
|
||||
"""Extract continuation_id from planner response"""
|
||||
try:
|
||||
# Parse the response - it's now direct JSON, not wrapped
|
||||
response_data = json.loads(response_text)
|
||||
return response_data.get("continuation_id")
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
self.logger.debug(f"Failed to parse response for planner continuation_id: {e}")
|
||||
return None
|
||||
|
||||
def _parse_planner_response(self, response_text: str) -> dict:
|
||||
"""Parse planner tool JSON response"""
|
||||
try:
|
||||
# Parse the response - it's now direct JSON, not wrapped
|
||||
return json.loads(response_text)
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
self.logger.error(f"Failed to parse planner response as JSON: {e}")
|
||||
self.logger.error(f"Response text: {response_text[:500]}...")
|
||||
return {}
|
||||
|
||||
def _validate_step_response(
|
||||
self,
|
||||
response_data: dict,
|
||||
expected_step: int,
|
||||
expected_total: int,
|
||||
expected_next_required: bool,
|
||||
expected_status: str,
|
||||
) -> bool:
|
||||
"""Validate a planning step response structure"""
|
||||
try:
|
||||
# Check status
|
||||
if response_data.get("status") != expected_status:
|
||||
self.logger.error(f"Expected status '{expected_status}', got '{response_data.get('status')}'")
|
||||
return False
|
||||
|
||||
# Check step number
|
||||
if response_data.get("step_number") != expected_step:
|
||||
self.logger.error(f"Expected step_number {expected_step}, got {response_data.get('step_number')}")
|
||||
return False
|
||||
|
||||
# Check total steps
|
||||
if response_data.get("total_steps") != expected_total:
|
||||
self.logger.error(f"Expected total_steps {expected_total}, got {response_data.get('total_steps')}")
|
||||
return False
|
||||
|
||||
# Check next_step_required
|
||||
if response_data.get("next_step_required") != expected_next_required:
|
||||
self.logger.error(
|
||||
f"Expected next_step_required {expected_next_required}, got {response_data.get('next_step_required')}"
|
||||
)
|
||||
return False
|
||||
|
||||
# Check that step_content exists
|
||||
if not response_data.get("step_content"):
|
||||
self.logger.error("Missing step_content in response")
|
||||
return False
|
||||
|
||||
# Check metadata exists
|
||||
if "metadata" not in response_data:
|
||||
self.logger.error("Missing metadata in response")
|
||||
return False
|
||||
|
||||
# Check next_steps guidance
|
||||
if not response_data.get("next_steps"):
|
||||
self.logger.error("Missing next_steps guidance in response")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error validating step response: {e}")
|
||||
return False
|
||||
|
||||
def _validate_final_step_response(self, response_data: dict, expected_step: int, expected_total: int) -> bool:
|
||||
"""Validate a final planning step response"""
|
||||
try:
|
||||
# Basic step validation
|
||||
if not self._validate_step_response(
|
||||
response_data, expected_step, expected_total, False, "planning_success"
|
||||
):
|
||||
return False
|
||||
|
||||
# Check planning_complete flag
|
||||
if not response_data.get("planning_complete"):
|
||||
self.logger.error("Expected planning_complete=true for final step")
|
||||
return False
|
||||
|
||||
# Check plan_summary exists
|
||||
if not response_data.get("plan_summary"):
|
||||
self.logger.error("Missing plan_summary in final step")
|
||||
return False
|
||||
|
||||
# Check plan_summary contains expected content
|
||||
plan_summary = response_data.get("plan_summary", "")
|
||||
if "COMPLETE PLAN:" not in plan_summary:
|
||||
self.logger.error("plan_summary doesn't contain 'COMPLETE PLAN:' marker")
|
||||
return False
|
||||
|
||||
# Check next_steps mentions completion
|
||||
next_steps = response_data.get("next_steps", "")
|
||||
if "complete" not in next_steps.lower():
|
||||
self.logger.error("next_steps doesn't indicate planning completion")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error validating final step response: {e}")
|
||||
return False
simulator_tests/test_precommitworkflow_validation.py (new file, 1081 lines)
File diff suppressed because it is too large
@@ -2,18 +2,19 @@
|
||||
"""
|
||||
TestGen Tool Validation Test
|
||||
|
||||
Tests the testgen tool by:
|
||||
- Creating a test code file with a specific function
|
||||
- Using testgen to generate tests with a specific function name
|
||||
- Validating that the output contains the expected test function
|
||||
- Confirming the format matches test generation patterns
|
||||
Tests the testgen tool's capabilities using the workflow architecture.
|
||||
This validates that the workflow-based implementation guides Claude through
|
||||
systematic test generation analysis before creating comprehensive test suites.
|
||||
"""
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
from .conversation_base_test import ConversationBaseTest
|
||||
|
||||
|
||||
class TestGenValidationTest(BaseSimulatorTest):
|
||||
"""Test testgen tool validation with specific function name"""
|
||||
class TestGenValidationTest(ConversationBaseTest):
|
||||
"""Test testgen tool with workflow architecture"""
|
||||
|
||||
@property
|
||||
def test_name(self) -> str:
|
||||
@@ -21,111 +22,812 @@ class TestGenValidationTest(BaseSimulatorTest):
|
||||
|
||||
@property
|
||||
def test_description(self) -> str:
|
||||
return "TestGen tool validation with specific test function"
|
||||
return "TestGen tool validation with step-by-step test planning"
|
||||
|
||||
def run_test(self) -> bool:
|
||||
"""Test testgen tool with specific function name validation"""
|
||||
"""Test testgen tool capabilities"""
|
||||
# Set up the test environment
|
||||
self.setUp()
|
||||
|
||||
try:
|
||||
self.logger.info("Test: TestGen tool validation")
|
||||
|
||||
# Setup test files
|
||||
self.setup_test_files()
|
||||
# Create sample code files to test
|
||||
self._create_test_code_files()
|
||||
|
||||
# Create a specific code file for test generation
|
||||
test_code_content = '''"""
|
||||
Sample authentication module for testing testgen
|
||||
"""
|
||||
|
||||
class UserAuthenticator:
|
||||
"""Handles user authentication logic"""
|
||||
|
||||
def __init__(self):
|
||||
self.failed_attempts = {}
|
||||
self.max_attempts = 3
|
||||
|
||||
def validate_password(self, username, password):
|
||||
"""Validate user password with security checks"""
|
||||
if not username or not password:
|
||||
return False
|
||||
|
||||
if username in self.failed_attempts:
|
||||
if self.failed_attempts[username] >= self.max_attempts:
|
||||
return False # Account locked
|
||||
|
||||
# Simple validation for demo
|
||||
if len(password) < 8:
|
||||
self._record_failed_attempt(username)
|
||||
return False
|
||||
|
||||
if password == "password123": # Demo valid password
|
||||
self._reset_failed_attempts(username)
|
||||
return True
|
||||
|
||||
self._record_failed_attempt(username)
|
||||
return False
|
||||
|
||||
def _record_failed_attempt(self, username):
|
||||
"""Record a failed login attempt"""
|
||||
self.failed_attempts[username] = self.failed_attempts.get(username, 0) + 1
|
||||
|
||||
def _reset_failed_attempts(self, username):
|
||||
"""Reset failed attempts after successful login"""
|
||||
if username in self.failed_attempts:
|
||||
del self.failed_attempts[username]
|
||||
'''
|
||||
|
||||
# Create the auth code file
|
||||
auth_file = self.create_additional_test_file("user_auth.py", test_code_content)
|
||||
|
||||
# Test testgen tool with specific requirements
|
||||
self.logger.info(" 1.1: Generate tests with specific function name")
|
||||
response, continuation_id = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"files": [auth_file],
|
||||
"prompt": "Generate comprehensive tests for the UserAuthenticator.validate_password method. Include tests for edge cases, security scenarios, and account locking. Use the specific test function name 'test_password_validation_edge_cases' for one of the test methods.",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response:
|
||||
self.logger.error("Failed to get testgen response")
|
||||
# Test 1: Single investigation session with multiple steps
|
||||
if not self._test_single_test_generation_session():
|
||||
return False
|
||||
|
||||
self.logger.info(" 1.2: Validate response contains expected test function")
|
||||
|
||||
# Check that the response contains the specific test function name
|
||||
if "test_password_validation_edge_cases" not in response:
|
||||
self.logger.error("Response does not contain the requested test function name")
|
||||
self.logger.debug(f"Response content: {response[:500]}...")
|
||||
# Test 2: Test generation with pattern following
|
||||
if not self._test_generation_with_pattern_following():
|
||||
return False
|
||||
|
||||
# Check for common test patterns
|
||||
test_patterns = [
|
||||
"def test_", # Test function definition
|
||||
"assert", # Assertion statements
|
||||
"UserAuthenticator", # Class being tested
|
||||
"validate_password", # Method being tested
|
||||
]
|
||||
|
||||
missing_patterns = []
|
||||
for pattern in test_patterns:
|
||||
if pattern not in response:
|
||||
missing_patterns.append(pattern)
|
||||
|
||||
if missing_patterns:
|
||||
self.logger.error(f"Response missing expected test patterns: {missing_patterns}")
|
||||
self.logger.debug(f"Response content: {response[:500]}...")
|
||||
# Test 3: Complete test generation with expert analysis
|
||||
if not self._test_complete_generation_with_analysis():
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ TestGen tool validation successful")
|
||||
self.logger.info(" ✅ Generated tests contain expected function name")
|
||||
self.logger.info(" ✅ Generated tests follow proper test patterns")
|
||||
# Test 4: Certain confidence behavior
|
||||
if not self._test_certain_confidence():
|
||||
return False
|
||||
|
||||
# Test 5: Context-aware file embedding
|
||||
if not self._test_context_aware_file_embedding():
|
||||
return False
|
||||
|
||||
# Test 6: Multi-step test planning
|
||||
if not self._test_multi_step_test_planning():
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ All testgen validation tests passed")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"TestGen validation test failed: {e}")
|
||||
return False
|
||||
finally:
|
||||
self.cleanup_test_files()
|
||||
|
||||
def _create_test_code_files(self):
|
||||
"""Create sample code files for test generation"""
|
||||
# Create a calculator module with various functions
|
||||
calculator_code = """#!/usr/bin/env python3
|
||||
\"\"\"
|
||||
Simple calculator module for demonstration
|
||||
\"\"\"
|
||||
|
||||
def add(a, b):
|
||||
\"\"\"Add two numbers\"\"\"
|
||||
return a + b
|
||||
|
||||
def subtract(a, b):
|
||||
\"\"\"Subtract b from a\"\"\"
|
||||
return a - b
|
||||
|
||||
def multiply(a, b):
|
||||
\"\"\"Multiply two numbers\"\"\"
|
||||
return a * b
|
||||
|
||||
def divide(a, b):
|
||||
\"\"\"Divide a by b\"\"\"
|
||||
if b == 0:
|
||||
raise ValueError("Cannot divide by zero")
|
||||
return a / b
|
||||
|
||||
def calculate_percentage(value, percentage):
|
||||
\"\"\"Calculate percentage of a value\"\"\"
|
||||
if percentage < 0:
|
||||
raise ValueError("Percentage cannot be negative")
|
||||
if percentage > 100:
|
||||
raise ValueError("Percentage cannot exceed 100")
|
||||
return (value * percentage) / 100
|
||||
|
||||
def power(base, exponent):
|
||||
\"\"\"Calculate base raised to exponent\"\"\"
|
||||
if base == 0 and exponent < 0:
|
||||
raise ValueError("Cannot raise 0 to negative power")
|
||||
return base ** exponent
|
||||
"""
|
||||
|
||||
# Create test file
|
||||
self.calculator_file = self.create_additional_test_file("calculator.py", calculator_code)
|
||||
self.logger.info(f" ✅ Created calculator module: {self.calculator_file}")
|
||||
|
||||
# Create a simple existing test file to use as pattern
|
||||
existing_test = """#!/usr/bin/env python3
|
||||
import pytest
|
||||
from calculator import add, subtract
|
||||
|
||||
class TestCalculatorBasic:
|
||||
\"\"\"Test basic calculator operations\"\"\"
|
||||
|
||||
def test_add_positive_numbers(self):
|
||||
\"\"\"Test adding two positive numbers\"\"\"
|
||||
assert add(2, 3) == 5
|
||||
assert add(10, 20) == 30
|
||||
|
||||
def test_add_negative_numbers(self):
|
||||
\"\"\"Test adding negative numbers\"\"\"
|
||||
assert add(-5, -3) == -8
|
||||
assert add(-10, 5) == -5
|
||||
|
||||
def test_subtract_positive(self):
|
||||
\"\"\"Test subtracting positive numbers\"\"\"
|
||||
assert subtract(10, 3) == 7
|
||||
assert subtract(5, 5) == 0
|
||||
"""
|
||||
|
||||
self.existing_test_file = self.create_additional_test_file("test_calculator_basic.py", existing_test)
|
||||
self.logger.info(f" ✅ Created existing test file: {self.existing_test_file}")
|
||||
|
||||
def _test_single_test_generation_session(self) -> bool:
|
||||
"""Test a complete test generation session with multiple steps"""
|
||||
try:
|
||||
self.logger.info(" 1.1: Testing single test generation session")
|
||||
|
||||
# Step 1: Start investigation
|
||||
self.logger.info(" 1.1.1: Step 1 - Initial test planning")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "I need to generate comprehensive tests for the calculator module. Let me start by analyzing the code structure and understanding the functionality.",
|
||||
"step_number": 1,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Calculator module contains 6 functions: add, subtract, multiply, divide, calculate_percentage, and power. Each has specific error conditions that need testing.",
|
||||
"files_checked": [self.calculator_file],
|
||||
"relevant_files": [self.calculator_file],
|
||||
"relevant_context": ["add", "subtract", "multiply", "divide", "calculate_percentage", "power"],
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to get initial test planning response")
|
||||
return False
|
||||
|
||||
# Parse and validate JSON response
|
||||
response1_data = self._parse_testgen_response(response1)
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Validate step 1 response structure
|
||||
if not self._validate_step_response(response1_data, 1, 4, True, "pause_for_test_analysis"):
|
||||
return False
|
||||
|
||||
self.logger.info(f" ✅ Step 1 successful, continuation_id: {continuation_id}")
|
||||
|
||||
# Step 2: Analyze test requirements
|
||||
self.logger.info(" 1.1.2: Step 2 - Test requirements analysis")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "Now analyzing the test requirements for each function, identifying edge cases and boundary conditions.",
|
||||
"step_number": 2,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Identified key test scenarios: (1) divide - zero division error, (2) calculate_percentage - negative/over 100 validation, (3) power - zero to negative power error. Need tests for normal cases and edge cases.",
|
||||
"files_checked": [self.calculator_file],
|
||||
"relevant_files": [self.calculator_file],
|
||||
"relevant_context": ["divide", "calculate_percentage", "power"],
|
||||
"confidence": "medium",
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue test planning to step 2")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_testgen_response(response2)
|
||||
if not self._validate_step_response(response2_data, 2, 4, True, "pause_for_test_analysis"):
|
||||
return False
|
||||
|
||||
# Check test generation status tracking
|
||||
test_status = response2_data.get("test_generation_status", {})
|
||||
if test_status.get("test_scenarios_identified", 0) < 3:
|
||||
self.logger.error("Test scenarios not properly tracked")
|
||||
return False
|
||||
|
||||
if test_status.get("analysis_confidence") != "medium":
|
||||
self.logger.error("Confidence level not properly tracked")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 2 successful with proper tracking")
|
||||
|
||||
# Store continuation_id for next test
|
||||
self.test_continuation_id = continuation_id
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Single test generation session test failed: {e}")
|
||||
return False
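# Illustrative sketch only: the step-2 tracking fields asserted above. Field names
# come from the checks; the variable name and counts are placeholder examples.
_EXAMPLE_TESTGEN_STEP2 = {
    "status": "pause_for_test_analysis",
    "test_generation_status": {
        "test_scenarios_identified": 3,  # must be >= 3
        "analysis_confidence": "medium",
    },
}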
|
||||
|
||||
def _test_generation_with_pattern_following(self) -> bool:
|
||||
"""Test test generation following existing patterns"""
|
||||
try:
|
||||
self.logger.info(" 1.2: Testing test generation with pattern following")
|
||||
|
||||
# Start a new investigation with existing test patterns
|
||||
self.logger.info(" 1.2.1: Start test generation with pattern reference")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "Generating tests for remaining calculator functions following existing test patterns",
|
||||
"step_number": 1,
|
||||
"total_steps": 3,
|
||||
"next_step_required": True,
|
||||
"findings": "Found existing test pattern using pytest with class-based organization and descriptive test names",
|
||||
"files_checked": [self.calculator_file, self.existing_test_file],
|
||||
"relevant_files": [self.calculator_file, self.existing_test_file],
|
||||
"relevant_context": ["TestCalculatorBasic", "multiply", "divide", "calculate_percentage", "power"],
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start pattern following test")
|
||||
return False
|
||||
|
||||
# Step 2: Analyze patterns
|
||||
self.logger.info(" 1.2.2: Step 2 - Pattern analysis")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "Analyzing the existing test patterns to maintain consistency",
|
||||
"step_number": 2,
|
||||
"total_steps": 3,
|
||||
"next_step_required": True,
|
||||
"findings": "Existing tests use: class-based organization (TestCalculatorBasic), descriptive method names (test_operation_scenario), multiple assertions per test, pytest framework",
|
||||
"files_checked": [self.existing_test_file],
|
||||
"relevant_files": [self.calculator_file, self.existing_test_file],
|
||||
"confidence": "high",
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue to step 2")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Pattern analysis successful")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Pattern following test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_complete_generation_with_analysis(self) -> bool:
|
||||
"""Test complete test generation ending with expert analysis"""
|
||||
try:
|
||||
self.logger.info(" 1.3: Testing complete test generation with expert analysis")
|
||||
|
||||
# Use the continuation from first test or start fresh
|
||||
continuation_id = getattr(self, "test_continuation_id", None)
|
||||
if not continuation_id:
|
||||
# Start fresh if no continuation available
|
||||
self.logger.info(" 1.3.0: Starting fresh test generation")
|
||||
response0, continuation_id = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "Analyzing calculator module for comprehensive test generation",
|
||||
"step_number": 1,
|
||||
"total_steps": 2,
|
||||
"next_step_required": True,
|
||||
"findings": "Identified 6 functions needing tests with various edge cases",
|
||||
"files_checked": [self.calculator_file],
|
||||
"relevant_files": [self.calculator_file],
|
||||
"relevant_context": ["add", "subtract", "multiply", "divide", "calculate_percentage", "power"],
|
||||
},
|
||||
)
|
||||
if not response0 or not continuation_id:
|
||||
self.logger.error("Failed to start fresh test generation")
|
||||
return False
|
||||
|
||||
# Final step - trigger expert analysis
|
||||
self.logger.info(" 1.3.1: Final step - complete test planning")
|
||||
response_final, _ = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "Test planning complete. Identified all test scenarios including edge cases, error conditions, and boundary values for comprehensive coverage.",
|
||||
"step_number": 2,
|
||||
"total_steps": 2,
|
||||
"next_step_required": False, # Final step - triggers expert analysis
|
||||
"findings": "Complete test plan: normal operations, edge cases (zero, negative), error conditions (divide by zero, invalid percentage, zero to negative power), boundary values",
|
||||
"files_checked": [self.calculator_file],
|
||||
"relevant_files": [self.calculator_file],
|
||||
"relevant_context": ["add", "subtract", "multiply", "divide", "calculate_percentage", "power"],
|
||||
"confidence": "high",
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash", # Use flash for expert analysis
|
||||
},
|
||||
)
|
||||
|
||||
if not response_final:
|
||||
self.logger.error("Failed to complete test generation")
|
||||
return False
|
||||
|
||||
response_final_data = self._parse_testgen_response(response_final)
|
||||
if not response_final_data:
|
||||
return False
|
||||
|
||||
# Validate final response structure
|
||||
if response_final_data.get("status") != "calling_expert_analysis":
|
||||
self.logger.error(
|
||||
f"Expected status 'calling_expert_analysis', got '{response_final_data.get('status')}'"
|
||||
)
|
||||
return False
|
||||
|
||||
if not response_final_data.get("test_generation_complete"):
|
||||
self.logger.error("Expected test_generation_complete=true for final step")
|
||||
return False
|
||||
|
||||
# Check for expert analysis
|
||||
if "expert_analysis" not in response_final_data:
|
||||
self.logger.error("Missing expert_analysis in final response")
|
||||
return False
|
||||
|
||||
expert_analysis = response_final_data.get("expert_analysis", {})
|
||||
|
||||
# Check for expected analysis content
|
||||
analysis_text = json.dumps(expert_analysis).lower()
|
||||
|
||||
# Look for test generation indicators
|
||||
test_indicators = ["test", "edge", "boundary", "error", "coverage", "pytest"]
|
||||
found_indicators = sum(1 for indicator in test_indicators if indicator in analysis_text)
|
||||
|
||||
if found_indicators >= 4:
|
||||
self.logger.info(" ✅ Expert analysis provided comprehensive test suggestions")
|
||||
else:
|
||||
self.logger.warning(
|
||||
f" ⚠️ Expert analysis may not have fully addressed test generation (found {found_indicators}/6 indicators)"
|
||||
)
|
||||
|
||||
# Check complete test generation summary
|
||||
if "complete_test_generation" not in response_final_data:
|
||||
self.logger.error("Missing complete_test_generation in final response")
|
||||
return False
|
||||
|
||||
complete_generation = response_final_data["complete_test_generation"]
|
||||
if not complete_generation.get("relevant_context"):
|
||||
self.logger.error("Missing relevant context in complete test generation")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Complete test generation with expert analysis successful")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Complete test generation test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_certain_confidence(self) -> bool:
|
||||
"""Test certain confidence behavior - should skip expert analysis"""
|
||||
try:
|
||||
self.logger.info(" 1.4: Testing certain confidence behavior")
|
||||
|
||||
# Test certain confidence - should skip expert analysis
|
||||
self.logger.info(" 1.4.1: Certain confidence test generation")
|
||||
response_certain, _ = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "I have fully analyzed the code and identified all test scenarios with 100% certainty. Test plan is complete.",
|
||||
"step_number": 1,
|
||||
"total_steps": 1,
|
||||
"next_step_required": False, # Final step
|
||||
"findings": "Complete test coverage plan: all functions covered with normal cases, edge cases, and error conditions. Ready for implementation.",
|
||||
"files_checked": [self.calculator_file],
|
||||
"relevant_files": [self.calculator_file],
|
||||
"relevant_context": ["add", "subtract", "multiply", "divide", "calculate_percentage", "power"],
|
||||
"confidence": "certain", # This should skip expert analysis
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response_certain:
|
||||
self.logger.error("Failed to test certain confidence")
|
||||
return False
|
||||
|
||||
response_certain_data = self._parse_testgen_response(response_certain)
|
||||
if not response_certain_data:
|
||||
return False
|
||||
|
||||
# Validate certain confidence response - should skip expert analysis
|
||||
if response_certain_data.get("status") != "test_generation_complete_ready_for_implementation":
|
||||
self.logger.error(
|
||||
f"Expected status 'test_generation_complete_ready_for_implementation', got '{response_certain_data.get('status')}'"
|
||||
)
|
||||
return False
|
||||
|
||||
if not response_certain_data.get("skip_expert_analysis"):
|
||||
self.logger.error("Expected skip_expert_analysis=true for certain confidence")
|
||||
return False
|
||||
|
||||
expert_analysis = response_certain_data.get("expert_analysis", {})
|
||||
if expert_analysis.get("status") != "skipped_due_to_certain_test_confidence":
|
||||
self.logger.error("Expert analysis should be skipped for certain confidence")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Certain confidence behavior working correctly")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Certain confidence test failed: {e}")
|
||||
return False
|
||||
|
||||
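The confidence handling exercised above reduces to a small decision rule. The sketch below is illustrative only: it restates the statuses asserted by these tests (a final step with confidence "certain" skips the external model, any other final step calls expert analysis, intermediate steps are not terminal) and is not part of the tool implementation.

from typing import Optional  # already imported at the top of this module


def expected_testgen_final_status(confidence: str, next_step_required: bool) -> Optional[str]:
    """Sketch of the terminal-status contract these tests assert (illustrative only)."""
    if next_step_required:
        # Intermediate steps are not terminal; the tool pauses for further investigation
        return None
    if confidence == "certain":
        # Certain confidence skips the external model entirely
        return "test_generation_complete_ready_for_implementation"
    # Any other final step hands the findings to the expert model
    return "calling_expert_analysis"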
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
|
||||
"""Call an MCP tool in-process - override for testgen-specific response handling"""
|
||||
# Use in-process implementation to maintain conversation memory
|
||||
response_text, _ = self.call_mcp_tool_direct(tool_name, params)
|
||||
|
||||
if not response_text:
|
||||
return None, None
|
||||
|
||||
# Extract continuation_id from testgen response specifically
|
||||
continuation_id = self._extract_testgen_continuation_id(response_text)
|
||||
|
||||
return response_text, continuation_id
|
||||
|
||||
def _extract_testgen_continuation_id(self, response_text: str) -> Optional[str]:
|
||||
"""Extract continuation_id from testgen response"""
|
||||
try:
|
||||
# Parse the response
|
||||
response_data = json.loads(response_text)
|
||||
return response_data.get("continuation_id")
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
self.logger.debug(f"Failed to parse response for testgen continuation_id: {e}")
|
||||
return None
|
||||
|
||||
def _parse_testgen_response(self, response_text: str) -> dict:
|
||||
"""Parse testgen tool JSON response"""
|
||||
try:
|
||||
# Parse the response - it should be direct JSON
|
||||
return json.loads(response_text)
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
self.logger.error(f"Failed to parse testgen response as JSON: {e}")
|
||||
self.logger.error(f"Response text: {response_text[:500]}...")
|
||||
return {}
|
||||
|
||||
def _validate_step_response(
|
||||
self,
|
||||
response_data: dict,
|
||||
expected_step: int,
|
||||
expected_total: int,
|
||||
expected_next_required: bool,
|
||||
expected_status: str,
|
||||
) -> bool:
|
||||
"""Validate a test generation step response structure"""
|
||||
try:
|
||||
# Check status
|
||||
if response_data.get("status") != expected_status:
|
||||
self.logger.error(f"Expected status '{expected_status}', got '{response_data.get('status')}'")
|
||||
return False
|
||||
|
||||
# Check step number
|
||||
if response_data.get("step_number") != expected_step:
|
||||
self.logger.error(f"Expected step_number {expected_step}, got {response_data.get('step_number')}")
|
||||
return False
|
||||
|
||||
# Check total steps
|
||||
if response_data.get("total_steps") != expected_total:
|
||||
self.logger.error(f"Expected total_steps {expected_total}, got {response_data.get('total_steps')}")
|
||||
return False
|
||||
|
||||
# Check next_step_required
|
||||
if response_data.get("next_step_required") != expected_next_required:
|
||||
self.logger.error(
|
||||
f"Expected next_step_required {expected_next_required}, got {response_data.get('next_step_required')}"
|
||||
)
|
||||
return False
|
||||
|
||||
# Check test_generation_status exists
|
||||
if "test_generation_status" not in response_data:
|
||||
self.logger.error("Missing test_generation_status in response")
|
||||
return False
|
||||
|
||||
# Check next_steps guidance
|
||||
if not response_data.get("next_steps"):
|
||||
self.logger.error("Missing next_steps guidance in response")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error validating step response: {e}")
|
||||
return False
|
||||
|
||||
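For reference, a response that satisfies _validate_step_response has roughly the shape sketched below. Only the field names and checks mirror the code above; the concrete values are placeholders.

# Illustrative only: a minimal response dict that would pass _validate_step_response
# when its fields match the expected_step, expected_total, expected_next_required and
# expected_status arguments supplied by the caller.
example_step_response = {
    "status": "<expected_status passed by the caller>",
    "step_number": 1,  # must equal expected_step
    "total_steps": 2,  # must equal expected_total
    "next_step_required": True,  # must equal expected_next_required
    "test_generation_status": {},  # only presence is checked
    "next_steps": "Continue the investigation in step 2",  # must be non-empty
}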
def _test_context_aware_file_embedding(self) -> bool:
|
||||
"""Test context-aware file embedding optimization"""
|
||||
try:
|
||||
self.logger.info(" 1.5: Testing context-aware file embedding")
|
||||
|
||||
# Create additional test files
|
||||
utils_code = """#!/usr/bin/env python3
|
||||
def validate_number(n):
|
||||
\"\"\"Validate if input is a number\"\"\"
|
||||
return isinstance(n, (int, float))
|
||||
|
||||
def format_result(result):
|
||||
\"\"\"Format calculation result\"\"\"
|
||||
if isinstance(result, float):
|
||||
return round(result, 2)
|
||||
return result
|
||||
"""
|
||||
|
||||
math_helpers_code = """#!/usr/bin/env python3
|
||||
import math
|
||||
|
||||
def factorial(n):
|
||||
\"\"\"Calculate factorial of n\"\"\"
|
||||
if n < 0:
|
||||
raise ValueError("Factorial not defined for negative numbers")
|
||||
return math.factorial(n)
|
||||
|
||||
def is_prime(n):
|
||||
\"\"\"Check if number is prime\"\"\"
|
||||
if n < 2:
|
||||
return False
|
||||
for i in range(2, int(n**0.5) + 1):
|
||||
if n % i == 0:
|
||||
return False
|
||||
return True
|
||||
"""
|
||||
|
||||
# Create test files
|
||||
utils_file = self.create_additional_test_file("utils.py", utils_code)
|
||||
math_file = self.create_additional_test_file("math_helpers.py", math_helpers_code)
|
||||
|
||||
# Test 1: New conversation, intermediate step - should only reference files
|
||||
self.logger.info(" 1.5.1: New conversation intermediate step (should reference only)")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "Starting test generation for utility modules",
|
||||
"step_number": 1,
|
||||
"total_steps": 3,
|
||||
"next_step_required": True, # Intermediate step
|
||||
"findings": "Initial analysis of utility functions",
|
||||
"files_checked": [utils_file, math_file],
|
||||
"relevant_files": [utils_file], # This should be referenced, not embedded
|
||||
"relevant_context": ["validate_number", "format_result"],
|
||||
"confidence": "low",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start context-aware file embedding test")
|
||||
return False
|
||||
|
||||
response1_data = self._parse_testgen_response(response1)
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Check file context - should be reference_only for intermediate step
|
||||
file_context = response1_data.get("file_context", {})
|
||||
if file_context.get("type") != "reference_only":
|
||||
self.logger.error(f"Expected reference_only file context, got: {file_context.get('type')}")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Intermediate step correctly uses reference_only file context")
|
||||
|
||||
# Test 2: Final step - should embed files for expert analysis
|
||||
self.logger.info(" 1.5.2: Final step (should embed files)")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "Test planning complete - all test scenarios identified",
|
||||
"step_number": 2,
|
||||
"total_steps": 2,
|
||||
"next_step_required": False, # Final step - should embed files
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Complete test plan for all utility functions with edge cases",
|
||||
"files_checked": [utils_file, math_file],
|
||||
"relevant_files": [utils_file, math_file], # Should be fully embedded
|
||||
"relevant_context": ["validate_number", "format_result", "factorial", "is_prime"],
|
||||
"confidence": "high",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to complete the final step")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_testgen_response(response2)
|
||||
if not response2_data:
|
||||
return False
|
||||
|
||||
# Check file context - should be fully_embedded for final step
|
||||
file_context2 = response2_data.get("file_context", {})
|
||||
if file_context2.get("type") != "fully_embedded":
|
||||
self.logger.error(
|
||||
f"Expected fully_embedded file context for final step, got: {file_context2.get('type')}"
|
||||
)
|
||||
return False
|
||||
|
||||
# Verify expert analysis was called for final step
|
||||
if response2_data.get("status") != "calling_expert_analysis":
|
||||
self.logger.error("Final step should trigger expert analysis")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Context-aware file embedding test completed successfully")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Context-aware file embedding test failed: {e}")
|
||||
return False
|
||||
|
||||
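The file-context behaviour asserted in 1.5 amounts to a simple rule, sketched below for clarity. This restates the assertions above (intermediate steps reference files, the final step embeds them for expert analysis); it is not the tool's actual implementation.

def expected_file_context_type(next_step_required: bool) -> str:
    """Sketch of the context-aware embedding contract asserted above (illustrative only)."""
    if next_step_required:
        # Intermediate steps keep conversation context small: files are referenced, not embedded
        return "reference_only"
    # The final step embeds full file content so the expert model sees everything it needs
    return "fully_embedded"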
def _test_multi_step_test_planning(self) -> bool:
|
||||
"""Test multi-step test planning with complex code"""
|
||||
try:
|
||||
self.logger.info(" 1.6: Testing multi-step test planning")
|
||||
|
||||
# Create a complex class to test
|
||||
complex_code = """#!/usr/bin/env python3
|
||||
import asyncio
|
||||
from typing import Any, Dict, List
|
||||
|
||||
class DataProcessor:
|
||||
\"\"\"Complex data processor with async operations\"\"\"
|
||||
|
||||
def __init__(self, batch_size: int = 100):
|
||||
self.batch_size = batch_size
|
||||
self.processed_count = 0
|
||||
self.error_count = 0
|
||||
self.cache: Dict[str, Any] = {}
|
||||
|
||||
async def process_batch(self, items: List[dict]) -> List[dict]:
|
||||
\"\"\"Process a batch of items asynchronously\"\"\"
|
||||
if not items:
|
||||
return []
|
||||
|
||||
if len(items) > self.batch_size:
|
||||
raise ValueError(f"Batch size {len(items)} exceeds limit {self.batch_size}")
|
||||
|
||||
results = []
|
||||
for item in items:
|
||||
try:
|
||||
result = await self._process_single_item(item)
|
||||
results.append(result)
|
||||
self.processed_count += 1
|
||||
except Exception as e:
|
||||
self.error_count += 1
|
||||
results.append({"error": str(e), "item": item})
|
||||
|
||||
return results
|
||||
|
||||
async def _process_single_item(self, item: dict) -> dict:
|
||||
\"\"\"Process a single item with caching\"\"\"
|
||||
item_id = item.get('id')
|
||||
if not item_id:
|
||||
raise ValueError("Item must have an ID")
|
||||
|
||||
# Check cache
|
||||
if item_id in self.cache:
|
||||
return self.cache[item_id]
|
||||
|
||||
# Simulate async processing
|
||||
await asyncio.sleep(0.01)
|
||||
|
||||
processed = {
|
||||
'id': item_id,
|
||||
'processed': True,
|
||||
'value': item.get('value', 0) * 2
|
||||
}
|
||||
|
||||
# Cache result
|
||||
self.cache[item_id] = processed
|
||||
return processed
|
||||
|
||||
def get_stats(self) -> Dict[str, float]:
|
||||
\"\"\"Get processing statistics\"\"\"
|
||||
return {
|
||||
'processed': self.processed_count,
|
||||
'errors': self.error_count,
|
||||
'cache_size': len(self.cache),
|
||||
'success_rate': self.processed_count / (self.processed_count + self.error_count) if (self.processed_count + self.error_count) > 0 else 0
|
||||
}
|
||||
"""
|
||||
|
||||
# Create test file
|
||||
processor_file = self.create_additional_test_file("data_processor.py", complex_code)
|
||||
|
||||
# Step 1: Start investigation
|
||||
self.logger.info(" 1.6.1: Step 1 - Start complex test planning")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "Analyzing complex DataProcessor class for comprehensive test generation",
|
||||
"step_number": 1,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "DataProcessor is an async class with caching, error handling, and statistics. Need async test patterns.",
|
||||
"files_checked": [processor_file],
|
||||
"relevant_files": [processor_file],
|
||||
"relevant_context": ["DataProcessor", "process_batch", "_process_single_item", "get_stats"],
|
||||
"confidence": "low",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start multi-step test planning")
|
||||
return False
|
||||
|
||||
response1_data = self._parse_testgen_response(response1)
|
||||
|
||||
# Validate step 1
|
||||
file_context1 = response1_data.get("file_context", {})
|
||||
if file_context1.get("type") != "reference_only":
|
||||
self.logger.error("Step 1 should use reference_only file context")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 1: Started complex test planning")
|
||||
|
||||
# Step 2: Analyze async patterns
|
||||
self.logger.info(" 1.6.2: Step 2 - Async pattern analysis")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "Analyzing async patterns and edge cases for testing",
|
||||
"step_number": 2,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Key test areas: async batch processing, cache behavior, error handling, batch size limits, empty items, statistics calculation",
|
||||
"files_checked": [processor_file],
|
||||
"relevant_files": [processor_file],
|
||||
"relevant_context": ["process_batch", "_process_single_item"],
|
||||
"confidence": "medium",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue to step 2")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 2: Async patterns analyzed")
|
||||
|
||||
# Step 3: Edge case identification
|
||||
self.logger.info(" 1.6.3: Step 3 - Edge case identification")
|
||||
response3, _ = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "Identifying all edge cases and boundary conditions",
|
||||
"step_number": 3,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Edge cases: empty batch, oversized batch, items without ID, cache hits/misses, concurrent processing, error accumulation",
|
||||
"files_checked": [processor_file],
|
||||
"relevant_files": [processor_file],
|
||||
"confidence": "high",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response3:
|
||||
self.logger.error("Failed to continue to step 3")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 3: Edge cases identified")
|
||||
|
||||
# Step 4: Final test plan with expert analysis
|
||||
self.logger.info(" 1.6.4: Step 4 - Complete test plan")
|
||||
response4, _ = self.call_mcp_tool(
|
||||
"testgen",
|
||||
{
|
||||
"step": "Test planning complete with comprehensive coverage strategy",
|
||||
"step_number": 4,
|
||||
"total_steps": 4,
|
||||
"next_step_required": False, # Final step
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Complete async test suite plan: unit tests for each method, integration tests for batch processing, edge case coverage, performance tests",
|
||||
"files_checked": [processor_file],
|
||||
"relevant_files": [processor_file],
|
||||
"confidence": "high",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response4:
|
||||
self.logger.error("Failed to complete the final step")
|
||||
return False
|
||||
|
||||
response4_data = self._parse_testgen_response(response4)
|
||||
|
||||
# Validate final step
|
||||
if response4_data.get("status") != "calling_expert_analysis":
|
||||
self.logger.error("Final step should trigger expert analysis")
|
||||
return False
|
||||
|
||||
file_context4 = response4_data.get("file_context", {})
|
||||
if file_context4.get("type") != "fully_embedded":
|
||||
self.logger.error("Final step should use fully_embedded file context")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Multi-step test planning completed successfully")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Multi-step test planning test failed: {e}")
|
||||
return False
|
||||
|
||||
950 simulator_tests/test_thinkdeep_validation.py Normal file
@@ -0,0 +1,950 @@
#!/usr/bin/env python3
|
||||
"""
|
||||
ThinkDeep Tool Validation Test
|
||||
|
||||
Tests the thinkdeep tool's capabilities using the new workflow architecture.
|
||||
This validates that the workflow-based deep thinking implementation provides
|
||||
step-by-step thinking with expert analysis integration.
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
from .conversation_base_test import ConversationBaseTest
|
||||
|
||||
|
||||
class ThinkDeepWorkflowValidationTest(ConversationBaseTest):
|
||||
"""Test thinkdeep tool with new workflow architecture"""
|
||||
|
||||
@property
|
||||
def test_name(self) -> str:
|
||||
return "thinkdeep_validation"
|
||||
|
||||
@property
|
||||
def test_description(self) -> str:
|
||||
return "ThinkDeep workflow tool validation with new workflow architecture"
|
||||
|
||||
def run_test(self) -> bool:
|
||||
"""Test thinkdeep tool capabilities"""
|
||||
# Set up the test environment
|
||||
self.setUp()
|
||||
|
||||
try:
|
||||
self.logger.info("Test: ThinkDeepWorkflow tool validation (new architecture)")
|
||||
|
||||
# Create test files for thinking context
|
||||
self._create_thinking_context()
|
||||
|
||||
# Test 1: Single thinking session with multiple steps
|
||||
if not self._test_single_thinking_session():
|
||||
return False
|
||||
|
||||
# Test 2: Thinking with backtracking
|
||||
if not self._test_thinking_with_backtracking():
|
||||
return False
|
||||
|
||||
# Test 3: Complete thinking with expert analysis
|
||||
if not self._test_complete_thinking_with_analysis():
|
||||
return False
|
||||
|
||||
# Test 4: Certain confidence behavior
|
||||
if not self._test_certain_confidence():
|
||||
return False
|
||||
|
||||
# Test 5: Context-aware file embedding
|
||||
if not self._test_context_aware_file_embedding():
|
||||
return False
|
||||
|
||||
# Test 6: Multi-step file context optimization
|
||||
if not self._test_multi_step_file_context():
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ All thinkdeep validation tests passed")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"ThinkDeep validation test failed: {e}")
|
||||
return False
|
||||
|
||||
def _create_thinking_context(self):
|
||||
"""Create test files for deep thinking context"""
|
||||
# Create architecture document
|
||||
architecture_doc = """# Microservices Architecture Design
|
||||
|
||||
## Current System
|
||||
- Monolithic application with 500k LOC
|
||||
- Single PostgreSQL database
|
||||
- Peak load: 10k requests/minute
|
||||
- Team size: 25 developers
|
||||
- Deployment: Manual, 2-week cycles
|
||||
|
||||
## Proposed Migration to Microservices
|
||||
|
||||
### Benefits
|
||||
- Independent deployments
|
||||
- Technology diversity
|
||||
- Team autonomy
|
||||
- Scalability improvements
|
||||
|
||||
### Challenges
|
||||
- Data consistency
|
||||
- Network latency
|
||||
- Operational complexity
|
||||
- Transaction management
|
||||
|
||||
### Key Considerations
|
||||
- Service boundaries
|
||||
- Data migration strategy
|
||||
- Communication patterns
|
||||
- Monitoring and observability
|
||||
"""
|
||||
|
||||
# Create requirements document
|
||||
requirements_doc = """# Migration Requirements
|
||||
|
||||
## Business Goals
|
||||
- Reduce deployment cycle from 2 weeks to daily
|
||||
- Support 50k requests/minute by Q4
|
||||
- Enable A/B testing capabilities
|
||||
- Improve system resilience
|
||||
|
||||
## Technical Constraints
|
||||
- Zero downtime migration
|
||||
- Maintain data consistency
|
||||
- Budget: $200k for infrastructure
|
||||
- Timeline: 6 months
|
||||
- Existing team skills: Java, Spring Boot
|
||||
|
||||
## Success Metrics
|
||||
- Deployment frequency: 10x improvement
|
||||
- System availability: 99.9%
|
||||
- Response time: <200ms p95
|
||||
- Developer productivity: 30% improvement
|
||||
"""
|
||||
|
||||
# Create performance analysis
|
||||
performance_analysis = """# Current Performance Analysis
|
||||
|
||||
## Database Bottlenecks
|
||||
- Connection pool exhaustion during peak hours
|
||||
- Complex joins affecting query performance
|
||||
- Lock contention on user_sessions table
|
||||
- Read replica lag causing data inconsistency
|
||||
|
||||
## Application Issues
|
||||
- Memory leaks in background processing
|
||||
- Thread pool starvation
|
||||
- Cache invalidation storms
|
||||
- Session clustering problems
|
||||
|
||||
## Infrastructure Limits
|
||||
- Single server deployment
|
||||
- Manual scaling processes
|
||||
- Limited monitoring capabilities
|
||||
- No circuit breaker patterns
|
||||
"""
|
||||
|
||||
# Create test files
|
||||
self.architecture_file = self.create_additional_test_file("architecture_design.md", architecture_doc)
|
||||
self.requirements_file = self.create_additional_test_file("migration_requirements.md", requirements_doc)
|
||||
self.performance_file = self.create_additional_test_file("performance_analysis.md", performance_analysis)
|
||||
|
||||
self.logger.info(" ✅ Created thinking context files:")
|
||||
self.logger.info(f" - {self.architecture_file}")
|
||||
self.logger.info(f" - {self.requirements_file}")
|
||||
self.logger.info(f" - {self.performance_file}")
|
||||
|
||||
def _test_single_thinking_session(self) -> bool:
|
||||
"""Test a complete thinking session with multiple steps"""
|
||||
try:
|
||||
self.logger.info(" 1.1: Testing single thinking session")
|
||||
|
||||
# Step 1: Start thinking analysis
|
||||
self.logger.info(" 1.1.1: Step 1 - Initial thinking analysis")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "I need to think deeply about the microservices migration strategy. Let me analyze the trade-offs, risks, and implementation approach systematically.",
|
||||
"step_number": 1,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Initial analysis shows significant architectural complexity but potential for major scalability and development velocity improvements. Need to carefully consider migration strategy and service boundaries.",
|
||||
"files_checked": [self.architecture_file, self.requirements_file],
|
||||
"relevant_files": [self.architecture_file, self.requirements_file],
|
||||
"relevant_context": ["microservices_migration", "service_boundaries", "data_consistency"],
|
||||
"confidence": "low",
|
||||
"problem_context": "Enterprise application migration from monolith to microservices",
|
||||
"focus_areas": ["architecture", "scalability", "risk_assessment"],
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to get initial thinking response")
|
||||
return False
|
||||
|
||||
# Parse and validate JSON response
|
||||
response1_data = self._parse_thinkdeep_response(response1)
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Validate step 1 response structure - expect pause_for_thinkdeep for next_step_required=True
|
||||
if not self._validate_step_response(response1_data, 1, 4, True, "pause_for_thinkdeep"):
|
||||
return False
|
||||
|
||||
self.logger.info(f" ✅ Step 1 successful, continuation_id: {continuation_id}")
|
||||
|
||||
# Step 2: Deep analysis
|
||||
self.logger.info(" 1.1.2: Step 2 - Deep analysis of alternatives")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Analyzing different migration approaches: strangler fig pattern vs big bang vs gradual extraction. Each has different risk profiles and timelines.",
|
||||
"step_number": 2,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Strangler fig pattern emerges as best approach: lower risk, incremental value delivery, team learning curve management. Key insight: start with read-only services to minimize data consistency issues.",
|
||||
"files_checked": [self.architecture_file, self.requirements_file, self.performance_file],
|
||||
"relevant_files": [self.architecture_file, self.performance_file],
|
||||
"relevant_context": ["strangler_fig_pattern", "service_extraction", "risk_mitigation"],
|
||||
"issues_found": [
|
||||
{"severity": "high", "description": "Data consistency challenges during migration"},
|
||||
{"severity": "medium", "description": "Team skill gap in distributed systems"},
|
||||
],
|
||||
"confidence": "medium",
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue thinking to step 2")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_thinkdeep_response(response2)
|
||||
if not self._validate_step_response(response2_data, 2, 4, True, "pause_for_thinkdeep"):
|
||||
return False
|
||||
|
||||
# Check thinking status tracking
|
||||
thinking_status = response2_data.get("thinking_status", {})
|
||||
if thinking_status.get("files_checked", 0) < 3:
|
||||
self.logger.error("Files checked count not properly tracked")
|
||||
return False
|
||||
|
||||
if thinking_status.get("thinking_confidence") != "medium":
|
||||
self.logger.error("Confidence level not properly tracked")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 2 successful with proper tracking")
|
||||
|
||||
# Store continuation_id for next test
|
||||
self.thinking_continuation_id = continuation_id
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Single thinking session test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_thinking_with_backtracking(self) -> bool:
|
||||
"""Test thinking with backtracking to revise analysis"""
|
||||
try:
|
||||
self.logger.info(" 1.2: Testing thinking with backtracking")
|
||||
|
||||
# Start a new thinking session for testing backtracking
|
||||
self.logger.info(" 1.2.1: Start thinking for backtracking test")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Thinking about optimal database architecture for the new microservices",
|
||||
"step_number": 1,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Initial thought: each service should have its own database for independence",
|
||||
"files_checked": [self.architecture_file],
|
||||
"relevant_files": [self.architecture_file],
|
||||
"relevant_context": ["database_per_service", "data_independence"],
|
||||
"confidence": "low",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start backtracking test thinking")
|
||||
return False
|
||||
|
||||
# Step 2: Initial direction
|
||||
self.logger.info(" 1.2.2: Step 2 - Initial analysis direction")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Exploring database-per-service pattern implementation",
|
||||
"step_number": 2,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Database-per-service creates significant complexity for transactions and reporting",
|
||||
"files_checked": [self.architecture_file, self.performance_file],
|
||||
"relevant_files": [self.performance_file],
|
||||
"relevant_context": ["database_per_service", "transaction_management"],
|
||||
"issues_found": [
|
||||
{"severity": "high", "description": "Cross-service transactions become complex"},
|
||||
{"severity": "medium", "description": "Reporting queries span multiple databases"},
|
||||
],
|
||||
"confidence": "low",
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue to step 2")
|
||||
return False
|
||||
|
||||
# Step 3: Backtrack and revise approach
|
||||
self.logger.info(" 1.2.3: Step 3 - Backtrack and revise thinking")
|
||||
response3, _ = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Backtracking - maybe shared database with service-specific schemas is better initially. Then gradually extract databases as services mature.",
|
||||
"step_number": 3,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Hybrid approach: shared database with bounded contexts, then gradual extraction. This reduces initial complexity while preserving migration path to full service independence.",
|
||||
"files_checked": [self.architecture_file, self.requirements_file],
|
||||
"relevant_files": [self.architecture_file, self.requirements_file],
|
||||
"relevant_context": ["shared_database", "bounded_contexts", "gradual_extraction"],
|
||||
"confidence": "medium",
|
||||
"backtrack_from_step": 2, # Backtrack from step 2
|
||||
"continuation_id": continuation_id,
|
||||
},
|
||||
)
|
||||
|
||||
if not response3:
|
||||
self.logger.error("Failed to backtrack")
|
||||
return False
|
||||
|
||||
response3_data = self._parse_thinkdeep_response(response3)
|
||||
if not self._validate_step_response(response3_data, 3, 4, True, "pause_for_thinkdeep"):
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Backtracking working correctly")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Backtracking test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_complete_thinking_with_analysis(self) -> bool:
|
||||
"""Test complete thinking ending with expert analysis"""
|
||||
try:
|
||||
self.logger.info(" 1.3: Testing complete thinking with expert analysis")
|
||||
|
||||
# Use the continuation from first test
|
||||
continuation_id = getattr(self, "thinking_continuation_id", None)
|
||||
if not continuation_id:
|
||||
# Start fresh if no continuation available
|
||||
self.logger.info(" 1.3.0: Starting fresh thinking session")
|
||||
response0, continuation_id = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Thinking about the complete microservices migration strategy",
|
||||
"step_number": 1,
|
||||
"total_steps": 2,
|
||||
"next_step_required": True,
|
||||
"findings": "Comprehensive analysis of migration approaches and risks",
|
||||
"files_checked": [self.architecture_file, self.requirements_file],
|
||||
"relevant_files": [self.architecture_file, self.requirements_file],
|
||||
"relevant_context": ["migration_strategy", "risk_assessment"],
|
||||
},
|
||||
)
|
||||
if not response0 or not continuation_id:
|
||||
self.logger.error("Failed to start fresh thinking session")
|
||||
return False
|
||||
|
||||
# Final step - trigger expert analysis
|
||||
self.logger.info(" 1.3.1: Final step - complete thinking analysis")
|
||||
response_final, _ = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Thinking analysis complete. I've thoroughly considered the migration strategy, risks, and implementation approach.",
|
||||
"step_number": 2,
|
||||
"total_steps": 2,
|
||||
"next_step_required": False, # Final step - triggers expert analysis
|
||||
"findings": "Comprehensive migration strategy: strangler fig pattern with shared database initially, gradual service extraction based on business value and technical feasibility. Key success factors: team training, monitoring infrastructure, and incremental rollout.",
|
||||
"files_checked": [self.architecture_file, self.requirements_file, self.performance_file],
|
||||
"relevant_files": [self.architecture_file, self.requirements_file, self.performance_file],
|
||||
"relevant_context": ["strangler_fig", "migration_strategy", "risk_mitigation", "team_readiness"],
|
||||
"issues_found": [
|
||||
{"severity": "medium", "description": "Team needs distributed systems training"},
|
||||
{"severity": "low", "description": "Monitoring tools need upgrade"},
|
||||
],
|
||||
"confidence": "high",
|
||||
"continuation_id": continuation_id,
|
||||
"model": "flash", # Use flash for expert analysis
|
||||
},
|
||||
)
|
||||
|
||||
if not response_final:
|
||||
self.logger.error("Failed to complete thinking")
|
||||
return False
|
||||
|
||||
response_final_data = self._parse_thinkdeep_response(response_final)
|
||||
if not response_final_data:
|
||||
return False
|
||||
|
||||
# Validate final response structure - accept both expert analysis and special statuses
|
||||
valid_final_statuses = ["calling_expert_analysis", "files_required_to_continue"]
|
||||
if response_final_data.get("status") not in valid_final_statuses:
|
||||
self.logger.error(
|
||||
f"Expected status in {valid_final_statuses}, got '{response_final_data.get('status')}'"
|
||||
)
|
||||
return False
|
||||
|
||||
if not response_final_data.get("thinking_complete"):
|
||||
self.logger.error("Expected thinking_complete=true for final step")
|
||||
return False
|
||||
|
||||
# Check for expert analysis or special status content
|
||||
if response_final_data.get("status") == "calling_expert_analysis":
|
||||
if "expert_analysis" not in response_final_data:
|
||||
self.logger.error("Missing expert_analysis in final response")
|
||||
return False
|
||||
expert_analysis = response_final_data.get("expert_analysis", {})
|
||||
else:
|
||||
# For special statuses like files_required_to_continue, analysis may be in content
|
||||
expert_analysis = response_final_data.get("content", "{}")
|
||||
if isinstance(expert_analysis, str):
|
||||
try:
|
||||
expert_analysis = json.loads(expert_analysis)
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
expert_analysis = {"analysis": expert_analysis}
|
||||
|
||||
# Check for expected analysis content (checking common patterns)
|
||||
analysis_text = json.dumps(expert_analysis).lower()
|
||||
|
||||
# Look for thinking analysis validation
|
||||
thinking_indicators = ["migration", "strategy", "microservices", "risk", "approach", "implementation"]
|
||||
found_indicators = sum(1 for indicator in thinking_indicators if indicator in analysis_text)
|
||||
|
||||
if found_indicators >= 3:
|
||||
self.logger.info(" ✅ Expert analysis validated the thinking correctly")
|
||||
else:
|
||||
self.logger.warning(
|
||||
f" ⚠️ Expert analysis may not have fully validated the thinking (found {found_indicators}/6 indicators)"
|
||||
)
|
||||
|
||||
# Check complete thinking summary
|
||||
if "complete_thinking" not in response_final_data:
|
||||
self.logger.error("Missing complete_thinking in final response")
|
||||
return False
|
||||
|
||||
complete_thinking = response_final_data["complete_thinking"]
|
||||
if not complete_thinking.get("relevant_context"):
|
||||
self.logger.error("Missing relevant context in complete thinking")
|
||||
return False
|
||||
|
||||
if "migration_strategy" not in complete_thinking["relevant_context"]:
|
||||
self.logger.error("Expected context not found in thinking summary")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Complete thinking with expert analysis successful")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Complete thinking test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_certain_confidence(self) -> bool:
|
||||
"""Test certain confidence behavior - should skip expert analysis"""
|
||||
try:
|
||||
self.logger.info(" 1.4: Testing certain confidence behavior")
|
||||
|
||||
# Test certain confidence - should skip expert analysis
|
||||
self.logger.info(" 1.4.1: Certain confidence thinking")
|
||||
response_certain, _ = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "I have thoroughly analyzed all aspects of the migration strategy with complete certainty.",
|
||||
"step_number": 1,
|
||||
"total_steps": 1,
|
||||
"next_step_required": False, # Final step
|
||||
"findings": "Definitive conclusion: strangler fig pattern with phased database extraction is the optimal approach. Risk mitigation through team training and robust monitoring. Timeline: 6 months with monthly service extractions.",
|
||||
"files_checked": [self.architecture_file, self.requirements_file, self.performance_file],
|
||||
"relevant_files": [self.architecture_file, self.requirements_file],
|
||||
"relevant_context": ["migration_complete_strategy", "implementation_plan"],
|
||||
"confidence": "certain", # This should skip expert analysis
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response_certain:
|
||||
self.logger.error("Failed to test certain confidence")
|
||||
return False
|
||||
|
||||
response_certain_data = self._parse_thinkdeep_response(response_certain)
|
||||
if not response_certain_data:
|
||||
return False
|
||||
|
||||
# Validate certain confidence response - should skip expert analysis
|
||||
if response_certain_data.get("status") != "deep_thinking_complete_ready_for_implementation":
|
||||
self.logger.error(
|
||||
f"Expected status 'deep_thinking_complete_ready_for_implementation', got '{response_certain_data.get('status')}'"
|
||||
)
|
||||
return False
|
||||
|
||||
if not response_certain_data.get("skip_expert_analysis"):
|
||||
self.logger.error("Expected skip_expert_analysis=true for certain confidence")
|
||||
return False
|
||||
|
||||
expert_analysis = response_certain_data.get("expert_analysis", {})
|
||||
if expert_analysis.get("status") != "skipped_due_to_certain_thinking_confidence":
|
||||
self.logger.error("Expert analysis should be skipped for certain confidence")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Certain confidence behavior working correctly")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Certain confidence test failed: {e}")
|
||||
return False
|
||||
|
||||
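The certain-confidence path asserted above implies a response shaped roughly as follows. The status strings are copied from the assertions; everything else is illustrative.

# Illustrative only: the skip-expert-analysis payload these assertions expect
example_certain_confidence_response = {
    "status": "deep_thinking_complete_ready_for_implementation",
    "skip_expert_analysis": True,
    "expert_analysis": {
        "status": "skipped_due_to_certain_thinking_confidence",
    },
}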
def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
|
||||
"""Call an MCP tool in-process - override for thinkdeep-specific response handling"""
|
||||
# Use in-process implementation to maintain conversation memory
|
||||
response_text, _ = self.call_mcp_tool_direct(tool_name, params)
|
||||
|
||||
if not response_text:
|
||||
return None, None
|
||||
|
||||
# Extract continuation_id from thinkdeep response specifically
|
||||
continuation_id = self._extract_thinkdeep_continuation_id(response_text)
|
||||
|
||||
return response_text, continuation_id
|
||||
|
||||
def _extract_thinkdeep_continuation_id(self, response_text: str) -> Optional[str]:
|
||||
"""Extract continuation_id from thinkdeep response"""
|
||||
try:
|
||||
# Parse the response
|
||||
response_data = json.loads(response_text)
|
||||
return response_data.get("continuation_id")
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
self.logger.debug(f"Failed to parse response for thinkdeep continuation_id: {e}")
|
||||
return None
|
||||
|
||||
def _parse_thinkdeep_response(self, response_text: str) -> dict:
|
||||
"""Parse thinkdeep tool JSON response"""
|
||||
try:
|
||||
# Parse the response - it should be direct JSON
|
||||
return json.loads(response_text)
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
self.logger.error(f"Failed to parse thinkdeep response as JSON: {e}")
|
||||
self.logger.error(f"Response text: {response_text[:500]}...")
|
||||
return {}
|
||||
|
||||
def _validate_step_response(
|
||||
self,
|
||||
response_data: dict,
|
||||
expected_step: int,
|
||||
expected_total: int,
|
||||
expected_next_required: bool,
|
||||
expected_status: str,
|
||||
) -> bool:
|
||||
"""Validate a thinkdeep thinking step response structure"""
|
||||
try:
|
||||
# Check status
|
||||
if response_data.get("status") != expected_status:
|
||||
self.logger.error(f"Expected status '{expected_status}', got '{response_data.get('status')}'")
|
||||
return False
|
||||
|
||||
# Check step number
|
||||
if response_data.get("step_number") != expected_step:
|
||||
self.logger.error(f"Expected step_number {expected_step}, got {response_data.get('step_number')}")
|
||||
return False
|
||||
|
||||
# Check total steps
|
||||
if response_data.get("total_steps") != expected_total:
|
||||
self.logger.error(f"Expected total_steps {expected_total}, got {response_data.get('total_steps')}")
|
||||
return False
|
||||
|
||||
# Check next_step_required
|
||||
if response_data.get("next_step_required") != expected_next_required:
|
||||
self.logger.error(
|
||||
f"Expected next_step_required {expected_next_required}, got {response_data.get('next_step_required')}"
|
||||
)
|
||||
return False
|
||||
|
||||
# Check thinking_status exists
|
||||
if "thinking_status" not in response_data:
|
||||
self.logger.error("Missing thinking_status in response")
|
||||
return False
|
||||
|
||||
# Check next_steps guidance
|
||||
if not response_data.get("next_steps"):
|
||||
self.logger.error("Missing next_steps guidance in response")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error validating step response: {e}")
|
||||
return False
|
||||
|
||||
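As with the testgen validator, a response that passes this check has roughly the following shape. The field names and the pause_for_thinkdeep status come from the assertions in this file; the values are placeholders.

# Illustrative only: a minimal step response accepted by _validate_step_response above
example_thinkdeep_step_response = {
    "status": "pause_for_thinkdeep",  # intermediate-step status expected by these tests
    "step_number": 2,
    "total_steps": 4,
    "next_step_required": True,
    "thinking_status": {  # presence is required; earlier tests also read:
        "files_checked": 3,  # files_checked count (see 1.1.2)
        "thinking_confidence": "medium",  # confidence tracking (see 1.1.2)
    },
    "next_steps": "Continue the analysis in step 3",
}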
def _test_context_aware_file_embedding(self) -> bool:
|
||||
"""Test context-aware file embedding optimization"""
|
||||
try:
|
||||
self.logger.info(" 1.5: Testing context-aware file embedding")
|
||||
|
||||
# Create additional test files for context testing
|
||||
strategy_doc = """# Implementation Strategy
|
||||
|
||||
## Phase 1: Foundation (Month 1-2)
|
||||
- Set up monitoring and logging infrastructure
|
||||
- Establish CI/CD pipelines for microservices
|
||||
- Team training on distributed systems concepts
|
||||
|
||||
## Phase 2: Initial Services (Month 3-4)
|
||||
- Extract read-only services (user profiles, product catalog)
|
||||
- Implement API gateway
|
||||
- Set up service discovery
|
||||
|
||||
## Phase 3: Core Services (Month 5-6)
|
||||
- Extract transaction services
|
||||
- Implement saga patterns for distributed transactions
|
||||
- Performance optimization and monitoring
|
||||
"""
|
||||
|
||||
tech_stack_doc = """# Technology Stack Decisions
|
||||
|
||||
## Service Framework
|
||||
- Spring Boot 2.7 (team familiarity)
|
||||
- Docker containers
|
||||
- Kubernetes orchestration
|
||||
|
||||
## Communication
|
||||
- REST APIs for synchronous communication
|
||||
- Apache Kafka for asynchronous messaging
|
||||
- gRPC for high-performance internal communication
|
||||
|
||||
## Data Layer
|
||||
- PostgreSQL (existing expertise)
|
||||
- Redis for caching
|
||||
- Elasticsearch for search and analytics
|
||||
|
||||
## Monitoring
|
||||
- Prometheus + Grafana
|
||||
- Distributed tracing with Jaeger
|
||||
- Centralized logging with ELK stack
|
||||
"""
|
||||
|
||||
# Create test files
|
||||
strategy_file = self.create_additional_test_file("implementation_strategy.md", strategy_doc)
|
||||
tech_stack_file = self.create_additional_test_file("tech_stack.md", tech_stack_doc)
|
||||
|
||||
# Test 1: New conversation, intermediate step - should only reference files
|
||||
self.logger.info(" 1.5.1: New conversation intermediate step (should reference only)")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Starting deep thinking about implementation timeline and technology choices",
|
||||
"step_number": 1,
|
||||
"total_steps": 3,
|
||||
"next_step_required": True, # Intermediate step
|
||||
"findings": "Initial analysis of implementation strategy and technology stack decisions",
|
||||
"files_checked": [strategy_file, tech_stack_file],
|
||||
"relevant_files": [strategy_file], # This should be referenced, not embedded
|
||||
"relevant_context": ["implementation_timeline", "technology_selection"],
|
||||
"confidence": "low",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start context-aware file embedding test")
|
||||
return False
|
||||
|
||||
response1_data = self._parse_thinkdeep_response(response1)
|
||||
if not response1_data:
|
||||
return False
|
||||
|
||||
# Check file context - should be reference_only for intermediate step
|
||||
file_context = response1_data.get("file_context", {})
|
||||
if file_context.get("type") != "reference_only":
|
||||
self.logger.error(f"Expected reference_only file context, got: {file_context.get('type')}")
|
||||
return False
|
||||
|
||||
if "Files referenced but not embedded" not in file_context.get("context_optimization", ""):
|
||||
self.logger.error("Expected context optimization message for reference_only")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Intermediate step correctly uses reference_only file context")
|
||||
|
||||
# Test 2: Final step - should embed files for expert analysis
|
||||
self.logger.info(" 1.5.2: Final step (should embed files)")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Thinking analysis complete - comprehensive evaluation of implementation approach",
|
||||
"step_number": 2,
|
||||
"total_steps": 2,
|
||||
"next_step_required": False, # Final step - should embed files
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Complete analysis: phased implementation with proven technology stack minimizes risk while maximizing team effectiveness. Timeline is realistic with proper training and infrastructure setup.",
|
||||
"files_checked": [strategy_file, tech_stack_file],
|
||||
"relevant_files": [strategy_file, tech_stack_file], # Should be fully embedded
|
||||
"relevant_context": ["implementation_plan", "technology_decisions", "risk_management"],
|
||||
"confidence": "high",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to complete the final step")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_thinkdeep_response(response2)
|
||||
if not response2_data:
|
||||
return False
|
||||
|
||||
# Check file context - should be fully_embedded for final step
|
||||
file_context2 = response2_data.get("file_context", {})
|
||||
if file_context2.get("type") != "fully_embedded":
|
||||
self.logger.error(
|
||||
f"Expected fully_embedded file context for final step, got: {file_context2.get('type')}"
|
||||
)
|
||||
return False
|
||||
|
||||
if "Full file content embedded for expert analysis" not in file_context2.get("context_optimization", ""):
|
||||
self.logger.error("Expected expert analysis optimization message for fully_embedded")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Final step correctly uses fully_embedded file context")
|
||||
|
||||
# Verify expert analysis was called for final step
|
||||
if response2_data.get("status") != "calling_expert_analysis":
|
||||
self.logger.error("Final step should trigger expert analysis")
|
||||
return False
|
||||
|
||||
if "expert_analysis" not in response2_data:
|
||||
self.logger.error("Expert analysis should be present in final step")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Context-aware file embedding test completed successfully")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Context-aware file embedding test failed: {e}")
|
||||
return False
|
||||
|
||||
def _test_multi_step_file_context(self) -> bool:
|
||||
"""Test multi-step workflow with proper file context transitions"""
|
||||
try:
|
||||
self.logger.info(" 1.6: Testing multi-step file context optimization")
|
||||
|
||||
# Create a complex scenario with multiple thinking documents
|
||||
risk_analysis = """# Risk Analysis
|
||||
|
||||
## Technical Risks
|
||||
- Service mesh complexity
|
||||
- Data consistency challenges
|
||||
- Performance degradation during migration
|
||||
- Operational overhead increase
|
||||
|
||||
## Business Risks
|
||||
- Extended development timelines
|
||||
- Potential system instability
|
||||
- Team productivity impact
|
||||
- Customer experience disruption
|
||||
|
||||
## Mitigation Strategies
|
||||
- Gradual rollout with feature flags
|
||||
- Comprehensive monitoring and alerting
|
||||
- Rollback procedures for each phase
|
||||
- Customer communication plan
|
||||
"""
|
||||
|
||||
success_metrics = """# Success Metrics and KPIs
|
||||
|
||||
## Development Velocity
|
||||
- Deployment frequency: Target 10x improvement
|
||||
- Lead time for changes: <2 hours
|
||||
- Mean time to recovery: <30 minutes
|
||||
- Change failure rate: <5%
|
||||
|
||||
## System Performance
|
||||
- Response time: <200ms p95
|
||||
- System availability: 99.9%
|
||||
- Throughput: 50k requests/minute
|
||||
- Resource utilization: 70% optimal
|
||||
|
||||
## Business Impact
|
||||
- Developer satisfaction: >8/10
|
||||
- Time to market: 50% reduction
|
||||
- Operational costs: 20% reduction
|
||||
- System reliability: 99.9% uptime
|
||||
"""
|
||||
|
||||
# Create test files
|
||||
risk_file = self.create_additional_test_file("risk_analysis.md", risk_analysis)
|
||||
metrics_file = self.create_additional_test_file("success_metrics.md", success_metrics)
|
||||
|
||||
# Step 1: Start thinking analysis (new conversation)
|
||||
self.logger.info(" 1.6.1: Step 1 - Start thinking analysis")
|
||||
response1, continuation_id = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Beginning comprehensive analysis of migration risks and success criteria",
|
||||
"step_number": 1,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"findings": "Initial assessment of risk factors and success metrics for microservices migration",
|
||||
"files_checked": [risk_file],
|
||||
"relevant_files": [risk_file],
|
||||
"relevant_context": ["risk_assessment", "migration_planning"],
|
||||
"confidence": "low",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response1 or not continuation_id:
|
||||
self.logger.error("Failed to start multi-step file context test")
|
||||
return False
|
||||
|
||||
response1_data = self._parse_thinkdeep_response(response1)
|
||||
|
||||
# Validate step 1 - should use reference_only
|
||||
file_context1 = response1_data.get("file_context", {})
|
||||
if file_context1.get("type") != "reference_only":
|
||||
self.logger.error("Step 1 should use reference_only file context")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 1: reference_only file context")
|
||||
|
||||
# Step 2: Expand thinking analysis
|
||||
self.logger.info(" 1.6.2: Step 2 - Expand thinking analysis")
|
||||
response2, _ = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Deepening analysis by correlating risks with success metrics",
|
||||
"step_number": 2,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Key insight: technical risks directly impact business metrics. Need balanced approach prioritizing high-impact, low-risk improvements first.",
|
||||
"files_checked": [risk_file, metrics_file],
|
||||
"relevant_files": [risk_file, metrics_file],
|
||||
"relevant_context": ["risk_metric_correlation", "priority_matrix"],
|
||||
"confidence": "medium",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response2:
|
||||
self.logger.error("Failed to continue to step 2")
|
||||
return False
|
||||
|
||||
response2_data = self._parse_thinkdeep_response(response2)
|
||||
|
||||
# Validate step 2 - should still use reference_only
|
||||
file_context2 = response2_data.get("file_context", {})
|
||||
if file_context2.get("type") != "reference_only":
|
||||
self.logger.error("Step 2 should use reference_only file context")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 2: reference_only file context with multiple files")
|
||||
|
||||
# Step 3: Deep analysis
|
||||
self.logger.info(" 1.6.3: Step 3 - Deep strategic analysis")
|
||||
response3, _ = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Synthesizing risk mitigation strategies with measurable success criteria",
|
||||
"step_number": 3,
|
||||
"total_steps": 4,
|
||||
"next_step_required": True,
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Strategic framework emerging: phase-gate approach with clear go/no-go criteria at each milestone. Emphasis on early wins to build confidence and momentum.",
|
||||
"files_checked": [risk_file, metrics_file, self.requirements_file],
|
||||
"relevant_files": [risk_file, metrics_file, self.requirements_file],
|
||||
"relevant_context": ["phase_gate_approach", "milestone_criteria", "early_wins"],
|
||||
"confidence": "high",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response3:
|
||||
self.logger.error("Failed to continue to step 3")
|
||||
return False
|
||||
|
||||
response3_data = self._parse_thinkdeep_response(response3)
|
||||
|
||||
# Validate step 3 - should still use reference_only
|
||||
file_context3 = response3_data.get("file_context", {})
|
||||
if file_context3.get("type") != "reference_only":
|
||||
self.logger.error("Step 3 should use reference_only file context")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 3: reference_only file context")
|
||||
|
||||
# Step 4: Final analysis with expert consultation
|
||||
self.logger.info(" 1.6.4: Step 4 - Final step with expert analysis")
|
||||
response4, _ = self.call_mcp_tool(
|
||||
"thinkdeep",
|
||||
{
|
||||
"step": "Thinking analysis complete - comprehensive strategic framework developed",
|
||||
"step_number": 4,
|
||||
"total_steps": 4,
|
||||
"next_step_required": False, # Final step - should embed files
|
||||
"continuation_id": continuation_id,
|
||||
"findings": "Complete strategic framework: risk-balanced migration with measurable success criteria, phase-gate governance, and clear rollback procedures. Framework aligns technical execution with business objectives.",
|
||||
"files_checked": [risk_file, metrics_file, self.requirements_file, self.architecture_file],
|
||||
"relevant_files": [risk_file, metrics_file, self.requirements_file, self.architecture_file],
|
||||
"relevant_context": ["strategic_framework", "governance_model", "success_measurement"],
|
||||
"confidence": "high",
|
||||
"model": "flash",
|
||||
},
|
||||
)
|
||||
|
||||
if not response4:
|
||||
self.logger.error("Failed to complete the final step")
|
||||
return False
|
||||
|
||||
response4_data = self._parse_thinkdeep_response(response4)
|
||||
|
||||
# Validate step 4 - should use fully_embedded for expert analysis
|
||||
file_context4 = response4_data.get("file_context", {})
|
||||
if file_context4.get("type") != "fully_embedded":
|
||||
self.logger.error("Step 4 (final) should use fully_embedded file context")
|
||||
return False
|
||||
|
||||
if "expert analysis" not in file_context4.get("context_optimization", "").lower():
|
||||
self.logger.error("Final step should mention expert analysis in context optimization")
|
||||
return False
|
||||
|
||||
# Verify expert analysis was triggered
|
||||
if response4_data.get("status") != "calling_expert_analysis":
|
||||
self.logger.error("Final step should trigger expert analysis")
|
||||
return False
|
||||
|
||||
# Check that expert analysis has file context
|
||||
expert_analysis = response4_data.get("expert_analysis", {})
|
||||
if not expert_analysis:
|
||||
self.logger.error("Expert analysis should be present in final step")
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Step 4: fully_embedded file context with expert analysis")
|
||||
|
||||
# Validate the complete workflow progression
|
||||
progression_summary = {
|
||||
"step_1": "reference_only (new conversation, intermediate)",
|
||||
"step_2": "reference_only (continuation, intermediate)",
|
||||
"step_3": "reference_only (continuation, intermediate)",
|
||||
"step_4": "fully_embedded (continuation, final)",
|
||||
}
|
||||
|
||||
self.logger.info(" 📋 File context progression:")
|
||||
for step, context_type in progression_summary.items():
|
||||
self.logger.info(f" {step}: {context_type}")
|
||||
|
||||
self.logger.info(" ✅ Multi-step file context optimization test completed successfully")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Multi-step file context test failed: {e}")
|