Improved consensus to treat a step properly as both a request + response, and initial step includes Claude's assessment.
Improved prompt to not request for code when it's a general business decision
This commit is contained in:
@@ -11,8 +11,8 @@ from .test_basic_conversation import BasicConversationTest
|
|||||||
from .test_chat_simple_validation import ChatSimpleValidationTest
|
from .test_chat_simple_validation import ChatSimpleValidationTest
|
||||||
from .test_codereview_validation import CodeReviewValidationTest
|
from .test_codereview_validation import CodeReviewValidationTest
|
||||||
from .test_consensus_conversation import TestConsensusConversation
|
from .test_consensus_conversation import TestConsensusConversation
|
||||||
from .test_consensus_stance import TestConsensusStance
|
|
||||||
from .test_consensus_three_models import TestConsensusThreeModels
|
from .test_consensus_three_models import TestConsensusThreeModels
|
||||||
|
from .test_consensus_workflow_accurate import TestConsensusWorkflowAccurate
|
||||||
from .test_content_validation import ContentValidationTest
|
from .test_content_validation import ContentValidationTest
|
||||||
from .test_conversation_chain_validation import ConversationChainValidationTest
|
from .test_conversation_chain_validation import ConversationChainValidationTest
|
||||||
from .test_cross_tool_comprehensive import CrossToolComprehensiveTest
|
from .test_cross_tool_comprehensive import CrossToolComprehensiveTest
|
||||||
@@ -71,7 +71,7 @@ TEST_REGISTRY = {
|
|||||||
"vision_capability": VisionCapabilityTest,
|
"vision_capability": VisionCapabilityTest,
|
||||||
"xai_models": XAIModelsTest,
|
"xai_models": XAIModelsTest,
|
||||||
"consensus_conversation": TestConsensusConversation,
|
"consensus_conversation": TestConsensusConversation,
|
||||||
"consensus_stance": TestConsensusStance,
|
"consensus_workflow_accurate": TestConsensusWorkflowAccurate,
|
||||||
"consensus_three_models": TestConsensusThreeModels,
|
"consensus_three_models": TestConsensusThreeModels,
|
||||||
"analyze_validation": AnalyzeValidationTest,
|
"analyze_validation": AnalyzeValidationTest,
|
||||||
"prompt_size_limit_bug": PromptSizeLimitBugTest,
|
"prompt_size_limit_bug": PromptSizeLimitBugTest,
|
||||||
@@ -108,7 +108,7 @@ __all__ = [
|
|||||||
"VisionCapabilityTest",
|
"VisionCapabilityTest",
|
||||||
"XAIModelsTest",
|
"XAIModelsTest",
|
||||||
"TestConsensusConversation",
|
"TestConsensusConversation",
|
||||||
"TestConsensusStance",
|
"TestConsensusWorkflowAccurate",
|
||||||
"TestConsensusThreeModels",
|
"TestConsensusThreeModels",
|
||||||
"AnalyzeValidationTest",
|
"AnalyzeValidationTest",
|
||||||
"PromptSizeLimitBugTest",
|
"PromptSizeLimitBugTest",
|
||||||
|
|||||||
@@ -1,156 +0,0 @@
|
|||||||
"""
|
|
||||||
Test consensus tool with explicit stance arguments
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
|
|
||||||
from .base_test import BaseSimulatorTest
|
|
||||||
|
|
||||||
|
|
||||||
class TestConsensusStance(BaseSimulatorTest):
|
|
||||||
"""Test consensus tool functionality with stance steering"""
|
|
||||||
|
|
||||||
@property
|
|
||||||
def test_name(self) -> str:
|
|
||||||
return "consensus_stance"
|
|
||||||
|
|
||||||
@property
|
|
||||||
def test_description(self) -> str:
|
|
||||||
return "Test consensus tool with stance steering (for/against/neutral)"
|
|
||||||
|
|
||||||
def run_test(self) -> bool:
|
|
||||||
"""Run consensus stance test"""
|
|
||||||
try:
|
|
||||||
self.logger.info("Testing consensus tool with ModelConfig objects and custom stance prompts")
|
|
||||||
|
|
||||||
# Send request with full two-model consensus
|
|
||||||
response, continuation_id = self.call_mcp_tool(
|
|
||||||
"consensus",
|
|
||||||
{
|
|
||||||
"prompt": "Add pizza button: good idea?",
|
|
||||||
"models": [
|
|
||||||
{
|
|
||||||
"model": "flash",
|
|
||||||
"stance": "for",
|
|
||||||
"stance_prompt": "Focus on user engagement benefits.",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"model": "flash",
|
|
||||||
"stance": "against",
|
|
||||||
"stance_prompt": "Focus on technical complexity issues.",
|
|
||||||
},
|
|
||||||
],
|
|
||||||
"model": "flash",
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
# Validate response
|
|
||||||
if not response:
|
|
||||||
self.logger.error("Failed to get response from consensus tool")
|
|
||||||
return False
|
|
||||||
|
|
||||||
self.logger.info(f"Consensus response preview: {response[:500]}...")
|
|
||||||
|
|
||||||
# Parse the JSON response
|
|
||||||
try:
|
|
||||||
consensus_data = json.loads(response)
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
self.logger.error(f"Failed to parse consensus response as JSON: {response}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Validate consensus structure
|
|
||||||
if "status" not in consensus_data:
|
|
||||||
self.logger.error("Missing 'status' field in consensus response")
|
|
||||||
return False
|
|
||||||
|
|
||||||
if consensus_data["status"] != "consensus_success":
|
|
||||||
self.logger.error(f"Consensus failed with status: {consensus_data['status']}")
|
|
||||||
|
|
||||||
# Log additional error details for debugging
|
|
||||||
if "error" in consensus_data:
|
|
||||||
self.logger.error(f"Error message: {consensus_data['error']}")
|
|
||||||
if "models_errored" in consensus_data:
|
|
||||||
self.logger.error(f"Models that errored: {consensus_data['models_errored']}")
|
|
||||||
if "models_skipped" in consensus_data:
|
|
||||||
self.logger.error(f"Models skipped: {consensus_data['models_skipped']}")
|
|
||||||
if "next_steps" in consensus_data:
|
|
||||||
self.logger.error(f"Suggested next steps: {consensus_data['next_steps']}")
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Check that both models were used with their stances
|
|
||||||
if "models_used" not in consensus_data:
|
|
||||||
self.logger.error("Missing 'models_used' field in consensus response")
|
|
||||||
return False
|
|
||||||
|
|
||||||
models_used = consensus_data["models_used"]
|
|
||||||
if len(models_used) != 2:
|
|
||||||
self.logger.error(f"Expected 2 models, got {len(models_used)}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
if "flash:for" not in models_used:
|
|
||||||
self.logger.error("Missing 'flash:for' in models_used")
|
|
||||||
return False
|
|
||||||
|
|
||||||
if "flash:against" not in models_used:
|
|
||||||
self.logger.error("Missing 'flash:against' in models_used")
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Validate responses structure
|
|
||||||
if "responses" not in consensus_data:
|
|
||||||
self.logger.error("Missing 'responses' field in consensus response")
|
|
||||||
return False
|
|
||||||
|
|
||||||
responses = consensus_data["responses"]
|
|
||||||
if len(responses) != 2:
|
|
||||||
self.logger.error(f"Expected 2 responses, got {len(responses)}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Check each response has the correct stance
|
|
||||||
for_response = None
|
|
||||||
against_response = None
|
|
||||||
|
|
||||||
for resp in responses:
|
|
||||||
if "stance" not in resp:
|
|
||||||
self.logger.error("Missing 'stance' field in response")
|
|
||||||
return False
|
|
||||||
|
|
||||||
if resp["stance"] == "for":
|
|
||||||
for_response = resp
|
|
||||||
elif resp["stance"] == "against":
|
|
||||||
against_response = resp
|
|
||||||
|
|
||||||
# Verify we got both stances
|
|
||||||
if not for_response:
|
|
||||||
self.logger.error("Missing 'for' stance response")
|
|
||||||
return False
|
|
||||||
|
|
||||||
if not against_response:
|
|
||||||
self.logger.error("Missing 'against' stance response")
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Check that successful responses have verdicts
|
|
||||||
if for_response.get("status") == "success":
|
|
||||||
if "verdict" not in for_response:
|
|
||||||
self.logger.error("Missing 'verdict' in for_response")
|
|
||||||
return False
|
|
||||||
self.logger.info(f"FOR stance verdict preview: {for_response['verdict'][:200]}...")
|
|
||||||
|
|
||||||
if against_response.get("status") == "success":
|
|
||||||
if "verdict" not in against_response:
|
|
||||||
self.logger.error("Missing 'verdict' in against_response")
|
|
||||||
return False
|
|
||||||
self.logger.info(f"AGAINST stance verdict preview: {against_response['verdict'][:200]}...")
|
|
||||||
|
|
||||||
# Verify synthesis guidance is present
|
|
||||||
if "next_steps" not in consensus_data:
|
|
||||||
self.logger.error("Missing 'next_steps' field in consensus response")
|
|
||||||
return False
|
|
||||||
|
|
||||||
self.logger.info("✓ Consensus tool successfully processed two-model consensus with stance steering")
|
|
||||||
|
|
||||||
return True
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"Test failed with exception: {str(e)}")
|
|
||||||
return False
|
|
||||||
226
simulator_tests/test_consensus_workflow_accurate.py
Normal file
226
simulator_tests/test_consensus_workflow_accurate.py
Normal file
@@ -0,0 +1,226 @@
|
|||||||
|
"""
|
||||||
|
Accurate Consensus Workflow Test
|
||||||
|
|
||||||
|
This test validates the complete consensus workflow step-by-step to ensure:
|
||||||
|
1. Step 1: Claude provides its own analysis
|
||||||
|
2. Step 2: Tool consults first model and returns response to Claude
|
||||||
|
3. Step 3: Tool consults second model and returns response to Claude
|
||||||
|
4. Step 4: Claude synthesizes all perspectives
|
||||||
|
|
||||||
|
This replaces the old faulty test that used non-workflow parameters.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
from .conversation_base_test import ConversationBaseTest
|
||||||
|
|
||||||
|
|
||||||
|
class TestConsensusWorkflowAccurate(ConversationBaseTest):
|
||||||
|
"""Test complete consensus workflow with accurate step-by-step behavior"""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def test_name(self) -> str:
|
||||||
|
return "consensus_workflow_accurate"
|
||||||
|
|
||||||
|
@property
|
||||||
|
def test_description(self) -> str:
|
||||||
|
return "Test NEW efficient consensus workflow: 2 models = 2 steps (Claude+model1, model2+synthesis)"
|
||||||
|
|
||||||
|
def run_test(self) -> bool:
|
||||||
|
"""Run complete consensus workflow test"""
|
||||||
|
# Set up the test environment
|
||||||
|
self.setUp()
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.logger.info("Testing complete consensus workflow step-by-step")
|
||||||
|
self.logger.info("Expected NEW flow: Step1(Claude+Model1) -> Step2(Model2+Synthesis)")
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# STEP 1: Claude analysis + first model consultation
|
||||||
|
# ============================================================================
|
||||||
|
self.logger.info("=== STEP 1: Claude analysis + flash:for consultation ===")
|
||||||
|
|
||||||
|
step1_response, continuation_id = self.call_mcp_tool_direct(
|
||||||
|
"consensus",
|
||||||
|
{
|
||||||
|
"step": "Should we add a new AI-powered search feature to our application? Please analyze the technical feasibility, user value, and implementation complexity.",
|
||||||
|
"step_number": 1,
|
||||||
|
"total_steps": 2, # 2 models (each step includes consultation + analysis)
|
||||||
|
"next_step_required": True,
|
||||||
|
"findings": "Initial assessment of AI search feature proposal considering user needs, technical constraints, and business value.",
|
||||||
|
"models": [
|
||||||
|
{
|
||||||
|
"model": "flash",
|
||||||
|
"stance": "for",
|
||||||
|
"stance_prompt": "Focus on innovation benefits and competitive advantages.",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "flash",
|
||||||
|
"stance": "against",
|
||||||
|
"stance_prompt": "Focus on implementation complexity and resource requirements.",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
"model": "flash", # Claude's execution model
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
if not step1_response:
|
||||||
|
self.logger.error("Step 1 failed - no response")
|
||||||
|
return False
|
||||||
|
|
||||||
|
step1_data = json.loads(step1_response)
|
||||||
|
self.logger.info(f"Step 1 status: {step1_data.get('status')}")
|
||||||
|
|
||||||
|
# Validate step 1 response (should include Claude's analysis + first model consultation)
|
||||||
|
if step1_data.get("status") != "analysis_and_first_model_consulted":
|
||||||
|
self.logger.error(
|
||||||
|
f"Expected status 'analysis_and_first_model_consulted', got: {step1_data.get('status')}"
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
|
if step1_data.get("step_number") != 1:
|
||||||
|
self.logger.error(f"Expected step_number 1, got: {step1_data.get('step_number')}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
if not step1_data.get("next_step_required"):
|
||||||
|
self.logger.error("Expected next_step_required=True for step 1")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Verify Claude's analysis is included
|
||||||
|
if "claude_analysis" not in step1_data:
|
||||||
|
self.logger.error("Expected claude_analysis in step 1 response")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Verify first model response is included
|
||||||
|
if "model_response" not in step1_data:
|
||||||
|
self.logger.error("Expected model_response in step 1 response")
|
||||||
|
return False
|
||||||
|
|
||||||
|
model1_response = step1_data["model_response"]
|
||||||
|
if model1_response.get("model") != "flash" or model1_response.get("stance") != "for":
|
||||||
|
self.logger.error(
|
||||||
|
f"Expected flash:for model response in step 1, got: {model1_response.get('model')}:{model1_response.get('stance')}"
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
|
self.logger.info("✓ Step 1 completed - Claude analysis + first model (flash:for) consulted")
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# STEP 2: Final step - second model consultation + synthesis
|
||||||
|
# ============================================================================
|
||||||
|
self.logger.info("=== STEP 2: Final step - second model (flash:against) + synthesis ===")
|
||||||
|
|
||||||
|
step2_response, _ = self.call_mcp_tool_direct(
|
||||||
|
"consensus",
|
||||||
|
{
|
||||||
|
"step": "I need to review the second model's perspective and provide final synthesis.",
|
||||||
|
"step_number": 2,
|
||||||
|
"total_steps": 2,
|
||||||
|
"next_step_required": False, # Final step
|
||||||
|
"findings": "Analyzed first model's 'for' perspective. Now ready for second model's 'against' stance and final synthesis.",
|
||||||
|
"continuation_id": continuation_id,
|
||||||
|
"model": "flash",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
if not step2_response:
|
||||||
|
self.logger.error("Step 2 failed - no response")
|
||||||
|
return False
|
||||||
|
|
||||||
|
self.logger.info(f"Step 2 raw response: {step2_response[:500]}...")
|
||||||
|
step2_data = json.loads(step2_response)
|
||||||
|
self.logger.info(f"Step 2 status: {step2_data.get('status')}")
|
||||||
|
|
||||||
|
# Validate step 2 - should show consensus completion
|
||||||
|
if step2_data.get("status") != "consensus_workflow_complete":
|
||||||
|
self.logger.error(f"Expected status 'consensus_workflow_complete', got: {step2_data.get('status')}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
if step2_data.get("model_consulted") != "flash":
|
||||||
|
self.logger.error(f"Expected model_consulted 'flash', got: {step2_data.get('model_consulted')}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
if step2_data.get("model_stance") != "against":
|
||||||
|
self.logger.error(f"Expected model_stance 'against', got: {step2_data.get('model_stance')}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Verify model response is included
|
||||||
|
if "model_response" not in step2_data:
|
||||||
|
self.logger.error("Expected model_response in step 2")
|
||||||
|
return False
|
||||||
|
|
||||||
|
model2_response = step2_data["model_response"]
|
||||||
|
if model2_response.get("model") != "flash":
|
||||||
|
self.logger.error(f"Expected model_response.model 'flash', got: {model2_response.get('model')}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Verify consensus completion data
|
||||||
|
if not step2_data.get("consensus_complete"):
|
||||||
|
self.logger.error("Expected consensus_complete=True in final step")
|
||||||
|
return False
|
||||||
|
|
||||||
|
if "complete_consensus" not in step2_data:
|
||||||
|
self.logger.error("Expected complete_consensus data in final step")
|
||||||
|
return False
|
||||||
|
|
||||||
|
self.logger.info("✓ Step 2 completed - Second model (flash:against) consulted and consensus complete")
|
||||||
|
self.logger.info(f"Model 2 verdict preview: {model2_response.get('verdict', 'No verdict')[:100]}...")
|
||||||
|
|
||||||
|
# Validate final consensus completion data
|
||||||
|
complete_consensus = step2_data["complete_consensus"]
|
||||||
|
if complete_consensus.get("total_responses") != 2:
|
||||||
|
self.logger.error(f"Expected 2 model responses, got: {complete_consensus.get('total_responses')}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
models_consulted = complete_consensus.get("models_consulted", [])
|
||||||
|
expected_models = ["flash:for", "flash:against"]
|
||||||
|
if models_consulted != expected_models:
|
||||||
|
self.logger.error(f"Expected models {expected_models}, got: {models_consulted}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# VALIDATION: Check accumulated responses are available
|
||||||
|
# ============================================================================
|
||||||
|
self.logger.info("=== VALIDATION: Checking accumulated responses ===")
|
||||||
|
|
||||||
|
if "accumulated_responses" not in step2_data:
|
||||||
|
self.logger.error("Expected accumulated_responses in final step")
|
||||||
|
return False
|
||||||
|
|
||||||
|
accumulated = step2_data["accumulated_responses"]
|
||||||
|
if len(accumulated) != 2:
|
||||||
|
self.logger.error(f"Expected 2 accumulated responses, got: {len(accumulated)}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Verify first response (flash:for)
|
||||||
|
response1 = accumulated[0]
|
||||||
|
if response1.get("model") != "flash" or response1.get("stance") != "for":
|
||||||
|
self.logger.error(f"First response incorrect: {response1}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Verify second response (flash:against)
|
||||||
|
response2 = accumulated[1]
|
||||||
|
if response2.get("model") != "flash" or response2.get("stance") != "against":
|
||||||
|
self.logger.error(f"Second response incorrect: {response2}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
self.logger.info("✓ All accumulated responses validated")
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# SUCCESS
|
||||||
|
# ============================================================================
|
||||||
|
self.logger.info("🎉 CONSENSUS WORKFLOW TEST PASSED")
|
||||||
|
self.logger.info("✓ Step 1: Claude analysis + first model (flash:for) consulted")
|
||||||
|
self.logger.info("✓ Step 2: Second model (flash:against) consulted + synthesis completed")
|
||||||
|
self.logger.info("✓ All model responses accumulated correctly")
|
||||||
|
self.logger.info("✓ New efficient workflow: 2 models = 2 steps (not 4)")
|
||||||
|
self.logger.info("✓ Workflow progression validated at each step")
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(f"Consensus workflow test failed with exception: {str(e)}")
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
self.logger.error(f"Traceback: {traceback.format_exc()}")
|
||||||
|
return False
|
||||||
@@ -23,15 +23,22 @@ PERSPECTIVE FRAMEWORK
|
|||||||
{stance_prompt}
|
{stance_prompt}
|
||||||
|
|
||||||
IF MORE INFORMATION IS NEEDED
|
IF MORE INFORMATION IS NEEDED
|
||||||
If you need additional context (e.g., related files, system architecture, requirements, code snippets) to provide thorough
|
IMPORTANT: Only request files for TECHNICAL IMPLEMENTATION questions where you need to see actual code, architecture,
|
||||||
analysis or response, you MUST ONLY respond with this exact JSON (and nothing else). Do NOT ask for the same file you've
|
or technical specifications. For business strategy, product decisions, or conceptual questions, provide analysis based
|
||||||
been provided unless for some reason its content is missing or incomplete:
|
on the information given rather than requesting technical files.
|
||||||
|
|
||||||
|
If you need additional technical context (e.g., related files, system architecture, requirements, code snippets) to
|
||||||
|
provide thorough analysis of TECHNICAL IMPLEMENTATION details, you MUST ONLY respond with this exact JSON (and nothing else).
|
||||||
|
Do NOT ask for the same file you've been provided unless for some reason its content is missing or incomplete:
|
||||||
{
|
{
|
||||||
"status": "files_required_to_continue",
|
"status": "files_required_to_continue",
|
||||||
"mandatory_instructions": "<your critical instructions for Claude>",
|
"mandatory_instructions": "<your critical instructions for Claude>",
|
||||||
"files_needed": ["[file name here]", "[or some folder/]"]
|
"files_needed": ["[file name here]", "[or some folder/]"]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
For business strategy, product planning, or conceptual questions, proceed with analysis using your expertise and the
|
||||||
|
context provided, even if specific technical details are not available.
|
||||||
|
|
||||||
EVALUATION FRAMEWORK
|
EVALUATION FRAMEWORK
|
||||||
Assess the proposal across these critical dimensions. Your stance influences HOW you present findings, not WHETHER you
|
Assess the proposal across these critical dimensions. Your stance influences HOW you present findings, not WHETHER you
|
||||||
acknowledge fundamental truths about feasibility, safety, or value:
|
acknowledge fundamental truths about feasibility, safety, or value:
|
||||||
|
|||||||
16
test_simulation_files/config.json
Normal file
16
test_simulation_files/config.json
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
{
|
||||||
|
"database": {
|
||||||
|
"host": "localhost",
|
||||||
|
"port": 5432,
|
||||||
|
"name": "testdb",
|
||||||
|
"ssl": true
|
||||||
|
},
|
||||||
|
"cache": {
|
||||||
|
"redis_url": "redis://localhost:6379",
|
||||||
|
"ttl": 3600
|
||||||
|
},
|
||||||
|
"logging": {
|
||||||
|
"level": "INFO",
|
||||||
|
"format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||||
|
}
|
||||||
|
}
|
||||||
32
test_simulation_files/test_module.py
Normal file
32
test_simulation_files/test_module.py
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
"""
|
||||||
|
Sample Python module for testing MCP conversation continuity
|
||||||
|
"""
|
||||||
|
|
||||||
|
def fibonacci(n):
|
||||||
|
"""Calculate fibonacci number recursively"""
|
||||||
|
if n <= 1:
|
||||||
|
return n
|
||||||
|
return fibonacci(n-1) + fibonacci(n-2)
|
||||||
|
|
||||||
|
def factorial(n):
|
||||||
|
"""Calculate factorial iteratively"""
|
||||||
|
result = 1
|
||||||
|
for i in range(1, n + 1):
|
||||||
|
result *= i
|
||||||
|
return result
|
||||||
|
|
||||||
|
class Calculator:
|
||||||
|
"""Simple calculator class"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.history = []
|
||||||
|
|
||||||
|
def add(self, a, b):
|
||||||
|
result = a + b
|
||||||
|
self.history.append(f"{a} + {b} = {result}")
|
||||||
|
return result
|
||||||
|
|
||||||
|
def multiply(self, a, b):
|
||||||
|
result = a * b
|
||||||
|
self.history.append(f"{a} * {b} = {result}")
|
||||||
|
return result
|
||||||
@@ -2,8 +2,7 @@
|
|||||||
Tests for the Consensus tool using WorkflowTool architecture.
|
Tests for the Consensus tool using WorkflowTool architecture.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
from unittest.mock import Mock
|
||||||
from unittest.mock import Mock, patch
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
@@ -219,180 +218,112 @@ class TestConsensusTool:
|
|||||||
assert tool.should_call_expert_analysis({}) is False
|
assert tool.should_call_expert_analysis({}) is False
|
||||||
assert tool.requires_expert_analysis() is False
|
assert tool.requires_expert_analysis() is False
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
def test_execute_workflow_step1_basic(self):
|
||||||
async def test_execute_workflow_step1(self):
|
"""Test basic workflow validation for step 1."""
|
||||||
"""Test workflow execution for step 1."""
|
|
||||||
tool = ConsensusTool()
|
tool = ConsensusTool()
|
||||||
|
|
||||||
|
# Test that step 1 sets up the workflow correctly
|
||||||
arguments = {
|
arguments = {
|
||||||
"step": "Initial analysis of proposal",
|
"step": "Initial analysis of proposal",
|
||||||
"step_number": 1,
|
"step_number": 1,
|
||||||
"total_steps": 4,
|
"total_steps": 2,
|
||||||
"next_step_required": True,
|
"next_step_required": True,
|
||||||
"findings": "Found pros and cons",
|
"findings": "Found pros and cons",
|
||||||
"confidence": "medium",
|
|
||||||
"models": [{"model": "flash", "stance": "neutral"}, {"model": "o3-mini", "stance": "for"}],
|
"models": [{"model": "flash", "stance": "neutral"}, {"model": "o3-mini", "stance": "for"}],
|
||||||
"relevant_files": ["/proposal.md"],
|
|
||||||
}
|
}
|
||||||
|
|
||||||
with patch.object(tool, "is_effective_auto_mode", return_value=False):
|
# Verify models_to_consult is set correctly from step 1
|
||||||
with patch.object(tool, "get_model_provider", return_value=Mock()):
|
request = tool.get_workflow_request_model()(**arguments)
|
||||||
result = await tool.execute_workflow(arguments)
|
assert len(request.models) == 2
|
||||||
|
assert request.models[0]["model"] == "flash"
|
||||||
|
assert request.models[1]["model"] == "o3-mini"
|
||||||
|
|
||||||
assert len(result) == 1
|
def test_execute_workflow_total_steps_calculation(self):
|
||||||
response_text = result[0].text
|
"""Test that total_steps is calculated correctly from models."""
|
||||||
response_data = json.loads(response_text)
|
|
||||||
|
|
||||||
# Verify step 1 response structure
|
|
||||||
assert response_data["status"] == "consulting_models"
|
|
||||||
assert response_data["step_number"] == 1
|
|
||||||
assert "continuation_id" in response_data
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_execute_workflow_model_consultation(self):
|
|
||||||
"""Test workflow execution for model consultation steps."""
|
|
||||||
tool = ConsensusTool()
|
tool = ConsensusTool()
|
||||||
tool.models_to_consult = [{"model": "flash", "stance": "neutral"}, {"model": "o3-mini", "stance": "for"}]
|
|
||||||
tool.initial_prompt = "Test prompt"
|
|
||||||
|
|
||||||
|
# Test with 2 models
|
||||||
arguments = {
|
arguments = {
|
||||||
"step": "Processing model response",
|
"step": "Initial analysis",
|
||||||
"step_number": 2,
|
"step_number": 1,
|
||||||
"total_steps": 4,
|
"total_steps": 4, # This should be corrected to 2
|
||||||
"next_step_required": True,
|
"next_step_required": True,
|
||||||
"findings": "Model provided perspective",
|
"findings": "Analysis complete",
|
||||||
"confidence": "medium",
|
"models": [{"model": "flash", "stance": "neutral"}, {"model": "o3-mini", "stance": "for"}],
|
||||||
"continuation_id": "test-id",
|
|
||||||
"current_model_index": 0,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# Mock the _consult_model method instead to return a proper dict
|
request = tool.get_workflow_request_model()(**arguments)
|
||||||
mock_model_response = {
|
# The tool should set total_steps = len(models) = 2
|
||||||
"model": "flash",
|
assert len(request.models) == 2
|
||||||
"stance": "neutral",
|
|
||||||
"status": "success",
|
def test_consult_model_basic_structure(self):
|
||||||
"verdict": "Model analysis response",
|
"""Test basic model consultation structure."""
|
||||||
"metadata": {"provider": "gemini"},
|
tool = ConsensusTool()
|
||||||
|
|
||||||
|
# Test that _get_stance_enhanced_prompt works
|
||||||
|
for_prompt = tool._get_stance_enhanced_prompt("for")
|
||||||
|
against_prompt = tool._get_stance_enhanced_prompt("against")
|
||||||
|
neutral_prompt = tool._get_stance_enhanced_prompt("neutral")
|
||||||
|
|
||||||
|
assert "SUPPORTIVE PERSPECTIVE" in for_prompt
|
||||||
|
assert "CRITICAL PERSPECTIVE" in against_prompt
|
||||||
|
assert "BALANCED PERSPECTIVE" in neutral_prompt
|
||||||
|
|
||||||
|
def test_model_configuration_validation(self):
|
||||||
|
"""Test model configuration validation."""
|
||||||
|
tool = ConsensusTool()
|
||||||
|
|
||||||
|
# Test single model config
|
||||||
|
models = [{"model": "flash", "stance": "neutral"}]
|
||||||
|
arguments = {
|
||||||
|
"step": "Test",
|
||||||
|
"step_number": 1,
|
||||||
|
"total_steps": 1,
|
||||||
|
"next_step_required": False,
|
||||||
|
"findings": "Test findings",
|
||||||
|
"models": models,
|
||||||
}
|
}
|
||||||
|
|
||||||
with patch.object(tool, "_consult_model", return_value=mock_model_response):
|
request = tool.get_workflow_request_model()(**arguments)
|
||||||
result = await tool.execute_workflow(arguments)
|
assert len(request.models) == 1
|
||||||
|
assert request.models[0]["model"] == "flash"
|
||||||
assert len(result) == 1
|
assert request.models[0]["stance"] == "neutral"
|
||||||
response_text = result[0].text
|
|
||||||
response_data = json.loads(response_text)
|
|
||||||
|
|
||||||
# Verify model consultation response
|
|
||||||
assert response_data["status"] == "model_consulted"
|
|
||||||
assert response_data["model_consulted"] == "flash"
|
|
||||||
assert response_data["model_stance"] == "neutral"
|
|
||||||
assert "model_response" in response_data
|
|
||||||
assert response_data["model_response"]["status"] == "success"
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_consult_model_error_handling(self):
|
|
||||||
"""Test error handling in model consultation."""
|
|
||||||
tool = ConsensusTool()
|
|
||||||
tool.initial_prompt = "Test prompt"
|
|
||||||
|
|
||||||
# Mock provider to raise an error
|
|
||||||
mock_provider = Mock()
|
|
||||||
mock_provider.generate_content.side_effect = Exception("Model error")
|
|
||||||
|
|
||||||
with patch.object(tool, "get_model_provider", return_value=mock_provider):
|
|
||||||
result = await tool._consult_model(
|
|
||||||
{"model": "test-model", "stance": "neutral"}, Mock(relevant_files=[], continuation_id=None, images=None)
|
|
||||||
)
|
|
||||||
|
|
||||||
assert result["status"] == "error"
|
|
||||||
assert result["error"] == "Model error"
|
|
||||||
assert result["model"] == "test-model"
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_consult_model_with_images(self):
|
|
||||||
"""Test model consultation with images."""
|
|
||||||
tool = ConsensusTool()
|
|
||||||
tool.initial_prompt = "Test prompt"
|
|
||||||
|
|
||||||
# Mock provider
|
|
||||||
mock_provider = Mock()
|
|
||||||
mock_response = Mock(content="Model response with image analysis")
|
|
||||||
mock_provider.generate_content.return_value = mock_response
|
|
||||||
mock_provider.get_provider_type.return_value = Mock(value="gemini")
|
|
||||||
|
|
||||||
test_images = ["/path/to/image1.png", "/path/to/image2.jpg"]
|
|
||||||
|
|
||||||
with patch.object(tool, "get_model_provider", return_value=mock_provider):
|
|
||||||
result = await tool._consult_model(
|
|
||||||
{"model": "test-model", "stance": "neutral"},
|
|
||||||
Mock(relevant_files=[], continuation_id=None, images=test_images),
|
|
||||||
)
|
|
||||||
|
|
||||||
# Verify that images were passed to generate_content
|
|
||||||
mock_provider.generate_content.assert_called_once()
|
|
||||||
call_args = mock_provider.generate_content.call_args
|
|
||||||
assert call_args.kwargs.get("images") == test_images
|
|
||||||
|
|
||||||
assert result["status"] == "success"
|
|
||||||
assert result["model"] == "test-model"
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_handle_work_completion(self):
|
|
||||||
"""Test work completion handling for consensus workflow."""
|
|
||||||
tool = ConsensusTool()
|
|
||||||
tool.initial_prompt = "Test prompt"
|
|
||||||
tool.accumulated_responses = [{"model": "flash", "stance": "neutral"}, {"model": "o3-mini", "stance": "for"}]
|
|
||||||
|
|
||||||
request = Mock(confidence="high")
|
|
||||||
response_data = {}
|
|
||||||
|
|
||||||
result = await tool.handle_work_completion(response_data, request, {})
|
|
||||||
|
|
||||||
assert result["consensus_complete"] is True
|
|
||||||
assert result["status"] == "consensus_workflow_complete"
|
|
||||||
assert "complete_consensus" in result
|
|
||||||
assert result["complete_consensus"]["models_consulted"] == ["flash:neutral", "o3-mini:for"]
|
|
||||||
assert result["complete_consensus"]["total_responses"] == 2
|
|
||||||
|
|
||||||
def test_handle_work_continuation(self):
|
def test_handle_work_continuation(self):
|
||||||
"""Test work continuation handling between steps."""
|
"""Test work continuation handling - legacy method for compatibility."""
|
||||||
tool = ConsensusTool()
|
tool = ConsensusTool()
|
||||||
tool.models_to_consult = [{"model": "flash", "stance": "neutral"}, {"model": "o3-mini", "stance": "for"}]
|
tool.models_to_consult = [{"model": "flash", "stance": "neutral"}, {"model": "o3-mini", "stance": "for"}]
|
||||||
|
|
||||||
|
# Note: In the new workflow, model consultation happens DURING steps in execute_workflow
|
||||||
|
# This method is kept for compatibility but not actively used in the step-by-step flow
|
||||||
|
|
||||||
# Test after step 1
|
# Test after step 1
|
||||||
request = Mock(step_number=1, current_model_index=0)
|
request = Mock(step_number=1, current_model_index=0)
|
||||||
response_data = {}
|
response_data = {}
|
||||||
|
|
||||||
result = tool.handle_work_continuation(response_data, request)
|
result = tool.handle_work_continuation(response_data, request)
|
||||||
assert result["status"] == "consulting_models"
|
# The method still exists but returns legacy status for compatibility
|
||||||
assert result["next_model"] == {"model": "flash", "stance": "neutral"}
|
assert "status" in result
|
||||||
|
|
||||||
# Test between model consultations
|
# Test between model consultations
|
||||||
request = Mock(step_number=2, current_model_index=1)
|
request = Mock(step_number=2, current_model_index=1)
|
||||||
response_data = {}
|
response_data = {}
|
||||||
|
|
||||||
result = tool.handle_work_continuation(response_data, request)
|
result = tool.handle_work_continuation(response_data, request)
|
||||||
assert result["status"] == "consulting_next_model"
|
assert "status" in result
|
||||||
assert result["next_model"] == {"model": "o3-mini", "stance": "for"}
|
|
||||||
assert result["models_remaining"] == 1
|
|
||||||
|
|
||||||
def test_customize_workflow_response(self):
|
def test_customize_workflow_response(self):
|
||||||
"""Test response customization for consensus workflow."""
|
"""Test response customization for consensus workflow."""
|
||||||
tool = ConsensusTool()
|
tool = ConsensusTool()
|
||||||
tool.accumulated_responses = [{"model": "test", "response": "data"}]
|
tool.accumulated_responses = [{"model": "test", "response": "data"}]
|
||||||
|
|
||||||
# Test different step numbers
|
# Test different step numbers (new workflow: 2 models = 2 steps)
|
||||||
request = Mock(step_number=1, total_steps=4)
|
request = Mock(step_number=1, total_steps=2)
|
||||||
response_data = {}
|
response_data = {}
|
||||||
result = tool.customize_workflow_response(response_data, request)
|
result = tool.customize_workflow_response(response_data, request)
|
||||||
assert result["consensus_workflow_status"] == "initial_analysis_complete"
|
assert result["consensus_workflow_status"] == "initial_analysis_complete"
|
||||||
|
|
||||||
request = Mock(step_number=2, total_steps=4)
|
request = Mock(step_number=2, total_steps=2)
|
||||||
response_data = {}
|
|
||||||
result = tool.customize_workflow_response(response_data, request)
|
|
||||||
assert result["consensus_workflow_status"] == "consulting_models"
|
|
||||||
|
|
||||||
request = Mock(step_number=4, total_steps=4)
|
|
||||||
response_data = {}
|
response_data = {}
|
||||||
result = tool.customize_workflow_response(response_data, request)
|
result = tool.customize_workflow_response(response_data, request)
|
||||||
assert result["consensus_workflow_status"] == "ready_for_synthesis"
|
assert result["consensus_workflow_status"] == "ready_for_synthesis"
|
||||||
|
|||||||
@@ -48,8 +48,9 @@ CONSENSUS_WORKFLOW_FIELD_DESCRIPTIONS = {
|
|||||||
"steps 2+ are for processing individual model responses."
|
"steps 2+ are for processing individual model responses."
|
||||||
),
|
),
|
||||||
"total_steps": (
|
"total_steps": (
|
||||||
"Total number of steps needed. This equals 1 (your analysis) + number of models to consult + "
|
"Total number of steps needed. This equals the number of models to consult. "
|
||||||
"1 (final synthesis)."
|
"Step 1 includes your analysis + first model consultation on return of the call. Final step includes "
|
||||||
|
"last model consultation + synthesis."
|
||||||
),
|
),
|
||||||
"next_step_required": ("Set to true if more models need to be consulted. False when ready for final synthesis."),
|
"next_step_required": ("Set to true if more models need to be consulted. False when ready for final synthesis."),
|
||||||
"findings": (
|
"findings": (
|
||||||
@@ -182,7 +183,7 @@ class ConsensusTool(WorkflowTool):
|
|||||||
"IMPORTANT: This workflow enforces sequential model consultation:\\n"
|
"IMPORTANT: This workflow enforces sequential model consultation:\\n"
|
||||||
"- Step 1 is always your independent analysis\\n"
|
"- Step 1 is always your independent analysis\\n"
|
||||||
"- Each subsequent step processes one model response\\n"
|
"- Each subsequent step processes one model response\\n"
|
||||||
"- Total steps = 1 (your analysis) + number of models + 1 (synthesis)\\n"
|
"- Total steps = number of models (each step includes consultation + response)\\n"
|
||||||
"- Models can have stances (for/against/neutral) for structured debate\\n"
|
"- Models can have stances (for/against/neutral) for structured debate\\n"
|
||||||
"- Same model can be used multiple times with different stances\\n"
|
"- Same model can be used multiple times with different stances\\n"
|
||||||
"- Each model + stance combination must be unique\\n\\n"
|
"- Each model + stance combination must be unique\\n\\n"
|
||||||
@@ -435,15 +436,16 @@ of the evidence, even when it strongly points in one direction.""",
|
|||||||
self.initial_prompt = request.step
|
self.initial_prompt = request.step
|
||||||
self.models_to_consult = request.models or []
|
self.models_to_consult = request.models or []
|
||||||
self.accumulated_responses = []
|
self.accumulated_responses = []
|
||||||
# Set total steps: 1 (Claude) + len(models) + 1 (synthesis)
|
# Set total steps: len(models) (each step includes consultation + response)
|
||||||
request.total_steps = 1 + len(self.models_to_consult) + 1
|
request.total_steps = len(self.models_to_consult)
|
||||||
|
|
||||||
|
# For all steps (1 through total_steps), consult the corresponding model
|
||||||
|
if request.step_number <= request.total_steps:
|
||||||
|
# Calculate which model to consult for this step
|
||||||
|
model_idx = request.step_number - 1 # 0-based index
|
||||||
|
|
||||||
# If this is a model consultation step (2 through total_steps-1)
|
|
||||||
elif request.step_number > 1 and request.step_number < request.total_steps:
|
|
||||||
# Get the current model to consult
|
|
||||||
model_idx = request.current_model_index or 0
|
|
||||||
if model_idx < len(self.models_to_consult):
|
if model_idx < len(self.models_to_consult):
|
||||||
# Consult the model
|
# Consult the model for this step
|
||||||
model_response = await self._consult_model(self.models_to_consult[model_idx], request)
|
model_response = await self._consult_model(self.models_to_consult[model_idx], request)
|
||||||
|
|
||||||
# Add to accumulated responses
|
# Add to accumulated responses
|
||||||
@@ -458,23 +460,48 @@ of the evidence, even when it strongly points in one direction.""",
|
|||||||
"model_stance": model_response.get("stance", "neutral"),
|
"model_stance": model_response.get("stance", "neutral"),
|
||||||
"model_response": model_response,
|
"model_response": model_response,
|
||||||
"current_model_index": model_idx + 1,
|
"current_model_index": model_idx + 1,
|
||||||
"next_step_required": request.step_number < request.total_steps - 1,
|
"next_step_required": request.step_number < request.total_steps,
|
||||||
}
|
}
|
||||||
|
|
||||||
if request.step_number < request.total_steps - 1:
|
# Add Claude's analysis to step 1
|
||||||
|
if request.step_number == 1:
|
||||||
|
response_data["claude_analysis"] = {
|
||||||
|
"initial_analysis": request.step,
|
||||||
|
"findings": request.findings,
|
||||||
|
}
|
||||||
|
response_data["status"] = "analysis_and_first_model_consulted"
|
||||||
|
|
||||||
|
# Check if this is the final step
|
||||||
|
if request.step_number == request.total_steps:
|
||||||
|
response_data["status"] = "consensus_workflow_complete"
|
||||||
|
response_data["consensus_complete"] = True
|
||||||
|
response_data["complete_consensus"] = {
|
||||||
|
"initial_prompt": self.initial_prompt,
|
||||||
|
"models_consulted": [
|
||||||
|
f"{m['model']}:{m.get('stance', 'neutral')}" for m in self.accumulated_responses
|
||||||
|
],
|
||||||
|
"total_responses": len(self.accumulated_responses),
|
||||||
|
"consensus_confidence": "high",
|
||||||
|
}
|
||||||
|
response_data["next_steps"] = (
|
||||||
|
"CONSENSUS GATHERING IS COMPLETE. Synthesize all perspectives and present:\n"
|
||||||
|
"1. Key points of AGREEMENT across models\n"
|
||||||
|
"2. Key points of DISAGREEMENT and why they differ\n"
|
||||||
|
"3. Your final consolidated recommendation\n"
|
||||||
|
"4. Specific, actionable next steps for implementation\n"
|
||||||
|
"5. Critical risks or concerns that must be addressed"
|
||||||
|
)
|
||||||
|
else:
|
||||||
response_data["next_steps"] = (
|
response_data["next_steps"] = (
|
||||||
f"Model {model_response['model']} has provided its {model_response.get('stance', 'neutral')} "
|
f"Model {model_response['model']} has provided its {model_response.get('stance', 'neutral')} "
|
||||||
f"perspective. Please analyze this response and call {self.get_name()} again with:\n"
|
f"perspective. Please analyze this response and call {self.get_name()} again with:\n"
|
||||||
f"- step_number: {request.step_number + 1}\n"
|
f"- step_number: {request.step_number + 1}\n"
|
||||||
f"- findings: Summarize key points from this model's response\n"
|
f"- findings: Summarize key points from this model's response"
|
||||||
f"- current_model_index: {model_idx + 1}\n"
|
|
||||||
f"- model_responses: (append this response to the list)"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
response_data["next_steps"] = (
|
|
||||||
"All models have been consulted. For the final step, synthesize all perspectives."
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Add accumulated responses for tracking
|
||||||
|
response_data["accumulated_responses"] = self.accumulated_responses
|
||||||
|
|
||||||
return [TextContent(type="text", text=json.dumps(response_data, indent=2))]
|
return [TextContent(type="text", text=json.dumps(response_data, indent=2))]
|
||||||
|
|
||||||
# Otherwise, use standard workflow execution
|
# Otherwise, use standard workflow execution
|
||||||
@@ -520,6 +547,7 @@ of the evidence, even when it strongly points in one direction.""",
|
|||||||
"verdict": response.content,
|
"verdict": response.content,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"provider": provider.get_provider_type().value,
|
"provider": provider.get_provider_type().value,
|
||||||
|
"model_name": model_name,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -627,8 +655,90 @@ of the evidence, even when it strongly points in one direction.""",
|
|||||||
else:
|
else:
|
||||||
response_data["consensus_workflow_status"] = "ready_for_synthesis"
|
response_data["consensus_workflow_status"] = "ready_for_synthesis"
|
||||||
|
|
||||||
|
# Customize metadata for consensus workflow
|
||||||
|
self._customize_consensus_metadata(response_data, request)
|
||||||
|
|
||||||
return response_data
|
return response_data
|
||||||
|
|
||||||
|
def _customize_consensus_metadata(self, response_data: dict, request) -> None:
|
||||||
|
"""
|
||||||
|
Customize metadata for consensus workflow to accurately reflect multi-model nature.
|
||||||
|
|
||||||
|
The default workflow metadata shows the model running Claude's analysis steps,
|
||||||
|
but consensus is a multi-model tool that consults different models. We need
|
||||||
|
to provide accurate metadata that reflects this.
|
||||||
|
"""
|
||||||
|
if "metadata" not in response_data:
|
||||||
|
response_data["metadata"] = {}
|
||||||
|
|
||||||
|
metadata = response_data["metadata"]
|
||||||
|
|
||||||
|
# Always preserve tool_name
|
||||||
|
metadata["tool_name"] = self.get_name()
|
||||||
|
|
||||||
|
if request.step_number == request.total_steps:
|
||||||
|
# Final step - show comprehensive consensus metadata
|
||||||
|
models_consulted = []
|
||||||
|
if self.models_to_consult:
|
||||||
|
models_consulted = [f"{m['model']}:{m.get('stance', 'neutral')}" for m in self.models_to_consult]
|
||||||
|
|
||||||
|
metadata.update(
|
||||||
|
{
|
||||||
|
"workflow_type": "multi_model_consensus",
|
||||||
|
"models_consulted": models_consulted,
|
||||||
|
"consensus_complete": True,
|
||||||
|
"total_models": len(self.models_to_consult) if self.models_to_consult else 0,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
# Remove the misleading single model metadata
|
||||||
|
metadata.pop("model_used", None)
|
||||||
|
metadata.pop("provider_used", None)
|
||||||
|
|
||||||
|
else:
|
||||||
|
# Intermediate steps - show consensus workflow in progress
|
||||||
|
models_to_consult = []
|
||||||
|
if self.models_to_consult:
|
||||||
|
models_to_consult = [f"{m['model']}:{m.get('stance', 'neutral')}" for m in self.models_to_consult]
|
||||||
|
|
||||||
|
metadata.update(
|
||||||
|
{
|
||||||
|
"workflow_type": "multi_model_consensus",
|
||||||
|
"models_to_consult": models_to_consult,
|
||||||
|
"consultation_step": request.step_number,
|
||||||
|
"total_consultation_steps": request.total_steps,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
# Remove the misleading single model metadata that shows Claude's execution model
|
||||||
|
# instead of the models being consulted
|
||||||
|
metadata.pop("model_used", None)
|
||||||
|
metadata.pop("provider_used", None)
|
||||||
|
|
||||||
|
def _add_workflow_metadata(self, response_data: dict, arguments: dict[str, Any]) -> None:
|
||||||
|
"""
|
||||||
|
Override workflow metadata addition for consensus tool.
|
||||||
|
|
||||||
|
The consensus tool doesn't use single model metadata because it's a multi-model
|
||||||
|
workflow. Instead, we provide consensus-specific metadata that accurately
|
||||||
|
reflects the models being consulted.
|
||||||
|
"""
|
||||||
|
# Initialize metadata if not present
|
||||||
|
if "metadata" not in response_data:
|
||||||
|
response_data["metadata"] = {}
|
||||||
|
|
||||||
|
# Add basic tool metadata
|
||||||
|
response_data["metadata"]["tool_name"] = self.get_name()
|
||||||
|
|
||||||
|
# The consensus-specific metadata is already added by _customize_consensus_metadata
|
||||||
|
# which is called from customize_workflow_response. We don't add the standard
|
||||||
|
# single-model metadata (model_used, provider_used) because it's misleading
|
||||||
|
# for a multi-model consensus workflow.
|
||||||
|
|
||||||
|
logger.debug(
|
||||||
|
f"[CONSENSUS_METADATA] {self.get_name()}: Using consensus-specific metadata instead of single-model metadata"
|
||||||
|
)
|
||||||
|
|
||||||
def store_initial_issue(self, step_description: str):
|
def store_initial_issue(self, step_description: str):
|
||||||
"""Store initial prompt for model consultations."""
|
"""Store initial prompt for model consultations."""
|
||||||
self.initial_prompt = step_description
|
self.initial_prompt = step_description
|
||||||
|
|||||||
Reference in New Issue
Block a user