Add Consensus Tool for Multi-Model Perspective Gathering (#67)
* WIP: Refactor resolving of mode_names; this should be done once at the MCP call boundary, passing the model context around instead. The consensus tool lets you gather a consensus from multiple models, optionally assigning each one a "for" or "against" stance to surface nuanced responses (see the request sketch below).
* Deduplicate model resolution so model_context is available before reaching deeper parts of the code; improve the abstraction used when building conversations; throw programmer errors early.
* Guardrails: support the `model:option` format at the MCP boundary so future tools can use additional options if needed, instead of handling this only for consensus. Model names now accept an optional ":option" suffix for future use.
* Simplified async flow.
* Improved the request model to support natural language; simplified the async flow.
* Fix consensus tool async/sync patterns to match codebase standards.

  CRITICAL FIXES:
  - Converted _get_consensus_responses from async to sync (matches other tools)
  - Converted store_conversation_turn from async to sync (add_turn is synchronous)
  - Removed unnecessary asyncio imports and sleep calls
  - Fixed ClosedResourceError in the MCP protocol during long consensus operations

  PATTERN ALIGNMENT:
  - The consensus tool now follows the same sync patterns as all other tools
  - Only execute() and prepare_prompt() are async (a base class requirement)
  - All internal operations are synchronous, like analyze, chat, debug, etc.

  TESTING:
  - MCP simulation test now passes: consensus_stance ✅
  - Two-model consensus works correctly in ~35 seconds
  - Unknown stance handling defaults to neutral with warnings
  - All 9 unit tests pass (100% success rate)

  The consensus tool's async patterns were anomalous in the codebase. This fix aligns it with the established synchronous patterns used by all other tools while maintaining full functionality.

  🤖 Generated with [Claude Code](https://claude.ai/code)

  Co-Authored-By: Claude <noreply@anthropic.com>
* Fixed call order and added a new test.
* Cleanup: removed dead comments, added docs for the new tool, improved tests.

---------

Co-authored-by: Claude <noreply@anthropic.com>
Committed by: GitHub
Parent: 9b98df650b
Commit: 95556ba9ea
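For reference, a minimal sketch of the request shape the new consensus tool accepts, pieced together from the simulator tests added in this commit. The field names (`prompt`, `models`, `stance`, `stance_prompt`, `model`) come from those tests; the model names and prompt text here are only examples.

```python
import json

# Hedged sketch of a consensus request, mirroring the payloads used by the
# simulator tests below (field names from the tests; values are examples).
consensus_request = {
    "prompt": "Is a sync manager class a good idea for my CoolTodos app?",
    "models": [
        {"model": "flash", "stance": "for",
         "stance_prompt": "Focus on the architectural benefits."},
        {"model": "flash", "stance": "against",
         "stance_prompt": "Focus on complexity and maintenance costs."},
        {"model": "local-llama", "stance": "neutral"},
    ],
    "model": "flash",  # default model used for the final synthesis step
}

# The `model:option` format mentioned above also shows up in responses:
# the tests expect models_used entries such as "flash:for" and "flash:against".
print(json.dumps(consensus_request, indent=2))
```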
@@ -7,6 +7,9 @@ Each test is in its own file for better organization and maintainability.

from .base_test import BaseSimulatorTest
from .test_basic_conversation import BasicConversationTest
from .test_consensus_conversation import TestConsensusConversation
from .test_consensus_stance import TestConsensusStance
from .test_consensus_three_models import TestConsensusThreeModels
from .test_content_validation import ContentValidationTest
from .test_conversation_chain_validation import ConversationChainValidationTest
from .test_cross_tool_comprehensive import CrossToolComprehensiveTest

@@ -48,6 +51,9 @@ TEST_REGISTRY = {
    "conversation_chain_validation": ConversationChainValidationTest,
    "vision_capability": VisionCapabilityTest,
    "xai_models": XAIModelsTest,
    "consensus_conversation": TestConsensusConversation,
    "consensus_stance": TestConsensusStance,
    "consensus_three_models": TestConsensusThreeModels,
    # "o3_pro_expensive": O3ProExpensiveTest,  # COMMENTED OUT - too expensive to run by default
}

@@ -73,5 +79,8 @@ __all__ = [
    "ConversationChainValidationTest",
    "VisionCapabilityTest",
    "XAIModelsTest",
    "TestConsensusConversation",
    "TestConsensusStance",
    "TestConsensusThreeModels",
    "TEST_REGISTRY",
]
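The registry addition means the three new tests can be looked up by name like any other simulator test. A hedged sketch of such a lookup, assuming the test classes can be constructed without arguments here; the real communication simulator runner may pass options such as verbosity.

```python
# Hypothetical lookup of one of the newly registered tests; no-argument
# construction is an assumption, the real runner may pass configuration.
from simulator_tests import TEST_REGISTRY

test_cls = TEST_REGISTRY["consensus_stance"]  # -> TestConsensusStance
test = test_cls()
passed = test.run_test()
print(f"{test.test_name}: {'PASS' if passed else 'FAIL'}")
```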
@@ -136,18 +136,23 @@ class Calculator:
            self.logger.debug(f"Calling MCP tool {tool_name} with proper initialization")

            # Execute the command
            # Execute the command with proper handling for async responses
            # For consensus tool and other long-running tools, we need to ensure
            # the subprocess doesn't close prematurely
            result = subprocess.run(
                docker_cmd,
                input=input_data,
                text=True,
                capture_output=True,
                timeout=3600,  # 1 hour timeout
                check=False,  # Don't raise on non-zero exit code
            )

            if result.returncode != 0:
                self.logger.error(f"Docker exec failed: {result.stderr}")
                return None, None
                self.logger.error(f"Docker exec failed with return code {result.returncode}")
                self.logger.error(f"Stderr: {result.stderr}")
                # Still try to parse stdout as the response might have been written before the error
                self.logger.debug(f"Attempting to parse stdout despite error: {result.stdout[:500]}")

            # Parse the response - look for the tool call response
            response_data = self._parse_mcp_response(result.stdout, expected_id=2)

@@ -191,7 +196,10 @@ class Calculator:

            # If we get here, log all responses for debugging
            self.logger.warning(f"No valid tool call response found for ID {expected_id}")
            self.logger.debug(f"Full stdout: {stdout}")
            self.logger.warning(f"Full stdout: {stdout}")
            self.logger.warning(f"Total stdout lines: {len(lines)}")
            for i, line in enumerate(lines[:10]):  # Log first 10 lines
                self.logger.warning(f"Line {i}: {line[:100]}...")
            return None

        except json.JSONDecodeError as e:
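The hunk above relies on `_parse_mcp_response` to pick the tool-call reply with the expected JSON-RPC id out of whatever the server wrote to stdout. A minimal sketch of that kind of scan, assuming newline-delimited JSON-RPC messages; the real helper in this repository may handle framing differently.

```python
import json
from typing import Optional

# Hedged sketch: scan stdout for a JSON-RPC message with the expected id.
# Assumes one JSON object per line; the project's _parse_mcp_response may differ.
def find_tool_response(stdout: str, expected_id: int) -> Optional[dict]:
    for raw in stdout.splitlines():
        line = raw.strip()
        if not line.startswith("{"):
            continue  # skip log noise interleaved with protocol output
        try:
            msg = json.loads(line)
        except json.JSONDecodeError:
            continue
        if msg.get("id") == expected_id and ("result" in msg or "error" in msg):
            return msg
    return None  # caller logs the raw lines for debugging, as in the diff above
```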
simulator_tests/test_consensus_conversation.py (new file, 222 lines)
@@ -0,0 +1,222 @@
#!/usr/bin/env python3
"""
Consensus Conversation Continuation Test

Tests that the consensus tool properly handles conversation continuation
and builds conversation context correctly when using continuation_id.
"""

import json
import subprocess

from .base_test import BaseSimulatorTest


class TestConsensusConversation(BaseSimulatorTest):
    """Test consensus tool conversation continuation functionality"""

    @property
    def test_name(self) -> str:
        return "consensus_conversation"

    @property
    def test_description(self) -> str:
        return "Test consensus tool conversation building and continuation"

    def get_docker_logs(self):
        """Get Docker container logs"""
        try:
            result = subprocess.run(
                ["docker", "logs", "--tail", "100", self.container_name], capture_output=True, text=True, timeout=30
            )
            if result.returncode == 0:
                return result.stdout.split("\n")
            else:
                self.logger.warning(f"Failed to get Docker logs: {result.stderr}")
                return []
        except Exception as e:
            self.logger.warning(f"Exception getting Docker logs: {e}")
            return []

    def run_test(self) -> bool:
        """Test consensus conversation continuation"""
        try:
            self.logger.info("Testing consensus tool conversation continuation")

            # Setup test files for context
            self.setup_test_files()

            # Phase 1: Start conversation with chat tool (which properly creates continuation_id)
            self.logger.info("Phase 1: Starting conversation with chat tool")
            initial_response, continuation_id = self.call_mcp_tool(
                "chat",
                {
                    "prompt": "Please use low thinking mode. I'm working on a web application and need advice on authentication. Can you look at this code?",
                    "files": [self.test_files["python"]],
                    "model": "local-llama",
                },
            )

            # Validate initial response
            if not initial_response:
                self.logger.error("Failed to get initial chat response")
                return False

            if not continuation_id:
                self.logger.error("Failed to get continuation_id from initial chat")
                return False

            self.logger.info(f"Initial chat response preview: {initial_response[:200]}...")
            self.logger.info(f"Got continuation_id: {continuation_id}")

            # Phase 2: Use consensus with continuation_id to test conversation building
            self.logger.info("Phase 2: Using consensus with continuation_id to test conversation building")
            consensus_response, _ = self.call_mcp_tool(
                "consensus",
                {
                    "prompt": "Based on our previous discussion about authentication, I need expert consensus: Should we implement OAuth2 or stick with simple session-based auth?",
                    "models": [
                        {
                            "model": "local-llama",
                            "stance": "for",
                            "stance_prompt": "Focus on OAuth2 benefits: security, scalability, and industry standards.",
                        },
                        {
                            "model": "local-llama",
                            "stance": "against",
                            "stance_prompt": "Focus on OAuth2 complexity: implementation challenges and simpler alternatives.",
                        },
                    ],
                    "continuation_id": continuation_id,
                    "model": "local-llama",
                },
            )

            # Validate consensus response
            if not consensus_response:
                self.logger.error("Failed to get consensus response with continuation_id")
                return False

            self.logger.info(f"Consensus response preview: {consensus_response[:300]}...")

            # Log the full response for debugging if it's not JSON
            if not consensus_response.startswith("{"):
                self.logger.error(f"Consensus response is not JSON. Full response: {consensus_response}")
                return False

            # Parse consensus response
            try:
                consensus_data = json.loads(consensus_response)
            except json.JSONDecodeError:
                self.logger.error(f"Failed to parse consensus response as JSON. Full response: {consensus_response}")
                return False

            if consensus_data.get("status") != "consensus_success":
                self.logger.error(f"Consensus failed with status: {consensus_data.get('status')}")
                if "error" in consensus_data:
                    self.logger.error(f"Error: {consensus_data['error']}")
                return False

            # Phase 3: Check server logs for conversation building
            self.logger.info("Phase 3: Checking server logs for conversation building")

            # Check for conversation-related log entries
            logs = self.get_docker_logs()
            if not logs:
                self.logger.warning("Could not retrieve Docker logs for verification")
            else:
                # Look for conversation building indicators
                conversation_logs = [
                    line
                    for line in logs
                    if any(
                        keyword in line
                        for keyword in [
                            "CONVERSATION HISTORY",
                            "continuation_id",
                            "build_conversation_history",
                            "ThreadContext",
                            f"thread:{continuation_id}",
                        ]
                    )
                ]

                if conversation_logs:
                    self.logger.info(f"Found {len(conversation_logs)} conversation-related log entries")
                    # Show a few examples (truncated)
                    for i, log in enumerate(conversation_logs[:3]):
                        self.logger.info(f" Conversation log {i+1}: {log[:100]}...")
                else:
                    self.logger.warning(
                        "No conversation-related logs found (may indicate conversation not properly built)"
                    )

                # Check for any ERROR entries related to consensus
                error_logs = [
                    line
                    for line in logs
                    if "ERROR" in line
                    and any(keyword in line for keyword in ["consensus", "conversation", continuation_id])
                ]

                if error_logs:
                    self.logger.error(f"Found {len(error_logs)} error logs related to consensus conversation:")
                    for error in error_logs:
                        self.logger.error(f" ERROR: {error}")
                    return False

            # Phase 4: Verify response structure
            self.logger.info("Phase 4: Verifying consensus response structure")

            # Check that consensus has proper models_used
            models_used = consensus_data.get("models_used", [])
            if not models_used:
                self.logger.error("Consensus response missing models_used")
                return False

            # Check that we have responses
            responses = consensus_data.get("responses", [])
            if not responses:
                self.logger.error("Consensus response missing responses")
                return False

            # Verify at least one successful response
            successful_responses = [r for r in responses if r.get("status") == "success"]
            if not successful_responses:
                self.logger.error("No successful responses in consensus")
                return False

            self.logger.info(f"Consensus used models: {models_used}")
            self.logger.info(f"Consensus had {len(successful_responses)} successful responses")

            # Phase 5: Cross-tool continuation test
            self.logger.info("Phase 5: Testing cross-tool continuation from consensus")

            # Try to continue the conversation with a different tool
            chat_response, _ = self.call_mcp_tool(
                "chat",
                {
                    "prompt": "Based on our consensus discussion about authentication, can you summarize the key points?",
                    "continuation_id": continuation_id,
                    "model": "local-llama",
                },
            )

            if not chat_response:
                self.logger.warning("Cross-tool continuation from consensus failed")
                # Don't fail the test for this - it's a bonus check
            else:
                self.logger.info("✓ Cross-tool continuation from consensus working")
                self.logger.info(f"Chat continuation preview: {chat_response[:200]}...")

            self.logger.info("✓ Consensus conversation continuation test completed successfully")
            return True

        except Exception as e:
            self.logger.error(f"Consensus conversation test failed with exception: {str(e)}")
            import traceback

            self.logger.error(f"Traceback: {traceback.format_exc()}")
            return False
        finally:
            self.cleanup_test_files()
simulator_tests/test_consensus_stance.py (new file, 156 lines)
@@ -0,0 +1,156 @@
"""
Test consensus tool with explicit stance arguments
"""

import json

from .base_test import BaseSimulatorTest


class TestConsensusStance(BaseSimulatorTest):
    """Test consensus tool functionality with stance steering"""

    @property
    def test_name(self) -> str:
        return "consensus_stance"

    @property
    def test_description(self) -> str:
        return "Test consensus tool with stance steering (for/against/neutral)"

    def run_test(self) -> bool:
        """Run consensus stance test"""
        try:
            self.logger.info("Testing consensus tool with ModelConfig objects and custom stance prompts")

            # Send request with full two-model consensus
            response, continuation_id = self.call_mcp_tool(
                "consensus",
                {
                    "prompt": "Add pizza button: good idea?",
                    "models": [
                        {
                            "model": "flash",
                            "stance": "for",
                            "stance_prompt": "Focus on user engagement benefits.",
                        },
                        {
                            "model": "flash",
                            "stance": "against",
                            "stance_prompt": "Focus on technical complexity issues.",
                        },
                    ],
                    "model": "flash",
                },
            )

            # Validate response
            if not response:
                self.logger.error("Failed to get response from consensus tool")
                return False

            self.logger.info(f"Consensus response preview: {response[:500]}...")

            # Parse the JSON response
            try:
                consensus_data = json.loads(response)
            except json.JSONDecodeError:
                self.logger.error(f"Failed to parse consensus response as JSON: {response}")
                return False

            # Validate consensus structure
            if "status" not in consensus_data:
                self.logger.error("Missing 'status' field in consensus response")
                return False

            if consensus_data["status"] != "consensus_success":
                self.logger.error(f"Consensus failed with status: {consensus_data['status']}")

                # Log additional error details for debugging
                if "error" in consensus_data:
                    self.logger.error(f"Error message: {consensus_data['error']}")
                if "models_errored" in consensus_data:
                    self.logger.error(f"Models that errored: {consensus_data['models_errored']}")
                if "models_skipped" in consensus_data:
                    self.logger.error(f"Models skipped: {consensus_data['models_skipped']}")
                if "next_steps" in consensus_data:
                    self.logger.error(f"Suggested next steps: {consensus_data['next_steps']}")

                return False

            # Check that both models were used with their stances
            if "models_used" not in consensus_data:
                self.logger.error("Missing 'models_used' field in consensus response")
                return False

            models_used = consensus_data["models_used"]
            if len(models_used) != 2:
                self.logger.error(f"Expected 2 models, got {len(models_used)}")
                return False

            if "flash:for" not in models_used:
                self.logger.error("Missing 'flash:for' in models_used")
                return False

            if "flash:against" not in models_used:
                self.logger.error("Missing 'flash:against' in models_used")
                return False

            # Validate responses structure
            if "responses" not in consensus_data:
                self.logger.error("Missing 'responses' field in consensus response")
                return False

            responses = consensus_data["responses"]
            if len(responses) != 2:
                self.logger.error(f"Expected 2 responses, got {len(responses)}")
                return False

            # Check each response has the correct stance
            for_response = None
            against_response = None

            for resp in responses:
                if "stance" not in resp:
                    self.logger.error("Missing 'stance' field in response")
                    return False

                if resp["stance"] == "for":
                    for_response = resp
                elif resp["stance"] == "against":
                    against_response = resp

            # Verify we got both stances
            if not for_response:
                self.logger.error("Missing 'for' stance response")
                return False

            if not against_response:
                self.logger.error("Missing 'against' stance response")
                return False

            # Check that successful responses have verdicts
            if for_response.get("status") == "success":
                if "verdict" not in for_response:
                    self.logger.error("Missing 'verdict' in for_response")
                    return False
                self.logger.info(f"FOR stance verdict preview: {for_response['verdict'][:200]}...")

            if against_response.get("status") == "success":
                if "verdict" not in against_response:
                    self.logger.error("Missing 'verdict' in against_response")
                    return False
                self.logger.info(f"AGAINST stance verdict preview: {against_response['verdict'][:200]}...")

            # Verify synthesis guidance is present
            if "next_steps" not in consensus_data:
                self.logger.error("Missing 'next_steps' field in consensus response")
                return False

            self.logger.info("✓ Consensus tool successfully processed two-model consensus with stance steering")

            return True

        except Exception as e:
            self.logger.error(f"Test failed with exception: {str(e)}")
            return False
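Putting the assertions above together, the stance test expects a response roughly shaped like the following. This is a hedged illustration: the field names are the ones the test checks, while the verdict and next_steps strings here are invented.

```python
import json

# Illustrative response shape reconstructed from the assertions in
# TestConsensusStance above; verdict/next_steps text is made up.
example_consensus_response = {
    "status": "consensus_success",
    "models_used": ["flash:for", "flash:against"],
    "responses": [
        {"model": "flash", "stance": "for", "status": "success",
         "verdict": "A pizza button could lift engagement if kept unobtrusive."},
        {"model": "flash", "stance": "against", "status": "success",
         "verdict": "The extra UI and backend complexity likely outweighs the novelty."},
    ],
    "next_steps": "Weigh both verdicts before deciding.",
}
print(json.dumps(example_consensus_response, indent=2))
```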
simulator_tests/test_consensus_three_models.py (new file, 153 lines)
@@ -0,0 +1,153 @@
"""
Test consensus tool with three models demonstrating sequential processing
"""

import json

from .base_test import BaseSimulatorTest


class TestConsensusThreeModels(BaseSimulatorTest):
    """Test consensus tool functionality with three models (testing sequential processing)"""

    @property
    def test_name(self) -> str:
        return "consensus_three_models"

    @property
    def test_description(self) -> str:
        return "Test consensus tool with three models using flash:against, flash:for, local-llama:neutral"

    def run_test(self) -> bool:
        """Run three-model consensus test"""
        try:
            self.logger.info("Testing consensus tool with three models: flash:against, flash:for, local-llama:neutral")

            # Send request with three ModelConfig objects
            response, continuation_id = self.call_mcp_tool(
                "consensus",
                {
                    "prompt": "Is a sync manager class a good idea for my CoolTodos app?",
                    "models": [
                        {
                            "model": "flash",
                            "stance": "against",
                            "stance_prompt": "You are a software architecture critic. Focus on the potential downsides of adding a sync manager class: complexity overhead, maintenance burden, potential for over-engineering, and whether simpler alternatives exist. Consider if this adds unnecessary abstraction layers.",
                        },
                        {
                            "model": "flash",
                            "stance": "for",
                            "stance_prompt": "You are a software architecture advocate. Focus on the benefits of a sync manager class: separation of concerns, testability, maintainability, and how it can improve the overall architecture. Consider scalability and code organization advantages.",
                        },
                        {
                            "model": "local-llama",
                            "stance": "neutral",
                            "stance_prompt": "You are a pragmatic software engineer. Provide a balanced analysis considering both the benefits and drawbacks. Focus on the specific context of a CoolTodos app and what factors would determine if this is the right choice.",
                        },
                    ],
                    "model": "flash",  # Default model for Claude's synthesis
                    "focus_areas": ["architecture", "maintainability", "complexity", "scalability"],
                },
            )

            # Validate response
            if not response:
                self.logger.error("Failed to get response from three-model consensus tool")
                return False

            self.logger.info(f"Three-model consensus response preview: {response[:500]}...")

            # Parse the JSON response
            try:
                consensus_data = json.loads(response)
            except json.JSONDecodeError:
                self.logger.error(f"Failed to parse three-model consensus response as JSON: {response}")
                return False

            # Validate consensus structure
            if "status" not in consensus_data:
                self.logger.error("Missing 'status' field in three-model consensus response")
                return False

            if consensus_data["status"] != "consensus_success":
                self.logger.error(f"Three-model consensus failed with status: {consensus_data['status']}")

                # Log additional error details for debugging
                if "error" in consensus_data:
                    self.logger.error(f"Error message: {consensus_data['error']}")
                if "models_errored" in consensus_data:
                    self.logger.error(f"Models that errored: {consensus_data['models_errored']}")
                if "models_skipped" in consensus_data:
                    self.logger.error(f"Models skipped: {consensus_data['models_skipped']}")
                if "next_steps" in consensus_data:
                    self.logger.error(f"Suggested next steps: {consensus_data['next_steps']}")

                return False

            # Check that models were used correctly
            if "models_used" not in consensus_data:
                self.logger.error("Missing 'models_used' field in three-model consensus response")
                return False

            models_used = consensus_data["models_used"]
            self.logger.info(f"Models used in three-model test: {models_used}")

            # Validate we got the expected models (allowing for some to fail)
            expected_models = ["flash:against", "flash:for", "local-llama"]
            successful_models = [m for m in expected_models if m in models_used]

            if len(successful_models) == 0:
                self.logger.error("No models succeeded in three-model consensus test")
                return False

            self.logger.info(f"Successful models in three-model test: {successful_models}")

            # Validate responses structure
            if "responses" not in consensus_data:
                self.logger.error("Missing 'responses' field in three-model consensus response")
                return False

            responses = consensus_data["responses"]
            if len(responses) == 0:
                self.logger.error("No responses received in three-model consensus test")
                return False

            self.logger.info(f"Received {len(responses)} responses in three-model test")

            # Count successful responses by stance
            stance_counts = {"for": 0, "against": 0, "neutral": 0}
            for resp in responses:
                if resp.get("status") == "success":
                    stance = resp.get("stance", "neutral")
                    stance_counts[stance] = stance_counts.get(stance, 0) + 1

            self.logger.info(f"Stance distribution: {stance_counts}")

            # Verify we have at least one successful response
            total_successful = sum(stance_counts.values())
            if total_successful == 0:
                self.logger.error("No successful responses in three-model consensus test")
                return False

            # Check for sequential processing indication (>2 models should use sequential)
            if len(consensus_data["models_used"]) > 2:
                self.logger.info("✓ Sequential processing was correctly used for >2 models")
            else:
                self.logger.info("✓ Concurrent processing was used (≤2 models)")

            # Verify synthesis guidance is present
            if "next_steps" not in consensus_data:
                self.logger.error("Missing 'next_steps' field in three-model consensus response")
                return False

            self.logger.info("✓ Three-model consensus tool test completed successfully")
            self.logger.info(f"✓ Total successful responses: {total_successful}")
            self.logger.info(
                f"✓ Stance diversity achieved: {len([s for s in stance_counts.values() if s > 0])} different stances"
            )

            return True

        except Exception as e:
            self.logger.error(f"Three-model consensus test failed with exception: {str(e)}")
            return False
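The test above leans on the consensus tool choosing sequential processing once more than two models are requested. A minimal sketch of that dispatch rule as the test's log messages imply it; the actual decision logic inside the consensus tool is not shown in this diff and may differ.

```python
from typing import Any

# Assumption drawn from the test's log messages: more than two requested
# models are processed sequentially, two or fewer may run concurrently.
def should_process_sequentially(model_configs: list[dict[str, Any]]) -> bool:
    return len(model_configs) > 2

# Example: the three-model request exercised by this test would be sequential.
print(should_process_sequentially([{"model": "flash"}] * 3))  # True
```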