Files
my-pal-mcp-server/simulator_tests/test_consensus_three_models.py
Beehive Innovations 95556ba9ea Add Consensus Tool for Multi-Model Perspective Gathering (#67)
* WIP
Refactor model name resolution; this should be done once at the MCP call boundary
Pass around model context instead
The consensus tool allows you to gather a consensus from multiple models, optionally assigning each a 'for' or 'against' stance to surface more nuanced responses.

* Deduplicate model resolution; model_context should be available before reaching deeper parts of the code
Improved abstraction when building conversations
Throw programmer errors early
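
As a rough sketch of that shape (names like `ModelContext`, `handle_tool_call`, and `build_conversation` are illustrative stand-ins, not the server's actual API): the model is resolved once at the MCP boundary, the resulting context is passed down, and deeper code treats a missing context as a programmer error.

```python
from dataclasses import dataclass


@dataclass
class ModelContext:
    """Illustrative container for what downstream code needs to know about the resolved model."""
    name: str
    max_tokens: int


def handle_tool_call(arguments: dict) -> str:
    # Resolve the requested model exactly once, at the MCP call boundary,
    # then hand the resulting context to everything downstream.
    model_context = ModelContext(name=arguments["model"], max_tokens=8192)
    return build_conversation(arguments.get("history", []), model_context)


def build_conversation(history: list, model_context: ModelContext) -> str:
    # Deeper layers never re-resolve model names; a missing context is a
    # programmer error and should fail loudly and early.
    if model_context is None:
        raise ValueError("model_context must be resolved before building conversations")
    return "\n".join(history)
```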

* Guardrails
Support the `model:option` format at the MCP boundary so future tools can use additional options when needed, instead of handling this only for consensus
Model names now support an optional ":option" suffix for future use
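
A minimal sketch of what that boundary parsing could look like (`parse_model_option` is a hypothetical helper name, not necessarily the real one):

```python
from typing import Optional


def parse_model_option(raw: str) -> tuple[str, Optional[str]]:
    """Split 'model:option' into (model, option); option is None when absent."""
    name, sep, option = raw.partition(":")
    return name.strip(), (option.strip() or None) if sep else None


# e.g. "flash:against" -> ("flash", "against"); "local-llama" -> ("local-llama", None)
```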

* Simplified async flow

* Improved the request model to support natural language
Simplified async flow

* Fix consensus tool async/sync patterns to match codebase standards

CRITICAL FIXES:
- Converted _get_consensus_responses from async to sync (matches other tools)
- Converted store_conversation_turn from async to sync (add_turn is synchronous)
- Removed unnecessary asyncio imports and sleep calls
- Fixed ClosedResourceError in MCP protocol during long consensus operations

PATTERN ALIGNMENT:
- Consensus tool now follows same sync patterns as all other tools
- Only execute() and prepare_prompt() are async (base class requirement)
- All internal operations are synchronous like analyze, chat, debug, etc.
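
In outline, the intended shape is roughly the sketch below; apart from execute() and prepare_prompt(), every name here is a placeholder rather than the actual tool code:

```python
class ConsensusLikeTool:
    async def execute(self, arguments: dict) -> str:
        # Only the base-class entry points are async...
        prompt = await self.prepare_prompt(arguments)
        # ...everything internal is plain synchronous code, like the other tools.
        responses = self._get_consensus_responses(prompt, arguments["models"])
        self._store_conversation_turn(responses)
        return "\n\n".join(responses)

    async def prepare_prompt(self, arguments: dict) -> str:
        return arguments["prompt"]

    def _get_consensus_responses(self, prompt: str, models: list) -> list:
        # Sequential, blocking provider calls -- no asyncio imports, no sleeps.
        return [f"{m}: response to {prompt!r}" for m in models]

    def _store_conversation_turn(self, responses: list) -> None:
        # add_turn() in the codebase is synchronous, so this wrapper is too.
        self._last_turn = responses
```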

TESTING:
- MCP simulation test now passes: consensus_stance 
- Two-model consensus works correctly in ~35 seconds
- Unknown stance handling defaults to neutral with warnings
- All 9 unit tests pass (100% success rate)
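
The unknown-stance fallback noted above could look roughly like this (illustrative only; `normalize_stance` and `VALID_STANCES` are assumed names):

```python
import logging

logger = logging.getLogger(__name__)

VALID_STANCES = {"for", "against", "neutral"}


def normalize_stance(stance: str) -> str:
    """Fall back to 'neutral' for unknown stances, logging a warning."""
    if stance not in VALID_STANCES:
        logger.warning("Unknown stance %r, defaulting to neutral", stance)
        return "neutral"
    return stance
```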

The consensus tool's async patterns were anomalous in the codebase.
This fix aligns it with the established synchronous patterns used
by all other tools while maintaining full functionality.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

* Fixed call order and added new test

* Clean up dead comments
Docs for the new tool
Improved tests

---------

Co-authored-by: Claude <noreply@anthropic.com>
2025-06-17 10:53:17 +04:00

154 lines
7.2 KiB
Python

"""
Test consensus tool with three models demonstrating sequential processing
"""
import json
from .base_test import BaseSimulatorTest
class TestConsensusThreeModels(BaseSimulatorTest):
"""Test consensus tool functionality with three models (testing sequential processing)"""
@property
def test_name(self) -> str:
return "consensus_three_models"
@property
def test_description(self) -> str:
return "Test consensus tool with three models using flash:against, flash:for, local-llama:neutral"
def run_test(self) -> bool:
"""Run three-model consensus test"""
try:
self.logger.info("Testing consensus tool with three models: flash:against, flash:for, local-llama:neutral")
# Send request with three ModelConfig objects
response, continuation_id = self.call_mcp_tool(
"consensus",
{
"prompt": "Is a sync manager class a good idea for my CoolTodos app?",
"models": [
{
"model": "flash",
"stance": "against",
"stance_prompt": "You are a software architecture critic. Focus on the potential downsides of adding a sync manager class: complexity overhead, maintenance burden, potential for over-engineering, and whether simpler alternatives exist. Consider if this adds unnecessary abstraction layers.",
},
{
"model": "flash",
"stance": "for",
"stance_prompt": "You are a software architecture advocate. Focus on the benefits of a sync manager class: separation of concerns, testability, maintainability, and how it can improve the overall architecture. Consider scalability and code organization advantages.",
},
{
"model": "local-llama",
"stance": "neutral",
"stance_prompt": "You are a pragmatic software engineer. Provide a balanced analysis considering both the benefits and drawbacks. Focus on the specific context of a CoolTodos app and what factors would determine if this is the right choice.",
},
],
"model": "flash", # Default model for Claude's synthesis
"focus_areas": ["architecture", "maintainability", "complexity", "scalability"],
},
)
# Validate response
if not response:
self.logger.error("Failed to get response from three-model consensus tool")
return False
self.logger.info(f"Three-model consensus response preview: {response[:500]}...")
# Parse the JSON response
try:
consensus_data = json.loads(response)
except json.JSONDecodeError:
self.logger.error(f"Failed to parse three-model consensus response as JSON: {response}")
return False
# Validate consensus structure
if "status" not in consensus_data:
self.logger.error("Missing 'status' field in three-model consensus response")
return False
if consensus_data["status"] != "consensus_success":
self.logger.error(f"Three-model consensus failed with status: {consensus_data['status']}")
# Log additional error details for debugging
if "error" in consensus_data:
self.logger.error(f"Error message: {consensus_data['error']}")
if "models_errored" in consensus_data:
self.logger.error(f"Models that errored: {consensus_data['models_errored']}")
if "models_skipped" in consensus_data:
self.logger.error(f"Models skipped: {consensus_data['models_skipped']}")
if "next_steps" in consensus_data:
self.logger.error(f"Suggested next steps: {consensus_data['next_steps']}")
return False
# Check that models were used correctly
if "models_used" not in consensus_data:
self.logger.error("Missing 'models_used' field in three-model consensus response")
return False
models_used = consensus_data["models_used"]
self.logger.info(f"Models used in three-model test: {models_used}")
# Validate we got the expected models (allowing for some to fail)
expected_models = ["flash:against", "flash:for", "local-llama"]
successful_models = [m for m in expected_models if m in models_used]
if len(successful_models) == 0:
self.logger.error("No models succeeded in three-model consensus test")
return False
self.logger.info(f"Successful models in three-model test: {successful_models}")
# Validate responses structure
if "responses" not in consensus_data:
self.logger.error("Missing 'responses' field in three-model consensus response")
return False
responses = consensus_data["responses"]
if len(responses) == 0:
self.logger.error("No responses received in three-model consensus test")
return False
self.logger.info(f"Received {len(responses)} responses in three-model test")
# Count successful responses by stance
stance_counts = {"for": 0, "against": 0, "neutral": 0}
for resp in responses:
if resp.get("status") == "success":
stance = resp.get("stance", "neutral")
stance_counts[stance] = stance_counts.get(stance, 0) + 1
self.logger.info(f"Stance distribution: {stance_counts}")
# Verify we have at least one successful response
total_successful = sum(stance_counts.values())
if total_successful == 0:
self.logger.error("No successful responses in three-model consensus test")
return False
# Check for sequential processing indication (>2 models should use sequential)
if len(consensus_data["models_used"]) > 2:
self.logger.info("✓ Sequential processing was correctly used for >2 models")
else:
self.logger.info("✓ Concurrent processing was used (≤2 models)")
# Verify synthesis guidance is present
if "next_steps" not in consensus_data:
self.logger.error("Missing 'next_steps' field in three-model consensus response")
return False
self.logger.info("✓ Three-model consensus tool test completed successfully")
self.logger.info(f"✓ Total successful responses: {total_successful}")
self.logger.info(
f"✓ Stance diversity achieved: {len([s for s in stance_counts.values() if s > 0])} different stances"
)
return True
except Exception as e:
self.logger.error(f"Three-model consensus test failed with exception: {str(e)}")
return False