Add Consensus Tool for Multi-Model Perspective Gathering (#67)
* WIP: refactor model-name resolution so it happens once at the MCP call boundary; pass a model context around instead. The consensus tool gathers a consensus from multiple models, optionally assigning each a "for" or "against" stance to surface nuanced responses.
* Deduplicate model resolution so model_context is available before the deeper parts of the code are reached. Improve the abstraction for building conversations. Throw programmer errors early.
* Guardrails: support the `model:option` format at the MCP boundary so future tools can use additional options if needed, instead of handling this only for consensus. A model name now accepts an optional ":option" suffix for future use.
* Simplify the async flow.
* Improve the request model to support natural language.
* Fix consensus tool async/sync patterns to match codebase standards.

  Critical fixes:
  - Converted _get_consensus_responses from async to sync (matches other tools)
  - Converted store_conversation_turn from async to sync (add_turn is synchronous)
  - Removed unnecessary asyncio imports and sleep calls
  - Fixed ClosedResourceError in the MCP protocol during long consensus operations

  Pattern alignment:
  - The consensus tool now follows the same sync patterns as all other tools
  - Only execute() and prepare_prompt() are async (a base-class requirement)
  - All internal operations are synchronous, as in analyze, chat, debug, etc.

  Testing:
  - The MCP simulation test now passes: consensus_stance ✅
  - Two-model consensus works correctly in ~35 seconds
  - Unknown stances default to neutral, with warnings
  - All 9 unit tests pass (100% success rate)

  The consensus tool's async patterns were anomalous in the codebase. This fix aligns them with the established synchronous patterns used by all other tools while maintaining full functionality.
* Fix call order and add a new test.
* Clean up dead comments, document the new tool, and improve the tests.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-authored-by: Claude <noreply@anthropic.com>
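A minimal sketch of the `model:option` parsing and stance normalization described above. The helper names and the synonym table are assumptions for exposition (the table mirrors the expectations exercised in the new test file), not the commit's actual implementation:

```python
# Illustrative sketch only: function names and the synonym table are assumptions,
# not the actual implementation from this commit.
def parse_model_spec(spec: str) -> tuple[str, str | None]:
    """Split a 'model:option' string into (model, option); option is None when absent."""
    model, _, option = spec.partition(":")
    return model, option or None

# Synonym table mirrors the expectations in test_enhanced_consensus.py below.
STANCE_SYNONYMS = {"support": "for", "favor": "for", "oppose": "against", "critical": "against"}

def normalize_stance(stance: str) -> str:
    """Map stance synonyms to canonical values; unknown stances default to 'neutral'."""
    stance = stance.strip().lower()
    if stance in ("for", "against", "neutral"):
        return stance
    return STANCE_SYNONYMS.get(stance, "neutral")

assert parse_model_spec("flash:against") == ("flash", "against")
assert parse_model_spec("flash") == ("flash", None)
assert normalize_stance("maybe") == "neutral"
```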
parent 9b98df650b
commit 95556ba9ea

test_enhanced_consensus.py (new file, 138 lines)
@@ -0,0 +1,138 @@
#!/usr/bin/env python3
"""
Test script for the enhanced consensus tool with ModelConfig objects
"""

import asyncio
import json
import sys

from tools.consensus import ConsensusTool


async def test_enhanced_consensus():
    """Test the enhanced consensus tool with custom stance prompts"""

    print("🧪 Testing Enhanced Consensus Tool")
    print("=" * 50)

    # Test all stance synonyms work
    print("📝 Testing stance synonym normalization...")
    tool = ConsensusTool()

    test_synonyms = [
        ("support", "for"),
        ("favor", "for"),
        ("oppose", "against"),
        ("critical", "against"),
        ("neutral", "neutral"),
        ("for", "for"),
        ("against", "against"),
        # Test unknown stances default to neutral
        ("maybe", "neutral"),
        ("supportive", "neutral"),
        ("random", "neutral"),
    ]

    for input_stance, expected in test_synonyms:
        normalized = tool._normalize_stance(input_stance)
        status = "✅" if normalized == expected else "❌"
        print(f"{status} '{input_stance}' → '{normalized}' (expected: '{expected}')")

    print()

    # Create consensus tool instance
    tool = ConsensusTool()

    # Test arguments with new ModelConfig format
    test_arguments = {
        "prompt": "Should we add a pizza ordering button to our enterprise software?",
        "models": [
            {
                "model": "flash",
                "stance": "support",  # Test synonym
                "stance_prompt": "You are a user experience advocate. Focus on how this feature could improve user engagement and satisfaction. Consider the human elements - how might this bring joy to users' workday? Think about unexpected benefits and creative use cases.",
            },
            {
                "model": "flash",
                "stance": "oppose",  # Test synonym
                "stance_prompt": "You are a software architecture specialist. Focus on technical concerns: code maintainability, security implications, scope creep, and system complexity. Consider long-term costs and potential maintenance burden.",
            },
        ],
        "focus_areas": ["user experience", "technical complexity", "business value"],
        "temperature": 0.3,
    }

    try:
        print("📝 Test Arguments:")
        print(json.dumps(test_arguments, indent=2))
        print()

        print("🚀 Executing consensus tool...")

        # Execute the tool
        result = await tool.execute(test_arguments)

        print("✅ Consensus tool execution completed!")
        print()

        # Parse and display results
        if result and len(result) > 0:
            response_text = result[0].text
            try:
                response_data = json.loads(response_text)
                print("📊 Consensus Results:")
                print(f"Status: {response_data.get('status', 'unknown')}")

                if response_data.get("status") == "consensus_success":
                    models_used = response_data.get("models_used", [])
                    print(f"Models used: {', '.join(models_used)}")

                    responses = response_data.get("responses", [])
                    print(f"\n🎭 Individual Model Responses ({len(responses)} total):")

                    for i, resp in enumerate(responses, 1):
                        model = resp.get("model", "unknown")
                        stance = resp.get("stance", "neutral")
                        status = resp.get("status", "unknown")

                        print(f"\n{i}. {model.upper()} ({stance} stance) - {status}")

                        if status == "success":
                            verdict = resp.get("verdict", "No verdict")
                            custom_prompt = resp.get("metadata", {}).get("custom_stance_prompt", False)
                            print(f"   Custom prompt used: {'Yes' if custom_prompt else 'No'}")
                            print(f"   Verdict preview: {verdict[:200]}...")

                            # Show stance normalization worked
                            if stance in ["support", "oppose"]:
                                expected = "for" if stance == "support" else "against"
                                print(f"   ✅ Stance '{stance}' normalized to '{expected}'")
                        else:
                            error = resp.get("error", "Unknown error")
                            print(f"   Error: {error}")

                else:
                    print(f"❌ Consensus failed: {response_data.get('error', 'Unknown error')}")

            except json.JSONDecodeError:
                print("📄 Raw response (not JSON):")
                print(response_text[:500] + "..." if len(response_text) > 500 else response_text)
        else:
            print("❌ No response received from consensus tool")

    except Exception as e:
        print(f"❌ Test failed with exception: {str(e)}")
        import traceback

        traceback.print_exc()
        return False

    print("\n🎉 Enhanced consensus tool test completed!")
    return True


if __name__ == "__main__":
    # Run the test
    success = asyncio.run(test_enhanced_consensus())
    sys.exit(0 if success else 1)
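To run the test directly (assuming the repository root is on `PYTHONPATH` so that `tools.consensus` resolves): `python test_enhanced_consensus.py`. The script exits with status 0 on success and 1 on failure.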