Add Consensus Tool for Multi-Model Perspective Gathering (#67)

* WIP: Refactor resolving model names; this should be done once at the MCP call boundary. Pass around model context instead. The consensus tool allows one to get a consensus from multiple models, optionally assigning one a 'for' or 'against' stance to find nuanced responses.
* Deduplication of model resolution; model_context should be available before reaching deeper parts of the code. Improved abstraction when building conversations. Throw programmer errors early.
* Guardrails. Support for `model:option` format at the MCP boundary so future tools can use additional options if needed, instead of handling this only for consensus. Model name now supports an optional ":option" for future use.
* Simplified async flow.
* Improved model for request to support natural language. Simplified async flow.
* Improved model for request to support natural language. Simplified async flow.
* Fix consensus tool async/sync patterns to match codebase standards.
  CRITICAL FIXES:
  - Converted _get_consensus_responses from async to sync (matches other tools)
  - Converted store_conversation_turn from async to sync (add_turn is synchronous)
  - Removed unnecessary asyncio imports and sleep calls
  - Fixed ClosedResourceError in MCP protocol during long consensus operations
  PATTERN ALIGNMENT:
  - Consensus tool now follows the same sync patterns as all other tools
  - Only execute() and prepare_prompt() are async (base class requirement)
  - All internal operations are synchronous like analyze, chat, debug, etc.
  TESTING:
  - MCP simulation test now passes: consensus_stance ✅
  - Two-model consensus works correctly in ~35 seconds
  - Unknown stance handling defaults to neutral with warnings
  - All 9 unit tests pass (100% success rate)
  The consensus tool async patterns were anomalous in the codebase. This fix aligns it with the established synchronous patterns used by all other tools while maintaining full functionality.
  🤖 Generated with [Claude Code](https://claude.ai/code)
  Co-Authored-By: Claude <noreply@anthropic.com>
* Fixed call order and added new test.
* Cleanup dead comments. Docs for the new tool. Improved tests.
---------
Co-authored-by: Claude <noreply@anthropic.com>
2025-06-17 10:53:17 +04:00
parent 9b98df650b
commit 95556ba9ea
31 changed files with 2643 additions and 324 deletions
--- a/simulator_tests/test_consensus_conversation.py
+++ b/simulator_tests/test_consensus_conversation.py
@@ -0,0 +1,222 @@
+#!/usr/bin/env python3
+"""
+Consensus Conversation Continuation Test
+
+Tests that the consensus tool properly handles conversation continuation
+and builds conversation context correctly when using continuation_id.
+"""
+
+import json
+import subprocess
+
+from .base_test import BaseSimulatorTest
+
+
+class TestConsensusConversation(BaseSimulatorTest):
+    """Simulator test for continuing a conversation through the consensus tool.
+
+    The scenario opens a thread with the ``chat`` tool, hands the returned
+    ``continuation_id`` to ``consensus``, validates the JSON payload that
+    comes back, scans recent container logs for related errors, and finally
+    tries to continue the same thread with ``chat`` again.
+    """
+
+    @property
+    def test_name(self) -> str:
+        """Identifier of this simulator test."""
+        return "consensus_conversation"
+
+    @property
+    def test_description(self) -> str:
+        """One-line, human-readable summary of what the test covers."""
+        return "Test consensus tool conversation building and continuation"
+
+    def get_docker_logs(self):
+        """Return the last 100 log lines of the container as a list of strings.
+
+        Runs ``docker logs --tail 100`` against ``self.container_name`` with a
+        30 second timeout. This is best effort: a non-zero exit status or any
+        exception is logged as a warning and an empty list is returned.
+
+        NOTE(review): only the captured stdout is returned; whatever the
+        command writes to stderr is ignored on success. Confirm this matches
+        the stream the server actually logs to.
+        """
+        try:
+            result = subprocess.run(
+                ["docker", "logs", "--tail", "100", self.container_name], capture_output=True, text=True, timeout=30
+            )
+            if result.returncode != 0:
+                self.logger.warning(f"Failed to get Docker logs: {result.stderr}")
+                return []
+            # splitlines() yields [] for empty output (split("\n") would give
+            # [""], which is truthy and defeats the caller's "no logs" check)
+            # and does not leave a trailing empty element.
+            return result.stdout.splitlines()
+        except Exception as e:
+            self.logger.warning(f"Exception getting Docker logs: {e}")
+            return []
+
+    def run_test(self) -> bool:
+        """Run the consensus continuation scenario.
+
+        Phases:
+            1. Start a conversation with ``chat`` to obtain a continuation_id.
+            2. Call ``consensus`` with that continuation_id and two stances.
+            3. Inspect container logs for conversation markers and errors.
+            4. Check the structure of the consensus JSON response.
+            5. Continue the thread with ``chat`` (informational only).
+
+        Returns:
+            True when phases 1-4 succeed; False on any validation failure or
+            unexpected exception. Test files are cleaned up in every case.
+        """
+        try:
+            self.logger.info("Testing consensus tool conversation continuation")
+
+            # Setup test files for context
+            self.setup_test_files()
+
+            # Phase 1: Start conversation with chat tool (which properly creates continuation_id)
+            self.logger.info("Phase 1: Starting conversation with chat tool")
+            initial_response, continuation_id = self.call_mcp_tool(
+                "chat",
+                {
+                    "prompt": "Please use low thinking mode. I'm working on a web application and need advice on authentication. Can you look at this code?",
+                    "files": [self.test_files["python"]],
+                    "model": "local-llama",
+                },
+            )
+
+            # Validate initial response
+            if not initial_response:
+                self.logger.error("Failed to get initial chat response")
+                return False
+
+            if not continuation_id:
+                self.logger.error("Failed to get continuation_id from initial chat")
+                return False
+
+            self.logger.info(f"Initial chat response preview: {initial_response[:200]}...")
+            self.logger.info(f"Got continuation_id: {continuation_id}")
+
+            # Phase 2: Use consensus with continuation_id to test conversation building
+            self.logger.info("Phase 2: Using consensus with continuation_id to test conversation building")
+            consensus_response, _ = self.call_mcp_tool(
+                "consensus",
+                {
+                    "prompt": "Based on our previous discussion about authentication, I need expert consensus: Should we implement OAuth2 or stick with simple session-based auth?",
+                    "models": [
+                        {
+                            "model": "local-llama",
+                            "stance": "for",
+                            "stance_prompt": "Focus on OAuth2 benefits: security, scalability, and industry standards.",
+                        },
+                        {
+                            "model": "local-llama",
+                            "stance": "against",
+                            "stance_prompt": "Focus on OAuth2 complexity: implementation challenges and simpler alternatives.",
+                        },
+                    ],
+                    "continuation_id": continuation_id,
+                    "model": "local-llama",
+                },
+            )
+
+            # Validate consensus response
+            if not consensus_response:
+                self.logger.error("Failed to get consensus response with continuation_id")
+                return False
+
+            self.logger.info(f"Consensus response preview: {consensus_response[:300]}...")
+
+            # Log the full response for debugging if it's not JSON. Leading
+            # whitespace is tolerated here because json.loads accepts it too.
+            if not consensus_response.lstrip().startswith("{"):
+                self.logger.error(f"Consensus response is not JSON. Full response: {consensus_response}")
+                return False
+
+            # Parse consensus response
+            try:
+                consensus_data = json.loads(consensus_response)
+            except json.JSONDecodeError:
+                self.logger.error(f"Failed to parse consensus response as JSON. Full response: {consensus_response}")
+                return False
+
+            if consensus_data.get("status") != "consensus_success":
+                self.logger.error(f"Consensus failed with status: {consensus_data.get('status')}")
+                if "error" in consensus_data:
+                    self.logger.error(f"Error: {consensus_data['error']}")
+                return False
+
+            # Phase 3: Check server logs for conversation building
+            self.logger.info("Phase 3: Checking server logs for conversation building")
+
+            # Check for conversation-related log entries
+            logs = self.get_docker_logs()
+            if not logs:
+                self.logger.warning("Could not retrieve Docker logs for verification")
+            else:
+                # Look for conversation building indicators
+                conversation_keywords = (
+                    "CONVERSATION HISTORY",
+                    "continuation_id",
+                    "build_conversation_history",
+                    "ThreadContext",
+                    f"thread:{continuation_id}",
+                )
+                conversation_logs = [
+                    line for line in logs if any(keyword in line for keyword in conversation_keywords)
+                ]
+
+                if conversation_logs:
+                    self.logger.info(f"Found {len(conversation_logs)} conversation-related log entries")
+                    # Show a few examples (truncated)
+                    for idx, log in enumerate(conversation_logs[:3], start=1):
+                        self.logger.info(f"  Conversation log {idx}: {log[:100]}...")
+                else:
+                    self.logger.warning(
+                        "No conversation-related logs found (may indicate conversation not properly built)"
+                    )
+
+                # Check for any ERROR entries related to consensus
+                error_keywords = ("consensus", "conversation", continuation_id)
+                error_logs = [
+                    line
+                    for line in logs
+                    if "ERROR" in line and any(keyword in line for keyword in error_keywords)
+                ]
+
+                if error_logs:
+                    self.logger.error(f"Found {len(error_logs)} error logs related to consensus conversation:")
+                    for error in error_logs:
+                        self.logger.error(f"  ERROR: {error}")
+                    return False
+
+            # Phase 4: Verify response structure
+            self.logger.info("Phase 4: Verifying consensus response structure")
+
+            # Check that consensus has proper models_used
+            models_used = consensus_data.get("models_used", [])
+            if not models_used:
+                self.logger.error("Consensus response missing models_used")
+                return False
+
+            # Check that we have responses
+            responses = consensus_data.get("responses", [])
+            if not responses:
+                self.logger.error("Consensus response missing responses")
+                return False
+
+            # Verify at least one successful response
+            successful_responses = [r for r in responses if r.get("status") == "success"]
+            if not successful_responses:
+                self.logger.error("No successful responses in consensus")
+                return False
+
+            self.logger.info(f"Consensus used models: {models_used}")
+            self.logger.info(f"Consensus had {len(successful_responses)} successful responses")
+
+            # Phase 5: Cross-tool continuation test
+            self.logger.info("Phase 5: Testing cross-tool continuation from consensus")
+
+            # Try to continue the conversation with a different tool
+            chat_response, _ = self.call_mcp_tool(
+                "chat",
+                {
+                    "prompt": "Based on our consensus discussion about authentication, can you summarize the key points?",
+                    "continuation_id": continuation_id,
+                    "model": "local-llama",
+                },
+            )
+
+            if not chat_response:
+                self.logger.warning("Cross-tool continuation from consensus failed")
+                # Don't fail the test for this - it's a bonus check
+            else:
+                self.logger.info("✓ Cross-tool continuation from consensus working")
+                self.logger.info(f"Chat continuation preview: {chat_response[:200]}...")
+
+            self.logger.info("✓ Consensus conversation continuation test completed successfully")
+            return True
+
+        except Exception as e:
+            # Top-level boundary of the test: report the failure with its
+            # traceback instead of letting it propagate to the runner.
+            self.logger.error(f"Consensus conversation test failed with exception: {str(e)}")
+            import traceback
+
+            self.logger.error(f"Traceback: {traceback.format_exc()}")
+            return False
+        finally:
+            self.cleanup_test_files()