Files
my-pal-mcp-server/simulator_tests/test_consensus_three_models.py
Beehive Innovations 95556ba9ea Add Consensus Tool for Multi-Model Perspective Gathering (#67)
* WIP
Refactor model name resolution; this should be done once at the MCP call boundary
Pass around model context instead
The consensus tool allows you to gather a consensus from multiple models, optionally assigning each a 'for' or 'against' stance to surface more nuanced responses.

* Deduplicate model resolution; model_context should be available before reaching deeper parts of the code
Improved abstraction when building conversations
Throw programmer errors early
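
As a rough sketch of that shape (names like `ModelContext`, `handle_tool_call`, and `build_conversation` are illustrative stand-ins, not the server's actual API): the model is resolved once at the MCP boundary, the resulting context is passed down, and deeper code treats a missing context as a programmer error.

```python
from dataclasses import dataclass


@dataclass
class ModelContext:
    """Illustrative container for what downstream code needs to know about the resolved model."""
    name: str
    max_tokens: int


def handle_tool_call(arguments: dict) -> str:
    # Resolve the requested model exactly once, at the MCP call boundary,
    # then hand the resulting context to everything downstream.
    model_context = ModelContext(name=arguments["model"], max_tokens=8192)
    return build_conversation(arguments.get("history", []), model_context)


def build_conversation(history: list, model_context: ModelContext) -> str:
    # Deeper layers never re-resolve model names; a missing context is a
    # programmer error and should fail loudly and early.
    if model_context is None:
        raise ValueError("model_context must be resolved before building conversations")
    return "\n".join(history)
```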

* Guardrails
Support the `model:option` format at the MCP boundary so future tools can use additional options when needed, instead of handling this only for consensus
Model names now support an optional ":option" suffix for future use
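
A minimal sketch of what that boundary parsing could look like (`parse_model_option` is a hypothetical helper name, not necessarily the real one):

```python
from typing import Optional


def parse_model_option(raw: str) -> tuple[str, Optional[str]]:
    """Split 'model:option' into (model, option); option is None when absent."""
    name, sep, option = raw.partition(":")
    return name.strip(), (option.strip() or None) if sep else None


# e.g. "flash:against" -> ("flash", "against"); "local-llama" -> ("local-llama", None)
```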

* Simplified async flow

* Improved the request model to support natural language
Simplified async flow

* Fix consensus tool async/sync patterns to match codebase standards

CRITICAL FIXES:
- Converted _get_consensus_responses from async to sync (matches other tools)
- Converted store_conversation_turn from async to sync (add_turn is synchronous)
- Removed unnecessary asyncio imports and sleep calls
- Fixed ClosedResourceError in MCP protocol during long consensus operations

PATTERN ALIGNMENT:
- Consensus tool now follows same sync patterns as all other tools
- Only execute() and prepare_prompt() are async (base class requirement)
- All internal operations are synchronous like analyze, chat, debug, etc.
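
In outline, the intended shape is roughly the sketch below; apart from execute() and prepare_prompt(), every name here is a placeholder rather than the actual tool code:

```python
class ConsensusLikeTool:
    async def execute(self, arguments: dict) -> str:
        # Only the base-class entry points are async...
        prompt = await self.prepare_prompt(arguments)
        # ...everything internal is plain synchronous code, like the other tools.
        responses = self._get_consensus_responses(prompt, arguments["models"])
        self._store_conversation_turn(responses)
        return "\n\n".join(responses)

    async def prepare_prompt(self, arguments: dict) -> str:
        return arguments["prompt"]

    def _get_consensus_responses(self, prompt: str, models: list) -> list:
        # Sequential, blocking provider calls -- no asyncio imports, no sleeps.
        return [f"{m}: response to {prompt!r}" for m in models]

    def _store_conversation_turn(self, responses: list) -> None:
        # add_turn() in the codebase is synchronous, so this wrapper is too.
        self._last_turn = responses
```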

TESTING:
- MCP simulation test now passes: consensus_stance 
- Two-model consensus works correctly in ~35 seconds
- Unknown stance handling defaults to neutral with warnings
- All 9 unit tests pass (100% success rate)
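
The unknown-stance fallback noted above could look roughly like this (illustrative only; `normalize_stance` and `VALID_STANCES` are assumed names):

```python
import logging

logger = logging.getLogger(__name__)

VALID_STANCES = {"for", "against", "neutral"}


def normalize_stance(stance: str) -> str:
    """Fall back to 'neutral' for unknown stances, logging a warning."""
    if stance not in VALID_STANCES:
        logger.warning("Unknown stance %r, defaulting to neutral", stance)
        return "neutral"
    return stance
```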

The consensus tool's async patterns were anomalous in the codebase.
This fix aligns it with the established synchronous patterns used
by all other tools while maintaining full functionality.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

* Fixed call order and added new test

* Clean up dead comments
Docs for the new tool
Improved tests

---------

Co-authored-by: Claude <noreply@anthropic.com>
2025-06-17 10:53:17 +04:00

154 lines
7.2 KiB
Python

"""
Test consensus tool with three models demonstrating sequential processing
"""
import json
from .base_test import BaseSimulatorTest
class TestConsensusThreeModels(BaseSimulatorTest):
"""Test consensus tool functionality with three models (testing sequential processing)"""
@property
def test_name(self) -> str:
return "consensus_three_models"
@property
def test_description(self) -> str:
return "Test consensus tool with three models using flash:against, flash:for, local-llama:neutral"
def run_test(self) -> bool:
"""Run three-model consensus test"""
try:
self.logger.info("Testing consensus tool with three models: flash:against, flash:for, local-llama:neutral")
# Send request with three ModelConfig objects
response, continuation_id = self.call_mcp_tool(
"consensus",
{
"prompt": "Is a sync manager class a good idea for my CoolTodos app?",
"models": [
{
"model": "flash",
"stance": "against",
"stance_prompt": "You are a software architecture critic. Focus on the potential downsides of adding a sync manager class: complexity overhead, maintenance burden, potential for over-engineering, and whether simpler alternatives exist. Consider if this adds unnecessary abstraction layers.",
},
{
"model": "flash",
"stance": "for",
"stance_prompt": "You are a software architecture advocate. Focus on the benefits of a sync manager class: separation of concerns, testability, maintainability, and how it can improve the overall architecture. Consider scalability and code organization advantages.",
},
{
"model": "local-llama",
"stance": "neutral",
"stance_prompt": "You are a pragmatic software engineer. Provide a balanced analysis considering both the benefits and drawbacks. Focus on the specific context of a CoolTodos app and what factors would determine if this is the right choice.",
},
],
"model": "flash", # Default model for Claude's synthesis
"focus_areas": ["architecture", "maintainability", "complexity", "scalability"],
},
)
# Validate response
if not response:
self.logger.error("Failed to get response from three-model consensus tool")
return False
self.logger.info(f"Three-model consensus response preview: {response[:500]}...")
# Parse the JSON response
try:
consensus_data = json.loads(response)
except json.JSONDecodeError:
self.logger.error(f"Failed to parse three-model consensus response as JSON: {response}")
return False
# Validate consensus structure
if "status" not in consensus_data:
self.logger.error("Missing 'status' field in three-model consensus response")
return False
if consensus_data["status"] != "consensus_success":
self.logger.error(f"Three-model consensus failed with status: {consensus_data['status']}")
# Log additional error details for debugging
if "error" in consensus_data:
self.logger.error(f"Error message: {consensus_data['error']}")
if "models_errored" in consensus_data:
self.logger.error(f"Models that errored: {consensus_data['models_errored']}")
if "models_skipped" in consensus_data:
self.logger.error(f"Models skipped: {consensus_data['models_skipped']}")
if "next_steps" in consensus_data:
self.logger.error(f"Suggested next steps: {consensus_data['next_steps']}")
return False
# Check that models were used correctly
if "models_used" not in consensus_data:
self.logger.error("Missing 'models_used' field in three-model consensus response")
return False
models_used = consensus_data["models_used"]
self.logger.info(f"Models used in three-model test: {models_used}")
# Validate we got the expected models (allowing for some to fail)
expected_models = ["flash:against", "flash:for", "local-llama"]
successful_models = [m for m in expected_models if m in models_used]
if len(successful_models) == 0:
self.logger.error("No models succeeded in three-model consensus test")
return False
self.logger.info(f"Successful models in three-model test: {successful_models}")
# Validate responses structure
if "responses" not in consensus_data:
self.logger.error("Missing 'responses' field in three-model consensus response")
return False
responses = consensus_data["responses"]
if len(responses) == 0:
self.logger.error("No responses received in three-model consensus test")
return False
self.logger.info(f"Received {len(responses)} responses in three-model test")
# Count successful responses by stance
stance_counts = {"for": 0, "against": 0, "neutral": 0}
for resp in responses:
if resp.get("status") == "success":
stance = resp.get("stance", "neutral")
stance_counts[stance] = stance_counts.get(stance, 0) + 1
self.logger.info(f"Stance distribution: {stance_counts}")
# Verify we have at least one successful response
total_successful = sum(stance_counts.values())
if total_successful == 0:
self.logger.error("No successful responses in three-model consensus test")
return False
# Check for sequential processing indication (>2 models should use sequential)
if len(consensus_data["models_used"]) > 2:
self.logger.info("✓ Sequential processing was correctly used for >2 models")
else:
self.logger.info("✓ Concurrent processing was used (≤2 models)")
# Verify synthesis guidance is present
if "next_steps" not in consensus_data:
self.logger.error("Missing 'next_steps' field in three-model consensus response")
return False
self.logger.info("✓ Three-model consensus tool test completed successfully")
self.logger.info(f"✓ Total successful responses: {total_successful}")
self.logger.info(
f"✓ Stance diversity achieved: {len([s for s in stance_counts.values() if s > 0])} different stances"
)
return True
except Exception as e:
self.logger.error(f"Three-model consensus test failed with exception: {str(e)}")
return False