Add Consensus Tool for Multi-Model Perspective Gathering (#67)
* Consensus tool: get a consensus from multiple models, optionally assigning each a 'for' or 'against' stance to surface nuanced responses. WIP refactor: resolve model names once at the MCP call boundary and pass a model context around instead.

* Deduplicate model resolution so model_context is available before reaching deeper parts of the code. Improve the abstraction used when building conversations. Throw programmer errors early.

* Guardrails: support the `model:option` format at the MCP boundary so future tools can use additional options if needed, instead of handling this only for consensus. Model names now accept an optional ":option" suffix for future use.

* Simplified async flow.

* Improved request model to support natural language.

* Fix consensus tool async/sync patterns to match codebase standards.

  CRITICAL FIXES:
  - Converted _get_consensus_responses from async to sync (matches other tools)
  - Converted store_conversation_turn from async to sync (add_turn is synchronous)
  - Removed unnecessary asyncio imports and sleep calls
  - Fixed ClosedResourceError in the MCP protocol during long consensus operations

  PATTERN ALIGNMENT:
  - The consensus tool now follows the same sync patterns as all other tools
  - Only execute() and prepare_prompt() are async (base class requirement)
  - All internal operations are synchronous, like analyze, chat, debug, etc.

  TESTING:
  - MCP simulation test now passes: consensus_stance ✅
  - Two-model consensus works correctly in ~35 seconds
  - Unknown stances default to neutral with a warning
  - All 9 unit tests pass (100% success rate)

  The consensus tool's async patterns were anomalous in the codebase. This fix aligns it with the established synchronous patterns used by all other tools while maintaining full functionality.

  🤖 Generated with [Claude Code](https://claude.ai/code)

  Co-Authored-By: Claude <noreply@anthropic.com>

* Fixed call order and added a new test.

* Cleanup of dead comments. Docs for the new tool. Improved tests.

---------

Co-authored-by: Claude <noreply@anthropic.com>
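For reference, a request to the new tool looks roughly like the following. This is a minimal sketch based on the input schema added in this diff; the model names, prompt text, and stance prompts are placeholders, not part of the commit.

```python
# Illustrative arguments for the "consensus" MCP tool (values are placeholders).
arguments = {
    "prompt": "Should we adopt the proposed plugin architecture for the CLI?",
    "models": [
        {"model": "o3", "stance": "for", "stance_prompt": "Focus on benefits and opportunities."},
        {"model": "flash", "stance": "against", "stance_prompt": "Identify risks and challenges."},
        {"model": "pro"},  # stance omitted, defaults to "neutral"
    ],
    "focus_areas": ["maintainability", "operational risk"],
}
```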
committed by GitHub
parent 9b98df650b
commit 95556ba9ea
tools/consensus.py (new file, 846 lines)
@@ -0,0 +1,846 @@
"""
Consensus tool for multi-model perspective gathering and validation
"""

import json
import logging
from typing import TYPE_CHECKING, Any, Optional

from mcp.types import TextContent
from pydantic import BaseModel, Field, field_validator

if TYPE_CHECKING:
    from tools.models import ToolModelCategory

from config import DEFAULT_CONSENSUS_MAX_INSTANCES_PER_COMBINATION
from systemprompts import CONSENSUS_PROMPT

from .base import BaseTool, ToolRequest

logger = logging.getLogger(__name__)

class ModelConfig(BaseModel):
    """Enhanced model configuration for consensus tool"""

    model: str = Field(..., description="Model name to use (e.g., 'o3', 'flash', 'pro')")
    stance: Optional[str] = Field(
        default="neutral",
        description=(
            "Stance for this model. Supportive: 'for', 'support', 'favor'. "
            "Critical: 'against', 'oppose', 'critical'. Neutral: 'neutral'. "
            "Defaults to 'neutral'."
        ),
    )
    stance_prompt: Optional[str] = Field(
        default=None,
        description=(
            "Custom stance-specific instructions for this model. "
            "If provided, this will be used instead of the default stance prompt. "
            "Should be clear, specific instructions about how this model should approach the analysis."
        ),
    )

class ConsensusRequest(ToolRequest):
    """Request model for consensus tool"""

    prompt: str = Field(
        ...,
        description=(
            "Description of what to get consensus on, testing objectives, and specific scope/focus areas. "
            "Be as detailed as possible about the proposal, plan, or idea you want multiple perspectives on."
        ),
    )
    models: list[ModelConfig] = Field(
        ...,
        description=(
            "List of model configurations for consensus analysis. Each model can have a specific stance and custom instructions. "
            "Example: [{'model': 'o3', 'stance': 'for', 'stance_prompt': 'Focus on benefits and opportunities...'}, "
            "{'model': 'flash', 'stance': 'against', 'stance_prompt': 'Identify risks and challenges...'}]. "
            "Maximum 2 instances per model+stance combination."
        ),
    )
    files: Optional[list[str]] = Field(
        default_factory=list,
        description="Optional files or directories for additional context (must be absolute paths)",
    )
    images: Optional[list[str]] = Field(
        default_factory=list,
        description=(
            "Optional images showing expected UI changes, design requirements, "
            "or visual references for the consensus analysis"
        ),
    )
    focus_areas: Optional[list[str]] = Field(
        default_factory=list,
        description="Specific aspects to focus on (e.g., 'performance', 'security', 'user experience')",
    )

    @field_validator("models")
    @classmethod
    def validate_models_not_empty(cls, v):
        if not v:
            raise ValueError("At least one model must be specified")
        return v

class ConsensusTool(BaseTool):
    """Multi-model consensus tool for gathering diverse perspectives on technical proposals"""

    def __init__(self):
        super().__init__()

    @staticmethod
    def parse_structured_prompt_models(model_spec: str) -> list[dict[str, str]]:
        """
        Parse consensus model specification from structured prompt format.

        This method parses structured prompt specifications used in Claude Code shortcuts
        like "/zen:consensus:flash:for,o3:against,pro:neutral" to extract model configurations
        with their assigned stances.

        Supported formats:
        - "model:stance" - Explicit stance assignment (e.g., "flash:for", "o3:against")
        - "model" - Defaults to neutral stance (e.g., "pro" becomes "pro:neutral")

        Supported stances:
        - Supportive: "for", "support", "favor"
        - Critical: "against", "oppose", "critical"
        - Neutral: "neutral" (default)

        Args:
            model_spec (str): Comma-separated model specification string.
                Examples: "flash:for,o3:against,pro:neutral" or "flash:for,o3:against,pro"

        Returns:
            list[dict[str, str]]: List of model configuration dictionaries with keys:
                - "model": The model name (e.g., "flash", "o3", "pro")
                - "stance": The normalized stance (e.g., "for", "against", "neutral")

        Examples:
            >>> ConsensusTool.parse_structured_prompt_models("flash:for,o3:against,pro")
            [{"model": "flash", "stance": "for"}, {"model": "o3", "stance": "against"}, {"model": "pro", "stance": "neutral"}]

            >>> ConsensusTool.parse_structured_prompt_models("flash,o3,pro")
            [{"model": "flash", "stance": "neutral"}, {"model": "o3", "stance": "neutral"}, {"model": "pro", "stance": "neutral"}]
        """
        models = []

        # Split by comma to get individual model specs
        model_parts = model_spec.split(",")

        for part in model_parts:
            part = part.strip()
            if ":" in part:
                # Model with stance: "flash:for" or "o3:against"
                model_name, stance = part.split(":", 1)
                models.append({"model": model_name.strip(), "stance": stance.strip()})
            else:
                # Model without stance (defaults to neutral): "pro"
                models.append({"model": part.strip(), "stance": "neutral"})

        return models

    def get_name(self) -> str:
        return "consensus"

    def get_description(self) -> str:
        return (
            "MULTI-MODEL CONSENSUS - Gather diverse perspectives from multiple AI models on technical proposals, "
            "plans, and ideas. Perfect for validation, feasibility assessment, and getting comprehensive "
            "viewpoints on complex decisions. Supports advanced stance steering with custom instructions for each model. "
            "You can specify different stances (for/against/neutral) and provide custom stance prompts to guide each "
            "model's analysis. Example: [{'model': 'o3', 'stance': 'for', 'stance_prompt': 'Focus on implementation "
            "benefits and user value'}, {'model': 'flash', 'stance': 'against', 'stance_prompt': 'Identify potential "
            "risks and technical challenges'}]. Use neutral stances by default unless structured debate would add value."
        )

    def get_input_schema(self) -> dict[str, Any]:
        schema = {
            "type": "object",
            "properties": {
                "prompt": {
                    "type": "string",
                    "description": (
                        "Description of what to get consensus on, testing objectives, and specific scope/focus areas. "
                        "Be as detailed as possible about the proposal, plan, or idea you want multiple perspectives on."
                    ),
                },
                "models": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "model": {
                                "type": "string",
                                "description": "Model name to use (e.g., 'o3', 'flash', 'pro')",
                            },
                            "stance": {
                                "type": "string",
                                "enum": ["for", "support", "favor", "against", "oppose", "critical", "neutral"],
                                "description": "Stance for this model: supportive ('for', 'support', 'favor'), critical ('against', 'oppose', 'critical'), or 'neutral'",
                                "default": "neutral",
                            },
                            "stance_prompt": {
                                "type": "string",
                                "description": "Custom stance-specific instructions for this model. If provided, this will be used instead of the default stance prompt.",
                            },
                        },
                        "required": ["model"],
                    },
                    "description": (
                        "List of model configurations for consensus analysis. Each model can have a specific stance and custom instructions. "
                        "Example: [{'model': 'o3', 'stance': 'for', 'stance_prompt': 'Focus on benefits and opportunities...'}, "
                        "{'model': 'flash', 'stance': 'against', 'stance_prompt': 'Identify risks and challenges...'}]. "
                        "Maximum 2 instances per model+stance combination."
                    ),
                },
                "files": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Optional files or directories for additional context (must be absolute paths)",
                },
                "images": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": (
                        "Optional images showing expected UI changes, design requirements, "
                        "or visual references for the consensus analysis"
                    ),
                },
                "focus_areas": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Specific aspects to focus on (e.g., 'performance', 'security', 'user experience')",
                },
                "temperature": {
                    "type": "number",
                    "description": "Temperature (0-1, default 0.2 for consistency)",
                    "minimum": 0,
                    "maximum": 1,
                    "default": self.get_default_temperature(),
                },
                "thinking_mode": {
                    "type": "string",
                    "enum": ["minimal", "low", "medium", "high", "max"],
                    "description": (
                        "Thinking depth: minimal (0.5% of model max), low (8%), medium (33%), "
                        "high (67%), max (100% of model max)"
                    ),
                },
                "use_websearch": {
                    "type": "boolean",
                    "description": (
                        "Enable web search for documentation, best practices, and current information. "
                        "Particularly useful for: brainstorming sessions, architectural design discussions, "
                        "exploring industry best practices, working with specific frameworks/technologies, "
                        "researching solutions to complex problems, or when current documentation and "
                        "community insights would enhance the analysis."
                    ),
                    "default": True,
                },
                "continuation_id": {
                    "type": "string",
                    "description": (
                        "Thread continuation ID for multi-turn conversations. Can be used to continue "
                        "conversations across different tools. Only provide this if continuing a previous "
                        "conversation thread."
                    ),
                },
            },
            "required": ["prompt", "models"],
        }

        return schema

    def get_system_prompt(self) -> str:
        return CONSENSUS_PROMPT

    def get_default_temperature(self) -> float:
        return 0.2  # Lower temperature for more consistent consensus responses

    def get_model_category(self) -> "ToolModelCategory":
        """Consensus uses extended reasoning models for deep analysis"""
        from tools.models import ToolModelCategory

        return ToolModelCategory.EXTENDED_REASONING

    def get_request_model(self):
        return ConsensusRequest

    def format_conversation_turn(self, turn) -> list[str]:
        """
        Format consensus turns with individual model responses for better readability.

        This custom formatting shows the individual model responses that were
        synthesized into the consensus, making it easier to understand the
        reasoning behind the final recommendation.
        """
        parts = []

        # Add files context if present
        if turn.files:
            parts.append(f"Files used in this turn: {', '.join(turn.files)}")
            parts.append("")

        # Check if this is a consensus turn with individual responses
        if turn.model_metadata and turn.model_metadata.get("individual_responses"):
            individual_responses = turn.model_metadata["individual_responses"]

            # Add consensus header
            models_consulted = []
            for resp in individual_responses:
                model = resp["model"]
                stance = resp.get("stance", "neutral")
                if stance != "neutral":
                    models_consulted.append(f"{model}:{stance}")
                else:
                    models_consulted.append(model)

            parts.append(f"Models consulted: {', '.join(models_consulted)}")
            parts.append("")
            parts.append("=== INDIVIDUAL MODEL RESPONSES ===")
            parts.append("")

            # Add each successful model response
            for i, response in enumerate(individual_responses):
                model_name = response["model"]
                stance = response.get("stance", "neutral")
                verdict = response["verdict"]

                stance_label = f"({stance.title()} Stance)" if stance != "neutral" else "(Neutral Analysis)"
                parts.append(f"**{model_name.upper()} {stance_label}**:")
                parts.append(verdict)

                if i < len(individual_responses) - 1:
                    parts.append("")
                    parts.append("---")
                    parts.append("")

            parts.append("=== END INDIVIDUAL RESPONSES ===")
            parts.append("")
            parts.append("Claude's Synthesis:")

        # Add the actual content
        parts.append(turn.content)

        return parts

    def _normalize_stance(self, stance: Optional[str]) -> str:
        """Normalize stance to canonical form."""
        if not stance:
            return "neutral"

        stance = stance.lower()

        # Define stance synonyms
        supportive_stances = {"for", "support", "favor"}
        critical_stances = {"against", "oppose", "critical"}

        # Map synonyms to canonical stance
        if stance in supportive_stances:
            return "for"
        elif stance in critical_stances:
            return "against"
        elif stance == "neutral":
            return "neutral"
        else:
            # Unknown stances default to neutral for robustness
            logger.warning(
                f"Unknown stance '{stance}' provided, defaulting to 'neutral'. Valid stances: {', '.join(sorted(supportive_stances | critical_stances))}, or 'neutral'"
            )
            return "neutral"

    def _validate_model_combinations(self, model_configs: list[ModelConfig]) -> tuple[list[ModelConfig], list[str]]:
        """Validate model configurations and enforce limits.

        Returns:
            tuple: (valid_configs, skipped_entries)
            - Each model+stance combination can appear max 2 times
            - Same model+stance limited to 2 instances
        """
        valid_configs = []
        skipped_entries = []
        combination_counts = {}  # Track (model, stance) -> count

        for config in model_configs:
            try:
                # Normalize stance
                normalized_stance = self._normalize_stance(config.stance)

                # Create normalized config
                normalized_config = ModelConfig(
                    model=config.model, stance=normalized_stance, stance_prompt=config.stance_prompt
                )

                combination_key = (config.model, normalized_stance)
                current_count = combination_counts.get(combination_key, 0)

                if current_count >= DEFAULT_CONSENSUS_MAX_INSTANCES_PER_COMBINATION:
                    # Already have max instances of this model+stance combination
                    skipped_entries.append(
                        f"{config.model}:{normalized_stance} (max {DEFAULT_CONSENSUS_MAX_INSTANCES_PER_COMBINATION} instances)"
                    )
                    continue

                combination_counts[combination_key] = current_count + 1
                valid_configs.append(normalized_config)

            except ValueError as e:
                # Invalid stance or model
                skipped_entries.append(f"{config.model} ({str(e)})")
                continue

        return valid_configs, skipped_entries

    def _get_stance_enhanced_prompt(self, stance: str, custom_stance_prompt: Optional[str] = None) -> str:
        """Get the system prompt with stance injection based on the stance."""
        base_prompt = self.get_system_prompt()

        # If custom stance prompt is provided, use it instead of default
        if custom_stance_prompt:
            # Validate stance placeholder exists exactly once
            if base_prompt.count("{stance_prompt}") != 1:
                raise ValueError(
                    "System prompt must contain exactly one '{stance_prompt}' placeholder, "
                    f"found {base_prompt.count('{stance_prompt}')}"
                )
            return base_prompt.replace("{stance_prompt}", custom_stance_prompt)

        stance_prompts = {
            "for": """SUPPORTIVE PERSPECTIVE WITH INTEGRITY

You are tasked with advocating FOR this proposal, but with CRITICAL GUARDRAILS:

MANDATORY ETHICAL CONSTRAINTS:
- This is NOT a debate for entertainment. You MUST act in good faith and in the best interest of the questioner
- You MUST think deeply about whether supporting this idea is safe, sound, and passes essential requirements
- You MUST be direct and unequivocal in saying "this is a bad idea" when it truly is
- There must be at least ONE COMPELLING reason to be optimistic, otherwise DO NOT support it

WHEN TO REFUSE SUPPORT (MUST OVERRIDE STANCE):
- If the idea is fundamentally harmful to users, project, or stakeholders
- If implementation would violate security, privacy, or ethical standards
- If the proposal is technically infeasible within realistic constraints
- If costs/risks dramatically outweigh any potential benefits

YOUR SUPPORTIVE ANALYSIS SHOULD:
- Identify genuine strengths and opportunities
- Propose solutions to overcome legitimate challenges
- Highlight synergies with existing systems
- Suggest optimizations that enhance value
- Present realistic implementation pathways

Remember: Being "for" means finding the BEST possible version of the idea IF it has merit, not blindly supporting bad ideas.""",
            "against": """CRITICAL PERSPECTIVE WITH RESPONSIBILITY

You are tasked with critiquing this proposal, but with ESSENTIAL BOUNDARIES:

MANDATORY FAIRNESS CONSTRAINTS:
- You MUST NOT oppose genuinely excellent, common-sense ideas just to be contrarian
- You MUST acknowledge when a proposal is fundamentally sound and well-conceived
- You CANNOT give harmful advice or recommend against beneficial changes
- If the idea is outstanding, say so clearly while offering constructive refinements

WHEN TO MODERATE CRITICISM (MUST OVERRIDE STANCE):
- If the proposal addresses critical user needs effectively
- If it follows established best practices with good reason
- If benefits clearly and substantially outweigh risks
- If it's the obvious right solution to the problem

YOUR CRITICAL ANALYSIS SHOULD:
- Identify legitimate risks and failure modes
- Point out overlooked complexities
- Suggest more efficient alternatives
- Highlight potential negative consequences
- Question assumptions that may be flawed

Remember: Being "against" means rigorous scrutiny to ensure quality, not undermining good ideas that deserve support.""",
            "neutral": """BALANCED PERSPECTIVE

Provide objective analysis considering both positive and negative aspects. However, if there is overwhelming evidence
that the proposal clearly leans toward being exceptionally good or particularly problematic, you MUST accurately
reflect this reality. Being "balanced" means being truthful about the weight of evidence, not artificially creating
50/50 splits when the reality is 90/10.

Your analysis should:
- Present all significant pros and cons discovered
- Weight them according to actual impact and likelihood
- If evidence strongly favors one conclusion, clearly state this
- Provide proportional coverage based on the strength of arguments
- Help the questioner see the true balance of considerations

Remember: Artificial balance that misrepresents reality is not helpful. True balance means accurate representation
of the evidence, even when it strongly points in one direction.""",
        }

        stance_prompt = stance_prompts.get(stance, stance_prompts["neutral"])

        # Validate stance placeholder exists exactly once
        if base_prompt.count("{stance_prompt}") != 1:
            raise ValueError(
                "System prompt must contain exactly one '{stance_prompt}' placeholder, "
                f"found {base_prompt.count('{stance_prompt}')}"
            )

        # Inject stance into the system prompt
        return base_prompt.replace("{stance_prompt}", stance_prompt)

    def _get_single_response(
        self, provider, model_config: ModelConfig, prompt: str, request: ConsensusRequest
    ) -> dict[str, Any]:
        """Get response from a single model - synchronous method."""
        logger.debug(f"Getting response from {model_config.model} with stance '{model_config.stance}'")

        try:
            # Provider.generate_content is synchronous, not async
            response = provider.generate_content(
                prompt=prompt,
                model_name=model_config.model,
                system_prompt=self._get_stance_enhanced_prompt(model_config.stance, model_config.stance_prompt),
                temperature=getattr(request, "temperature", None) or self.get_default_temperature(),
                thinking_mode=getattr(request, "thinking_mode", "medium"),
                images=getattr(request, "images", None) or [],
            )
            return {
                "model": model_config.model,
                "stance": model_config.stance,
                "status": "success",
                "verdict": response.content,  # Contains structured Markdown
                "metadata": {
                    "provider": getattr(provider.get_provider_type(), "value", provider.get_provider_type()),
                    "usage": response.usage if hasattr(response, "usage") else None,
                    "custom_stance_prompt": bool(model_config.stance_prompt),
                },
            }
        except Exception as e:
            logger.error(f"Error getting response from {model_config.model}:{model_config.stance}: {str(e)}")
            return {"model": model_config.model, "stance": model_config.stance, "status": "error", "error": str(e)}

    def _get_consensus_responses(
        self, provider_configs: list[tuple], prompt: str, request: ConsensusRequest
    ) -> list[dict[str, Any]]:
        """Execute all model requests sequentially - purely synchronous like other tools."""

        logger.debug(f"Processing {len(provider_configs)} models sequentially")
        responses = []

        for i, (provider, model_config) in enumerate(provider_configs):
            try:
                logger.debug(
                    f"Processing {model_config.model}:{model_config.stance} sequentially ({i+1}/{len(provider_configs)})"
                )

                # Direct synchronous call - matches pattern of other tools
                response = self._get_single_response(provider, model_config, prompt, request)
                responses.append(response)

            except Exception as e:
                logger.error(f"Failed to get response from {model_config.model}:{model_config.stance}: {str(e)}")
                responses.append(
                    {
                        "model": model_config.model,
                        "stance": model_config.stance,
                        "status": "error",
                        "error": f"Unhandled exception: {str(e)}",
                    }
                )

        logger.debug(f"Sequential processing completed for {len(responses)} models")
        return responses

    def _format_consensus_output(self, responses: list[dict[str, Any]], skipped_entries: list[str]) -> str:
        """Format the consensus responses into structured output for Claude."""

        logger.debug(f"Formatting consensus output for {len(responses)} responses")

        # Separate successful and failed responses
        successful_responses = [r for r in responses if r["status"] == "success"]
        failed_responses = [r for r in responses if r["status"] == "error"]

        logger.debug(f"Successful responses: {len(successful_responses)}, Failed: {len(failed_responses)}")

        # Prepare the structured output (minimize size for MCP stability)
        models_used = [
            f"{r['model']}:{r['stance']}" if r["stance"] != "neutral" else r["model"] for r in successful_responses
        ]
        models_errored = [
            f"{r['model']}:{r['stance']}" if r["stance"] != "neutral" else r["model"] for r in failed_responses
        ]

        # Prepare clean responses without truncation
        clean_responses = []
        for r in responses:
            if r["status"] == "success":
                clean_responses.append(
                    {
                        "model": r["model"],
                        "stance": r["stance"],
                        "status": r["status"],
                        "verdict": r.get("verdict", ""),
                        "metadata": r.get("metadata", {}),
                    }
                )
            else:
                clean_responses.append(
                    {
                        "model": r["model"],
                        "stance": r["stance"],
                        "status": r["status"],
                        "error": r.get("error", "Unknown error"),
                    }
                )

        output_data = {
            "status": "consensus_success" if successful_responses else "consensus_failed",
            "models_used": models_used,
            "models_skipped": skipped_entries,
            "models_errored": models_errored,
            "responses": clean_responses,
            "next_steps": self._get_synthesis_guidance(successful_responses, failed_responses),
        }

        return json.dumps(output_data, indent=2)

    def _get_synthesis_guidance(
        self, successful_responses: list[dict[str, Any]], failed_responses: list[dict[str, Any]]
    ) -> str:
        """Generate guidance for Claude on how to synthesize the consensus results."""

        if not successful_responses:
            return (
                "No models provided successful responses. Please retry with different models or "
                "check the error messages for guidance on resolving the issues."
            )

        if len(successful_responses) == 1:
            return (
                "Only one model provided a successful response. Synthesize based on the available "
                "perspective and indicate areas where additional expert input would be valuable "
                "due to the limited consensus data."
            )

        # Multiple successful responses - provide comprehensive synthesis guidance
        stance_counts = {"for": 0, "against": 0, "neutral": 0}
        for resp in successful_responses:
            stance = resp.get("stance", "neutral")
            stance_counts[stance] = stance_counts.get(stance, 0) + 1

        guidance = (
            "Claude, synthesize these perspectives by first identifying the key points of "
            "**agreement** and **disagreement** between the models. Then provide your final, "
            "consolidated recommendation, explaining how you weighed the different opinions and "
            "why your proposed solution is the most balanced approach. Explicitly address the "
            "most critical risks raised by each model and provide actionable next steps for implementation."
        )

        if failed_responses:
            guidance += (
                f" Note: {len(failed_responses)} model(s) failed to respond - consider this "
                "partial consensus and indicate where additional expert input would strengthen the analysis."
            )

        return guidance

    async def prepare_prompt(self, request: ConsensusRequest) -> str:
        """Prepare the consensus prompt with context files and focus areas."""
        # Check for prompt.txt in files
        prompt_content, updated_files = self.handle_prompt_file(request.files)

        # Use prompt.txt content if available, otherwise use the prompt field
        user_content = prompt_content if prompt_content else request.prompt

        # Check user input size at MCP transport boundary (before adding internal content)
        size_check = self.check_prompt_size(user_content)
        if size_check:
            # Need to return an error, but prepare_prompt returns str,
            # so use an exception to handle this cleanly
            from tools.models import ToolOutput

            raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**size_check).model_dump_json()}")

        # Update request files list
        if updated_files is not None:
            request.files = updated_files

        # Add focus areas if specified
        if request.focus_areas:
            focus_areas_text = "\n\nSpecific focus areas for this analysis:\n" + "\n".join(
                f"- {area}" for area in request.focus_areas
            )
            user_content += focus_areas_text

        # Add context files if provided (using centralized file handling with filtering)
        if request.files:
            file_content, processed_files = self._prepare_file_content_for_prompt(
                request.files, request.continuation_id, "Context files"
            )
            self._actually_processed_files = processed_files
            if file_content:
                user_content = f"{user_content}\n\n=== CONTEXT FILES ===\n{file_content}\n=== END CONTEXT ==="

        # Check token limits
        self._validate_token_limit(user_content, "Content")

        return user_content

    async def execute(self, arguments: dict[str, Any]) -> list[TextContent]:
        """Execute consensus gathering from multiple models."""

        # Store arguments for base class methods
        self._current_arguments = arguments

        # Validate and create request
        request = ConsensusRequest(**arguments)

        # Validate model configurations and enforce limits
        valid_configs, skipped_entries = self._validate_model_combinations(request.models)

        if not valid_configs:
            error_output = {
                "status": "consensus_failed",
                "error": "No valid model configurations after validation",
                "models_skipped": skipped_entries,
                "next_steps": "Please provide valid model configurations with proper model names and stance values.",
            }
            return [TextContent(type="text", text=json.dumps(error_output, indent=2))]

        # Set up a dummy model context for consensus since we handle multiple models
        # This is needed for base class methods like prepare_prompt to work
        if not hasattr(self, "_model_context") or not self._model_context:
            from utils.model_context import ModelContext

            # Use the first model as the representative for token calculations
            first_model = valid_configs[0].model if valid_configs else "flash"
            self._model_context = ModelContext(first_model)

        # Handle conversation continuation if specified
        if request.continuation_id:
            from utils.conversation_memory import build_conversation_history, get_thread

            thread_context = get_thread(request.continuation_id)
            if thread_context:
                # Build conversation history using the same pattern as other tools
                conversation_context, _ = build_conversation_history(thread_context, self._model_context)
                if conversation_context:
                    # Add conversation context to the beginning of the prompt
                    enhanced_prompt = f"{conversation_context}\n\n{request.prompt}"
                    request.prompt = enhanced_prompt

        # Prepare the consensus prompt
        consensus_prompt = await self.prepare_prompt(request)

        # Get providers for valid model configurations with caching to avoid duplicate lookups
        provider_configs = []
        provider_cache = {}  # Cache to avoid duplicate provider lookups

        for model_config in valid_configs:
            try:
                # Check cache first
                if model_config.model in provider_cache:
                    provider = provider_cache[model_config.model]
                else:
                    # Look up provider and cache it
                    provider = self.get_model_provider(model_config.model)
                    provider_cache[model_config.model] = provider

                provider_configs.append((provider, model_config))
            except Exception as e:
                # Track failed models
                model_display = (
                    f"{model_config.model}:{model_config.stance}"
                    if model_config.stance != "neutral"
                    else model_config.model
                )
                skipped_entries.append(f"{model_display} (provider not available: {str(e)})")

        if not provider_configs:
            error_output = {
                "status": "consensus_failed",
                "error": "No model providers available",
                "models_skipped": skipped_entries,
                "next_steps": "Please check that the specified models have configured API keys and are available.",
            }
            return [TextContent(type="text", text=json.dumps(error_output, indent=2))]

        # Send to all models sequentially (purely synchronous like other tools)
        logger.debug(f"Sending consensus request to {len(provider_configs)} models")
        responses = self._get_consensus_responses(provider_configs, consensus_prompt, request)
        logger.debug(f"Received {len(responses)} responses from consensus models")

        # Enforce minimum success requirement - must have at least 1 successful response
        successful_responses = [r for r in responses if r["status"] == "success"]
        if not successful_responses:
            error_output = {
                "status": "consensus_failed",
                "error": "All model calls failed - no successful responses received",
                "models_skipped": skipped_entries,
                "models_errored": [
                    f"{r['model']}:{r['stance']}" if r["stance"] != "neutral" else r["model"]
                    for r in responses
                    if r["status"] == "error"
                ],
                "next_steps": "Please retry with different models or check the error messages for guidance on resolving the issues.",
            }
            return [TextContent(type="text", text=json.dumps(error_output, indent=2))]

        logger.debug("About to format consensus output for MCP response")

        # Structure the output and store in conversation memory
        consensus_output = self._format_consensus_output(responses, skipped_entries)

        # Log response size for debugging
        output_size = len(consensus_output)
        logger.debug(f"Consensus output size: {output_size:,} characters")

        # Store in conversation memory if continuation_id is provided
        if request.continuation_id:
            self.store_conversation_turn(
                request.continuation_id,
                consensus_output,
                request.files,
                request.images,
                responses,  # Store individual responses in metadata
                skipped_entries,
            )

        return [TextContent(type="text", text=consensus_output)]

    def store_conversation_turn(
        self,
        continuation_id: str,
        output: str,
        files: list[str],
        images: list[str],
        responses: list[dict[str, Any]],
        skipped_entries: list[str],
    ):
        """Store consensus turn in conversation memory with special metadata."""
        from utils.conversation_memory import add_turn

        # Filter successful and failed responses
        successful_responses = [r for r in responses if r["status"] == "success"]
        failed_responses = [r for r in responses if r["status"] == "error"]

        # Prepare metadata for conversation storage
        metadata = {
            "tool_type": "consensus",
            "models_used": [r["model"] for r in successful_responses],
            "models_skipped": skipped_entries,
            "models_errored": [r["model"] for r in failed_responses],
            "individual_responses": successful_responses,  # Only store successful responses
        }

        # Store the turn with special consensus metadata - add_turn is synchronous
        add_turn(
            thread_id=continuation_id,
            role="assistant",
            content=output,
            files=files or [],
            images=images or [],
            tool_name="consensus",
            model_provider="consensus",  # Special provider name
            model_name="consensus",  # Special model name
            model_metadata=metadata,
        )