Add Consensus Tool for Multi-Model Perspective Gathering (#67)
* WIP: Refactor model-name resolution so it happens once at the MCP call boundary and a model context is passed around instead. The consensus tool gathers a consensus from multiple models, optionally assigning each a 'for' or 'against' stance to draw out nuanced responses.
* Deduplicate model resolution so model_context is available before reaching deeper parts of the code. Improve the abstraction used when building conversations. Throw programmer errors early.
* Guardrails: support the `model:option` format at the MCP boundary so future tools can use additional options if needed, instead of handling this only for consensus. A model name now accepts an optional ":option" suffix for future use.
* Simplified async flow.
* Improved the request model to support natural language.
* Fix consensus tool async/sync patterns to match codebase standards.

  CRITICAL FIXES:
  - Converted _get_consensus_responses from async to sync (matches other tools)
  - Converted store_conversation_turn from async to sync (add_turn is synchronous)
  - Removed unnecessary asyncio imports and sleep calls
  - Fixed ClosedResourceError in the MCP protocol during long consensus operations

  PATTERN ALIGNMENT:
  - The consensus tool now follows the same sync patterns as all other tools
  - Only execute() and prepare_prompt() are async (base class requirement)
  - All internal operations are synchronous, like analyze, chat, debug, etc.

  TESTING:
  - MCP simulation test now passes: consensus_stance ✅
  - Two-model consensus works correctly in ~35 seconds
  - Unknown stance handling defaults to neutral with warnings
  - All 9 unit tests pass (100% success rate)

  The consensus tool's async patterns were anomalous in the codebase. This fix aligns it with the established synchronous patterns used by all other tools while maintaining full functionality.

  🤖 Generated with [Claude Code](https://claude.ai/code)

  Co-Authored-By: Claude <noreply@anthropic.com>
* Fixed call order and added a new test.
* Cleanup: removed dead comments, added docs for the new tool, improved tests.

---------

Co-authored-by: Claude <noreply@anthropic.com>
Committed via GitHub · commit 95556ba9ea · parent 9b98df650b
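For orientation, here is a minimal sketch of what a consensus request looks like, based on the input schema added in tools/consensus.py below. The question text and stance prompts are illustrative only, running it requires the relevant provider API keys, and in normal use the arguments arrive through the MCP server rather than a direct call.

```python
import asyncio

from tools.consensus import ConsensusTool

# Illustrative arguments matching the ConsensusRequest schema defined in this diff.
arguments = {
    "prompt": "Should we migrate the background workers to a simpler cron-based queue?",
    "models": [
        {"model": "o3", "stance": "for", "stance_prompt": "Focus on operational simplicity and cost."},
        {"model": "flash", "stance": "against", "stance_prompt": "Identify reliability and scaling risks."},
        {"model": "pro"},  # no stance given -> defaults to "neutral"
    ],
    "focus_areas": ["reliability", "operational cost"],
}

# execute() is async (base class requirement); everything inside it runs synchronously.
result = asyncio.run(ConsensusTool().execute(arguments))
# The tool returns JSON with models_used, models_skipped, models_errored, responses, next_steps.
print(result[0].text)
```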
tools/__init__.py

@@ -5,6 +5,7 @@ Tool implementations for Zen MCP Server
 from .analyze import AnalyzeTool
 from .chat import ChatTool
 from .codereview import CodeReviewTool
+from .consensus import ConsensusTool
 from .debug import DebugIssueTool
 from .listmodels import ListModelsTool
 from .precommit import Precommit
@@ -19,6 +20,7 @@ __all__ = [
     "DebugIssueTool",
     "AnalyzeTool",
     "ChatTool",
+    "ConsensusTool",
     "ListModelsTool",
     "Precommit",
     "RefactorTool",

tools/analyze.py

@@ -141,13 +141,7 @@ class AnalyzeTool(BaseTool):
         if updated_files is not None:
             request.files = updated_files

-        # MCP boundary check - STRICT REJECTION
-        if request.files:
-            file_size_check = self.check_total_file_size(request.files)
-            if file_size_check:
-                from tools.models import ToolOutput
-
-                raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**file_size_check).model_dump_json()}")
+        # File size validation happens at MCP boundary in server.py

         # Use centralized file processing logic
         continuation_id = getattr(request, "continuation_id", None)
tools/base.py — 326 changed lines
@@ -31,6 +31,7 @@ from providers.base import ProviderType
 from utils import check_token_limit
 from utils.conversation_memory import (
     MAX_CONVERSATION_TURNS,
+    ConversationTurn,
     add_turn,
     create_thread,
     get_conversation_file_list,
@@ -643,6 +644,41 @@ class BaseTool(ABC):
             )
         return requested_files

+    def format_conversation_turn(self, turn: ConversationTurn) -> list[str]:
+        """
+        Format a conversation turn for display in conversation history.
+
+        Tools can override this to provide custom formatting for their responses
+        while maintaining the standard structure for cross-tool compatibility.
+
+        This method is called by build_conversation_history when reconstructing
+        conversation context, allowing each tool to control how its responses
+        appear in subsequent conversation turns.
+
+        Args:
+            turn: The conversation turn to format (from utils.conversation_memory)
+
+        Returns:
+            list[str]: Lines of formatted content for this turn
+
+        Example:
+            Default implementation returns:
+            ["Files used in this turn: file1.py, file2.py", "", "Response content..."]
+
+        Tools can override to add custom sections, formatting, or metadata display.
+        """
+        parts = []
+
+        # Add files context if present
+        if turn.files:
+            parts.append(f"Files used in this turn: {', '.join(turn.files)}")
+            parts.append("")  # Empty line for readability
+
+        # Add the actual content
+        parts.append(turn.content)
+
+        return parts
+
     def _prepare_file_content_for_prompt(
         self,
         request_files: list[str],
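As the docstring notes, tools can override this hook. A minimal sketch of what an override might look like follows; the tool class and header text are hypothetical, and the other abstract methods BaseTool requires are omitted for brevity.

```python
from tools.base import BaseTool
from utils.conversation_memory import ConversationTurn


class AuditTool(BaseTool):  # hypothetical tool; other required BaseTool methods omitted
    def format_conversation_turn(self, turn: ConversationTurn) -> list[str]:
        parts = []
        # Keep the standard files section for cross-tool compatibility
        if turn.files:
            parts.append(f"Files used in this turn: {', '.join(turn.files)}")
            parts.append("")
        # Add a tool-specific header before the response body
        parts.append("=== AUDIT FINDINGS ===")
        parts.append(turn.content)
        return parts
```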
@@ -716,109 +752,35 @@ class BaseTool(ABC):
|
||||
elif max_tokens is not None:
|
||||
effective_max_tokens = max_tokens - reserve_tokens
|
||||
else:
|
||||
# Get model-specific limits
|
||||
# First check if model_context was passed from server.py
|
||||
model_context = None
|
||||
if arguments:
|
||||
model_context = arguments.get("_model_context") or getattr(self, "_current_arguments", {}).get(
|
||||
"_model_context"
|
||||
# The execute() method is responsible for setting self._model_context.
|
||||
# A missing context is a programming error, not a fallback case.
|
||||
if not hasattr(self, "_model_context") or not self._model_context:
|
||||
logger.error(
|
||||
f"[FILES] {self.name}: _prepare_file_content_for_prompt called without a valid model context. "
|
||||
"This indicates an incorrect call sequence in the tool's implementation."
|
||||
)
|
||||
# Fail fast to reveal integration issues. A silent fallback with arbitrary
|
||||
# limits can hide bugs and lead to unexpected token usage or silent failures.
|
||||
raise RuntimeError("ModelContext not initialized before file preparation.")
|
||||
|
||||
if model_context:
|
||||
# Use the passed model context
|
||||
try:
|
||||
token_allocation = model_context.calculate_token_allocation()
|
||||
effective_max_tokens = token_allocation.file_tokens - reserve_tokens
|
||||
logger.debug(
|
||||
f"[FILES] {self.name}: Using passed model context for {model_context.model_name}: "
|
||||
f"{token_allocation.file_tokens:,} file tokens from {token_allocation.total_tokens:,} total"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"[FILES] {self.name}: Error using passed model context: {e}")
|
||||
# Fall through to manual calculation
|
||||
model_context = None
|
||||
|
||||
if not model_context:
|
||||
# Manual calculation as fallback
|
||||
from config import DEFAULT_MODEL
|
||||
|
||||
model_name = getattr(self, "_current_model_name", None) or DEFAULT_MODEL
|
||||
|
||||
# Handle auto mode gracefully
|
||||
if model_name.lower() == "auto":
|
||||
from providers.registry import ModelProviderRegistry
|
||||
|
||||
# Use tool-specific fallback model for capacity estimation
|
||||
# This properly handles different providers (OpenAI=200K, Gemini=1M)
|
||||
tool_category = self.get_model_category()
|
||||
fallback_model = ModelProviderRegistry.get_preferred_fallback_model(tool_category)
|
||||
logger.debug(
|
||||
f"[FILES] {self.name}: Auto mode detected, using {fallback_model} "
|
||||
f"for {tool_category.value} tool capacity estimation"
|
||||
)
|
||||
|
||||
try:
|
||||
provider = self.get_model_provider(fallback_model)
|
||||
capabilities = provider.get_capabilities(fallback_model)
|
||||
|
||||
# Calculate content allocation based on model capacity
|
||||
if capabilities.context_window < 300_000:
|
||||
# Smaller context models: 60% content, 40% response
|
||||
model_content_tokens = int(capabilities.context_window * 0.6)
|
||||
else:
|
||||
# Larger context models: 80% content, 20% response
|
||||
model_content_tokens = int(capabilities.context_window * 0.8)
|
||||
|
||||
effective_max_tokens = model_content_tokens - reserve_tokens
|
||||
logger.debug(
|
||||
f"[FILES] {self.name}: Using {fallback_model} capacity for auto mode: "
|
||||
f"{model_content_tokens:,} content tokens from {capabilities.context_window:,} total"
|
||||
)
|
||||
except (ValueError, AttributeError) as e:
|
||||
# Handle specific errors: provider not found, model not supported, missing attributes
|
||||
logger.warning(
|
||||
f"[FILES] {self.name}: Could not get capabilities for fallback model {fallback_model}: {type(e).__name__}: {e}"
|
||||
)
|
||||
# Fall back to conservative default for safety
|
||||
effective_max_tokens = 100_000 - reserve_tokens
|
||||
except Exception as e:
|
||||
# Catch any other unexpected errors
|
||||
logger.error(
|
||||
f"[FILES] {self.name}: Unexpected error getting model capabilities: {type(e).__name__}: {e}"
|
||||
)
|
||||
effective_max_tokens = 100_000 - reserve_tokens
|
||||
else:
|
||||
# Normal mode - use the specified model
|
||||
try:
|
||||
provider = self.get_model_provider(model_name)
|
||||
capabilities = provider.get_capabilities(model_name)
|
||||
|
||||
# Calculate content allocation based on model capacity
|
||||
if capabilities.context_window < 300_000:
|
||||
# Smaller context models: 60% content, 40% response
|
||||
model_content_tokens = int(capabilities.context_window * 0.6)
|
||||
else:
|
||||
# Larger context models: 80% content, 20% response
|
||||
model_content_tokens = int(capabilities.context_window * 0.8)
|
||||
|
||||
effective_max_tokens = model_content_tokens - reserve_tokens
|
||||
logger.debug(
|
||||
f"[FILES] {self.name}: Using model-specific limit for {model_name}: "
|
||||
f"{model_content_tokens:,} content tokens from {capabilities.context_window:,} total"
|
||||
)
|
||||
except (ValueError, AttributeError) as e:
|
||||
# Handle specific errors: provider not found, model not supported, missing attributes
|
||||
logger.warning(
|
||||
f"[FILES] {self.name}: Could not get model capabilities for {model_name}: {type(e).__name__}: {e}"
|
||||
)
|
||||
# Fall back to conservative default for safety
|
||||
effective_max_tokens = 100_000 - reserve_tokens
|
||||
except Exception as e:
|
||||
# Catch any other unexpected errors
|
||||
logger.error(
|
||||
f"[FILES] {self.name}: Unexpected error getting model capabilities: {type(e).__name__}: {e}"
|
||||
)
|
||||
effective_max_tokens = 100_000 - reserve_tokens
|
||||
# This is now the single source of truth for token allocation.
|
||||
model_context = self._model_context
|
||||
try:
|
||||
token_allocation = model_context.calculate_token_allocation()
|
||||
# Standardize on `file_tokens` for consistency and correctness.
|
||||
# This fixes the bug where the old code incorrectly used content_tokens
|
||||
effective_max_tokens = token_allocation.file_tokens - reserve_tokens
|
||||
logger.debug(
|
||||
f"[FILES] {self.name}: Using model context for {model_context.model_name}: "
|
||||
f"{token_allocation.file_tokens:,} file tokens from {token_allocation.total_tokens:,} total"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"[FILES] {self.name}: Failed to calculate token allocation from model context: {e}", exc_info=True
|
||||
)
|
||||
# If the context exists but calculation fails, we still need to prevent a crash.
|
||||
# A loud error is logged, and we fall back to a safe default.
|
||||
effective_max_tokens = 100_000 - reserve_tokens
|
||||
|
||||
# Ensure we have a reasonable minimum budget
|
||||
effective_max_tokens = max(1000, effective_max_tokens)
|
||||
@@ -1087,8 +1049,14 @@ When recommending searches, be specific about what information you need and why

         # Get model capabilities to check image support and size limits
         try:
-            provider = self.get_model_provider(model_name)
-            capabilities = provider.get_capabilities(model_name)
+            # Use the already-resolved provider from model context if available
+            if hasattr(self, "_model_context") and self._model_context:
+                provider = self._model_context.provider
+                capabilities = self._model_context.capabilities
+            else:
+                # Fallback for edge cases (e.g., direct test calls)
+                provider = self.get_model_provider(model_name)
+                capabilities = provider.get_capabilities(model_name)
         except Exception as e:
             logger.warning(f"Failed to get capabilities for model {model_name}: {e}")
             # Fall back to checking custom models configuration
@@ -1214,7 +1182,7 @@ When recommending searches, be specific about what information you need and why

         return estimate_file_tokens(file_path)

-    def check_total_file_size(self, files: list[str]) -> Optional[dict[str, Any]]:
+    def check_total_file_size(self, files: list[str], model_name: str) -> Optional[dict[str, Any]]:
         """
         Check if total file sizes would exceed token threshold before embedding.

@@ -1224,6 +1192,7 @@ When recommending searches, be specific about what information you need and why

         Args:
             files: List of file paths to check
+            model_name: The resolved model name to use for token limits

         Returns:
             Dict with `code_too_large` response if too large, None if acceptable
@@ -1231,13 +1200,6 @@ When recommending searches, be specific about what information you need and why
         if not files:
             return None

-        # Get current model name for context-aware thresholds
-        model_name = getattr(self, "_current_model_name", None)
-        if not model_name:
-            from config import DEFAULT_MODEL
-
-            model_name = DEFAULT_MODEL
-
         # Use centralized file size checking with model context
         from utils.file_utils import check_total_file_size as check_file_size_utility

@@ -1353,6 +1315,65 @@ When recommending searches, be specific about what information you need and why
|
||||
# Extract and validate images from request
|
||||
images = getattr(request, "images", None) or []
|
||||
|
||||
# MODEL RESOLUTION NOW HAPPENS AT MCP BOUNDARY
|
||||
# Extract pre-resolved model context from server.py
|
||||
model_context = self._current_arguments.get("_model_context")
|
||||
resolved_model_name = self._current_arguments.get("_resolved_model_name")
|
||||
|
||||
if model_context and resolved_model_name:
|
||||
# Model was already resolved at MCP boundary
|
||||
model_name = resolved_model_name
|
||||
logger.debug(f"Using pre-resolved model '{model_name}' from MCP boundary")
|
||||
else:
|
||||
# Fallback for direct execute calls
|
||||
model_name = getattr(request, "model", None)
|
||||
if not model_name:
|
||||
from config import DEFAULT_MODEL
|
||||
|
||||
model_name = DEFAULT_MODEL
|
||||
logger.debug(f"Using fallback model resolution for '{model_name}' (test mode)")
|
||||
|
||||
# For tests: Check if we should require model selection (auto mode)
|
||||
if self._should_require_model_selection(model_name):
|
||||
# Get suggested model based on tool category
|
||||
from providers.registry import ModelProviderRegistry
|
||||
|
||||
tool_category = self.get_model_category()
|
||||
suggested_model = ModelProviderRegistry.get_preferred_fallback_model(tool_category)
|
||||
|
||||
# Build error message based on why selection is required
|
||||
if model_name.lower() == "auto":
|
||||
error_message = (
|
||||
f"Model parameter is required in auto mode. "
|
||||
f"Suggested model for {self.name}: '{suggested_model}' "
|
||||
f"(category: {tool_category.value})"
|
||||
)
|
||||
else:
|
||||
# Model was specified but not available
|
||||
available_models = self._get_available_models()
|
||||
|
||||
error_message = (
|
||||
f"Model '{model_name}' is not available with current API keys. "
|
||||
f"Available models: {', '.join(available_models)}. "
|
||||
f"Suggested model for {self.name}: '{suggested_model}' "
|
||||
f"(category: {tool_category.value})"
|
||||
)
|
||||
error_output = ToolOutput(
|
||||
status="error",
|
||||
content=error_message,
|
||||
content_type="text",
|
||||
)
|
||||
return [TextContent(type="text", text=error_output.model_dump_json())]
|
||||
|
||||
# Create model context for tests
|
||||
from utils.model_context import ModelContext
|
||||
|
||||
model_context = ModelContext(model_name)
|
||||
|
||||
# Store resolved model name for use by helper methods
|
||||
self._current_model_name = model_name
|
||||
self._model_context = model_context
|
||||
|
||||
# Check if we have continuation_id - if so, conversation history is already embedded
|
||||
continuation_id = getattr(request, "continuation_id", None)
|
||||
|
||||
@@ -1389,57 +1410,11 @@ When recommending searches, be specific about what information you need and why
|
||||
prompt = f"{prompt}\n\n{follow_up_instructions}"
|
||||
logger.debug(f"Added follow-up instructions for new {self.name} conversation")
|
||||
|
||||
# Extract model configuration from request or use defaults
|
||||
model_name = getattr(request, "model", None)
|
||||
if not model_name:
|
||||
from config import DEFAULT_MODEL
|
||||
|
||||
model_name = DEFAULT_MODEL
|
||||
|
||||
# Check if we need Claude to select a model
|
||||
# This happens when:
|
||||
# 1. The model is explicitly "auto"
|
||||
# 2. The requested model is not available
|
||||
if self._should_require_model_selection(model_name):
|
||||
# Get suggested model based on tool category
|
||||
from providers.registry import ModelProviderRegistry
|
||||
|
||||
tool_category = self.get_model_category()
|
||||
suggested_model = ModelProviderRegistry.get_preferred_fallback_model(tool_category)
|
||||
|
||||
# Build error message based on why selection is required
|
||||
if model_name.lower() == "auto":
|
||||
error_message = (
|
||||
f"Model parameter is required in auto mode. "
|
||||
f"Suggested model for {self.name}: '{suggested_model}' "
|
||||
f"(category: {tool_category.value})"
|
||||
)
|
||||
else:
|
||||
# Model was specified but not available
|
||||
# Get list of available models
|
||||
available_models = self._get_available_models()
|
||||
|
||||
error_message = (
|
||||
f"Model '{model_name}' is not available with current API keys. "
|
||||
f"Available models: {', '.join(available_models)}. "
|
||||
f"Suggested model for {self.name}: '{suggested_model}' "
|
||||
f"(category: {tool_category.value})"
|
||||
)
|
||||
|
||||
error_output = ToolOutput(
|
||||
status="error",
|
||||
content=error_message,
|
||||
content_type="text",
|
||||
)
|
||||
return [TextContent(type="text", text=error_output.model_dump_json())]
|
||||
|
||||
# Store model name for use by helper methods like _prepare_file_content_for_prompt
|
||||
# Only set this after auto mode validation to prevent "auto" being used as a model name
|
||||
self._current_model_name = model_name
|
||||
# Model name already resolved and stored in self._current_model_name earlier
|
||||
|
||||
# Validate images at MCP boundary if any were provided
|
||||
if images:
|
||||
image_validation_error = self._validate_image_limits(images, model_name, continuation_id)
|
||||
image_validation_error = self._validate_image_limits(images, self._current_model_name, continuation_id)
|
||||
if image_validation_error:
|
||||
return [TextContent(type="text", text=json.dumps(image_validation_error))]
|
||||
|
||||
@@ -1451,10 +1426,10 @@ When recommending searches, be specific about what information you need and why
|
||||
thinking_mode = self.get_default_thinking_mode()
|
||||
|
||||
# Get the appropriate model provider
|
||||
provider = self.get_model_provider(model_name)
|
||||
provider = self.get_model_provider(self._current_model_name)
|
||||
|
||||
# Validate and correct temperature for this model
|
||||
temperature, temp_warnings = self._validate_and_correct_temperature(model_name, temperature)
|
||||
temperature, temp_warnings = self._validate_and_correct_temperature(self._current_model_name, temperature)
|
||||
|
||||
# Log any temperature corrections
|
||||
for warning in temp_warnings:
|
||||
@@ -1465,16 +1440,21 @@ When recommending searches, be specific about what information you need and why
|
||||
|
||||
# Generate AI response using the provider
|
||||
logger.info(f"Sending request to {provider.get_provider_type().value} API for {self.name}")
|
||||
logger.info(f"Using model: {model_name} via {provider.get_provider_type().value} provider")
|
||||
logger.debug(f"Prompt length: {len(prompt)} characters")
|
||||
logger.info(f"Using model: {self._current_model_name} via {provider.get_provider_type().value} provider")
|
||||
|
||||
# Import token estimation utility
|
||||
from utils.token_utils import estimate_tokens
|
||||
|
||||
estimated_tokens = estimate_tokens(prompt)
|
||||
logger.debug(f"Prompt length: {len(prompt)} characters (~{estimated_tokens:,} tokens)")
|
||||
|
||||
# Generate content with provider abstraction
|
||||
model_response = provider.generate_content(
|
||||
prompt=prompt,
|
||||
model_name=model_name,
|
||||
model_name=self._current_model_name,
|
||||
system_prompt=system_prompt,
|
||||
temperature=temperature,
|
||||
thinking_mode=thinking_mode if provider.supports_thinking_mode(model_name) else None,
|
||||
thinking_mode=thinking_mode if provider.supports_thinking_mode(self._current_model_name) else None,
|
||||
images=images if images else None, # Pass images via kwargs
|
||||
)
|
||||
|
||||
@@ -1486,7 +1466,11 @@ When recommending searches, be specific about what information you need and why
|
||||
|
||||
# Parse response to check for clarification requests or format output
|
||||
# Pass model info for conversation tracking
|
||||
model_info = {"provider": provider, "model_name": model_name, "model_response": model_response}
|
||||
model_info = {
|
||||
"provider": provider,
|
||||
"model_name": self._current_model_name,
|
||||
"model_response": model_response,
|
||||
}
|
||||
tool_output = self._parse_response(raw_text, request, model_info)
|
||||
logger.info(f"✅ {self.name} tool completed successfully")
|
||||
|
||||
@@ -1894,8 +1878,14 @@ When recommending searches, be specific about what information you need and why
|
||||
Tuple of (corrected_temperature, warning_messages)
|
||||
"""
|
||||
try:
|
||||
provider = self.get_model_provider(model_name)
|
||||
capabilities = provider.get_capabilities(model_name)
|
||||
# Use the already-resolved provider and capabilities from model context
|
||||
if hasattr(self, "_model_context") and self._model_context:
|
||||
capabilities = self._model_context.capabilities
|
||||
else:
|
||||
# Fallback for edge cases (e.g., direct test calls)
|
||||
provider = self.get_model_provider(model_name)
|
||||
capabilities = provider.get_capabilities(model_name)
|
||||
|
||||
constraint = capabilities.temperature_constraint
|
||||
|
||||
warnings = []
|
||||
|
||||
tools/codereview.py

@@ -227,13 +227,7 @@ class CodeReviewTool(BaseTool):
         if updated_files is not None:
             request.files = updated_files

-        # MCP boundary check - STRICT REJECTION
-        if request.files:
-            file_size_check = self.check_total_file_size(request.files)
-            if file_size_check:
-                from tools.models import ToolOutput
-
-                raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**file_size_check).model_dump_json()}")
+        # File size validation happens at MCP boundary in server.py

         # Check user input size at MCP transport boundary (before adding internal content)
         user_content = request.prompt

tools/consensus.py — 846 lines (new file)
@@ -0,0 +1,846 @@
|
||||
"""
|
||||
Consensus tool for multi-model perspective gathering and validation
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
|
||||
from mcp.types import TextContent
|
||||
from pydantic import BaseModel, Field, field_validator
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from tools.models import ToolModelCategory
|
||||
|
||||
from config import DEFAULT_CONSENSUS_MAX_INSTANCES_PER_COMBINATION
|
||||
from systemprompts import CONSENSUS_PROMPT
|
||||
|
||||
from .base import BaseTool, ToolRequest
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ModelConfig(BaseModel):
|
||||
"""Enhanced model configuration for consensus tool"""
|
||||
|
||||
model: str = Field(..., description="Model name to use (e.g., 'o3', 'flash', 'pro')")
|
||||
stance: Optional[str] = Field(
|
||||
default="neutral",
|
||||
description=(
|
||||
"Stance for this model. Supportive: 'for', 'support', 'favor'. "
|
||||
"Critical: 'against', 'oppose', 'critical'. Neutral: 'neutral'. "
|
||||
"Defaults to 'neutral'."
|
||||
),
|
||||
)
|
||||
stance_prompt: Optional[str] = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"Custom stance-specific instructions for this model. "
|
||||
"If provided, this will be used instead of the default stance prompt. "
|
||||
"Should be clear, specific instructions about how this model should approach the analysis."
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
class ConsensusRequest(ToolRequest):
|
||||
"""Request model for consensus tool"""
|
||||
|
||||
prompt: str = Field(
|
||||
...,
|
||||
description=(
|
||||
"Description of what to get consensus on, testing objectives, and specific scope/focus areas. "
|
||||
"Be as detailed as possible about the proposal, plan, or idea you want multiple perspectives on."
|
||||
),
|
||||
)
|
||||
models: list[ModelConfig] = Field(
|
||||
...,
|
||||
description=(
|
||||
"List of model configurations for consensus analysis. Each model can have a specific stance and custom instructions. "
|
||||
"Example: [{'model': 'o3', 'stance': 'for', 'stance_prompt': 'Focus on benefits and opportunities...'}, "
|
||||
"{'model': 'flash', 'stance': 'against', 'stance_prompt': 'Identify risks and challenges...'}]. "
|
||||
"Maximum 2 instances per model+stance combination."
|
||||
),
|
||||
)
|
||||
files: Optional[list[str]] = Field(
|
||||
default_factory=list,
|
||||
description="Optional files or directories for additional context (must be absolute paths)",
|
||||
)
|
||||
images: Optional[list[str]] = Field(
|
||||
default_factory=list,
|
||||
description=(
|
||||
"Optional images showing expected UI changes, design requirements, "
|
||||
"or visual references for the consensus analysis"
|
||||
),
|
||||
)
|
||||
focus_areas: Optional[list[str]] = Field(
|
||||
default_factory=list,
|
||||
description="Specific aspects to focus on (e.g., 'performance', 'security', 'user experience')",
|
||||
)
|
||||
|
||||
@field_validator("models")
|
||||
@classmethod
|
||||
def validate_models_not_empty(cls, v):
|
||||
if not v:
|
||||
raise ValueError("At least one model must be specified")
|
||||
return v
|
||||
|
||||
|
||||
class ConsensusTool(BaseTool):
|
||||
"""Multi-model consensus tool for gathering diverse perspectives on technical proposals"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
@staticmethod
|
||||
def parse_structured_prompt_models(model_spec: str) -> list[dict[str, str]]:
|
||||
"""
|
||||
Parse consensus model specification from structured prompt format.
|
||||
|
||||
This method parses structured prompt specifications used in Claude Code shortcuts
|
||||
like "/zen:consensus:flash:for,o3:against,pro:neutral" to extract model configurations
|
||||
with their assigned stances.
|
||||
|
||||
Supported formats:
|
||||
- "model:stance" - Explicit stance assignment (e.g., "flash:for", "o3:against")
|
||||
- "model" - Defaults to neutral stance (e.g., "pro" becomes "pro:neutral")
|
||||
|
||||
Supported stances:
|
||||
- Supportive: "for", "support", "favor"
|
||||
- Critical: "against", "oppose", "critical"
|
||||
- Neutral: "neutral" (default)
|
||||
|
||||
Args:
|
||||
model_spec (str): Comma-separated model specification string.
|
||||
Examples: "flash:for,o3:against,pro:neutral" or "flash:for,o3:against,pro"
|
||||
|
||||
Returns:
|
||||
list[dict[str, str]]: List of model configuration dictionaries with keys:
|
||||
- "model": The model name (e.g., "flash", "o3", "pro")
|
||||
- "stance": The normalized stance (e.g., "for", "against", "neutral")
|
||||
|
||||
Examples:
|
||||
>>> ConsensusTool.parse_structured_prompt_models("flash:for,o3:against,pro")
|
||||
[{"model": "flash", "stance": "for"}, {"model": "o3", "stance": "against"}, {"model": "pro", "stance": "neutral"}]
|
||||
|
||||
>>> ConsensusTool.parse_structured_prompt_models("flash,o3,pro")
|
||||
[{"model": "flash", "stance": "neutral"}, {"model": "o3", "stance": "neutral"}, {"model": "pro", "stance": "neutral"}]
|
||||
"""
|
||||
models = []
|
||||
|
||||
# Split by comma to get individual model specs
|
||||
model_parts = model_spec.split(",")
|
||||
|
||||
for part in model_parts:
|
||||
part = part.strip()
|
||||
if ":" in part:
|
||||
# Model with stance: "flash:for" or "o3:against"
|
||||
model_name, stance = part.split(":", 1)
|
||||
models.append({"model": model_name.strip(), "stance": stance.strip()})
|
||||
else:
|
||||
# Model without stance (defaults to neutral): "pro"
|
||||
models.append({"model": part.strip(), "stance": "neutral"})
|
||||
|
||||
return models
|
||||
|
||||
def get_name(self) -> str:
|
||||
return "consensus"
|
||||
|
||||
def get_description(self) -> str:
|
||||
return (
|
||||
"MULTI-MODEL CONSENSUS - Gather diverse perspectives from multiple AI models on technical proposals, "
|
||||
"plans, and ideas. Perfect for validation, feasibility assessment, and getting comprehensive "
|
||||
"viewpoints on complex decisions. Supports advanced stance steering with custom instructions for each model. "
|
||||
"You can specify different stances (for/against/neutral) and provide custom stance prompts to guide each "
|
||||
"model's analysis. Example: [{'model': 'o3', 'stance': 'for', 'stance_prompt': 'Focus on implementation "
|
||||
"benefits and user value'}, {'model': 'flash', 'stance': 'against', 'stance_prompt': 'Identify potential "
|
||||
"risks and technical challenges'}]. Use neutral stances by default unless structured debate would add value."
|
||||
)
|
||||
|
||||
def get_input_schema(self) -> dict[str, Any]:
|
||||
schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"prompt": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"Description of what to get consensus on, testing objectives, and specific scope/focus areas. "
|
||||
"Be as detailed as possible about the proposal, plan, or idea you want multiple perspectives on."
|
||||
),
|
||||
},
|
||||
"models": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"model": {
|
||||
"type": "string",
|
||||
"description": "Model name to use (e.g., 'o3', 'flash', 'pro')",
|
||||
},
|
||||
"stance": {
|
||||
"type": "string",
|
||||
"enum": ["for", "support", "favor", "against", "oppose", "critical", "neutral"],
|
||||
"description": "Stance for this model: supportive ('for', 'support', 'favor'), critical ('against', 'oppose', 'critical'), or 'neutral'",
|
||||
"default": "neutral",
|
||||
},
|
||||
"stance_prompt": {
|
||||
"type": "string",
|
||||
"description": "Custom stance-specific instructions for this model. If provided, this will be used instead of the default stance prompt.",
|
||||
},
|
||||
},
|
||||
"required": ["model"],
|
||||
},
|
||||
"description": (
|
||||
"List of model configurations for consensus analysis. Each model can have a specific stance and custom instructions. "
|
||||
"Example: [{'model': 'o3', 'stance': 'for', 'stance_prompt': 'Focus on benefits and opportunities...'}, "
|
||||
"{'model': 'flash', 'stance': 'against', 'stance_prompt': 'Identify risks and challenges...'}]. "
|
||||
"Maximum 2 instances per model+stance combination."
|
||||
),
|
||||
},
|
||||
"files": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"description": "Optional files or directories for additional context (must be absolute paths)",
|
||||
},
|
||||
"images": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"description": (
|
||||
"Optional images showing expected UI changes, design requirements, "
|
||||
"or visual references for the consensus analysis"
|
||||
),
|
||||
},
|
||||
"focus_areas": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"description": "Specific aspects to focus on (e.g., 'performance', 'security', 'user experience')",
|
||||
},
|
||||
"temperature": {
|
||||
"type": "number",
|
||||
"description": "Temperature (0-1, default 0.2 for consistency)",
|
||||
"minimum": 0,
|
||||
"maximum": 1,
|
||||
"default": self.get_default_temperature(),
|
||||
},
|
||||
"thinking_mode": {
|
||||
"type": "string",
|
||||
"enum": ["minimal", "low", "medium", "high", "max"],
|
||||
"description": (
|
||||
"Thinking depth: minimal (0.5% of model max), low (8%), medium (33%), "
|
||||
"high (67%), max (100% of model max)"
|
||||
),
|
||||
},
|
||||
"use_websearch": {
|
||||
"type": "boolean",
|
||||
"description": (
|
||||
"Enable web search for documentation, best practices, and current information. "
|
||||
"Particularly useful for: brainstorming sessions, architectural design discussions, "
|
||||
"exploring industry best practices, working with specific frameworks/technologies, "
|
||||
"researching solutions to complex problems, or when current documentation and "
|
||||
"community insights would enhance the analysis."
|
||||
),
|
||||
"default": True,
|
||||
},
|
||||
"continuation_id": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"Thread continuation ID for multi-turn conversations. Can be used to continue "
|
||||
"conversations across different tools. Only provide this if continuing a previous "
|
||||
"conversation thread."
|
||||
),
|
||||
},
|
||||
},
|
||||
"required": ["prompt", "models"],
|
||||
}
|
||||
|
||||
return schema
|
||||
|
||||
def get_system_prompt(self) -> str:
|
||||
return CONSENSUS_PROMPT
|
||||
|
||||
def get_default_temperature(self) -> float:
|
||||
return 0.2 # Lower temperature for more consistent consensus responses
|
||||
|
||||
def get_model_category(self) -> "ToolModelCategory":
|
||||
"""Consensus uses extended reasoning models for deep analysis"""
|
||||
from tools.models import ToolModelCategory
|
||||
|
||||
return ToolModelCategory.EXTENDED_REASONING
|
||||
|
||||
def get_request_model(self):
|
||||
return ConsensusRequest
|
||||
|
||||
def format_conversation_turn(self, turn) -> list[str]:
|
||||
"""
|
||||
Format consensus turns with individual model responses for better readability.
|
||||
|
||||
This custom formatting shows the individual model responses that were
|
||||
synthesized into the consensus, making it easier to understand the
|
||||
reasoning behind the final recommendation.
|
||||
"""
|
||||
parts = []
|
||||
|
||||
# Add files context if present
|
||||
if turn.files:
|
||||
parts.append(f"Files used in this turn: {', '.join(turn.files)}")
|
||||
parts.append("")
|
||||
|
||||
# Check if this is a consensus turn with individual responses
|
||||
if turn.model_metadata and turn.model_metadata.get("individual_responses"):
|
||||
individual_responses = turn.model_metadata["individual_responses"]
|
||||
|
||||
# Add consensus header
|
||||
models_consulted = []
|
||||
for resp in individual_responses:
|
||||
model = resp["model"]
|
||||
stance = resp.get("stance", "neutral")
|
||||
if stance != "neutral":
|
||||
models_consulted.append(f"{model}:{stance}")
|
||||
else:
|
||||
models_consulted.append(model)
|
||||
|
||||
parts.append(f"Models consulted: {', '.join(models_consulted)}")
|
||||
parts.append("")
|
||||
parts.append("=== INDIVIDUAL MODEL RESPONSES ===")
|
||||
parts.append("")
|
||||
|
||||
# Add each successful model response
|
||||
for i, response in enumerate(individual_responses):
|
||||
model_name = response["model"]
|
||||
stance = response.get("stance", "neutral")
|
||||
verdict = response["verdict"]
|
||||
|
||||
stance_label = f"({stance.title()} Stance)" if stance != "neutral" else "(Neutral Analysis)"
|
||||
parts.append(f"**{model_name.upper()} {stance_label}**:")
|
||||
parts.append(verdict)
|
||||
|
||||
if i < len(individual_responses) - 1:
|
||||
parts.append("")
|
||||
parts.append("---")
|
||||
parts.append("")
|
||||
|
||||
parts.append("=== END INDIVIDUAL RESPONSES ===")
|
||||
parts.append("")
|
||||
parts.append("Claude's Synthesis:")
|
||||
|
||||
# Add the actual content
|
||||
parts.append(turn.content)
|
||||
|
||||
return parts
|
||||
|
||||
def _normalize_stance(self, stance: Optional[str]) -> str:
|
||||
"""Normalize stance to canonical form."""
|
||||
if not stance:
|
||||
return "neutral"
|
||||
|
||||
stance = stance.lower()
|
||||
|
||||
# Define stance synonyms
|
||||
supportive_stances = {"for", "support", "favor"}
|
||||
critical_stances = {"against", "oppose", "critical"}
|
||||
|
||||
# Map synonyms to canonical stance
|
||||
if stance in supportive_stances:
|
||||
return "for"
|
||||
elif stance in critical_stances:
|
||||
return "against"
|
||||
elif stance == "neutral":
|
||||
return "neutral"
|
||||
else:
|
||||
# Unknown stances default to neutral for robustness
|
||||
logger.warning(
|
||||
f"Unknown stance '{stance}' provided, defaulting to 'neutral'. Valid stances: {', '.join(sorted(supportive_stances | critical_stances))}, or 'neutral'"
|
||||
)
|
||||
return "neutral"
|
||||
|
||||
def _validate_model_combinations(self, model_configs: list[ModelConfig]) -> tuple[list[ModelConfig], list[str]]:
|
||||
"""Validate model configurations and enforce limits.
|
||||
|
||||
Returns:
|
||||
tuple: (valid_configs, skipped_entries)
|
||||
- Each model+stance combination can appear max 2 times
|
||||
- Same model+stance limited to 2 instances
|
||||
"""
|
||||
valid_configs = []
|
||||
skipped_entries = []
|
||||
combination_counts = {} # Track (model, stance) -> count
|
||||
|
||||
for config in model_configs:
|
||||
try:
|
||||
# Normalize stance
|
||||
normalized_stance = self._normalize_stance(config.stance)
|
||||
|
||||
# Create normalized config
|
||||
normalized_config = ModelConfig(
|
||||
model=config.model, stance=normalized_stance, stance_prompt=config.stance_prompt
|
||||
)
|
||||
|
||||
combination_key = (config.model, normalized_stance)
|
||||
current_count = combination_counts.get(combination_key, 0)
|
||||
|
||||
if current_count >= DEFAULT_CONSENSUS_MAX_INSTANCES_PER_COMBINATION:
|
||||
# Already have max instances of this model+stance combination
|
||||
skipped_entries.append(
|
||||
f"{config.model}:{normalized_stance} (max {DEFAULT_CONSENSUS_MAX_INSTANCES_PER_COMBINATION} instances)"
|
||||
)
|
||||
continue
|
||||
|
||||
combination_counts[combination_key] = current_count + 1
|
||||
valid_configs.append(normalized_config)
|
||||
|
||||
except ValueError as e:
|
||||
# Invalid stance or model
|
||||
skipped_entries.append(f"{config.model} ({str(e)})")
|
||||
continue
|
||||
|
||||
return valid_configs, skipped_entries
|
||||
|
||||
def _get_stance_enhanced_prompt(self, stance: str, custom_stance_prompt: Optional[str] = None) -> str:
|
||||
"""Get the system prompt with stance injection based on the stance."""
|
||||
base_prompt = self.get_system_prompt()
|
||||
|
||||
# If custom stance prompt is provided, use it instead of default
|
||||
if custom_stance_prompt:
|
||||
# Validate stance placeholder exists exactly once
|
||||
if base_prompt.count("{stance_prompt}") != 1:
|
||||
raise ValueError(
|
||||
"System prompt must contain exactly one '{stance_prompt}' placeholder, "
|
||||
f"found {base_prompt.count('{stance_prompt}')}"
|
||||
)
|
||||
return base_prompt.replace("{stance_prompt}", custom_stance_prompt)
|
||||
|
||||
stance_prompts = {
|
||||
"for": """SUPPORTIVE PERSPECTIVE WITH INTEGRITY
|
||||
|
||||
You are tasked with advocating FOR this proposal, but with CRITICAL GUARDRAILS:
|
||||
|
||||
MANDATORY ETHICAL CONSTRAINTS:
|
||||
- This is NOT a debate for entertainment. You MUST act in good faith and in the best interest of the questioner
|
||||
- You MUST think deeply about whether supporting this idea is safe, sound, and passes essential requirements
|
||||
- You MUST be direct and unequivocal in saying "this is a bad idea" when it truly is
|
||||
- There must be at least ONE COMPELLING reason to be optimistic, otherwise DO NOT support it
|
||||
|
||||
WHEN TO REFUSE SUPPORT (MUST OVERRIDE STANCE):
|
||||
- If the idea is fundamentally harmful to users, project, or stakeholders
|
||||
- If implementation would violate security, privacy, or ethical standards
|
||||
- If the proposal is technically infeasible within realistic constraints
|
||||
- If costs/risks dramatically outweigh any potential benefits
|
||||
|
||||
YOUR SUPPORTIVE ANALYSIS SHOULD:
|
||||
- Identify genuine strengths and opportunities
|
||||
- Propose solutions to overcome legitimate challenges
|
||||
- Highlight synergies with existing systems
|
||||
- Suggest optimizations that enhance value
|
||||
- Present realistic implementation pathways
|
||||
|
||||
Remember: Being "for" means finding the BEST possible version of the idea IF it has merit, not blindly supporting bad ideas.""",
|
||||
"against": """CRITICAL PERSPECTIVE WITH RESPONSIBILITY
|
||||
|
||||
You are tasked with critiquing this proposal, but with ESSENTIAL BOUNDARIES:
|
||||
|
||||
MANDATORY FAIRNESS CONSTRAINTS:
|
||||
- You MUST NOT oppose genuinely excellent, common-sense ideas just to be contrarian
|
||||
- You MUST acknowledge when a proposal is fundamentally sound and well-conceived
|
||||
- You CANNOT give harmful advice or recommend against beneficial changes
|
||||
- If the idea is outstanding, say so clearly while offering constructive refinements
|
||||
|
||||
WHEN TO MODERATE CRITICISM (MUST OVERRIDE STANCE):
|
||||
- If the proposal addresses critical user needs effectively
|
||||
- If it follows established best practices with good reason
|
||||
- If benefits clearly and substantially outweigh risks
|
||||
- If it's the obvious right solution to the problem
|
||||
|
||||
YOUR CRITICAL ANALYSIS SHOULD:
|
||||
- Identify legitimate risks and failure modes
|
||||
- Point out overlooked complexities
|
||||
- Suggest more efficient alternatives
|
||||
- Highlight potential negative consequences
|
||||
- Question assumptions that may be flawed
|
||||
|
||||
Remember: Being "against" means rigorous scrutiny to ensure quality, not undermining good ideas that deserve support.""",
|
||||
"neutral": """BALANCED PERSPECTIVE
|
||||
|
||||
Provide objective analysis considering both positive and negative aspects. However, if there is overwhelming evidence
|
||||
that the proposal clearly leans toward being exceptionally good or particularly problematic, you MUST accurately
|
||||
reflect this reality. Being "balanced" means being truthful about the weight of evidence, not artificially creating
|
||||
50/50 splits when the reality is 90/10.
|
||||
|
||||
Your analysis should:
|
||||
- Present all significant pros and cons discovered
|
||||
- Weight them according to actual impact and likelihood
|
||||
- If evidence strongly favors one conclusion, clearly state this
|
||||
- Provide proportional coverage based on the strength of arguments
|
||||
- Help the questioner see the true balance of considerations
|
||||
|
||||
Remember: Artificial balance that misrepresents reality is not helpful. True balance means accurate representation
|
||||
of the evidence, even when it strongly points in one direction.""",
|
||||
}
|
||||
|
||||
stance_prompt = stance_prompts.get(stance, stance_prompts["neutral"])
|
||||
|
||||
# Validate stance placeholder exists exactly once
|
||||
if base_prompt.count("{stance_prompt}") != 1:
|
||||
raise ValueError(
|
||||
"System prompt must contain exactly one '{stance_prompt}' placeholder, "
|
||||
f"found {base_prompt.count('{stance_prompt}')}"
|
||||
)
|
||||
|
||||
# Inject stance into the system prompt
|
||||
return base_prompt.replace("{stance_prompt}", stance_prompt)
|
||||
|
||||
def _get_single_response(
|
||||
self, provider, model_config: ModelConfig, prompt: str, request: ConsensusRequest
|
||||
) -> dict[str, Any]:
|
||||
"""Get response from a single model - synchronous method."""
|
||||
logger.debug(f"Getting response from {model_config.model} with stance '{model_config.stance}'")
|
||||
|
||||
try:
|
||||
# Provider.generate_content is synchronous, not async
|
||||
response = provider.generate_content(
|
||||
prompt=prompt,
|
||||
model_name=model_config.model,
|
||||
system_prompt=self._get_stance_enhanced_prompt(model_config.stance, model_config.stance_prompt),
|
||||
temperature=getattr(request, "temperature", None) or self.get_default_temperature(),
|
||||
thinking_mode=getattr(request, "thinking_mode", "medium"),
|
||||
images=getattr(request, "images", None) or [],
|
||||
)
|
||||
return {
|
||||
"model": model_config.model,
|
||||
"stance": model_config.stance,
|
||||
"status": "success",
|
||||
"verdict": response.content, # Contains structured Markdown
|
||||
"metadata": {
|
||||
"provider": getattr(provider.get_provider_type(), "value", provider.get_provider_type()),
|
||||
"usage": response.usage if hasattr(response, "usage") else None,
|
||||
"custom_stance_prompt": bool(model_config.stance_prompt),
|
||||
},
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting response from {model_config.model}:{model_config.stance}: {str(e)}")
|
||||
return {"model": model_config.model, "stance": model_config.stance, "status": "error", "error": str(e)}
|
||||
|
||||
def _get_consensus_responses(
|
||||
self, provider_configs: list[tuple], prompt: str, request: ConsensusRequest
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Execute all model requests sequentially - purely synchronous like other tools."""
|
||||
|
||||
logger.debug(f"Processing {len(provider_configs)} models sequentially")
|
||||
responses = []
|
||||
|
||||
for i, (provider, model_config) in enumerate(provider_configs):
|
||||
try:
|
||||
logger.debug(
|
||||
f"Processing {model_config.model}:{model_config.stance} sequentially ({i+1}/{len(provider_configs)})"
|
||||
)
|
||||
|
||||
# Direct synchronous call - matches pattern of other tools
|
||||
response = self._get_single_response(provider, model_config, prompt, request)
|
||||
responses.append(response)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get response from {model_config.model}:{model_config.stance}: {str(e)}")
|
||||
responses.append(
|
||||
{
|
||||
"model": model_config.model,
|
||||
"stance": model_config.stance,
|
||||
"status": "error",
|
||||
"error": f"Unhandled exception: {str(e)}",
|
||||
}
|
||||
)
|
||||
|
||||
logger.debug(f"Sequential processing completed for {len(responses)} models")
|
||||
return responses
|
||||
|
||||
def _format_consensus_output(self, responses: list[dict[str, Any]], skipped_entries: list[str]) -> str:
|
||||
"""Format the consensus responses into structured output for Claude."""
|
||||
|
||||
logger.debug(f"Formatting consensus output for {len(responses)} responses")
|
||||
|
||||
# Separate successful and failed responses
|
||||
successful_responses = [r for r in responses if r["status"] == "success"]
|
||||
failed_responses = [r for r in responses if r["status"] == "error"]
|
||||
|
||||
logger.debug(f"Successful responses: {len(successful_responses)}, Failed: {len(failed_responses)}")
|
||||
|
||||
# Prepare the structured output (minimize size for MCP stability)
|
||||
models_used = [
|
||||
f"{r['model']}:{r['stance']}" if r["stance"] != "neutral" else r["model"] for r in successful_responses
|
||||
]
|
||||
models_errored = [
|
||||
f"{r['model']}:{r['stance']}" if r["stance"] != "neutral" else r["model"] for r in failed_responses
|
||||
]
|
||||
|
||||
# Prepare clean responses without truncation
|
||||
clean_responses = []
|
||||
for r in responses:
|
||||
if r["status"] == "success":
|
||||
clean_responses.append(
|
||||
{
|
||||
"model": r["model"],
|
||||
"stance": r["stance"],
|
||||
"status": r["status"],
|
||||
"verdict": r.get("verdict", ""),
|
||||
"metadata": r.get("metadata", {}),
|
||||
}
|
||||
)
|
||||
else:
|
||||
clean_responses.append(
|
||||
{
|
||||
"model": r["model"],
|
||||
"stance": r["stance"],
|
||||
"status": r["status"],
|
||||
"error": r.get("error", "Unknown error"),
|
||||
}
|
||||
)
|
||||
|
||||
output_data = {
|
||||
"status": "consensus_success" if successful_responses else "consensus_failed",
|
||||
"models_used": models_used,
|
||||
"models_skipped": skipped_entries,
|
||||
"models_errored": models_errored,
|
||||
"responses": clean_responses,
|
||||
"next_steps": self._get_synthesis_guidance(successful_responses, failed_responses),
|
||||
}
|
||||
|
||||
return json.dumps(output_data, indent=2)
|
||||
|
||||
def _get_synthesis_guidance(
|
||||
self, successful_responses: list[dict[str, Any]], failed_responses: list[dict[str, Any]]
|
||||
) -> str:
|
||||
"""Generate guidance for Claude on how to synthesize the consensus results."""
|
||||
|
||||
if not successful_responses:
|
||||
return (
|
||||
"No models provided successful responses. Please retry with different models or "
|
||||
"check the error messages for guidance on resolving the issues."
|
||||
)
|
||||
|
||||
if len(successful_responses) == 1:
|
||||
return (
|
||||
"Only one model provided a successful response. Synthesize based on the available "
|
||||
"perspective and indicate areas where additional expert input would be valuable "
|
||||
"due to the limited consensus data."
|
||||
)
|
||||
|
||||
# Multiple successful responses - provide comprehensive synthesis guidance
|
||||
stance_counts = {"for": 0, "against": 0, "neutral": 0}
|
||||
for resp in successful_responses:
|
||||
stance = resp.get("stance", "neutral")
|
||||
stance_counts[stance] = stance_counts.get(stance, 0) + 1
|
||||
|
||||
guidance = (
|
||||
"Claude, synthesize these perspectives by first identifying the key points of "
|
||||
"**agreement** and **disagreement** between the models. Then provide your final, "
|
||||
"consolidated recommendation, explaining how you weighed the different opinions and "
|
||||
"why your proposed solution is the most balanced approach. Explicitly address the "
|
||||
"most critical risks raised by each model and provide actionable next steps for implementation."
|
||||
)
|
||||
|
||||
if failed_responses:
|
||||
guidance += (
|
||||
f" Note: {len(failed_responses)} model(s) failed to respond - consider this "
|
||||
"partial consensus and indicate where additional expert input would strengthen the analysis."
|
||||
)
|
||||
|
||||
return guidance
|
||||
|
||||
async def prepare_prompt(self, request: ConsensusRequest) -> str:
|
||||
"""Prepare the consensus prompt with context files and focus areas."""
|
||||
# Check for prompt.txt in files
|
||||
prompt_content, updated_files = self.handle_prompt_file(request.files)
|
||||
|
||||
# Use prompt.txt content if available, otherwise use the prompt field
|
||||
user_content = prompt_content if prompt_content else request.prompt
|
||||
|
||||
# Check user input size at MCP transport boundary (before adding internal content)
|
||||
size_check = self.check_prompt_size(user_content)
|
||||
if size_check:
|
||||
# Need to return error, but prepare_prompt returns str
|
||||
# Use exception to handle this cleanly
|
||||
from tools.models import ToolOutput
|
||||
|
||||
raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**size_check).model_dump_json()}")
|
||||
|
||||
# Update request files list
|
||||
if updated_files is not None:
|
||||
request.files = updated_files
|
||||
|
||||
# Add focus areas if specified
|
||||
if request.focus_areas:
|
||||
focus_areas_text = "\n\nSpecific focus areas for this analysis:\n" + "\n".join(
|
||||
f"- {area}" for area in request.focus_areas
|
||||
)
|
||||
user_content += focus_areas_text
|
||||
|
||||
# Add context files if provided (using centralized file handling with filtering)
|
||||
if request.files:
|
||||
file_content, processed_files = self._prepare_file_content_for_prompt(
|
||||
request.files, request.continuation_id, "Context files"
|
||||
)
|
||||
self._actually_processed_files = processed_files
|
||||
if file_content:
|
||||
user_content = f"{user_content}\n\n=== CONTEXT FILES ===\n{file_content}\n=== END CONTEXT ===="
|
||||
|
||||
# Check token limits
|
||||
self._validate_token_limit(user_content, "Content")
|
||||
|
||||
return user_content
|
||||
|
||||
async def execute(self, arguments: dict[str, Any]) -> list[TextContent]:
|
||||
"""Execute consensus gathering from multiple models."""
|
||||
|
||||
# Store arguments for base class methods
|
||||
self._current_arguments = arguments
|
||||
|
||||
# Validate and create request
|
||||
request = ConsensusRequest(**arguments)
|
||||
|
||||
# Validate model configurations and enforce limits
|
||||
valid_configs, skipped_entries = self._validate_model_combinations(request.models)
|
||||
|
||||
if not valid_configs:
|
||||
error_output = {
|
||||
"status": "consensus_failed",
|
||||
"error": "No valid model configurations after validation",
|
||||
"models_skipped": skipped_entries,
|
||||
"next_steps": "Please provide valid model configurations with proper model names and stance values.",
|
||||
}
|
||||
return [TextContent(type="text", text=json.dumps(error_output, indent=2))]
|
||||
|
||||
# Set up a dummy model context for consensus since we handle multiple models
|
||||
# This is needed for base class methods like prepare_prompt to work
|
||||
if not hasattr(self, "_model_context") or not self._model_context:
|
||||
from utils.model_context import ModelContext
|
||||
|
||||
# Use the first model as the representative for token calculations
|
||||
first_model = valid_configs[0].model if valid_configs else "flash"
|
||||
self._model_context = ModelContext(first_model)
|
||||
|
||||
# Handle conversation continuation if specified
|
||||
if request.continuation_id:
|
||||
from utils.conversation_memory import build_conversation_history, get_thread
|
||||
|
||||
thread_context = get_thread(request.continuation_id)
|
||||
if thread_context:
|
||||
# Build conversation history using the same pattern as other tools
|
||||
conversation_context, _ = build_conversation_history(thread_context, self._model_context)
|
||||
if conversation_context:
|
||||
# Add conversation context to the beginning of the prompt
|
||||
enhanced_prompt = f"{conversation_context}\n\n{request.prompt}"
|
||||
request.prompt = enhanced_prompt
|
||||
|
||||
# Prepare the consensus prompt
|
||||
consensus_prompt = await self.prepare_prompt(request)
|
||||
|
||||
# Get providers for valid model configurations with caching to avoid duplicate lookups
|
||||
provider_configs = []
|
||||
provider_cache = {} # Cache to avoid duplicate provider lookups
|
||||
|
||||
for model_config in valid_configs:
|
||||
try:
|
||||
# Check cache first
|
||||
if model_config.model in provider_cache:
|
||||
provider = provider_cache[model_config.model]
|
||||
else:
|
||||
# Look up provider and cache it
|
||||
provider = self.get_model_provider(model_config.model)
|
||||
provider_cache[model_config.model] = provider
|
||||
|
||||
provider_configs.append((provider, model_config))
|
||||
except Exception as e:
|
||||
# Track failed models
|
||||
model_display = (
|
||||
f"{model_config.model}:{model_config.stance}"
|
||||
if model_config.stance != "neutral"
|
||||
else model_config.model
|
||||
)
|
||||
skipped_entries.append(f"{model_display} (provider not available: {str(e)})")
|
||||
|
||||
if not provider_configs:
|
||||
error_output = {
|
||||
"status": "consensus_failed",
|
||||
"error": "No model providers available",
|
||||
"models_skipped": skipped_entries,
|
||||
"next_steps": "Please check that the specified models have configured API keys and are available.",
|
||||
}
|
||||
return [TextContent(type="text", text=json.dumps(error_output, indent=2))]
|
||||
|
||||
# Send to all models sequentially (purely synchronous like other tools)
|
||||
logger.debug(f"Sending consensus request to {len(provider_configs)} models")
|
||||
responses = self._get_consensus_responses(provider_configs, consensus_prompt, request)
|
||||
logger.debug(f"Received {len(responses)} responses from consensus models")
|
||||
|
||||
# Enforce minimum success requirement - must have at least 1 successful response
|
||||
successful_responses = [r for r in responses if r["status"] == "success"]
|
||||
if not successful_responses:
|
||||
error_output = {
|
||||
"status": "consensus_failed",
|
||||
"error": "All model calls failed - no successful responses received",
|
||||
"models_skipped": skipped_entries,
|
||||
"models_errored": [
|
||||
f"{r['model']}:{r['stance']}" if r["stance"] != "neutral" else r["model"]
|
||||
for r in responses
|
||||
if r["status"] == "error"
|
||||
],
|
||||
"next_steps": "Please retry with different models or check the error messages for guidance on resolving the issues.",
|
||||
}
|
||||
return [TextContent(type="text", text=json.dumps(error_output, indent=2))]
|
||||
|
||||
logger.debug("About to format consensus output for MCP response")
|
||||
|
||||
# Structure the output and store in conversation memory
|
||||
consensus_output = self._format_consensus_output(responses, skipped_entries)
|
||||
|
||||
# Log response size for debugging
|
||||
output_size = len(consensus_output)
|
||||
logger.debug(f"Consensus output size: {output_size:,} characters")
|
||||
|
||||
# Store in conversation memory if continuation_id is provided
|
||||
if request.continuation_id:
|
||||
self.store_conversation_turn(
|
||||
request.continuation_id,
|
||||
consensus_output,
|
||||
request.files,
|
||||
request.images,
|
||||
responses, # Store individual responses in metadata
|
||||
skipped_entries,
|
||||
)
|
||||
|
||||
return [TextContent(type="text", text=consensus_output)]

    def store_conversation_turn(
        self,
        continuation_id: str,
        output: str,
        files: list[str],
        images: list[str],
        responses: list[dict[str, Any]],
        skipped_entries: list[str],
    ):
        """Store consensus turn in conversation memory with special metadata."""
        from utils.conversation_memory import add_turn

        # Filter successful and failed responses
        successful_responses = [r for r in responses if r["status"] == "success"]
        failed_responses = [r for r in responses if r["status"] == "error"]

        # Prepare metadata for conversation storage
        metadata = {
            "tool_type": "consensus",
            "models_used": [r["model"] for r in successful_responses],
            "models_skipped": skipped_entries,
            "models_errored": [r["model"] for r in failed_responses],
            "individual_responses": successful_responses,  # Only store successful responses
        }

        # Store the turn with special consensus metadata - add_turn is synchronous
        add_turn(
            thread_id=continuation_id,
            role="assistant",
            content=output,
            files=files or [],
            images=images or [],
            tool_name="consensus",
            model_provider="consensus",  # Special provider name
            model_name="consensus",  # Special model name
            model_metadata=metadata,
        )
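For a hypothetical two-model run, the metadata stored above might look like the following. The keys match the dict built in store_conversation_turn; the model names are placeholders, and only the response fields visible in this diff (model, stance, status) are shown.

# Illustrative metadata for a two-model consensus turn (placeholder model names)
metadata = {
    "tool_type": "consensus",
    "models_used": ["o3", "flash"],
    "models_skipped": [],
    "models_errored": [],
    "individual_responses": [
        {"model": "o3", "stance": "for", "status": "success"},
        {"model": "flash", "stance": "against", "status": "success"},
    ],
}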

@@ -159,13 +159,7 @@ class DebugIssueTool(BaseTool):
        if updated_files is not None:
            request.files = updated_files

        # MCP boundary check - STRICT REJECTION
        if request.files:
            file_size_check = self.check_total_file_size(request.files)
            if file_size_check:
                from tools.models import ToolOutput

                raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**file_size_check).model_dump_json()}")
        # File size validation happens at MCP boundary in server.py

        # Build context sections
        context_parts = [f"=== ISSUE DESCRIPTION ===\n{request.prompt}\n=== END DESCRIPTION ==="]

@@ -236,13 +236,7 @@ class Precommit(BaseTool):
        translated_path = translate_path_for_environment(request.path)
        translated_files = translate_file_paths(request.files)

        # MCP boundary check - STRICT REJECTION (check original files before translation)
        if request.files:
            file_size_check = self.check_total_file_size(request.files)
            if file_size_check:
                from tools.models import ToolOutput

                raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**file_size_check).model_dump_json()}")
        # File size validation happens at MCP boundary in server.py

        # Check if the path translation resulted in an error path
        if translated_path.startswith("/inaccessible/"):

@@ -409,23 +409,25 @@ class RefactorTool(BaseTool):
        continuation_id = getattr(request, "continuation_id", None)

        # Get model context for token budget calculation
        model_name = getattr(self, "_current_model_name", None)
        available_tokens = None

        if model_name:
        if hasattr(self, "_model_context") and self._model_context:
            try:
                provider = self.get_model_provider(model_name)
                capabilities = provider.get_capabilities(model_name)
                capabilities = self._model_context.capabilities
                # Use 75% of context for content (code + style examples), 25% for response
                available_tokens = int(capabilities.context_window * 0.75)
                logger.debug(
                    f"[REFACTOR] Token budget calculation: {available_tokens:,} tokens (75% of {capabilities.context_window:,}) for model {model_name}"
                    f"[REFACTOR] Token budget calculation: {available_tokens:,} tokens (75% of {capabilities.context_window:,}) for model {self._model_context.model_name}"
                )
            except Exception as e:
                # Fallback to conservative estimate
                logger.warning(f"[REFACTOR] Could not get model capabilities for {model_name}: {e}")
                logger.warning(f"[REFACTOR] Could not get model capabilities: {e}")
                available_tokens = 120000  # Conservative fallback
                logger.debug(f"[REFACTOR] Using fallback token budget: {available_tokens:,} tokens")
        else:
            # No model context available (shouldn't happen in normal flow)
            available_tokens = 120000  # Conservative fallback
            logger.debug(f"[REFACTOR] No model context, using fallback token budget: {available_tokens:,} tokens")

        # Process style guide examples first to determine token allocation
        style_examples_content = ""
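As a quick sanity check on the 75/25 split used above, with an illustrative context window that is not taken from this diff:

# Illustrative numbers only - not tied to any specific provider
context_window = 200_000
available_tokens = int(context_window * 0.75)         # 150,000 tokens for code + style examples
response_budget = context_window - available_tokens   # 50,000 tokens left for the model's response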

@@ -290,23 +290,25 @@ class TestGenerationTool(BaseTool):
        continuation_id = getattr(request, "continuation_id", None)

        # Get model context for token budget calculation
        model_name = getattr(self, "_current_model_name", None)
        available_tokens = None

        if model_name:
        if hasattr(self, "_model_context") and self._model_context:
            try:
                provider = self.get_model_provider(model_name)
                capabilities = provider.get_capabilities(model_name)
                capabilities = self._model_context.capabilities
                # Use 75% of context for content (code + test examples), 25% for response
                available_tokens = int(capabilities.context_window * 0.75)
                logger.debug(
                    f"[TESTGEN] Token budget calculation: {available_tokens:,} tokens (75% of {capabilities.context_window:,}) for model {model_name}"
                    f"[TESTGEN] Token budget calculation: {available_tokens:,} tokens (75% of {capabilities.context_window:,}) for model {self._model_context.model_name}"
                )
            except Exception as e:
                # Fallback to conservative estimate
                logger.warning(f"[TESTGEN] Could not get model capabilities for {model_name}: {e}")
                logger.warning(f"[TESTGEN] Could not get model capabilities: {e}")
                available_tokens = 120000  # Conservative fallback
                logger.debug(f"[TESTGEN] Using fallback token budget: {available_tokens:,} tokens")
        else:
            # No model context available (shouldn't happen in normal flow)
            available_tokens = 120000  # Conservative fallback
            logger.debug(f"[TESTGEN] No model context, using fallback token budget: {available_tokens:,} tokens")

        # Process test examples first to determine token allocation
        test_examples_content = ""

@@ -158,13 +158,7 @@ class ThinkDeepTool(BaseTool):
        if updated_files is not None:
            request.files = updated_files

        # MCP boundary check - STRICT REJECTION
        if request.files:
            file_size_check = self.check_total_file_size(request.files)
            if file_size_check:
                from tools.models import ToolOutput

                raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**file_size_check).model_dump_json()}")
        # File size validation happens at MCP boundary in server.py

        # Build context parts
        context_parts = [f"=== CLAUDE'S CURRENT ANALYSIS ===\n{current_analysis}\n=== END ANALYSIS ==="]