Add Consensus Tool for Multi-Model Perspective Gathering (#67)
* WIP: Refactor model-name resolution so it happens once at the MCP call boundary and a model context is passed around instead. The consensus tool gathers a consensus from multiple models, optionally assigning each a 'for' or 'against' stance to draw out nuanced responses.
* Deduplicate model resolution so model_context is available before reaching deeper parts of the code. Improve the abstraction used when building conversations. Throw programmer errors early.
* Guardrails: support the `model:option` format at the MCP boundary so future tools can use additional options if needed, instead of handling this only for consensus. A model name now accepts an optional ":option" suffix for future use.
* Simplified async flow.
* Improved the request model to support natural language.
* Fix consensus tool async/sync patterns to match codebase standards.

  CRITICAL FIXES:
  - Converted _get_consensus_responses from async to sync (matches other tools)
  - Converted store_conversation_turn from async to sync (add_turn is synchronous)
  - Removed unnecessary asyncio imports and sleep calls
  - Fixed ClosedResourceError in the MCP protocol during long consensus operations

  PATTERN ALIGNMENT:
  - The consensus tool now follows the same sync patterns as all other tools
  - Only execute() and prepare_prompt() are async (base class requirement)
  - All internal operations are synchronous, like analyze, chat, debug, etc.

  TESTING:
  - MCP simulation test now passes: consensus_stance ✅
  - Two-model consensus works correctly in ~35 seconds
  - Unknown stance handling defaults to neutral with warnings
  - All 9 unit tests pass (100% success rate)

  The consensus tool's async patterns were anomalous in the codebase. This fix aligns it with the established synchronous patterns used by all other tools while maintaining full functionality.

  🤖 Generated with [Claude Code](https://claude.ai/code)

  Co-Authored-By: Claude <noreply@anthropic.com>
* Fixed call order and added a new test.
* Cleanup: removed dead comments, added docs for the new tool, improved tests.

---------

Co-authored-by: Claude <noreply@anthropic.com>
Committed via GitHub · commit 95556ba9ea · parent 9b98df650b
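For orientation, here is a minimal sketch of what a consensus request looks like, based on the input schema added in tools/consensus.py below. The question text and stance prompts are illustrative only, running it requires the relevant provider API keys, and in normal use the arguments arrive through the MCP server rather than a direct call.

```python
import asyncio

from tools.consensus import ConsensusTool

# Illustrative arguments matching the ConsensusRequest schema defined in this diff.
arguments = {
    "prompt": "Should we migrate the background workers to a simpler cron-based queue?",
    "models": [
        {"model": "o3", "stance": "for", "stance_prompt": "Focus on operational simplicity and cost."},
        {"model": "flash", "stance": "against", "stance_prompt": "Identify reliability and scaling risks."},
        {"model": "pro"},  # no stance given -> defaults to "neutral"
    ],
    "focus_areas": ["reliability", "operational cost"],
}

# execute() is async (base class requirement); everything inside it runs synchronously.
result = asyncio.run(ConsensusTool().execute(arguments))
# The tool returns JSON with models_used, models_skipped, models_errored, responses, next_steps.
print(result[0].text)
```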
tools/__init__.py

@@ -5,6 +5,7 @@ Tool implementations for Zen MCP Server
 from .analyze import AnalyzeTool
 from .chat import ChatTool
 from .codereview import CodeReviewTool
+from .consensus import ConsensusTool
 from .debug import DebugIssueTool
 from .listmodels import ListModelsTool
 from .precommit import Precommit
@@ -19,6 +20,7 @@ __all__ = [
     "DebugIssueTool",
     "AnalyzeTool",
     "ChatTool",
+    "ConsensusTool",
     "ListModelsTool",
     "Precommit",
     "RefactorTool",

tools/analyze.py

@@ -141,13 +141,7 @@ class AnalyzeTool(BaseTool):
         if updated_files is not None:
             request.files = updated_files

-        # MCP boundary check - STRICT REJECTION
-        if request.files:
-            file_size_check = self.check_total_file_size(request.files)
-            if file_size_check:
-                from tools.models import ToolOutput
-
-                raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**file_size_check).model_dump_json()}")
+        # File size validation happens at MCP boundary in server.py

         # Use centralized file processing logic
         continuation_id = getattr(request, "continuation_id", None)
tools/base.py — 326 changed lines
@@ -31,6 +31,7 @@ from providers.base import ProviderType
 from utils import check_token_limit
 from utils.conversation_memory import (
     MAX_CONVERSATION_TURNS,
+    ConversationTurn,
     add_turn,
     create_thread,
     get_conversation_file_list,
@@ -643,6 +644,41 @@ class BaseTool(ABC):
             )
         return requested_files

+    def format_conversation_turn(self, turn: ConversationTurn) -> list[str]:
+        """
+        Format a conversation turn for display in conversation history.
+
+        Tools can override this to provide custom formatting for their responses
+        while maintaining the standard structure for cross-tool compatibility.
+
+        This method is called by build_conversation_history when reconstructing
+        conversation context, allowing each tool to control how its responses
+        appear in subsequent conversation turns.
+
+        Args:
+            turn: The conversation turn to format (from utils.conversation_memory)
+
+        Returns:
+            list[str]: Lines of formatted content for this turn
+
+        Example:
+            Default implementation returns:
+            ["Files used in this turn: file1.py, file2.py", "", "Response content..."]
+
+        Tools can override to add custom sections, formatting, or metadata display.
+        """
+        parts = []
+
+        # Add files context if present
+        if turn.files:
+            parts.append(f"Files used in this turn: {', '.join(turn.files)}")
+            parts.append("")  # Empty line for readability
+
+        # Add the actual content
+        parts.append(turn.content)
+
+        return parts
+
     def _prepare_file_content_for_prompt(
         self,
         request_files: list[str],
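As the docstring notes, tools can override this hook. A minimal sketch of what an override might look like follows; the tool class and header text are hypothetical, and the other abstract methods BaseTool requires are omitted for brevity.

```python
from tools.base import BaseTool
from utils.conversation_memory import ConversationTurn


class AuditTool(BaseTool):  # hypothetical tool; other required BaseTool methods omitted
    def format_conversation_turn(self, turn: ConversationTurn) -> list[str]:
        parts = []
        # Keep the standard files section for cross-tool compatibility
        if turn.files:
            parts.append(f"Files used in this turn: {', '.join(turn.files)}")
            parts.append("")
        # Add a tool-specific header before the response body
        parts.append("=== AUDIT FINDINGS ===")
        parts.append(turn.content)
        return parts
```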
@@ -716,109 +752,35 @@ class BaseTool(ABC):
|
||||
elif max_tokens is not None:
|
||||
effective_max_tokens = max_tokens - reserve_tokens
|
||||
else:
|
||||
# Get model-specific limits
|
||||
# First check if model_context was passed from server.py
|
||||
model_context = None
|
||||
if arguments:
|
||||
model_context = arguments.get("_model_context") or getattr(self, "_current_arguments", {}).get(
|
||||
"_model_context"
|
||||
# The execute() method is responsible for setting self._model_context.
|
||||
# A missing context is a programming error, not a fallback case.
|
||||
if not hasattr(self, "_model_context") or not self._model_context:
|
||||
logger.error(
|
||||
f"[FILES] {self.name}: _prepare_file_content_for_prompt called without a valid model context. "
|
||||
"This indicates an incorrect call sequence in the tool's implementation."
|
||||
)
|
||||
# Fail fast to reveal integration issues. A silent fallback with arbitrary
|
||||
# limits can hide bugs and lead to unexpected token usage or silent failures.
|
||||
raise RuntimeError("ModelContext not initialized before file preparation.")
|
||||
|
||||
if model_context:
|
||||
# Use the passed model context
|
||||
try:
|
||||
token_allocation = model_context.calculate_token_allocation()
|
||||
effective_max_tokens = token_allocation.file_tokens - reserve_tokens
|
||||
logger.debug(
|
||||
f"[FILES] {self.name}: Using passed model context for {model_context.model_name}: "
|
||||
f"{token_allocation.file_tokens:,} file tokens from {token_allocation.total_tokens:,} total"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"[FILES] {self.name}: Error using passed model context: {e}")
|
||||
# Fall through to manual calculation
|
||||
model_context = None
|
||||
|
||||
if not model_context:
|
||||
# Manual calculation as fallback
|
||||
from config import DEFAULT_MODEL
|
||||
|
||||
model_name = getattr(self, "_current_model_name", None) or DEFAULT_MODEL
|
||||
|
||||
# Handle auto mode gracefully
|
||||
if model_name.lower() == "auto":
|
||||
from providers.registry import ModelProviderRegistry
|
||||
|
||||
# Use tool-specific fallback model for capacity estimation
|
||||
# This properly handles different providers (OpenAI=200K, Gemini=1M)
|
||||
tool_category = self.get_model_category()
|
||||
fallback_model = ModelProviderRegistry.get_preferred_fallback_model(tool_category)
|
||||
logger.debug(
|
||||
f"[FILES] {self.name}: Auto mode detected, using {fallback_model} "
|
||||
f"for {tool_category.value} tool capacity estimation"
|
||||
)
|
||||
|
||||
try:
|
||||
provider = self.get_model_provider(fallback_model)
|
||||
capabilities = provider.get_capabilities(fallback_model)
|
||||
|
||||
# Calculate content allocation based on model capacity
|
||||
if capabilities.context_window < 300_000:
|
||||
# Smaller context models: 60% content, 40% response
|
||||
model_content_tokens = int(capabilities.context_window * 0.6)
|
||||
else:
|
||||
# Larger context models: 80% content, 20% response
|
||||
model_content_tokens = int(capabilities.context_window * 0.8)
|
||||
|
||||
effective_max_tokens = model_content_tokens - reserve_tokens
|
||||
logger.debug(
|
||||
f"[FILES] {self.name}: Using {fallback_model} capacity for auto mode: "
|
||||
f"{model_content_tokens:,} content tokens from {capabilities.context_window:,} total"
|
||||
)
|
||||
except (ValueError, AttributeError) as e:
|
||||
# Handle specific errors: provider not found, model not supported, missing attributes
|
||||
logger.warning(
|
||||
f"[FILES] {self.name}: Could not get capabilities for fallback model {fallback_model}: {type(e).__name__}: {e}"
|
||||
)
|
||||
# Fall back to conservative default for safety
|
||||
effective_max_tokens = 100_000 - reserve_tokens
|
||||
except Exception as e:
|
||||
# Catch any other unexpected errors
|
||||
logger.error(
|
||||
f"[FILES] {self.name}: Unexpected error getting model capabilities: {type(e).__name__}: {e}"
|
||||
)
|
||||
effective_max_tokens = 100_000 - reserve_tokens
|
||||
else:
|
||||
# Normal mode - use the specified model
|
||||
try:
|
||||
provider = self.get_model_provider(model_name)
|
||||
capabilities = provider.get_capabilities(model_name)
|
||||
|
||||
# Calculate content allocation based on model capacity
|
||||
if capabilities.context_window < 300_000:
|
||||
# Smaller context models: 60% content, 40% response
|
||||
model_content_tokens = int(capabilities.context_window * 0.6)
|
||||
else:
|
||||
# Larger context models: 80% content, 20% response
|
||||
model_content_tokens = int(capabilities.context_window * 0.8)
|
||||
|
||||
effective_max_tokens = model_content_tokens - reserve_tokens
|
||||
logger.debug(
|
||||
f"[FILES] {self.name}: Using model-specific limit for {model_name}: "
|
||||
f"{model_content_tokens:,} content tokens from {capabilities.context_window:,} total"
|
||||
)
|
||||
except (ValueError, AttributeError) as e:
|
||||
# Handle specific errors: provider not found, model not supported, missing attributes
|
||||
logger.warning(
|
||||
f"[FILES] {self.name}: Could not get model capabilities for {model_name}: {type(e).__name__}: {e}"
|
||||
)
|
||||
# Fall back to conservative default for safety
|
||||
effective_max_tokens = 100_000 - reserve_tokens
|
||||
except Exception as e:
|
||||
# Catch any other unexpected errors
|
||||
logger.error(
|
||||
f"[FILES] {self.name}: Unexpected error getting model capabilities: {type(e).__name__}: {e}"
|
||||
)
|
||||
effective_max_tokens = 100_000 - reserve_tokens
|
||||
# This is now the single source of truth for token allocation.
|
||||
model_context = self._model_context
|
||||
try:
|
||||
token_allocation = model_context.calculate_token_allocation()
|
||||
# Standardize on `file_tokens` for consistency and correctness.
|
||||
# This fixes the bug where the old code incorrectly used content_tokens
|
||||
effective_max_tokens = token_allocation.file_tokens - reserve_tokens
|
||||
logger.debug(
|
||||
f"[FILES] {self.name}: Using model context for {model_context.model_name}: "
|
||||
f"{token_allocation.file_tokens:,} file tokens from {token_allocation.total_tokens:,} total"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"[FILES] {self.name}: Failed to calculate token allocation from model context: {e}", exc_info=True
|
||||
)
|
||||
# If the context exists but calculation fails, we still need to prevent a crash.
|
||||
# A loud error is logged, and we fall back to a safe default.
|
||||
effective_max_tokens = 100_000 - reserve_tokens
|
||||
|
||||
# Ensure we have a reasonable minimum budget
|
||||
effective_max_tokens = max(1000, effective_max_tokens)
|
||||
@@ -1087,8 +1049,14 @@ When recommending searches, be specific about what information you need and why

         # Get model capabilities to check image support and size limits
         try:
-            provider = self.get_model_provider(model_name)
-            capabilities = provider.get_capabilities(model_name)
+            # Use the already-resolved provider from model context if available
+            if hasattr(self, "_model_context") and self._model_context:
+                provider = self._model_context.provider
+                capabilities = self._model_context.capabilities
+            else:
+                # Fallback for edge cases (e.g., direct test calls)
+                provider = self.get_model_provider(model_name)
+                capabilities = provider.get_capabilities(model_name)
         except Exception as e:
             logger.warning(f"Failed to get capabilities for model {model_name}: {e}")
             # Fall back to checking custom models configuration
@@ -1214,7 +1182,7 @@ When recommending searches, be specific about what information you need and why

         return estimate_file_tokens(file_path)

-    def check_total_file_size(self, files: list[str]) -> Optional[dict[str, Any]]:
+    def check_total_file_size(self, files: list[str], model_name: str) -> Optional[dict[str, Any]]:
         """
         Check if total file sizes would exceed token threshold before embedding.

@@ -1224,6 +1192,7 @@ When recommending searches, be specific about what information you need and why

         Args:
             files: List of file paths to check
+            model_name: The resolved model name to use for token limits

         Returns:
             Dict with `code_too_large` response if too large, None if acceptable
@@ -1231,13 +1200,6 @@ When recommending searches, be specific about what information you need and why
         if not files:
             return None

-        # Get current model name for context-aware thresholds
-        model_name = getattr(self, "_current_model_name", None)
-        if not model_name:
-            from config import DEFAULT_MODEL
-
-            model_name = DEFAULT_MODEL
-
         # Use centralized file size checking with model context
         from utils.file_utils import check_total_file_size as check_file_size_utility

@@ -1353,6 +1315,65 @@ When recommending searches, be specific about what information you need and why
|
||||
# Extract and validate images from request
|
||||
images = getattr(request, "images", None) or []
|
||||
|
||||
# MODEL RESOLUTION NOW HAPPENS AT MCP BOUNDARY
|
||||
# Extract pre-resolved model context from server.py
|
||||
model_context = self._current_arguments.get("_model_context")
|
||||
resolved_model_name = self._current_arguments.get("_resolved_model_name")
|
||||
|
||||
if model_context and resolved_model_name:
|
||||
# Model was already resolved at MCP boundary
|
||||
model_name = resolved_model_name
|
||||
logger.debug(f"Using pre-resolved model '{model_name}' from MCP boundary")
|
||||
else:
|
||||
# Fallback for direct execute calls
|
||||
model_name = getattr(request, "model", None)
|
||||
if not model_name:
|
||||
from config import DEFAULT_MODEL
|
||||
|
||||
model_name = DEFAULT_MODEL
|
||||
logger.debug(f"Using fallback model resolution for '{model_name}' (test mode)")
|
||||
|
||||
# For tests: Check if we should require model selection (auto mode)
|
||||
if self._should_require_model_selection(model_name):
|
||||
# Get suggested model based on tool category
|
||||
from providers.registry import ModelProviderRegistry
|
||||
|
||||
tool_category = self.get_model_category()
|
||||
suggested_model = ModelProviderRegistry.get_preferred_fallback_model(tool_category)
|
||||
|
||||
# Build error message based on why selection is required
|
||||
if model_name.lower() == "auto":
|
||||
error_message = (
|
||||
f"Model parameter is required in auto mode. "
|
||||
f"Suggested model for {self.name}: '{suggested_model}' "
|
||||
f"(category: {tool_category.value})"
|
||||
)
|
||||
else:
|
||||
# Model was specified but not available
|
||||
available_models = self._get_available_models()
|
||||
|
||||
error_message = (
|
||||
f"Model '{model_name}' is not available with current API keys. "
|
||||
f"Available models: {', '.join(available_models)}. "
|
||||
f"Suggested model for {self.name}: '{suggested_model}' "
|
||||
f"(category: {tool_category.value})"
|
||||
)
|
||||
error_output = ToolOutput(
|
||||
status="error",
|
||||
content=error_message,
|
||||
content_type="text",
|
||||
)
|
||||
return [TextContent(type="text", text=error_output.model_dump_json())]
|
||||
|
||||
# Create model context for tests
|
||||
from utils.model_context import ModelContext
|
||||
|
||||
model_context = ModelContext(model_name)
|
||||
|
||||
# Store resolved model name for use by helper methods
|
||||
self._current_model_name = model_name
|
||||
self._model_context = model_context
|
||||
|
||||
# Check if we have continuation_id - if so, conversation history is already embedded
|
||||
continuation_id = getattr(request, "continuation_id", None)
|
||||
|
||||
@@ -1389,57 +1410,11 @@ When recommending searches, be specific about what information you need and why
|
||||
prompt = f"{prompt}\n\n{follow_up_instructions}"
|
||||
logger.debug(f"Added follow-up instructions for new {self.name} conversation")
|
||||
|
||||
# Extract model configuration from request or use defaults
|
||||
model_name = getattr(request, "model", None)
|
||||
if not model_name:
|
||||
from config import DEFAULT_MODEL
|
||||
|
||||
model_name = DEFAULT_MODEL
|
||||
|
||||
# Check if we need Claude to select a model
|
||||
# This happens when:
|
||||
# 1. The model is explicitly "auto"
|
||||
# 2. The requested model is not available
|
||||
if self._should_require_model_selection(model_name):
|
||||
# Get suggested model based on tool category
|
||||
from providers.registry import ModelProviderRegistry
|
||||
|
||||
tool_category = self.get_model_category()
|
||||
suggested_model = ModelProviderRegistry.get_preferred_fallback_model(tool_category)
|
||||
|
||||
# Build error message based on why selection is required
|
||||
if model_name.lower() == "auto":
|
||||
error_message = (
|
||||
f"Model parameter is required in auto mode. "
|
||||
f"Suggested model for {self.name}: '{suggested_model}' "
|
||||
f"(category: {tool_category.value})"
|
||||
)
|
||||
else:
|
||||
# Model was specified but not available
|
||||
# Get list of available models
|
||||
available_models = self._get_available_models()
|
||||
|
||||
error_message = (
|
||||
f"Model '{model_name}' is not available with current API keys. "
|
||||
f"Available models: {', '.join(available_models)}. "
|
||||
f"Suggested model for {self.name}: '{suggested_model}' "
|
||||
f"(category: {tool_category.value})"
|
||||
)
|
||||
|
||||
error_output = ToolOutput(
|
||||
status="error",
|
||||
content=error_message,
|
||||
content_type="text",
|
||||
)
|
||||
return [TextContent(type="text", text=error_output.model_dump_json())]
|
||||
|
||||
# Store model name for use by helper methods like _prepare_file_content_for_prompt
|
||||
# Only set this after auto mode validation to prevent "auto" being used as a model name
|
||||
self._current_model_name = model_name
|
||||
# Model name already resolved and stored in self._current_model_name earlier
|
||||
|
||||
# Validate images at MCP boundary if any were provided
|
||||
if images:
|
||||
image_validation_error = self._validate_image_limits(images, model_name, continuation_id)
|
||||
image_validation_error = self._validate_image_limits(images, self._current_model_name, continuation_id)
|
||||
if image_validation_error:
|
||||
return [TextContent(type="text", text=json.dumps(image_validation_error))]
|
||||
|
||||
@@ -1451,10 +1426,10 @@ When recommending searches, be specific about what information you need and why
|
||||
thinking_mode = self.get_default_thinking_mode()
|
||||
|
||||
# Get the appropriate model provider
|
||||
provider = self.get_model_provider(model_name)
|
||||
provider = self.get_model_provider(self._current_model_name)
|
||||
|
||||
# Validate and correct temperature for this model
|
||||
temperature, temp_warnings = self._validate_and_correct_temperature(model_name, temperature)
|
||||
temperature, temp_warnings = self._validate_and_correct_temperature(self._current_model_name, temperature)
|
||||
|
||||
# Log any temperature corrections
|
||||
for warning in temp_warnings:
|
||||
@@ -1465,16 +1440,21 @@ When recommending searches, be specific about what information you need and why
|
||||
|
||||
# Generate AI response using the provider
|
||||
logger.info(f"Sending request to {provider.get_provider_type().value} API for {self.name}")
|
||||
logger.info(f"Using model: {model_name} via {provider.get_provider_type().value} provider")
|
||||
logger.debug(f"Prompt length: {len(prompt)} characters")
|
||||
logger.info(f"Using model: {self._current_model_name} via {provider.get_provider_type().value} provider")
|
||||
|
||||
# Import token estimation utility
|
||||
from utils.token_utils import estimate_tokens
|
||||
|
||||
estimated_tokens = estimate_tokens(prompt)
|
||||
logger.debug(f"Prompt length: {len(prompt)} characters (~{estimated_tokens:,} tokens)")
|
||||
|
||||
# Generate content with provider abstraction
|
||||
model_response = provider.generate_content(
|
||||
prompt=prompt,
|
||||
model_name=model_name,
|
||||
model_name=self._current_model_name,
|
||||
system_prompt=system_prompt,
|
||||
temperature=temperature,
|
||||
thinking_mode=thinking_mode if provider.supports_thinking_mode(model_name) else None,
|
||||
thinking_mode=thinking_mode if provider.supports_thinking_mode(self._current_model_name) else None,
|
||||
images=images if images else None, # Pass images via kwargs
|
||||
)
|
||||
|
||||
@@ -1486,7 +1466,11 @@ When recommending searches, be specific about what information you need and why
|
||||
|
||||
# Parse response to check for clarification requests or format output
|
||||
# Pass model info for conversation tracking
|
||||
model_info = {"provider": provider, "model_name": model_name, "model_response": model_response}
|
||||
model_info = {
|
||||
"provider": provider,
|
||||
"model_name": self._current_model_name,
|
||||
"model_response": model_response,
|
||||
}
|
||||
tool_output = self._parse_response(raw_text, request, model_info)
|
||||
logger.info(f"✅ {self.name} tool completed successfully")
|
||||
|
||||
@@ -1894,8 +1878,14 @@ When recommending searches, be specific about what information you need and why
|
||||
Tuple of (corrected_temperature, warning_messages)
|
||||
"""
|
||||
try:
|
||||
provider = self.get_model_provider(model_name)
|
||||
capabilities = provider.get_capabilities(model_name)
|
||||
# Use the already-resolved provider and capabilities from model context
|
||||
if hasattr(self, "_model_context") and self._model_context:
|
||||
capabilities = self._model_context.capabilities
|
||||
else:
|
||||
# Fallback for edge cases (e.g., direct test calls)
|
||||
provider = self.get_model_provider(model_name)
|
||||
capabilities = provider.get_capabilities(model_name)
|
||||
|
||||
constraint = capabilities.temperature_constraint
|
||||
|
||||
warnings = []
|
||||
|
||||
tools/codereview.py

@@ -227,13 +227,7 @@ class CodeReviewTool(BaseTool):
         if updated_files is not None:
             request.files = updated_files

-        # MCP boundary check - STRICT REJECTION
-        if request.files:
-            file_size_check = self.check_total_file_size(request.files)
-            if file_size_check:
-                from tools.models import ToolOutput
-
-                raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**file_size_check).model_dump_json()}")
+        # File size validation happens at MCP boundary in server.py

         # Check user input size at MCP transport boundary (before adding internal content)
         user_content = request.prompt

tools/consensus.py — 846 lines (new file)
@@ -0,0 +1,846 @@
|
||||
"""
|
||||
Consensus tool for multi-model perspective gathering and validation
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
|
||||
from mcp.types import TextContent
|
||||
from pydantic import BaseModel, Field, field_validator
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from tools.models import ToolModelCategory
|
||||
|
||||
from config import DEFAULT_CONSENSUS_MAX_INSTANCES_PER_COMBINATION
|
||||
from systemprompts import CONSENSUS_PROMPT
|
||||
|
||||
from .base import BaseTool, ToolRequest
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ModelConfig(BaseModel):
|
||||
"""Enhanced model configuration for consensus tool"""
|
||||
|
||||
model: str = Field(..., description="Model name to use (e.g., 'o3', 'flash', 'pro')")
|
||||
stance: Optional[str] = Field(
|
||||
default="neutral",
|
||||
description=(
|
||||
"Stance for this model. Supportive: 'for', 'support', 'favor'. "
|
||||
"Critical: 'against', 'oppose', 'critical'. Neutral: 'neutral'. "
|
||||
"Defaults to 'neutral'."
|
||||
),
|
||||
)
|
||||
stance_prompt: Optional[str] = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"Custom stance-specific instructions for this model. "
|
||||
"If provided, this will be used instead of the default stance prompt. "
|
||||
"Should be clear, specific instructions about how this model should approach the analysis."
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
class ConsensusRequest(ToolRequest):
|
||||
"""Request model for consensus tool"""
|
||||
|
||||
prompt: str = Field(
|
||||
...,
|
||||
description=(
|
||||
"Description of what to get consensus on, testing objectives, and specific scope/focus areas. "
|
||||
"Be as detailed as possible about the proposal, plan, or idea you want multiple perspectives on."
|
||||
),
|
||||
)
|
||||
models: list[ModelConfig] = Field(
|
||||
...,
|
||||
description=(
|
||||
"List of model configurations for consensus analysis. Each model can have a specific stance and custom instructions. "
|
||||
"Example: [{'model': 'o3', 'stance': 'for', 'stance_prompt': 'Focus on benefits and opportunities...'}, "
|
||||
"{'model': 'flash', 'stance': 'against', 'stance_prompt': 'Identify risks and challenges...'}]. "
|
||||
"Maximum 2 instances per model+stance combination."
|
||||
),
|
||||
)
|
||||
files: Optional[list[str]] = Field(
|
||||
default_factory=list,
|
||||
description="Optional files or directories for additional context (must be absolute paths)",
|
||||
)
|
||||
images: Optional[list[str]] = Field(
|
||||
default_factory=list,
|
||||
description=(
|
||||
"Optional images showing expected UI changes, design requirements, "
|
||||
"or visual references for the consensus analysis"
|
||||
),
|
||||
)
|
||||
focus_areas: Optional[list[str]] = Field(
|
||||
default_factory=list,
|
||||
description="Specific aspects to focus on (e.g., 'performance', 'security', 'user experience')",
|
||||
)
|
||||
|
||||
@field_validator("models")
|
||||
@classmethod
|
||||
def validate_models_not_empty(cls, v):
|
||||
if not v:
|
||||
raise ValueError("At least one model must be specified")
|
||||
return v
|
||||
|
||||
|
||||
class ConsensusTool(BaseTool):
|
||||
"""Multi-model consensus tool for gathering diverse perspectives on technical proposals"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
@staticmethod
|
||||
def parse_structured_prompt_models(model_spec: str) -> list[dict[str, str]]:
|
||||
"""
|
||||
Parse consensus model specification from structured prompt format.
|
||||
|
||||
This method parses structured prompt specifications used in Claude Code shortcuts
|
||||
like "/zen:consensus:flash:for,o3:against,pro:neutral" to extract model configurations
|
||||
with their assigned stances.
|
||||
|
||||
Supported formats:
|
||||
- "model:stance" - Explicit stance assignment (e.g., "flash:for", "o3:against")
|
||||
- "model" - Defaults to neutral stance (e.g., "pro" becomes "pro:neutral")
|
||||
|
||||
Supported stances:
|
||||
- Supportive: "for", "support", "favor"
|
||||
- Critical: "against", "oppose", "critical"
|
||||
- Neutral: "neutral" (default)
|
||||
|
||||
Args:
|
||||
model_spec (str): Comma-separated model specification string.
|
||||
Examples: "flash:for,o3:against,pro:neutral" or "flash:for,o3:against,pro"
|
||||
|
||||
Returns:
|
||||
list[dict[str, str]]: List of model configuration dictionaries with keys:
|
||||
- "model": The model name (e.g., "flash", "o3", "pro")
|
||||
- "stance": The normalized stance (e.g., "for", "against", "neutral")
|
||||
|
||||
Examples:
|
||||
>>> ConsensusTool.parse_structured_prompt_models("flash:for,o3:against,pro")
|
||||
[{"model": "flash", "stance": "for"}, {"model": "o3", "stance": "against"}, {"model": "pro", "stance": "neutral"}]
|
||||
|
||||
>>> ConsensusTool.parse_structured_prompt_models("flash,o3,pro")
|
||||
[{"model": "flash", "stance": "neutral"}, {"model": "o3", "stance": "neutral"}, {"model": "pro", "stance": "neutral"}]
|
||||
"""
|
||||
models = []
|
||||
|
||||
# Split by comma to get individual model specs
|
||||
model_parts = model_spec.split(",")
|
||||
|
||||
for part in model_parts:
|
||||
part = part.strip()
|
||||
if ":" in part:
|
||||
# Model with stance: "flash:for" or "o3:against"
|
||||
model_name, stance = part.split(":", 1)
|
||||
models.append({"model": model_name.strip(), "stance": stance.strip()})
|
||||
else:
|
||||
# Model without stance (defaults to neutral): "pro"
|
||||
models.append({"model": part.strip(), "stance": "neutral"})
|
||||
|
||||
return models
|
||||
|
||||
def get_name(self) -> str:
|
||||
return "consensus"
|
||||
|
||||
def get_description(self) -> str:
|
||||
return (
|
||||
"MULTI-MODEL CONSENSUS - Gather diverse perspectives from multiple AI models on technical proposals, "
|
||||
"plans, and ideas. Perfect for validation, feasibility assessment, and getting comprehensive "
|
||||
"viewpoints on complex decisions. Supports advanced stance steering with custom instructions for each model. "
|
||||
"You can specify different stances (for/against/neutral) and provide custom stance prompts to guide each "
|
||||
"model's analysis. Example: [{'model': 'o3', 'stance': 'for', 'stance_prompt': 'Focus on implementation "
|
||||
"benefits and user value'}, {'model': 'flash', 'stance': 'against', 'stance_prompt': 'Identify potential "
|
||||
"risks and technical challenges'}]. Use neutral stances by default unless structured debate would add value."
|
||||
)
|
||||
|
||||
def get_input_schema(self) -> dict[str, Any]:
|
||||
schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"prompt": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"Description of what to get consensus on, testing objectives, and specific scope/focus areas. "
|
||||
"Be as detailed as possible about the proposal, plan, or idea you want multiple perspectives on."
|
||||
),
|
||||
},
|
||||
"models": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"model": {
|
||||
"type": "string",
|
||||
"description": "Model name to use (e.g., 'o3', 'flash', 'pro')",
|
||||
},
|
||||
"stance": {
|
||||
"type": "string",
|
||||
"enum": ["for", "support", "favor", "against", "oppose", "critical", "neutral"],
|
||||
"description": "Stance for this model: supportive ('for', 'support', 'favor'), critical ('against', 'oppose', 'critical'), or 'neutral'",
|
||||
"default": "neutral",
|
||||
},
|
||||
"stance_prompt": {
|
||||
"type": "string",
|
||||
"description": "Custom stance-specific instructions for this model. If provided, this will be used instead of the default stance prompt.",
|
||||
},
|
||||
},
|
||||
"required": ["model"],
|
||||
},
|
||||
"description": (
|
||||
"List of model configurations for consensus analysis. Each model can have a specific stance and custom instructions. "
|
||||
"Example: [{'model': 'o3', 'stance': 'for', 'stance_prompt': 'Focus on benefits and opportunities...'}, "
|
||||
"{'model': 'flash', 'stance': 'against', 'stance_prompt': 'Identify risks and challenges...'}]. "
|
||||
"Maximum 2 instances per model+stance combination."
|
||||
),
|
||||
},
|
||||
"files": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"description": "Optional files or directories for additional context (must be absolute paths)",
|
||||
},
|
||||
"images": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"description": (
|
||||
"Optional images showing expected UI changes, design requirements, "
|
||||
"or visual references for the consensus analysis"
|
||||
),
|
||||
},
|
||||
"focus_areas": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"description": "Specific aspects to focus on (e.g., 'performance', 'security', 'user experience')",
|
||||
},
|
||||
"temperature": {
|
||||
"type": "number",
|
||||
"description": "Temperature (0-1, default 0.2 for consistency)",
|
||||
"minimum": 0,
|
||||
"maximum": 1,
|
||||
"default": self.get_default_temperature(),
|
||||
},
|
||||
"thinking_mode": {
|
||||
"type": "string",
|
||||
"enum": ["minimal", "low", "medium", "high", "max"],
|
||||
"description": (
|
||||
"Thinking depth: minimal (0.5% of model max), low (8%), medium (33%), "
|
||||
"high (67%), max (100% of model max)"
|
||||
),
|
||||
},
|
||||
"use_websearch": {
|
||||
"type": "boolean",
|
||||
"description": (
|
||||
"Enable web search for documentation, best practices, and current information. "
|
||||
"Particularly useful for: brainstorming sessions, architectural design discussions, "
|
||||
"exploring industry best practices, working with specific frameworks/technologies, "
|
||||
"researching solutions to complex problems, or when current documentation and "
|
||||
"community insights would enhance the analysis."
|
||||
),
|
||||
"default": True,
|
||||
},
|
||||
"continuation_id": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"Thread continuation ID for multi-turn conversations. Can be used to continue "
|
||||
"conversations across different tools. Only provide this if continuing a previous "
|
||||
"conversation thread."
|
||||
),
|
||||
},
|
||||
},
|
||||
"required": ["prompt", "models"],
|
||||
}
|
||||
|
||||
return schema
|
||||
|
||||
def get_system_prompt(self) -> str:
|
||||
return CONSENSUS_PROMPT
|
||||
|
||||
def get_default_temperature(self) -> float:
|
||||
return 0.2 # Lower temperature for more consistent consensus responses
|
||||
|
||||
def get_model_category(self) -> "ToolModelCategory":
|
||||
"""Consensus uses extended reasoning models for deep analysis"""
|
||||
from tools.models import ToolModelCategory
|
||||
|
||||
return ToolModelCategory.EXTENDED_REASONING
|
||||
|
||||
def get_request_model(self):
|
||||
return ConsensusRequest
|
||||
|
||||
def format_conversation_turn(self, turn) -> list[str]:
|
||||
"""
|
||||
Format consensus turns with individual model responses for better readability.
|
||||
|
||||
This custom formatting shows the individual model responses that were
|
||||
synthesized into the consensus, making it easier to understand the
|
||||
reasoning behind the final recommendation.
|
||||
"""
|
||||
parts = []
|
||||
|
||||
# Add files context if present
|
||||
if turn.files:
|
||||
parts.append(f"Files used in this turn: {', '.join(turn.files)}")
|
||||
parts.append("")
|
||||
|
||||
# Check if this is a consensus turn with individual responses
|
||||
if turn.model_metadata and turn.model_metadata.get("individual_responses"):
|
||||
individual_responses = turn.model_metadata["individual_responses"]
|
||||
|
||||
# Add consensus header
|
||||
models_consulted = []
|
||||
for resp in individual_responses:
|
||||
model = resp["model"]
|
||||
stance = resp.get("stance", "neutral")
|
||||
if stance != "neutral":
|
||||
models_consulted.append(f"{model}:{stance}")
|
||||
else:
|
||||
models_consulted.append(model)
|
||||
|
||||
parts.append(f"Models consulted: {', '.join(models_consulted)}")
|
||||
parts.append("")
|
||||
parts.append("=== INDIVIDUAL MODEL RESPONSES ===")
|
||||
parts.append("")
|
||||
|
||||
# Add each successful model response
|
||||
for i, response in enumerate(individual_responses):
|
||||
model_name = response["model"]
|
||||
stance = response.get("stance", "neutral")
|
||||
verdict = response["verdict"]
|
||||
|
||||
stance_label = f"({stance.title()} Stance)" if stance != "neutral" else "(Neutral Analysis)"
|
||||
parts.append(f"**{model_name.upper()} {stance_label}**:")
|
||||
parts.append(verdict)
|
||||
|
||||
if i < len(individual_responses) - 1:
|
||||
parts.append("")
|
||||
parts.append("---")
|
||||
parts.append("")
|
||||
|
||||
parts.append("=== END INDIVIDUAL RESPONSES ===")
|
||||
parts.append("")
|
||||
parts.append("Claude's Synthesis:")
|
||||
|
||||
# Add the actual content
|
||||
parts.append(turn.content)
|
||||
|
||||
return parts
|
||||
|
||||
def _normalize_stance(self, stance: Optional[str]) -> str:
|
||||
"""Normalize stance to canonical form."""
|
||||
if not stance:
|
||||
return "neutral"
|
||||
|
||||
stance = stance.lower()
|
||||
|
||||
# Define stance synonyms
|
||||
supportive_stances = {"for", "support", "favor"}
|
||||
critical_stances = {"against", "oppose", "critical"}
|
||||
|
||||
# Map synonyms to canonical stance
|
||||
if stance in supportive_stances:
|
||||
return "for"
|
||||
elif stance in critical_stances:
|
||||
return "against"
|
||||
elif stance == "neutral":
|
||||
return "neutral"
|
||||
else:
|
||||
# Unknown stances default to neutral for robustness
|
||||
logger.warning(
|
||||
f"Unknown stance '{stance}' provided, defaulting to 'neutral'. Valid stances: {', '.join(sorted(supportive_stances | critical_stances))}, or 'neutral'"
|
||||
)
|
||||
return "neutral"
|
||||
|
||||
def _validate_model_combinations(self, model_configs: list[ModelConfig]) -> tuple[list[ModelConfig], list[str]]:
|
||||
"""Validate model configurations and enforce limits.
|
||||
|
||||
Returns:
|
||||
tuple: (valid_configs, skipped_entries)
|
||||
- Each model+stance combination can appear max 2 times
|
||||
- Same model+stance limited to 2 instances
|
||||
"""
|
||||
valid_configs = []
|
||||
skipped_entries = []
|
||||
combination_counts = {} # Track (model, stance) -> count
|
||||
|
||||
for config in model_configs:
|
||||
try:
|
||||
# Normalize stance
|
||||
normalized_stance = self._normalize_stance(config.stance)
|
||||
|
||||
# Create normalized config
|
||||
normalized_config = ModelConfig(
|
||||
model=config.model, stance=normalized_stance, stance_prompt=config.stance_prompt
|
||||
)
|
||||
|
||||
combination_key = (config.model, normalized_stance)
|
||||
current_count = combination_counts.get(combination_key, 0)
|
||||
|
||||
if current_count >= DEFAULT_CONSENSUS_MAX_INSTANCES_PER_COMBINATION:
|
||||
# Already have max instances of this model+stance combination
|
||||
skipped_entries.append(
|
||||
f"{config.model}:{normalized_stance} (max {DEFAULT_CONSENSUS_MAX_INSTANCES_PER_COMBINATION} instances)"
|
||||
)
|
||||
continue
|
||||
|
||||
combination_counts[combination_key] = current_count + 1
|
||||
valid_configs.append(normalized_config)
|
||||
|
||||
except ValueError as e:
|
||||
# Invalid stance or model
|
||||
skipped_entries.append(f"{config.model} ({str(e)})")
|
||||
continue
|
||||
|
||||
return valid_configs, skipped_entries
|
||||
|
||||
def _get_stance_enhanced_prompt(self, stance: str, custom_stance_prompt: Optional[str] = None) -> str:
|
||||
"""Get the system prompt with stance injection based on the stance."""
|
||||
base_prompt = self.get_system_prompt()
|
||||
|
||||
# If custom stance prompt is provided, use it instead of default
|
||||
if custom_stance_prompt:
|
||||
# Validate stance placeholder exists exactly once
|
||||
if base_prompt.count("{stance_prompt}") != 1:
|
||||
raise ValueError(
|
||||
"System prompt must contain exactly one '{stance_prompt}' placeholder, "
|
||||
f"found {base_prompt.count('{stance_prompt}')}"
|
||||
)
|
||||
return base_prompt.replace("{stance_prompt}", custom_stance_prompt)
|
||||
|
||||
stance_prompts = {
|
||||
"for": """SUPPORTIVE PERSPECTIVE WITH INTEGRITY
|
||||
|
||||
You are tasked with advocating FOR this proposal, but with CRITICAL GUARDRAILS:
|
||||
|
||||
MANDATORY ETHICAL CONSTRAINTS:
|
||||
- This is NOT a debate for entertainment. You MUST act in good faith and in the best interest of the questioner
|
||||
- You MUST think deeply about whether supporting this idea is safe, sound, and passes essential requirements
|
||||
- You MUST be direct and unequivocal in saying "this is a bad idea" when it truly is
|
||||
- There must be at least ONE COMPELLING reason to be optimistic, otherwise DO NOT support it
|
||||
|
||||
WHEN TO REFUSE SUPPORT (MUST OVERRIDE STANCE):
|
||||
- If the idea is fundamentally harmful to users, project, or stakeholders
|
||||
- If implementation would violate security, privacy, or ethical standards
|
||||
- If the proposal is technically infeasible within realistic constraints
|
||||
- If costs/risks dramatically outweigh any potential benefits
|
||||
|
||||
YOUR SUPPORTIVE ANALYSIS SHOULD:
|
||||
- Identify genuine strengths and opportunities
|
||||
- Propose solutions to overcome legitimate challenges
|
||||
- Highlight synergies with existing systems
|
||||
- Suggest optimizations that enhance value
|
||||
- Present realistic implementation pathways
|
||||
|
||||
Remember: Being "for" means finding the BEST possible version of the idea IF it has merit, not blindly supporting bad ideas.""",
|
||||
"against": """CRITICAL PERSPECTIVE WITH RESPONSIBILITY
|
||||
|
||||
You are tasked with critiquing this proposal, but with ESSENTIAL BOUNDARIES:
|
||||
|
||||
MANDATORY FAIRNESS CONSTRAINTS:
|
||||
- You MUST NOT oppose genuinely excellent, common-sense ideas just to be contrarian
|
||||
- You MUST acknowledge when a proposal is fundamentally sound and well-conceived
|
||||
- You CANNOT give harmful advice or recommend against beneficial changes
|
||||
- If the idea is outstanding, say so clearly while offering constructive refinements
|
||||
|
||||
WHEN TO MODERATE CRITICISM (MUST OVERRIDE STANCE):
|
||||
- If the proposal addresses critical user needs effectively
|
||||
- If it follows established best practices with good reason
|
||||
- If benefits clearly and substantially outweigh risks
|
||||
- If it's the obvious right solution to the problem
|
||||
|
||||
YOUR CRITICAL ANALYSIS SHOULD:
|
||||
- Identify legitimate risks and failure modes
|
||||
- Point out overlooked complexities
|
||||
- Suggest more efficient alternatives
|
||||
- Highlight potential negative consequences
|
||||
- Question assumptions that may be flawed
|
||||
|
||||
Remember: Being "against" means rigorous scrutiny to ensure quality, not undermining good ideas that deserve support.""",
|
||||
"neutral": """BALANCED PERSPECTIVE
|
||||
|
||||
Provide objective analysis considering both positive and negative aspects. However, if there is overwhelming evidence
|
||||
that the proposal clearly leans toward being exceptionally good or particularly problematic, you MUST accurately
|
||||
reflect this reality. Being "balanced" means being truthful about the weight of evidence, not artificially creating
|
||||
50/50 splits when the reality is 90/10.
|
||||
|
||||
Your analysis should:
|
||||
- Present all significant pros and cons discovered
|
||||
- Weight them according to actual impact and likelihood
|
||||
- If evidence strongly favors one conclusion, clearly state this
|
||||
- Provide proportional coverage based on the strength of arguments
|
||||
- Help the questioner see the true balance of considerations
|
||||
|
||||
Remember: Artificial balance that misrepresents reality is not helpful. True balance means accurate representation
|
||||
of the evidence, even when it strongly points in one direction.""",
|
||||
}
|
||||
|
||||
stance_prompt = stance_prompts.get(stance, stance_prompts["neutral"])
|
||||
|
||||
# Validate stance placeholder exists exactly once
|
||||
if base_prompt.count("{stance_prompt}") != 1:
|
||||
raise ValueError(
|
||||
"System prompt must contain exactly one '{stance_prompt}' placeholder, "
|
||||
f"found {base_prompt.count('{stance_prompt}')}"
|
||||
)
|
||||
|
||||
# Inject stance into the system prompt
|
||||
return base_prompt.replace("{stance_prompt}", stance_prompt)
|
||||
|
||||
def _get_single_response(
|
||||
self, provider, model_config: ModelConfig, prompt: str, request: ConsensusRequest
|
||||
) -> dict[str, Any]:
|
||||
"""Get response from a single model - synchronous method."""
|
||||
logger.debug(f"Getting response from {model_config.model} with stance '{model_config.stance}'")
|
||||
|
||||
try:
|
||||
# Provider.generate_content is synchronous, not async
|
||||
response = provider.generate_content(
|
||||
prompt=prompt,
|
||||
model_name=model_config.model,
|
||||
system_prompt=self._get_stance_enhanced_prompt(model_config.stance, model_config.stance_prompt),
|
||||
temperature=getattr(request, "temperature", None) or self.get_default_temperature(),
|
||||
thinking_mode=getattr(request, "thinking_mode", "medium"),
|
||||
images=getattr(request, "images", None) or [],
|
||||
)
|
||||
return {
|
||||
"model": model_config.model,
|
||||
"stance": model_config.stance,
|
||||
"status": "success",
|
||||
"verdict": response.content, # Contains structured Markdown
|
||||
"metadata": {
|
||||
"provider": getattr(provider.get_provider_type(), "value", provider.get_provider_type()),
|
||||
"usage": response.usage if hasattr(response, "usage") else None,
|
||||
"custom_stance_prompt": bool(model_config.stance_prompt),
|
||||
},
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting response from {model_config.model}:{model_config.stance}: {str(e)}")
|
||||
return {"model": model_config.model, "stance": model_config.stance, "status": "error", "error": str(e)}
|
||||
|
||||
def _get_consensus_responses(
|
||||
self, provider_configs: list[tuple], prompt: str, request: ConsensusRequest
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Execute all model requests sequentially - purely synchronous like other tools."""
|
||||
|
||||
logger.debug(f"Processing {len(provider_configs)} models sequentially")
|
||||
responses = []
|
||||
|
||||
for i, (provider, model_config) in enumerate(provider_configs):
|
||||
try:
|
||||
logger.debug(
|
||||
f"Processing {model_config.model}:{model_config.stance} sequentially ({i+1}/{len(provider_configs)})"
|
||||
)
|
||||
|
||||
# Direct synchronous call - matches pattern of other tools
|
||||
response = self._get_single_response(provider, model_config, prompt, request)
|
||||
responses.append(response)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get response from {model_config.model}:{model_config.stance}: {str(e)}")
|
||||
responses.append(
|
||||
{
|
||||
"model": model_config.model,
|
||||
"stance": model_config.stance,
|
||||
"status": "error",
|
||||
"error": f"Unhandled exception: {str(e)}",
|
||||
}
|
||||
)
|
||||
|
||||
logger.debug(f"Sequential processing completed for {len(responses)} models")
|
||||
return responses
|
||||
|
||||
def _format_consensus_output(self, responses: list[dict[str, Any]], skipped_entries: list[str]) -> str:
|
||||
"""Format the consensus responses into structured output for Claude."""
|
||||
|
||||
logger.debug(f"Formatting consensus output for {len(responses)} responses")
|
||||
|
||||
# Separate successful and failed responses
|
||||
successful_responses = [r for r in responses if r["status"] == "success"]
|
||||
failed_responses = [r for r in responses if r["status"] == "error"]
|
||||
|
||||
logger.debug(f"Successful responses: {len(successful_responses)}, Failed: {len(failed_responses)}")
|
||||
|
||||
# Prepare the structured output (minimize size for MCP stability)
|
||||
models_used = [
|
||||
f"{r['model']}:{r['stance']}" if r["stance"] != "neutral" else r["model"] for r in successful_responses
|
||||
]
|
||||
models_errored = [
|
||||
f"{r['model']}:{r['stance']}" if r["stance"] != "neutral" else r["model"] for r in failed_responses
|
||||
]
|
||||
|
||||
# Prepare clean responses without truncation
|
||||
clean_responses = []
|
||||
for r in responses:
|
||||
if r["status"] == "success":
|
||||
clean_responses.append(
|
||||
{
|
||||
"model": r["model"],
|
||||
"stance": r["stance"],
|
||||
"status": r["status"],
|
||||
"verdict": r.get("verdict", ""),
|
||||
"metadata": r.get("metadata", {}),
|
||||
}
|
||||
)
|
||||
else:
|
||||
clean_responses.append(
|
||||
{
|
||||
"model": r["model"],
|
||||
"stance": r["stance"],
|
||||
"status": r["status"],
|
||||
"error": r.get("error", "Unknown error"),
|
||||
}
|
||||
)
|
||||
|
||||
output_data = {
|
||||
"status": "consensus_success" if successful_responses else "consensus_failed",
|
||||
"models_used": models_used,
|
||||
"models_skipped": skipped_entries,
|
||||
"models_errored": models_errored,
|
||||
"responses": clean_responses,
|
||||
"next_steps": self._get_synthesis_guidance(successful_responses, failed_responses),
|
||||
}
|
||||
|
||||
return json.dumps(output_data, indent=2)
|
||||
|
||||
def _get_synthesis_guidance(
|
||||
self, successful_responses: list[dict[str, Any]], failed_responses: list[dict[str, Any]]
|
||||
) -> str:
|
||||
"""Generate guidance for Claude on how to synthesize the consensus results."""
|
||||
|
||||
if not successful_responses:
|
||||
return (
|
||||
"No models provided successful responses. Please retry with different models or "
|
||||
"check the error messages for guidance on resolving the issues."
|
||||
)
|
||||
|
||||
if len(successful_responses) == 1:
|
||||
return (
|
||||
"Only one model provided a successful response. Synthesize based on the available "
|
||||
"perspective and indicate areas where additional expert input would be valuable "
|
||||
"due to the limited consensus data."
|
||||
)
|
||||
|
||||
# Multiple successful responses - provide comprehensive synthesis guidance
|
||||
stance_counts = {"for": 0, "against": 0, "neutral": 0}
|
||||
for resp in successful_responses:
|
||||
stance = resp.get("stance", "neutral")
|
||||
stance_counts[stance] = stance_counts.get(stance, 0) + 1
|
||||
|
||||
guidance = (
|
||||
"Claude, synthesize these perspectives by first identifying the key points of "
|
||||
"**agreement** and **disagreement** between the models. Then provide your final, "
|
||||
"consolidated recommendation, explaining how you weighed the different opinions and "
|
||||
"why your proposed solution is the most balanced approach. Explicitly address the "
|
||||
"most critical risks raised by each model and provide actionable next steps for implementation."
|
||||
)
|
||||
|
||||
if failed_responses:
|
||||
guidance += (
|
||||
f" Note: {len(failed_responses)} model(s) failed to respond - consider this "
|
||||
"partial consensus and indicate where additional expert input would strengthen the analysis."
|
||||
)
|
||||
|
||||
return guidance
|
||||
|
||||
async def prepare_prompt(self, request: ConsensusRequest) -> str:
|
||||
"""Prepare the consensus prompt with context files and focus areas."""
|
||||
# Check for prompt.txt in files
|
||||
prompt_content, updated_files = self.handle_prompt_file(request.files)
|
||||
|
||||
# Use prompt.txt content if available, otherwise use the prompt field
|
||||
user_content = prompt_content if prompt_content else request.prompt
|
||||
|
||||
# Check user input size at MCP transport boundary (before adding internal content)
|
||||
size_check = self.check_prompt_size(user_content)
|
||||
if size_check:
|
||||
# Need to return error, but prepare_prompt returns str
|
||||
# Use exception to handle this cleanly
|
||||
from tools.models import ToolOutput
|
||||
|
||||
raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**size_check).model_dump_json()}")
|
||||
|
||||
# Update request files list
|
||||
if updated_files is not None:
|
||||
request.files = updated_files
|
||||
|
||||
# Add focus areas if specified
|
||||
if request.focus_areas:
|
||||
focus_areas_text = "\n\nSpecific focus areas for this analysis:\n" + "\n".join(
|
||||
f"- {area}" for area in request.focus_areas
|
||||
)
|
||||
user_content += focus_areas_text
|
||||
|
||||
# Add context files if provided (using centralized file handling with filtering)
|
||||
if request.files:
|
||||
file_content, processed_files = self._prepare_file_content_for_prompt(
|
||||
request.files, request.continuation_id, "Context files"
|
||||
)
|
||||
self._actually_processed_files = processed_files
|
||||
if file_content:
|
||||
user_content = f"{user_content}\n\n=== CONTEXT FILES ===\n{file_content}\n=== END CONTEXT ===="
|
||||
|
||||
# Check token limits
|
||||
self._validate_token_limit(user_content, "Content")
|
||||
|
||||
return user_content
|
||||
|
||||
async def execute(self, arguments: dict[str, Any]) -> list[TextContent]:
|
||||
"""Execute consensus gathering from multiple models."""
|
||||
|
||||
# Store arguments for base class methods
|
||||
self._current_arguments = arguments
|
||||
|
||||
# Validate and create request
|
||||
request = ConsensusRequest(**arguments)
|
||||
|
||||
# Validate model configurations and enforce limits
|
||||
valid_configs, skipped_entries = self._validate_model_combinations(request.models)
|
||||
|
||||
if not valid_configs:
|
||||
error_output = {
|
||||
"status": "consensus_failed",
|
||||
"error": "No valid model configurations after validation",
|
||||
"models_skipped": skipped_entries,
|
||||
"next_steps": "Please provide valid model configurations with proper model names and stance values.",
|
||||
}
|
||||
return [TextContent(type="text", text=json.dumps(error_output, indent=2))]
|
||||
|
||||
# Set up a dummy model context for consensus since we handle multiple models
|
||||
# This is needed for base class methods like prepare_prompt to work
|
||||
if not hasattr(self, "_model_context") or not self._model_context:
|
||||
from utils.model_context import ModelContext
|
||||
|
||||
# Use the first model as the representative for token calculations
|
||||
first_model = valid_configs[0].model if valid_configs else "flash"
|
||||
self._model_context = ModelContext(first_model)
|
||||
|
||||
# Handle conversation continuation if specified
|
||||
if request.continuation_id:
|
||||
from utils.conversation_memory import build_conversation_history, get_thread
|
||||
|
||||
thread_context = get_thread(request.continuation_id)
|
||||
if thread_context:
|
||||
# Build conversation history using the same pattern as other tools
|
||||
conversation_context, _ = build_conversation_history(thread_context, self._model_context)
|
||||
if conversation_context:
|
||||
# Add conversation context to the beginning of the prompt
|
||||
enhanced_prompt = f"{conversation_context}\n\n{request.prompt}"
|
||||
request.prompt = enhanced_prompt
|
||||
|
||||
# Prepare the consensus prompt
|
||||
consensus_prompt = await self.prepare_prompt(request)
|
||||
|
||||
# Get providers for valid model configurations with caching to avoid duplicate lookups
|
||||
provider_configs = []
|
||||
provider_cache = {} # Cache to avoid duplicate provider lookups
|
||||
|
||||
for model_config in valid_configs:
|
||||
try:
|
||||
# Check cache first
|
||||
if model_config.model in provider_cache:
|
||||
provider = provider_cache[model_config.model]
|
||||
else:
|
||||
# Look up provider and cache it
|
||||
provider = self.get_model_provider(model_config.model)
|
||||
provider_cache[model_config.model] = provider
|
||||
|
||||
provider_configs.append((provider, model_config))
|
||||
except Exception as e:
|
||||
# Track failed models
|
||||
model_display = (
|
||||
f"{model_config.model}:{model_config.stance}"
|
||||
if model_config.stance != "neutral"
|
||||
else model_config.model
|
||||
)
|
||||
skipped_entries.append(f"{model_display} (provider not available: {str(e)})")
|
||||
|
||||
if not provider_configs:
|
||||
error_output = {
|
||||
"status": "consensus_failed",
|
||||
"error": "No model providers available",
|
||||
"models_skipped": skipped_entries,
|
||||
"next_steps": "Please check that the specified models have configured API keys and are available.",
|
||||
}
|
||||
return [TextContent(type="text", text=json.dumps(error_output, indent=2))]
|
||||
|
||||
# Send to all models sequentially (purely synchronous like other tools)
|
||||
logger.debug(f"Sending consensus request to {len(provider_configs)} models")
|
||||
responses = self._get_consensus_responses(provider_configs, consensus_prompt, request)
|
||||
logger.debug(f"Received {len(responses)} responses from consensus models")
|
||||
|
||||
# Enforce minimum success requirement - must have at least 1 successful response
|
||||
successful_responses = [r for r in responses if r["status"] == "success"]
|
||||
if not successful_responses:
|
||||
error_output = {
|
||||
"status": "consensus_failed",
|
||||
"error": "All model calls failed - no successful responses received",
|
||||
"models_skipped": skipped_entries,
|
||||
"models_errored": [
|
||||
f"{r['model']}:{r['stance']}" if r["stance"] != "neutral" else r["model"]
|
||||
for r in responses
|
||||
if r["status"] == "error"
|
||||
],
|
||||
"next_steps": "Please retry with different models or check the error messages for guidance on resolving the issues.",
|
||||
}
|
||||
return [TextContent(type="text", text=json.dumps(error_output, indent=2))]
|
||||
|
||||
logger.debug("About to format consensus output for MCP response")
|
||||
|
||||
# Structure the output and store in conversation memory
|
||||
consensus_output = self._format_consensus_output(responses, skipped_entries)
|
||||
|
||||
# Log response size for debugging
|
||||
output_size = len(consensus_output)
|
||||
logger.debug(f"Consensus output size: {output_size:,} characters")
|
||||
|
||||
# Store in conversation memory if continuation_id is provided
|
||||
if request.continuation_id:
|
||||
self.store_conversation_turn(
|
||||
request.continuation_id,
|
||||
consensus_output,
|
||||
request.files,
|
||||
request.images,
|
||||
responses, # Store individual responses in metadata
|
||||
skipped_entries,
|
||||
)
|
||||
|
||||
return [TextContent(type="text", text=consensus_output)]

    def store_conversation_turn(
        self,
        continuation_id: str,
        output: str,
        files: list[str],
        images: list[str],
        responses: list[dict[str, Any]],
        skipped_entries: list[str],
    ):
        """Store consensus turn in conversation memory with special metadata."""
        from utils.conversation_memory import add_turn

        # Filter successful and failed responses
        successful_responses = [r for r in responses if r["status"] == "success"]
        failed_responses = [r for r in responses if r["status"] == "error"]

        # Prepare metadata for conversation storage
        metadata = {
            "tool_type": "consensus",
            "models_used": [r["model"] for r in successful_responses],
            "models_skipped": skipped_entries,
            "models_errored": [r["model"] for r in failed_responses],
            "individual_responses": successful_responses,  # Only store successful responses
        }

        # Store the turn with special consensus metadata - add_turn is synchronous
        add_turn(
            thread_id=continuation_id,
            role="assistant",
            content=output,
            files=files or [],
            images=images or [],
            tool_name="consensus",
            model_provider="consensus",  # Special provider name
            model_name="consensus",  # Special model name
            model_metadata=metadata,
        )
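For a hypothetical two-model run, the metadata stored above might look like the following. The keys match the dict built in store_conversation_turn; the model names are placeholders, and only the response fields visible in this diff (model, stance, status) are shown.

# Illustrative metadata for a two-model consensus turn (placeholder model names)
metadata = {
    "tool_type": "consensus",
    "models_used": ["o3", "flash"],
    "models_skipped": [],
    "models_errored": [],
    "individual_responses": [
        {"model": "o3", "stance": "for", "status": "success"},
        {"model": "flash", "stance": "against", "status": "success"},
    ],
}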

@@ -159,13 +159,7 @@ class DebugIssueTool(BaseTool):
        if updated_files is not None:
            request.files = updated_files

        # MCP boundary check - STRICT REJECTION
        if request.files:
            file_size_check = self.check_total_file_size(request.files)
            if file_size_check:
                from tools.models import ToolOutput

                raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**file_size_check).model_dump_json()}")
        # File size validation happens at MCP boundary in server.py

        # Build context sections
        context_parts = [f"=== ISSUE DESCRIPTION ===\n{request.prompt}\n=== END DESCRIPTION ==="]

@@ -236,13 +236,7 @@ class Precommit(BaseTool):
        translated_path = translate_path_for_environment(request.path)
        translated_files = translate_file_paths(request.files)

        # MCP boundary check - STRICT REJECTION (check original files before translation)
        if request.files:
            file_size_check = self.check_total_file_size(request.files)
            if file_size_check:
                from tools.models import ToolOutput

                raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**file_size_check).model_dump_json()}")
        # File size validation happens at MCP boundary in server.py

        # Check if the path translation resulted in an error path
        if translated_path.startswith("/inaccessible/"):

@@ -409,23 +409,25 @@ class RefactorTool(BaseTool):
        continuation_id = getattr(request, "continuation_id", None)

        # Get model context for token budget calculation
        model_name = getattr(self, "_current_model_name", None)
        available_tokens = None

        if model_name:
        if hasattr(self, "_model_context") and self._model_context:
            try:
                provider = self.get_model_provider(model_name)
                capabilities = provider.get_capabilities(model_name)
                capabilities = self._model_context.capabilities
                # Use 75% of context for content (code + style examples), 25% for response
                available_tokens = int(capabilities.context_window * 0.75)
                logger.debug(
                    f"[REFACTOR] Token budget calculation: {available_tokens:,} tokens (75% of {capabilities.context_window:,}) for model {model_name}"
                    f"[REFACTOR] Token budget calculation: {available_tokens:,} tokens (75% of {capabilities.context_window:,}) for model {self._model_context.model_name}"
                )
            except Exception as e:
                # Fallback to conservative estimate
                logger.warning(f"[REFACTOR] Could not get model capabilities for {model_name}: {e}")
                logger.warning(f"[REFACTOR] Could not get model capabilities: {e}")
                available_tokens = 120000  # Conservative fallback
                logger.debug(f"[REFACTOR] Using fallback token budget: {available_tokens:,} tokens")
        else:
            # No model context available (shouldn't happen in normal flow)
            available_tokens = 120000  # Conservative fallback
            logger.debug(f"[REFACTOR] No model context, using fallback token budget: {available_tokens:,} tokens")

        # Process style guide examples first to determine token allocation
        style_examples_content = ""
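As a quick sanity check on the 75/25 split used above, with an illustrative context window that is not taken from this diff:

# Illustrative numbers only - not tied to any specific provider
context_window = 200_000
available_tokens = int(context_window * 0.75)         # 150,000 tokens for code + style examples
response_budget = context_window - available_tokens   # 50,000 tokens left for the model's response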

@@ -290,23 +290,25 @@ class TestGenerationTool(BaseTool):
        continuation_id = getattr(request, "continuation_id", None)

        # Get model context for token budget calculation
        model_name = getattr(self, "_current_model_name", None)
        available_tokens = None

        if model_name:
        if hasattr(self, "_model_context") and self._model_context:
            try:
                provider = self.get_model_provider(model_name)
                capabilities = provider.get_capabilities(model_name)
                capabilities = self._model_context.capabilities
                # Use 75% of context for content (code + test examples), 25% for response
                available_tokens = int(capabilities.context_window * 0.75)
                logger.debug(
                    f"[TESTGEN] Token budget calculation: {available_tokens:,} tokens (75% of {capabilities.context_window:,}) for model {model_name}"
                    f"[TESTGEN] Token budget calculation: {available_tokens:,} tokens (75% of {capabilities.context_window:,}) for model {self._model_context.model_name}"
                )
            except Exception as e:
                # Fallback to conservative estimate
                logger.warning(f"[TESTGEN] Could not get model capabilities for {model_name}: {e}")
                logger.warning(f"[TESTGEN] Could not get model capabilities: {e}")
                available_tokens = 120000  # Conservative fallback
                logger.debug(f"[TESTGEN] Using fallback token budget: {available_tokens:,} tokens")
        else:
            # No model context available (shouldn't happen in normal flow)
            available_tokens = 120000  # Conservative fallback
            logger.debug(f"[TESTGEN] No model context, using fallback token budget: {available_tokens:,} tokens")

        # Process test examples first to determine token allocation
        test_examples_content = ""

@@ -158,13 +158,7 @@ class ThinkDeepTool(BaseTool):
        if updated_files is not None:
            request.files = updated_files

        # MCP boundary check - STRICT REJECTION
        if request.files:
            file_size_check = self.check_total_file_size(request.files)
            if file_size_check:
                from tools.models import ToolOutput

                raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**file_size_check).model_dump_json()}")
        # File size validation happens at MCP boundary in server.py

        # Build context parts
        context_parts = [f"=== CLAUDE'S CURRENT ANALYSIS ===\n{current_analysis}\n=== END ANALYSIS ==="]