Add Consensus Tool for Multi-Model Perspective Gathering (#67)

* WIP
Refactor model-name resolution; it should happen once at the MCP call boundary,
with a model context passed around instead.
The consensus tool gathers a consensus from multiple models, optionally assigning a model a 'for' or 'against' stance to surface more nuanced responses (a request sketch follows below).
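
A minimal sketch of what a consensus request might look like, purely for illustration; the field names (`prompt`, `models`, `stance`) are assumptions, not the tool's actual schema:

```python
# Hypothetical request shape for the consensus tool; field names are
# assumptions for illustration, not the real schema.
consensus_request = {
    "prompt": "Should we migrate the internal API from REST to gRPC?",
    "models": [
        {"model": "o3", "stance": "for"},              # argues in favour
        {"model": "gemini-pro", "stance": "against"},  # argues against
        {"model": "flash"},                            # no stance -> neutral
    ],
}
```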

* Deduplicated model resolution; model_context is now available before reaching deeper parts of the code
Improved the abstraction for building conversations
Throw programmer errors early

* Guardrails
Support for the `model:option` format at the MCP boundary, so future tools can pass additional options when needed instead of handling this only for consensus
Model names now accept an optional ":option" suffix for future use (see the parsing sketch below)
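
A minimal sketch of how such a suffix could be split off, assuming the option is simply whatever follows the first colon; the real parsing at the MCP boundary may differ:

```python
def split_model_option(raw: str) -> tuple[str, str | None]:
    """Split a 'model:option' string into (model, option).

    Illustrative only: assumes the option is everything after the first
    colon, which may not match the server's actual parsing rules.
    """
    model, sep, option = raw.partition(":")
    return model.strip(), (option.strip() or None) if sep else None


# split_model_option("o3")             -> ("o3", None)
# split_model_option("o3:for")         -> ("o3", "for")
# split_model_option("flash:against")  -> ("flash", "against")
```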

* Simplified async flow

* Improved the request model to support natural language
Simplified the async flow

* Fix consensus tool async/sync patterns to match codebase standards

CRITICAL FIXES:
- Converted _get_consensus_responses from async to sync (matches other tools)
- Converted store_conversation_turn from async to sync (add_turn is synchronous)
- Removed unnecessary asyncio imports and sleep calls
- Fixed ClosedResourceError in MCP protocol during long consensus operations

PATTERN ALIGNMENT:
- Consensus tool now follows same sync patterns as all other tools
- Only execute() and prepare_prompt() are async (base class requirement)
- All internal operations are synchronous, like analyze, chat, debug, etc. (a pattern sketch follows below)
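
A minimal sketch of that pattern, with simplified names and bodies standing in for the real base class and helpers:

```python
# Simplified illustration of the sync-internals pattern; the class and
# method bodies are stand-ins, not the real tool implementation.
class ConsensusToolSketch:
    async def execute(self, arguments: dict) -> list[str]:
        # Only the entry points required by the base class are async.
        prompt = await self.prepare_prompt(arguments)
        responses = self._get_consensus_responses(prompt)
        self.store_conversation_turn(responses)
        return responses

    async def prepare_prompt(self, arguments: dict) -> str:
        return arguments["prompt"]

    def _get_consensus_responses(self, prompt: str) -> list[str]:
        # Plain blocking calls; no asyncio imports or sleep calls needed.
        return [f"model response to: {prompt}"]

    def store_conversation_turn(self, responses: list[str]) -> None:
        # add_turn() in conversation_memory is synchronous, so this is too.
        pass
```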

TESTING:
- MCP simulation test now passes: consensus_stance 
- Two-model consensus works correctly in ~35 seconds
- Unknown stance handling defaults to neutral with warnings
- All 9 unit tests pass (100% success rate)

The consensus tool's async patterns were anomalous in the codebase.
This fix aligns the tool with the established synchronous patterns used
by all other tools while maintaining full functionality.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

* Fixed call order and added new test

* Cleaned up dead comments
Added docs for the new tool
Improved tests

---------

Co-authored-by: Claude <noreply@anthropic.com>
Author: Beehive Innovations
Date: 2025-06-17 10:53:17 +04:00 (committed by GitHub)
Parent: 9b98df650b
Commit: 95556ba9ea
31 changed files with 2643 additions and 324 deletions

@@ -31,6 +31,7 @@ from providers.base import ProviderType
from utils import check_token_limit
from utils.conversation_memory import (
MAX_CONVERSATION_TURNS,
ConversationTurn,
add_turn,
create_thread,
get_conversation_file_list,
@@ -643,6 +644,41 @@ class BaseTool(ABC):
)
return requested_files
def format_conversation_turn(self, turn: ConversationTurn) -> list[str]:
"""
Format a conversation turn for display in conversation history.
Tools can override this to provide custom formatting for their responses
while maintaining the standard structure for cross-tool compatibility.
This method is called by build_conversation_history when reconstructing
conversation context, allowing each tool to control how its responses
appear in subsequent conversation turns.
Args:
turn: The conversation turn to format (from utils.conversation_memory)
Returns:
list[str]: Lines of formatted content for this turn
Example:
Default implementation returns:
["Files used in this turn: file1.py, file2.py", "", "Response content..."]
Tools can override to add custom sections, formatting, or metadata display.
"""
parts = []
# Add files context if present
if turn.files:
parts.append(f"Files used in this turn: {', '.join(turn.files)}")
parts.append("") # Empty line for readability
# Add the actual content
parts.append(turn.content)
return parts
def _prepare_file_content_for_prompt(
self,
request_files: list[str],
@@ -716,109 +752,35 @@ class BaseTool(ABC):
elif max_tokens is not None:
effective_max_tokens = max_tokens - reserve_tokens
else:
# Get model-specific limits
# First check if model_context was passed from server.py
model_context = None
if arguments:
model_context = arguments.get("_model_context") or getattr(self, "_current_arguments", {}).get(
"_model_context"
# The execute() method is responsible for setting self._model_context.
# A missing context is a programming error, not a fallback case.
if not hasattr(self, "_model_context") or not self._model_context:
logger.error(
f"[FILES] {self.name}: _prepare_file_content_for_prompt called without a valid model context. "
"This indicates an incorrect call sequence in the tool's implementation."
)
# Fail fast to reveal integration issues. A silent fallback with arbitrary
# limits can hide bugs and lead to unexpected token usage or silent failures.
raise RuntimeError("ModelContext not initialized before file preparation.")
if model_context:
# Use the passed model context
try:
token_allocation = model_context.calculate_token_allocation()
effective_max_tokens = token_allocation.file_tokens - reserve_tokens
logger.debug(
f"[FILES] {self.name}: Using passed model context for {model_context.model_name}: "
f"{token_allocation.file_tokens:,} file tokens from {token_allocation.total_tokens:,} total"
)
except Exception as e:
logger.warning(f"[FILES] {self.name}: Error using passed model context: {e}")
# Fall through to manual calculation
model_context = None
if not model_context:
# Manual calculation as fallback
from config import DEFAULT_MODEL
model_name = getattr(self, "_current_model_name", None) or DEFAULT_MODEL
# Handle auto mode gracefully
if model_name.lower() == "auto":
from providers.registry import ModelProviderRegistry
# Use tool-specific fallback model for capacity estimation
# This properly handles different providers (OpenAI=200K, Gemini=1M)
tool_category = self.get_model_category()
fallback_model = ModelProviderRegistry.get_preferred_fallback_model(tool_category)
logger.debug(
f"[FILES] {self.name}: Auto mode detected, using {fallback_model} "
f"for {tool_category.value} tool capacity estimation"
)
try:
provider = self.get_model_provider(fallback_model)
capabilities = provider.get_capabilities(fallback_model)
# Calculate content allocation based on model capacity
if capabilities.context_window < 300_000:
# Smaller context models: 60% content, 40% response
model_content_tokens = int(capabilities.context_window * 0.6)
else:
# Larger context models: 80% content, 20% response
model_content_tokens = int(capabilities.context_window * 0.8)
effective_max_tokens = model_content_tokens - reserve_tokens
logger.debug(
f"[FILES] {self.name}: Using {fallback_model} capacity for auto mode: "
f"{model_content_tokens:,} content tokens from {capabilities.context_window:,} total"
)
except (ValueError, AttributeError) as e:
# Handle specific errors: provider not found, model not supported, missing attributes
logger.warning(
f"[FILES] {self.name}: Could not get capabilities for fallback model {fallback_model}: {type(e).__name__}: {e}"
)
# Fall back to conservative default for safety
effective_max_tokens = 100_000 - reserve_tokens
except Exception as e:
# Catch any other unexpected errors
logger.error(
f"[FILES] {self.name}: Unexpected error getting model capabilities: {type(e).__name__}: {e}"
)
effective_max_tokens = 100_000 - reserve_tokens
else:
# Normal mode - use the specified model
try:
provider = self.get_model_provider(model_name)
capabilities = provider.get_capabilities(model_name)
# Calculate content allocation based on model capacity
if capabilities.context_window < 300_000:
# Smaller context models: 60% content, 40% response
model_content_tokens = int(capabilities.context_window * 0.6)
else:
# Larger context models: 80% content, 20% response
model_content_tokens = int(capabilities.context_window * 0.8)
effective_max_tokens = model_content_tokens - reserve_tokens
logger.debug(
f"[FILES] {self.name}: Using model-specific limit for {model_name}: "
f"{model_content_tokens:,} content tokens from {capabilities.context_window:,} total"
)
except (ValueError, AttributeError) as e:
# Handle specific errors: provider not found, model not supported, missing attributes
logger.warning(
f"[FILES] {self.name}: Could not get model capabilities for {model_name}: {type(e).__name__}: {e}"
)
# Fall back to conservative default for safety
effective_max_tokens = 100_000 - reserve_tokens
except Exception as e:
# Catch any other unexpected errors
logger.error(
f"[FILES] {self.name}: Unexpected error getting model capabilities: {type(e).__name__}: {e}"
)
effective_max_tokens = 100_000 - reserve_tokens
# This is now the single source of truth for token allocation.
model_context = self._model_context
try:
token_allocation = model_context.calculate_token_allocation()
# Standardize on `file_tokens` for consistency and correctness.
# This fixes the bug where the old code incorrectly used content_tokens
effective_max_tokens = token_allocation.file_tokens - reserve_tokens
logger.debug(
f"[FILES] {self.name}: Using model context for {model_context.model_name}: "
f"{token_allocation.file_tokens:,} file tokens from {token_allocation.total_tokens:,} total"
)
except Exception as e:
logger.error(
f"[FILES] {self.name}: Failed to calculate token allocation from model context: {e}", exc_info=True
)
# If the context exists but calculation fails, we still need to prevent a crash.
# A loud error is logged, and we fall back to a safe default.
effective_max_tokens = 100_000 - reserve_tokens
# Ensure we have a reasonable minimum budget
effective_max_tokens = max(1000, effective_max_tokens)
@@ -1087,8 +1049,14 @@ When recommending searches, be specific about what information you need and why
# Get model capabilities to check image support and size limits
try:
provider = self.get_model_provider(model_name)
capabilities = provider.get_capabilities(model_name)
# Use the already-resolved provider from model context if available
if hasattr(self, "_model_context") and self._model_context:
provider = self._model_context.provider
capabilities = self._model_context.capabilities
else:
# Fallback for edge cases (e.g., direct test calls)
provider = self.get_model_provider(model_name)
capabilities = provider.get_capabilities(model_name)
except Exception as e:
logger.warning(f"Failed to get capabilities for model {model_name}: {e}")
# Fall back to checking custom models configuration
@@ -1214,7 +1182,7 @@ When recommending searches, be specific about what information you need and why
return estimate_file_tokens(file_path)
def check_total_file_size(self, files: list[str]) -> Optional[dict[str, Any]]:
def check_total_file_size(self, files: list[str], model_name: str) -> Optional[dict[str, Any]]:
"""
Check if total file sizes would exceed token threshold before embedding.
@@ -1224,6 +1192,7 @@ When recommending searches, be specific about what information you need and why
Args:
files: List of file paths to check
model_name: The resolved model name to use for token limits
Returns:
Dict with `code_too_large` response if too large, None if acceptable
@@ -1231,13 +1200,6 @@ When recommending searches, be specific about what information you need and why
if not files:
return None
# Get current model name for context-aware thresholds
model_name = getattr(self, "_current_model_name", None)
if not model_name:
from config import DEFAULT_MODEL
model_name = DEFAULT_MODEL
# Use centralized file size checking with model context
from utils.file_utils import check_total_file_size as check_file_size_utility
@@ -1353,6 +1315,65 @@ When recommending searches, be specific about what information you need and why
# Extract and validate images from request
images = getattr(request, "images", None) or []
# MODEL RESOLUTION NOW HAPPENS AT MCP BOUNDARY
# Extract pre-resolved model context from server.py
model_context = self._current_arguments.get("_model_context")
resolved_model_name = self._current_arguments.get("_resolved_model_name")
if model_context and resolved_model_name:
# Model was already resolved at MCP boundary
model_name = resolved_model_name
logger.debug(f"Using pre-resolved model '{model_name}' from MCP boundary")
else:
# Fallback for direct execute calls
model_name = getattr(request, "model", None)
if not model_name:
from config import DEFAULT_MODEL
model_name = DEFAULT_MODEL
logger.debug(f"Using fallback model resolution for '{model_name}' (test mode)")
# For tests: Check if we should require model selection (auto mode)
if self._should_require_model_selection(model_name):
# Get suggested model based on tool category
from providers.registry import ModelProviderRegistry
tool_category = self.get_model_category()
suggested_model = ModelProviderRegistry.get_preferred_fallback_model(tool_category)
# Build error message based on why selection is required
if model_name.lower() == "auto":
error_message = (
f"Model parameter is required in auto mode. "
f"Suggested model for {self.name}: '{suggested_model}' "
f"(category: {tool_category.value})"
)
else:
# Model was specified but not available
available_models = self._get_available_models()
error_message = (
f"Model '{model_name}' is not available with current API keys. "
f"Available models: {', '.join(available_models)}. "
f"Suggested model for {self.name}: '{suggested_model}' "
f"(category: {tool_category.value})"
)
error_output = ToolOutput(
status="error",
content=error_message,
content_type="text",
)
return [TextContent(type="text", text=error_output.model_dump_json())]
# Create model context for tests
from utils.model_context import ModelContext
model_context = ModelContext(model_name)
# Store resolved model name for use by helper methods
self._current_model_name = model_name
self._model_context = model_context
# Check if we have continuation_id - if so, conversation history is already embedded
continuation_id = getattr(request, "continuation_id", None)
@@ -1389,57 +1410,11 @@ When recommending searches, be specific about what information you need and why
prompt = f"{prompt}\n\n{follow_up_instructions}"
logger.debug(f"Added follow-up instructions for new {self.name} conversation")
# Extract model configuration from request or use defaults
model_name = getattr(request, "model", None)
if not model_name:
from config import DEFAULT_MODEL
model_name = DEFAULT_MODEL
# Check if we need Claude to select a model
# This happens when:
# 1. The model is explicitly "auto"
# 2. The requested model is not available
if self._should_require_model_selection(model_name):
# Get suggested model based on tool category
from providers.registry import ModelProviderRegistry
tool_category = self.get_model_category()
suggested_model = ModelProviderRegistry.get_preferred_fallback_model(tool_category)
# Build error message based on why selection is required
if model_name.lower() == "auto":
error_message = (
f"Model parameter is required in auto mode. "
f"Suggested model for {self.name}: '{suggested_model}' "
f"(category: {tool_category.value})"
)
else:
# Model was specified but not available
# Get list of available models
available_models = self._get_available_models()
error_message = (
f"Model '{model_name}' is not available with current API keys. "
f"Available models: {', '.join(available_models)}. "
f"Suggested model for {self.name}: '{suggested_model}' "
f"(category: {tool_category.value})"
)
error_output = ToolOutput(
status="error",
content=error_message,
content_type="text",
)
return [TextContent(type="text", text=error_output.model_dump_json())]
# Store model name for use by helper methods like _prepare_file_content_for_prompt
# Only set this after auto mode validation to prevent "auto" being used as a model name
self._current_model_name = model_name
# Model name already resolved and stored in self._current_model_name earlier
# Validate images at MCP boundary if any were provided
if images:
image_validation_error = self._validate_image_limits(images, model_name, continuation_id)
image_validation_error = self._validate_image_limits(images, self._current_model_name, continuation_id)
if image_validation_error:
return [TextContent(type="text", text=json.dumps(image_validation_error))]
@@ -1451,10 +1426,10 @@ When recommending searches, be specific about what information you need and why
thinking_mode = self.get_default_thinking_mode()
# Get the appropriate model provider
provider = self.get_model_provider(model_name)
provider = self.get_model_provider(self._current_model_name)
# Validate and correct temperature for this model
temperature, temp_warnings = self._validate_and_correct_temperature(model_name, temperature)
temperature, temp_warnings = self._validate_and_correct_temperature(self._current_model_name, temperature)
# Log any temperature corrections
for warning in temp_warnings:
@@ -1465,16 +1440,21 @@ When recommending searches, be specific about what information you need and why
# Generate AI response using the provider
logger.info(f"Sending request to {provider.get_provider_type().value} API for {self.name}")
logger.info(f"Using model: {model_name} via {provider.get_provider_type().value} provider")
logger.debug(f"Prompt length: {len(prompt)} characters")
logger.info(f"Using model: {self._current_model_name} via {provider.get_provider_type().value} provider")
# Import token estimation utility
from utils.token_utils import estimate_tokens
estimated_tokens = estimate_tokens(prompt)
logger.debug(f"Prompt length: {len(prompt)} characters (~{estimated_tokens:,} tokens)")
# Generate content with provider abstraction
model_response = provider.generate_content(
prompt=prompt,
model_name=model_name,
model_name=self._current_model_name,
system_prompt=system_prompt,
temperature=temperature,
thinking_mode=thinking_mode if provider.supports_thinking_mode(model_name) else None,
thinking_mode=thinking_mode if provider.supports_thinking_mode(self._current_model_name) else None,
images=images if images else None, # Pass images via kwargs
)
@@ -1486,7 +1466,11 @@ When recommending searches, be specific about what information you need and why
# Parse response to check for clarification requests or format output
# Pass model info for conversation tracking
model_info = {"provider": provider, "model_name": model_name, "model_response": model_response}
model_info = {
"provider": provider,
"model_name": self._current_model_name,
"model_response": model_response,
}
tool_output = self._parse_response(raw_text, request, model_info)
logger.info(f"{self.name} tool completed successfully")
@@ -1894,8 +1878,14 @@ When recommending searches, be specific about what information you need and why
Tuple of (corrected_temperature, warning_messages)
"""
try:
provider = self.get_model_provider(model_name)
capabilities = provider.get_capabilities(model_name)
# Use the already-resolved provider and capabilities from model context
if hasattr(self, "_model_context") and self._model_context:
capabilities = self._model_context.capabilities
else:
# Fallback for edge cases (e.g., direct test calls)
provider = self.get_model_provider(model_name)
capabilities = provider.get_capabilities(model_name)
constraint = capabilities.temperature_constraint
warnings = []