"""
|
|
Model context management for dynamic token allocation.
|
|
|
|
This module provides a clean abstraction for model-specific token management,
|
|
ensuring that token limits are properly calculated based on the current model
|
|
being used, not global constants.
|
|
|
|
CONVERSATION MEMORY INTEGRATION:
|
|
This module works closely with the conversation memory system to provide
|
|
optimal token allocation for multi-turn conversations:
|
|
|
|
1. DUAL PRIORITIZATION STRATEGY SUPPORT:
|
|
- Provides separate token budgets for conversation history vs. files
|
|
- Enables the conversation memory system to apply newest-first prioritization
|
|
- Ensures optimal balance between context preservation and new content
|
|
|
|
2. MODEL-SPECIFIC ALLOCATION:
|
|
- Dynamic allocation based on model capabilities (context window size)
|
|
- Conservative allocation for smaller models (O3: 200K context)
|
|
- Generous allocation for larger models (Gemini: 1M+ context)
|
|
- Adapts token distribution ratios based on model capacity
|
|
|
|
3. CROSS-TOOL CONSISTENCY:
|
|
- Provides consistent token budgets across different tools
|
|
- Enables seamless conversation continuation between tools
|
|
- Supports conversation reconstruction with proper budget management
|
|
"""
|
|
|
|
import logging
|
|
from dataclasses import dataclass
|
|
from typing import Any, Optional
|
|
|
|
from config import DEFAULT_MODEL
|
|
from providers import ModelCapabilities, ModelProviderRegistry
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
|
|
class TokenAllocation:
|
|
"""Token allocation strategy for a model."""
|
|
|
|
total_tokens: int
|
|
content_tokens: int
|
|
response_tokens: int
|
|
file_tokens: int
|
|
history_tokens: int
|
|
|
|
@property
|
|
def available_for_prompt(self) -> int:
|
|
"""Tokens available for the actual prompt after allocations."""
|
|
return self.content_tokens - self.file_tokens - self.history_tokens
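
# Example (hypothetical numbers, mirroring the small-model ratios used by
# ModelContext.calculate_token_allocation below):
#
#     allocation = TokenAllocation(
#         total_tokens=200_000,
#         content_tokens=120_000,   # 60% of total
#         response_tokens=80_000,   # 40% of total
#         file_tokens=36_000,       # 30% of content
#         history_tokens=60_000,    # 50% of content
#     )
#     allocation.available_for_prompt  # 120_000 - 36_000 - 60_000 == 24_000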


class ModelContext:
    """
    Encapsulates model-specific information and token calculations.

    This class provides a single source of truth for all model-related
    token calculations, ensuring consistency across the system.
    """

    def __init__(self, model_name: str, model_option: Optional[str] = None):
        self.model_name = model_name
        self.model_option = model_option  # Store optional model option (e.g., "for", "against", etc.)
        self._provider = None
        self._capabilities = None
        self._token_allocation = None

    @property
    def provider(self):
        """Get the model provider lazily."""
        if self._provider is None:
            self._provider = ModelProviderRegistry.get_provider_for_model(self.model_name)
            if not self._provider:
                available_models = ModelProviderRegistry.get_available_models()
                raise ValueError(f"Model '{self.model_name}' is not available. Available models: {available_models}")
        return self._provider

    @property
    def capabilities(self) -> ModelCapabilities:
        """Get model capabilities lazily."""
        if self._capabilities is None:
            self._capabilities = self.provider.get_capabilities(self.model_name)
        return self._capabilities

    def calculate_token_allocation(self, reserved_for_response: Optional[int] = None) -> TokenAllocation:
        """
        Calculate token allocation based on model capacity and conversation requirements.

        This method implements the core token budget calculation that supports the
        dual prioritization strategy used in conversation memory and file processing:

        TOKEN ALLOCATION STRATEGY:
        1. CONTENT vs RESPONSE SPLIT:
           - Smaller models (< 300K): 60% content, 40% response (conservative)
           - Larger models (≥ 300K): 80% content, 20% response (generous)

        2. CONTENT SUB-ALLOCATION:
           - File tokens: 30-40% of content budget for newest file versions
           - History tokens: 40-50% of content budget for conversation context
           - Remaining: Available for tool-specific prompt content

        3. CONVERSATION MEMORY INTEGRATION:
           - History allocation enables conversation reconstruction in reconstruct_thread_context()
           - File allocation supports newest-first file prioritization in tools
           - Remaining budget passed to tools via _remaining_tokens parameter

        Args:
            reserved_for_response: Override response token reservation

        Returns:
            TokenAllocation with calculated budgets for dual prioritization strategy
        """
        total_tokens = self.capabilities.context_window

        # Dynamic allocation based on model capacity
        if total_tokens < 300_000:
            # Smaller context models (O3): Conservative allocation
            content_ratio = 0.6  # 60% for content
            response_ratio = 0.4  # 40% for response
            file_ratio = 0.3  # 30% of content for files
            history_ratio = 0.5  # 50% of content for history
        else:
            # Larger context models (Gemini): More generous allocation
            content_ratio = 0.8  # 80% for content
            response_ratio = 0.2  # 20% for response
            file_ratio = 0.4  # 40% of content for files
            history_ratio = 0.4  # 40% of content for history

        # Calculate allocations
        content_tokens = int(total_tokens * content_ratio)
        response_tokens = reserved_for_response or int(total_tokens * response_ratio)

        # Sub-allocations within content budget
        file_tokens = int(content_tokens * file_ratio)
        history_tokens = int(content_tokens * history_ratio)

        allocation = TokenAllocation(
            total_tokens=total_tokens,
            content_tokens=content_tokens,
            response_tokens=response_tokens,
            file_tokens=file_tokens,
            history_tokens=history_tokens,
        )

        logger.debug(f"Token allocation for {self.model_name}:")
        logger.debug(f"  Total: {allocation.total_tokens:,}")
        logger.debug(f"  Content: {allocation.content_tokens:,} ({content_ratio:.0%})")
        logger.debug(f"  Response: {allocation.response_tokens:,} ({response_ratio:.0%})")
        logger.debug(f"  Files: {allocation.file_tokens:,} ({file_ratio:.0%} of content)")
        logger.debug(f"  History: {allocation.history_tokens:,} ({history_ratio:.0%} of content)")

        return allocation
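
    # Worked example for calculate_token_allocation (hypothetical 200K-token
    # context window, i.e. the small-model branch above):
    #
    #     content_tokens  = int(200_000 * 0.6)  # 120,000
    #     response_tokens = int(200_000 * 0.4)  #  80,000
    #     file_tokens     = int(120_000 * 0.3)  #  36,000
    #     history_tokens  = int(120_000 * 0.5)  #  60,000
    #
    # TokenAllocation.available_for_prompt is then
    # 120,000 - 36,000 - 60,000 = 24,000 tokens for the tool's own prompt.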

    def estimate_tokens(self, text: str) -> int:
        """
        Estimate token count for text using a model-specific tokenizer.

        For now, uses simple estimation. Can be enhanced with model-specific
        tokenizers (tiktoken for OpenAI, etc.) in the future.
        """
        # TODO: Integrate model-specific tokenizers
        # For now, use a conservative character-based estimate (~3 chars/token)
        return len(text) // 3  # Conservative estimate
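
    def _estimate_tokens_tiktoken(self, text: str) -> int:
        # Hypothetical sketch of the TODO above, NOT currently used anywhere:
        # assumes the optional `tiktoken` package and an OpenAI-style model
        # name, and falls back to the same character heuristic when tiktoken
        # is missing or does not recognize the model.
        try:
            import tiktoken

            encoding = tiktoken.encoding_for_model(self.model_name)
            return len(encoding.encode(text))
        except (ImportError, KeyError):
            return len(text) // 3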

    @classmethod
    def from_arguments(cls, arguments: dict[str, Any]) -> "ModelContext":
        """Create ModelContext from tool arguments."""
        model_name = arguments.get("model") or DEFAULT_MODEL
        return cls(model_name)
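

if __name__ == "__main__":
    # Minimal usage sketch (assumes at least one provider is configured so
    # that DEFAULT_MODEL resolves; otherwise the provider lookup raises).
    context = ModelContext.from_arguments({"model": DEFAULT_MODEL})
    allocation = context.calculate_token_allocation()
    print(f"{context.model_name}: {allocation.available_for_prompt:,} tokens available for prompt")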