From 5cd4908e32c41e5feb873fafef399e8ce0a2a71a Mon Sep 17 00:00:00 2001
From: Fahad
Date: Mon, 9 Jun 2025 05:22:22 +0400
Subject: [PATCH] fix: increase output token limit to prevent response truncation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add MAX_OUTPUT_TOKENS constant set to 32,768 (Gemini 2.5 Pro's limit)
- Update all tools and chat handler to use MAX_OUTPUT_TOKENS
- Add comprehensive tests for output token configuration
- Update README with configuration details and system prompt docs

This fixes the issue where Gemini responses were being cut off at 8192
tokens, causing Claude to repeatedly ask for the same analysis.

Fixes #1

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 README.md                   |  51 ++++++++++-
 config.py                   |   1 +
 server.py                   |   4 +-
 tests/test_config.py        |   8 +-
 tests/test_output_tokens.py | 165 ++++++++++++++++++++++++++++++++++++
 tools/base.py               |   6 +-
 6 files changed, 226 insertions(+), 9 deletions(-)
 create mode 100644 tests/test_output_tokens.py

diff --git a/README.md b/README.md
index 4d0ad97..efa6b9b 100644
--- a/README.md
+++ b/README.md
@@ -262,6 +262,30 @@ Tools can reference files for additional context:
 "Get gemini to think deeper about my design, reference the current architecture.md"
 ```
 
+## Configuration
+
+The server includes several configurable properties that control its behavior:
+
+### Model Configuration
+- **`DEFAULT_MODEL`**: `"gemini-2.5-pro-preview-06-05"` - The default Gemini model used
+- **`MAX_CONTEXT_TOKENS`**: `1,000,000` - Maximum input context (1M tokens for Gemini 2.5 Pro)
+- **`MAX_OUTPUT_TOKENS`**: `32,768` - Maximum output tokens per response
+
+### Temperature Defaults
+Different tools use optimized temperature settings:
+- **`TEMPERATURE_ANALYTICAL`**: `0.2` - Used for code review and debugging (focused, deterministic)
+- **`TEMPERATURE_BALANCED`**: `0.5` - Used for general chat (balanced creativity/accuracy)
+- **`TEMPERATURE_CREATIVE`**: `0.7` - Used for deep thinking and architecture (more creative)
+
+### Customizing Output Length
+Each tool accepts an optional `max_tokens` parameter to override the default:
+```
+"Use gemini to analyze main.py with max_tokens 16000"
+"Get gemini to think deeper about this design with max_tokens 50000"
+```
+
+Note: The maximum supported output is 32,768 tokens for Gemini 2.5 Pro.
+
 ## Installation
 
 1. Clone the repository:
@@ -286,14 +310,37 @@ Tools can reference files for additional context:
    export GEMINI_API_KEY="your-api-key-here"
    ```
 
+## How System Prompts Work
+
+The server uses carefully crafted system prompts to give each tool specialized expertise:
+
+### Prompt Architecture
+- **Centralized Prompts**: All system prompts are defined in `prompts/tool_prompts.py`
+- **Tool Integration**: Each tool inherits from `BaseTool` and implements `get_system_prompt()`
+- **Prompt Flow**: `User Request → Tool Selection → System Prompt + Context → Gemini Response`
+
+### Specialized Expertise
+Each tool has a unique system prompt that defines its role and approach:
+- **`think_deeper`**: Acts as a senior development partner, challenging assumptions and finding edge cases
+- **`review_code`**: Expert code reviewer with security/performance focus, uses severity levels
+- **`debug_issue`**: Systematic debugger providing root cause analysis and prevention strategies
+- **`analyze`**: Code analyst focusing on architecture, patterns, and actionable insights
+
+### Customization
+To modify tool behavior, you can:
+1. Edit prompts in `prompts/tool_prompts.py` for global changes
+2. Override `get_system_prompt()` in a tool class for tool-specific changes
+3. Use the `temperature` parameter to adjust response style (0.2 for focused, 0.7 for creative)
+
 ## Contributing
 
 We welcome contributions! The modular architecture makes it easy to add new tools:
 
 1. Create a new tool in `tools/`
 2. Inherit from `BaseTool`
-3. Implement required methods
-4. Add to `TOOLS` in `server.py`
+3. Implement required methods (including `get_system_prompt()`)
+4. Add your system prompt to `prompts/tool_prompts.py`
+5. Register your tool in the `TOOLS` dict in `server.py`
 
 See existing tools for examples.
 
diff --git a/config.py b/config.py
index b60218d..4ec7009 100644
--- a/config.py
+++ b/config.py
@@ -10,6 +10,7 @@ __author__ = "Fahad Gilani"
 # Model configuration
 DEFAULT_MODEL = "gemini-2.5-pro-preview-06-05"
 MAX_CONTEXT_TOKENS = 1_000_000  # 1M tokens for Gemini Pro
+MAX_OUTPUT_TOKENS = 32_768  # Maximum output tokens for Gemini 2.5 Pro
 
 # Temperature defaults for different tool types
 TEMPERATURE_ANALYTICAL = 0.2  # For code review, debugging
diff --git a/server.py b/server.py
index 4dae804..2ac214d 100644
--- a/server.py
+++ b/server.py
@@ -15,7 +15,7 @@ from mcp.server.models import InitializationOptions
 from mcp.server.stdio import stdio_server
 from mcp.types import TextContent, Tool
 
-from config import (DEFAULT_MODEL, MAX_CONTEXT_TOKENS, __author__, __updated__,
+from config import (DEFAULT_MODEL, MAX_CONTEXT_TOKENS, MAX_OUTPUT_TOKENS, __author__, __updated__,
                     __version__)
 from tools import AnalyzeTool, DebugIssueTool, ReviewCodeTool, ThinkDeeperTool
 
@@ -160,7 +160,7 @@ async def handle_chat(arguments: Dict[str, Any]) -> List[TextContent]:
         model_name=DEFAULT_MODEL,
         generation_config={
             "temperature": temperature,
-            "max_output_tokens": 8192,
+            "max_output_tokens": MAX_OUTPUT_TOKENS,
             "candidate_count": 1,
         },
     )
diff --git a/tests/test_config.py b/tests/test_config.py
index 485e30d..fbcaa16 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -2,9 +2,10 @@
 Tests for configuration
 """
 
-from config import (DEFAULT_MODEL, MAX_CONTEXT_TOKENS, TEMPERATURE_ANALYTICAL,
-                    TEMPERATURE_BALANCED, TEMPERATURE_CREATIVE, TOOL_TRIGGERS,
-                    __author__, __updated__, __version__)
+from config import (DEFAULT_MODEL, MAX_CONTEXT_TOKENS, MAX_OUTPUT_TOKENS,
+                    TEMPERATURE_ANALYTICAL, TEMPERATURE_BALANCED,
+                    TEMPERATURE_CREATIVE, TOOL_TRIGGERS, __author__,
+                    __updated__, __version__)
 
 
 class TestConfig:
@@ -20,6 +21,7 @@ class TestConfig:
         """Test model configuration"""
         assert DEFAULT_MODEL == "gemini-2.5-pro-preview-06-05"
         assert MAX_CONTEXT_TOKENS == 1_000_000
+        assert MAX_OUTPUT_TOKENS == 32_768
 
     def test_temperature_defaults(self):
         """Test temperature constants"""
diff --git a/tests/test_output_tokens.py b/tests/test_output_tokens.py
new file mode 100644
index 0000000..484b949
--- /dev/null
+++ b/tests/test_output_tokens.py
@@ -0,0 +1,165 @@
+"""
+Tests for MAX_OUTPUT_TOKENS configuration
+"""
+
+from unittest.mock import Mock, patch
+
+import pytest
+
+from config import MAX_OUTPUT_TOKENS
+from tools.base import BaseTool, ToolRequest
+
+
+class TestMaxOutputTokens:
+    """Test that MAX_OUTPUT_TOKENS is properly applied"""
+
+    def test_max_output_tokens_value(self):
+        """Test the MAX_OUTPUT_TOKENS constant value"""
+        assert MAX_OUTPUT_TOKENS == 32_768
+
+    def test_tool_request_default_max_tokens(self):
+        """Test that ToolRequest has correct default max_tokens"""
+        request = ToolRequest()
+        assert request.max_tokens == MAX_OUTPUT_TOKENS
+
+    @pytest.mark.asyncio
+    @patch("google.generativeai.GenerativeModel")
+    async def test_base_tool_uses_max_output_tokens(self, mock_model):
+        """Test that BaseTool properly uses MAX_OUTPUT_TOKENS in model creation"""
+
+        # Create a concrete implementation of BaseTool for testing
+        class TestTool(BaseTool):
+            def get_name(self):
+                return "test_tool"
+
+            def get_description(self):
+                return "Test tool"
+
+            def get_input_schema(self):
+                return {
+                    "type": "object",
+                    "properties": {
+                        "test": {"type": "string"}
+                    },
+                    "required": ["test"]
+                }
+
+            def get_system_prompt(self):
+                return "Test prompt"
+
+            def get_request_model(self):
+                class TestRequest(ToolRequest):
+                    test: str
+                return TestRequest
+
+            async def prepare_prompt(self, request):
+                return f"Test: {request.test}"
+
+        # Mock response
+        mock_response = Mock()
+        mock_response.candidates = [Mock()]
+        mock_response.candidates[0].content.parts = [Mock(text="Test response")]
+
+        mock_instance = Mock()
+        mock_instance.generate_content.return_value = mock_response
+        mock_model.return_value = mock_instance
+
+        # Execute tool
+        tool = TestTool()
+        await tool.execute({"test": "value"})
+
+        # Verify model was created with MAX_OUTPUT_TOKENS
+        mock_model.assert_called_once()
+        call_args = mock_model.call_args
+
+        # Check generation_config
+        assert "generation_config" in call_args[1]
+        config = call_args[1]["generation_config"]
+        assert config["max_output_tokens"] == MAX_OUTPUT_TOKENS
+
+    @pytest.mark.asyncio
+    @patch("google.generativeai.GenerativeModel")
+    async def test_custom_max_tokens_override(self, mock_model):
+        """Test that custom max_tokens value overrides the default"""
+
+        class TestTool(BaseTool):
+            def get_name(self):
+                return "test_tool"
+
+            def get_description(self):
+                return "Test tool"
+
+            def get_input_schema(self):
+                return {
+                    "type": "object",
+                    "properties": {
+                        "test": {"type": "string"},
+                        "max_tokens": {"type": "integer"}
+                    },
+                    "required": ["test"]
+                }
+
+            def get_system_prompt(self):
+                return "Test prompt"
+
+            def get_request_model(self):
+                class TestRequest(ToolRequest):
+                    test: str
+                return TestRequest
+
+            async def prepare_prompt(self, request):
+                return f"Test: {request.test}"
+
+        # Mock response
+        mock_response = Mock()
+        mock_response.candidates = [Mock()]
+        mock_response.candidates[0].content.parts = [Mock(text="Test response")]
+
+        mock_instance = Mock()
+        mock_instance.generate_content.return_value = mock_response
+        mock_model.return_value = mock_instance
+
+        # Execute tool with custom max_tokens
+        tool = TestTool()
+        custom_max_tokens = 16384
+        await tool.execute({"test": "value", "max_tokens": custom_max_tokens})
+
+        # Verify model was created with custom max_tokens
+        mock_model.assert_called_once()
+        call_args = mock_model.call_args
+
+        # Check generation_config
+        assert "generation_config" in call_args[1]
+        config = call_args[1]["generation_config"]
+        assert config["max_output_tokens"] == custom_max_tokens
+
+
+class TestServerMaxOutputTokens:
+    """Test that server.py properly uses MAX_OUTPUT_TOKENS"""
+
+    @pytest.mark.asyncio
+    @patch("google.generativeai.GenerativeModel")
+    async def test_handle_chat_uses_max_output_tokens(self, mock_model):
+        """Test that handle_chat uses MAX_OUTPUT_TOKENS"""
+        from server import handle_chat
+
+        # Mock response
+        mock_response = Mock()
+        mock_response.candidates = [Mock()]
+        mock_response.candidates[0].content.parts = [Mock(text="Chat response")]
+
+        mock_instance = Mock()
+        mock_instance.generate_content.return_value = mock_response
+        mock_model.return_value = mock_instance
+
+        # Call handle_chat
+        await handle_chat({"prompt": "Test question"})
+
+        # Verify model was created with MAX_OUTPUT_TOKENS
+        mock_model.assert_called_once()
+        call_args = mock_model.call_args
+
+        # Check generation_config
+        assert "generation_config" in call_args[1]
+        config = call_args[1]["generation_config"]
+        assert config["max_output_tokens"] == MAX_OUTPUT_TOKENS
diff --git a/tools/base.py b/tools/base.py
index 54e1c70..8ad5fe0 100644
--- a/tools/base.py
+++ b/tools/base.py
@@ -9,6 +9,8 @@ import google.generativeai as genai
 from mcp.types import TextContent
 from pydantic import BaseModel, Field
 
+from config import MAX_OUTPUT_TOKENS
+
 
 class ToolRequest(BaseModel):
     """Base request model for all tools"""
@@ -17,7 +19,7 @@ class ToolRequest(BaseModel):
         None, description="Model to use (defaults to Gemini 2.5 Pro)"
     )
     max_tokens: Optional[int] = Field(
-        8192, description="Maximum number of tokens in response"
+        MAX_OUTPUT_TOKENS, description="Maximum number of tokens in response"
    )
     temperature: Optional[float] = Field(
         None, description="Temperature for response (tool-specific defaults)"
@@ -78,7 +80,7 @@ class BaseTool(ABC):
         temperature = getattr(request, "temperature", None)
         if temperature is None:
             temperature = self.get_default_temperature()
-        max_tokens = getattr(request, "max_tokens", 8192)
+        max_tokens = getattr(request, "max_tokens", MAX_OUTPUT_TOKENS)
 
         # Create and configure model
         model = self.create_model(model_name, temperature, max_tokens)
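For reference, a minimal sketch (not part of the patch) of how the new default and a per-request override interact, assuming the `config` and `tools.base` modules from this patch are importable and `ToolRequest` keeps the fields shown above:

```python
from config import MAX_OUTPUT_TOKENS  # 32_768 after this patch
from tools.base import ToolRequest

# Without an explicit max_tokens, requests now default to the higher ceiling
# instead of the old hard-coded 8192.
default_request = ToolRequest()
assert default_request.max_tokens == MAX_OUTPUT_TOKENS

# A caller can still pass a smaller cap per request; the tool forwards it
# into the Gemini generation_config the same way handle_chat does above.
capped_request = ToolRequest(max_tokens=16_384)
generation_config = {
    "temperature": 0.5,
    "max_output_tokens": capped_request.max_tokens,
    "candidate_count": 1,
}
```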