From 77da7b17e655560769439b897272aaca1ef7563d Mon Sep 17 00:00:00 2001
From: Fahad
Date: Tue, 17 Jun 2025 11:29:45 +0400
Subject: [PATCH] Fixes bug pointed out by @dsaluja (https://github.com/dsaluja)

Fixes other providers not fixed by https://github.com/BeehiveInnovations/zen-mcp-server/pull/66

New regression tests
---
 config.py                                   |   4 +-
 providers/gemini.py                         |  22 +++-
 providers/openai_compatible.py              |  16 ++-
 test_enhanced_consensus.py                  | 138 --------------------
 test_line_number_accuracy.py                |  72 ----------
 test_line_numbers.py                        |  36 -----
 tests/test_gemini_token_usage.py            | 105 +++++++++++++++
 tests/test_openai_compatible_token_usage.py | 135 +++++++++++++++++++
 8 files changed, 270 insertions(+), 258 deletions(-)
 delete mode 100644 test_enhanced_consensus.py
 delete mode 100644 test_line_number_accuracy.py
 delete mode 100644 test_line_numbers.py
 create mode 100644 tests/test_gemini_token_usage.py
 create mode 100644 tests/test_openai_compatible_token_usage.py

diff --git a/config.py b/config.py
index 2005cd7..9e5c601 100644
--- a/config.py
+++ b/config.py
@@ -14,9 +14,9 @@ import os
 # These values are used in server responses and for tracking releases
 # IMPORTANT: This is the single source of truth for version and author info
 # Semantic versioning: MAJOR.MINOR.PATCH
-__version__ = "4.9.1"
+__version__ = "4.9.2"
 # Last update date in ISO format
-__updated__ = "2025-06-16"
+__updated__ = "2025-06-17"
 # Primary maintainer
 __author__ = "Fahad Gilani"
 
diff --git a/providers/gemini.py b/providers/gemini.py
index 8aec123..edfd27b 100644
--- a/providers/gemini.py
+++ b/providers/gemini.py
@@ -303,12 +303,26 @@ class GeminiModelProvider(ModelProvider):
         # Note: The actual structure depends on the SDK version and response format
         if hasattr(response, "usage_metadata"):
             metadata = response.usage_metadata
+
+            # Extract token counts with explicit None checks
+            input_tokens = None
+            output_tokens = None
+
             if hasattr(metadata, "prompt_token_count"):
-                usage["input_tokens"] = metadata.prompt_token_count
+                value = metadata.prompt_token_count
+                if value is not None:
+                    input_tokens = value
+                    usage["input_tokens"] = value
+
             if hasattr(metadata, "candidates_token_count"):
-                usage["output_tokens"] = metadata.candidates_token_count
-            if "input_tokens" in usage and "output_tokens" in usage:
-                usage["total_tokens"] = usage["input_tokens"] + usage["output_tokens"]
+                value = metadata.candidates_token_count
+                if value is not None:
+                    output_tokens = value
+                    usage["output_tokens"] = value
+
+            # Calculate total only if both values are available and valid
+            if input_tokens is not None and output_tokens is not None:
+                usage["total_tokens"] = input_tokens + output_tokens
 
         return usage
 
diff --git a/providers/openai_compatible.py b/providers/openai_compatible.py
index 5fa138e..d7c78a6 100644
--- a/providers/openai_compatible.py
+++ b/providers/openai_compatible.py
@@ -300,10 +300,13 @@ class OpenAICompatibleProvider(ModelProvider):
         if hasattr(response, "usage"):
             usage = self._extract_usage(response)
         elif hasattr(response, "input_tokens") and hasattr(response, "output_tokens"):
+            # Safely extract token counts with None handling
+            input_tokens = getattr(response, "input_tokens", 0) or 0
+            output_tokens = getattr(response, "output_tokens", 0) or 0
             usage = {
-                "input_tokens": getattr(response, "input_tokens", 0),
-                "output_tokens": getattr(response, "output_tokens", 0),
-                "total_tokens": getattr(response, "input_tokens", 0) + getattr(response, "output_tokens", 0),
+                "input_tokens": input_tokens,
+                "output_tokens": output_tokens,
"total_tokens": input_tokens + output_tokens, } return ModelResponse( @@ -607,9 +610,10 @@ class OpenAICompatibleProvider(ModelProvider): usage = {} if hasattr(response, "usage") and response.usage: - usage["input_tokens"] = getattr(response.usage, "prompt_tokens", 0) - usage["output_tokens"] = getattr(response.usage, "completion_tokens", 0) - usage["total_tokens"] = getattr(response.usage, "total_tokens", 0) + # Safely extract token counts with None handling + usage["input_tokens"] = getattr(response.usage, "prompt_tokens", 0) or 0 + usage["output_tokens"] = getattr(response.usage, "completion_tokens", 0) or 0 + usage["total_tokens"] = getattr(response.usage, "total_tokens", 0) or 0 return usage diff --git a/test_enhanced_consensus.py b/test_enhanced_consensus.py deleted file mode 100644 index a8765ec..0000000 --- a/test_enhanced_consensus.py +++ /dev/null @@ -1,138 +0,0 @@ -#!/usr/bin/env python3 -""" -Test script for the enhanced consensus tool with ModelConfig objects -""" - -import asyncio -import json -import sys - -from tools.consensus import ConsensusTool - - -async def test_enhanced_consensus(): - """Test the enhanced consensus tool with custom stance prompts""" - - print("๐Ÿงช Testing Enhanced Consensus Tool") - print("=" * 50) - - # Test all stance synonyms work - print("๐Ÿ“ Testing stance synonym normalization...") - tool = ConsensusTool() - - test_synonyms = [ - ("support", "for"), - ("favor", "for"), - ("oppose", "against"), - ("critical", "against"), - ("neutral", "neutral"), - ("for", "for"), - ("against", "against"), - # Test unknown stances default to neutral - ("maybe", "neutral"), - ("supportive", "neutral"), - ("random", "neutral"), - ] - - for input_stance, expected in test_synonyms: - normalized = tool._normalize_stance(input_stance) - status = "โœ…" if normalized == expected else "โŒ" - print(f"{status} '{input_stance}' โ†’ '{normalized}' (expected: '{expected}')") - - print() - - # Create consensus tool instance - tool = ConsensusTool() - - # Test arguments with new ModelConfig format - test_arguments = { - "prompt": "Should we add a pizza ordering button to our enterprise software?", - "models": [ - { - "model": "flash", - "stance": "support", # Test synonym - "stance_prompt": "You are a user experience advocate. Focus on how this feature could improve user engagement and satisfaction. Consider the human elements - how might this bring joy to users' workday? Think about unexpected benefits and creative use cases.", - }, - { - "model": "flash", - "stance": "oppose", # Test synonym - "stance_prompt": "You are a software architecture specialist. Focus on technical concerns: code maintainability, security implications, scope creep, and system complexity. 
-            },
-        ],
-        "focus_areas": ["user experience", "technical complexity", "business value"],
-        "temperature": 0.3,
-    }
-
-    try:
-        print("📝 Test Arguments:")
-        print(json.dumps(test_arguments, indent=2))
-        print()
-
-        print("🚀 Executing consensus tool...")
-
-        # Execute the tool
-        result = await tool.execute(test_arguments)
-
-        print("✅ Consensus tool execution completed!")
-        print()
-
-        # Parse and display results
-        if result and len(result) > 0:
-            response_text = result[0].text
-            try:
-                response_data = json.loads(response_text)
-                print("📊 Consensus Results:")
-                print(f"Status: {response_data.get('status', 'unknown')}")
-
-                if response_data.get("status") == "consensus_success":
-                    models_used = response_data.get("models_used", [])
-                    print(f"Models used: {', '.join(models_used)}")
-
-                    responses = response_data.get("responses", [])
-                    print(f"\n🎭 Individual Model Responses ({len(responses)} total):")
-
-                    for i, resp in enumerate(responses, 1):
-                        model = resp.get("model", "unknown")
-                        stance = resp.get("stance", "neutral")
-                        status = resp.get("status", "unknown")
-
-                        print(f"\n{i}. {model.upper()} ({stance} stance) - {status}")
-
-                        if status == "success":
-                            verdict = resp.get("verdict", "No verdict")
-                            custom_prompt = resp.get("metadata", {}).get("custom_stance_prompt", False)
-                            print(f"   Custom prompt used: {'Yes' if custom_prompt else 'No'}")
-                            print(f"   Verdict preview: {verdict[:200]}...")
-
-                            # Show stance normalization worked
-                            if stance in ["support", "oppose"]:
-                                expected = "for" if stance == "support" else "against"
-                                print(f"   ✅ Stance '{stance}' normalized correctly")
-                        else:
-                            error = resp.get("error", "Unknown error")
-                            print(f"   Error: {error}")
-
-                else:
-                    print(f"❌ Consensus failed: {response_data.get('error', 'Unknown error')}")
-
-            except json.JSONDecodeError:
-                print("📄 Raw response (not JSON):")
-                print(response_text[:500] + "..." if len(response_text) > 500 else response_text)
-        else:
-            print("❌ No response received from consensus tool")
-
-    except Exception as e:
-        print(f"❌ Test failed with exception: {str(e)}")
-        import traceback
-
-        traceback.print_exc()
-        return False
-
-    print("\n🎉 Enhanced consensus tool test completed!")
-    return True
-
-
-if __name__ == "__main__":
-    # Run the test
-    success = asyncio.run(test_enhanced_consensus())
-    sys.exit(0 if success else 1)
diff --git a/test_line_number_accuracy.py b/test_line_number_accuracy.py
deleted file mode 100644
index 7c11d3f..0000000
--- a/test_line_number_accuracy.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/usr/bin/env python3
-"""
-Test script to verify line number accuracy in the MCP server
-"""
-
-import asyncio
-import json
-
-from tools.analyze import AnalyzeTool
-from tools.chat import ChatTool
-
-
-async def test_line_number_reporting():
-    """Test if tools report accurate line numbers when analyzing code"""
-
-    print("=== Testing Line Number Accuracy ===\n")
-
-    # Test 1: Analyze tool with line numbers
-    analyze_tool = AnalyzeTool()
-
-    # Create a request that asks about specific line numbers
-    analyze_request = {
-        "files": ["/Users/fahad/Developer/gemini-mcp-server/test_line_numbers.py"],
-        "prompt": "Find all the lines where 'ignore_patterns' is assigned a list value. Report the exact line numbers.",
-        "model": "flash",  # Use a real model
-    }
-
-    print("1. Testing Analyze tool:")
-    print(f"   Prompt: {analyze_request['prompt']}")
-
-    try:
-        response = await analyze_tool.execute(analyze_request)
-        result = json.loads(response[0].text)
-
-        if result["status"] == "success":
-            print(f"   Response excerpt: {result['content'][:200]}...")
-        else:
-            print(f"   Error: {result}")
-    except Exception as e:
-        print(f"   Exception: {e}")
-
-    print("\n" + "=" * 50 + "\n")
-
-    # Test 2: Chat tool to simulate the user's scenario
-    chat_tool = ChatTool()
-
-    chat_request = {
-        "files": ["/Users/fahad/Developer/loganalyzer/main.py"],
-        "prompt": "Tell me the exact line number where 'ignore_patterns' is assigned a list in the file. Be precise about the line number.",
-        "model": "flash",
-    }
-
-    print("2. Testing Chat tool with user's actual file:")
-    print(f"   File: {chat_request['files'][0]}")
-    print(f"   Prompt: {chat_request['prompt']}")
-
-    try:
-        response = await chat_tool.execute(chat_request)
-        result = json.loads(response[0].text)
-
-        if result["status"] == "success":
-            print(f"   Response excerpt: {result['content'][:300]}...")
-        else:
-            print(f"   Error: {result}")
-    except Exception as e:
-        print(f"   Exception: {e}")
-
-    print("\n=== Test Complete ===")
-
-
-if __name__ == "__main__":
-    asyncio.run(test_line_number_reporting())
diff --git a/test_line_numbers.py b/test_line_numbers.py
deleted file mode 100644
index 8dcd963..0000000
--- a/test_line_numbers.py
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/env python3
-"""Test file to verify line number accuracy"""
-
-
-# Line 4: Empty line above
-def example_function():
-    """Line 6: Docstring"""
-    # Line 7: Comment
-    pass  # Line 8
-
-
-# Line 10: Another comment
-class TestClass:
-    """Line 12: Class docstring"""
-
-    def __init__(self):
-        """Line 15: Init docstring"""
-        # Line 16: This is where we'll test
-        self.test_variable = "Line 17"
-
-    def method_one(self):
-        """Line 20: Method docstring"""
-        # Line 21: Important assignment below
-        ignore_patterns = ["pattern1", "pattern2", "pattern3"]  # Line 22: This is our test line
-        return ignore_patterns
-
-
-# Line 25: More code below
-def another_function():
-    """Line 27: Another docstring"""
-    # Line 28: Another assignment
-    ignore_patterns = ["different", "patterns"]  # Line 29: Second occurrence
-    return ignore_patterns
-
-
-# Line 32: End of file marker
diff --git a/tests/test_gemini_token_usage.py b/tests/test_gemini_token_usage.py
new file mode 100644
index 0000000..e7a006f
--- /dev/null
+++ b/tests/test_gemini_token_usage.py
@@ -0,0 +1,105 @@
+"""Tests for Gemini provider token usage extraction."""
+
+import unittest
+from unittest.mock import Mock
+
+from providers.gemini import GeminiModelProvider
+
+
+class TestGeminiTokenUsage(unittest.TestCase):
+    """Test Gemini provider token usage handling."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        self.provider = GeminiModelProvider("test-key")
+
+    def test_extract_usage_with_valid_tokens(self):
+        """Test token extraction with valid token counts."""
+        response = Mock()
+        response.usage_metadata = Mock()
+        response.usage_metadata.prompt_token_count = 100
+        response.usage_metadata.candidates_token_count = 50
+
+        usage = self.provider._extract_usage(response)
+
+        self.assertEqual(usage["input_tokens"], 100)
+        self.assertEqual(usage["output_tokens"], 50)
+        self.assertEqual(usage["total_tokens"], 150)
+
+    def test_extract_usage_with_none_input_tokens(self):
+        """Test token extraction when input_tokens is None (regression test for bug)."""
+        response = Mock()
+        response.usage_metadata = Mock()
+        response.usage_metadata.prompt_token_count = None  # This was causing crashes
+        response.usage_metadata.candidates_token_count = 50
+
+        usage = self.provider._extract_usage(response)
+
+        # Should not include input_tokens when None
+        self.assertNotIn("input_tokens", usage)
+        self.assertEqual(usage["output_tokens"], 50)
+        # Should not calculate total_tokens when input is None
+        self.assertNotIn("total_tokens", usage)
+
+    def test_extract_usage_with_none_output_tokens(self):
+        """Test token extraction when output_tokens is None (regression test for bug)."""
+        response = Mock()
+        response.usage_metadata = Mock()
+        response.usage_metadata.prompt_token_count = 100
+        response.usage_metadata.candidates_token_count = None  # This was causing crashes
+
+        usage = self.provider._extract_usage(response)
+
+        self.assertEqual(usage["input_tokens"], 100)
+        # Should not include output_tokens when None
+        self.assertNotIn("output_tokens", usage)
+        # Should not calculate total_tokens when output is None
+        self.assertNotIn("total_tokens", usage)
+
+    def test_extract_usage_with_both_none_tokens(self):
+        """Test token extraction when both token counts are None."""
+        response = Mock()
+        response.usage_metadata = Mock()
+        response.usage_metadata.prompt_token_count = None
+        response.usage_metadata.candidates_token_count = None
+
+        usage = self.provider._extract_usage(response)
+
+        # Should return empty dict when all tokens are None
+        self.assertEqual(usage, {})
+
+    def test_extract_usage_without_usage_metadata(self):
+        """Test token extraction when response has no usage_metadata."""
+        response = Mock(spec=[])
+
+        usage = self.provider._extract_usage(response)
+
+        # Should return empty dict
+        self.assertEqual(usage, {})
+
+    def test_extract_usage_with_zero_tokens(self):
+        """Test token extraction with zero token counts."""
+        response = Mock()
+        response.usage_metadata = Mock()
+        response.usage_metadata.prompt_token_count = 0
+        response.usage_metadata.candidates_token_count = 0
+
+        usage = self.provider._extract_usage(response)
+
+        self.assertEqual(usage["input_tokens"], 0)
+        self.assertEqual(usage["output_tokens"], 0)
+        self.assertEqual(usage["total_tokens"], 0)
+
+    def test_extract_usage_missing_attributes(self):
+        """Test token extraction when metadata lacks token count attributes."""
+        response = Mock()
+        response.usage_metadata = Mock(spec=[])
+
+        usage = self.provider._extract_usage(response)
+
+        # Should return empty dict when attributes are missing
+        self.assertEqual(usage, {})
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_openai_compatible_token_usage.py b/tests/test_openai_compatible_token_usage.py
new file mode 100644
index 0000000..7060b58
--- /dev/null
+++ b/tests/test_openai_compatible_token_usage.py
@@ -0,0 +1,135 @@
+"""Tests for OpenAI-compatible provider token usage extraction."""
+
+import unittest
+from unittest.mock import Mock
+
+from providers.openai_compatible import OpenAICompatibleProvider
+
+
+class TestOpenAICompatibleTokenUsage(unittest.TestCase):
+    """Test OpenAI-compatible provider token usage handling."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+
+        # Create a concrete implementation for testing
+        class TestProvider(OpenAICompatibleProvider):
+            FRIENDLY_NAME = "Test"
+            SUPPORTED_MODELS = {"test-model": {"context_window": 4096}}
+
+            def get_capabilities(self, model_name):
+                return Mock()
+
+            def get_provider_type(self):
+                return Mock()
+
+            def validate_model_name(self, model_name):
+                return True
+
+        self.provider = TestProvider("test-key")
+
+    def test_extract_usage_with_valid_tokens(self):
+        """Test token extraction with valid token counts."""
+        response = Mock()
+        response.usage = Mock()
+        response.usage.prompt_tokens = 100
+        response.usage.completion_tokens = 50
+        response.usage.total_tokens = 150
+
+        usage = self.provider._extract_usage(response)
+
+        self.assertEqual(usage["input_tokens"], 100)
+        self.assertEqual(usage["output_tokens"], 50)
+        self.assertEqual(usage["total_tokens"], 150)
+
+    def test_extract_usage_with_none_prompt_tokens(self):
+        """Test token extraction when prompt_tokens is None (regression test for bug)."""
+        response = Mock()
+        response.usage = Mock()
+        response.usage.prompt_tokens = None  # This was causing crashes
+        response.usage.completion_tokens = 50
+        response.usage.total_tokens = None
+
+        usage = self.provider._extract_usage(response)
+
+        # Should default to 0 when None
+        self.assertEqual(usage["input_tokens"], 0)
+        self.assertEqual(usage["output_tokens"], 50)
+        self.assertEqual(usage["total_tokens"], 0)
+
+    def test_extract_usage_with_none_completion_tokens(self):
+        """Test token extraction when completion_tokens is None (regression test for bug)."""
+        response = Mock()
+        response.usage = Mock()
+        response.usage.prompt_tokens = 100
+        response.usage.completion_tokens = None  # This was causing crashes
+        response.usage.total_tokens = None
+
+        usage = self.provider._extract_usage(response)
+
+        self.assertEqual(usage["input_tokens"], 100)
+        # Should default to 0 when None
+        self.assertEqual(usage["output_tokens"], 0)
+        self.assertEqual(usage["total_tokens"], 0)
+
+    def test_extract_usage_with_all_none_tokens(self):
+        """Test token extraction when all token counts are None."""
+        response = Mock()
+        response.usage = Mock()
+        response.usage.prompt_tokens = None
+        response.usage.completion_tokens = None
+        response.usage.total_tokens = None
+
+        usage = self.provider._extract_usage(response)
+
+        # Should default to 0 for all when None
+        self.assertEqual(usage["input_tokens"], 0)
+        self.assertEqual(usage["output_tokens"], 0)
+        self.assertEqual(usage["total_tokens"], 0)
+
+    def test_extract_usage_without_usage(self):
+        """Test token extraction when response has no usage."""
+        response = Mock(spec=[])  # No usage attribute
+
+        usage = self.provider._extract_usage(response)
+
+        # Should return empty dict
+        self.assertEqual(usage, {})
+
+    def test_extract_usage_with_zero_tokens(self):
+        """Test token extraction with zero token counts."""
+        response = Mock()
+        response.usage = Mock()
+        response.usage.prompt_tokens = 0
+        response.usage.completion_tokens = 0
+        response.usage.total_tokens = 0
+
+        usage = self.provider._extract_usage(response)
+
+        self.assertEqual(usage["input_tokens"], 0)
+        self.assertEqual(usage["output_tokens"], 0)
+        self.assertEqual(usage["total_tokens"], 0)
+
+    def test_alternative_token_format_with_none(self):
+        """Test alternative token format (input_tokens/output_tokens) with None values."""
+        # This tests the other code path in generate_content_openai_responses
+        # Simulate a response with input_tokens/output_tokens attributes that could be None
+        response = Mock()
+        response.input_tokens = None  # This was causing crashes
+        response.output_tokens = 50
+
+        # Test the pattern: getattr(response, "input_tokens", 0) or 0
+        input_tokens = getattr(response, "input_tokens", 0) or 0
+        output_tokens = getattr(response, "output_tokens", 0) or 0
+
+        # Should not crash and should handle None gracefully
+        self.assertEqual(input_tokens, 0)
+        self.assertEqual(output_tokens, 50)
+
+        # Test that addition works
+        total = input_tokens + output_tokens
+        self.assertEqual(total, 50)
+
+
+if __name__ == "__main__":
+    unittest.main()