#!/usr/bin/env python3
"""
Model Thinking Configuration Test

Tests that thinking configuration is properly applied only to models that support it,
and that Flash models work correctly without thinking config.
"""

from .base_test import BaseSimulatorTest


class TestModelThinkingConfig(BaseSimulatorTest):
    """Test model-specific thinking configuration behavior"""

    @property
    def test_name(self) -> str:
        return "model_thinking_config"

    @property
    def test_description(self) -> str:
        return "Model-specific thinking configuration behavior"

    def test_pro_model_with_thinking_config(self):
        """Test that Pro model uses thinking configuration"""
        self.logger.info("Testing Pro model with thinking configuration...")

        try:
            # Test with explicit pro model and high thinking mode
            response, continuation_id = self.call_mcp_tool(
                "chat",
                {
                    "prompt": "What is 2 + 2? Please think carefully and explain.",
                    "model": "pro",  # Should resolve to gemini-2.5-pro
                    "thinking_mode": "high",  # Should use thinking_config
                },
            )

            if not response:
                raise Exception("Pro model test failed: No response received")

            self.logger.info("✅ Pro model with thinking config works correctly")
            return True

        except Exception as e:
            self.logger.error(f"❌ Pro model test failed: {e}")
            return False

    def test_flash_model_without_thinking_config(self):
        """Test that Flash model works without thinking configuration"""
        self.logger.info("Testing Flash model without thinking configuration...")

        try:
            # Test with explicit flash model and thinking mode (should be ignored)
            response, continuation_id = self.call_mcp_tool(
                "chat",
                {
                    "prompt": "What is 3 + 3? Give a quick answer.",
                    "model": "flash",  # Should resolve to gemini-2.5-flash
                    "thinking_mode": "high",  # Should be ignored for Flash model
                },
            )

            if not response:
                raise Exception("Flash model test failed: No response received")

            self.logger.info("✅ Flash model without thinking config works correctly")
            return True

        except Exception as e:
            if "thinking" in str(e).lower() and ("not supported" in str(e).lower() or "invalid" in str(e).lower()):
                raise Exception(f"Flash model incorrectly tried to use thinking config: {e}")
            self.logger.error(f"❌ Flash model test failed: {e}")
            return False

    def test_model_resolution_logic(self):
        """Test that model resolution works correctly for both shortcuts and full names"""
        self.logger.info("Testing model resolution logic...")

        test_cases = [
            ("pro", "should work with Pro model"),
            ("flash", "should work with Flash model"),
            ("gemini-2.5-pro", "should work with full Pro model name"),
            ("gemini-2.5-flash", "should work with full Flash model name"),
        ]

        success_count = 0

        for model_name, description in test_cases:
            try:
                response, continuation_id = self.call_mcp_tool(
                    "chat",
                    {
                        "prompt": f"Test with {model_name}: What is 1 + 1?",
                        "model": model_name,
                        "thinking_mode": "medium",
                    },
                )

                if not response:
                    raise Exception(f"No response received for model {model_name}")

                self.logger.info(f"✅ {model_name} {description}")
                success_count += 1

            except Exception as e:
                self.logger.error(f"❌ {model_name} failed: {e}")
                return False

        return success_count == len(test_cases)

    def test_default_model_behavior(self):
        """Test behavior with server default model (no explicit model specified)"""
        self.logger.info("Testing default model behavior...")

        try:
            # Test without specifying model (should use server default)
            response, continuation_id = self.call_mcp_tool(
                "chat",
                {
                    "prompt": "Test default model: What is 4 + 4?",
                    # No model specified - should use DEFAULT_MODEL from config
                    "thinking_mode": "medium",
                },
            )

            if not response:
                raise Exception("Default model test failed: No response received")

            self.logger.info("✅ Default model behavior works correctly")
            return True

        except Exception as e:
            self.logger.error(f"❌ Default model test failed: {e}")
            return False

    def run_test(self) -> bool:
        """Run all model thinking configuration tests"""
        self.logger.info(f"Test: {self.test_description}")

        try:
            # Test Pro model with thinking config
            if not self.test_pro_model_with_thinking_config():
                return False

            # Test Flash model without thinking config
            if not self.test_flash_model_without_thinking_config():
                return False

            # Test model resolution logic
            if not self.test_model_resolution_logic():
                return False

            # Test default model behavior
            if not self.test_default_model_behavior():
                return False

            self.logger.info(f"✅ All {self.test_name} tests passed!")
            return True

        except Exception as e:
            self.logger.error(f"❌ {self.test_name} test failed: {e}")
            return False


def main():
    """Run the model thinking configuration tests"""
    import sys

    verbose = "--verbose" in sys.argv or "-v" in sys.argv
    test = TestModelThinkingConfig(verbose=verbose)

    success = test.run_test()
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()