feat: add GPT-5.1 model support, update cassette maintenance documentation for dual-model testing, and ignore *.backup-*.json files

Bjorn Melin
2025-11-14 01:40:49 -07:00
parent 8e9aa2304d
commit f713d8a354
13 changed files with 255 additions and 34 deletions

.gitignore vendored

@@ -183,6 +183,7 @@ CLAUDE.local.md
.docker_cleaned
logs/
*.backup
*.backup-*.json
/.desktop_configured
/worktrees/


@@ -222,10 +222,45 @@ If you encounter issues with cassette testing:
3. Run semantic matching tests to verify the system
4. Open an issue if you find a bug in the matching logic
## Dual-Model Cassette Coverage
Some integration tests maintain cassettes for multiple model variants to ensure regression coverage across model families. For example:
### Consensus Tool Cassettes
The `test_consensus_integration.py` test is parametrized to run against both the `gpt-5` and `gpt-5.1` models:
- `tests/openai_cassettes/consensus_step1_gpt5_for.json` - Cassette for gpt-5 model
- `tests/openai_cassettes/consensus_step1_gpt51_for.json` - Cassette for gpt-5.1 model
**When updating consensus cassettes:**
1. Both cassettes should be updated if the test logic changes
2. If only one model's behavior changes, update only that cassette
3. The test uses `@pytest.mark.parametrize` to run against both models
4. Each cassette path is mapped in the `CONSENSUS_CASSETTES` dictionary (see the sketch below)
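Condensed from the `tests/test_consensus_integration.py` diff in this commit, the wiring looks roughly like this (test body elided):
```python
from pathlib import Path

import pytest

CASSETTE_DIR = Path(__file__).parent / "openai_cassettes"

# One cassette per OpenAI model variant under test.
CONSENSUS_CASSETTES = {
    "gpt-5": CASSETTE_DIR / "consensus_step1_gpt5_for.json",
    "gpt-5.1": CASSETTE_DIR / "consensus_step1_gpt51_for.json",
}


@pytest.mark.asyncio
@pytest.mark.parametrize("openai_model", ["gpt-5", "gpt-5.1"])
async def test_consensus_multi_model_consultations(monkeypatch, openai_model):
    # Each parametrized run records or replays against its own cassette.
    consensus_cassette_path = CONSENSUS_CASSETTES[openai_model]
    ...
```
The `monkeypatch` fixture is then used further down in the test to inject the replay transport for the selected cassette.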
**To re-record a specific model's cassette:**
```bash
# Delete the specific cassette
rm tests/openai_cassettes/consensus_step1_gpt5_for.json
# Run the test with a real API key (it will record for gpt-5)
OPENAI_API_KEY="your-real-key" python -m pytest tests/test_consensus_integration.py::test_consensus_multi_model_consultations[gpt-5] -v
# Or for gpt-5.1
rm tests/openai_cassettes/consensus_step1_gpt51_for.json
OPENAI_API_KEY="your-real-key" python -m pytest tests/test_consensus_integration.py::test_consensus_multi_model_consultations[gpt-5.1] -v
```
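Once the cassette and the Gemini replay file exist, the test runs in replay mode and supplies dummy keys itself, so you can verify a freshly recorded cassette without real credentials:
```bash
# Replay from the recorded cassette (no real API key needed)
python -m pytest tests/test_consensus_integration.py::test_consensus_multi_model_consultations[gpt-5.1] -v
```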
This dual-coverage approach ensures that both model families continue to work correctly as the codebase evolves.
## Related Files
- `tests/http_transport_recorder.py` - Cassette recording/replay implementation
- `tests/transport_helpers.py` - Helper functions for injecting transports
- `tests/test_cassette_semantic_matching.py` - Tests for semantic matching
- `tests/test_o3_pro_output_text_fix.py` - Example of cassette usage
- `tests/test_consensus_integration.py` - Example of dual-model cassette coverage
- `tests/openai_cassettes/` - Directory containing recorded cassettes


@@ -193,6 +193,7 @@ def disable_force_env_override(monkeypatch):
monkeypatch.setenv("MAX_CONVERSATION_TURNS", "50")
import importlib
import sys
import config
import utils.conversation_memory as conversation_memory
@@ -200,6 +201,10 @@ def disable_force_env_override(monkeypatch):
importlib.reload(config)
importlib.reload(conversation_memory)
test_conversation_module = sys.modules.get("tests.test_conversation_memory")
if test_conversation_module is not None:
test_conversation_module.MAX_CONVERSATION_TURNS = conversation_memory.MAX_CONVERSATION_TURNS
try:
yield
finally:

File diff suppressed because one or more lines are too long


@@ -94,9 +94,9 @@ class TestAutoModeComprehensive:
"OPENROUTER_API_KEY": None,
},
{
"EXTENDED_REASONING": "gpt-5-codex", # GPT-5-Codex prioritized for coding tasks
"FAST_RESPONSE": "gpt-5", # Prefer gpt-5 for speed
"BALANCED": "gpt-5", # Prefer gpt-5 for balanced
"EXTENDED_REASONING": "gpt-5.1-codex", # GPT-5.1 Codex prioritized for coding tasks
"FAST_RESPONSE": "gpt-5.1", # Prefer gpt-5.1 for speed
"BALANCED": "gpt-5.1", # Prefer gpt-5.1 for balanced
},
),
# Only X.AI API available


@@ -83,7 +83,7 @@ def test_error_listing_respects_env_restrictions(monkeypatch, reset_registry):
pass
monkeypatch.setenv("GOOGLE_ALLOWED_MODELS", "gemini-2.5-pro")
monkeypatch.setenv("OPENAI_ALLOWED_MODELS", "gpt-5")
monkeypatch.setenv("OPENAI_ALLOWED_MODELS", "gpt-5.1")
monkeypatch.setenv("OPENROUTER_ALLOWED_MODELS", "gpt5nano")
monkeypatch.setenv("XAI_ALLOWED_MODELS", "")
@@ -104,7 +104,7 @@ def test_error_listing_respects_env_restrictions(monkeypatch, reset_registry):
("OPENAI_API_KEY", "test-openai"),
("OPENROUTER_API_KEY", "test-openrouter"),
("GOOGLE_ALLOWED_MODELS", "gemini-2.5-pro"),
("OPENAI_ALLOWED_MODELS", "gpt-5"),
("OPENAI_ALLOWED_MODELS", "gpt-5.1"),
("OPENROUTER_ALLOWED_MODELS", "gpt5nano"),
("XAI_ALLOWED_MODELS", ""),
):
@@ -139,7 +139,7 @@ def test_error_listing_respects_env_restrictions(monkeypatch, reset_registry):
assert payload["status"] == "error"
available_models = _extract_available_models(payload["content"])
assert set(available_models) == {"gemini-2.5-pro", "gpt-5", "gpt5nano", "openai/gpt-5-nano"}
assert set(available_models) == {"gemini-2.5-pro", "gpt-5.1", "gpt5nano", "openai/gpt-5-nano"}
@pytest.mark.no_mock_provider
@@ -225,6 +225,6 @@ def test_error_listing_without_restrictions_shows_full_catalog(monkeypatch, rese
available_models = _extract_available_models(payload["content"])
assert "gemini-2.5-pro" in available_models
assert "gpt-5" in available_models
assert any(model in available_models for model in {"gpt-5.1", "gpt-5"})
assert "grok-4" in available_models
assert len(available_models) >= 5


@@ -98,9 +98,9 @@ class TestAutoModeProviderSelection:
balanced = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.BALANCED)
# Should select appropriate OpenAI models based on new preference order
assert extended_reasoning == "gpt-5-codex" # GPT-5-Codex prioritized for extended reasoning
assert fast_response == "gpt-5" # gpt-5 comes first in fast response preference
assert balanced == "gpt-5" # gpt-5 for balanced
assert extended_reasoning == "gpt-5.1-codex" # GPT-5.1 Codex prioritized for extended reasoning
assert fast_response == "gpt-5.1" # gpt-5.1 comes first in fast response preference
assert balanced == "gpt-5.1" # gpt-5.1 for balanced
finally:
# Restore original environment


@@ -16,7 +16,12 @@ from tools.consensus import ConsensusTool
# Directories for recorded HTTP interactions
CASSETTE_DIR = Path(__file__).parent / "openai_cassettes"
CASSETTE_DIR.mkdir(exist_ok=True)
CONSENSUS_CASSETTE_PATH = CASSETTE_DIR / "consensus_step1_gpt5_for.json"
# Mapping of OpenAI model names to their cassette files
CONSENSUS_CASSETTES = {
"gpt-5": CASSETTE_DIR / "consensus_step1_gpt5_for.json",
"gpt-5.1": CASSETTE_DIR / "consensus_step1_gpt51_for.json",
}
GEMINI_REPLAY_DIR = Path(__file__).parent / "gemini_cassettes"
GEMINI_REPLAY_DIR.mkdir(exist_ok=True)
@@ -26,8 +31,15 @@ GEMINI_REPLAY_PATH = GEMINI_REPLAY_DIR / "consensus" / "step2_gemini25_flash_aga
@pytest.mark.asyncio
@pytest.mark.no_mock_provider
async def test_consensus_multi_model_consultations(monkeypatch):
"""Exercise ConsensusTool against gpt-5 (supporting) and gemini-2.0-flash (critical)."""
@pytest.mark.parametrize("openai_model", ["gpt-5", "gpt-5.1"])
async def test_consensus_multi_model_consultations(monkeypatch, openai_model):
"""Exercise ConsensusTool against OpenAI model (supporting) and gemini-2.5-flash (critical).
Tests both gpt-5 and gpt-5.1 to ensure regression coverage for both model families.
"""
# Get the cassette path for this model
consensus_cassette_path = CONSENSUS_CASSETTES[openai_model]
env_updates = {
"DEFAULT_MODEL": "auto",
@@ -43,13 +55,14 @@ async def test_consensus_multi_model_consultations(monkeypatch):
"CUSTOM_API_URL",
]
recording_mode = not CONSENSUS_CASSETTE_PATH.exists() or not GEMINI_REPLAY_PATH.exists()
recording_mode = not consensus_cassette_path.exists() or not GEMINI_REPLAY_PATH.exists()
if recording_mode:
openai_key = env_updates["OPENAI_API_KEY"].strip()
gemini_key = env_updates["GEMINI_API_KEY"].strip()
if (not openai_key or openai_key.startswith("dummy")) or (not gemini_key or gemini_key.startswith("dummy")):
pytest.skip(
"Consensus cassette missing and OPENAI_API_KEY/GEMINI_API_KEY not configured. Provide real keys to record."
"Consensus cassette missing and OPENAI_API_KEY/GEMINI_API_KEY "
"not configured. Provide real keys to record."
)
GEMINI_REPLAY_PATH.parent.mkdir(parents=True, exist_ok=True)
@@ -66,27 +79,43 @@ async def test_consensus_multi_model_consultations(monkeypatch):
m.setenv("GEMINI_API_KEY", "dummy-key-for-replay")
m.setenv("GOOGLE_GENAI_CLIENT_MODE", "replay")
# Ensure restriction policies allow the latest OpenAI models under test
m.setenv("OPENAI_ALLOWED_MODELS", openai_model)
m.setenv("GOOGLE_GENAI_REPLAYS_DIRECTORY", str(GEMINI_REPLAY_DIR))
m.setenv("GOOGLE_GENAI_REPLAY_ID", GEMINI_REPLAY_ID)
for key in keys_to_clear:
m.delenv(key, raising=False)
# Reset providers and register only OpenAI & Gemini for deterministic behavior
# Ensure we use the built-in OpenAI catalogue rather than leftovers from
# other tests that patch OPENAI_MODELS_CONFIG_PATH.
m.delenv("OPENAI_MODELS_CONFIG_PATH", raising=False)
# Reset providers/restrictions and register only OpenAI & Gemini for deterministic behavior
ModelProviderRegistry.reset_for_testing()
import utils.model_restrictions as model_restrictions
model_restrictions._restriction_service = None
from providers.gemini import GeminiModelProvider
from providers.openai import OpenAIModelProvider
# Earlier tests may override the OpenAI provider's registry by pointing
# OPENAI_MODELS_CONFIG_PATH at fixtures. Force a reload so model
# metadata is restored from conf/openai_models.json.
OpenAIModelProvider.reload_registry()
assert openai_model in OpenAIModelProvider.MODEL_CAPABILITIES
ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider)
# Inject HTTP transport for OpenAI interactions
inject_transport(monkeypatch, CONSENSUS_CASSETTE_PATH)
inject_transport(monkeypatch, str(consensus_cassette_path))
tool = ConsensusTool()
models_to_consult = [
{"model": "gpt-5", "stance": "for"},
{"model": openai_model, "stance": "for"},
{"model": "gemini-2.5-flash", "stance": "against"},
]
@@ -105,7 +134,7 @@ async def test_consensus_multi_model_consultations(monkeypatch):
step1_data = json.loads(step1_response[0].text)
assert step1_data["status"] == "analysis_and_first_model_consulted"
assert step1_data["model_consulted"] == "gpt-5"
assert step1_data["model_consulted"] == openai_model
assert step1_data["model_response"]["status"] == "success"
assert step1_data["model_response"]["metadata"]["provider"] == "openai"
assert step1_data["model_response"]["verdict"]
@@ -118,7 +147,7 @@ async def test_consensus_multi_model_consultations(monkeypatch):
summary_for_step2 = step1_data["model_response"]["verdict"][:200]
step2_arguments = {
"step": f"Incorporated gpt-5 perspective: {summary_for_step2}",
"step": f"Incorporated {openai_model} perspective: {summary_for_step2}",
"step_number": 2,
"total_steps": len(models_to_consult),
"next_step_required": False,
@@ -138,7 +167,7 @@ async def test_consensus_multi_model_consultations(monkeypatch):
assert step2_data["model_response"]["metadata"]["provider"] == "google"
assert step2_data["model_response"]["verdict"]
assert step2_data["complete_consensus"]["models_consulted"] == [
"gpt-5:for",
f"{openai_model}:for",
"gemini-2.5-flash:against",
]
assert step2_data["consensus_complete"] is True
@@ -159,7 +188,7 @@ async def test_consensus_multi_model_consultations(monkeypatch):
gemini_provider._client = None
# Ensure cassettes exist for future replays
assert CONSENSUS_CASSETTE_PATH.exists()
assert consensus_cassette_path.exists()
assert GEMINI_REPLAY_PATH.exists()
# Clean up provider registry state after test


@@ -37,14 +37,14 @@ class TestIntelligentFallback:
@patch.dict(os.environ, {"OPENAI_API_KEY": "sk-test-key", "GEMINI_API_KEY": ""}, clear=False)
def test_prefers_openai_o3_mini_when_available(self):
"""Test that gpt-5 is preferred when OpenAI API key is available (based on new preference order)"""
"""Test that gpt-5.1 is preferred when OpenAI API key is available (based on new preference order)"""
# Register only OpenAI provider for this test
from providers.openai import OpenAIModelProvider
ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
fallback_model = ModelProviderRegistry.get_preferred_fallback_model()
assert fallback_model == "gpt-5" # Based on new preference order: gpt-5 before o4-mini
assert fallback_model == "gpt-5.1" # Based on new preference order: gpt-5.1 before o4-mini
@patch.dict(os.environ, {"OPENAI_API_KEY": "", "GEMINI_API_KEY": "test-gemini-key"}, clear=False)
def test_prefers_gemini_flash_when_openai_unavailable(self):
@@ -147,8 +147,8 @@ class TestIntelligentFallback:
history, tokens = build_conversation_history(context, model_context=None)
# Verify that ModelContext was called with gpt-5 (the intelligent fallback based on new preference order)
mock_context_class.assert_called_once_with("gpt-5")
# Verify that ModelContext was called with gpt-5.1 (the intelligent fallback based on new preference order)
mock_context_class.assert_called_once_with("gpt-5.1")
def test_auto_mode_with_gemini_only(self):
"""Test auto mode behavior when only Gemini API key is available"""


@@ -50,6 +50,9 @@ class TestOpenAIProvider:
assert provider.validate_model_name("o4-mini") is True
assert provider.validate_model_name("gpt-5") is True
assert provider.validate_model_name("gpt-5-mini") is True
assert provider.validate_model_name("gpt-5.1") is True
assert provider.validate_model_name("gpt-5.1-codex") is True
assert provider.validate_model_name("gpt-5.1-codex-mini") is True
# Test valid aliases
assert provider.validate_model_name("mini") is True
@@ -59,6 +62,9 @@ class TestOpenAIProvider:
assert provider.validate_model_name("gpt5") is True
assert provider.validate_model_name("gpt5-mini") is True
assert provider.validate_model_name("gpt5mini") is True
assert provider.validate_model_name("gpt5.1") is True
assert provider.validate_model_name("gpt5.1-codex") is True
assert provider.validate_model_name("codex-mini") is True
# Test invalid model
assert provider.validate_model_name("invalid-model") is False
@@ -77,6 +83,9 @@ class TestOpenAIProvider:
assert provider._resolve_model_name("gpt5") == "gpt-5"
assert provider._resolve_model_name("gpt5-mini") == "gpt-5-mini"
assert provider._resolve_model_name("gpt5mini") == "gpt-5-mini"
assert provider._resolve_model_name("gpt5.1") == "gpt-5.1"
assert provider._resolve_model_name("gpt5.1-codex") == "gpt-5.1-codex"
assert provider._resolve_model_name("codex-mini") == "gpt-5.1-codex-mini"
# Test full name passthrough
assert provider._resolve_model_name("o3") == "o3"
@@ -86,6 +95,9 @@ class TestOpenAIProvider:
assert provider._resolve_model_name("o4-mini") == "o4-mini"
assert provider._resolve_model_name("gpt-5") == "gpt-5"
assert provider._resolve_model_name("gpt-5-mini") == "gpt-5-mini"
assert provider._resolve_model_name("gpt-5.1") == "gpt-5.1"
assert provider._resolve_model_name("gpt-5.1-codex") == "gpt-5.1-codex"
assert provider._resolve_model_name("gpt-5.1-codex-mini") == "gpt-5.1-codex-mini"
def test_get_capabilities_o3(self):
"""Test getting model capabilities for O3."""
@@ -146,6 +158,36 @@ class TestOpenAIProvider:
assert capabilities.supports_function_calling is True
assert capabilities.supports_temperature is True
def test_get_capabilities_gpt51(self):
"""Test GPT-5.1 capabilities reflect new metadata."""
provider = OpenAIModelProvider("test-key")
capabilities = provider.get_capabilities("gpt-5.1")
assert capabilities.model_name == "gpt-5.1"
assert capabilities.supports_streaming is True
assert capabilities.supports_function_calling is True
assert capabilities.supports_json_mode is True
assert capabilities.allow_code_generation is True
def test_get_capabilities_gpt51_codex(self):
"""Test GPT-5.1 Codex is responses-only and non-streaming."""
provider = OpenAIModelProvider("test-key")
capabilities = provider.get_capabilities("gpt-5.1-codex")
assert capabilities.model_name == "gpt-5.1-codex"
assert capabilities.supports_streaming is False
assert capabilities.use_openai_response_api is True
assert capabilities.allow_code_generation is True
def test_get_capabilities_gpt51_codex_mini(self):
"""Test GPT-5.1 Codex mini exposes streaming and code generation."""
provider = OpenAIModelProvider("test-key")
capabilities = provider.get_capabilities("gpt-5.1-codex-mini")
assert capabilities.model_name == "gpt-5.1-codex-mini"
assert capabilities.supports_streaming is True
assert capabilities.allow_code_generation is True
@patch("providers.openai_compatible.OpenAI")
def test_generate_content_resolves_alias_before_api_call(self, mock_openai_class):
"""Test that generate_content resolves aliases before making API calls.


@@ -98,8 +98,8 @@ class TestModelSelection:
ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.EXTENDED_REASONING)
# OpenAI prefers GPT-5-Codex for extended reasoning (coding tasks)
assert model == "gpt-5-codex"
# OpenAI prefers GPT-5.1-Codex for extended reasoning (coding tasks)
assert model == "gpt-5.1-codex"
def test_extended_reasoning_with_gemini_only(self):
"""Test EXTENDED_REASONING prefers pro when only Gemini is available."""
@@ -133,8 +133,8 @@ class TestModelSelection:
ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.FAST_RESPONSE)
# OpenAI now prefers gpt-5 for fast response (based on our new preference order)
assert model == "gpt-5"
# OpenAI now prefers gpt-5.1 for fast response (based on our new preference order)
assert model == "gpt-5.1"
def test_fast_response_with_gemini_only(self):
"""Test FAST_RESPONSE prefers flash when only Gemini is available."""
@@ -167,8 +167,8 @@ class TestModelSelection:
ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider)
model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.BALANCED)
# OpenAI prefers gpt-5 for balanced (based on our new preference order)
assert model == "gpt-5"
# OpenAI prefers gpt-5.1 for balanced (based on our new preference order)
assert model == "gpt-5.1"
def test_no_category_uses_balanced_logic(self):
"""Test that no category specified uses balanced logic."""
@@ -195,7 +195,7 @@ class TestFlexibleModelSelection:
"env": {"OPENAI_API_KEY": "test-key"},
"provider_type": ProviderType.OPENAI,
"category": ToolModelCategory.EXTENDED_REASONING,
"expected": "gpt-5-codex", # GPT-5-Codex prioritized for coding tasks
"expected": "gpt-5.1-codex", # GPT-5.1-Codex prioritized for coding tasks
},
# Case 2: Gemini provider for fast response
{
@@ -209,7 +209,7 @@ class TestFlexibleModelSelection:
"env": {"OPENAI_API_KEY": "test-key"},
"provider_type": ProviderType.OPENAI,
"category": ToolModelCategory.FAST_RESPONSE,
"expected": "gpt-5", # Based on new preference order
"expected": "gpt-5.1", # Based on new preference order
},
]


@@ -209,6 +209,9 @@ class TestOpenAIProvider:
assert provider.validate_model_name("o4-mini")
assert provider.validate_model_name("o4mini")
assert provider.validate_model_name("o4-mini")
assert provider.validate_model_name("gpt-5.1")
assert provider.validate_model_name("gpt-5.1-codex")
assert provider.validate_model_name("gpt-5.1-codex-mini")
assert not provider.validate_model_name("gpt-4o")
assert not provider.validate_model_name("invalid-model")
@@ -219,3 +222,20 @@ class TestOpenAIProvider:
aliases = ["o3", "o3mini", "o3-mini", "o4-mini", "o4mini"]
for alias in aliases:
assert not provider.get_capabilities(alias).supports_extended_thinking
def test_gpt51_family_capabilities(self):
"""Ensure GPT-5.1 family exposes correct capability flags."""
provider = OpenAIModelProvider(api_key="test-key")
base = provider.get_capabilities("gpt-5.1")
assert base.supports_streaming
assert base.allow_code_generation
codex = provider.get_capabilities("gpt-5.1-codex")
assert not codex.supports_streaming
assert codex.use_openai_response_api
assert codex.allow_code_generation
codex_mini = provider.get_capabilities("gpt-5.1-codex-mini")
assert codex_mini.supports_streaming
assert codex_mini.allow_code_generation


@@ -54,6 +54,9 @@ class TestSupportedModelsAliases:
assert "o3mini" in provider.MODEL_CAPABILITIES["o3-mini"].aliases
assert "o3pro" in provider.MODEL_CAPABILITIES["o3-pro"].aliases
assert "gpt4.1" in provider.MODEL_CAPABILITIES["gpt-4.1"].aliases
assert "gpt5.1" in provider.MODEL_CAPABILITIES["gpt-5.1"].aliases
assert "gpt5.1-codex" in provider.MODEL_CAPABILITIES["gpt-5.1-codex"].aliases
assert "codex-mini" in provider.MODEL_CAPABILITIES["gpt-5.1-codex-mini"].aliases
# Test alias resolution
assert provider._resolve_model_name("mini") == "gpt-5-mini" # mini -> gpt-5-mini now
@@ -61,10 +64,14 @@ class TestSupportedModelsAliases:
assert provider._resolve_model_name("o3pro") == "o3-pro" # o3pro resolves to o3-pro
assert provider._resolve_model_name("o4mini") == "o4-mini"
assert provider._resolve_model_name("gpt4.1") == "gpt-4.1" # gpt4.1 resolves to gpt-4.1
assert provider._resolve_model_name("gpt5.1") == "gpt-5.1"
assert provider._resolve_model_name("gpt5.1-codex") == "gpt-5.1-codex"
assert provider._resolve_model_name("codex-mini") == "gpt-5.1-codex-mini"
# Test case insensitive resolution
assert provider._resolve_model_name("Mini") == "gpt-5-mini" # mini -> gpt-5-mini now
assert provider._resolve_model_name("O3MINI") == "o3-mini"
assert provider._resolve_model_name("Gpt5.1") == "gpt-5.1"
def test_xai_provider_aliases(self):
"""Test XAI provider's alias structure."""