Adds flexible cassette matching that ignores system prompt changes for o3 models, preventing CI failures when prompts are updated.

Changes:
- Semantic matching: only compares model name, user question, and core params
- Ignores: system prompts, conversation memory instructions, metadata
- Prevents cassette breaks when prompts change between code versions
- Added comprehensive tests for semantic matching behavior
- Created maintenance documentation (tests/CASSETTE_MAINTENANCE.md)

This solves the CI failure where o3-pro test cassettes would break whenever system prompts or the conversation memory format changed.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
126 lines
4.6 KiB
Python
126 lines
4.6 KiB
Python
"""
|
|
Tests for cassette semantic matching to prevent breaks from prompt changes.
|
|
|
|
This validates that o3 model cassettes match on semantic content (model + user question)
|
|
rather than exact request bodies, preventing cassette breaks when system prompts change.
|
|
"""
|
|
|
|
import hashlib
|
|
import json
|
|
|
|
import pytest
|
|
|
|
from tests.http_transport_recorder import ReplayTransport
|
|
|
|
|
|
class TestCassetteSemanticMatching:
    """Check that cassette matching tolerates changes to prompt wording."""

    @staticmethod
    def _o3_request(model, effort, text):
        """Build a request body with one user message holding one input_text part."""
        text_part = {"type": "input_text", "text": text}
        user_message = {"role": "user", "content": [text_part]}
        return {
            "model": model,
            "reasoning": {"effort": effort},
            "input": [user_message],
        }

    @pytest.fixture
    def dummy_cassette(self, tmp_path):
        """Write a cassette with no interactions under tmp_path and return its path."""
        path = tmp_path / "dummy.json"
        empty_cassette = {"interactions": []}
        path.write_text(json.dumps(empty_cassette))
        return path

    def test_o3_model_semantic_matching(self, dummy_cassette):
        """Identical user question wrapped in different prompts gives equal semantic fields."""
        replay = ReplayTransport(str(dummy_cassette))

        # The two bodies differ only in the text surrounding the user request markers.
        first_body = self._o3_request(
            "o3-pro",
            "medium",
            "System prompt v1...\n\n=== USER REQUEST ===\nWhat is 2 + 2?\n=== END REQUEST ===\n\nMore instructions...",
        )
        second_body = self._o3_request(
            "o3-pro",
            "medium",
            "System prompt v2 (DIFFERENT)...\n\n=== USER REQUEST ===\nWhat is 2 + 2?\n=== END REQUEST ===\n\nDifferent instructions...",
        )

        first_fields = replay._extract_semantic_fields(first_body)
        second_fields = replay._extract_semantic_fields(second_body)

        assert first_fields == second_fields, "Semantic fields should match despite different prompts"
        assert first_fields["user_question"] == "What is 2 + 2?"
        assert first_fields["model"] == "o3-pro"
        assert first_fields["reasoning"] == {"effort": "medium"}

        # Digest the key-sorted JSON of each field set; equal fields must hash equally.
        first_digest, second_digest = (
            hashlib.md5(json.dumps(fields, sort_keys=True).encode()).hexdigest()
            for fields in (first_fields, second_fields)
        )

        assert first_digest == second_digest, "Hashes should match for same semantic content"

    def test_non_o3_model_exact_matching(self, dummy_cassette):
        """A gpt-4 chat request must not be classified as an o3 request."""
        replay = ReplayTransport(str(dummy_cassette))

        chat_body = {"model": "gpt-4", "messages": [{"role": "user", "content": "test"}]}

        # Semantic matching should not apply to this request.
        is_o3 = replay._is_o3_model_request(chat_body)
        assert not is_o3

    def test_o3_mini_semantic_matching(self, dummy_cassette):
        """An o3-mini request is classified as o3 and yields its model and question."""
        replay = ReplayTransport(str(dummy_cassette))

        mini_body = self._o3_request(
            "o3-mini",
            "low",
            "System...\n\n=== USER REQUEST ===\nTest\n=== END REQUEST ===",
        )

        assert replay._is_o3_model_request(mini_body)
        fields = replay._extract_semantic_fields(mini_body)
        assert fields["model"] == "o3-mini"
        assert fields["user_question"] == "Test"

    def test_o3_without_request_markers(self, dummy_cassette):
        """Without REQUEST markers the whole input text is used as the user question."""
        replay = ReplayTransport(str(dummy_cassette))

        plain_body = self._o3_request("o3-pro", "medium", "Just a simple question")

        fields = replay._extract_semantic_fields(plain_body)
        assert fields["user_question"] == "Just a simple question"
|