feat: implement semantic cassette matching for o3 models
Adds flexible cassette matching that ignores system prompt changes for o3 models, preventing CI failures when prompts are updated. Changes: - Semantic matching: Only compares model name, user question, and core params - Ignores: System prompts, conversation memory instructions, metadata - Prevents cassette breaks when prompts change between code versions - Added comprehensive tests for semantic matching behavior - Created maintenance documentation (tests/CASSETTE_MAINTENANCE.md) This solves the CI failure where o3-pro test cassettes would break whenever system prompts or conversation memory format changed. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
125
tests/test_cassette_semantic_matching.py
Normal file
125
tests/test_cassette_semantic_matching.py
Normal file
@@ -0,0 +1,125 @@
|
||||
"""
|
||||
Tests for cassette semantic matching to prevent breaks from prompt changes.
|
||||
|
||||
This validates that o3 model cassettes match on semantic content (model + user question)
|
||||
rather than exact request bodies, preventing cassette breaks when system prompts change.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.http_transport_recorder import ReplayTransport
|
||||
|
||||
|
||||
class TestCassetteSemanticMatching:
|
||||
"""Test that cassette matching is resilient to prompt changes."""
|
||||
|
||||
@pytest.fixture
|
||||
def dummy_cassette(self, tmp_path):
|
||||
"""Create a minimal dummy cassette file."""
|
||||
cassette_file = tmp_path / "dummy.json"
|
||||
cassette_file.write_text(json.dumps({"interactions": []}))
|
||||
return cassette_file
|
||||
|
||||
def test_o3_model_semantic_matching(self, dummy_cassette):
|
||||
"""Test that o3 models use semantic matching."""
|
||||
transport = ReplayTransport(str(dummy_cassette))
|
||||
|
||||
# Two requests with same user question but different system prompts
|
||||
request1_body = {
|
||||
"model": "o3-pro",
|
||||
"reasoning": {"effort": "medium"},
|
||||
"input": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "input_text",
|
||||
"text": "System prompt v1...\n\n=== USER REQUEST ===\nWhat is 2 + 2?\n=== END REQUEST ===\n\nMore instructions...",
|
||||
}
|
||||
],
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
request2_body = {
|
||||
"model": "o3-pro",
|
||||
"reasoning": {"effort": "medium"},
|
||||
"input": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "input_text",
|
||||
"text": "System prompt v2 (DIFFERENT)...\n\n=== USER REQUEST ===\nWhat is 2 + 2?\n=== END REQUEST ===\n\nDifferent instructions...",
|
||||
}
|
||||
],
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
# Extract semantic fields - should be identical
|
||||
semantic1 = transport._extract_semantic_fields(request1_body)
|
||||
semantic2 = transport._extract_semantic_fields(request2_body)
|
||||
|
||||
assert semantic1 == semantic2, "Semantic fields should match despite different prompts"
|
||||
assert semantic1["user_question"] == "What is 2 + 2?"
|
||||
assert semantic1["model"] == "o3-pro"
|
||||
assert semantic1["reasoning"] == {"effort": "medium"}
|
||||
|
||||
# Generate signatures - should be identical
|
||||
content1 = json.dumps(semantic1, sort_keys=True)
|
||||
content2 = json.dumps(semantic2, sort_keys=True)
|
||||
hash1 = hashlib.md5(content1.encode()).hexdigest()
|
||||
hash2 = hashlib.md5(content2.encode()).hexdigest()
|
||||
|
||||
assert hash1 == hash2, "Hashes should match for same semantic content"
|
||||
|
||||
def test_non_o3_model_exact_matching(self, dummy_cassette):
|
||||
"""Test that non-o3 models still use exact matching."""
|
||||
transport = ReplayTransport(str(dummy_cassette))
|
||||
|
||||
request_body = {
|
||||
"model": "gpt-4",
|
||||
"messages": [{"role": "user", "content": "test"}],
|
||||
}
|
||||
|
||||
# Should not use semantic matching
|
||||
assert not transport._is_o3_model_request(request_body)
|
||||
|
||||
def test_o3_mini_semantic_matching(self, dummy_cassette):
|
||||
"""Test that o3-mini also uses semantic matching."""
|
||||
transport = ReplayTransport(str(dummy_cassette))
|
||||
|
||||
request_body = {
|
||||
"model": "o3-mini",
|
||||
"reasoning": {"effort": "low"},
|
||||
"input": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "input_text", "text": "System...\n\n=== USER REQUEST ===\nTest\n=== END REQUEST ==="}
|
||||
],
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
assert transport._is_o3_model_request(request_body)
|
||||
semantic = transport._extract_semantic_fields(request_body)
|
||||
assert semantic["model"] == "o3-mini"
|
||||
assert semantic["user_question"] == "Test"
|
||||
|
||||
def test_o3_without_request_markers(self, dummy_cassette):
|
||||
"""Test o3 requests without REQUEST markers fall back to full text."""
|
||||
transport = ReplayTransport(str(dummy_cassette))
|
||||
|
||||
request_body = {
|
||||
"model": "o3-pro",
|
||||
"reasoning": {"effort": "medium"},
|
||||
"input": [{"role": "user", "content": [{"type": "input_text", "text": "Just a simple question"}]}],
|
||||
}
|
||||
|
||||
semantic = transport._extract_semantic_fields(request_body)
|
||||
assert semantic["user_question"] == "Just a simple question"
|
||||
Reference in New Issue
Block a user