feat: implement semantic cassette matching for o3 models

Adds flexible cassette matching that ignores system prompt changes for o3 models, preventing CI failures when prompts are updated. Changes: - Semantic matching: Only compares model name, user question, and core params - Ignores: System prompts, conversation memory instructions, metadata - Prevents cassette breaks when prompts change between code versions - Added comprehensive tests for semantic matching behavior - Created maintenance documentation (tests/CASSETTE_MAINTENANCE.md) This solves the CI failure where o3-pro test cassettes would break whenever system prompts or conversation memory format changed. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-01 18:53:30 +04:00
parent cff6d8998f
commit 70fa088c32
3 changed files with 417 additions and 2 deletions
--- a/tests/test_cassette_semantic_matching.py
+++ b/tests/test_cassette_semantic_matching.py
@@ -0,0 +1,125 @@
+"""
+Tests for cassette semantic matching to prevent breaks from prompt changes.
+
+This validates that o3 model cassettes match on semantic content (model + user question)
+rather than exact request bodies, preventing cassette breaks when system prompts change.
+"""
+
+import hashlib
+import json
+
+import pytest
+
+from tests.http_transport_recorder import ReplayTransport
+
+
+class TestCassetteSemanticMatching:
+    """Test that cassette matching is resilient to prompt changes."""
+
+    @pytest.fixture
+    def dummy_cassette(self, tmp_path):
+        """Create a minimal dummy cassette file."""
+        cassette_file = tmp_path / "dummy.json"
+        cassette_file.write_text(json.dumps({"interactions": []}))
+        return cassette_file
+
+    def test_o3_model_semantic_matching(self, dummy_cassette):
+        """Test that o3 models use semantic matching."""
+        transport = ReplayTransport(str(dummy_cassette))
+
+        # Two requests with same user question but different system prompts
+        request1_body = {
+            "model": "o3-pro",
+            "reasoning": {"effort": "medium"},
+            "input": [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "input_text",
+                            "text": "System prompt v1...\n\n=== USER REQUEST ===\nWhat is 2 + 2?\n=== END REQUEST ===\n\nMore instructions...",
+                        }
+                    ],
+                }
+            ],
+        }
+
+        request2_body = {
+            "model": "o3-pro",
+            "reasoning": {"effort": "medium"},
+            "input": [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "input_text",
+                            "text": "System prompt v2 (DIFFERENT)...\n\n=== USER REQUEST ===\nWhat is 2 + 2?\n=== END REQUEST ===\n\nDifferent instructions...",
+                        }
+                    ],
+                }
+            ],
+        }
+
+        # Extract semantic fields - should be identical
+        semantic1 = transport._extract_semantic_fields(request1_body)
+        semantic2 = transport._extract_semantic_fields(request2_body)
+
+        assert semantic1 == semantic2, "Semantic fields should match despite different prompts"
+        assert semantic1["user_question"] == "What is 2 + 2?"
+        assert semantic1["model"] == "o3-pro"
+        assert semantic1["reasoning"] == {"effort": "medium"}
+
+        # Generate signatures - should be identical
+        content1 = json.dumps(semantic1, sort_keys=True)
+        content2 = json.dumps(semantic2, sort_keys=True)
+        hash1 = hashlib.md5(content1.encode()).hexdigest()
+        hash2 = hashlib.md5(content2.encode()).hexdigest()
+
+        assert hash1 == hash2, "Hashes should match for same semantic content"
+
+    def test_non_o3_model_exact_matching(self, dummy_cassette):
+        """Test that non-o3 models still use exact matching."""
+        transport = ReplayTransport(str(dummy_cassette))
+
+        request_body = {
+            "model": "gpt-4",
+            "messages": [{"role": "user", "content": "test"}],
+        }
+
+        # Should not use semantic matching
+        assert not transport._is_o3_model_request(request_body)
+
+    def test_o3_mini_semantic_matching(self, dummy_cassette):
+        """Test that o3-mini also uses semantic matching."""
+        transport = ReplayTransport(str(dummy_cassette))
+
+        request_body = {
+            "model": "o3-mini",
+            "reasoning": {"effort": "low"},
+            "input": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "input_text", "text": "System...\n\n=== USER REQUEST ===\nTest\n=== END REQUEST ==="}
+                    ],
+                }
+            ],
+        }
+
+        assert transport._is_o3_model_request(request_body)
+        semantic = transport._extract_semantic_fields(request_body)
+        assert semantic["model"] == "o3-mini"
+        assert semantic["user_question"] == "Test"
+
+    def test_o3_without_request_markers(self, dummy_cassette):
+        """Test o3 requests without REQUEST markers fall back to full text."""
+        transport = ReplayTransport(str(dummy_cassette))
+
+        request_body = {
+            "model": "o3-pro",
+            "reasoning": {"effort": "medium"},
+            "input": [{"role": "user", "content": [{"type": "input_text", "text": "Just a simple question"}]}],
+        }
+
+        semantic = transport._extract_semantic_fields(request_body)
+        assert semantic["user_question"] == "Just a simple question"