Files
my-pal-mcp-server/tests/test_pii_sanitizer.py
Josh Vera 7f92085c70 feat: Fix o3-pro response parsing and implement HTTP transport recorder
- Fix o3-pro response parsing to use output_text convenience field
- Replace respx with custom httpx transport solution for better reliability
- Implement comprehensive PII sanitization to prevent secret exposure
- Add HTTP request/response recording with cassette format for testing
- Sanitize all existing cassettes to remove exposed API keys
- Update documentation to reflect new HTTP transport recorder
- Add test suite for PII sanitization and HTTP recording

This change:
1. Fixes timeout issues with o3-pro API calls (was 2+ minutes, now ~15-22 seconds)
2. Properly captures response content without httpx.ResponseNotRead exceptions
3. Preserves original HTTP response format including gzip compression
4. Prevents future secret exposure with automatic PII sanitization
5. Enables reliable replay testing for o3-pro interactions

Co-Authored-By: Claude <noreply@anthropic.com>
2025-07-12 18:47:17 -06:00

150 lines
5.9 KiB
Python

#!/usr/bin/env python3
"""Test cases for PII sanitizer."""
import unittest
from pii_sanitizer import PIISanitizer, PIIPattern
class TestPIISanitizer(unittest.TestCase):
"""Test PII sanitization functionality."""
def setUp(self):
"""Set up test sanitizer."""
self.sanitizer = PIISanitizer()
def test_api_key_sanitization(self):
"""Test various API key formats are sanitized."""
test_cases = [
# OpenAI keys
("sk-proj-abcd1234567890ABCD1234567890abcd1234567890ABCD12", "sk-proj-SANITIZED"),
("sk-1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN", "sk-SANITIZED"),
# Anthropic keys
("sk-ant-abcd1234567890ABCD1234567890abcd1234567890ABCD12", "sk-ant-SANITIZED"),
# Google keys
("AIzaSyD-1234567890abcdefghijklmnopqrstuv", "AIza-SANITIZED"),
# GitHub tokens
("ghp_1234567890abcdefghijklmnopqrstuvwxyz", "ghp_SANITIZED"),
("ghs_1234567890abcdefghijklmnopqrstuvwxyz", "ghs_SANITIZED"),
]
for original, expected in test_cases:
with self.subTest(original=original):
result = self.sanitizer.sanitize_string(original)
self.assertEqual(result, expected)
def test_personal_info_sanitization(self):
"""Test personal information is sanitized."""
test_cases = [
# Email addresses
("john.doe@example.com", "user@example.com"),
("test123@company.org", "user@example.com"),
# Phone numbers
("(555) 123-4567", "(XXX) XXX-XXXX"),
("555-123-4567", "(XXX) XXX-XXXX"),
("+1-555-123-4567", "+X-XXX-XXX-XXXX"),
# SSN
("123-45-6789", "XXX-XX-XXXX"),
# Credit card
("1234 5678 9012 3456", "XXXX-XXXX-XXXX-XXXX"),
("1234-5678-9012-3456", "XXXX-XXXX-XXXX-XXXX"),
]
for original, expected in test_cases:
with self.subTest(original=original):
result = self.sanitizer.sanitize_string(original)
self.assertEqual(result, expected)
def test_header_sanitization(self):
"""Test HTTP header sanitization."""
headers = {
"Authorization": "Bearer sk-proj-abcd1234567890ABCD1234567890abcd1234567890ABCD12",
"API-Key": "sk-1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN",
"Content-Type": "application/json",
"User-Agent": "MyApp/1.0",
"Cookie": "session=abc123; user=john.doe@example.com"
}
sanitized = self.sanitizer.sanitize_headers(headers)
self.assertEqual(sanitized["Authorization"], "Bearer SANITIZED")
self.assertEqual(sanitized["API-Key"], "sk-SANITIZED")
self.assertEqual(sanitized["Content-Type"], "application/json")
self.assertEqual(sanitized["User-Agent"], "MyApp/1.0")
self.assertIn("user@example.com", sanitized["Cookie"])
def test_nested_structure_sanitization(self):
"""Test sanitization of nested data structures."""
data = {
"user": {
"email": "john.doe@example.com",
"api_key": "sk-proj-abcd1234567890ABCD1234567890abcd1234567890ABCD12"
},
"tokens": [
"ghp_1234567890abcdefghijklmnopqrstuvwxyz",
"Bearer sk-ant-abcd1234567890ABCD1234567890abcd1234567890ABCD12"
],
"metadata": {
"ip": "192.168.1.100",
"phone": "(555) 123-4567"
}
}
sanitized = self.sanitizer.sanitize_value(data)
self.assertEqual(sanitized["user"]["email"], "user@example.com")
self.assertEqual(sanitized["user"]["api_key"], "sk-proj-SANITIZED")
self.assertEqual(sanitized["tokens"][0], "ghp_SANITIZED")
self.assertEqual(sanitized["tokens"][1], "Bearer sk-ant-SANITIZED")
self.assertEqual(sanitized["metadata"]["ip"], "0.0.0.0")
self.assertEqual(sanitized["metadata"]["phone"], "(XXX) XXX-XXXX")
def test_url_sanitization(self):
"""Test URL parameter sanitization."""
urls = [
("https://api.example.com/v1/users?api_key=sk-1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN",
"https://api.example.com/v1/users?api_key=SANITIZED"),
("https://example.com/login?token=ghp_1234567890abcdefghijklmnopqrstuvwxyz&user=test",
"https://example.com/login?token=SANITIZED&user=test"),
]
for original, expected in urls:
with self.subTest(url=original):
result = self.sanitizer.sanitize_url(original)
self.assertEqual(result, expected)
def test_disable_sanitization(self):
"""Test that sanitization can be disabled."""
self.sanitizer.sanitize_enabled = False
sensitive_data = "sk-proj-abcd1234567890ABCD1234567890abcd1234567890ABCD12"
result = self.sanitizer.sanitize_string(sensitive_data)
# Should return original when disabled
self.assertEqual(result, sensitive_data)
def test_custom_pattern(self):
"""Test adding custom PII patterns."""
# Add custom pattern for internal employee IDs
custom_pattern = PIIPattern.create(
name="employee_id",
pattern=r'EMP\d{6}',
replacement="EMP-REDACTED",
description="Internal employee IDs"
)
self.sanitizer.add_pattern(custom_pattern)
text = "Employee EMP123456 has access to the system"
result = self.sanitizer.sanitize_string(text)
self.assertEqual(result, "Employee EMP-REDACTED has access to the system")
if __name__ == "__main__":
unittest.main()