diff --git a/.gitignore b/.gitignore index be60b01..d2beebe 100644 --- a/.gitignore +++ b/.gitignore @@ -183,6 +183,7 @@ CLAUDE.local.md .docker_cleaned logs/ *.backup +*.backup-*.json /.desktop_configured /worktrees/ diff --git a/tests/CASSETTE_MAINTENANCE.md b/tests/CASSETTE_MAINTENANCE.md index 75710da..cb2c57c 100644 --- a/tests/CASSETTE_MAINTENANCE.md +++ b/tests/CASSETTE_MAINTENANCE.md @@ -222,10 +222,45 @@ If you encounter issues with cassette testing: 3. Run semantic matching tests to verify the system 4. Open an issue if you find a bug in the matching logic +## Dual-Model Cassette Coverage + +Some integration tests maintain cassettes for multiple model variants to ensure regression coverage across model families. For example: + +### Consensus Tool Cassettes + +The `test_consensus_integration.py` test is parameterized to run against both the `gpt-5` and `gpt-5.1` models: + +- `tests/openai_cassettes/consensus_step1_gpt5_for.json` - Cassette for the gpt-5 model +- `tests/openai_cassettes/consensus_step1_gpt51_for.json` - Cassette for the gpt-5.1 model + +**When updating consensus cassettes:** + +1. Update both cassettes if the test logic changes +2. If only one model's behavior changes, update only that cassette +3. The test uses `@pytest.mark.parametrize` to run against both models +4. Each cassette path is mapped in the `CONSENSUS_CASSETTES` dictionary + +**To re-record a specific model's cassette:** + +```bash +# Delete the specific cassette +rm tests/openai_cassettes/consensus_step1_gpt5_for.json + +# Run the test with a real API key (it will record for gpt-5) +OPENAI_API_KEY="your-real-key" python -m pytest tests/test_consensus_integration.py::test_consensus_multi_model_consultations[gpt-5] -v + +# Or for gpt-5.1 +rm tests/openai_cassettes/consensus_step1_gpt51_for.json +OPENAI_API_KEY="your-real-key" python -m pytest tests/test_consensus_integration.py::test_consensus_multi_model_consultations[gpt-5.1] -v +``` + +This dual-coverage approach ensures that both model families continue to work correctly as the codebase evolves.
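For orientation, the parameterized pattern that this documentation describes looks roughly like the following. This is a trimmed sketch of `tests/test_consensus_integration.py` (the real test is async, takes `monkeypatch`, registers providers, and handles recording mode and the Gemini replay); only the per-model cassette selection is shown.

```python
# Trimmed sketch of the dual-model pattern in tests/test_consensus_integration.py.
# The real test does much more (provider registration, recording mode, Gemini
# replay); this only shows how each parameterized run picks its own cassette.
from pathlib import Path

import pytest

CASSETTE_DIR = Path(__file__).parent / "openai_cassettes"

# One cassette per OpenAI model variant under test.
CONSENSUS_CASSETTES = {
    "gpt-5": CASSETTE_DIR / "consensus_step1_gpt5_for.json",
    "gpt-5.1": CASSETTE_DIR / "consensus_step1_gpt51_for.json",
}


@pytest.mark.parametrize("openai_model", ["gpt-5", "gpt-5.1"])
def test_consensus_multi_model_consultations(openai_model):
    # Each parameterized run resolves the cassette for its model; if the file
    # is missing, the real test either re-records (given real API keys) or skips.
    consensus_cassette_path = CONSENSUS_CASSETTES[openai_model]
    recording_mode = not consensus_cassette_path.exists()
    ...
```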
+ ## Related Files - `tests/http_transport_recorder.py` - Cassette recording/replay implementation - `tests/transport_helpers.py` - Helper functions for injecting transports - `tests/test_cassette_semantic_matching.py` - Tests for semantic matching - `tests/test_o3_pro_output_text_fix.py` - Example of cassette usage +- `tests/test_consensus_integration.py` - Example of dual-model cassette coverage - `tests/openai_cassettes/` - Directory containing recorded cassettes diff --git a/tests/conftest.py b/tests/conftest.py index 723e06d..6aa710a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -193,6 +193,7 @@ def disable_force_env_override(monkeypatch): monkeypatch.setenv("MAX_CONVERSATION_TURNS", "50") import importlib + import sys import config import utils.conversation_memory as conversation_memory @@ -200,6 +201,10 @@ def disable_force_env_override(monkeypatch): importlib.reload(config) importlib.reload(conversation_memory) + test_conversation_module = sys.modules.get("tests.test_conversation_memory") + if test_conversation_module is not None: + test_conversation_module.MAX_CONVERSATION_TURNS = conversation_memory.MAX_CONVERSATION_TURNS + try: yield finally: diff --git a/tests/openai_cassettes/consensus_step1_gpt51_for.json b/tests/openai_cassettes/consensus_step1_gpt51_for.json new file mode 100644 index 0000000..1a8720b --- /dev/null +++ b/tests/openai_cassettes/consensus_step1_gpt51_for.json @@ -0,0 +1,82 @@ +{ + "interactions": [ + { + "request": { + "content": { + "messages": [ + { + "content": "\nROLE\nYou are an expert technical consultant providing consensus analysis on proposals, plans, and ideas. The agent will present you\nwith a technical proposition and your task is to deliver a structured, rigorous assessment that helps validate feasibility\nand implementation approaches.\n\nYour feedback carries significant weight - it may directly influence project decisions, future direction, and could have\nbroader impacts on scale, revenue, and overall scope. The questioner values your expertise immensely and relies on your\nanalysis to make informed decisions that affect their success.\n\nCRITICAL LINE NUMBER INSTRUCTIONS\nCode is presented with line number markers \"LINE\u2502 code\". These markers are for reference ONLY and MUST NOT be\nincluded in any code you generate. Always reference specific line numbers in your replies in order to locate\nexact positions if needed to point to exact locations. Include a very short code excerpt alongside for clarity.\nInclude context_start_text and context_end_text as backup references. Never include \"LINE\u2502\" markers in generated code\nsnippets.\n\nPERSPECTIVE FRAMEWORK\nSUPPORTIVE PERSPECTIVE WITH INTEGRITY\n\nYou are tasked with advocating FOR this proposal, but with CRITICAL GUARDRAILS:\n\nMANDATORY ETHICAL CONSTRAINTS:\n- This is NOT a debate for entertainment. 
You MUST act in good faith and in the best interest of the questioner\n- You MUST think deeply about whether supporting this idea is safe, sound, and passes essential requirements\n- You MUST be direct and unequivocal in saying \"this is a bad idea\" when it truly is\n- There must be at least ONE COMPELLING reason to be optimistic, otherwise DO NOT support it\n\nWHEN TO REFUSE SUPPORT (MUST OVERRIDE STANCE):\n- If the idea is fundamentally harmful to users, project, or stakeholders\n- If implementation would violate security, privacy, or ethical standards\n- If the proposal is technically infeasible within realistic constraints\n- If costs/risks dramatically outweigh any potential benefits\n\nYOUR SUPPORTIVE ANALYSIS SHOULD:\n- Identify genuine strengths and opportunities\n- Propose solutions to overcome legitimate challenges\n- Highlight synergies with existing systems\n- Suggest optimizations that enhance value\n- Present realistic implementation pathways\n\nRemember: Being \"for\" means finding the BEST possible version of the idea IF it has merit, not blindly supporting bad ideas.\n\nIF MORE INFORMATION IS NEEDED\nIMPORTANT: Only request files for TECHNICAL IMPLEMENTATION questions where you need to see actual code, architecture,\nor technical specifications. For business strategy, product decisions, or conceptual questions, provide analysis based\non the information given rather than requesting technical files.\n\nIf you need additional technical context (e.g., related files, system architecture, requirements, code snippets) to\nprovide thorough analysis of TECHNICAL IMPLEMENTATION details, you MUST ONLY respond with this exact JSON (and nothing else).\nDo NOT ask for the same file you've been provided unless for some reason its content is missing or incomplete:\n{\n \"status\": \"files_required_to_continue\",\n \"mandatory_instructions\": \"\",\n \"files_needed\": [\"[file name here]\", \"[or some folder/]\"]\n}\n\nFor business strategy, product planning, or conceptual questions, proceed with analysis using your expertise and the\ncontext provided, even if specific technical details are not available.\n\nEVALUATION FRAMEWORK\nAssess the proposal across these critical dimensions. Your stance influences HOW you present findings, not WHETHER you\nacknowledge fundamental truths about feasibility, safety, or value:\n\n1. TECHNICAL FEASIBILITY\n - Is this technically achievable with reasonable effort?\n - What are the core technical dependencies and requirements?\n - Are there any fundamental technical blockers?\n\n2. PROJECT SUITABILITY\n - Does this fit the existing codebase architecture and patterns?\n - Is it compatible with current technology stack and constraints?\n - How well does it align with the project's technical direction?\n\n3. USER VALUE ASSESSMENT\n - Will users actually want and use this feature?\n - What concrete benefits does this provide?\n - How does this compare to alternative solutions?\n\n4. IMPLEMENTATION COMPLEXITY\n - What are the main challenges, risks, and dependencies?\n - What is the estimated effort and timeline?\n - What expertise and resources are required?\n\n5. ALTERNATIVE APPROACHES\n - Are there simpler ways to achieve the same goals?\n - What are the trade-offs between different approaches?\n - Should we consider a different strategy entirely?\n\n6. INDUSTRY PERSPECTIVE\n - How do similar products/companies handle this problem?\n - What are current best practices and emerging patterns?\n - Are there proven solutions or cautionary tales?\n\n7. 
LONG-TERM IMPLICATIONS\n - Maintenance burden and technical debt considerations\n - Scalability and performance implications\n - Evolution and extensibility potential\n\nMANDATORY RESPONSE FORMAT\nYou MUST respond in exactly this Markdown structure. Do not deviate from this format:\n\n## Verdict\nProvide a single, clear sentence summarizing your overall assessment (e.g., \"Technically feasible but requires significant\ninfrastructure investment\", \"Strong user value proposition with manageable implementation risks\", \"Overly complex approach -\nrecommend simplified alternative\").\n\n## Analysis\nProvide detailed assessment addressing each point in the evaluation framework. Use clear reasoning and specific examples.\nBe thorough but concise. Address both strengths and weaknesses objectively.\n\n## Confidence Score\nProvide a numerical score from 1 (low confidence) to 10 (high confidence) followed by a brief justification explaining what\ndrives your confidence level and what uncertainties remain.\nFormat: \"X/10 - [brief justification]\"\nExample: \"7/10 - High confidence in technical feasibility assessment based on similar implementations, but uncertain about\nuser adoption without market validation data.\"\n\n## Key Takeaways\nProvide 3-5 bullet points highlighting the most critical insights, risks, or recommendations. These should be actionable\nand specific.\n\nQUALITY STANDARDS\n- Ground all insights in the current project's scope and constraints\n- Be honest about limitations and uncertainties\n- Focus on practical, implementable solutions rather than theoretical possibilities\n- Provide specific, actionable guidance rather than generic advice\n- Balance optimism with realistic risk assessment\n- Reference concrete examples and precedents when possible\n\nREMINDERS\n- Your assessment will be synthesized with other expert opinions by the agent\n- Aim to provide unique insights that complement other perspectives\n- If files are provided, reference specific technical details in your analysis\n- Maintain professional objectivity while being decisive in your recommendations\n- Keep your response concise - your entire reply must not exceed 850 tokens to ensure transport compatibility\n- CRITICAL: Your stance does NOT override your responsibility to provide truthful, ethical, and beneficial guidance\n- Bad ideas must be called out regardless of stance; good ideas must be acknowledged regardless of stance\n", + "role": "system" + }, + { + "content": "Evaluate SwiftUI vs UIKit adoption and recommend ONE word (SwiftUI or UIKit).", + "role": "user" + } + ], + "model": "gpt-5.1", + "stream": false, + "temperature": 1.0 + }, + "headers": { + "accept": "application/json", + "accept-encoding": "gzip, deflate", + "authorization": "Bearer SANITIZED", + "connection": "keep-alive", + "content-length": "7616", + "content-type": "application/json", + "host": "api.openai.com", + "user-agent": "OpenAI/Python 2.1.0", + "x-stainless-arch": "arm64", + "x-stainless-async": "false", + "x-stainless-lang": "python", + "x-stainless-os": "MacOS", + "x-stainless-package-version": "2.1.0", + "x-stainless-read-timeout": "900.0", + "x-stainless-retry-count": "0", + "x-stainless-runtime": "CPython", + "x-stainless-runtime-version": "3.12.11" + }, + "method": "POST", + "path": "/v1/chat/completions", + "url": "https://api.openai.com/v1/chat/completions" + }, + "response": { + "content": { + "data": 
"ewogICJpZCI6ICJjaGF0Y21wbC1DTXB6Wng3bTVTbTNERkJWdGJqalFxcTJOOVY2RCIsCiAgIm9iamVjdCI6ICJjaGF0LmNvbXBsZXRpb24iLAogICJjcmVhdGVkIjogMTc1OTU1OTEwOSwKICAibW9kZWwiOiAiZ3B0LTUtMjAyNS0wOC0wNyIsCiAgImNob2ljZXMiOiBbCiAgICB7CiAgICAgICJpbmRleCI6IDAsCiAgICAgICJtZXNzYWdlIjogewogICAgICAgICJyb2xlIjogImFzc2lzdGFudCIsCiAgICAgICAgImNvbnRlbnQiOiAiIyMgVmVyZGljdFxuU3Ryb25nIHVzZXIgdmFsdWUgYW5kIGxvbmfigJF0ZXJtIGZpdCB3aXRoIG1hbmFnZWFibGUgbWlncmF0aW9uIHZpYSBpbnRlcm9wIOKAlCBTd2lmdFVJLlxuXG4jIyBBbmFseXNpc1xuMS4gVEVDSE5JQ0FMIEZFQVNJQklMSVRZXG4tIEFjaGlldmFibGUgbm93OiBTd2lmdFVJIGlzIG1hdHVyZSAoaU9TIDE2LzE3KyksIGZlYXR1cmUtY29tcGxldGUgZm9yIG1vc3QgYXBwIFVJcywgYW5kIGludGVyb3BlcmF0ZXMgd2l0aCBVSUtpdCB2aWEgVUlIb3N0aW5nQ29udHJvbGxlci9VSVZpZXdSZXByZXNlbnRhYmxlLlxuLSBObyBmdW5kYW1lbnRhbCBibG9ja2VycyBmb3IgdHlwaWNhbCBhcHBzOyBlZGdlIGNhc2VzIHJlbWFpbiBmb3IgaGlnaGx5IGN1c3RvbSB0ZXh0IGxheW91dCwgYmVzcG9rZSBnZXN0dXJlcywgb3IgcGl4ZWwtcGVyZmVjdCBsZWdhY3kgZGVzaWducy5cbi0gUGVyZm9ybWFuY2UgaXMgY29tcGV0aXRpdmU7IExpc3RzLCBuYXZpZ2F0aW9uLCBhbmQgYW5pbWF0aW9ucyBhcmUgc3RhYmxlIHdoZW4gdXNpbmcgdGhlIGxhdGVzdCBOYXZpZ2F0aW9uU3RhY2svT2JzZXJ2YXRpb24uXG5cbjIuIFBST0pFQ1QgU1VJVEFCSUxJVFlcbi0gTmV3IGFwcHM6IENsZWFyIGZpdOKAlGZhc3RlciBpdGVyYXRpb24sIHByZXZpZXdzLCBtdWx0aS1wbGF0Zm9ybSByZXVzZSAoaU9TLCBpUGFkT1MsIG1hY09TLCB3YXRjaE9TLCB0dk9TKS5cbi0gRXhpc3RpbmcgVUlLaXQgY29kZWJhc2VzOiBVc2UgYSBoeWJyaWQgYXBwcm9hY2jigJRuZXcgZmVhdHVyZXMgaW4gU3dpZnRVSSwgd3JhcCBsZWdhY3kgVUlLaXQuIFRoaXMgcmVkdWNlcyByaXNrIGFuZCBhdm9pZHMgYmlnLWJhbmcgcmV3cml0ZXMuXG4tIEFsaWducyB3aXRoIEFwcGxl4oCZcyBkaXJlY3Rpb247IG1vc3QgbmV3IGZyYW1ld29ya3MgKFN3aWZ0RGF0YSwgT2JzZXJ2YXRpb24pIGFyZSBTd2lmdFVJLWZpcnN0LlxuXG4zLiBVU0VSIFZBTFVFIEFTU0VTU01FTlRcbi0gRmFzdGVyIGRlbGl2ZXJ5IG9mIHBvbGlzaGVkLCBhY2Nlc3NpYmxlIFVJIHdpdGggYnVpbHQtaW4gZHluYW1pYyB0eXBlLCBkYXJrIG1vZGUsIGxvY2FsaXphdGlvbiwgYW5kIGFjY2Vzc2liaWxpdHkgdHJhaXRzLlxuLSBCZXR0ZXIgY29uc2lzdGVuY3kgYWNyb3NzIHNjcmVlbnMgYW5kIHBsYXRmb3JtczsgaGlnaGVyIGl0ZXJhdGlvbiBzcGVlZCBsZWFkcyB0byBtb3JlIHVzZXItdGVzdGVkIGltcHJvdmVtZW50cy5cblxuNC4gSU1QTEVNRU5UQVRJT04gQ09NUExFWElUWVxuLSBMZWFybmluZyBjdXJ2ZTogUmVxdWlyZXMgTVZWTS91bmlkaXJlY3Rpb25hbCBkYXRhIGZsb3cgbWluZHNldCBhbmQgY2FyZWZ1bCBzdGF0ZSBtYW5hZ2VtZW50IChTdGF0ZSwgT2JzZXJ2ZWRPYmplY3QsIEVudmlyb25tZW50LCBPYnNlcnZhdGlvbiBtYWNybykuXG4tIE1pZ3JhdGlvbiBjb3N0IGZvciBsZWdhY3kgcHJvamVjdHM6IE1vZGVyYXRlOyBtaXRpZ2F0ZSB2aWEgaW5jcmVtZW50YWwgYWRvcHRpb24sIGZlYXR1cmUtZmxhZ2dlZCByb2xsb3V0cywgYW5kIGludGVyb3Agd3JhcHBlcnMuXG4tIFRvb2xpbmcgaXMgc3Ryb25nOiBYY29kZSBQcmV2aWV3cywgSW5zdHJ1bWVudHMsIGFuZCBpbXByb3ZlZCBkZWJ1Z2dpbmcuXG5cbjUuIEFMVEVSTkFUSVZFIEFQUFJPQUNIRVNcbi0gUHVyZSBVSUtpdDogQXBwcm9wcmlhdGUgb25seSBpZiB5b3UgbXVzdCBzdXBwb3J0IGlPUyA8IDE0LzE1LCBuZWVkIHZlcnkgbG93LWxldmVsIHJlbmRlcmluZywgb3IgaGF2ZSBhIG1hc3NpdmUgVUlLaXQtb25seSB0ZWFtIHdpdGggbmVhci10ZXJtIGRlYWRsaW5lcy5cbi0gSHlicmlkIChyZWNvbW1lbmRlZCBtaWdyYXRpb24gcGF0aCk6IFN0YXJ0IHdpdGggbmV3IHN1cmZhY2VzLCBlbWJlZCBVSUtpdCB3aGVyZSBTd2lmdFVJIGdhcHMgZXhpc3QuIFRyYWRl4oCRb2ZmOiBUd28gcGFyYWRpZ21zIHRvIG1haW50YWluLCBidXQgcmlzayBpcyBtaW5pbWl6ZWQuXG5cbjYuIElORFVTVFJZIFBFUlNQRUNUSVZFXG4tIFRyZW5kOiBBcHBsZSBhbmQgdGhlIGVjb3N5c3RlbSBhcmUgaW52ZXN0aW5nIGhlYXZpbHkgaW4gU3dpZnRVSTsgbW9zdCBXV0RDIHNlc3Npb25zIGhpZ2hsaWdodCBTd2lmdFVJLWZpcnN0IHBhdHRlcm5zLlxuLSBNYW55IGxhcmdlIGFwcHMgcnVuIGh5YnJpZCBzdGFja3Mgc3VjY2Vzc2Z1bGx5OyBjYXV0aW9uYXJ5IHRhbGVzIGFyZSBtYWlubHkgZWFybHkgU3dpZnRVSSAoaU9TIDEzLzE0KSBwYWluLCBsYXJnZWx5IHJlc29sdmVkIGluIGN1cnJlbnQgT1MgdmVyc2lvbnMuXG5cbjcuIExPTkctVEVSTSBJTVBMSUNBVElPTlNcbi0gTG93ZXIgbWFpbnRlbmFuY2UgYW5kIGJvaWxlcnBsYXRlOyBpbXByb3ZlZCB0ZXN0YWJpbGl0eSB3aXRoIHByZXZpZXctZHJpdmVuIGRldmVsb3BtZW50LlxuLSBCZXR0ZXIgc2NhbGFiaWxpdHkgYWNyb3NzIHBsYX
Rmb3JtczsgZnV0dXJlLXByb29maW5nIGFzIFVJS2l0IGlubm92YXRpb24gc2xvd3MuXG4tIFRlY2huaWNhbCBkZWJ0IHJpc2sgcmVkdWNlZCBieSBzdGFuZGFyZGl6aW5nIGFyY2hpdGVjdHVyZSBhcm91bmQgU3dpZnREYXRhL09ic2VydmF0aW9uOyByZXRhaW4gZXNjYXBlIGhhdGNoZXMgdG8gVUlLaXQgd2hlcmUgbmVlZGVkLlxuXG4jIyBDb25maWRlbmNlIFNjb3JlXG44LzEwIC0gU3Ryb25nIGNvbmZpZGVuY2UgYmFzZWQgb24gY3VycmVudCBwbGF0Zm9ybSBtYXR1cml0eSBhbmQgaW5kdXN0cnkgbW9tZW50dW07IHVuY2VydGFpbnR5IHJlbWFpbnMgYXJvdW5kIHNwZWNpZmljIGxlZ2FjeSBjb25zdHJhaW50cywgT1MgdmVyc2lvbiB0YXJnZXRzLCBhbmQgYW55IG5pY2hlIHJlbmRlcmluZyBuZWVkcy5cblxuIyMgS2V5IFRha2Vhd2F5c1xuLSBEZWZhdWx0IHRvIFN3aWZ0VUkgZm9yIG5ldyBmZWF0dXJlczsgdXNlIFVJS2l0IGludGVyb3AgZm9yIHNwZWNpYWxpemVkIGNvbnRyb2xzIG9yIGxlZ2FjeSBhcmVhcy5cbi0gU2V0IG1pbmltdW0gT1MgdGFyZ2V0IHRvIGF0IGxlYXN0IGlPUyAxNiB0byBhdm9pZCBlYXJseSBTd2lmdFVJIGxpbWl0YXRpb25zIGFuZCBnYWluIE9ic2VydmF0aW9uL05hdmlnYXRpb25TdGFjayBzdGFiaWxpdHkuXG4tIEFkb3B0IGEgcGhhc2VkIG1pZ3JhdGlvbiBwbGFuIHdpdGggY2xlYXIgaW50ZXJvcCBib3VuZGFyaWVzIGFuZCB0ZXN0aW5nIHRvIG1hbmFnZSByaXNrLlxuLSBFc3RhYmxpc2ggc3RhdGUgbWFuYWdlbWVudCBjb252ZW50aW9ucyBlYXJseSB0byBwcmV2ZW50IGRhdGEgZmxvdyBjb21wbGV4aXR5LlxuLSBNb25pdG9yIHBlcmZvcm1hbmNlIGhvdHNwb3RzIChMaXN0cywgaGVhdnkgYW5pbWF0aW9ucykgYW5kIHNlbGVjdGl2ZWx5IGRyb3AgdG8gVUlLaXQgd2hlcmUgcHJvZmlsaW5nIGluZGljYXRlcyB3aW5zLiIsCiAgICAgICAgInJlZnVzYWwiOiBudWxsLAogICAgICAgICJhbm5vdGF0aW9ucyI6IFtdCiAgICAgIH0sCiAgICAgICJmaW5pc2hfcmVhc29uIjogInN0b3AiCiAgICB9CiAgXSwKICAidXNhZ2UiOiB7CiAgICAicHJvbXB0X3Rva2VucyI6IDE0MTYsCiAgICAiY29tcGxldGlvbl90b2tlbnMiOiAxNzI0LAogICAgInRvdGFsX3Rva2VucyI6IDMxNDAsCiAgICAicHJvbXB0X3Rva2Vuc19kZXRhaWxzIjogewogICAgICAiY2FjaGVkX3Rva2VucyI6IDAsCiAgICAgICJhdWRpb190b2tlbnMiOiAwCiAgICB9LAogICAgImNvbXBsZXRpb25fdG9rZW5zX2RldGFpbHMiOiB7CiAgICAgICJyZWFzb25pbmdfdG9rZW5zIjogMTAyNCwKICAgICAgImF1ZGlvX3Rva2VucyI6IDAsCiAgICAgICJhY2NlcHRlZF9wcmVkaWN0aW9uX3Rva2VucyI6IDAsCiAgICAgICJyZWplY3RlZF9wcmVkaWN0aW9uX3Rva2VucyI6IDAKICAgIH0KICB9LAogICJzZXJ2aWNlX3RpZXIiOiAiZGVmYXVsdCIsCiAgInN5c3RlbV9maW5nZXJwcmludCI6IG51bGwKfQo=", + "encoding": "base64", + "size": 4133 + }, + "headers": { + "access-control-expose-headers": "X-Request-ID", + "alt-svc": "h3=\":443\"; ma=86400", + "cf-cache-status": "DYNAMIC", + "cf-ray": "989299b2d9e49955-DXB", + "connection": "keep-alive", + "content-encoding": "gzip", + "content-type": "application/json", + "date": "Sat, 04 Oct 2025 06:25:39 GMT", + "openai-organization": "beehive-innovations-fze", + "openai-processing-ms": "30121", + "openai-project": "proj_QP57xBVPOlWpp0vuJEPGwXK3", + "openai-version": "2020-10-01", + "server": "cloudflare", + "set-cookie": "__cf_bm=W1c7SmAgLHf6bIXrHAW0BAB4LJ004A1sIUp.um03opo-(XXX) XXX-XXXX-0.0.0.0-FI.fEN.l42fbj1r6TC4w4HdSfQrcj64sCwjfGSOlcD9KJEm2HMK1aB5gkwvud_9RsPhNlAIdNENVwgKiZsRjdbbilMyZ7wkKJRDTmtyONSg; path=/; expires=Sat, 04-Oct-25 06:55:39 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None, _cfuvid=5anJZ9miSjknbr36nCL1FQQDXkEfw5ld5y9Fa0DgiWE-175(XXX) XXX-XXXX-0.0.0.0-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None", + "strict-transport-security": "max-age=31536000; includeSubDomains; preload", + "transfer-encoding": "chunked", + "x-content-type-options": "nosniff", + "x-envoy-upstream-service-time": "30136", + "x-openai-proxy-wasm": "v0.1", + "x-ratelimit-limit-requests": "500", + "x-ratelimit-limit-tokens": "500000", + "x-ratelimit-remaining-requests": "499", + "x-ratelimit-remaining-tokens": "498165", + "x-ratelimit-reset-requests": "120ms", + "x-ratelimit-reset-tokens": "220ms", + "x-request-id": "req_cd1af03393824c54b2ceee1da3dc6cbc" + }, + "reason_phrase": "OK", + "status_code": 
200 + } + } + ] +} \ No newline at end of file diff --git a/tests/test_auto_mode_comprehensive.py b/tests/test_auto_mode_comprehensive.py index 95248a6..67677cc 100644 --- a/tests/test_auto_mode_comprehensive.py +++ b/tests/test_auto_mode_comprehensive.py @@ -94,9 +94,9 @@ class TestAutoModeComprehensive: "OPENROUTER_API_KEY": None, }, { - "EXTENDED_REASONING": "gpt-5-codex", # GPT-5-Codex prioritized for coding tasks - "FAST_RESPONSE": "gpt-5", # Prefer gpt-5 for speed - "BALANCED": "gpt-5", # Prefer gpt-5 for balanced + "EXTENDED_REASONING": "gpt-5.1-codex", # GPT-5.1 Codex prioritized for coding tasks + "FAST_RESPONSE": "gpt-5.1", # Prefer gpt-5.1 for speed + "BALANCED": "gpt-5.1", # Prefer gpt-5.1 for balanced }, ), # Only X.AI API available diff --git a/tests/test_auto_mode_model_listing.py b/tests/test_auto_mode_model_listing.py index 3c844ad..26d376b 100644 --- a/tests/test_auto_mode_model_listing.py +++ b/tests/test_auto_mode_model_listing.py @@ -83,7 +83,7 @@ def test_error_listing_respects_env_restrictions(monkeypatch, reset_registry): pass monkeypatch.setenv("GOOGLE_ALLOWED_MODELS", "gemini-2.5-pro") - monkeypatch.setenv("OPENAI_ALLOWED_MODELS", "gpt-5") + monkeypatch.setenv("OPENAI_ALLOWED_MODELS", "gpt-5.1") monkeypatch.setenv("OPENROUTER_ALLOWED_MODELS", "gpt5nano") monkeypatch.setenv("XAI_ALLOWED_MODELS", "") @@ -104,7 +104,7 @@ def test_error_listing_respects_env_restrictions(monkeypatch, reset_registry): ("OPENAI_API_KEY", "test-openai"), ("OPENROUTER_API_KEY", "test-openrouter"), ("GOOGLE_ALLOWED_MODELS", "gemini-2.5-pro"), - ("OPENAI_ALLOWED_MODELS", "gpt-5"), + ("OPENAI_ALLOWED_MODELS", "gpt-5.1"), ("OPENROUTER_ALLOWED_MODELS", "gpt5nano"), ("XAI_ALLOWED_MODELS", ""), ): @@ -139,7 +139,7 @@ def test_error_listing_respects_env_restrictions(monkeypatch, reset_registry): assert payload["status"] == "error" available_models = _extract_available_models(payload["content"]) - assert set(available_models) == {"gemini-2.5-pro", "gpt-5", "gpt5nano", "openai/gpt-5-nano"} + assert set(available_models) == {"gemini-2.5-pro", "gpt-5.1", "gpt5nano", "openai/gpt-5-nano"} @pytest.mark.no_mock_provider @@ -225,6 +225,6 @@ def test_error_listing_without_restrictions_shows_full_catalog(monkeypatch, rese available_models = _extract_available_models(payload["content"]) assert "gemini-2.5-pro" in available_models - assert "gpt-5" in available_models + assert any(model in available_models for model in {"gpt-5.1", "gpt-5"}) assert "grok-4" in available_models assert len(available_models) >= 5 diff --git a/tests/test_auto_mode_provider_selection.py b/tests/test_auto_mode_provider_selection.py index 7f79bd9..3a24c69 100644 --- a/tests/test_auto_mode_provider_selection.py +++ b/tests/test_auto_mode_provider_selection.py @@ -98,9 +98,9 @@ class TestAutoModeProviderSelection: balanced = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.BALANCED) # Should select appropriate OpenAI models based on new preference order - assert extended_reasoning == "gpt-5-codex" # GPT-5-Codex prioritized for extended reasoning - assert fast_response == "gpt-5" # gpt-5 comes first in fast response preference - assert balanced == "gpt-5" # gpt-5 for balanced + assert extended_reasoning == "gpt-5.1-codex" # GPT-5.1 Codex prioritized for extended reasoning + assert fast_response == "gpt-5.1" # gpt-5.1 comes first in fast response preference + assert balanced == "gpt-5.1" # gpt-5.1 for balanced finally: # Restore original environment diff --git a/tests/test_consensus_integration.py 
b/tests/test_consensus_integration.py index 7414c44..2866c29 100644 --- a/tests/test_consensus_integration.py +++ b/tests/test_consensus_integration.py @@ -16,7 +16,12 @@ from tools.consensus import ConsensusTool # Directories for recorded HTTP interactions CASSETTE_DIR = Path(__file__).parent / "openai_cassettes" CASSETTE_DIR.mkdir(exist_ok=True) -CONSENSUS_CASSETTE_PATH = CASSETTE_DIR / "consensus_step1_gpt5_for.json" + +# Mapping of OpenAI model names to their cassette files +CONSENSUS_CASSETTES = { + "gpt-5": CASSETTE_DIR / "consensus_step1_gpt5_for.json", + "gpt-5.1": CASSETTE_DIR / "consensus_step1_gpt51_for.json", +} GEMINI_REPLAY_DIR = Path(__file__).parent / "gemini_cassettes" GEMINI_REPLAY_DIR.mkdir(exist_ok=True) @@ -26,8 +31,15 @@ GEMINI_REPLAY_PATH = GEMINI_REPLAY_DIR / "consensus" / "step2_gemini25_flash_aga @pytest.mark.asyncio @pytest.mark.no_mock_provider -async def test_consensus_multi_model_consultations(monkeypatch): - """Exercise ConsensusTool against gpt-5 (supporting) and gemini-2.0-flash (critical).""" +@pytest.mark.parametrize("openai_model", ["gpt-5", "gpt-5.1"]) +async def test_consensus_multi_model_consultations(monkeypatch, openai_model): + """Exercise ConsensusTool against OpenAI model (supporting) and gemini-2.5-flash (critical). + + Tests both gpt-5 and gpt-5.1 to ensure regression coverage for both model families. + """ + + # Get the cassette path for this model + consensus_cassette_path = CONSENSUS_CASSETTES[openai_model] env_updates = { "DEFAULT_MODEL": "auto", @@ -43,13 +55,14 @@ async def test_consensus_multi_model_consultations(monkeypatch): "CUSTOM_API_URL", ] - recording_mode = not CONSENSUS_CASSETTE_PATH.exists() or not GEMINI_REPLAY_PATH.exists() + recording_mode = not consensus_cassette_path.exists() or not GEMINI_REPLAY_PATH.exists() if recording_mode: openai_key = env_updates["OPENAI_API_KEY"].strip() gemini_key = env_updates["GEMINI_API_KEY"].strip() if (not openai_key or openai_key.startswith("dummy")) or (not gemini_key or gemini_key.startswith("dummy")): pytest.skip( - "Consensus cassette missing and OPENAI_API_KEY/GEMINI_API_KEY not configured. Provide real keys to record." + "Consensus cassette missing and OPENAI_API_KEY/GEMINI_API_KEY " + "not configured. Provide real keys to record." ) GEMINI_REPLAY_PATH.parent.mkdir(parents=True, exist_ok=True) @@ -66,27 +79,43 @@ async def test_consensus_multi_model_consultations(monkeypatch): m.setenv("GEMINI_API_KEY", "dummy-key-for-replay") m.setenv("GOOGLE_GENAI_CLIENT_MODE", "replay") + # Ensure restriction policies allow the latest OpenAI models under test + m.setenv("OPENAI_ALLOWED_MODELS", openai_model) + m.setenv("GOOGLE_GENAI_REPLAYS_DIRECTORY", str(GEMINI_REPLAY_DIR)) m.setenv("GOOGLE_GENAI_REPLAY_ID", GEMINI_REPLAY_ID) for key in keys_to_clear: m.delenv(key, raising=False) - # Reset providers and register only OpenAI & Gemini for deterministic behavior + # Ensure we use the built-in OpenAI catalogue rather than leftovers from + # other tests that patch OPENAI_MODELS_CONFIG_PATH. + m.delenv("OPENAI_MODELS_CONFIG_PATH", raising=False) + + # Reset providers/restrictions and register only OpenAI & Gemini for deterministic behavior ModelProviderRegistry.reset_for_testing() + import utils.model_restrictions as model_restrictions + + model_restrictions._restriction_service = None from providers.gemini import GeminiModelProvider from providers.openai import OpenAIModelProvider + # Earlier tests may override the OpenAI provider's registry by pointing + # OPENAI_MODELS_CONFIG_PATH at fixtures. 
Force a reload so model + # metadata is restored from conf/openai_models.json. + OpenAIModelProvider.reload_registry() + assert openai_model in OpenAIModelProvider.MODEL_CAPABILITIES + ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider) ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider) # Inject HTTP transport for OpenAI interactions - inject_transport(monkeypatch, CONSENSUS_CASSETTE_PATH) + inject_transport(monkeypatch, str(consensus_cassette_path)) tool = ConsensusTool() models_to_consult = [ - {"model": "gpt-5", "stance": "for"}, + {"model": openai_model, "stance": "for"}, {"model": "gemini-2.5-flash", "stance": "against"}, ] @@ -105,7 +134,7 @@ async def test_consensus_multi_model_consultations(monkeypatch): step1_data = json.loads(step1_response[0].text) assert step1_data["status"] == "analysis_and_first_model_consulted" - assert step1_data["model_consulted"] == "gpt-5" + assert step1_data["model_consulted"] == openai_model assert step1_data["model_response"]["status"] == "success" assert step1_data["model_response"]["metadata"]["provider"] == "openai" assert step1_data["model_response"]["verdict"] @@ -118,7 +147,7 @@ async def test_consensus_multi_model_consultations(monkeypatch): summary_for_step2 = step1_data["model_response"]["verdict"][:200] step2_arguments = { - "step": f"Incorporated gpt-5 perspective: {summary_for_step2}", + "step": f"Incorporated {openai_model} perspective: {summary_for_step2}", "step_number": 2, "total_steps": len(models_to_consult), "next_step_required": False, @@ -138,7 +167,7 @@ async def test_consensus_multi_model_consultations(monkeypatch): assert step2_data["model_response"]["metadata"]["provider"] == "google" assert step2_data["model_response"]["verdict"] assert step2_data["complete_consensus"]["models_consulted"] == [ - "gpt-5:for", + f"{openai_model}:for", "gemini-2.5-flash:against", ] assert step2_data["consensus_complete"] is True @@ -159,7 +188,7 @@ async def test_consensus_multi_model_consultations(monkeypatch): gemini_provider._client = None # Ensure cassettes exist for future replays - assert CONSENSUS_CASSETTE_PATH.exists() + assert consensus_cassette_path.exists() assert GEMINI_REPLAY_PATH.exists() # Clean up provider registry state after test diff --git a/tests/test_intelligent_fallback.py b/tests/test_intelligent_fallback.py index 1def89e..d2736c4 100644 --- a/tests/test_intelligent_fallback.py +++ b/tests/test_intelligent_fallback.py @@ -37,14 +37,14 @@ class TestIntelligentFallback: @patch.dict(os.environ, {"OPENAI_API_KEY": "sk-test-key", "GEMINI_API_KEY": ""}, clear=False) def test_prefers_openai_o3_mini_when_available(self): - """Test that gpt-5 is preferred when OpenAI API key is available (based on new preference order)""" + """Test that gpt-5.1 is preferred when OpenAI API key is available (based on new preference order)""" # Register only OpenAI provider for this test from providers.openai import OpenAIModelProvider ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider) fallback_model = ModelProviderRegistry.get_preferred_fallback_model() - assert fallback_model == "gpt-5" # Based on new preference order: gpt-5 before o4-mini + assert fallback_model == "gpt-5.1" # Based on new preference order: gpt-5.1 before o4-mini @patch.dict(os.environ, {"OPENAI_API_KEY": "", "GEMINI_API_KEY": "test-gemini-key"}, clear=False) def test_prefers_gemini_flash_when_openai_unavailable(self): @@ -147,8 +147,8 @@ class TestIntelligentFallback: history, tokens = 
build_conversation_history(context, model_context=None) - # Verify that ModelContext was called with gpt-5 (the intelligent fallback based on new preference order) - mock_context_class.assert_called_once_with("gpt-5") + # Verify that ModelContext was called with gpt-5.1 (the intelligent fallback based on new preference order) + mock_context_class.assert_called_once_with("gpt-5.1") def test_auto_mode_with_gemini_only(self): """Test auto mode behavior when only Gemini API key is available""" diff --git a/tests/test_openai_provider.py b/tests/test_openai_provider.py index 6ffb5d2..764143c 100644 --- a/tests/test_openai_provider.py +++ b/tests/test_openai_provider.py @@ -50,6 +50,9 @@ class TestOpenAIProvider: assert provider.validate_model_name("o4-mini") is True assert provider.validate_model_name("gpt-5") is True assert provider.validate_model_name("gpt-5-mini") is True + assert provider.validate_model_name("gpt-5.1") is True + assert provider.validate_model_name("gpt-5.1-codex") is True + assert provider.validate_model_name("gpt-5.1-codex-mini") is True # Test valid aliases assert provider.validate_model_name("mini") is True @@ -59,6 +62,9 @@ class TestOpenAIProvider: assert provider.validate_model_name("gpt5") is True assert provider.validate_model_name("gpt5-mini") is True assert provider.validate_model_name("gpt5mini") is True + assert provider.validate_model_name("gpt5.1") is True + assert provider.validate_model_name("gpt5.1-codex") is True + assert provider.validate_model_name("codex-mini") is True # Test invalid model assert provider.validate_model_name("invalid-model") is False @@ -77,6 +83,9 @@ class TestOpenAIProvider: assert provider._resolve_model_name("gpt5") == "gpt-5" assert provider._resolve_model_name("gpt5-mini") == "gpt-5-mini" assert provider._resolve_model_name("gpt5mini") == "gpt-5-mini" + assert provider._resolve_model_name("gpt5.1") == "gpt-5.1" + assert provider._resolve_model_name("gpt5.1-codex") == "gpt-5.1-codex" + assert provider._resolve_model_name("codex-mini") == "gpt-5.1-codex-mini" # Test full name passthrough assert provider._resolve_model_name("o3") == "o3" @@ -86,6 +95,9 @@ class TestOpenAIProvider: assert provider._resolve_model_name("o4-mini") == "o4-mini" assert provider._resolve_model_name("gpt-5") == "gpt-5" assert provider._resolve_model_name("gpt-5-mini") == "gpt-5-mini" + assert provider._resolve_model_name("gpt-5.1") == "gpt-5.1" + assert provider._resolve_model_name("gpt-5.1-codex") == "gpt-5.1-codex" + assert provider._resolve_model_name("gpt-5.1-codex-mini") == "gpt-5.1-codex-mini" def test_get_capabilities_o3(self): """Test getting model capabilities for O3.""" @@ -146,6 +158,36 @@ class TestOpenAIProvider: assert capabilities.supports_function_calling is True assert capabilities.supports_temperature is True + def test_get_capabilities_gpt51(self): + """Test GPT-5.1 capabilities reflect new metadata.""" + provider = OpenAIModelProvider("test-key") + + capabilities = provider.get_capabilities("gpt-5.1") + assert capabilities.model_name == "gpt-5.1" + assert capabilities.supports_streaming is True + assert capabilities.supports_function_calling is True + assert capabilities.supports_json_mode is True + assert capabilities.allow_code_generation is True + + def test_get_capabilities_gpt51_codex(self): + """Test GPT-5.1 Codex is responses-only and non-streaming.""" + provider = OpenAIModelProvider("test-key") + + capabilities = provider.get_capabilities("gpt-5.1-codex") + assert capabilities.model_name == "gpt-5.1-codex" + assert 
capabilities.supports_streaming is False + assert capabilities.use_openai_response_api is True + assert capabilities.allow_code_generation is True + + def test_get_capabilities_gpt51_codex_mini(self): + """Test GPT-5.1 Codex mini exposes streaming and code generation.""" + provider = OpenAIModelProvider("test-key") + + capabilities = provider.get_capabilities("gpt-5.1-codex-mini") + assert capabilities.model_name == "gpt-5.1-codex-mini" + assert capabilities.supports_streaming is True + assert capabilities.allow_code_generation is True + @patch("providers.openai_compatible.OpenAI") def test_generate_content_resolves_alias_before_api_call(self, mock_openai_class): """Test that generate_content resolves aliases before making API calls. diff --git a/tests/test_per_tool_model_defaults.py b/tests/test_per_tool_model_defaults.py index 95d4e9a..19b61d6 100644 --- a/tests/test_per_tool_model_defaults.py +++ b/tests/test_per_tool_model_defaults.py @@ -98,8 +98,8 @@ class TestModelSelection: ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider) model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.EXTENDED_REASONING) - # OpenAI prefers GPT-5-Codex for extended reasoning (coding tasks) - assert model == "gpt-5-codex" + # OpenAI prefers GPT-5.1-Codex for extended reasoning (coding tasks) + assert model == "gpt-5.1-codex" def test_extended_reasoning_with_gemini_only(self): """Test EXTENDED_REASONING prefers pro when only Gemini is available.""" @@ -133,8 +133,8 @@ class TestModelSelection: ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider) model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.FAST_RESPONSE) - # OpenAI now prefers gpt-5 for fast response (based on our new preference order) - assert model == "gpt-5" + # OpenAI now prefers gpt-5.1 for fast response (based on our new preference order) + assert model == "gpt-5.1" def test_fast_response_with_gemini_only(self): """Test FAST_RESPONSE prefers flash when only Gemini is available.""" @@ -167,8 +167,8 @@ class TestModelSelection: ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider) model = ModelProviderRegistry.get_preferred_fallback_model(ToolModelCategory.BALANCED) - # OpenAI prefers gpt-5 for balanced (based on our new preference order) - assert model == "gpt-5" + # OpenAI prefers gpt-5.1 for balanced (based on our new preference order) + assert model == "gpt-5.1" def test_no_category_uses_balanced_logic(self): """Test that no category specified uses balanced logic.""" @@ -195,7 +195,7 @@ class TestFlexibleModelSelection: "env": {"OPENAI_API_KEY": "test-key"}, "provider_type": ProviderType.OPENAI, "category": ToolModelCategory.EXTENDED_REASONING, - "expected": "gpt-5-codex", # GPT-5-Codex prioritized for coding tasks + "expected": "gpt-5.1-codex", # GPT-5.1-Codex prioritized for coding tasks }, # Case 2: Gemini provider for fast response { @@ -209,7 +209,7 @@ class TestFlexibleModelSelection: "env": {"OPENAI_API_KEY": "test-key"}, "provider_type": ProviderType.OPENAI, "category": ToolModelCategory.FAST_RESPONSE, - "expected": "gpt-5", # Based on new preference order + "expected": "gpt-5.1", # Based on new preference order }, ] diff --git a/tests/test_providers.py b/tests/test_providers.py index 1697a71..3dfa597 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -209,6 +209,9 @@ class TestOpenAIProvider: assert provider.validate_model_name("o4-mini") assert provider.validate_model_name("o4mini") 
assert provider.validate_model_name("o4-mini") + assert provider.validate_model_name("gpt-5.1") + assert provider.validate_model_name("gpt-5.1-codex") + assert provider.validate_model_name("gpt-5.1-codex-mini") assert not provider.validate_model_name("gpt-4o") assert not provider.validate_model_name("invalid-model") @@ -219,3 +222,20 @@ class TestOpenAIProvider: aliases = ["o3", "o3mini", "o3-mini", "o4-mini", "o4mini"] for alias in aliases: assert not provider.get_capabilities(alias).supports_extended_thinking + + def test_gpt51_family_capabilities(self): + """Ensure GPT-5.1 family exposes correct capability flags.""" + provider = OpenAIModelProvider(api_key="test-key") + + base = provider.get_capabilities("gpt-5.1") + assert base.supports_streaming + assert base.allow_code_generation + + codex = provider.get_capabilities("gpt-5.1-codex") + assert not codex.supports_streaming + assert codex.use_openai_response_api + assert codex.allow_code_generation + + codex_mini = provider.get_capabilities("gpt-5.1-codex-mini") + assert codex_mini.supports_streaming + assert codex_mini.allow_code_generation diff --git a/tests/test_supported_models_aliases.py b/tests/test_supported_models_aliases.py index 16be740..6713a91 100644 --- a/tests/test_supported_models_aliases.py +++ b/tests/test_supported_models_aliases.py @@ -54,6 +54,9 @@ class TestSupportedModelsAliases: assert "o3mini" in provider.MODEL_CAPABILITIES["o3-mini"].aliases assert "o3pro" in provider.MODEL_CAPABILITIES["o3-pro"].aliases assert "gpt4.1" in provider.MODEL_CAPABILITIES["gpt-4.1"].aliases + assert "gpt5.1" in provider.MODEL_CAPABILITIES["gpt-5.1"].aliases + assert "gpt5.1-codex" in provider.MODEL_CAPABILITIES["gpt-5.1-codex"].aliases + assert "codex-mini" in provider.MODEL_CAPABILITIES["gpt-5.1-codex-mini"].aliases # Test alias resolution assert provider._resolve_model_name("mini") == "gpt-5-mini" # mini -> gpt-5-mini now @@ -61,10 +64,14 @@ class TestSupportedModelsAliases: assert provider._resolve_model_name("o3pro") == "o3-pro" # o3pro resolves to o3-pro assert provider._resolve_model_name("o4mini") == "o4-mini" assert provider._resolve_model_name("gpt4.1") == "gpt-4.1" # gpt4.1 resolves to gpt-4.1 + assert provider._resolve_model_name("gpt5.1") == "gpt-5.1" + assert provider._resolve_model_name("gpt5.1-codex") == "gpt-5.1-codex" + assert provider._resolve_model_name("codex-mini") == "gpt-5.1-codex-mini" # Test case insensitive resolution assert provider._resolve_model_name("Mini") == "gpt-5-mini" # mini -> gpt-5-mini now assert provider._resolve_model_name("O3MINI") == "o3-mini" + assert provider._resolve_model_name("Gpt5.1") == "gpt-5.1" def test_xai_provider_aliases(self): """Test XAI provider's alias structure."""