From 3c4b1368c60e0be0eeb2262c66bee0fbebae638c Mon Sep 17 00:00:00 2001 From: Fahad Date: Fri, 3 Oct 2025 13:07:42 +0400 Subject: [PATCH] test: http cassettes added for improved testing of consensus --- .../step2_gemini25_flash_against/mldev.json | 133 ++++++++++++++ .../consensus_step1_gpt5_for.json | 81 +++++++++ tests/test_consensus_integration.py | 166 ++++++++++++++++++ 3 files changed, 380 insertions(+) create mode 100644 tests/gemini_cassettes/consensus/step2_gemini25_flash_against/mldev.json create mode 100644 tests/openai_cassettes/consensus_step1_gpt5_for.json create mode 100644 tests/test_consensus_integration.py diff --git a/tests/gemini_cassettes/consensus/step2_gemini25_flash_against/mldev.json b/tests/gemini_cassettes/consensus/step2_gemini25_flash_against/mldev.json new file mode 100644 index 0000000..90a0584 --- /dev/null +++ b/tests/gemini_cassettes/consensus/step2_gemini25_flash_against/mldev.json @@ -0,0 +1,133 @@ +{ + "replay_id": "tests.consensus/swiftui_vs_uikit/step2_gemini25_flash_against", + "interactions": [ + { + "request": { + "method": "post", + "url": "{MLDEV_URL_PREFIX}/models/gemini-2.5-flash:generateContent", + "headers": { + "Content-Type": "application/json", + "x-goog-api-key": "{REDACTED}", + "user-agent": "google-genai-sdk/{VERSION_NUMBER} {LANGUAGE_LABEL}/{VERSION_NUMBER}", + "x-goog-api-client": "google-genai-sdk/{VERSION_NUMBER} {LANGUAGE_LABEL}/{VERSION_NUMBER}" + }, + "body_segments": [ + { + "contents": [ + { + "parts": [ + { + "text": "\nROLE\nYou are an expert technical consultant providing consensus analysis on proposals, plans, and ideas. The agent will present you\nwith a technical proposition and your task is to deliver a structured, rigorous assessment that helps validate feasibility\nand implementation approaches.\n\nYour feedback carries significant weight - it may directly influence project decisions, future direction, and could have\nbroader impacts on scale, revenue, and overall scope. 
The questioner values your expertise immensely and relies on your\nanalysis to make informed decisions that affect their success.\n\nCRITICAL LINE NUMBER INSTRUCTIONS\nCode is presented with line number markers \"LINE│ code\". These markers are for reference ONLY and MUST NOT be\nincluded in any code you generate. Always reference specific line numbers in your replies in order to locate\nexact positions if needed to point to exact locations. Include a very short code excerpt alongside for clarity.\nInclude context_start_text and context_end_text as backup references. Never include \"LINE│\" markers in generated code\nsnippets.\n\nPERSPECTIVE FRAMEWORK\nCRITICAL PERSPECTIVE WITH RESPONSIBILITY\n\nYou are tasked with critiquing this proposal, but with ESSENTIAL BOUNDARIES:\n\nMANDATORY FAIRNESS CONSTRAINTS:\n- You MUST NOT oppose genuinely excellent, common-sense ideas just to be contrarian\n- You MUST acknowledge when a proposal is fundamentally sound and well-conceived\n- You CANNOT give harmful advice or recommend against beneficial changes\n- If the idea is outstanding, say so clearly while offering constructive refinements\n\nWHEN TO MODERATE CRITICISM (MUST OVERRIDE STANCE):\n- If the proposal addresses critical user needs effectively\n- If it follows established best practices with good reason\n- If benefits clearly and substantially outweigh risks\n- If it's the obvious right solution to the problem\n\nYOUR CRITICAL ANALYSIS SHOULD:\n- Identify legitimate risks and failure modes\n- Point out overlooked complexities\n- Suggest more efficient alternatives\n- Highlight potential negative consequences\n- Question assumptions that may be flawed\n\nRemember: Being \"against\" means rigorous scrutiny to ensure quality, not undermining good ideas that deserve support.\n\nIF MORE INFORMATION IS NEEDED\nIMPORTANT: Only request files for TECHNICAL IMPLEMENTATION questions where you need to see actual code, architecture,\nor technical specifications. 
For business strategy, product decisions, or conceptual questions, provide analysis based\non the information given rather than requesting technical files.\n\nIf you need additional technical context (e.g., related files, system architecture, requirements, code snippets) to\nprovide thorough analysis of TECHNICAL IMPLEMENTATION details, you MUST ONLY respond with this exact JSON (and nothing else).\nDo NOT ask for the same file you've been provided unless for some reason its content is missing or incomplete:\n{\n \"status\": \"files_required_to_continue\",\n \"mandatory_instructions\": \"\",\n \"files_needed\": [\"[file name here]\", \"[or some folder/]\"]\n}\n\nFor business strategy, product planning, or conceptual questions, proceed with analysis using your expertise and the\ncontext provided, even if specific technical details are not available.\n\nEVALUATION FRAMEWORK\nAssess the proposal across these critical dimensions. Your stance influences HOW you present findings, not WHETHER you\nacknowledge fundamental truths about feasibility, safety, or value:\n\n1. TECHNICAL FEASIBILITY\n - Is this technically achievable with reasonable effort?\n - What are the core technical dependencies and requirements?\n - Are there any fundamental technical blockers?\n\n2. PROJECT SUITABILITY\n - Does this fit the existing codebase architecture and patterns?\n - Is it compatible with current technology stack and constraints?\n - How well does it align with the project's technical direction?\n\n3. USER VALUE ASSESSMENT\n - Will users actually want and use this feature?\n - What concrete benefits does this provide?\n - How does this compare to alternative solutions?\n\n4. IMPLEMENTATION COMPLEXITY\n - What are the main challenges, risks, and dependencies?\n - What is the estimated effort and timeline?\n - What expertise and resources are required?\n\n5. 
ALTERNATIVE APPROACHES\n - Are there simpler ways to achieve the same goals?\n - What are the trade-offs between different approaches?\n - Should we consider a different strategy entirely?\n\n6. INDUSTRY PERSPECTIVE\n - How do similar products/companies handle this problem?\n - What are current best practices and emerging patterns?\n - Are there proven solutions or cautionary tales?\n\n7. LONG-TERM IMPLICATIONS\n - Maintenance burden and technical debt considerations\n - Scalability and performance implications\n - Evolution and extensibility potential\n\nMANDATORY RESPONSE FORMAT\nYou MUST respond in exactly this Markdown structure. Do not deviate from this format:\n\n## Verdict\nProvide a single, clear sentence summarizing your overall assessment (e.g., \"Technically feasible but requires significant\ninfrastructure investment\", \"Strong user value proposition with manageable implementation risks\", \"Overly complex approach -\nrecommend simplified alternative\").\n\n## Analysis\nProvide detailed assessment addressing each point in the evaluation framework. Use clear reasoning and specific examples.\nBe thorough but concise. Address both strengths and weaknesses objectively.\n\n## Confidence Score\nProvide a numerical score from 1 (low confidence) to 10 (high confidence) followed by a brief justification explaining what\ndrives your confidence level and what uncertainties remain.\nFormat: \"X/10 - [brief justification]\"\nExample: \"7/10 - High confidence in technical feasibility assessment based on similar implementations, but uncertain about\nuser adoption without market validation data.\"\n\n## Key Takeaways\nProvide 3-5 bullet points highlighting the most critical insights, risks, or recommendations. 
These should be actionable\nand specific.\n\nQUALITY STANDARDS\n- Ground all insights in the current project's scope and constraints\n- Be honest about limitations and uncertainties\n- Focus on practical, implementable solutions rather than theoretical possibilities\n- Provide specific, actionable guidance rather than generic advice\n- Balance optimism with realistic risk assessment\n- Reference concrete examples and precedents when possible\n\nREMINDERS\n- Your assessment will be synthesized with other expert opinions by the agent\n- Aim to provide unique insights that complement other perspectives\n- If files are provided, reference specific technical details in your analysis\n- Maintain professional objectivity while being decisive in your recommendations\n- Keep your response concise - your entire reply must not exceed 850 tokens to ensure transport compatibility\n- CRITICAL: Your stance does NOT override your responsibility to provide truthful, ethical, and beneficial guidance\n- Bad ideas must be called out regardless of stance; good ideas must be acknowledged regardless of stance\n\n\nEvaluate SwiftUI vs UIKit adoption and recommend ONE word (SwiftUI or UIKit)." 
+ } + ] + } + ], + "generationConfig": { + "temperature": 0.2, + "candidateCount": 1, + "thinkingConfig": { + "thinkingBudget": 8110 + } + } + } + ] + }, + "response": { + "status_code": 200, + "headers": { + "content-type": "application/json; charset=UTF-8", + "vary": "Origin, X-Origin, Referer", + "content-encoding": "gzip", + "date": "Fri, 03 Oct 2025 08:55:35 GMT", + "server": "scaffolding on HTTPServer2", + "x-xss-protection": "0", + "x-frame-options": "SAMEORIGIN", + "x-content-type-options": "nosniff", + "server-timing": "gfet4t7; dur=26537", + "alt-svc": "h3=\":443\"; ma=2592000,h3-29=\":443\"; ma=2592000", + "transfer-encoding": "chunked" + }, + "body_segments": [ + { + "candidates": [ + { + "content": { + "parts": [ + { + "text": "## Verdict\nFor new development and strategic long-term alignment with Apple's ecosystem, the recommended adoption is **SwiftUI**.\n\n## Analysis\n**1. TECHNICAL FEASIBILITY**\nBoth SwiftUI and UIKit are technically feasible for building robust iOS applications. SwiftUI, being Apple's modern declarative UI framework, offers a more streamlined and intuitive approach to UI development, especially for new features and applications. It leverages Swift's language features for a more concise and readable codebase. UIKit, while mature and powerful, often requires more boilerplate and an imperative approach. SwiftUI's feasibility is tied to supporting newer OS versions (iOS 13+), which might be a consideration for projects requiring broader legacy device support.\n\n**2. PROJECT SUITABILITY**\nFor *new projects*, SwiftUI is highly suitable, aligning with current best practices and Apple's strategic direction. It promotes modularity and reusability. For *existing UIKit projects*, a full migration to SwiftUI is a significant undertaking and generally not recommended as a \"big bang\" approach. 
However, SwiftUI is highly suitable for integrating new features or screens into existing UIKit applications through `UIHostingController`, allowing for a gradual adoption strategy. This hybrid approach is a common and effective pattern.\n\n**3. USER VALUE ASSESSMENT**\nUsers do not directly interact with the underlying UI framework. However, SwiftUI's declarative nature and built-in features (e.g., accessibility, dark mode, localization) can enable developers to build more polished, responsive, and feature-rich UIs faster. This can indirectly translate to higher user satisfaction through quicker feature delivery and a more consistent user experience across Apple's platforms. SwiftUI's ability to easily target multiple platforms (iOS, macOS, watchOS, tvOS) from a single codebase also offers potential for broader reach and consistent branding.\n\n**4. IMPLEMENTATION COMPLEXITY**\nSwiftUI generally reduces implementation complexity for common UI patterns due to its declarative syntax, live previews, and automatic layout management. This can lead to faster development cycles and fewer lines of code compared to UIKit. However, for highly custom or complex UI components, or when interacting with older system APIs not yet exposed in SwiftUI, developers might still need to bridge to UIKit using `UIViewRepresentable` or `UIViewControllerRepresentable`, which adds a layer of complexity. The learning curve for developers accustomed to UIKit's imperative style can also be a factor.\n\n**5. ALTERNATIVE APPROACHES**\nWithin the native Apple ecosystem, the primary alternative is UIKit. While UIKit remains a powerful and necessary framework for many existing applications and specific use cases, SwiftUI represents the future of Apple UI development. 
Other alternatives like cross-platform frameworks (e.g., React Native, Flutter) exist but are outside the scope of a native framework comparison and introduce their own set of trade-offs regarding performance, native look-and-feel, and access to platform-specific features.\n\n**6. INDUSTRY PERSPECTIVE**\nThe industry trend is a clear shift towards SwiftUI. Apple actively promotes SwiftUI, and major companies are increasingly adopting it for new applications and features. While UIKit expertise remains valuable, new hires are often expected to have SwiftUI knowledge. Hybrid applications are common, demonstrating a pragmatic approach to transitioning. SwiftUI is considered the modern best practice for new Apple platform UI development.\n\n**7. LONG-TERM IMPLICATIONS**\nAdopting SwiftUI offers significant long-term benefits. It aligns with Apple's future vision, ensuring better support for new OS features and hardware. It can lead to a more maintainable codebase due to its declarative nature and reduced boilerplate. Scalability and extensibility are enhanced by its component-based architecture. Investing in SwiftUI now reduces future technical debt related to maintaining an older UI framework and positions the project for easier expansion across Apple's device ecosystem.\n\n## Confidence Score\n9/10 - High confidence in recommending SwiftUI for future adoption based on Apple's clear strategic direction, industry trends, and the inherent benefits of declarative UI frameworks. 
The slight uncertainty (1 point) acknowledges that specific legacy project constraints (e.g., very old OS support requirements, deep reliance on highly custom UIKit components) might necessitate a more nuanced, slower, or hybrid approach.\n\n## Key Takeaways\n* **Strategic Alignment:** SwiftUI is Apple's future for UI development; adopting it ensures long-term alignment and access to new platform features.\n* **Developer Productivity:** Expect faster development cycles and reduced boilerplate for new UI components due to SwiftUI's declarative nature and live previews.\n* **Hybrid Approach for Existing Projects:** For existing UIKit codebases, a gradual, hybrid adoption strategy using `UIHostingController` is the most pragmatic and recommended path.\n* **OS Version Consideration:** Ensure target OS version compatibility (iOS 13+) aligns with project requirements before full SwiftUI commitment.\n* **Cross-Platform Potential:** SwiftUI offers significant advantages for building UIs across iOS, macOS, watchOS, and tvOS from a unified codebase." 
+ } + ], + "role": "model" + }, + "finishReason": "STOP", + "index": 0 + } + ], + "usageMetadata": { + "promptTokenCount": 1404, + "candidatesTokenCount": 1019, + "totalTokenCount": 4824, + "promptTokensDetails": [ + { + "modality": "TEXT", + "tokenCount": 1404 + } + ], + "thoughtsTokenCount": 2401 + }, + "modelVersion": "gemini-2.5-flash", + "responseId": "h4_faNzUJc2JxN8PzPW6uA8" + } + ], + "byte_segments": [], + "sdk_response_segments": [ + { + "sdk_http_response": { + "headers": { + "content-type": "application/json; charset=UTF-8", + "vary": "Origin, X-Origin, Referer", + "content-encoding": "gzip", + "date": "Fri, 03 Oct 2025 08:55:35 GMT", + "server": "scaffolding on HTTPServer2", + "x-xss-protection": "0", + "x-frame-options": "SAMEORIGIN", + "x-content-type-options": "nosniff", + "server-timing": "gfet4t7; dur=26537", + "alt-svc": "h3=\":443\"; ma=2592000,h3-29=\":443\"; ma=2592000", + "transfer-encoding": "chunked" + } + }, + "candidates": [ + { + "content": { + "parts": [ + { + "text": "## Verdict\nFor new development and strategic long-term alignment with Apple's ecosystem, the recommended adoption is **SwiftUI**.\n\n## Analysis\n**1. TECHNICAL FEASIBILITY**\nBoth SwiftUI and UIKit are technically feasible for building robust iOS applications. SwiftUI, being Apple's modern declarative UI framework, offers a more streamlined and intuitive approach to UI development, especially for new features and applications. It leverages Swift's language features for a more concise and readable codebase. UIKit, while mature and powerful, often requires more boilerplate and an imperative approach. SwiftUI's feasibility is tied to supporting newer OS versions (iOS 13+), which might be a consideration for projects requiring broader legacy device support.\n\n**2. PROJECT SUITABILITY**\nFor *new projects*, SwiftUI is highly suitable, aligning with current best practices and Apple's strategic direction. It promotes modularity and reusability. 
For *existing UIKit projects*, a full migration to SwiftUI is a significant undertaking and generally not recommended as a \"big bang\" approach. However, SwiftUI is highly suitable for integrating new features or screens into existing UIKit applications through `UIHostingController`, allowing for a gradual adoption strategy. This hybrid approach is a common and effective pattern.\n\n**3. USER VALUE ASSESSMENT**\nUsers do not directly interact with the underlying UI framework. However, SwiftUI's declarative nature and built-in features (e.g., accessibility, dark mode, localization) can enable developers to build more polished, responsive, and feature-rich UIs faster. This can indirectly translate to higher user satisfaction through quicker feature delivery and a more consistent user experience across Apple's platforms. SwiftUI's ability to easily target multiple platforms (iOS, macOS, watchOS, tvOS) from a single codebase also offers potential for broader reach and consistent branding.\n\n**4. IMPLEMENTATION COMPLEXITY**\nSwiftUI generally reduces implementation complexity for common UI patterns due to its declarative syntax, live previews, and automatic layout management. This can lead to faster development cycles and fewer lines of code compared to UIKit. However, for highly custom or complex UI components, or when interacting with older system APIs not yet exposed in SwiftUI, developers might still need to bridge to UIKit using `UIViewRepresentable` or `UIViewControllerRepresentable`, which adds a layer of complexity. The learning curve for developers accustomed to UIKit's imperative style can also be a factor.\n\n**5. ALTERNATIVE APPROACHES**\nWithin the native Apple ecosystem, the primary alternative is UIKit. While UIKit remains a powerful and necessary framework for many existing applications and specific use cases, SwiftUI represents the future of Apple UI development. 
Other alternatives like cross-platform frameworks (e.g., React Native, Flutter) exist but are outside the scope of a native framework comparison and introduce their own set of trade-offs regarding performance, native look-and-feel, and access to platform-specific features.\n\n**6. INDUSTRY PERSPECTIVE**\nThe industry trend is a clear shift towards SwiftUI. Apple actively promotes SwiftUI, and major companies are increasingly adopting it for new applications and features. While UIKit expertise remains valuable, new hires are often expected to have SwiftUI knowledge. Hybrid applications are common, demonstrating a pragmatic approach to transitioning. SwiftUI is considered the modern best practice for new Apple platform UI development.\n\n**7. LONG-TERM IMPLICATIONS**\nAdopting SwiftUI offers significant long-term benefits. It aligns with Apple's future vision, ensuring better support for new OS features and hardware. It can lead to a more maintainable codebase due to its declarative nature and reduced boilerplate. Scalability and extensibility are enhanced by its component-based architecture. Investing in SwiftUI now reduces future technical debt related to maintaining an older UI framework and positions the project for easier expansion across Apple's device ecosystem.\n\n## Confidence Score\n9/10 - High confidence in recommending SwiftUI for future adoption based on Apple's clear strategic direction, industry trends, and the inherent benefits of declarative UI frameworks. 
The slight uncertainty (1 point) acknowledges that specific legacy project constraints (e.g., very old OS support requirements, deep reliance on highly custom UIKit components) might necessitate a more nuanced, slower, or hybrid approach.\n\n## Key Takeaways\n* **Strategic Alignment:** SwiftUI is Apple's future for UI development; adopting it ensures long-term alignment and access to new platform features.\n* **Developer Productivity:** Expect faster development cycles and reduced boilerplate for new UI components due to SwiftUI's declarative nature and live previews.\n* **Hybrid Approach for Existing Projects:** For existing UIKit codebases, a gradual, hybrid adoption strategy using `UIHostingController` is the most pragmatic and recommended path.\n* **OS Version Consideration:** Ensure target OS version compatibility (iOS 13+) aligns with project requirements before full SwiftUI commitment.\n* **Cross-Platform Potential:** SwiftUI offers significant advantages for building UIs across iOS, macOS, watchOS, and tvOS from a unified codebase." + } + ], + "role": "model" + }, + "finish_reason": "STOP", + "index": 0 + } + ], + "model_version": "gemini-2.5-flash", + "response_id": "h4_faNzUJc2JxN8PzPW6uA8", + "usage_metadata": { + "candidates_token_count": 1019, + "prompt_token_count": 1404, + "prompt_tokens_details": [ + { + "modality": "TEXT", + "token_count": 1404 + } + ], + "thoughts_token_count": 2401, + "total_token_count": 4824 + } + } + ] + } + } + ] +} \ No newline at end of file diff --git a/tests/openai_cassettes/consensus_step1_gpt5_for.json b/tests/openai_cassettes/consensus_step1_gpt5_for.json new file mode 100644 index 0000000..95d04d0 --- /dev/null +++ b/tests/openai_cassettes/consensus_step1_gpt5_for.json @@ -0,0 +1,81 @@ +{ + "interactions": [ + { + "request": { + "content": { + "messages": [ + { + "content": "\nROLE\nYou are an expert technical consultant providing consensus analysis on proposals, plans, and ideas. 
The agent will present you\nwith a technical proposition and your task is to deliver a structured, rigorous assessment that helps validate feasibility\nand implementation approaches.\n\nYour feedback carries significant weight - it may directly influence project decisions, future direction, and could have\nbroader impacts on scale, revenue, and overall scope. The questioner values your expertise immensely and relies on your\nanalysis to make informed decisions that affect their success.\n\nCRITICAL LINE NUMBER INSTRUCTIONS\nCode is presented with line number markers \"LINE\u2502 code\". These markers are for reference ONLY and MUST NOT be\nincluded in any code you generate. Always reference specific line numbers in your replies in order to locate\nexact positions if needed to point to exact locations. Include a very short code excerpt alongside for clarity.\nInclude context_start_text and context_end_text as backup references. Never include \"LINE\u2502\" markers in generated code\nsnippets.\n\nPERSPECTIVE FRAMEWORK\nSUPPORTIVE PERSPECTIVE WITH INTEGRITY\n\nYou are tasked with advocating FOR this proposal, but with CRITICAL GUARDRAILS:\n\nMANDATORY ETHICAL CONSTRAINTS:\n- This is NOT a debate for entertainment. 
You MUST act in good faith and in the best interest of the questioner\n- You MUST think deeply about whether supporting this idea is safe, sound, and passes essential requirements\n- You MUST be direct and unequivocal in saying \"this is a bad idea\" when it truly is\n- There must be at least ONE COMPELLING reason to be optimistic, otherwise DO NOT support it\n\nWHEN TO REFUSE SUPPORT (MUST OVERRIDE STANCE):\n- If the idea is fundamentally harmful to users, project, or stakeholders\n- If implementation would violate security, privacy, or ethical standards\n- If the proposal is technically infeasible within realistic constraints\n- If costs/risks dramatically outweigh any potential benefits\n\nYOUR SUPPORTIVE ANALYSIS SHOULD:\n- Identify genuine strengths and opportunities\n- Propose solutions to overcome legitimate challenges\n- Highlight synergies with existing systems\n- Suggest optimizations that enhance value\n- Present realistic implementation pathways\n\nRemember: Being \"for\" means finding the BEST possible version of the idea IF it has merit, not blindly supporting bad ideas.\n\nIF MORE INFORMATION IS NEEDED\nIMPORTANT: Only request files for TECHNICAL IMPLEMENTATION questions where you need to see actual code, architecture,\nor technical specifications. 
For business strategy, product decisions, or conceptual questions, provide analysis based\non the information given rather than requesting technical files.\n\nIf you need additional technical context (e.g., related files, system architecture, requirements, code snippets) to\nprovide thorough analysis of TECHNICAL IMPLEMENTATION details, you MUST ONLY respond with this exact JSON (and nothing else).\nDo NOT ask for the same file you've been provided unless for some reason its content is missing or incomplete:\n{\n \"status\": \"files_required_to_continue\",\n \"mandatory_instructions\": \"\",\n \"files_needed\": [\"[file name here]\", \"[or some folder/]\"]\n}\n\nFor business strategy, product planning, or conceptual questions, proceed with analysis using your expertise and the\ncontext provided, even if specific technical details are not available.\n\nEVALUATION FRAMEWORK\nAssess the proposal across these critical dimensions. Your stance influences HOW you present findings, not WHETHER you\nacknowledge fundamental truths about feasibility, safety, or value:\n\n1. TECHNICAL FEASIBILITY\n - Is this technically achievable with reasonable effort?\n - What are the core technical dependencies and requirements?\n - Are there any fundamental technical blockers?\n\n2. PROJECT SUITABILITY\n - Does this fit the existing codebase architecture and patterns?\n - Is it compatible with current technology stack and constraints?\n - How well does it align with the project's technical direction?\n\n3. USER VALUE ASSESSMENT\n - Will users actually want and use this feature?\n - What concrete benefits does this provide?\n - How does this compare to alternative solutions?\n\n4. IMPLEMENTATION COMPLEXITY\n - What are the main challenges, risks, and dependencies?\n - What is the estimated effort and timeline?\n - What expertise and resources are required?\n\n5. 
ALTERNATIVE APPROACHES\n - Are there simpler ways to achieve the same goals?\n - What are the trade-offs between different approaches?\n - Should we consider a different strategy entirely?\n\n6. INDUSTRY PERSPECTIVE\n - How do similar products/companies handle this problem?\n - What are current best practices and emerging patterns?\n - Are there proven solutions or cautionary tales?\n\n7. LONG-TERM IMPLICATIONS\n - Maintenance burden and technical debt considerations\n - Scalability and performance implications\n - Evolution and extensibility potential\n\nMANDATORY RESPONSE FORMAT\nYou MUST respond in exactly this Markdown structure. Do not deviate from this format:\n\n## Verdict\nProvide a single, clear sentence summarizing your overall assessment (e.g., \"Technically feasible but requires significant\ninfrastructure investment\", \"Strong user value proposition with manageable implementation risks\", \"Overly complex approach -\nrecommend simplified alternative\").\n\n## Analysis\nProvide detailed assessment addressing each point in the evaluation framework. Use clear reasoning and specific examples.\nBe thorough but concise. Address both strengths and weaknesses objectively.\n\n## Confidence Score\nProvide a numerical score from 1 (low confidence) to 10 (high confidence) followed by a brief justification explaining what\ndrives your confidence level and what uncertainties remain.\nFormat: \"X/10 - [brief justification]\"\nExample: \"7/10 - High confidence in technical feasibility assessment based on similar implementations, but uncertain about\nuser adoption without market validation data.\"\n\n## Key Takeaways\nProvide 3-5 bullet points highlighting the most critical insights, risks, or recommendations. 
These should be actionable\nand specific.\n\nQUALITY STANDARDS\n- Ground all insights in the current project's scope and constraints\n- Be honest about limitations and uncertainties\n- Focus on practical, implementable solutions rather than theoretical possibilities\n- Provide specific, actionable guidance rather than generic advice\n- Balance optimism with realistic risk assessment\n- Reference concrete examples and precedents when possible\n\nREMINDERS\n- Your assessment will be synthesized with other expert opinions by the agent\n- Aim to provide unique insights that complement other perspectives\n- If files are provided, reference specific technical details in your analysis\n- Maintain professional objectivity while being decisive in your recommendations\n- Keep your response concise - your entire reply must not exceed 850 tokens to ensure transport compatibility\n- CRITICAL: Your stance does NOT override your responsibility to provide truthful, ethical, and beneficial guidance\n- Bad ideas must be called out regardless of stance; good ideas must be acknowledged regardless of stance\n", + "role": "system" + }, + { + "content": "Evaluate SwiftUI vs UIKit adoption and recommend ONE word (SwiftUI or UIKit).", + "role": "user" + } + ], + "model": "gpt-5", + "temperature": 1.0 + }, + "headers": { + "accept": "application/json", + "accept-encoding": "gzip, deflate", + "authorization": "Bearer SANITIZED", + "connection": "keep-alive", + "content-length": "7601", + "content-type": "application/json", + "host": "api.openai.com", + "user-agent": "OpenAI/Python 2.1.0", + "x-stainless-arch": "arm64", + "x-stainless-async": "false", + "x-stainless-lang": "python", + "x-stainless-os": "MacOS", + "x-stainless-package-version": "2.1.0", + "x-stainless-read-timeout": "900.0", + "x-stainless-retry-count": "0", + "x-stainless-runtime": "CPython", + "x-stainless-runtime-version": "3.12.11" + }, + "method": "POST", + "path": "/v1/chat/completions", + "url": 
"https://api.openai.com/v1/chat/completions" + }, + "response": { + "content": { + "data": "ewogICJpZCI6ICJjaGF0Y21wbC1DTVZxazZKUEE1SEpxeWdISDJKZ3hFSlNMbTJnTSIsCiAgIm9iamVjdCI6ICJjaGF0LmNvbXBsZXRpb24iLAogICJjcmVhdGVkIjogMTc1OTQ4MTY4MiwKICAibW9kZWwiOiAiZ3B0LTUtMjAyNS0wOC0wNyIsCiAgImNob2ljZXMiOiBbCiAgICB7CiAgICAgICJpbmRleCI6IDAsCiAgICAgICJtZXNzYWdlIjogewogICAgICAgICJyb2xlIjogImFzc2lzdGFudCIsCiAgICAgICAgImNvbnRlbnQiOiAiIyMgVmVyZGljdFxuU3dpZnRVSSDigJQgdGhlIHJpZ2h0IGRlZmF1bHQgZm9yIG1vc3QgbW9kZXJuIGlPUyBwcm9qZWN0cywgd2l0aCBVSUtpdCByZXNlcnZlZCBmb3IgbGVnYWN5IGludGVncmF0aW9uIG9yIGhpZ2hseSBzcGVjaWFsaXplZCByZW5kZXJpbmcgbmVlZHMuXG5cbiMjIEFuYWx5c2lzXG4xLiBURUNITklDQUwgRkVBU0lCSUxJVFlcbiAgIC0gU3dpZnRVSSBpcyBwcm9kdWN0aW9uLXJlYWR5IG9uIGlPUyAxNisgd2l0aCBtYXR1cmUgbmF2aWdhdGlvbiAoTmF2aWdhdGlvblN0YWNrKSwgbGlzdHMsIGFuaW1hdGlvbnMsIGFuZCBjb25jdXJyZW5jeSBpbnRlZ3JhdGlvbjsgaW50ZXJvcCBicmlkZ2VzIChVSUhvc3RpbmdDb250cm9sbGVyLCBVSVZpZXdSZXByZXNlbnRhYmxlL0NvbnRyb2xsZXJSZXByZXNlbnRhYmxlKSBjb3ZlciBnYXBzLlxuICAgLSBObyBmdW5kYW1lbnRhbCBibG9ja2VycyBmb3IgdHlwaWNhbCBhcHAgVUlzOyBlZGdlIGNhc2VzIHJlbWFpbiBmb3IgaGlnaGx5IGN1c3RvbSB0ZXh0L2xheW91dCwgYWR2YW5jZWQgZ2VzdHVyZSBhcmJpdHJhdGlvbiwgb3IgbG93LWxldmVsIGdyYXBoaWNzIHBpcGVsaW5lcy5cblxuMi4gUFJPSkVDVCBTVUlUQUJJTElUWVxuICAgLSBOZXcgb3IgYWN0aXZlbHkgbW9kZXJuaXplZCBjb2RlYmFzZXMgYmVuZWZpdCBmcm9tIFN3aWZ0VUnigJlzIGRlY2xhcmF0aXZlIHBhdHRlcm5zIGFuZCBTd2lmdCBDb25jdXJyZW5jeSBhbGlnbm1lbnQuXG4gICAtIFVJS2l0IHJlbWFpbnMgYXBwcm9wcmlhdGUgaWYgeW91IG11c3Q6IHN1cHBvcnQgb2xkZXIgT1NlcywgcmV1c2UgZXh0ZW5zaXZlIFVJS2l0IGNvbXBvbmVudHMsIG9yIGxldmVyYWdlIGJhdHRsZS10ZXN0ZWQgY3VzdG9tIGNvbnRhaW5lcnMuXG5cbjMuIFVTRVIgVkFMVUUgQVNTRVNTTUVOVFxuICAgLSBGYXN0ZXIgaXRlcmF0aW9uLCBtb3JlIGNvbnNpc3RlbnQgdGhlbWluZywgYWNjZXNzaWJpbGl0eSwgYW5kIGR5bmFtaWMgdHlwZSBzdXBwb3J0IHRyYW5zbGF0ZSB0byBiZXR0ZXIgVVggd2l0aCBmZXdlciBkZWZlY3RzLlxuICAgLSBTd2lmdFVJ4oCZcyBwcmV2aWV3LWRyaXZlbiB3b3JrZmxvdyBzaG9ydGVucyBmZWVkYmFjayBsb29wcywgdHlwaWNhbGx5IGluY3JlYXNpbmcgcG9saXNoLlxuXG40LiBJTVBMRU1FTlRBVElPTiBDT01QTEVYSVRZXG4gICAtIFN3aWZ0VUkgbG93
ZXJzIGJvaWxlcnBsYXRlIGFuZCByZWR1Y2VzIHN0YXRlLW1hbmFnZW1lbnQgY29tcGxleGl0eSB2aWEgT2JzZXJ2YWJsZS9TdGF0ZS9FbnZpcm9ubWVudE9iamVjdCBhbmQgYXN5bmMvYXdhaXQuXG4gICAtIFJpc2tzOiBsZWFybmluZyBjdXJ2ZSBmb3IgYWR2YW5jZWQgcGF0dGVybnMsIG9jY2FzaW9uYWwgZnJhbWV3b3JrIHF1aXJrcyBuZWVkaW5nIHdvcmthcm91bmRzIG9yIFVJS2l0IGZhbGxiYWNrcy5cbiAgIC0gVGVhbSBuZWVkcyBjb21mb3J0IHdpdGggU3dpZnQgQ29uY3VycmVuY3kgYW5kIE1WVk0tc3R5bGUgYXJjaGl0ZWN0dXJlLlxuXG41LiBBTFRFUk5BVElWRSBBUFBST0FDSEVTXG4gICAtIEh5YnJpZCBhcHByb2FjaDogYWRvcHQgU3dpZnRVSSBmb3IgbmV3IHNjcmVlbnMgYW5kIHdyYXAgbGVnYWN5IFVJS2l0IHdoZXJlIG5lZWRlZDsgZGUtcmlzayBtaWdyYXRpb24gd2hpbGUgY2FwdHVyaW5nIGJlbmVmaXRzLlxuICAgLSBUcmFkZS1vZmZzOiBzbWFsbCBpbnRlcm9wIG92ZXJoZWFkIHZzLiBzdGVhZHkgbW9kZXJuaXphdGlvbiB3aXRob3V0IGJpZy1iYW5nIHJld3JpdGVzLlxuXG42LiBJTkRVU1RSWSBQRVJTUEVDVElWRVxuICAgLSBBcHBsZeKAmXMgcm9hZG1hcCBjbGVhcmx5IHByaW9yaXRpemVzIFN3aWZ0VUk7IFdXREMgZmVhdHVyZXMgaW5jcmVhc2luZ2x5IGxhbmQgdGhlcmUgZmlyc3QuXG4gICAtIE1hbnkgbGFyZ2UgYXBwcyBydW4gaHlicmlkIHN0YWNrcyBzdWNjZXNzZnVsbHk7IGJlc3QgcHJhY3RpY2VzIGNlbnRlciBvbiBtb2R1bGFyIGJvdW5kYXJpZXMgYW5kIGludGVyb3Agd3JhcHBlcnMuXG5cbjcuIExPTkctVEVSTSBJTVBMSUNBVElPTlNcbiAgIC0gTG93ZXIgbWFpbnRlbmFuY2UgYW5kIGdyZWF0ZXIgZXh0ZW5zaWJpbGl0eSBhcyBBcHBsZSBpbnZlc3RzIGluIFN3aWZ0VUkuXG4gICAtIEJldHRlciBzY2FsYWJpbGl0eSBvZiBVSSBjb2RlIHZpYSBkZWNsYXJhdGl2ZSBwYXR0ZXJucyBhbmQgcHJldmlldy1kcml2ZW4gZGV2ZWxvcG1lbnQuXG4gICAtIFRlY2huaWNhbCBkZWJ0IHJpc2sgZGVjcmVhc2VzIHdoZW4gYXZvaWRpbmcgZHVwbGljYXRpdmUgVUlLaXQgYWJzdHJhY3Rpb25zIG1vdmluZyBmb3J3YXJkLlxuXG4jIyBDb25maWRlbmNlIFNjb3JlXG44LzEwIC0gU3Ryb25nIGNvbmZpZGVuY2UgZ2l2ZW4gQXBwbGXigJlzIHRyYWplY3RvcnkgYW5kIHdpZGVzcHJlYWQgc3VjY2Vzc2Z1bCBoeWJyaWQgYWRvcHRpb25zOyBtaW5vciB1bmNlcnRhaW50eSByZW1haW5zIGFyb3VuZCBuaWNoZSBjb250cm9scyBhbmQgaGlnaGx5IGN1c3RvbWl6ZWQgVUkgd2hlcmUgVUlLaXQgbWF5IHN0aWxsIGJlIHN1cGVyaW9yLlxuXG4jIyBLZXkgVGFrZWF3YXlzXG4tIFJlY29tbWVuZGF0aW9uOiBTd2lmdFVJLlxuLSBVc2UgYSBoeWJyaWQgbWlncmF0aW9uOiBuZXcgZmVhdHVyZXMgaW4gU3dpZnRVSSwgd3JhcCBvciByZXRhaW4gY3JpdGljYWwgVUlLaXQgY29tcG9uZW50cyBzaG9ydC10ZXJtLlxuLSBUYXJnZXQgaU9T
IDE2KyB0byBhdm9pZCBTd2lmdFVJIEFQSSBnYXBzOyBlbmZvcmNlIGNvbmN1cnJlbmN5IGFuZCBNVlZNIHBhdHRlcm5zLlxuLSBFc3RhYmxpc2ggaW50ZXJvcCBib3VuZGFyaWVzIGVhcmx5IChIb3N0aW5nQ29udHJvbGxlci9SZXByZXNlbnRhYmxlKSB0byBtYW5hZ2Ugcmlzay5cbi0gUmVzZXJ2ZSBVSUtpdCBmb3Igc3BlY2lhbGl6ZWQgcmVuZGVyaW5nLCBjb21wbGV4IHRleHQvbGF5b3V0LCBvciBkZWVwbHkgZW50cmVuY2hlZCBsZWdhY3kgc2NyZWVucy4iLAogICAgICAgICJyZWZ1c2FsIjogbnVsbCwKICAgICAgICAiYW5ub3RhdGlvbnMiOiBbXQogICAgICB9LAogICAgICAiZmluaXNoX3JlYXNvbiI6ICJzdG9wIgogICAgfQogIF0sCiAgInVzYWdlIjogewogICAgInByb21wdF90b2tlbnMiOiAxNDE2LAogICAgImNvbXBsZXRpb25fdG9rZW5zIjogMTA5OSwKICAgICJ0b3RhbF90b2tlbnMiOiAyNTE1LAogICAgInByb21wdF90b2tlbnNfZGV0YWlscyI6IHsKICAgICAgImNhY2hlZF90b2tlbnMiOiAxMjgwLAogICAgICAiYXVkaW9fdG9rZW5zIjogMAogICAgfSwKICAgICJjb21wbGV0aW9uX3Rva2Vuc19kZXRhaWxzIjogewogICAgICAicmVhc29uaW5nX3Rva2VucyI6IDUxMiwKICAgICAgImF1ZGlvX3Rva2VucyI6IDAsCiAgICAgICJhY2NlcHRlZF9wcmVkaWN0aW9uX3Rva2VucyI6IDAsCiAgICAgICJyZWplY3RlZF9wcmVkaWN0aW9uX3Rva2VucyI6IDAKICAgIH0KICB9LAogICJzZXJ2aWNlX3RpZXIiOiAiZGVmYXVsdCIsCiAgInN5c3RlbV9maW5nZXJwcmludCI6IG51bGwKfQo=", + "encoding": "base64", + "size": 3722 + }, + "headers": { + "access-control-expose-headers": "X-Request-ID", + "alt-svc": "h3=\":443\"; ma=86400", + "cf-cache-status": "DYNAMIC", + "cf-ray": "988b3762de588e29-DXB", + "connection": "keep-alive", + "content-encoding": "gzip", + "content-type": "application/json", + "date": "Fri, 03 Oct 2025 08:55:08 GMT", + "openai-organization": "beehive-innovations-fze", + "openai-processing-ms": "25923", + "openai-project": "proj_QP57xBVPOlWpp0vuJEPGwXK3", + "openai-version": "2020-10-01", + "server": "cloudflare", + "set-cookie": "__cf_bm=ZnG_u6hderauNIiWiuihCO5.i_Z0aEP9EjjknD17jPM-(XXX) XXX-XXXX-0.0.0.0-3vTRRzyustARkWI8n8LjlrXpVDzsqmtucj0WPtoT4v2hZHgdVgqfl0LlzS6uJ0_iCkxt2toE_WscAiswh2GDV8sfCfy4ICU7YCoDAZeFVlc; path=/; expires=Fri, 03-Oct-25 09:25:08 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None, _cfuvid=xesF489hc55fFC2NjFaGsl0esJNo83k.BNs8wvxUmaI-175(XXX) 
XXX-XXXX-0.0.0.0-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None", + "strict-transport-security": "max-age=31536000; includeSubDomains; preload", + "transfer-encoding": "chunked", + "x-content-type-options": "nosniff", + "x-envoy-upstream-service-time": "26101", + "x-openai-proxy-wasm": "v0.1", + "x-ratelimit-limit-requests": "500", + "x-ratelimit-limit-tokens": "500000", + "x-ratelimit-remaining-requests": "499", + "x-ratelimit-remaining-tokens": "498166", + "x-ratelimit-reset-requests": "120ms", + "x-ratelimit-reset-tokens": "220ms", + "x-request-id": "req_aa2b5181f76a4c978adcd1c5a4ef630d" + }, + "reason_phrase": "OK", + "status_code": 200 + } + } + ] +} \ No newline at end of file diff --git a/tests/test_consensus_integration.py b/tests/test_consensus_integration.py new file mode 100644 index 0000000..a652173 --- /dev/null +++ b/tests/test_consensus_integration.py @@ -0,0 +1,166 @@ +"""Integration test for ConsensusTool using OpenAI and Gemini recordings.""" + +from __future__ import annotations + +import json +import os +from pathlib import Path + +import pytest + +from providers.registry import ModelProviderRegistry +from providers.shared import ProviderType +from tests.transport_helpers import inject_transport +from tools.consensus import ConsensusTool + +# Directories for recorded HTTP interactions +CASSETTE_DIR = Path(__file__).parent / "openai_cassettes" +CASSETTE_DIR.mkdir(exist_ok=True) +CONSENSUS_CASSETTE_PATH = CASSETTE_DIR / "consensus_step1_gpt5_for.json" + +GEMINI_REPLAY_DIR = Path(__file__).parent / "gemini_cassettes" +GEMINI_REPLAY_DIR.mkdir(exist_ok=True) +GEMINI_REPLAY_ID = "consensus/step2_gemini25_flash_against/mldev" +GEMINI_REPLAY_PATH = GEMINI_REPLAY_DIR / "consensus" / "step2_gemini25_flash_against" / "mldev.json" + + +@pytest.mark.asyncio +@pytest.mark.no_mock_provider +async def test_consensus_multi_model_consultations(monkeypatch): + """Exercise ConsensusTool against gpt-5 (supporting) and gemini-2.5-flash
(critical).""" + + env_updates = { + "DEFAULT_MODEL": "auto", + "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", ""), + "GEMINI_API_KEY": os.getenv("GEMINI_API_KEY", ""), + } + keys_to_clear = [ + "XAI_API_KEY", + "OPENROUTER_API_KEY", + "ANTHROPIC_API_KEY", + "MISTRAL_API_KEY", + "CUSTOM_API_KEY", + "CUSTOM_API_URL", + ] + + recording_mode = not CONSENSUS_CASSETTE_PATH.exists() or not GEMINI_REPLAY_PATH.exists() + if recording_mode: + openai_key = env_updates["OPENAI_API_KEY"].strip() + gemini_key = env_updates["GEMINI_API_KEY"].strip() + if (not openai_key or openai_key.startswith("dummy")) or (not gemini_key or gemini_key.startswith("dummy")): + pytest.skip( + "Consensus cassette missing and OPENAI_API_KEY/GEMINI_API_KEY not configured. Provide real keys to record." + ) + + GEMINI_REPLAY_PATH.parent.mkdir(parents=True, exist_ok=True) + + with monkeypatch.context() as m: + m.setenv("DEFAULT_MODEL", env_updates["DEFAULT_MODEL"]) + + if recording_mode: + m.setenv("OPENAI_API_KEY", env_updates["OPENAI_API_KEY"]) + m.setenv("GEMINI_API_KEY", env_updates["GEMINI_API_KEY"]) + m.setenv("GOOGLE_GENAI_CLIENT_MODE", "record") + else: + m.setenv("OPENAI_API_KEY", "dummy-key-for-replay") + m.setenv("GEMINI_API_KEY", "dummy-key-for-replay") + m.setenv("GOOGLE_GENAI_CLIENT_MODE", "replay") + + m.setenv("GOOGLE_GENAI_REPLAYS_DIRECTORY", str(GEMINI_REPLAY_DIR)) + m.setenv("GOOGLE_GENAI_REPLAY_ID", GEMINI_REPLAY_ID) + + for key in keys_to_clear: + m.delenv(key, raising=False) + + # Reset providers and register only OpenAI & Gemini for deterministic behavior + ModelProviderRegistry.reset_for_testing() + from providers.gemini import GeminiModelProvider + from providers.openai_provider import OpenAIModelProvider + + ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider) + ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider) + + # Inject HTTP transport for OpenAI interactions + inject_transport(monkeypatch, CONSENSUS_CASSETTE_PATH) 
+ + tool = ConsensusTool() + + models_to_consult = [ + {"model": "gpt-5", "stance": "for"}, + {"model": "gemini-2.5-flash", "stance": "against"}, + ] + + # Step 1: CLI agent analysis followed by first model consultation + step1_arguments = { + "step": "Evaluate SwiftUI vs UIKit adoption and recommend ONE word (SwiftUI or UIKit).", + "step_number": 1, + "total_steps": len(models_to_consult), + "next_step_required": True, + "findings": "SwiftUI momentum is strong but UIKit remains battle-tested.", + "models": models_to_consult, + } + + step1_response = await tool.execute(step1_arguments) + assert step1_response and step1_response[0].type == "text" + step1_data = json.loads(step1_response[0].text) + + assert step1_data["status"] == "analysis_and_first_model_consulted" + assert step1_data["model_consulted"] == "gpt-5" + assert step1_data["model_response"]["status"] == "success" + assert step1_data["model_response"]["metadata"]["provider"] == "openai" + assert step1_data["model_response"]["verdict"] + + continuation_offer = step1_data.get("continuation_offer") + assert continuation_offer is not None + continuation_id = continuation_offer["continuation_id"] + + # Prepare step 2 inputs using the first model's response summary + summary_for_step2 = step1_data["model_response"]["verdict"][:200] + + step2_arguments = { + "step": f"Incorporated gpt-5 perspective: {summary_for_step2}", + "step_number": 2, + "total_steps": len(models_to_consult), + "next_step_required": False, + "findings": "Ready to gather opposing stance before synthesis.", + "continuation_id": continuation_id, + "current_model_index": step1_data.get("current_model_index", 1), + "model_responses": step1_data.get("model_responses", []), + } + + step2_response = await tool.execute(step2_arguments) + + assert step2_response and step2_response[0].type == "text" + step2_data = json.loads(step2_response[0].text) + + assert step2_data["status"] == "consensus_workflow_complete" + assert step2_data["model_consulted"] 
== "gemini-2.5-flash" + assert step2_data["model_response"]["metadata"]["provider"] == "google" + assert step2_data["model_response"]["verdict"] + assert step2_data["complete_consensus"]["models_consulted"] == [ + "gpt-5:for", + "gemini-2.5-flash:against", + ] + assert step2_data["consensus_complete"] is True + + continuation_offer_final = step2_data.get("continuation_offer") + assert continuation_offer_final is not None + assert continuation_offer_final["continuation_id"] == continuation_id + + # Ensure Gemini replay session is flushed to disk before verification + gemini_provider = ModelProviderRegistry.get_provider_for_model("gemini-2.5-flash") + if gemini_provider is not None: + try: + client = gemini_provider.client + if hasattr(client, "close"): + client.close() + finally: + if hasattr(gemini_provider, "_client"): + gemini_provider._client = None + + # Ensure cassettes exist for future replays + assert CONSENSUS_CASSETTE_PATH.exists() + assert GEMINI_REPLAY_PATH.exists() + + # Clean up provider registry state after test + ModelProviderRegistry.reset_for_testing()