diff --git a/tests/gemini_cassettes/chat_cross/step1_gemini25_flash_number/mldev.json b/tests/gemini_cassettes/chat_cross/step1_gemini25_flash_number/mldev.json
new file mode 100644
index 0000000..498f894
--- /dev/null
+++ b/tests/gemini_cassettes/chat_cross/step1_gemini25_flash_number/mldev.json
@@ -0,0 +1,147 @@
+{
+  "replay_id": "chat_cross/step1_gemini25_flash_number/mldev",
+  "interactions": [
+    {
+      "request": {
+        "method": "post",
+        "url": "{MLDEV_URL_PREFIX}/models/gemini-2.5-flash:generateContent",
+        "headers": {
+          "Content-Type": "application/json",
+          "x-goog-api-key": "{REDACTED}",
+          "user-agent": "google-genai-sdk/{VERSION_NUMBER} {LANGUAGE_LABEL}/{VERSION_NUMBER}",
+          "x-goog-api-client": "google-genai-sdk/{VERSION_NUMBER} {LANGUAGE_LABEL}/{VERSION_NUMBER}"
+        },
+        "body_segments": [
+          {
+            "contents": [
+              {
+                "parts": [
+                  {
+                    "text": "\nYou are a senior engineering thought-partner collaborating with another AI agent. Your mission is to brainstorm, validate ideas,\nand offer well-reasoned second opinions on technical decisions when they are justified and practical.\n\nCRITICAL LINE NUMBER INSTRUCTIONS\nCode is presented with line number markers \"LINE│ code\". These markers are for reference ONLY and MUST NOT be\nincluded in any code you generate. Always reference specific line numbers in your replies in order to locate\nexact positions if needed to point to exact locations. Include a very short code excerpt alongside for clarity.\nInclude context_start_text and context_end_text as backup references. Never include \"LINE│\" markers in generated code\nsnippets.\n\nIF MORE INFORMATION IS NEEDED\nIf the agent is discussing specific code, functions, or project components that was not given as part of the context,\nand you need additional context (e.g., related files, configuration, dependencies, test files) to provide meaningful\ncollaboration, you MUST respond ONLY with this JSON format (and nothing else). Do NOT ask for the same file you've been\nprovided unless for some reason its content is missing or incomplete:\n{\n \"status\": \"files_required_to_continue\",\n \"mandatory_instructions\": \"\",\n \"files_needed\": [\"[file name here]\", \"[or some folder/]\"]\n}\n\nSCOPE & FOCUS\n• Ground every suggestion in the project's current tech stack, languages, frameworks, and constraints.\n• Recommend new technologies or patterns ONLY when they provide clearly superior outcomes with minimal added complexity.\n• Avoid speculative, over-engineered, or unnecessarily abstract designs that exceed current project goals or needs.\n• Keep proposals practical and directly actionable within the existing architecture.\n• Overengineering is an anti-pattern — avoid solutions that introduce unnecessary abstraction, indirection, or\n configuration in anticipation of complexity that does not yet exist, is not clearly justified by the current scope,\n and may not arise in the foreseeable future.\n\nCOLLABORATION APPROACH\n1. Engage deeply with the agent's input – extend, refine, and explore alternatives ONLY WHEN they are well-justified and materially beneficial.\n2. Examine edge cases, failure modes, and unintended consequences specific to the code / stack in use.\n3. Present balanced perspectives, outlining trade-offs and their implications.\n4. Challenge assumptions constructively while respecting current design choices and goals.\n5. Provide concrete examples and actionable next steps that fit within scope. Prioritize direct, achievable outcomes.\n\nBRAINSTORMING GUIDELINES\n• Offer multiple viable strategies ONLY WHEN clearly beneficial within the current environment.\n• Suggest creative solutions that operate within real-world constraints, and avoid proposing major shifts unless truly warranted.\n• Surface pitfalls early, particularly those tied to the chosen frameworks, languages, design direction or choice.\n• Evaluate scalability, maintainability, and operational realities inside the existing architecture and current\nframework.\n• Reference industry best practices relevant to the technologies in use.\n• Communicate concisely and technically, assuming an experienced engineering audience.\n\nREMEMBER\nAct as a peer, not a lecturer. Avoid overcomplicating. Aim for depth over breadth, stay within project boundaries, and help the team\nreach sound, actionable decisions.\n\n\n=== USER REQUEST ===\nPick a number between 1 and 10 and respond with JUST that number.\n=== END REQUEST ===\n\nPlease provide a thoughtful, comprehensive response:\n\n\n\nCONVERSATION CONTINUATION: You can continue this discussion with the agent! (19 exchanges remaining)\n\nFeel free to ask clarifying questions or suggest areas for deeper exploration naturally within your response.\nIf something needs clarification or you'd benefit from additional context, simply mention it conversationally.\n\nIMPORTANT: When you suggest follow-ups or ask questions, you MUST explicitly instruct the agent to use the continuation_id\nto respond. Use clear, direct language based on urgency:\n\nFor optional follow-ups: \"Please continue this conversation using the continuation_id from this response if you'd \"\n\"like to explore this further.\"\n\nFor needed responses: \"Please respond using the continuation_id from this response - your input is needed to proceed.\"\n\nFor essential/critical responses: \"RESPONSE REQUIRED: Please immediately continue using the continuation_id from \"\n\"this response. Cannot proceed without your clarification/input.\"\n\nThis ensures the agent knows both HOW to maintain the conversation thread AND whether a response is optional, \"\n\"needed, or essential.\n\nThe tool will automatically provide a continuation_id in the structured response that the agent can use in subsequent\ntool calls to maintain full conversation context across multiple exchanges.\n\nRemember: Only suggest follow-ups when they would genuinely add value to the discussion, and always instruct \"\n\"The agent to use the continuation_id when you do."
+                  }
+                ]
+              }
+            ],
+            "generationConfig": {
+              "temperature": 0.2,
+              "candidateCount": 1,
+              "thinkingConfig": {
+                "thinkingBudget": 8110
+              }
+            }
+          }
+        ]
+      },
+      "response": {
+        "status_code": 200,
+        "headers": {
+          "content-type": "application/json; charset=UTF-8",
+          "vary": "Origin, X-Origin, Referer",
+          "content-encoding": "gzip",
+          "date": "Fri, 03 Oct 2025 09:49:39 GMT",
+          "server": "scaffolding on HTTPServer2",
+          "x-xss-protection": "0",
+          "x-frame-options": "SAMEORIGIN",
+          "x-content-type-options": "nosniff",
+          "server-timing": "gfet4t7; dur=22365",
+          "alt-svc": "h3=\":443\"; ma=2592000,h3-29=\":443\"; ma=2592000",
+          "transfer-encoding": "chunked"
+        },
+        "body_segments": [
+          {
+            "candidates": [
+              {
+                "content": {
+                  "parts": [
+                    {
+                      "text": "7"
+                    }
+                  ],
+                  "role": "model"
+                },
+                "finishReason": "STOP",
+                "index": 0
+              }
+            ],
+            "usageMetadata": {
+              "promptTokenCount": 1008,
+              "candidatesTokenCount": 1,
+              "totalTokenCount": 3956,
+              "cachedContentTokenCount": 992,
+              "promptTokensDetails": [
+                {
+                  "modality": "TEXT",
+                  "tokenCount": 1008
+                }
+              ],
+              "cacheTokensDetails": [
+                {
+                  "modality": "TEXT",
+                  "tokenCount": 992
+                }
+              ],
+              "thoughtsTokenCount": 2947
+            },
+            "modelVersion": "gemini-2.5-flash",
+            "responseId": "M5zfaOjyJvaZkdUPvPf70Ao"
+          }
+        ],
+        "byte_segments": [],
+        "sdk_response_segments": [
+          {
+            "sdk_http_response": {
+              "headers": {
+                "content-type": "application/json; charset=UTF-8",
+                "vary": "Origin, X-Origin, Referer",
+                "content-encoding": "gzip",
+                "date": "Fri, 03 Oct 2025 09:49:39 GMT",
+                "server": "scaffolding on HTTPServer2",
+                "x-xss-protection": "0",
+                "x-frame-options": "SAMEORIGIN",
+                "x-content-type-options": "nosniff",
+                "server-timing": "gfet4t7; dur=22365",
+                "alt-svc": "h3=\":443\"; ma=2592000,h3-29=\":443\"; ma=2592000",
+                "transfer-encoding": "chunked"
+              }
+            },
+            "candidates": [
+              {
+                "content": {
+                  "parts": [
+                    {
+                      "text": "7"
+                    }
+                  ],
+                  "role": "model"
+                },
+                "finish_reason": "STOP",
+                "index": 0
+              }
+            ],
+            "model_version": "gemini-2.5-flash",
+            "response_id": "M5zfaOjyJvaZkdUPvPf70Ao",
+            "usage_metadata": {
+              "cache_tokens_details": [
+                {
+                  "modality": "TEXT",
+                  "token_count": 992
+                }
+              ],
+              "cached_content_token_count": 992,
+              "candidates_token_count": 1,
+              "prompt_token_count": 1008,
+              "prompt_tokens_details": [
+                {
+                  "modality": "TEXT",
+                  "token_count": 1008
+                }
+              ],
+              "thoughts_token_count": 2947,
+              "total_token_count": 3956
+            }
+          }
+        ]
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/tests/openai_cassettes/chat_cross_step2_gpt5_reminder.json b/tests/openai_cassettes/chat_cross_step2_gpt5_reminder.json
new file mode 100644
index 0000000..bace375
--- /dev/null
+++ b/tests/openai_cassettes/chat_cross_step2_gpt5_reminder.json
@@ -0,0 +1,81 @@
+{
+  "interactions": [
+    {
+      "request": {
+        "content": {
+          "messages": [
+            {
+              "content": "\nYou are a senior engineering thought-partner collaborating with another AI agent. Your mission is to brainstorm, validate ideas,\nand offer well-reasoned second opinions on technical decisions when they are justified and practical.\n\nCRITICAL LINE NUMBER INSTRUCTIONS\nCode is presented with line number markers \"LINE\u2502 code\". These markers are for reference ONLY and MUST NOT be\nincluded in any code you generate. Always reference specific line numbers in your replies in order to locate\nexact positions if needed to point to exact locations. Include a very short code excerpt alongside for clarity.\nInclude context_start_text and context_end_text as backup references. Never include \"LINE\u2502\" markers in generated code\nsnippets.\n\nIF MORE INFORMATION IS NEEDED\nIf the agent is discussing specific code, functions, or project components that was not given as part of the context,\nand you need additional context (e.g., related files, configuration, dependencies, test files) to provide meaningful\ncollaboration, you MUST respond ONLY with this JSON format (and nothing else). Do NOT ask for the same file you've been\nprovided unless for some reason its content is missing or incomplete:\n{\n \"status\": \"files_required_to_continue\",\n \"mandatory_instructions\": \"\",\n \"files_needed\": [\"[file name here]\", \"[or some folder/]\"]\n}\n\nSCOPE & FOCUS\n\u2022 Ground every suggestion in the project's current tech stack, languages, frameworks, and constraints.\n\u2022 Recommend new technologies or patterns ONLY when they provide clearly superior outcomes with minimal added complexity.\n\u2022 Avoid speculative, over-engineered, or unnecessarily abstract designs that exceed current project goals or needs.\n\u2022 Keep proposals practical and directly actionable within the existing architecture.\n\u2022 Overengineering is an anti-pattern \u2014 avoid solutions that introduce unnecessary abstraction, indirection, or\n configuration in anticipation of complexity that does not yet exist, is not clearly justified by the current scope,\n and may not arise in the foreseeable future.\n\nCOLLABORATION APPROACH\n1. Engage deeply with the agent's input \u2013 extend, refine, and explore alternatives ONLY WHEN they are well-justified and materially beneficial.\n2. Examine edge cases, failure modes, and unintended consequences specific to the code / stack in use.\n3. Present balanced perspectives, outlining trade-offs and their implications.\n4. Challenge assumptions constructively while respecting current design choices and goals.\n5. Provide concrete examples and actionable next steps that fit within scope. Prioritize direct, achievable outcomes.\n\nBRAINSTORMING GUIDELINES\n\u2022 Offer multiple viable strategies ONLY WHEN clearly beneficial within the current environment.\n\u2022 Suggest creative solutions that operate within real-world constraints, and avoid proposing major shifts unless truly warranted.\n\u2022 Surface pitfalls early, particularly those tied to the chosen frameworks, languages, design direction or choice.\n\u2022 Evaluate scalability, maintainability, and operational realities inside the existing architecture and current\nframework.\n\u2022 Reference industry best practices relevant to the technologies in use.\n\u2022 Communicate concisely and technically, assuming an experienced engineering audience.\n\nREMEMBER\nAct as a peer, not a lecturer. Avoid overcomplicating. Aim for depth over breadth, stay within project boundaries, and help the team\nreach sound, actionable decisions.\n",
+              "role": "system"
+            },
+            {
+              "content": "=== CONVERSATION HISTORY (CONTINUATION) ===\nThread: dbadc23e-c0f4-4853-982f-6c5bc722b5de\nTool: chat\nTurn 3/20\nYou are continuing this conversation thread from where it left off.\n\nPrevious conversation turns:\n\n--- Turn 1 (Agent using chat) ---\nPick a number between 1 and 10 and respond with JUST that number.\n\n--- Turn 2 (gemini-2.5-flash using chat via google) ---\n7\n\n---\n\nAGENT'S TURN: Evaluate this perspective alongside your analysis to form a comprehensive solution and continue with the user's request and task at hand.\n\n--- Turn 3 (Agent) ---\nRemind me, what number did you pick, respond with JUST that number.\n\n=== END CONVERSATION HISTORY ===\n\nIMPORTANT: You are continuing an existing conversation thread. Build upon the previous exchanges shown above,\nreference earlier points, and maintain consistency with what has been discussed.\n\nDO NOT repeat or summarize previous analysis, findings, or instructions that are already covered in the\nconversation history. Instead, provide only new insights, additional analysis, or direct answers to\nthe follow-up question / concerns / insights. Assume the user has read the prior conversation.\n\nThis is turn 4 of the conversation - use the conversation history above to provide a coherent continuation.\n\n=== NEW USER INPUT ===\n=== USER REQUEST ===\nRemind me, what number did you pick, respond with JUST that number.\n=== END REQUEST ===\n\nPlease provide a thoughtful, comprehensive response:",
+              "role": "user"
+            }
+          ],
+          "model": "gpt-5",
+          "temperature": 1.0
+        },
+        "headers": {
+          "accept": "application/json",
+          "accept-encoding": "gzip, deflate",
+          "authorization": "Bearer SANITIZED",
+          "connection": "keep-alive",
+          "content-length": "5147",
+          "content-type": "application/json",
+          "host": "api.openai.com",
+          "user-agent": "OpenAI/Python 2.1.0",
+          "x-stainless-arch": "arm64",
+          "x-stainless-async": "false",
+          "x-stainless-lang": "python",
+          "x-stainless-os": "MacOS",
+          "x-stainless-package-version": "2.1.0",
+          "x-stainless-read-timeout": "900.0",
+          "x-stainless-retry-count": "0",
+          "x-stainless-runtime": "CPython",
+          "x-stainless-runtime-version": "3.12.11"
+        },
+        "method": "POST",
+        "path": "/v1/chat/completions",
+        "url": "https://api.openai.com/v1/chat/completions"
+      },
+      "response": {
+        "content": {
+          "data": "ewogICJpZCI6ICJjaGF0Y21wbC1DTVdodzFCNEo3TUhjb0xWWVhsN0M1d0F1elF4ViIsCiAgIm9iamVjdCI6ICJjaGF0LmNvbXBsZXRpb24iLAogICJjcmVhdGVkIjogMTc1OTQ4NDk4MCwKICAibW9kZWwiOiAiZ3B0LTUtMjAyNS0wOC0wNyIsCiAgImNob2ljZXMiOiBbCiAgICB7CiAgICAgICJpbmRleCI6IDAsCiAgICAgICJtZXNzYWdlIjogewogICAgICAgICJyb2xlIjogImFzc2lzdGFudCIsCiAgICAgICAgImNvbnRlbnQiOiAiNyIsCiAgICAgICAgInJlZnVzYWwiOiBudWxsLAogICAgICAgICJhbm5vdGF0aW9ucyI6IFtdCiAgICAgIH0sCiAgICAgICJmaW5pc2hfcmVhc29uIjogInN0b3AiCiAgICB9CiAgXSwKICAidXNhZ2UiOiB7CiAgICAicHJvbXB0X3Rva2VucyI6IDk3OCwKICAgICJjb21wbGV0aW9uX3Rva2VucyI6IDEzOCwKICAgICJ0b3RhbF90b2tlbnMiOiAxMTE2LAogICAgInByb21wdF90b2tlbnNfZGV0YWlscyI6IHsKICAgICAgImNhY2hlZF90b2tlbnMiOiAwLAogICAgICAiYXVkaW9fdG9rZW5zIjogMAogICAgfSwKICAgICJjb21wbGV0aW9uX3Rva2Vuc19kZXRhaWxzIjogewogICAgICAicmVhc29uaW5nX3Rva2VucyI6IDEyOCwKICAgICAgImF1ZGlvX3Rva2VucyI6IDAsCiAgICAgICJhY2NlcHRlZF9wcmVkaWN0aW9uX3Rva2VucyI6IDAsCiAgICAgICJyZWplY3RlZF9wcmVkaWN0aW9uX3Rva2VucyI6IDAKICAgIH0KICB9LAogICJzZXJ2aWNlX3RpZXIiOiAiZGVmYXVsdCIsCiAgInN5c3RlbV9maW5nZXJwcmludCI6IG51bGwKfQo=",
+          "encoding": "base64",
+          "size": 773
+        },
+ "headers": { + "access-control-expose-headers": "X-Request-ID", + "alt-svc": "h3=\":443\"; ma=86400", + "cf-cache-status": "DYNAMIC", + "cf-ray": "988b87e45f493415-DXB", + "connection": "keep-alive", + "content-encoding": "gzip", + "content-type": "application/json", + "date": "Fri, 03 Oct 2025 09:49:44 GMT", + "openai-organization": "beehive-innovations-fze", + "openai-processing-ms": "4345", + "openai-project": "proj_QP57xBVPOlWpp0vuJEPGwXK3", + "openai-version": "2020-10-01", + "server": "cloudflare", + "set-cookie": "__cf_bm=e4bPz0bDm.dZJPLs3DPSKIwVIK8uneu4..2bKR2EQM0-(XXX) XXX-XXXX-0.0.0.0-WptFnm.vM1sv_FhbO.QyisQbBmt2EspWDGAGduxDjCigpQGqTo0_DD6R7mlsC4zQO9A9epqiD8HM.fmNizAGrOVWpTu9CJ7nj7ut0R665.4; path=/; expires=Fri, 03-Oct-25 10:19:44 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None, _cfuvid=ahmlAfnEVavlLJLjK38OXrjlx0TE95CkXTxcY2SuDxk-175(XXX) XXX-XXXX-0.0.0.0-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None", + "strict-transport-security": "max-age=31536000; includeSubDomains; preload", + "transfer-encoding": "chunked", + "x-content-type-options": "nosniff", + "x-envoy-upstream-service-time": "4497", + "x-openai-proxy-wasm": "v0.1", + "x-ratelimit-limit-requests": "500", + "x-ratelimit-limit-tokens": "500000", + "x-ratelimit-remaining-requests": "499", + "x-ratelimit-remaining-tokens": "498763", + "x-ratelimit-reset-requests": "120ms", + "x-ratelimit-reset-tokens": "148ms", + "x-request-id": "req_5bb7c81e1f76490d8c1aa9247f2f05bb" + }, + "reason_phrase": "OK", + "status_code": 200 + } + } + ] +} \ No newline at end of file diff --git a/tests/test_chat_cross_model_continuation.py b/tests/test_chat_cross_model_continuation.py new file mode 100644 index 0000000..a83684a --- /dev/null +++ b/tests/test_chat_cross_model_continuation.py @@ -0,0 +1,196 @@ +"""Cross-provider continuation tests for ChatTool.""" + +from __future__ import annotations + +import json +import os +import re +import uuid +from pathlib import Path + +import pytest + +from providers.registry import ModelProviderRegistry +from providers.shared import ProviderType +from tests.transport_helpers import inject_transport +from tools.chat import ChatTool + +CASSETTE_DIR = Path(__file__).parent / "openai_cassettes" +CASSETTE_DIR.mkdir(exist_ok=True) +OPENAI_CASSETTE_PATH = CASSETTE_DIR / "chat_cross_step2_gpt5_reminder.json" + +GEMINI_CASSETTE_DIR = Path(__file__).parent / "gemini_cassettes" +GEMINI_CASSETTE_DIR.mkdir(exist_ok=True) +GEMINI_REPLAY_ID = "chat_cross/step1_gemini25_flash_number/mldev" +GEMINI_REPLAY_PATH = GEMINI_CASSETTE_DIR / "chat_cross" / "step1_gemini25_flash_number" / "mldev.json" + +FIXED_THREAD_ID = uuid.UUID("dbadc23e-c0f4-4853-982f-6c5bc722b5de") + + +WORD_TO_NUMBER = { + "one": 1, + "two": 2, + "three": 3, + "four": 4, + "five": 5, + "six": 6, + "seven": 7, + "eight": 8, + "nine": 9, + "ten": 10, +} + + +def _extract_number(text: str) -> str: + digit_match = re.search(r"\b(\d{1,2})\b", text) + if digit_match: + return digit_match.group(1) + + lower_text = text.lower() + for word, value in WORD_TO_NUMBER.items(): + if re.search(rf"\b{word}\b", lower_text): + return str(value) + return "" + + +@pytest.mark.asyncio +@pytest.mark.no_mock_provider +async def test_chat_cross_model_continuation(monkeypatch): + """Verify continuation across Gemini then OpenAI using recorded interactions.""" + + env_updates = { + "DEFAULT_MODEL": "auto", + "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", ""), + "GEMINI_API_KEY": os.getenv("GEMINI_API_KEY", ""), + } + keys_to_clear = [ + 
"XAI_API_KEY", + "OPENROUTER_API_KEY", + "ANTHROPIC_API_KEY", + "MISTRAL_API_KEY", + "CUSTOM_API_KEY", + "CUSTOM_API_URL", + ] + + recording_mode = not OPENAI_CASSETTE_PATH.exists() or not GEMINI_REPLAY_PATH.exists() + if recording_mode: + openai_key = env_updates["OPENAI_API_KEY"].strip() + gemini_key = env_updates["GEMINI_API_KEY"].strip() + if (not openai_key or openai_key.startswith("dummy")) or (not gemini_key or gemini_key.startswith("dummy")): + pytest.skip( + "Cross-provider cassette missing and OPENAI_API_KEY/GEMINI_API_KEY not configured. Provide real keys to record." + ) + + GEMINI_REPLAY_PATH.parent.mkdir(parents=True, exist_ok=True) + + # Step 1 – Gemini picks a number + with monkeypatch.context() as m: + m.setenv("DEFAULT_MODEL", env_updates["DEFAULT_MODEL"]) + if recording_mode: + m.setenv("OPENAI_API_KEY", env_updates["OPENAI_API_KEY"]) + m.setenv("GEMINI_API_KEY", env_updates["GEMINI_API_KEY"]) + m.setenv("GOOGLE_GENAI_CLIENT_MODE", "record") + else: + m.setenv("OPENAI_API_KEY", "dummy-key-for-replay") + m.setenv("GEMINI_API_KEY", "dummy-key-for-replay") + m.setenv("GOOGLE_GENAI_CLIENT_MODE", "replay") + + m.setenv("GOOGLE_GENAI_REPLAYS_DIRECTORY", str(GEMINI_CASSETTE_DIR)) + m.setenv("GOOGLE_GENAI_REPLAY_ID", GEMINI_REPLAY_ID) + + for key in keys_to_clear: + m.delenv(key, raising=False) + + ModelProviderRegistry.reset_for_testing() + from providers.gemini import GeminiModelProvider + from providers.openai_provider import OpenAIModelProvider + + ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider) + ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider) + + from utils import conversation_memory + + m.setattr(conversation_memory.uuid, "uuid4", lambda: FIXED_THREAD_ID) + + chat_tool = ChatTool() + step1_args = { + "prompt": "Pick a number between 1 and 10 and respond with JUST that number.", + "model": "gemini-2.5-flash", + "temperature": 0.2, + } + + step1_result = await chat_tool.execute(step1_args) + assert step1_result and step1_result[0].type == "text" + + step1_data = json.loads(step1_result[0].text) + assert step1_data["status"] in {"success", "continuation_available"} + assert step1_data.get("metadata", {}).get("provider_used") == "google" + continuation_offer = step1_data.get("continuation_offer") + assert continuation_offer is not None + continuation_id = continuation_offer["continuation_id"] + assert continuation_id + + chosen_number = _extract_number(step1_data["content"]) + assert chosen_number.isdigit() + assert 1 <= int(chosen_number) <= 10 + + # Ensure replay is flushed for Gemini recordings + gemini_provider = ModelProviderRegistry.get_provider_for_model("gemini-2.5-flash") + if gemini_provider is not None: + try: + client = gemini_provider.client + if hasattr(client, "close"): + client.close() + finally: + if hasattr(gemini_provider, "_client"): + gemini_provider._client = None + + assert GEMINI_REPLAY_PATH.exists() + + # Step 2 – gpt-5 recalls the number via continuation + with monkeypatch.context() as m: + if recording_mode: + m.setenv("OPENAI_API_KEY", env_updates["OPENAI_API_KEY"]) + m.setenv("GEMINI_API_KEY", env_updates["GEMINI_API_KEY"]) + m.setenv("GOOGLE_GENAI_CLIENT_MODE", "record") + else: + m.setenv("OPENAI_API_KEY", "dummy-key-for-replay") + m.setenv("GEMINI_API_KEY", "dummy-key-for-replay") + m.setenv("GOOGLE_GENAI_CLIENT_MODE", "replay") + + m.setenv("DEFAULT_MODEL", env_updates["DEFAULT_MODEL"]) + m.setenv("GOOGLE_GENAI_REPLAYS_DIRECTORY", str(GEMINI_CASSETTE_DIR)) + 
m.setenv("GOOGLE_GENAI_REPLAY_ID", GEMINI_REPLAY_ID) + for key in keys_to_clear: + m.delenv(key, raising=False) + + ModelProviderRegistry.reset_for_testing() + from providers.gemini import GeminiModelProvider + from providers.openai_provider import OpenAIModelProvider + + ModelProviderRegistry.register_provider(ProviderType.OPENAI, OpenAIModelProvider) + ModelProviderRegistry.register_provider(ProviderType.GOOGLE, GeminiModelProvider) + + inject_transport(monkeypatch, OPENAI_CASSETTE_PATH) + + chat_tool = ChatTool() + step2_args = { + "prompt": "Remind me, what number did you pick, respond with JUST that number.", + "model": "gpt-5", + "continuation_id": continuation_id, + "temperature": 0.2, + } + + step2_result = await chat_tool.execute(step2_args) + assert step2_result and step2_result[0].type == "text" + + step2_data = json.loads(step2_result[0].text) + assert step2_data["status"] in {"success", "continuation_available"} + assert step2_data.get("metadata", {}).get("provider_used") == "openai" + + recalled_number = _extract_number(step2_data["content"]) + assert recalled_number == chosen_number + + assert OPENAI_CASSETTE_PATH.exists() + + ModelProviderRegistry.reset_for_testing()