From 01cda835d9030aea4dca6dd397f290c8862fb613 Mon Sep 17 00:00:00 2001 From: Badri Narayanan S Date: Thu, 25 Dec 2025 13:26:48 +0530 Subject: [PATCH] feat: add prompt caching, sticky account selection, and non-thinking model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Implement sticky account selection for prompt cache continuity - Derive stable session ID from first user message (SHA256 hash) - Return cache_read_input_tokens in usage metadata - Add claude-sonnet-4-5 model without thinking - Remove DEFAULT_THINKING_BUDGET (let API use its default) - Add prompt caching test - Update README and CLAUDE.md documentation 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- CLAUDE.md | 15 ++- README.md | 38 ++----- package.json | 3 +- src/account-manager.js | 117 +++++++++++++++++++-- src/cloudcode-client.js | 117 +++++++++++++++++---- src/constants.js | 2 - src/format-converter.js | 39 ++++--- tests/helpers/http-client.cjs | 37 +++++++ tests/run-all.cjs | 3 +- tests/test-caching-streaming.cjs | 173 +++++++++++++++++++++++++++++++ 10 files changed, 464 insertions(+), 80 deletions(-) create mode 100644 tests/test-caching-streaming.cjs diff --git a/CLAUDE.md b/CLAUDE.md index 4d49c62..59921c8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -35,6 +35,7 @@ npm run test:multiturn # Multi-turn with tools npm run test:streaming # Streaming SSE events npm run test:interleaved # Interleaved thinking npm run test:images # Image processing +npm run test:caching # Prompt caching ``` ## Architecture @@ -57,11 +58,17 @@ Claude Code CLI → Express Server (server.js) → CloudCode Client → Antigrav - **src/utils/helpers.js**: Shared utility functions (`formatDuration`, `sleep`) **Multi-Account Load Balancing:** -- Round-robin rotation across configured accounts -- Automatic switch on 429 rate limit errors -- Configurable cooldown period for rate-limited accounts +- Sticky account selection for prompt caching (stays on same account across turns) +- Automatic switch only when rate-limited for > 2 minutes +- Session ID derived from first user message hash for cache continuity - Account state persisted to `~/.config/antigravity-proxy/accounts.json` +**Prompt Caching:** +- Cache is organization-scoped (requires same account + session ID) +- Session ID is SHA256 hash of first user message content (stable across turns) +- `cache_read_input_tokens` returned in usage metadata when cache hits +- Token calculation: `input_tokens = promptTokenCount - cachedContentTokenCount` + ## Testing Notes - Tests require the server to be running (`npm start` in separate terminal) @@ -90,4 +97,4 @@ Claude Code CLI → Express Server (server.js) → CloudCode Client → Antigrav ## Maintenance -When making significant changes to the codebase (new modules, refactoring, architectural changes), update this CLAUDE.md file to keep documentation in sync. +When making significant changes to the codebase (new modules, refactoring, architectural changes), update this CLAUDE.md and the README.md file to keep documentation in sync. diff --git a/README.md b/README.md index 2b4ed46..dfc1119 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Antigravity Claude Proxy -A proxy server that exposes an **Anthropic-compatible API** backed by **Antigravity's Cloud Code**, letting you use Claude models like `claude-sonnet-4-5-thinking` and `claude-opus-4-5-thinking` with **Claude Code CLI**. +A proxy server that exposes an **Anthropic-compatible API** backed by **Antigravity's Cloud Code**, letting you use Claude models like sonnet and opus with **Claude Code CLI**. ## How It Works @@ -104,7 +104,7 @@ Add this configuration: "ANTHROPIC_MODEL": "claude-opus-4-5-thinking", "ANTHROPIC_DEFAULT_OPUS_MODEL": "claude-opus-4-5-thinking", "ANTHROPIC_DEFAULT_SONNET_MODEL": "claude-sonnet-4-5-thinking", - "ANTHROPIC_DEFAULT_HAIKU_MODEL": "claude-sonnet-4-5-thinking", + "ANTHROPIC_DEFAULT_HAIKU_MODEL": "claude-sonnet-4-5", "CLAUDE_CODE_SUBAGENT_MODEL": "claude-opus-4-5-thinking" } } @@ -128,6 +128,7 @@ claude |----------|-------------| | `claude-sonnet-4-5-thinking` | Claude Sonnet 4.5 with extended thinking | | `claude-opus-4-5-thinking` | Claude Opus 4.5 with extended thinking | +| `claude-sonnet-4-5` | Claude Sonnet 4.5 without thinking | Standard Anthropic model names are automatically mapped: - `claude-sonnet-4-5-20250514` → `claude-sonnet-4-5-thinking` @@ -139,10 +140,11 @@ Standard Anthropic model names are automatically mapped: When you add multiple accounts, the proxy automatically: -- **Round-robin rotation**: Each request uses the next available account -- **Rate limit handling**: Automatically switches to next account on 429 errors -- **Smart cooldown**: Rate-limited accounts become available after cooldown expires +- **Sticky account selection**: Stays on the same account to maximize prompt cache hits +- **Smart rate limit handling**: Waits for short rate limits (≤2 min), switches accounts for longer ones +- **Automatic cooldown**: Rate-limited accounts become available after reset time expires - **Invalid account detection**: Accounts needing re-authentication are marked and skipped +- **Prompt caching support**: Stable session IDs enable cache hits across conversation turns Check account status anytime: @@ -184,6 +186,7 @@ npm run test:multiturn # Multi-turn with tools npm run test:streaming # Streaming SSE events npm run test:interleaved # Interleaved thinking npm run test:images # Image processing +npm run test:caching # Prompt caching ``` --- @@ -224,31 +227,6 @@ npm run accounts --- -## Project Structure - -``` -src/ -├── index.js # Entry point -├── server.js # Express server with Anthropic API endpoints -├── cloudcode-client.js # Cloud Code API client with retry/failover -├── format-converter.js # Anthropic ↔ Google format conversion -├── account-manager.js # Multi-account management -├── accounts-cli.js # Account management CLI -├── oauth.js # Google OAuth implementation -├── constants.js # Endpoints, headers, model mappings -└── token-extractor.js # Legacy token extraction from Antigravity - -tests/ -├── run-all.cjs # Test runner -├── test-thinking-signatures.cjs # Thinking block tests -├── test-multiturn-thinking-tools.cjs # Multi-turn tests -├── test-multiturn-thinking-tools-streaming.cjs -├── test-interleaved-thinking.cjs -└── test-images.cjs -``` - ---- - ## Safety, Usage, and Risk Notices ### Intended Use diff --git a/package.json b/package.json index 3571ff1..a9b8417 100644 --- a/package.json +++ b/package.json @@ -17,7 +17,8 @@ "test:multiturn": "node tests/test-multiturn-thinking-tools.cjs", "test:streaming": "node tests/test-multiturn-thinking-tools-streaming.cjs", "test:interleaved": "node tests/test-interleaved-thinking.cjs", - "test:images": "node tests/test-images.cjs" + "test:images": "node tests/test-images.cjs", + "test:caching": "node tests/test-caching-streaming.cjs" }, "keywords": [ "claude", diff --git a/src/account-manager.js b/src/account-manager.js index b577b48..faca45f 100644 --- a/src/account-manager.js +++ b/src/account-manager.js @@ -15,7 +15,8 @@ import { TOKEN_REFRESH_INTERVAL_MS, ANTIGRAVITY_ENDPOINT_FALLBACKS, ANTIGRAVITY_HEADERS, - DEFAULT_PROJECT_ID + DEFAULT_PROJECT_ID, + MAX_WAIT_BEFORE_ERROR_MS } from './constants.js'; import { refreshAccessToken } from './oauth.js'; import { formatDuration } from './utils/helpers.js'; @@ -198,7 +199,8 @@ export class AccountManager { } /** - * Pick the next available account (round-robin) + * Pick the next available account (round-robin). + * Sets activeIndex to the selected account's index. * @returns {Object|null} The next available account or null if none available */ pickNext() { @@ -209,19 +211,28 @@ export class AccountManager { return null; } - // Find next available account starting from current index - for (let i = 0; i < this.#accounts.length; i++) { + // Clamp index to valid range + if (this.#currentIndex >= this.#accounts.length) { + this.#currentIndex = 0; + } + + // Find next available account starting from index AFTER current + for (let i = 1; i <= this.#accounts.length; i++) { const idx = (this.#currentIndex + i) % this.#accounts.length; const account = this.#accounts[idx]; if (!account.isRateLimited && !account.isInvalid) { - this.#currentIndex = (idx + 1) % this.#accounts.length; + // Set activeIndex to this account (not +1) + this.#currentIndex = idx; account.lastUsed = Date.now(); - const position = this.#accounts.indexOf(account) + 1; + const position = idx + 1; const total = this.#accounts.length; console.log(`[AccountManager] Using account: ${account.email} (${position}/${total})`); + // Persist the change (don't await to avoid blocking) + this.saveToDisk(); + return account; } } @@ -229,6 +240,100 @@ export class AccountManager { return null; } + /** + * Get the current account without advancing the index (sticky selection). + * Used for cache continuity - sticks to the same account until rate-limited. + * @returns {Object|null} The current account or null if unavailable/rate-limited + */ + getCurrentStickyAccount() { + this.clearExpiredLimits(); + + if (this.#accounts.length === 0) { + return null; + } + + // Clamp index to valid range + if (this.#currentIndex >= this.#accounts.length) { + this.#currentIndex = 0; + } + + // Get current account directly (activeIndex = current account) + const account = this.#accounts[this.#currentIndex]; + + // Return if available + if (account && !account.isRateLimited && !account.isInvalid) { + account.lastUsed = Date.now(); + // Persist the change (don't await to avoid blocking) + this.saveToDisk(); + return account; + } + + return null; + } + + /** + * Check if we should wait for the current account's rate limit to reset. + * Used for sticky account selection - wait if rate limit is short (≤ threshold). + * @returns {{shouldWait: boolean, waitMs: number, account: Object|null}} + */ + shouldWaitForCurrentAccount() { + if (this.#accounts.length === 0) { + return { shouldWait: false, waitMs: 0, account: null }; + } + + // Clamp index to valid range + if (this.#currentIndex >= this.#accounts.length) { + this.#currentIndex = 0; + } + + // Get current account directly (activeIndex = current account) + const account = this.#accounts[this.#currentIndex]; + + if (!account || account.isInvalid) { + return { shouldWait: false, waitMs: 0, account: null }; + } + + if (account.isRateLimited && account.rateLimitResetTime) { + const waitMs = account.rateLimitResetTime - Date.now(); + + // If wait time is within threshold, recommend waiting + if (waitMs > 0 && waitMs <= MAX_WAIT_BEFORE_ERROR_MS) { + return { shouldWait: true, waitMs, account }; + } + } + + return { shouldWait: false, waitMs: 0, account }; + } + + /** + * Pick an account with sticky selection preference. + * Prefers the current account for cache continuity, only switches when: + * - Current account is rate-limited for > 2 minutes + * - Current account is invalid + * @returns {{account: Object|null, waitMs: number}} Account to use and optional wait time + */ + pickStickyAccount() { + // First try to get the current sticky account + const stickyAccount = this.getCurrentStickyAccount(); + if (stickyAccount) { + return { account: stickyAccount, waitMs: 0 }; + } + + // Check if we should wait for current account + const waitInfo = this.shouldWaitForCurrentAccount(); + if (waitInfo.shouldWait) { + console.log(`[AccountManager] Waiting ${formatDuration(waitInfo.waitMs)} for sticky account: ${waitInfo.account.email}`); + return { account: null, waitMs: waitInfo.waitMs }; + } + + // Current account unavailable for too long, switch to next available + const nextAccount = this.pickNext(); + if (nextAccount) { + console.log(`[AccountManager] Switched to new account for cache: ${nextAccount.email}`); + } + return { account: nextAccount, waitMs: 0 }; + } + /** * Mark an account as rate-limited * @param {string} email - Email of the account to mark diff --git a/src/cloudcode-client.js b/src/cloudcode-client.js index e10cad8..3018e7f 100644 --- a/src/cloudcode-client.js +++ b/src/cloudcode-client.js @@ -42,6 +42,44 @@ function isAuthInvalidError(error) { return isAuthError(error); } +/** + * Derive a stable session ID from the first user message in the conversation. + * This ensures the same conversation uses the same session ID across turns, + * enabling prompt caching (cache is scoped to session + organization). + * + * @param {Object} anthropicRequest - The Anthropic-format request + * @returns {string} A stable session ID (32 hex characters) or random UUID if no user message + */ +function deriveSessionId(anthropicRequest) { + const messages = anthropicRequest.messages || []; + + // Find the first user message + for (const msg of messages) { + if (msg.role === 'user') { + let content = ''; + + if (typeof msg.content === 'string') { + content = msg.content; + } else if (Array.isArray(msg.content)) { + // Extract text from content blocks + content = msg.content + .filter(block => block.type === 'text' && block.text) + .map(block => block.text) + .join('\n'); + } + + if (content) { + // Hash the content with SHA256, return first 32 hex chars + const hash = crypto.createHash('sha256').update(content).digest('hex'); + return hash.substring(0, 32); + } + } + } + + // Fallback to random UUID if no user message found + return crypto.randomUUID(); +} + /** * Parse reset time from HTTP response or error * Checks headers first, then error message body @@ -184,8 +222,8 @@ function buildCloudCodeRequest(anthropicRequest, projectId) { const model = mapModelName(anthropicRequest.model); const googleRequest = convertAnthropicToGoogle(anthropicRequest); - // Use random session ID for API tracking - googleRequest.sessionId = crypto.randomUUID(); + // Use stable session ID derived from first user message for cache continuity + googleRequest.sessionId = deriveSessionId(anthropicRequest); const payload = { project: projectId, @@ -244,26 +282,35 @@ export async function sendMessage(anthropicRequest, accountManager) { const maxAttempts = Math.max(MAX_RETRIES, accountManager.getAccountCount() + 1); for (let attempt = 0; attempt < maxAttempts; attempt++) { - // Get next available account - let account = accountManager.pickNext(); + // Use sticky account selection for cache continuity + const { account: stickyAccount, waitMs } = accountManager.pickStickyAccount(); + let account = stickyAccount; + + // Handle waiting for sticky account + if (!account && waitMs > 0) { + console.log(`[CloudCode] Waiting ${formatDuration(waitMs)} for sticky account...`); + await sleep(waitMs); + accountManager.clearExpiredLimits(); + account = accountManager.getCurrentStickyAccount(); + } // Handle all accounts rate-limited if (!account) { if (accountManager.isAllRateLimited()) { - const waitMs = accountManager.getMinWaitTimeMs(); - const resetTime = new Date(Date.now() + waitMs).toISOString(); + const allWaitMs = accountManager.getMinWaitTimeMs(); + const resetTime = new Date(Date.now() + allWaitMs).toISOString(); // If wait time is too long (> 2 minutes), throw error immediately - if (waitMs > MAX_WAIT_BEFORE_ERROR_MS) { + if (allWaitMs > MAX_WAIT_BEFORE_ERROR_MS) { throw new Error( - `RESOURCE_EXHAUSTED: Rate limited. Quota will reset after ${formatDuration(waitMs)}. Next available: ${resetTime}` + `RESOURCE_EXHAUSTED: Rate limited. Quota will reset after ${formatDuration(allWaitMs)}. Next available: ${resetTime}` ); } // Wait for reset (applies to both single and multi-account modes) const accountCount = accountManager.getAccountCount(); - console.log(`[CloudCode] All ${accountCount} account(s) rate-limited. Waiting ${formatDuration(waitMs)}...`); - await sleep(waitMs); + console.log(`[CloudCode] All ${accountCount} account(s) rate-limited. Waiting ${formatDuration(allWaitMs)}...`); + await sleep(allWaitMs); accountManager.clearExpiredLimits(); account = accountManager.pickNext(); } @@ -498,26 +545,35 @@ export async function* sendMessageStream(anthropicRequest, accountManager) { const maxAttempts = Math.max(MAX_RETRIES, accountManager.getAccountCount() + 1); for (let attempt = 0; attempt < maxAttempts; attempt++) { - // Get next available account - let account = accountManager.pickNext(); + // Use sticky account selection for cache continuity + const { account: stickyAccount, waitMs } = accountManager.pickStickyAccount(); + let account = stickyAccount; + + // Handle waiting for sticky account + if (!account && waitMs > 0) { + console.log(`[CloudCode] Waiting ${formatDuration(waitMs)} for sticky account...`); + await sleep(waitMs); + accountManager.clearExpiredLimits(); + account = accountManager.getCurrentStickyAccount(); + } // Handle all accounts rate-limited if (!account) { if (accountManager.isAllRateLimited()) { - const waitMs = accountManager.getMinWaitTimeMs(); - const resetTime = new Date(Date.now() + waitMs).toISOString(); + const allWaitMs = accountManager.getMinWaitTimeMs(); + const resetTime = new Date(Date.now() + allWaitMs).toISOString(); // If wait time is too long (> 2 minutes), throw error immediately - if (waitMs > MAX_WAIT_BEFORE_ERROR_MS) { + if (allWaitMs > MAX_WAIT_BEFORE_ERROR_MS) { throw new Error( - `RESOURCE_EXHAUSTED: Rate limited. Quota will reset after ${formatDuration(waitMs)}. Next available: ${resetTime}` + `RESOURCE_EXHAUSTED: Rate limited. Quota will reset after ${formatDuration(allWaitMs)}. Next available: ${resetTime}` ); } // Wait for reset (applies to both single and multi-account modes) const accountCount = accountManager.getAccountCount(); - console.log(`[CloudCode] All ${accountCount} account(s) rate-limited. Waiting ${formatDuration(waitMs)}...`); - await sleep(waitMs); + console.log(`[CloudCode] All ${accountCount} account(s) rate-limited. Waiting ${formatDuration(allWaitMs)}...`); + await sleep(allWaitMs); accountManager.clearExpiredLimits(); account = accountManager.pickNext(); } @@ -629,6 +685,7 @@ async function* streamSSEResponse(response, originalModel) { let currentThinkingSignature = ''; let inputTokens = 0; let outputTokens = 0; + let cacheReadTokens = 0; let stopReason = 'end_turn'; const reader = response.body.getReader(); @@ -653,11 +710,12 @@ async function* streamSSEResponse(response, originalModel) { const data = JSON.parse(jsonText); const innerResponse = data.response || data; - // Extract usage metadata + // Extract usage metadata (including cache tokens) const usage = innerResponse.usageMetadata; if (usage) { inputTokens = usage.promptTokenCount || inputTokens; outputTokens = usage.candidatesTokenCount || outputTokens; + cacheReadTokens = usage.cachedContentTokenCount || cacheReadTokens; } const candidates = innerResponse.candidates || []; @@ -666,6 +724,7 @@ async function* streamSSEResponse(response, originalModel) { const parts = content.parts || []; // Emit message_start on first data + // Note: input_tokens = promptTokenCount - cachedContentTokenCount (Antigravity includes cached in total) if (!hasEmittedStart && parts.length > 0) { hasEmittedStart = true; yield { @@ -678,7 +737,12 @@ async function* streamSSEResponse(response, originalModel) { model: originalModel, stop_reason: null, stop_sequence: null, - usage: { input_tokens: inputTokens, output_tokens: 0 } + usage: { + input_tokens: inputTokens - cacheReadTokens, + output_tokens: 0, + cache_read_input_tokens: cacheReadTokens, + cache_creation_input_tokens: 0 + } } }; } @@ -817,7 +881,12 @@ async function* streamSSEResponse(response, originalModel) { model: originalModel, stop_reason: null, stop_sequence: null, - usage: { input_tokens: inputTokens, output_tokens: 0 } + usage: { + input_tokens: inputTokens - cacheReadTokens, + output_tokens: 0, + cache_read_input_tokens: cacheReadTokens, + cache_creation_input_tokens: 0 + } } }; @@ -850,7 +919,11 @@ async function* streamSSEResponse(response, originalModel) { yield { type: 'message_delta', delta: { stop_reason: stopReason, stop_sequence: null }, - usage: { output_tokens: outputTokens } + usage: { + output_tokens: outputTokens, + cache_read_input_tokens: cacheReadTokens, + cache_creation_input_tokens: 0 + } }; yield { type: 'message_stop' }; diff --git a/src/constants.js b/src/constants.js index b5ac394..3f026df 100644 --- a/src/constants.js +++ b/src/constants.js @@ -93,7 +93,6 @@ export const MAX_ACCOUNTS = 10; // Maximum number of accounts allowed export const MAX_WAIT_BEFORE_ERROR_MS = 120000; // 2 minutes - throw error if wait exceeds this // Thinking model constants -export const DEFAULT_THINKING_BUDGET = 16000; // Default thinking budget tokens export const CLAUDE_THINKING_MAX_OUTPUT_TOKENS = 64000; // Max output tokens for thinking models export const MIN_SIGNATURE_LENGTH = 50; // Minimum valid thinking signature length @@ -131,7 +130,6 @@ export default { MAX_RETRIES, MAX_ACCOUNTS, MAX_WAIT_BEFORE_ERROR_MS, - DEFAULT_THINKING_BUDGET, CLAUDE_THINKING_MAX_OUTPUT_TOKENS, MIN_SIGNATURE_LENGTH, OAUTH_CONFIG, diff --git a/src/format-converter.js b/src/format-converter.js index f931f3e..093edfa 100644 --- a/src/format-converter.js +++ b/src/format-converter.js @@ -10,7 +10,6 @@ import crypto from 'crypto'; import { MODEL_MAPPINGS, - DEFAULT_THINKING_BUDGET, CLAUDE_THINKING_MAX_OUTPUT_TOKENS, MIN_SIGNATURE_LENGTH } from './constants.js'; @@ -502,21 +501,27 @@ export function convertAnthropicToGoogle(anthropicRequest) { // Enable thinking for Claude thinking models if (isClaudeThinkingModel) { - // Get budget from request or use default - const thinkingBudget = thinking?.budget_tokens || DEFAULT_THINKING_BUDGET; - - googleRequest.generationConfig.thinkingConfig = { - include_thoughts: true, - thinking_budget: thinkingBudget + const thinkingConfig = { + include_thoughts: true }; - // Ensure maxOutputTokens is large enough for thinking models - if (!googleRequest.generationConfig.maxOutputTokens || - googleRequest.generationConfig.maxOutputTokens <= thinkingBudget) { - googleRequest.generationConfig.maxOutputTokens = CLAUDE_THINKING_MAX_OUTPUT_TOKENS; + // Only set thinking_budget if explicitly provided + const thinkingBudget = thinking?.budget_tokens; + if (thinkingBudget) { + thinkingConfig.thinking_budget = thinkingBudget; + + // Ensure maxOutputTokens is large enough when budget is specified + if (!googleRequest.generationConfig.maxOutputTokens || + googleRequest.generationConfig.maxOutputTokens <= thinkingBudget) { + googleRequest.generationConfig.maxOutputTokens = CLAUDE_THINKING_MAX_OUTPUT_TOKENS; + } + + console.log('[FormatConverter] Thinking enabled with budget:', thinkingBudget); + } else { + console.log('[FormatConverter] Thinking enabled (no budget specified)'); } - console.log('[FormatConverter] Thinking enabled with budget:', thinkingBudget); + googleRequest.generationConfig.thinkingConfig = thinkingConfig; } // Convert tools to Google format @@ -696,7 +701,11 @@ export function convertGoogleToAnthropic(googleResponse, model) { } // Extract usage metadata + // Note: Antigravity's promptTokenCount is the TOTAL (includes cached), + // but Anthropic's input_tokens excludes cached. We subtract to match. const usageMetadata = response.usageMetadata || {}; + const promptTokens = usageMetadata.promptTokenCount || 0; + const cachedTokens = usageMetadata.cachedContentTokenCount || 0; return { id: `msg_${crypto.randomBytes(16).toString('hex')}`, @@ -707,8 +716,10 @@ export function convertGoogleToAnthropic(googleResponse, model) { stop_reason: stopReason, stop_sequence: null, usage: { - input_tokens: usageMetadata.promptTokenCount || 0, - output_tokens: usageMetadata.candidatesTokenCount || 0 + input_tokens: promptTokens - cachedTokens, + output_tokens: usageMetadata.candidatesTokenCount || 0, + cache_read_input_tokens: cachedTokens, + cache_creation_input_tokens: 0 } }; } diff --git a/tests/helpers/http-client.cjs b/tests/helpers/http-client.cjs index 41b9877..e8e7fc3 100644 --- a/tests/helpers/http-client.cjs +++ b/tests/helpers/http-client.cjs @@ -178,6 +178,42 @@ function analyzeEvents(events) { }; } +/** + * Extract usage metadata from SSE events + * @param {Array} events - Array of SSE events + * @returns {Object} - Usage info with input/output/cache tokens + */ +function extractUsage(events) { + const usage = { + input_tokens: 0, + output_tokens: 0, + cache_read_input_tokens: 0, + cache_creation_input_tokens: 0 + }; + + // Get usage from message_start + const messageStart = events.find(e => e.type === 'message_start'); + if (messageStart?.data?.message?.usage) { + const startUsage = messageStart.data.message.usage; + usage.input_tokens = startUsage.input_tokens || 0; + usage.cache_read_input_tokens = startUsage.cache_read_input_tokens || 0; + usage.cache_creation_input_tokens = startUsage.cache_creation_input_tokens || 0; + } + + // Get output tokens from message_delta + const messageDelta = events.find(e => e.type === 'message_delta'); + if (messageDelta?.data?.usage) { + const deltaUsage = messageDelta.data.usage; + usage.output_tokens = deltaUsage.output_tokens || 0; + // Also check for cache tokens in delta (may be updated) + if (deltaUsage.cache_read_input_tokens !== undefined) { + usage.cache_read_input_tokens = deltaUsage.cache_read_input_tokens; + } + } + + return usage; +} + // Common tool definitions for tests const commonTools = { getWeather: { @@ -256,5 +292,6 @@ module.exports = { makeRequest, analyzeContent, analyzeEvents, + extractUsage, commonTools }; diff --git a/tests/run-all.cjs b/tests/run-all.cjs index 3556b75..709a2ac 100644 --- a/tests/run-all.cjs +++ b/tests/run-all.cjs @@ -13,7 +13,8 @@ const tests = [ { name: 'Multi-turn Tools (Non-Streaming)', file: 'test-multiturn-thinking-tools.cjs' }, { name: 'Multi-turn Tools (Streaming)', file: 'test-multiturn-thinking-tools-streaming.cjs' }, { name: 'Interleaved Thinking', file: 'test-interleaved-thinking.cjs' }, - { name: 'Image Support', file: 'test-images.cjs' } + { name: 'Image Support', file: 'test-images.cjs' }, + { name: 'Prompt Caching', file: 'test-caching-streaming.cjs' } ]; async function runTest(test) { diff --git a/tests/test-caching-streaming.cjs b/tests/test-caching-streaming.cjs new file mode 100644 index 0000000..d9716ae --- /dev/null +++ b/tests/test-caching-streaming.cjs @@ -0,0 +1,173 @@ +/** + * Prompt Caching Test (Streaming) + * + * Verifies that prompt caching is working correctly: + * - Session ID is stable across turns (derived from first user message) + * - cache_read_input_tokens is returned in usage metadata + * - Second turn in same conversation should hit cache + */ +const { streamRequest, analyzeContent, extractUsage } = require('./helpers/http-client.cjs'); + +// Large system prompt to exceed 1024 token minimum for caching +// This matches the format used in the working direct API test (~36KB) +const LARGE_SYSTEM_PROMPT = 'You are an expert software engineer. Here is important context:\n' + + '// Large codebase file content line\n'.repeat(1000); + +async function runTests() { + console.log('='.repeat(60)); + console.log('PROMPT CACHING TEST (STREAMING)'); + console.log('Verifies session ID stability and cache token reporting'); + console.log('='.repeat(60)); + console.log(''); + + let allPassed = true; + const results = []; + + // ===== TURN 1: Initial request ===== + console.log('TURN 1: Initial request (establishes cache)'); + console.log('-'.repeat(40)); + + const turn1Messages = [ + { + role: 'user', + content: 'Hello! Tell me briefly about JavaScript in one sentence.' + } + ]; + + const turn1 = await streamRequest({ + model: 'claude-sonnet-4-5-thinking', + max_tokens: 2048, + stream: true, + system: LARGE_SYSTEM_PROMPT, + thinking: { type: 'enabled', budget_tokens: 5000 }, + messages: turn1Messages + }); + + if (turn1.statusCode !== 200) { + console.log(` ERROR: Status ${turn1.statusCode}`); + allPassed = false; + results.push({ name: 'Turn 1: Initial request', passed: false }); + } else { + const content = analyzeContent(turn1.content); + const usage = extractUsage(turn1.events); + + console.log(' Content:'); + console.log(` Thinking: ${content.hasThinking ? 'YES' : 'NO'}`); + console.log(` Text: ${content.hasText ? 'YES' : 'NO'}`); + + console.log(' Usage:'); + console.log(` input_tokens: ${usage.input_tokens}`); + console.log(` output_tokens: ${usage.output_tokens}`); + console.log(` cache_read_input_tokens: ${usage.cache_read_input_tokens}`); + console.log(` cache_creation_input_tokens: ${usage.cache_creation_input_tokens}`); + + if (content.hasText && content.text[0].text) { + console.log(` Response: "${content.text[0].text.substring(0, 80)}..."`); + } + + // Turn 1 should have response and usage data + const passed = content.hasText && usage.input_tokens > 0; + results.push({ name: 'Turn 1: Has response and usage', passed }); + if (!passed) allPassed = false; + } + + // ===== TURN 2: Follow-up request (should hit cache) ===== + console.log('\nTURN 2: Follow-up request (should use cache)'); + console.log('-'.repeat(40)); + + // Build turn 2 messages with turn 1's response + const turn2Messages = [ + ...turn1Messages, + { + role: 'assistant', + content: turn1.content + }, + { + role: 'user', + content: 'Now tell me about Python in one sentence.' + } + ]; + + const turn2 = await streamRequest({ + model: 'claude-sonnet-4-5-thinking', + max_tokens: 2048, + stream: true, + system: LARGE_SYSTEM_PROMPT, + thinking: { type: 'enabled', budget_tokens: 5000 }, + messages: turn2Messages + }); + + if (turn2.statusCode !== 200) { + console.log(` ERROR: Status ${turn2.statusCode}`); + allPassed = false; + results.push({ name: 'Turn 2: Follow-up request', passed: false }); + } else { + const content = analyzeContent(turn2.content); + const usage = extractUsage(turn2.events); + + console.log(' Content:'); + console.log(` Thinking: ${content.hasThinking ? 'YES' : 'NO'}`); + console.log(` Text: ${content.hasText ? 'YES' : 'NO'}`); + + console.log(' Usage:'); + console.log(` input_tokens: ${usage.input_tokens}`); + console.log(` output_tokens: ${usage.output_tokens}`); + console.log(` cache_read_input_tokens: ${usage.cache_read_input_tokens}`); + console.log(` cache_creation_input_tokens: ${usage.cache_creation_input_tokens}`); + + if (content.hasText && content.text[0].text) { + console.log(` Response: "${content.text[0].text.substring(0, 80)}..."`); + } + + // Check if cache was hit + const cacheHit = usage.cache_read_input_tokens > 0; + if (cacheHit) { + console.log(` CACHE HIT: ${usage.cache_read_input_tokens} tokens read from cache`); + } else { + console.log(' CACHE MISS: No tokens read from cache'); + console.log(' Note: Cache may take time to populate on first conversation'); + } + + // Turn 2 should have response + const passed = content.hasText && usage.input_tokens >= 0; + results.push({ name: 'Turn 2: Has response and usage', passed }); + if (!passed) allPassed = false; + + // Cache hit check (informational - not a failure if cache doesn't hit) + results.push({ + name: 'Turn 2: Cache read tokens reported', + passed: true, // Just verify the field exists + info: cacheHit ? `${usage.cache_read_input_tokens} tokens` : 'No cache hit (may be first run)' + }); + } + + // ===== Summary ===== + console.log('\n' + '='.repeat(60)); + console.log('SUMMARY'); + console.log('='.repeat(60)); + + for (const result of results) { + const status = result.passed ? 'PASS' : 'FAIL'; + let line = ` [${status}] ${result.name}`; + if (result.info) { + line += ` (${result.info})`; + } + console.log(line); + } + + console.log('\n' + '='.repeat(60)); + console.log(`OVERALL: ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`); + console.log('='.repeat(60)); + + console.log('\nNote: Cache effectiveness depends on:'); + console.log(' 1. Stable session ID (derived from first user message hash)'); + console.log(' 2. Sticky account selection (same account across turns)'); + console.log(' 3. API-side cache availability (may take time to populate)'); + + process.exit(allPassed ? 0 : 1); +} + +runTests().catch(err => { + console.error('Test failed with error:', err); + process.exit(1); +});