feat: add prompt caching, sticky account selection, and non-thinking model

- Implement sticky account selection for prompt cache continuity - Derive stable session ID from first user message (SHA256 hash) - Return cache_read_input_tokens in usage metadata - Add claude-sonnet-4-5 model without thinking - Remove DEFAULT_THINKING_BUDGET (let API use its default) - Add prompt caching test - Update README and CLAUDE.md documentation 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-25 13:26:48 +05:30
parent 943a4dcb20
commit 01cda835d9
10 changed files with 464 additions and 80 deletions
--- a/src/account-manager.js
+++ b/src/account-manager.js
@@ -15,7 +15,8 @@ import {
    TOKEN_REFRESH_INTERVAL_MS,
    ANTIGRAVITY_ENDPOINT_FALLBACKS,
    ANTIGRAVITY_HEADERS,
-    DEFAULT_PROJECT_ID
+    DEFAULT_PROJECT_ID,
+    MAX_WAIT_BEFORE_ERROR_MS
 } from './constants.js';
 import { refreshAccessToken } from './oauth.js';
 import { formatDuration } from './utils/helpers.js';
@@ -198,7 +199,8 @@ export class AccountManager {
    }

    /**
-     * Pick the next available account (round-robin)
+     * Pick the next available account (round-robin).
+     * Sets activeIndex to the selected account's index.
     * @returns {Object|null} The next available account or null if none available
     */
    pickNext() {
@@ -209,19 +211,28 @@ export class AccountManager {
            return null;
        }

-        // Find next available account starting from current index
-        for (let i = 0; i < this.#accounts.length; i++) {
+        // Clamp index to valid range
+        if (this.#currentIndex >= this.#accounts.length) {
+            this.#currentIndex = 0;
+        }
+
+        // Find next available account starting from index AFTER current
+        for (let i = 1; i <= this.#accounts.length; i++) {
            const idx = (this.#currentIndex + i) % this.#accounts.length;
            const account = this.#accounts[idx];

            if (!account.isRateLimited && !account.isInvalid) {
-                this.#currentIndex = (idx + 1) % this.#accounts.length;
+                // Set activeIndex to this account (not +1)
+                this.#currentIndex = idx;
                account.lastUsed = Date.now();

-                const position = this.#accounts.indexOf(account) + 1;
+                const position = idx + 1;
                const total = this.#accounts.length;
                console.log(`[AccountManager] Using account: ${account.email} (${position}/${total})`);

+                // Persist the change (don't await to avoid blocking)
+                this.saveToDisk();
+
                return account;
            }
        }
@@ -229,6 +240,100 @@ export class AccountManager {
        return null;
    }

+    /**
+     * Get the current account without advancing the index (sticky selection).
+     * Used for cache continuity - sticks to the same account until rate-limited.
+     * @returns {Object|null} The current account or null if unavailable/rate-limited
+     */
+    getCurrentStickyAccount() {
+        this.clearExpiredLimits();
+
+        if (this.#accounts.length === 0) {
+            return null;
+        }
+
+        // Clamp index to valid range
+        if (this.#currentIndex >= this.#accounts.length) {
+            this.#currentIndex = 0;
+        }
+
+        // Get current account directly (activeIndex = current account)
+        const account = this.#accounts[this.#currentIndex];
+
+        // Return if available
+        if (account && !account.isRateLimited && !account.isInvalid) {
+            account.lastUsed = Date.now();
+            // Persist the change (don't await to avoid blocking)
+            this.saveToDisk();
+            return account;
+        }
+
+        return null;
+    }
+
+    /**
+     * Check if we should wait for the current account's rate limit to reset.
+     * Used for sticky account selection - wait if rate limit is short (≤ threshold).
+     * @returns {{shouldWait: boolean, waitMs: number, account: Object|null}}
+     */
+    shouldWaitForCurrentAccount() {
+        if (this.#accounts.length === 0) {
+            return { shouldWait: false, waitMs: 0, account: null };
+        }
+
+        // Clamp index to valid range
+        if (this.#currentIndex >= this.#accounts.length) {
+            this.#currentIndex = 0;
+        }
+
+        // Get current account directly (activeIndex = current account)
+        const account = this.#accounts[this.#currentIndex];
+
+        if (!account || account.isInvalid) {
+            return { shouldWait: false, waitMs: 0, account: null };
+        }
+
+        if (account.isRateLimited && account.rateLimitResetTime) {
+            const waitMs = account.rateLimitResetTime - Date.now();
+
+            // If wait time is within threshold, recommend waiting
+            if (waitMs > 0 && waitMs <= MAX_WAIT_BEFORE_ERROR_MS) {
+                return { shouldWait: true, waitMs, account };
+            }
+        }
+
+        return { shouldWait: false, waitMs: 0, account };
+    }
+
+    /**
+     * Pick an account with sticky selection preference.
+     * Prefers the current account for cache continuity, only switches when:
+     * - Current account is rate-limited for > 2 minutes
+     * - Current account is invalid
+     * @returns {{account: Object|null, waitMs: number}} Account to use and optional wait time
+     */
+    pickStickyAccount() {
+        // First try to get the current sticky account
+        const stickyAccount = this.getCurrentStickyAccount();
+        if (stickyAccount) {
+            return { account: stickyAccount, waitMs: 0 };
+        }
+
+        // Check if we should wait for current account
+        const waitInfo = this.shouldWaitForCurrentAccount();
+        if (waitInfo.shouldWait) {
+            console.log(`[AccountManager] Waiting ${formatDuration(waitInfo.waitMs)} for sticky account: ${waitInfo.account.email}`);
+            return { account: null, waitMs: waitInfo.waitMs };
+        }
+
+        // Current account unavailable for too long, switch to next available
+        const nextAccount = this.pickNext();
+        if (nextAccount) {
+            console.log(`[AccountManager] Switched to new account for cache: ${nextAccount.email}`);
+        }
+        return { account: nextAccount, waitMs: 0 };
+    }
+
    /**
     * Mark an account as rate-limited
     * @param {string} email - Email of the account to mark
--- a/src/cloudcode-client.js
+++ b/src/cloudcode-client.js
@@ -42,6 +42,44 @@ function isAuthInvalidError(error) {
    return isAuthError(error);
 }

+/**
+ * Derive a stable session ID from the first user message in the conversation.
+ * This ensures the same conversation uses the same session ID across turns,
+ * enabling prompt caching (cache is scoped to session + organization).
+ *
+ * @param {Object} anthropicRequest - The Anthropic-format request
+ * @returns {string} A stable session ID (32 hex characters) or random UUID if no user message
+ */
+function deriveSessionId(anthropicRequest) {
+    const messages = anthropicRequest.messages || [];
+
+    // Find the first user message
+    for (const msg of messages) {
+        if (msg.role === 'user') {
+            let content = '';
+
+            if (typeof msg.content === 'string') {
+                content = msg.content;
+            } else if (Array.isArray(msg.content)) {
+                // Extract text from content blocks
+                content = msg.content
+                    .filter(block => block.type === 'text' && block.text)
+                    .map(block => block.text)
+                    .join('\n');
+            }
+
+            if (content) {
+                // Hash the content with SHA256, return first 32 hex chars
+                const hash = crypto.createHash('sha256').update(content).digest('hex');
+                return hash.substring(0, 32);
+            }
+        }
+    }
+
+    // Fallback to random UUID if no user message found
+    return crypto.randomUUID();
+}
+
 /**
 * Parse reset time from HTTP response or error
 * Checks headers first, then error message body
@@ -184,8 +222,8 @@ function buildCloudCodeRequest(anthropicRequest, projectId) {
    const model = mapModelName(anthropicRequest.model);
    const googleRequest = convertAnthropicToGoogle(anthropicRequest);

-    // Use random session ID for API tracking
-    googleRequest.sessionId = crypto.randomUUID();
+    // Use stable session ID derived from first user message for cache continuity
+    googleRequest.sessionId = deriveSessionId(anthropicRequest);

    const payload = {
        project: projectId,
@@ -244,26 +282,35 @@ export async function sendMessage(anthropicRequest, accountManager) {
    const maxAttempts = Math.max(MAX_RETRIES, accountManager.getAccountCount() + 1);

    for (let attempt = 0; attempt < maxAttempts; attempt++) {
-        // Get next available account
-        let account = accountManager.pickNext();
+        // Use sticky account selection for cache continuity
+        const { account: stickyAccount, waitMs } = accountManager.pickStickyAccount();
+        let account = stickyAccount;
+
+        // Handle waiting for sticky account
+        if (!account && waitMs > 0) {
+            console.log(`[CloudCode] Waiting ${formatDuration(waitMs)} for sticky account...`);
+            await sleep(waitMs);
+            accountManager.clearExpiredLimits();
+            account = accountManager.getCurrentStickyAccount();
+        }

        // Handle all accounts rate-limited
        if (!account) {
            if (accountManager.isAllRateLimited()) {
-                const waitMs = accountManager.getMinWaitTimeMs();
-                const resetTime = new Date(Date.now() + waitMs).toISOString();
+                const allWaitMs = accountManager.getMinWaitTimeMs();
+                const resetTime = new Date(Date.now() + allWaitMs).toISOString();

                // If wait time is too long (> 2 minutes), throw error immediately
-                if (waitMs > MAX_WAIT_BEFORE_ERROR_MS) {
+                if (allWaitMs > MAX_WAIT_BEFORE_ERROR_MS) {
                    throw new Error(
-                        `RESOURCE_EXHAUSTED: Rate limited. Quota will reset after ${formatDuration(waitMs)}. Next available: ${resetTime}`
+                        `RESOURCE_EXHAUSTED: Rate limited. Quota will reset after ${formatDuration(allWaitMs)}. Next available: ${resetTime}`
                    );
                }

                // Wait for reset (applies to both single and multi-account modes)
                const accountCount = accountManager.getAccountCount();
-                console.log(`[CloudCode] All ${accountCount} account(s) rate-limited. Waiting ${formatDuration(waitMs)}...`);
-                await sleep(waitMs);
+                console.log(`[CloudCode] All ${accountCount} account(s) rate-limited. Waiting ${formatDuration(allWaitMs)}...`);
+                await sleep(allWaitMs);
                accountManager.clearExpiredLimits();
                account = accountManager.pickNext();
            }
@@ -498,26 +545,35 @@ export async function* sendMessageStream(anthropicRequest, accountManager) {
    const maxAttempts = Math.max(MAX_RETRIES, accountManager.getAccountCount() + 1);

    for (let attempt = 0; attempt < maxAttempts; attempt++) {
-        // Get next available account
-        let account = accountManager.pickNext();
+        // Use sticky account selection for cache continuity
+        const { account: stickyAccount, waitMs } = accountManager.pickStickyAccount();
+        let account = stickyAccount;
+
+        // Handle waiting for sticky account
+        if (!account && waitMs > 0) {
+            console.log(`[CloudCode] Waiting ${formatDuration(waitMs)} for sticky account...`);
+            await sleep(waitMs);
+            accountManager.clearExpiredLimits();
+            account = accountManager.getCurrentStickyAccount();
+        }

        // Handle all accounts rate-limited
        if (!account) {
            if (accountManager.isAllRateLimited()) {
-                const waitMs = accountManager.getMinWaitTimeMs();
-                const resetTime = new Date(Date.now() + waitMs).toISOString();
+                const allWaitMs = accountManager.getMinWaitTimeMs();
+                const resetTime = new Date(Date.now() + allWaitMs).toISOString();

                // If wait time is too long (> 2 minutes), throw error immediately
-                if (waitMs > MAX_WAIT_BEFORE_ERROR_MS) {
+                if (allWaitMs > MAX_WAIT_BEFORE_ERROR_MS) {
                    throw new Error(
-                        `RESOURCE_EXHAUSTED: Rate limited. Quota will reset after ${formatDuration(waitMs)}. Next available: ${resetTime}`
+                        `RESOURCE_EXHAUSTED: Rate limited. Quota will reset after ${formatDuration(allWaitMs)}. Next available: ${resetTime}`
                    );
                }

                // Wait for reset (applies to both single and multi-account modes)
                const accountCount = accountManager.getAccountCount();
-                console.log(`[CloudCode] All ${accountCount} account(s) rate-limited. Waiting ${formatDuration(waitMs)}...`);
-                await sleep(waitMs);
+                console.log(`[CloudCode] All ${accountCount} account(s) rate-limited. Waiting ${formatDuration(allWaitMs)}...`);
+                await sleep(allWaitMs);
                accountManager.clearExpiredLimits();
                account = accountManager.pickNext();
            }
@@ -629,6 +685,7 @@ async function* streamSSEResponse(response, originalModel) {
    let currentThinkingSignature = '';
    let inputTokens = 0;
    let outputTokens = 0;
+    let cacheReadTokens = 0;
    let stopReason = 'end_turn';

    const reader = response.body.getReader();
@@ -653,11 +710,12 @@ async function* streamSSEResponse(response, originalModel) {
                const data = JSON.parse(jsonText);
                const innerResponse = data.response || data;

-                // Extract usage metadata
+                // Extract usage metadata (including cache tokens)
                const usage = innerResponse.usageMetadata;
                if (usage) {
                    inputTokens = usage.promptTokenCount || inputTokens;
                    outputTokens = usage.candidatesTokenCount || outputTokens;
+                    cacheReadTokens = usage.cachedContentTokenCount || cacheReadTokens;
                }

                const candidates = innerResponse.candidates || [];
@@ -666,6 +724,7 @@ async function* streamSSEResponse(response, originalModel) {
                const parts = content.parts || [];

                // Emit message_start on first data
+                // Note: input_tokens = promptTokenCount - cachedContentTokenCount (Antigravity includes cached in total)
                if (!hasEmittedStart && parts.length > 0) {
                    hasEmittedStart = true;
                    yield {
@@ -678,7 +737,12 @@ async function* streamSSEResponse(response, originalModel) {
                            model: originalModel,
                            stop_reason: null,
                            stop_sequence: null,
-                            usage: { input_tokens: inputTokens, output_tokens: 0 }
+                            usage: {
+                                input_tokens: inputTokens - cacheReadTokens,
+                                output_tokens: 0,
+                                cache_read_input_tokens: cacheReadTokens,
+                                cache_creation_input_tokens: 0
+                            }
                        }
                    };
                }
@@ -817,7 +881,12 @@ async function* streamSSEResponse(response, originalModel) {
                model: originalModel,
                stop_reason: null,
                stop_sequence: null,
-                usage: { input_tokens: inputTokens, output_tokens: 0 }
+                usage: {
+                    input_tokens: inputTokens - cacheReadTokens,
+                    output_tokens: 0,
+                    cache_read_input_tokens: cacheReadTokens,
+                    cache_creation_input_tokens: 0
+                }
            }
        };

@@ -850,7 +919,11 @@ async function* streamSSEResponse(response, originalModel) {
    yield {
        type: 'message_delta',
        delta: { stop_reason: stopReason, stop_sequence: null },
-        usage: { output_tokens: outputTokens }
+        usage: {
+            output_tokens: outputTokens,
+            cache_read_input_tokens: cacheReadTokens,
+            cache_creation_input_tokens: 0
+        }
    };

    yield { type: 'message_stop' };
--- a/src/constants.js
+++ b/src/constants.js
@@ -93,7 +93,6 @@ export const MAX_ACCOUNTS = 10; // Maximum number of accounts allowed
 export const MAX_WAIT_BEFORE_ERROR_MS = 120000; // 2 minutes - throw error if wait exceeds this

 // Thinking model constants
-export const DEFAULT_THINKING_BUDGET = 16000; // Default thinking budget tokens
 export const CLAUDE_THINKING_MAX_OUTPUT_TOKENS = 64000; // Max output tokens for thinking models
 export const MIN_SIGNATURE_LENGTH = 50; // Minimum valid thinking signature length

@@ -131,7 +130,6 @@ export default {
    MAX_RETRIES,
    MAX_ACCOUNTS,
    MAX_WAIT_BEFORE_ERROR_MS,
-    DEFAULT_THINKING_BUDGET,
    CLAUDE_THINKING_MAX_OUTPUT_TOKENS,
    MIN_SIGNATURE_LENGTH,
    OAUTH_CONFIG,
--- a/src/format-converter.js
+++ b/src/format-converter.js
@@ -10,7 +10,6 @@
 import crypto from 'crypto';
 import {
    MODEL_MAPPINGS,
-    DEFAULT_THINKING_BUDGET,
    CLAUDE_THINKING_MAX_OUTPUT_TOKENS,
    MIN_SIGNATURE_LENGTH
 } from './constants.js';
@@ -502,21 +501,27 @@ export function convertAnthropicToGoogle(anthropicRequest) {

    // Enable thinking for Claude thinking models
    if (isClaudeThinkingModel) {
-        // Get budget from request or use default
-        const thinkingBudget = thinking?.budget_tokens || DEFAULT_THINKING_BUDGET;
-
-        googleRequest.generationConfig.thinkingConfig = {
-            include_thoughts: true,
-            thinking_budget: thinkingBudget
+        const thinkingConfig = {
+            include_thoughts: true
        };

-        // Ensure maxOutputTokens is large enough for thinking models
-        if (!googleRequest.generationConfig.maxOutputTokens ||
-            googleRequest.generationConfig.maxOutputTokens <= thinkingBudget) {
-            googleRequest.generationConfig.maxOutputTokens = CLAUDE_THINKING_MAX_OUTPUT_TOKENS;
+        // Only set thinking_budget if explicitly provided
+        const thinkingBudget = thinking?.budget_tokens;
+        if (thinkingBudget) {
+            thinkingConfig.thinking_budget = thinkingBudget;
+
+            // Ensure maxOutputTokens is large enough when budget is specified
+            if (!googleRequest.generationConfig.maxOutputTokens ||
+                googleRequest.generationConfig.maxOutputTokens <= thinkingBudget) {
+                googleRequest.generationConfig.maxOutputTokens = CLAUDE_THINKING_MAX_OUTPUT_TOKENS;
+            }
+
+            console.log('[FormatConverter] Thinking enabled with budget:', thinkingBudget);
+        } else {
+            console.log('[FormatConverter] Thinking enabled (no budget specified)');
        }

-        console.log('[FormatConverter] Thinking enabled with budget:', thinkingBudget);
+        googleRequest.generationConfig.thinkingConfig = thinkingConfig;
    }

    // Convert tools to Google format
@@ -696,7 +701,11 @@ export function convertGoogleToAnthropic(googleResponse, model) {
    }

    // Extract usage metadata
+    // Note: Antigravity's promptTokenCount is the TOTAL (includes cached),
+    // but Anthropic's input_tokens excludes cached. We subtract to match.
    const usageMetadata = response.usageMetadata || {};
+    const promptTokens = usageMetadata.promptTokenCount || 0;
+    const cachedTokens = usageMetadata.cachedContentTokenCount || 0;

    return {
        id: `msg_${crypto.randomBytes(16).toString('hex')}`,
@@ -707,8 +716,10 @@ export function convertGoogleToAnthropic(googleResponse, model) {
        stop_reason: stopReason,
        stop_sequence: null,
        usage: {
-            input_tokens: usageMetadata.promptTokenCount || 0,
-            output_tokens: usageMetadata.candidatesTokenCount || 0
+            input_tokens: promptTokens - cachedTokens,
+            output_tokens: usageMetadata.candidatesTokenCount || 0,
+            cache_read_input_tokens: cachedTokens,
+            cache_creation_input_tokens: 0
        }
    };
 }