feat: add prompt caching, sticky account selection, and non-thinking model
- Implement sticky account selection for prompt cache continuity - Derive stable session ID from first user message (SHA256 hash) - Return cache_read_input_tokens in usage metadata - Add claude-sonnet-4-5 model without thinking - Remove DEFAULT_THINKING_BUDGET (let API use its default) - Add prompt caching test - Update README and CLAUDE.md documentation 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -15,7 +15,8 @@ import {
|
||||
TOKEN_REFRESH_INTERVAL_MS,
|
||||
ANTIGRAVITY_ENDPOINT_FALLBACKS,
|
||||
ANTIGRAVITY_HEADERS,
|
||||
DEFAULT_PROJECT_ID
|
||||
DEFAULT_PROJECT_ID,
|
||||
MAX_WAIT_BEFORE_ERROR_MS
|
||||
} from './constants.js';
|
||||
import { refreshAccessToken } from './oauth.js';
|
||||
import { formatDuration } from './utils/helpers.js';
|
||||
@@ -198,7 +199,8 @@ export class AccountManager {
|
||||
}
|
||||
|
||||
/**
|
||||
* Pick the next available account (round-robin)
|
||||
* Pick the next available account (round-robin).
|
||||
* Sets activeIndex to the selected account's index.
|
||||
* @returns {Object|null} The next available account or null if none available
|
||||
*/
|
||||
pickNext() {
|
||||
@@ -209,19 +211,28 @@ export class AccountManager {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Find next available account starting from current index
|
||||
for (let i = 0; i < this.#accounts.length; i++) {
|
||||
// Clamp index to valid range
|
||||
if (this.#currentIndex >= this.#accounts.length) {
|
||||
this.#currentIndex = 0;
|
||||
}
|
||||
|
||||
// Find next available account starting from index AFTER current
|
||||
for (let i = 1; i <= this.#accounts.length; i++) {
|
||||
const idx = (this.#currentIndex + i) % this.#accounts.length;
|
||||
const account = this.#accounts[idx];
|
||||
|
||||
if (!account.isRateLimited && !account.isInvalid) {
|
||||
this.#currentIndex = (idx + 1) % this.#accounts.length;
|
||||
// Set activeIndex to this account (not +1)
|
||||
this.#currentIndex = idx;
|
||||
account.lastUsed = Date.now();
|
||||
|
||||
const position = this.#accounts.indexOf(account) + 1;
|
||||
const position = idx + 1;
|
||||
const total = this.#accounts.length;
|
||||
console.log(`[AccountManager] Using account: ${account.email} (${position}/${total})`);
|
||||
|
||||
// Persist the change (don't await to avoid blocking)
|
||||
this.saveToDisk();
|
||||
|
||||
return account;
|
||||
}
|
||||
}
|
||||
@@ -229,6 +240,100 @@ export class AccountManager {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the current account without advancing the index (sticky selection).
|
||||
* Used for cache continuity - sticks to the same account until rate-limited.
|
||||
* @returns {Object|null} The current account or null if unavailable/rate-limited
|
||||
*/
|
||||
getCurrentStickyAccount() {
|
||||
this.clearExpiredLimits();
|
||||
|
||||
if (this.#accounts.length === 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Clamp index to valid range
|
||||
if (this.#currentIndex >= this.#accounts.length) {
|
||||
this.#currentIndex = 0;
|
||||
}
|
||||
|
||||
// Get current account directly (activeIndex = current account)
|
||||
const account = this.#accounts[this.#currentIndex];
|
||||
|
||||
// Return if available
|
||||
if (account && !account.isRateLimited && !account.isInvalid) {
|
||||
account.lastUsed = Date.now();
|
||||
// Persist the change (don't await to avoid blocking)
|
||||
this.saveToDisk();
|
||||
return account;
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if we should wait for the current account's rate limit to reset.
|
||||
* Used for sticky account selection - wait if rate limit is short (≤ threshold).
|
||||
* @returns {{shouldWait: boolean, waitMs: number, account: Object|null}}
|
||||
*/
|
||||
shouldWaitForCurrentAccount() {
|
||||
if (this.#accounts.length === 0) {
|
||||
return { shouldWait: false, waitMs: 0, account: null };
|
||||
}
|
||||
|
||||
// Clamp index to valid range
|
||||
if (this.#currentIndex >= this.#accounts.length) {
|
||||
this.#currentIndex = 0;
|
||||
}
|
||||
|
||||
// Get current account directly (activeIndex = current account)
|
||||
const account = this.#accounts[this.#currentIndex];
|
||||
|
||||
if (!account || account.isInvalid) {
|
||||
return { shouldWait: false, waitMs: 0, account: null };
|
||||
}
|
||||
|
||||
if (account.isRateLimited && account.rateLimitResetTime) {
|
||||
const waitMs = account.rateLimitResetTime - Date.now();
|
||||
|
||||
// If wait time is within threshold, recommend waiting
|
||||
if (waitMs > 0 && waitMs <= MAX_WAIT_BEFORE_ERROR_MS) {
|
||||
return { shouldWait: true, waitMs, account };
|
||||
}
|
||||
}
|
||||
|
||||
return { shouldWait: false, waitMs: 0, account };
|
||||
}
|
||||
|
||||
/**
|
||||
* Pick an account with sticky selection preference.
|
||||
* Prefers the current account for cache continuity, only switches when:
|
||||
* - Current account is rate-limited for > 2 minutes
|
||||
* - Current account is invalid
|
||||
* @returns {{account: Object|null, waitMs: number}} Account to use and optional wait time
|
||||
*/
|
||||
pickStickyAccount() {
|
||||
// First try to get the current sticky account
|
||||
const stickyAccount = this.getCurrentStickyAccount();
|
||||
if (stickyAccount) {
|
||||
return { account: stickyAccount, waitMs: 0 };
|
||||
}
|
||||
|
||||
// Check if we should wait for current account
|
||||
const waitInfo = this.shouldWaitForCurrentAccount();
|
||||
if (waitInfo.shouldWait) {
|
||||
console.log(`[AccountManager] Waiting ${formatDuration(waitInfo.waitMs)} for sticky account: ${waitInfo.account.email}`);
|
||||
return { account: null, waitMs: waitInfo.waitMs };
|
||||
}
|
||||
|
||||
// Current account unavailable for too long, switch to next available
|
||||
const nextAccount = this.pickNext();
|
||||
if (nextAccount) {
|
||||
console.log(`[AccountManager] Switched to new account for cache: ${nextAccount.email}`);
|
||||
}
|
||||
return { account: nextAccount, waitMs: 0 };
|
||||
}
|
||||
|
||||
/**
|
||||
* Mark an account as rate-limited
|
||||
* @param {string} email - Email of the account to mark
|
||||
|
||||
@@ -42,6 +42,44 @@ function isAuthInvalidError(error) {
|
||||
return isAuthError(error);
|
||||
}
|
||||
|
||||
/**
|
||||
* Derive a stable session ID from the first user message in the conversation.
|
||||
* This ensures the same conversation uses the same session ID across turns,
|
||||
* enabling prompt caching (cache is scoped to session + organization).
|
||||
*
|
||||
* @param {Object} anthropicRequest - The Anthropic-format request
|
||||
* @returns {string} A stable session ID (32 hex characters) or random UUID if no user message
|
||||
*/
|
||||
function deriveSessionId(anthropicRequest) {
|
||||
const messages = anthropicRequest.messages || [];
|
||||
|
||||
// Find the first user message
|
||||
for (const msg of messages) {
|
||||
if (msg.role === 'user') {
|
||||
let content = '';
|
||||
|
||||
if (typeof msg.content === 'string') {
|
||||
content = msg.content;
|
||||
} else if (Array.isArray(msg.content)) {
|
||||
// Extract text from content blocks
|
||||
content = msg.content
|
||||
.filter(block => block.type === 'text' && block.text)
|
||||
.map(block => block.text)
|
||||
.join('\n');
|
||||
}
|
||||
|
||||
if (content) {
|
||||
// Hash the content with SHA256, return first 32 hex chars
|
||||
const hash = crypto.createHash('sha256').update(content).digest('hex');
|
||||
return hash.substring(0, 32);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to random UUID if no user message found
|
||||
return crypto.randomUUID();
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse reset time from HTTP response or error
|
||||
* Checks headers first, then error message body
|
||||
@@ -184,8 +222,8 @@ function buildCloudCodeRequest(anthropicRequest, projectId) {
|
||||
const model = mapModelName(anthropicRequest.model);
|
||||
const googleRequest = convertAnthropicToGoogle(anthropicRequest);
|
||||
|
||||
// Use random session ID for API tracking
|
||||
googleRequest.sessionId = crypto.randomUUID();
|
||||
// Use stable session ID derived from first user message for cache continuity
|
||||
googleRequest.sessionId = deriveSessionId(anthropicRequest);
|
||||
|
||||
const payload = {
|
||||
project: projectId,
|
||||
@@ -244,26 +282,35 @@ export async function sendMessage(anthropicRequest, accountManager) {
|
||||
const maxAttempts = Math.max(MAX_RETRIES, accountManager.getAccountCount() + 1);
|
||||
|
||||
for (let attempt = 0; attempt < maxAttempts; attempt++) {
|
||||
// Get next available account
|
||||
let account = accountManager.pickNext();
|
||||
// Use sticky account selection for cache continuity
|
||||
const { account: stickyAccount, waitMs } = accountManager.pickStickyAccount();
|
||||
let account = stickyAccount;
|
||||
|
||||
// Handle waiting for sticky account
|
||||
if (!account && waitMs > 0) {
|
||||
console.log(`[CloudCode] Waiting ${formatDuration(waitMs)} for sticky account...`);
|
||||
await sleep(waitMs);
|
||||
accountManager.clearExpiredLimits();
|
||||
account = accountManager.getCurrentStickyAccount();
|
||||
}
|
||||
|
||||
// Handle all accounts rate-limited
|
||||
if (!account) {
|
||||
if (accountManager.isAllRateLimited()) {
|
||||
const waitMs = accountManager.getMinWaitTimeMs();
|
||||
const resetTime = new Date(Date.now() + waitMs).toISOString();
|
||||
const allWaitMs = accountManager.getMinWaitTimeMs();
|
||||
const resetTime = new Date(Date.now() + allWaitMs).toISOString();
|
||||
|
||||
// If wait time is too long (> 2 minutes), throw error immediately
|
||||
if (waitMs > MAX_WAIT_BEFORE_ERROR_MS) {
|
||||
if (allWaitMs > MAX_WAIT_BEFORE_ERROR_MS) {
|
||||
throw new Error(
|
||||
`RESOURCE_EXHAUSTED: Rate limited. Quota will reset after ${formatDuration(waitMs)}. Next available: ${resetTime}`
|
||||
`RESOURCE_EXHAUSTED: Rate limited. Quota will reset after ${formatDuration(allWaitMs)}. Next available: ${resetTime}`
|
||||
);
|
||||
}
|
||||
|
||||
// Wait for reset (applies to both single and multi-account modes)
|
||||
const accountCount = accountManager.getAccountCount();
|
||||
console.log(`[CloudCode] All ${accountCount} account(s) rate-limited. Waiting ${formatDuration(waitMs)}...`);
|
||||
await sleep(waitMs);
|
||||
console.log(`[CloudCode] All ${accountCount} account(s) rate-limited. Waiting ${formatDuration(allWaitMs)}...`);
|
||||
await sleep(allWaitMs);
|
||||
accountManager.clearExpiredLimits();
|
||||
account = accountManager.pickNext();
|
||||
}
|
||||
@@ -498,26 +545,35 @@ export async function* sendMessageStream(anthropicRequest, accountManager) {
|
||||
const maxAttempts = Math.max(MAX_RETRIES, accountManager.getAccountCount() + 1);
|
||||
|
||||
for (let attempt = 0; attempt < maxAttempts; attempt++) {
|
||||
// Get next available account
|
||||
let account = accountManager.pickNext();
|
||||
// Use sticky account selection for cache continuity
|
||||
const { account: stickyAccount, waitMs } = accountManager.pickStickyAccount();
|
||||
let account = stickyAccount;
|
||||
|
||||
// Handle waiting for sticky account
|
||||
if (!account && waitMs > 0) {
|
||||
console.log(`[CloudCode] Waiting ${formatDuration(waitMs)} for sticky account...`);
|
||||
await sleep(waitMs);
|
||||
accountManager.clearExpiredLimits();
|
||||
account = accountManager.getCurrentStickyAccount();
|
||||
}
|
||||
|
||||
// Handle all accounts rate-limited
|
||||
if (!account) {
|
||||
if (accountManager.isAllRateLimited()) {
|
||||
const waitMs = accountManager.getMinWaitTimeMs();
|
||||
const resetTime = new Date(Date.now() + waitMs).toISOString();
|
||||
const allWaitMs = accountManager.getMinWaitTimeMs();
|
||||
const resetTime = new Date(Date.now() + allWaitMs).toISOString();
|
||||
|
||||
// If wait time is too long (> 2 minutes), throw error immediately
|
||||
if (waitMs > MAX_WAIT_BEFORE_ERROR_MS) {
|
||||
if (allWaitMs > MAX_WAIT_BEFORE_ERROR_MS) {
|
||||
throw new Error(
|
||||
`RESOURCE_EXHAUSTED: Rate limited. Quota will reset after ${formatDuration(waitMs)}. Next available: ${resetTime}`
|
||||
`RESOURCE_EXHAUSTED: Rate limited. Quota will reset after ${formatDuration(allWaitMs)}. Next available: ${resetTime}`
|
||||
);
|
||||
}
|
||||
|
||||
// Wait for reset (applies to both single and multi-account modes)
|
||||
const accountCount = accountManager.getAccountCount();
|
||||
console.log(`[CloudCode] All ${accountCount} account(s) rate-limited. Waiting ${formatDuration(waitMs)}...`);
|
||||
await sleep(waitMs);
|
||||
console.log(`[CloudCode] All ${accountCount} account(s) rate-limited. Waiting ${formatDuration(allWaitMs)}...`);
|
||||
await sleep(allWaitMs);
|
||||
accountManager.clearExpiredLimits();
|
||||
account = accountManager.pickNext();
|
||||
}
|
||||
@@ -629,6 +685,7 @@ async function* streamSSEResponse(response, originalModel) {
|
||||
let currentThinkingSignature = '';
|
||||
let inputTokens = 0;
|
||||
let outputTokens = 0;
|
||||
let cacheReadTokens = 0;
|
||||
let stopReason = 'end_turn';
|
||||
|
||||
const reader = response.body.getReader();
|
||||
@@ -653,11 +710,12 @@ async function* streamSSEResponse(response, originalModel) {
|
||||
const data = JSON.parse(jsonText);
|
||||
const innerResponse = data.response || data;
|
||||
|
||||
// Extract usage metadata
|
||||
// Extract usage metadata (including cache tokens)
|
||||
const usage = innerResponse.usageMetadata;
|
||||
if (usage) {
|
||||
inputTokens = usage.promptTokenCount || inputTokens;
|
||||
outputTokens = usage.candidatesTokenCount || outputTokens;
|
||||
cacheReadTokens = usage.cachedContentTokenCount || cacheReadTokens;
|
||||
}
|
||||
|
||||
const candidates = innerResponse.candidates || [];
|
||||
@@ -666,6 +724,7 @@ async function* streamSSEResponse(response, originalModel) {
|
||||
const parts = content.parts || [];
|
||||
|
||||
// Emit message_start on first data
|
||||
// Note: input_tokens = promptTokenCount - cachedContentTokenCount (Antigravity includes cached in total)
|
||||
if (!hasEmittedStart && parts.length > 0) {
|
||||
hasEmittedStart = true;
|
||||
yield {
|
||||
@@ -678,7 +737,12 @@ async function* streamSSEResponse(response, originalModel) {
|
||||
model: originalModel,
|
||||
stop_reason: null,
|
||||
stop_sequence: null,
|
||||
usage: { input_tokens: inputTokens, output_tokens: 0 }
|
||||
usage: {
|
||||
input_tokens: inputTokens - cacheReadTokens,
|
||||
output_tokens: 0,
|
||||
cache_read_input_tokens: cacheReadTokens,
|
||||
cache_creation_input_tokens: 0
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
@@ -817,7 +881,12 @@ async function* streamSSEResponse(response, originalModel) {
|
||||
model: originalModel,
|
||||
stop_reason: null,
|
||||
stop_sequence: null,
|
||||
usage: { input_tokens: inputTokens, output_tokens: 0 }
|
||||
usage: {
|
||||
input_tokens: inputTokens - cacheReadTokens,
|
||||
output_tokens: 0,
|
||||
cache_read_input_tokens: cacheReadTokens,
|
||||
cache_creation_input_tokens: 0
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -850,7 +919,11 @@ async function* streamSSEResponse(response, originalModel) {
|
||||
yield {
|
||||
type: 'message_delta',
|
||||
delta: { stop_reason: stopReason, stop_sequence: null },
|
||||
usage: { output_tokens: outputTokens }
|
||||
usage: {
|
||||
output_tokens: outputTokens,
|
||||
cache_read_input_tokens: cacheReadTokens,
|
||||
cache_creation_input_tokens: 0
|
||||
}
|
||||
};
|
||||
|
||||
yield { type: 'message_stop' };
|
||||
|
||||
@@ -93,7 +93,6 @@ export const MAX_ACCOUNTS = 10; // Maximum number of accounts allowed
|
||||
export const MAX_WAIT_BEFORE_ERROR_MS = 120000; // 2 minutes - throw error if wait exceeds this
|
||||
|
||||
// Thinking model constants
|
||||
export const DEFAULT_THINKING_BUDGET = 16000; // Default thinking budget tokens
|
||||
export const CLAUDE_THINKING_MAX_OUTPUT_TOKENS = 64000; // Max output tokens for thinking models
|
||||
export const MIN_SIGNATURE_LENGTH = 50; // Minimum valid thinking signature length
|
||||
|
||||
@@ -131,7 +130,6 @@ export default {
|
||||
MAX_RETRIES,
|
||||
MAX_ACCOUNTS,
|
||||
MAX_WAIT_BEFORE_ERROR_MS,
|
||||
DEFAULT_THINKING_BUDGET,
|
||||
CLAUDE_THINKING_MAX_OUTPUT_TOKENS,
|
||||
MIN_SIGNATURE_LENGTH,
|
||||
OAUTH_CONFIG,
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
import crypto from 'crypto';
|
||||
import {
|
||||
MODEL_MAPPINGS,
|
||||
DEFAULT_THINKING_BUDGET,
|
||||
CLAUDE_THINKING_MAX_OUTPUT_TOKENS,
|
||||
MIN_SIGNATURE_LENGTH
|
||||
} from './constants.js';
|
||||
@@ -502,21 +501,27 @@ export function convertAnthropicToGoogle(anthropicRequest) {
|
||||
|
||||
// Enable thinking for Claude thinking models
|
||||
if (isClaudeThinkingModel) {
|
||||
// Get budget from request or use default
|
||||
const thinkingBudget = thinking?.budget_tokens || DEFAULT_THINKING_BUDGET;
|
||||
|
||||
googleRequest.generationConfig.thinkingConfig = {
|
||||
include_thoughts: true,
|
||||
thinking_budget: thinkingBudget
|
||||
const thinkingConfig = {
|
||||
include_thoughts: true
|
||||
};
|
||||
|
||||
// Ensure maxOutputTokens is large enough for thinking models
|
||||
if (!googleRequest.generationConfig.maxOutputTokens ||
|
||||
googleRequest.generationConfig.maxOutputTokens <= thinkingBudget) {
|
||||
googleRequest.generationConfig.maxOutputTokens = CLAUDE_THINKING_MAX_OUTPUT_TOKENS;
|
||||
// Only set thinking_budget if explicitly provided
|
||||
const thinkingBudget = thinking?.budget_tokens;
|
||||
if (thinkingBudget) {
|
||||
thinkingConfig.thinking_budget = thinkingBudget;
|
||||
|
||||
// Ensure maxOutputTokens is large enough when budget is specified
|
||||
if (!googleRequest.generationConfig.maxOutputTokens ||
|
||||
googleRequest.generationConfig.maxOutputTokens <= thinkingBudget) {
|
||||
googleRequest.generationConfig.maxOutputTokens = CLAUDE_THINKING_MAX_OUTPUT_TOKENS;
|
||||
}
|
||||
|
||||
console.log('[FormatConverter] Thinking enabled with budget:', thinkingBudget);
|
||||
} else {
|
||||
console.log('[FormatConverter] Thinking enabled (no budget specified)');
|
||||
}
|
||||
|
||||
console.log('[FormatConverter] Thinking enabled with budget:', thinkingBudget);
|
||||
googleRequest.generationConfig.thinkingConfig = thinkingConfig;
|
||||
}
|
||||
|
||||
// Convert tools to Google format
|
||||
@@ -696,7 +701,11 @@ export function convertGoogleToAnthropic(googleResponse, model) {
|
||||
}
|
||||
|
||||
// Extract usage metadata
|
||||
// Note: Antigravity's promptTokenCount is the TOTAL (includes cached),
|
||||
// but Anthropic's input_tokens excludes cached. We subtract to match.
|
||||
const usageMetadata = response.usageMetadata || {};
|
||||
const promptTokens = usageMetadata.promptTokenCount || 0;
|
||||
const cachedTokens = usageMetadata.cachedContentTokenCount || 0;
|
||||
|
||||
return {
|
||||
id: `msg_${crypto.randomBytes(16).toString('hex')}`,
|
||||
@@ -707,8 +716,10 @@ export function convertGoogleToAnthropic(googleResponse, model) {
|
||||
stop_reason: stopReason,
|
||||
stop_sequence: null,
|
||||
usage: {
|
||||
input_tokens: usageMetadata.promptTokenCount || 0,
|
||||
output_tokens: usageMetadata.candidatesTokenCount || 0
|
||||
input_tokens: promptTokens - cachedTokens,
|
||||
output_tokens: usageMetadata.candidatesTokenCount || 0,
|
||||
cache_read_input_tokens: cachedTokens,
|
||||
cache_creation_input_tokens: 0
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user