From 5a85f0cfccb92b79f0c93acf7f0f9c344d30cf4f Mon Sep 17 00:00:00 2001 From: Badri Narayanan S Date: Sat, 24 Jan 2026 22:43:53 +0530 Subject: [PATCH] feat: comprehensive rate limit handling overhaul (inspired by opencode-antigravity-auth) This commit addresses "Max retries exceeded" errors during stress testing where all accounts would become exhausted simultaneously due to short per-second rate limits triggering cascading failures. ## Rate Limit Parser (`rate-limit-parser.js`) - Remove 2s buffer enforcement that caused cascading failures when API returned short reset times (200-600ms). Now adds 200ms buffer for sub-500ms resets - Add `parseRateLimitReason()` for smart backoff based on error type: QUOTA_EXHAUSTED, RATE_LIMIT_EXCEEDED, MODEL_CAPACITY_EXHAUSTED, SERVER_ERROR ## Message/Streaming Handlers - Add per-account+model rate limit state tracking with exponential backoff - For short rate limits (< 1 second), wait and retry on same account instead of switching - prevents thundering herd when all accounts hit per-second limits - Add throttle wait support for fallback modes (emergency/lastResort) - Add `calculateSmartBackoff()` with progressive tiers by error type ## HybridStrategy (`hybrid-strategy.js`) - Refactor `#getCandidates()` to return 4 fallback levels: - `normal`: All filters pass (health, tokens, quota) - `quota`: Bypass critical quota check - `emergency`: Bypass health check when ALL accounts unhealthy - `lastResort`: Bypass BOTH health AND token bucket checks - Add throttle wait times: 500ms for lastResort, 250ms for emergency - Fix LRU calculation to use seconds (matches opencode-antigravity-auth) ## Health Tracker - Increase `recoveryPerHour` from 2 to 10 for faster recovery (1 hour vs 5 hours) ## Account Manager - Add consecutive failure tracking: `getConsecutiveFailures()`, `incrementConsecutiveFailures()`, `resetConsecutiveFailures()` - Add cooldown mechanism separate from rate limits with `CooldownReason` - Reset consecutive failures on 
successful request ## Base Strategy - Add `isAccountCoolingDown()` check in `isAccountUsable()` ## Constants - Replace fixed `CAPACITY_RETRY_DELAY_MS` with progressive `CAPACITY_BACKOFF_TIERS_MS` - Add `BACKOFF_BY_ERROR_TYPE` for smart backoff - Add `QUOTA_EXHAUSTED_BACKOFF_TIERS_MS` for progressive quota backoff - Add `MIN_BACKOFF_MS` floor to prevent "Available in 0s" loops - Increase `MAX_CAPACITY_RETRIES` from 3 to 5 - Reduce `RATE_LIMIT_DEDUP_WINDOW_MS` from 5s to 2s ## Frontend - Remove `capacityRetryDelayMs` config (replaced by progressive tiers) - Update default `maxCapacityRetries` display from 3 to 5 ## Testing - Add `tests/stress-test.cjs` for concurrent request stress testing Co-Authored-By: Claude --- public/js/components/server-config.js | 6 - public/js/config/constants.js | 4 - public/js/translations/en.js | 2 - public/js/translations/id.js | 2 - public/js/translations/pt.js | 2 - public/js/translations/tr.js | 2 - public/js/translations/zh.js | 2 - public/views/settings.html | 32 +-- src/account-manager/index.js | 83 +++++- src/account-manager/rate-limits.js | 132 +++++++++ .../strategies/base-strategy.js | 5 + .../strategies/hybrid-strategy.js | 96 +++++-- .../strategies/trackers/health-tracker.js | 2 +- src/cloudcode/message-handler.js | 272 +++++++++++++----- src/cloudcode/rate-limit-parser.js | 66 ++++- src/cloudcode/streaming-handler.js | 265 ++++++++++++----- src/config.js | 5 + src/constants.js | 37 ++- src/webui/index.js | 5 +- tests/stress-test.cjs | 93 ++++++ 20 files changed, 869 insertions(+), 244 deletions(-) create mode 100644 tests/stress-test.cjs diff --git a/public/js/components/server-config.js b/public/js/components/server-config.js index 9fd168b..5ff98fc 100644 --- a/public/js/components/server-config.js +++ b/public/js/components/server-config.js @@ -274,12 +274,6 @@ window.Components.serverConfig = () => ({ (v) => window.Validators.validateTimeout(v, EXTENDED_COOLDOWN_MIN, EXTENDED_COOLDOWN_MAX)); }, - 
toggleCapacityRetryDelayMs(value) { - const { CAPACITY_RETRY_DELAY_MIN, CAPACITY_RETRY_DELAY_MAX } = window.AppConstants.VALIDATION; - this.saveConfigField('capacityRetryDelayMs', value, 'Capacity Retry Delay', - (v) => window.Validators.validateTimeout(v, CAPACITY_RETRY_DELAY_MIN, CAPACITY_RETRY_DELAY_MAX)); - }, - toggleMaxCapacityRetries(value) { const { MAX_CAPACITY_RETRIES_MIN, MAX_CAPACITY_RETRIES_MAX } = window.AppConstants.VALIDATION; this.saveConfigField('maxCapacityRetries', value, 'Max Capacity Retries', diff --git a/public/js/config/constants.js b/public/js/config/constants.js index 258ceb3..f9eaeb9 100644 --- a/public/js/config/constants.js +++ b/public/js/config/constants.js @@ -85,10 +85,6 @@ window.AppConstants.VALIDATION = { EXTENDED_COOLDOWN_MIN: 10000, EXTENDED_COOLDOWN_MAX: 300000, - // Capacity retry delay (500ms - 10 seconds) - CAPACITY_RETRY_DELAY_MIN: 500, - CAPACITY_RETRY_DELAY_MAX: 10000, - // Capacity retries (1 - 10) MAX_CAPACITY_RETRIES_MIN: 1, MAX_CAPACITY_RETRIES_MAX: 10 diff --git a/public/js/translations/en.js b/public/js/translations/en.js index 91d1d67..324c499 100644 --- a/public/js/translations/en.js +++ b/public/js/translations/en.js @@ -245,8 +245,6 @@ window.translations.en = { maxConsecutiveFailuresDesc: "Number of consecutive failures before applying extended cooldown to an account.", extendedCooldown: "Extended Cooldown", extendedCooldownDesc: "Cooldown duration applied after max consecutive failures reached.", - capacityRetryDelay: "Capacity Retry Delay", - capacityRetryDelayDesc: "Delay before retrying when model capacity is exhausted (not quota).", maxCapacityRetries: "Max Capacity Retries", maxCapacityRetriesDesc: "Maximum retries for capacity exhaustion before switching accounts.", saveConfigServer: "Save Configuration", diff --git a/public/js/translations/id.js b/public/js/translations/id.js index c06bc1c..c9a8b6f 100644 --- a/public/js/translations/id.js +++ b/public/js/translations/id.js @@ -278,8 +278,6 @@ 
window.translations.id = { maxConsecutiveFailuresDesc: "Jumlah kegagalan berturut-turut sebelum menerapkan cooldown diperpanjang.", extendedCooldown: "Cooldown Diperpanjang", extendedCooldownDesc: "Durasi cooldown setelah mencapai maks. kegagalan berturut-turut.", - capacityRetryDelay: "Jeda Retry Kapasitas", - capacityRetryDelayDesc: "Jeda sebelum retry saat kapasitas model habis (bukan kuota).", maxCapacityRetries: "Maks. Retry Kapasitas", maxCapacityRetriesDesc: "Maksimum retry untuk kehabisan kapasitas sebelum ganti akun.", saveConfigServer: "Simpan Konfigurasi", diff --git a/public/js/translations/pt.js b/public/js/translations/pt.js index 29f73f4..9de8f46 100644 --- a/public/js/translations/pt.js +++ b/public/js/translations/pt.js @@ -223,8 +223,6 @@ window.translations.pt = { maxConsecutiveFailuresDesc: "Número de falhas consecutivas antes de aplicar resfriamento estendido.", extendedCooldown: "Resfriamento Estendido", extendedCooldownDesc: "Duração do resfriamento aplicado após atingir máx. de falhas consecutivas.", - capacityRetryDelay: "Atraso de Retry de Capacidade", - capacityRetryDelayDesc: "Atraso antes de tentar novamente quando capacidade do modelo está esgotada (não quota).", maxCapacityRetries: "Máx. Retries de Capacidade", maxCapacityRetriesDesc: "Máximo de retries para esgotamento de capacidade antes de trocar conta.", saveConfigServer: "Salvar Configuração", diff --git a/public/js/translations/tr.js b/public/js/translations/tr.js index 1d00cec..d7416da 100644 --- a/public/js/translations/tr.js +++ b/public/js/translations/tr.js @@ -227,8 +227,6 @@ window.translations.tr = { maxConsecutiveFailuresDesc: "Uzatılmış soğuma uygulamadan önce ardışık başarısızlık sayısı.", extendedCooldown: "Uzatılmış Soğuma", extendedCooldownDesc: "Maks. 
ardışık başarısızlık sonrası uygulanan soğuma süresi.", - capacityRetryDelay: "Kapasite Yeniden Deneme Gecikmesi", - capacityRetryDelayDesc: "Model kapasitesi tükendiğinde (kota değil) yeniden denemeden önceki gecikme.", maxCapacityRetries: "Maks. Kapasite Yeniden Denemesi", maxCapacityRetriesDesc: "Hesap değiştirmeden önce kapasite tükenmesi için maksimum yeniden deneme.", saveConfigServer: "Yapılandırmayı Kaydet", diff --git a/public/js/translations/zh.js b/public/js/translations/zh.js index dc924d6..de2c7e4 100644 --- a/public/js/translations/zh.js +++ b/public/js/translations/zh.js @@ -245,8 +245,6 @@ window.translations.zh = { maxConsecutiveFailuresDesc: "触发扩展冷却前允许的连续失败次数。", extendedCooldown: "扩展冷却时间", extendedCooldownDesc: "达到最大连续失败后应用的冷却时长。", - capacityRetryDelay: "容量重试延迟", - capacityRetryDelayDesc: "模型容量耗尽(非配额)时重试前的延迟。", maxCapacityRetries: "最大容量重试次数", maxCapacityRetriesDesc: "容量耗尽时在切换账号前的最大重试次数。", saveConfigServer: "保存配置", diff --git a/public/views/settings.html b/public/views/settings.html index 1a48fb3..6a8463f 100644 --- a/public/views/settings.html +++ b/public/views/settings.html @@ -1226,47 +1226,23 @@ x-text="$store.global.t('extendedCooldownDesc')">Applied after max consecutive failures.

-
- -
- - -
-

Delay for capacity (not quota) issues.

-
-
diff --git a/src/account-manager/index.js b/src/account-manager/index.js index e9988b3..f9aecb9 100644 --- a/src/account-manager/index.js +++ b/src/account-manager/index.js @@ -15,7 +15,15 @@ import { markRateLimited as markLimited, markInvalid as markAccountInvalid, getMinWaitTimeMs as getMinWait, - getRateLimitInfo as getLimitInfo + getRateLimitInfo as getLimitInfo, + getConsecutiveFailures as getFailures, + resetConsecutiveFailures as resetFailures, + incrementConsecutiveFailures as incrementFailures, + markAccountCoolingDown as markCoolingDown, + isAccountCoolingDown as checkCoolingDown, + clearAccountCooldown as clearCooldown, + getCooldownRemaining as getCooldownMs, + CooldownReason } from './rate-limits.js'; import { getTokenForAccount as fetchToken, @@ -182,6 +190,10 @@ export class AccountManager { if (this.#strategy) { this.#strategy.onSuccess(account, modelId); } + // Reset consecutive failures on success (matches opencode-antigravity-auth) + if (account?.email) { + resetFailures(this.#accounts, account.email); + } } /** @@ -206,6 +218,26 @@ export class AccountManager { } } + /** + * Get the consecutive failure count for an account + * Used for progressive backoff calculation + * @param {string} email - Account email + * @returns {number} Number of consecutive failures + */ + getConsecutiveFailures(email) { + return getFailures(this.#accounts, email); + } + + /** + * Increment the consecutive failure count without marking as rate limited + * Used for quick retries to track failures while staying on same account + * @param {string} email - Account email + * @returns {number} New consecutive failure count + */ + incrementConsecutiveFailures(email) { + return incrementFailures(this.#accounts, email); + } + /** * Get the current strategy name * @returns {string} Strategy name @@ -275,6 +307,52 @@ export class AccountManager { return getLimitInfo(this.#accounts, email, modelId); } + // 
============================================================================ + // Cooldown Methods (matches opencode-antigravity-auth) + // ============================================================================ + + /** + * Mark an account as cooling down for a specified duration + * Used for temporary backoff separate from rate limits + * @param {string} email - Email of the account + * @param {number} cooldownMs - Duration of cooldown in milliseconds + * @param {string} [reason] - Reason for the cooldown (use CooldownReason constants) + */ + markAccountCoolingDown(email, cooldownMs, reason = CooldownReason.RATE_LIMIT) { + markCoolingDown(this.#accounts, email, cooldownMs, reason); + } + + /** + * Check if an account is currently cooling down + * @param {string} email - Email of the account + * @returns {boolean} True if account is cooling down + */ + isAccountCoolingDown(email) { + const account = this.#accounts.find(a => a.email === email); + return account ? checkCoolingDown(account) : false; + } + + /** + * Clear the cooldown for an account + * @param {string} email - Email of the account + */ + clearAccountCooldown(email) { + const account = this.#accounts.find(a => a.email === email); + if (account) { + clearCooldown(account); + } + } + + /** + * Get time remaining until cooldown expires for an account + * @param {string} email - Email of the account + * @returns {number} Milliseconds until cooldown expires, 0 if not cooling down + */ + getCooldownRemaining(email) { + const account = this.#accounts.find(a => a.email === email); + return account ? 
getCooldownMs(account) : 0; + } + /** * Get OAuth token for an account * @param {Object} account - Account object with email and credentials @@ -378,4 +456,7 @@ export class AccountManager { } } +// Re-export CooldownReason for use by handlers +export { CooldownReason }; + export default AccountManager; diff --git a/src/account-manager/rate-limits.js b/src/account-manager/rate-limits.js index 0dca07a..b55c457 100644 --- a/src/account-manager/rate-limits.js +++ b/src/account-manager/rate-limits.js @@ -133,6 +133,9 @@ export function markRateLimited(accounts, email, resetMs = null, modelId) { actualResetMs: actualResetMs // Original duration from API }; + // Track consecutive failures for progressive backoff (matches opencode-antigravity-auth) + account.consecutiveFailures = (account.consecutiveFailures || 0) + 1; + // Log appropriately based on duration if (actualResetMs > DEFAULT_COOLDOWN_MS) { logger.warn( @@ -235,3 +238,132 @@ export function getRateLimitInfo(accounts, email, modelId) { waitMs }; } + +/** + * Get the consecutive failure count for an account + * Used for progressive backoff calculation (matches opencode-antigravity-auth) + * + * @param {Array} accounts - Array of account objects + * @param {string} email - Email of the account + * @returns {number} Number of consecutive failures + */ +export function getConsecutiveFailures(accounts, email) { + const account = accounts.find(a => a.email === email); + return account?.consecutiveFailures || 0; +} + +/** + * Reset the consecutive failure count for an account + * Called on successful request (matches opencode-antigravity-auth) + * + * @param {Array} accounts - Array of account objects + * @param {string} email - Email of the account + * @returns {boolean} True if account was found and reset + */ +export function resetConsecutiveFailures(accounts, email) { + const account = accounts.find(a => a.email === email); + if (!account) return false; + account.consecutiveFailures = 0; + return true; +} + +/** + 
* Increment the consecutive failure count for an account WITHOUT marking as rate limited + * Used for quick retries where we want to track failures but not skip the account + * (matches opencode-antigravity-auth behavior of always incrementing on 429) + * + * @param {Array} accounts - Array of account objects + * @param {string} email - Email of the account + * @returns {number} New consecutive failure count + */ +export function incrementConsecutiveFailures(accounts, email) { + const account = accounts.find(a => a.email === email); + if (!account) return 0; + account.consecutiveFailures = (account.consecutiveFailures || 0) + 1; + return account.consecutiveFailures; +} + +// ============================================================================ +// Cooldown Mechanism (matches opencode-antigravity-auth) +// Separate from rate limits - used for temporary backoff after failures +// ============================================================================ + +/** + * Cooldown reasons for debugging/logging + */ +export const CooldownReason = { + RATE_LIMIT: 'rate_limit', + AUTH_FAILURE: 'auth_failure', + CONSECUTIVE_FAILURES: 'consecutive_failures', + SERVER_ERROR: 'server_error' +}; + +/** + * Mark an account as cooling down for a specified duration + * Used for temporary backoff separate from rate limits + * + * @param {Array} accounts - Array of account objects + * @param {string} email - Email of the account + * @param {number} cooldownMs - Duration of cooldown in milliseconds + * @param {string} [reason] - Reason for the cooldown + * @returns {boolean} True if account was found and marked + */ +export function markAccountCoolingDown(accounts, email, cooldownMs, reason = CooldownReason.RATE_LIMIT) { + const account = accounts.find(a => a.email === email); + if (!account) return false; + + account.coolingDownUntil = Date.now() + cooldownMs; + account.cooldownReason = reason; + + logger.debug(`[AccountManager] Account ${email} cooling down for 
${formatDuration(cooldownMs)} (reason: ${reason})`); + return true; +} + +/** + * Check if an account is currently cooling down + * Automatically clears expired cooldowns + * + * @param {Object} account - Account object + * @returns {boolean} True if account is cooling down + */ +export function isAccountCoolingDown(account) { + if (!account || account.coolingDownUntil === undefined) { + return false; + } + + const now = Date.now(); + if (now >= account.coolingDownUntil) { + // Cooldown expired - clear it + clearAccountCooldown(account); + return false; + } + + return true; +} + +/** + * Clear the cooldown for an account + * + * @param {Object} account - Account object + */ +export function clearAccountCooldown(account) { + if (account) { + delete account.coolingDownUntil; + delete account.cooldownReason; + } +} + +/** + * Get time remaining until cooldown expires for an account + * + * @param {Object} account - Account object + * @returns {number} Milliseconds until cooldown expires, 0 if not cooling down + */ +export function getCooldownRemaining(account) { + if (!account || account.coolingDownUntil === undefined) { + return 0; + } + + const remaining = account.coolingDownUntil - Date.now(); + return remaining > 0 ? remaining : 0; +} diff --git a/src/account-manager/strategies/base-strategy.js b/src/account-manager/strategies/base-strategy.js index 57007cc..d11bcd5 100644 --- a/src/account-manager/strategies/base-strategy.js +++ b/src/account-manager/strategies/base-strategy.js @@ -5,6 +5,8 @@ * All strategies must implement the selectAccount method. 
*/ +import { isAccountCoolingDown } from '../rate-limits.js'; + /** * @typedef {Object} SelectionResult * @property {Object|null} account - The selected account or null if none available @@ -77,6 +79,9 @@ export class BaseStrategy { // Skip disabled accounts if (account.enabled === false) return false; + // Check if account is cooling down (matches opencode-antigravity-auth) + if (isAccountCoolingDown(account)) return false; + // Check model-specific rate limit if (modelId && account.modelRateLimits && account.modelRateLimits[modelId]) { const limit = account.modelRateLimits[modelId]; diff --git a/src/account-manager/strategies/hybrid-strategy.js b/src/account-manager/strategies/hybrid-strategy.js index 49a0b49..44fd1d1 100644 --- a/src/account-manager/strategies/hybrid-strategy.js +++ b/src/account-manager/strategies/hybrid-strategy.js @@ -65,7 +65,7 @@ export class HybridStrategy extends BaseStrategy { } // Get candidates that pass all filters - const candidates = this.#getCandidates(accounts, modelId); + const { candidates, fallbackLevel } = this.#getCandidates(accounts, modelId); if (candidates.length === 0) { // Diagnose why no candidates are available and compute wait time @@ -87,16 +87,30 @@ export class HybridStrategy extends BaseStrategy { const best = scored[0]; best.account.lastUsed = Date.now(); - // Consume a token from the bucket - this.#tokenBucketTracker.consume(best.account.email); + // Consume a token from the bucket (unless in lastResort mode where we bypassed token check) + if (fallbackLevel !== 'lastResort') { + this.#tokenBucketTracker.consume(best.account.email); + } if (onSave) onSave(); + // Calculate throttle wait time based on fallback level + // This prevents overwhelming the API when all accounts are stressed + let waitMs = 0; + if (fallbackLevel === 'lastResort') { + // All accounts exhausted - add significant delay to allow rate limits to clear + waitMs = 500; + } else if (fallbackLevel === 'emergency') { + // All accounts unhealthy - 
add moderate delay + waitMs = 250; + } + const position = best.index + 1; const total = accounts.length; - logger.info(`[HybridStrategy] Using account: ${best.account.email} (${position}/${total}, score: ${best.score.toFixed(1)})`); + const fallbackInfo = fallbackLevel !== 'normal' ? `, fallback: ${fallbackLevel}` : ''; + logger.info(`[HybridStrategy] Using account: ${best.account.email} (${position}/${total}, score: ${best.score.toFixed(1)}${fallbackInfo})`); - return { account: best.account, index: best.index, waitMs: 0 }; + return { account: best.account, index: best.index, waitMs }; } /** @@ -131,6 +145,8 @@ export class HybridStrategy extends BaseStrategy { /** * Get candidates that pass all filters * @private + * @returns {{candidates: Array, fallbackLevel: string}} Candidates and fallback level used + * fallbackLevel: 'normal' | 'quota' | 'emergency' | 'lastResort' */ #getCandidates(accounts, modelId) { const candidates = accounts @@ -160,24 +176,56 @@ export class HybridStrategy extends BaseStrategy { return true; }); - // If no candidates after quota filter, fall back to all usable accounts - // (better to use critical quota than fail entirely) - if (candidates.length === 0) { - const fallback = accounts - .map((account, index) => ({ account, index })) - .filter(({ account }) => { - if (!this.isAccountUsable(account, modelId)) return false; - if (!this.#healthTracker.isUsable(account.email)) return false; - if (!this.#tokenBucketTracker.hasTokens(account.email)) return false; - return true; - }); - if (fallback.length > 0) { - logger.warn('[HybridStrategy] All accounts have critical quota, using fallback'); - return fallback; - } + if (candidates.length > 0) { + return { candidates, fallbackLevel: 'normal' }; } - return candidates; + // If no candidates after quota filter, fall back to all usable accounts + // (better to use critical quota than fail entirely) + const fallback = accounts + .map((account, index) => ({ account, index })) + .filter(({ account 
}) => { + if (!this.isAccountUsable(account, modelId)) return false; + if (!this.#healthTracker.isUsable(account.email)) return false; + if (!this.#tokenBucketTracker.hasTokens(account.email)) return false; + return true; + }); + if (fallback.length > 0) { + logger.warn('[HybridStrategy] All accounts have critical quota, using fallback'); + return { candidates: fallback, fallbackLevel: 'quota' }; + } + + // Emergency fallback: bypass health check when ALL accounts are unhealthy + // This prevents "Max retries exceeded" when health scores are too low + const emergency = accounts + .map((account, index) => ({ account, index })) + .filter(({ account }) => { + if (!this.isAccountUsable(account, modelId)) return false; + if (!this.#tokenBucketTracker.hasTokens(account.email)) return false; + // Skip health check - use "least bad" account + return true; + }); + if (emergency.length > 0) { + logger.warn('[HybridStrategy] EMERGENCY: All accounts unhealthy, using least bad account'); + return { candidates: emergency, fallbackLevel: 'emergency' }; + } + + // Last resort: bypass BOTH health AND token bucket checks + // Only check basic usability (not rate-limited, not disabled) + const lastResort = accounts + .map((account, index) => ({ account, index })) + .filter(({ account }) => { + // Only check if account is usable (not rate-limited, not disabled) + if (!this.isAccountUsable(account, modelId)) return false; + // Skip health and token bucket checks entirely + return true; + }); + if (lastResort.length > 0) { + logger.warn('[HybridStrategy] LAST RESORT: All accounts exhausted, using any usable account'); + return { candidates: lastResort, fallbackLevel: 'lastResort' }; + } + + return { candidates: [], fallbackLevel: 'normal' }; } /** @@ -202,11 +250,11 @@ export class HybridStrategy extends BaseStrategy { const quotaComponent = quotaScore * this.#weights.quota; // LRU component (older = higher score) - // Use time since last use, capped at 1 hour for scoring + // Use time 
since last use in seconds, capped at 1 hour (matches opencode-antigravity-auth) const lastUsed = account.lastUsed || 0; const timeSinceLastUse = Math.min(Date.now() - lastUsed, 3600000); // Cap at 1 hour - const lruMinutes = timeSinceLastUse / 60000; - const lruComponent = lruMinutes * this.#weights.lru; + const lruSeconds = timeSinceLastUse / 1000; + const lruComponent = lruSeconds * this.#weights.lru; // 0-3600 * 0.1 = 0-360 max return healthComponent + tokenComponent + quotaComponent + lruComponent; } diff --git a/src/account-manager/strategies/trackers/health-tracker.js b/src/account-manager/strategies/trackers/health-tracker.js index a53274f..15d2da2 100644 --- a/src/account-manager/strategies/trackers/health-tracker.js +++ b/src/account-manager/strategies/trackers/health-tracker.js @@ -12,7 +12,7 @@ const DEFAULT_CONFIG = { successReward: 1, // Points on successful request rateLimitPenalty: -10, // Points on rate limit failurePenalty: -20, // Points on other failures - recoveryPerHour: 2, // Passive recovery rate + recoveryPerHour: 10, // Passive recovery rate (increased from 2 for faster recovery) minUsable: 50, // Minimum score to be selected maxScore: 100 // Maximum score cap }; diff --git a/src/cloudcode/message-handler.js b/src/cloudcode/message-handler.js index a283b46..312a974 100644 --- a/src/cloudcode/message-handler.js +++ b/src/cloudcode/message-handler.js @@ -11,63 +11,92 @@ import { MAX_WAIT_BEFORE_ERROR_MS, DEFAULT_COOLDOWN_MS, RATE_LIMIT_DEDUP_WINDOW_MS, + RATE_LIMIT_STATE_RESET_MS, + FIRST_RETRY_DELAY_MS, + SWITCH_ACCOUNT_DELAY_MS, MAX_CONSECUTIVE_FAILURES, EXTENDED_COOLDOWN_MS, - CAPACITY_RETRY_DELAY_MS, + CAPACITY_BACKOFF_TIERS_MS, MAX_CAPACITY_RETRIES, + BACKOFF_BY_ERROR_TYPE, + QUOTA_EXHAUSTED_BACKOFF_TIERS_MS, + MIN_BACKOFF_MS, isThinkingModel } from '../constants.js'; import { convertGoogleToAnthropic } from '../format/index.js'; import { isRateLimitError, isAuthError } from '../errors.js'; import { formatDuration, sleep, isNetworkError 
} from '../utils/helpers.js'; import { logger } from '../utils/logger.js'; -import { parseResetTime } from './rate-limit-parser.js'; +import { parseResetTime, parseRateLimitReason } from './rate-limit-parser.js'; import { buildCloudCodeRequest, buildHeaders } from './request-builder.js'; import { parseThinkingSSEResponse } from './sse-parser.js'; import { getFallbackModel } from '../fallback-config.js'; /** - * Gap 1: Rate limit deduplication - prevents thundering herd on concurrent rate limits - * Tracks last rate limit timestamp per model to skip duplicate retries + * Rate limit deduplication - prevents thundering herd on concurrent rate limits. + * Tracks rate limit state per account+model including consecutive429 count and timestamps. */ -const lastRateLimitTimestamps = new Map(); // modelId -> timestamp +const rateLimitStateByAccountModel = new Map(); // `${email}:${model}` -> { consecutive429, lastAt } /** - * Check if we should skip retry due to recent rate limit on this model + * Get deduplication key for rate limit tracking + * @param {string} email - Account email * @param {string} model - Model ID - * @returns {boolean} True if retry should be skipped (within dedup window) + * @returns {string} Dedup key */ -function shouldSkipRetryDueToDedup(model) { - const lastTimestamp = lastRateLimitTimestamps.get(model); - if (!lastTimestamp) return false; +function getDedupKey(email, model) { + return `${email}:${model}`; +} - const elapsed = Date.now() - lastTimestamp; - if (elapsed < RATE_LIMIT_DEDUP_WINDOW_MS) { - logger.debug(`[CloudCode] Rate limit on ${model} within dedup window (${elapsed}ms ago), skipping retry`); - return true; +/** + * Get rate limit backoff with deduplication and exponential backoff (matches opencode-antigravity-auth) + * @param {string} email - Account email + * @param {string} model - Model ID + * @param {number|null} serverRetryAfterMs - Server-provided retry time + * @returns {{attempt: number, delayMs: number, isDuplicate: 
boolean}} Backoff info + */ +function getRateLimitBackoff(email, model, serverRetryAfterMs) { + const now = Date.now(); + const stateKey = getDedupKey(email, model); + const previous = rateLimitStateByAccountModel.get(stateKey); + + // Check if within dedup window - return duplicate status + if (previous && (now - previous.lastAt < RATE_LIMIT_DEDUP_WINDOW_MS)) { + const baseDelay = serverRetryAfterMs ?? FIRST_RETRY_DELAY_MS; + const backoffDelay = Math.min(baseDelay * Math.pow(2, previous.consecutive429 - 1), 60000); + logger.debug(`[CloudCode] Rate limit on ${email}:${model} within dedup window, attempt=${previous.consecutive429}, isDuplicate=true`); + return { attempt: previous.consecutive429, delayMs: Math.max(baseDelay, backoffDelay), isDuplicate: true }; } - return false; + + // Determine attempt number - reset after RATE_LIMIT_STATE_RESET_MS of inactivity + const attempt = previous && (now - previous.lastAt < RATE_LIMIT_STATE_RESET_MS) + ? previous.consecutive429 + 1 + : 1; + + // Update state + rateLimitStateByAccountModel.set(stateKey, { consecutive429: attempt, lastAt: now }); + + // Calculate exponential backoff + const baseDelay = serverRetryAfterMs ?? 
FIRST_RETRY_DELAY_MS; + const backoffDelay = Math.min(baseDelay * Math.pow(2, attempt - 1), 60000); + + logger.debug(`[CloudCode] Rate limit backoff for ${email}:${model}: attempt=${attempt}, delayMs=${Math.max(baseDelay, backoffDelay)}`); + return { attempt, delayMs: Math.max(baseDelay, backoffDelay), isDuplicate: false }; } /** - * Record rate limit timestamp for deduplication + * Clear rate limit state after successful request + * @param {string} email - Account email * @param {string} model - Model ID */ -function recordRateLimitTimestamp(model) { - lastRateLimitTimestamps.set(model, Date.now()); +function clearRateLimitState(email, model) { + const key = getDedupKey(email, model); + rateLimitStateByAccountModel.delete(key); } /** - * Clear rate limit timestamp after successful retry - * @param {string} model - Model ID - */ -function clearRateLimitTimestamp(model) { - lastRateLimitTimestamps.delete(model); -} - -/** - * Gap 3: Detect permanent authentication failures that require re-authentication - * These should mark the account as invalid rather than just clearing cache + * Detect permanent authentication failures that require re-authentication. + * These should mark the account as invalid rather than just clearing cache. * @param {string} errorText - Error message from API * @returns {boolean} True if permanent auth failure */ @@ -82,8 +111,8 @@ function isPermanentAuthFailure(errorText) { } /** - * Gap 4: Detect if 429 error is due to model capacity (not user quota) - * Capacity issues should retry on same account with shorter delay + * Detect if 429 error is due to model capacity (not user quota). + * Capacity issues should retry on same account with shorter delay. 
* @param {string} errorText - Error message from API * @returns {boolean} True if capacity exhausted (not quota) */ @@ -95,16 +124,47 @@ function isModelCapacityExhausted(errorText) { lower.includes('service temporarily unavailable'); } -// Periodically clean up stale dedup timestamps (every 60 seconds) +// Periodically clean up stale rate limit state (every 60 seconds) setInterval(() => { - const cutoff = Date.now() - 60000; // 1 minute - for (const [model, timestamp] of lastRateLimitTimestamps.entries()) { - if (timestamp < cutoff) { - lastRateLimitTimestamps.delete(model); + const cutoff = Date.now() - RATE_LIMIT_STATE_RESET_MS; + for (const [key, state] of rateLimitStateByAccountModel.entries()) { + if (state.lastAt < cutoff) { + rateLimitStateByAccountModel.delete(key); } } }, 60000); +/** + * Calculate smart backoff based on error type (matches opencode-antigravity-auth) + * @param {string} errorText - Error message + * @param {number|null} serverResetMs - Reset time from server + * @param {number} consecutiveFailures - Number of consecutive failures + * @returns {number} Backoff time in milliseconds + */ +function calculateSmartBackoff(errorText, serverResetMs, consecutiveFailures = 0) { + // If server provides a reset time, use it (with minimum floor to prevent loops) + if (serverResetMs && serverResetMs > 0) { + return Math.max(serverResetMs, MIN_BACKOFF_MS); + } + + const reason = parseRateLimitReason(errorText); + + switch (reason) { + case 'QUOTA_EXHAUSTED': + // Progressive backoff: [60s, 5m, 30m, 2h] + const tierIndex = Math.min(consecutiveFailures, QUOTA_EXHAUSTED_BACKOFF_TIERS_MS.length - 1); + return QUOTA_EXHAUSTED_BACKOFF_TIERS_MS[tierIndex]; + case 'RATE_LIMIT_EXCEEDED': + return BACKOFF_BY_ERROR_TYPE.RATE_LIMIT_EXCEEDED; + case 'MODEL_CAPACITY_EXHAUSTED': + return BACKOFF_BY_ERROR_TYPE.MODEL_CAPACITY_EXHAUSTED; + case 'SERVER_ERROR': + return BACKOFF_BY_ERROR_TYPE.SERVER_ERROR; + default: + return BACKOFF_BY_ERROR_TYPE.UNKNOWN; + } +} + /** * 
Send a non-streaming request to Cloud Code with multi-account support * Uses SSE endpoint for thinking models (non-streaming doesn't return thinking blocks) @@ -174,7 +234,7 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab // Select account using configured strategy const { account, waitMs } = accountManager.selectAccount(model); - // If strategy returns a wait time, sleep and retry + // If strategy returns a wait time without an account, sleep and retry if (!account && waitMs > 0) { logger.info(`[CloudCode] Waiting ${formatDuration(waitMs)} for account...`); await sleep(waitMs + 500); @@ -182,6 +242,13 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab continue; } + // If strategy returns an account with throttle wait (fallback mode), apply delay + // This prevents overwhelming the API when using emergency/lastResort fallbacks + if (account && waitMs > 0) { + logger.debug(`[CloudCode] Throttling request (${waitMs}ms) - fallback mode active`); + await sleep(waitMs); + } + if (!account) { logger.warn(`[CloudCode] Strategy returned no account for ${model} (attempt ${attempt + 1}/${maxAttempts})`); continue; @@ -197,8 +264,7 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab // Try each endpoint with index-based loop for capacity retry support let lastError = null; - let retriedOnce = false; // Track if we've already retried for short rate limit - let capacityRetryCount = 0; // Gap 4: Track capacity exhaustion retries + let capacityRetryCount = 0; let endpointIndex = 0; while (endpointIndex < ANTIGRAVITY_ENDPOINT_FALLBACKS.length) { @@ -219,7 +285,7 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab logger.warn(`[CloudCode] Error at ${endpoint}: ${response.status} - ${errorText}`); if (response.status === 401) { - // Gap 3: Check for permanent auth failures + // Check for permanent auth failures if (isPermanentAuthFailure(errorText)) { 
logger.error(`[CloudCode] Permanent auth failure for ${account.email}: ${errorText.substring(0, 100)}`); accountManager.markInvalid(account.email, 'Token revoked - re-authentication required'); @@ -236,12 +302,17 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab if (response.status === 429) { const resetMs = parseResetTime(response, errorText); + const consecutiveFailures = accountManager.getConsecutiveFailures?.(account.email) || 0; - // Gap 4: Check if capacity issue (NOT quota) - retry SAME endpoint + // Check if capacity issue (NOT quota) - retry same endpoint with progressive backoff if (isModelCapacityExhausted(errorText)) { if (capacityRetryCount < MAX_CAPACITY_RETRIES) { + // Progressive capacity backoff tiers + const tierIndex = Math.min(capacityRetryCount, CAPACITY_BACKOFF_TIERS_MS.length - 1); + const waitMs = resetMs || CAPACITY_BACKOFF_TIERS_MS[tierIndex]; capacityRetryCount++; - const waitMs = resetMs || CAPACITY_RETRY_DELAY_MS; + // Track failures for progressive backoff escalation (matches opencode-antigravity-auth) + accountManager.incrementConsecutiveFailures(account.email); logger.info(`[CloudCode] Model capacity exhausted, retry ${capacityRetryCount}/${MAX_CAPACITY_RETRIES} after ${formatDuration(waitMs)}...`); await sleep(waitMs); // Don't increment endpointIndex - retry same endpoint @@ -251,39 +322,80 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab logger.warn(`[CloudCode] Max capacity retries (${MAX_CAPACITY_RETRIES}) exceeded, switching account`); } - // Gap 1: Check deduplication window to prevent thundering herd - if (shouldSkipRetryDueToDedup(model)) { - logger.info(`[CloudCode] Skipping retry due to recent rate limit, switching account...`); - accountManager.markRateLimited(account.email, resetMs || DEFAULT_COOLDOWN_MS, model); + // Get rate limit backoff with exponential backoff and state reset + const backoff = getRateLimitBackoff(account.email, model, resetMs); + + 
// For very short rate limits (< 1 second), always wait and retry + // Switching accounts won't help when all accounts have per-second rate limits + if (resetMs !== null && resetMs < 1000) { + const waitMs = resetMs; + logger.info(`[CloudCode] Short rate limit on ${account.email} (${resetMs}ms), waiting and retrying...`); + await sleep(waitMs); + // Don't increment endpointIndex - retry same endpoint + continue; + } + + // If within dedup window AND reset time is >= 1s, switch account + if (backoff.isDuplicate) { + const smartBackoffMs = calculateSmartBackoff(errorText, resetMs, consecutiveFailures); + logger.info(`[CloudCode] Skipping retry due to recent rate limit on ${account.email} (attempt ${backoff.attempt}), switching account...`); + accountManager.markRateLimited(account.email, smartBackoffMs, model); throw new Error(`RATE_LIMITED_DEDUP: ${errorText}`); } + // Calculate smart backoff based on error type + const smartBackoffMs = calculateSmartBackoff(errorText, resetMs, consecutiveFailures); + // Decision: wait and retry OR switch account - if (resetMs && resetMs > DEFAULT_COOLDOWN_MS) { - // Long-term quota exhaustion (> 10s) - switch to next account - logger.info(`[CloudCode] Quota exhausted for ${account.email} (${formatDuration(resetMs)}), switching account...`); - accountManager.markRateLimited(account.email, resetMs, model); + // First 429 gets a quick 1s retry (FIRST_RETRY_DELAY_MS) + if (backoff.attempt === 1 && smartBackoffMs <= DEFAULT_COOLDOWN_MS) { + // Quick 1s retry on first 429 (matches opencode-antigravity-auth) + const waitMs = backoff.delayMs; + // markRateLimited already increments consecutiveFailures internally + // This prevents concurrent retry storms and ensures progressive backoff escalation + accountManager.markRateLimited(account.email, waitMs, model); + logger.info(`[CloudCode] First rate limit on ${account.email}, quick retry after ${formatDuration(waitMs)}...`); + await sleep(waitMs); + // Don't increment endpointIndex - retry 
same endpoint + continue; + } else if (smartBackoffMs > DEFAULT_COOLDOWN_MS) { + // Long-term quota exhaustion (> 10s) - wait SWITCH_ACCOUNT_DELAY_MS then switch + logger.info(`[CloudCode] Quota exhausted for ${account.email} (${formatDuration(smartBackoffMs)}), switching account after ${formatDuration(SWITCH_ACCOUNT_DELAY_MS)} delay...`); + await sleep(SWITCH_ACCOUNT_DELAY_MS); + accountManager.markRateLimited(account.email, smartBackoffMs, model); throw new Error(`QUOTA_EXHAUSTED: ${errorText}`); } else { - // Short-term rate limit (<= 10s) - wait and retry once - const waitMs = resetMs || DEFAULT_COOLDOWN_MS; - - if (!retriedOnce) { - retriedOnce = true; - recordRateLimitTimestamp(model); // Gap 1: Record before retry - logger.info(`[CloudCode] Short rate limit (${formatDuration(waitMs)}), waiting and retrying...`); - await sleep(waitMs); - // Don't increment endpointIndex - retry same endpoint - continue; - } else { - // Already retried once, mark and switch - accountManager.markRateLimited(account.email, waitMs, model); - throw new Error(`RATE_LIMITED: ${errorText}`); - } + // Short-term rate limit but not first attempt - use exponential backoff delay + const waitMs = backoff.delayMs; + // markRateLimited already increments consecutiveFailures internally + accountManager.markRateLimited(account.email, waitMs, model); + logger.info(`[CloudCode] Rate limit on ${account.email} (attempt ${backoff.attempt}), waiting ${formatDuration(waitMs)}...`); + await sleep(waitMs); + // Don't increment endpointIndex - retry same endpoint + continue; } } if (response.status >= 400) { + // Check for 503 MODEL_CAPACITY_EXHAUSTED - use progressive backoff like 429 capacity + if (response.status === 503 && isModelCapacityExhausted(errorText)) { + if (capacityRetryCount < MAX_CAPACITY_RETRIES) { + // Progressive capacity backoff tiers (same as 429 capacity handling) + const tierIndex = Math.min(capacityRetryCount, CAPACITY_BACKOFF_TIERS_MS.length - 1); + const waitMs = 
CAPACITY_BACKOFF_TIERS_MS[tierIndex]; + capacityRetryCount++; + accountManager.incrementConsecutiveFailures(account.email); + logger.info(`[CloudCode] 503 Model capacity exhausted, retry ${capacityRetryCount}/${MAX_CAPACITY_RETRIES} after ${formatDuration(waitMs)}...`); + await sleep(waitMs); + // Don't increment endpointIndex - retry same endpoint + continue; + } + // Max capacity retries exceeded - switch account + logger.warn(`[CloudCode] Max capacity retries (${MAX_CAPACITY_RETRIES}) exceeded on 503, switching account`); + accountManager.markRateLimited(account.email, BACKOFF_BY_ERROR_TYPE.MODEL_CAPACITY_EXHAUSTED, model); + throw new Error(`CAPACITY_EXHAUSTED: ${errorText}`); + } + lastError = new Error(`API error ${response.status}: ${errorText}`); // Try next endpoint for 403/404/5xx errors (matches opencode-antigravity-auth behavior) if (response.status === 403 || response.status === 404) { @@ -300,8 +412,8 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab // For thinking models, parse SSE and accumulate all parts if (isThinking) { const result = await parseThinkingSSEResponse(response, anthropicRequest.model); - // Gap 1: Clear timestamp on success - clearRateLimitTimestamp(model); + // Clear rate limit state on success + clearRateLimitState(account.email, model); accountManager.notifySuccess(account, model); return result; } @@ -309,8 +421,8 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab // Non-thinking models use regular JSON const data = await response.json(); logger.debug('[CloudCode] Response received'); - // Gap 1: Clear timestamp on success - clearRateLimitTimestamp(model); + // Clear rate limit state on success + clearRateLimitState(account.email, model); accountManager.notifySuccess(account, model); return convertGoogleToAnthropic(data, anthropicRequest.model); @@ -350,13 +462,15 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab if 
(error.message.includes('API error 5') || error.message.includes('500') || error.message.includes('503')) { accountManager.notifyFailure(account, model); - // Gap 2: Check consecutive failures for extended cooldown - const consecutiveFailures = accountManager.getHealthTracker()?.getConsecutiveFailures(account.email) || 0; - if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) { - logger.warn(`[CloudCode] Account ${account.email} has ${consecutiveFailures} consecutive failures, applying extended cooldown (${formatDuration(EXTENDED_COOLDOWN_MS)})`); + // Track 5xx errors for extended cooldown + // Note: markRateLimited already increments consecutiveFailures internally + const currentFailures = accountManager.getConsecutiveFailures(account.email); + if (currentFailures + 1 >= MAX_CONSECUTIVE_FAILURES) { + logger.warn(`[CloudCode] Account ${account.email} has ${currentFailures + 1} consecutive failures, applying extended cooldown (${formatDuration(EXTENDED_COOLDOWN_MS)})`); accountManager.markRateLimited(account.email, EXTENDED_COOLDOWN_MS, model); } else { - logger.warn(`[CloudCode] Account ${account.email} failed with 5xx error, trying next...`); + accountManager.incrementConsecutiveFailures(account.email); + logger.warn(`[CloudCode] Account ${account.email} failed with 5xx error (${currentFailures + 1}/${MAX_CONSECUTIVE_FAILURES}), trying next...`); } continue; } @@ -364,13 +478,15 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab if (isNetworkError(error)) { accountManager.notifyFailure(account, model); - // Gap 2: Check consecutive failures for extended cooldown - const consecutiveFailures = accountManager.getHealthTracker()?.getConsecutiveFailures(account.email) || 0; - if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) { - logger.warn(`[CloudCode] Account ${account.email} has ${consecutiveFailures} consecutive network failures, applying extended cooldown (${formatDuration(EXTENDED_COOLDOWN_MS)})`); + // Track network errors 
for extended cooldown + // Note: markRateLimited already increments consecutiveFailures internally + const currentFailures = accountManager.getConsecutiveFailures(account.email); + if (currentFailures + 1 >= MAX_CONSECUTIVE_FAILURES) { + logger.warn(`[CloudCode] Account ${account.email} has ${currentFailures + 1} consecutive network failures, applying extended cooldown (${formatDuration(EXTENDED_COOLDOWN_MS)})`); accountManager.markRateLimited(account.email, EXTENDED_COOLDOWN_MS, model); } else { - logger.warn(`[CloudCode] Network error for ${account.email}, trying next account... (${error.message})`); + accountManager.incrementConsecutiveFailures(account.email); + logger.warn(`[CloudCode] Network error for ${account.email} (${currentFailures + 1}/${MAX_CONSECUTIVE_FAILURES}), trying next account... (${error.message})`); } await sleep(1000); continue; diff --git a/src/cloudcode/rate-limit-parser.js b/src/cloudcode/rate-limit-parser.js index 2fddb26..d6e1cb7 100644 --- a/src/cloudcode/rate-limit-parser.js +++ b/src/cloudcode/rate-limit-parser.js @@ -167,15 +167,69 @@ export function parseResetTime(responseOrError, errorText = '') { } } - // SANITY CHECK: Enforce strict minimums for found rate limits - // If we found a reset time, but it's very small (e.g. < 1s) or negative, - // explicitly bump it up to avoid "Available in 0s" loops. 
+ // SANITY CHECK: Handle very small or negative reset times + // For sub-second rate limits (common with per-second quotas), add a small buffer + // For negative or zero, use a reasonable minimum if (resetMs !== null) { - if (resetMs < 1000) { - logger.debug(`[CloudCode] Reset time too small (${resetMs}ms), enforcing 2s buffer`); - resetMs = 2000; + if (resetMs <= 0) { + logger.debug(`[CloudCode] Reset time invalid (${resetMs}ms), using 500ms default`); + resetMs = 500; + } else if (resetMs < 500) { + // Very short reset - add 200ms buffer for network latency + logger.debug(`[CloudCode] Short reset time (${resetMs}ms), adding 200ms buffer`); + resetMs = resetMs + 200; } + // Note: No longer enforcing 2s minimum - this was causing cascading failures + // when all accounts had short rate limits simultaneously } return resetMs; } + +/** + * Parse the rate limit reason from error text + * Used for smart backoff by error type (matches opencode-antigravity-auth) + * + * @param {string} errorText - Error message/body text + * @returns {'RATE_LIMIT_EXCEEDED' | 'QUOTA_EXHAUSTED' | 'MODEL_CAPACITY_EXHAUSTED' | 'SERVER_ERROR' | 'UNKNOWN'} Error reason + */ +export function parseRateLimitReason(errorText) { + const lower = (errorText || '').toLowerCase(); + + // Check for quota exhaustion (daily/hourly limits) + if (lower.includes('quota_exhausted') || + lower.includes('quotaresetdelay') || + lower.includes('quotaresettimestamp') || + lower.includes('resource_exhausted') || + lower.includes('daily limit') || + lower.includes('quota exceeded')) { + return 'QUOTA_EXHAUSTED'; + } + + // Check for model capacity issues (temporary, retry quickly) + if (lower.includes('model_capacity_exhausted') || + lower.includes('capacity_exhausted') || + lower.includes('model is currently overloaded') || + lower.includes('service temporarily unavailable')) { + return 'MODEL_CAPACITY_EXHAUSTED'; + } + + // Check for rate limiting (per-minute limits) + if (lower.includes('rate_limit_exceeded') || 
+ lower.includes('rate limit') || + lower.includes('too many requests') || + lower.includes('throttl')) { + return 'RATE_LIMIT_EXCEEDED'; + } + + // Check for server errors + if (lower.includes('internal server error') || + lower.includes('server error') || + lower.includes('503') || + lower.includes('502') || + lower.includes('504')) { + return 'SERVER_ERROR'; + } + + return 'UNKNOWN'; +} diff --git a/src/cloudcode/streaming-handler.js b/src/cloudcode/streaming-handler.js index 0cf0f3e..d32c363 100644 --- a/src/cloudcode/streaming-handler.js +++ b/src/cloudcode/streaming-handler.js @@ -12,61 +12,90 @@ import { MAX_WAIT_BEFORE_ERROR_MS, DEFAULT_COOLDOWN_MS, RATE_LIMIT_DEDUP_WINDOW_MS, + RATE_LIMIT_STATE_RESET_MS, + FIRST_RETRY_DELAY_MS, + SWITCH_ACCOUNT_DELAY_MS, MAX_CONSECUTIVE_FAILURES, EXTENDED_COOLDOWN_MS, - CAPACITY_RETRY_DELAY_MS, - MAX_CAPACITY_RETRIES + CAPACITY_BACKOFF_TIERS_MS, + MAX_CAPACITY_RETRIES, + BACKOFF_BY_ERROR_TYPE, + QUOTA_EXHAUSTED_BACKOFF_TIERS_MS, + MIN_BACKOFF_MS } from '../constants.js'; import { isRateLimitError, isAuthError, isEmptyResponseError } from '../errors.js'; import { formatDuration, sleep, isNetworkError } from '../utils/helpers.js'; import { logger } from '../utils/logger.js'; -import { parseResetTime } from './rate-limit-parser.js'; +import { parseResetTime, parseRateLimitReason } from './rate-limit-parser.js'; import { buildCloudCodeRequest, buildHeaders } from './request-builder.js'; import { streamSSEResponse } from './sse-streamer.js'; import { getFallbackModel } from '../fallback-config.js'; import crypto from 'crypto'; /** - * Gap 1: Rate limit deduplication - prevents thundering herd on concurrent rate limits - * Tracks last rate limit timestamp per model to skip duplicate retries + * Rate limit deduplication - prevents thundering herd on concurrent rate limits. + * Tracks rate limit state per account+model including consecutive429 count and timestamps. 
*/ -const lastRateLimitTimestamps = new Map(); // modelId -> timestamp +const rateLimitStateByAccountModel = new Map(); // `${email}:${model}` -> { consecutive429, lastAt } /** - * Check if we should skip retry due to recent rate limit on this model + * Get deduplication key for rate limit tracking + * @param {string} email - Account email * @param {string} model - Model ID - * @returns {boolean} True if retry should be skipped (within dedup window) + * @returns {string} Dedup key */ -function shouldSkipRetryDueToDedup(model) { - const lastTimestamp = lastRateLimitTimestamps.get(model); - if (!lastTimestamp) return false; +function getDedupKey(email, model) { + return `${email}:${model}`; +} - const elapsed = Date.now() - lastTimestamp; - if (elapsed < RATE_LIMIT_DEDUP_WINDOW_MS) { - logger.debug(`[CloudCode] Rate limit on ${model} within dedup window (${elapsed}ms ago), skipping retry`); - return true; +/** + * Get rate limit backoff with deduplication and exponential backoff (matches opencode-antigravity-auth) + * @param {string} email - Account email + * @param {string} model - Model ID + * @param {number|null} serverRetryAfterMs - Server-provided retry time + * @returns {{attempt: number, delayMs: number, isDuplicate: boolean}} Backoff info + */ +function getRateLimitBackoff(email, model, serverRetryAfterMs) { + const now = Date.now(); + const stateKey = getDedupKey(email, model); + const previous = rateLimitStateByAccountModel.get(stateKey); + + // Check if within dedup window - return duplicate status + if (previous && (now - previous.lastAt < RATE_LIMIT_DEDUP_WINDOW_MS)) { + const baseDelay = serverRetryAfterMs ?? 
FIRST_RETRY_DELAY_MS; + const backoffDelay = Math.min(baseDelay * Math.pow(2, previous.consecutive429 - 1), 60000); + logger.debug(`[CloudCode] Rate limit on ${email}:${model} within dedup window, attempt=${previous.consecutive429}, isDuplicate=true`); + return { attempt: previous.consecutive429, delayMs: Math.max(baseDelay, backoffDelay), isDuplicate: true }; } - return false; + + // Determine attempt number - reset after RATE_LIMIT_STATE_RESET_MS of inactivity + const attempt = previous && (now - previous.lastAt < RATE_LIMIT_STATE_RESET_MS) + ? previous.consecutive429 + 1 + : 1; + + // Update state + rateLimitStateByAccountModel.set(stateKey, { consecutive429: attempt, lastAt: now }); + + // Calculate exponential backoff + const baseDelay = serverRetryAfterMs ?? FIRST_RETRY_DELAY_MS; + const backoffDelay = Math.min(baseDelay * Math.pow(2, attempt - 1), 60000); + + logger.debug(`[CloudCode] Rate limit backoff for ${email}:${model}: attempt=${attempt}, delayMs=${Math.max(baseDelay, backoffDelay)}`); + return { attempt, delayMs: Math.max(baseDelay, backoffDelay), isDuplicate: false }; } /** - * Record rate limit timestamp for deduplication + * Clear rate limit state after successful request + * @param {string} email - Account email * @param {string} model - Model ID */ -function recordRateLimitTimestamp(model) { - lastRateLimitTimestamps.set(model, Date.now()); +function clearRateLimitState(email, model) { + const key = getDedupKey(email, model); + rateLimitStateByAccountModel.delete(key); } /** - * Clear rate limit timestamp after successful retry - * @param {string} model - Model ID - */ -function clearRateLimitTimestamp(model) { - lastRateLimitTimestamps.delete(model); -} - -/** - * Gap 3: Detect permanent authentication failures that require re-authentication + * Detect permanent authentication failures that require re-authentication. 
* @param {string} errorText - Error message from API * @returns {boolean} True if permanent auth failure */ @@ -81,7 +110,7 @@ function isPermanentAuthFailure(errorText) { } /** - * Gap 4: Detect if 429 error is due to model capacity (not user quota) + * Detect if 429 error is due to model capacity (not user quota). * @param {string} errorText - Error message from API * @returns {boolean} True if capacity exhausted (not quota) */ @@ -93,16 +122,47 @@ function isModelCapacityExhausted(errorText) { lower.includes('service temporarily unavailable'); } -// Periodically clean up stale dedup timestamps (every 60 seconds) +// Periodically clean up stale rate limit state (every 60 seconds) setInterval(() => { - const cutoff = Date.now() - 60000; // 1 minute - for (const [model, timestamp] of lastRateLimitTimestamps.entries()) { - if (timestamp < cutoff) { - lastRateLimitTimestamps.delete(model); + const cutoff = Date.now() - RATE_LIMIT_STATE_RESET_MS; + for (const [key, state] of rateLimitStateByAccountModel.entries()) { + if (state.lastAt < cutoff) { + rateLimitStateByAccountModel.delete(key); } } }, 60000); +/** + * Calculate smart backoff based on error type (matches opencode-antigravity-auth) + * @param {string} errorText - Error message + * @param {number|null} serverResetMs - Reset time from server + * @param {number} consecutiveFailures - Number of consecutive failures + * @returns {number} Backoff time in milliseconds + */ +function calculateSmartBackoff(errorText, serverResetMs, consecutiveFailures = 0) { + // If server provides a reset time, use it (with minimum floor to prevent loops) + if (serverResetMs && serverResetMs > 0) { + return Math.max(serverResetMs, MIN_BACKOFF_MS); + } + + const reason = parseRateLimitReason(errorText); + + switch (reason) { + case 'QUOTA_EXHAUSTED': + // Progressive backoff: [60s, 5m, 30m, 2h] + const tierIndex = Math.min(consecutiveFailures, QUOTA_EXHAUSTED_BACKOFF_TIERS_MS.length - 1); + return 
QUOTA_EXHAUSTED_BACKOFF_TIERS_MS[tierIndex]; + case 'RATE_LIMIT_EXCEEDED': + return BACKOFF_BY_ERROR_TYPE.RATE_LIMIT_EXCEEDED; + case 'MODEL_CAPACITY_EXHAUSTED': + return BACKOFF_BY_ERROR_TYPE.MODEL_CAPACITY_EXHAUSTED; + case 'SERVER_ERROR': + return BACKOFF_BY_ERROR_TYPE.SERVER_ERROR; + default: + return BACKOFF_BY_ERROR_TYPE.UNKNOWN; + } +} + /** * Send a streaming request to Cloud Code with multi-account support * Streams events in real-time as they arrive from the server @@ -172,7 +232,7 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb // Select account using configured strategy const { account, waitMs } = accountManager.selectAccount(model); - // If strategy returns a wait time, sleep and retry + // If strategy returns a wait time without an account, sleep and retry if (!account && waitMs > 0) { logger.info(`[CloudCode] Waiting ${formatDuration(waitMs)} for account...`); await sleep(waitMs + 500); @@ -180,6 +240,13 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb continue; } + // If strategy returns an account with throttle wait (fallback mode), apply delay + // This prevents overwhelming the API when using emergency/lastResort fallbacks + if (account && waitMs > 0) { + logger.debug(`[CloudCode] Throttling request (${waitMs}ms) - fallback mode active`); + await sleep(waitMs); + } + if (!account) { logger.warn(`[CloudCode] Strategy returned no account for ${model} (attempt ${attempt + 1}/${maxAttempts})`); continue; @@ -195,8 +262,7 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb // Try each endpoint with index-based loop for capacity retry support let lastError = null; - let retriedOnce = false; // Track if we've already retried for short rate limit - let capacityRetryCount = 0; // Gap 4: Track capacity exhaustion retries + let capacityRetryCount = 0; let endpointIndex = 0; while (endpointIndex < ANTIGRAVITY_ENDPOINT_FALLBACKS.length) { @@ -215,7 +281,7 @@ 
export async function* sendMessageStream(anthropicRequest, accountManager, fallb logger.warn(`[CloudCode] Stream error at ${endpoint}: ${response.status} - ${errorText}`); if (response.status === 401) { - // Gap 3: Check for permanent auth failures + // Check for permanent auth failures if (isPermanentAuthFailure(errorText)) { logger.error(`[CloudCode] Permanent auth failure for ${account.email}: ${errorText.substring(0, 100)}`); accountManager.markInvalid(account.email, 'Token revoked - re-authentication required'); @@ -231,12 +297,17 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb if (response.status === 429) { const resetMs = parseResetTime(response, errorText); + const consecutiveFailures = accountManager.getConsecutiveFailures?.(account.email) || 0; - // Gap 4: Check if capacity issue (NOT quota) - retry SAME endpoint + // Check if capacity issue (NOT quota) - retry same endpoint with progressive backoff if (isModelCapacityExhausted(errorText)) { if (capacityRetryCount < MAX_CAPACITY_RETRIES) { + // Progressive capacity backoff tiers + const tierIndex = Math.min(capacityRetryCount, CAPACITY_BACKOFF_TIERS_MS.length - 1); + const waitMs = resetMs || CAPACITY_BACKOFF_TIERS_MS[tierIndex]; capacityRetryCount++; - const waitMs = resetMs || CAPACITY_RETRY_DELAY_MS; + // Track failures for progressive backoff escalation (matches opencode-antigravity-auth) + accountManager.incrementConsecutiveFailures(account.email); logger.info(`[CloudCode] Model capacity exhausted, retry ${capacityRetryCount}/${MAX_CAPACITY_RETRIES} after ${formatDuration(waitMs)}...`); await sleep(waitMs); // Don't increment endpointIndex - retry same endpoint @@ -246,38 +317,78 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb logger.warn(`[CloudCode] Max capacity retries (${MAX_CAPACITY_RETRIES}) exceeded, switching account`); } - // Gap 1: Check deduplication window to prevent thundering herd - if 
(shouldSkipRetryDueToDedup(model)) { - logger.info(`[CloudCode] Skipping retry due to recent rate limit, switching account...`); - accountManager.markRateLimited(account.email, resetMs || DEFAULT_COOLDOWN_MS, model); + // Get rate limit backoff with exponential backoff and state reset + const backoff = getRateLimitBackoff(account.email, model, resetMs); + + // For very short rate limits (< 1 second), always wait and retry + // Switching accounts won't help when all accounts have per-second rate limits + if (resetMs !== null && resetMs < 1000) { + const waitMs = resetMs; + logger.info(`[CloudCode] Short rate limit on ${account.email} (${resetMs}ms), waiting and retrying...`); + await sleep(waitMs); + // Don't increment endpointIndex - retry same endpoint + continue; + } + + // If within dedup window AND reset time is >= 1s, switch account + if (backoff.isDuplicate) { + const smartBackoffMs = calculateSmartBackoff(errorText, resetMs, consecutiveFailures); + logger.info(`[CloudCode] Skipping retry due to recent rate limit on ${account.email} (attempt ${backoff.attempt}), switching account...`); + accountManager.markRateLimited(account.email, smartBackoffMs, model); throw new Error(`RATE_LIMITED_DEDUP: ${errorText}`); } + // Calculate smart backoff based on error type + const smartBackoffMs = calculateSmartBackoff(errorText, resetMs, consecutiveFailures); + // Decision: wait and retry OR switch account - if (resetMs && resetMs > DEFAULT_COOLDOWN_MS) { - // Long-term quota exhaustion (> 10s) - switch to next account - logger.info(`[CloudCode] Quota exhausted for ${account.email} (${formatDuration(resetMs)}), switching account...`); - accountManager.markRateLimited(account.email, resetMs, model); + // First 429 gets a quick 1s retry (FIRST_RETRY_DELAY_MS) + if (backoff.attempt === 1 && smartBackoffMs <= DEFAULT_COOLDOWN_MS) { + // Quick 1s retry on first 429 (matches opencode-antigravity-auth) + const waitMs = backoff.delayMs; + // markRateLimited already increments 
consecutiveFailures internally + accountManager.markRateLimited(account.email, waitMs, model); + logger.info(`[CloudCode] First rate limit on ${account.email}, quick retry after ${formatDuration(waitMs)}...`); + await sleep(waitMs); + // Don't increment endpointIndex - retry same endpoint + continue; + } else if (smartBackoffMs > DEFAULT_COOLDOWN_MS) { + // Long-term quota exhaustion (> 10s) - wait SWITCH_ACCOUNT_DELAY_MS then switch + logger.info(`[CloudCode] Quota exhausted for ${account.email} (${formatDuration(smartBackoffMs)}), switching account after ${formatDuration(SWITCH_ACCOUNT_DELAY_MS)} delay...`); + await sleep(SWITCH_ACCOUNT_DELAY_MS); + accountManager.markRateLimited(account.email, smartBackoffMs, model); throw new Error(`QUOTA_EXHAUSTED: ${errorText}`); } else { - // Short-term rate limit (<= 10s) - wait and retry once - const waitMs = resetMs || DEFAULT_COOLDOWN_MS; - - if (!retriedOnce) { - retriedOnce = true; - recordRateLimitTimestamp(model); // Gap 1: Record before retry - logger.info(`[CloudCode] Short rate limit (${formatDuration(waitMs)}), waiting and retrying...`); - await sleep(waitMs); - // Don't increment endpointIndex - retry same endpoint - continue; - } else { - // Already retried once, mark and switch - accountManager.markRateLimited(account.email, waitMs, model); - throw new Error(`RATE_LIMITED: ${errorText}`); - } + // Short-term rate limit but not first attempt - use exponential backoff delay + const waitMs = backoff.delayMs; + // markRateLimited already increments consecutiveFailures internally + accountManager.markRateLimited(account.email, waitMs, model); + logger.info(`[CloudCode] Rate limit on ${account.email} (attempt ${backoff.attempt}), waiting ${formatDuration(waitMs)}...`); + await sleep(waitMs); + // Don't increment endpointIndex - retry same endpoint + continue; } } + // Check for 503 MODEL_CAPACITY_EXHAUSTED - use progressive backoff like 429 capacity + if (response.status === 503 && 
isModelCapacityExhausted(errorText)) { + if (capacityRetryCount < MAX_CAPACITY_RETRIES) { + // Progressive capacity backoff tiers (same as 429 capacity handling) + const tierIndex = Math.min(capacityRetryCount, CAPACITY_BACKOFF_TIERS_MS.length - 1); + const waitMs = CAPACITY_BACKOFF_TIERS_MS[tierIndex]; + capacityRetryCount++; + accountManager.incrementConsecutiveFailures(account.email); + logger.info(`[CloudCode] 503 Model capacity exhausted, retry ${capacityRetryCount}/${MAX_CAPACITY_RETRIES} after ${formatDuration(waitMs)}...`); + await sleep(waitMs); + // Don't increment endpointIndex - retry same endpoint + continue; + } + // Max capacity retries exceeded - switch account + logger.warn(`[CloudCode] Max capacity retries (${MAX_CAPACITY_RETRIES}) exceeded on 503, switching account`); + accountManager.markRateLimited(account.email, BACKOFF_BY_ERROR_TYPE.MODEL_CAPACITY_EXHAUSTED, model); + throw new Error(`CAPACITY_EXHAUSTED: ${errorText}`); + } + lastError = new Error(`API error ${response.status}: ${errorText}`); // Try next endpoint for 403/404/5xx errors (matches opencode-antigravity-auth behavior) @@ -299,8 +410,8 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb try { yield* streamSSEResponse(currentResponse, anthropicRequest.model); logger.debug('[CloudCode] Stream completed'); - // Gap 1: Clear timestamp on success - clearRateLimitTimestamp(model); + // Clear rate limit state on success + clearRateLimitState(account.email, model); accountManager.notifySuccess(account, model); return; } catch (streamError) { @@ -409,13 +520,15 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb if (error.message.includes('API error 5') || error.message.includes('500') || error.message.includes('503')) { accountManager.notifyFailure(account, model); - // Gap 2: Check consecutive failures for extended cooldown - const consecutiveFailures = accountManager.getHealthTracker()?.getConsecutiveFailures(account.email) 
|| 0; - if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) { - logger.warn(`[CloudCode] Account ${account.email} has ${consecutiveFailures} consecutive failures, applying extended cooldown (${formatDuration(EXTENDED_COOLDOWN_MS)})`); + // Track 5xx errors for extended cooldown + // Note: markRateLimited already increments consecutiveFailures internally + const currentFailures = accountManager.getConsecutiveFailures(account.email); + if (currentFailures + 1 >= MAX_CONSECUTIVE_FAILURES) { + logger.warn(`[CloudCode] Account ${account.email} has ${currentFailures + 1} consecutive failures, applying extended cooldown (${formatDuration(EXTENDED_COOLDOWN_MS)})`); accountManager.markRateLimited(account.email, EXTENDED_COOLDOWN_MS, model); } else { - logger.warn(`[CloudCode] Account ${account.email} failed with 5xx stream error, trying next...`); + accountManager.incrementConsecutiveFailures(account.email); + logger.warn(`[CloudCode] Account ${account.email} failed with 5xx stream error (${currentFailures + 1}/${MAX_CONSECUTIVE_FAILURES}), trying next...`); } continue; } @@ -423,13 +536,15 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb if (isNetworkError(error)) { accountManager.notifyFailure(account, model); - // Gap 2: Check consecutive failures for extended cooldown - const consecutiveFailures = accountManager.getHealthTracker()?.getConsecutiveFailures(account.email) || 0; - if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) { - logger.warn(`[CloudCode] Account ${account.email} has ${consecutiveFailures} consecutive network failures, applying extended cooldown (${formatDuration(EXTENDED_COOLDOWN_MS)})`); + // Track network errors for extended cooldown + // Note: markRateLimited already increments consecutiveFailures internally + const currentFailures = accountManager.getConsecutiveFailures(account.email); + if (currentFailures + 1 >= MAX_CONSECUTIVE_FAILURES) { + logger.warn(`[CloudCode] Account ${account.email} has 
${currentFailures + 1} consecutive network failures, applying extended cooldown (${formatDuration(EXTENDED_COOLDOWN_MS)})`); accountManager.markRateLimited(account.email, EXTENDED_COOLDOWN_MS, model); } else { - logger.warn(`[CloudCode] Network error for ${account.email} (stream), trying next account... (${error.message})`); + accountManager.incrementConsecutiveFailures(account.email); + logger.warn(`[CloudCode] Network error for ${account.email} (stream) (${currentFailures + 1}/${MAX_CONSECUTIVE_FAILURES}), trying next account... (${error.message})`); } await sleep(1000); continue; diff --git a/src/config.js b/src/config.js index d2c6008..7d23a36 100644 --- a/src/config.js +++ b/src/config.js @@ -16,6 +16,11 @@ const DEFAULT_CONFIG = { defaultCooldownMs: 10000, // 10 seconds maxWaitBeforeErrorMs: 120000, // 2 minutes maxAccounts: 10, // Maximum number of accounts allowed + // Rate limit handling (matches opencode-antigravity-auth) + rateLimitDedupWindowMs: 2000, // 2 seconds - prevents concurrent retry storms + maxConsecutiveFailures: 3, // Before applying extended cooldown + extendedCooldownMs: 60000, // 1 minute extended cooldown + maxCapacityRetries: 5, // Max retries for capacity exhaustion modelMapping: {}, // Account selection strategy configuration accountSelection: { diff --git a/src/constants.js b/src/constants.js index ad20ccf..1bf8a2d 100644 --- a/src/constants.js +++ b/src/constants.js @@ -103,16 +103,33 @@ export const MAX_ACCOUNTS = config?.maxAccounts || 10; // From config or 10 // Rate limit wait thresholds export const MAX_WAIT_BEFORE_ERROR_MS = config?.maxWaitBeforeErrorMs || 120000; // From config or 2 minutes -// Gap 1: Retry deduplication - prevents thundering herd on concurrent rate limits -export const RATE_LIMIT_DEDUP_WINDOW_MS = config?.rateLimitDedupWindowMs || 5000; // 5 seconds +// Retry deduplication - prevents thundering herd on concurrent rate limits +export const RATE_LIMIT_DEDUP_WINDOW_MS = config?.rateLimitDedupWindowMs || 2000; 
// 2 seconds +export const RATE_LIMIT_STATE_RESET_MS = config?.rateLimitStateResetMs || 120000; // 2 minutes - reset consecutive429 after inactivity +export const FIRST_RETRY_DELAY_MS = config?.firstRetryDelayMs || 1000; // Quick 1s retry on first 429 +export const SWITCH_ACCOUNT_DELAY_MS = config?.switchAccountDelayMs || 5000; // Delay before switching accounts -// Gap 2: Consecutive failure tracking - extended cooldown after repeated failures +// Consecutive failure tracking - extended cooldown after repeated failures export const MAX_CONSECUTIVE_FAILURES = config?.maxConsecutiveFailures || 3; export const EXTENDED_COOLDOWN_MS = config?.extendedCooldownMs || 60000; // 1 minute -// Gap 4: Capacity exhaustion - shorter retry for model capacity issues (not quota) -export const CAPACITY_RETRY_DELAY_MS = config?.capacityRetryDelayMs || 2000; // 2 seconds -export const MAX_CAPACITY_RETRIES = config?.maxCapacityRetries || 3; +// Capacity exhaustion - progressive backoff tiers for model capacity issues +export const CAPACITY_BACKOFF_TIERS_MS = config?.capacityBackoffTiersMs || [5000, 10000, 20000, 30000, 60000]; +export const MAX_CAPACITY_RETRIES = config?.maxCapacityRetries || 5; + +// Smart backoff by error type +export const BACKOFF_BY_ERROR_TYPE = { + RATE_LIMIT_EXCEEDED: 30000, // 30 seconds + MODEL_CAPACITY_EXHAUSTED: 15000, // 15 seconds + SERVER_ERROR: 20000, // 20 seconds + UNKNOWN: 60000 // 1 minute +}; + +// Progressive backoff tiers for QUOTA_EXHAUSTED (60s, 5m, 30m, 2h) +export const QUOTA_EXHAUSTED_BACKOFF_TIERS_MS = [60000, 300000, 1800000, 7200000]; + +// Minimum backoff floor to prevent "Available in 0s" loops (matches opencode-antigravity-auth) +export const MIN_BACKOFF_MS = 2000; // Thinking model constants export const MIN_SIGNATURE_LENGTH = 50; // Minimum valid thinking signature length @@ -258,10 +275,16 @@ export default { MAX_ACCOUNTS, MAX_WAIT_BEFORE_ERROR_MS, RATE_LIMIT_DEDUP_WINDOW_MS, + RATE_LIMIT_STATE_RESET_MS, + FIRST_RETRY_DELAY_MS, + 
SWITCH_ACCOUNT_DELAY_MS, MAX_CONSECUTIVE_FAILURES, EXTENDED_COOLDOWN_MS, - CAPACITY_RETRY_DELAY_MS, + CAPACITY_BACKOFF_TIERS_MS, MAX_CAPACITY_RETRIES, + BACKOFF_BY_ERROR_TYPE, + QUOTA_EXHAUSTED_BACKOFF_TIERS_MS, + MIN_BACKOFF_MS, MIN_SIGNATURE_LENGTH, GEMINI_MAX_OUTPUT_TOKENS, GEMINI_SKIP_SIGNATURE, diff --git a/src/webui/index.js b/src/webui/index.js index 2326a1d..dbaa09a 100644 --- a/src/webui/index.js +++ b/src/webui/index.js @@ -397,7 +397,7 @@ export function mountWebUI(app, dirname, accountManager) { */ app.post('/api/config', (req, res) => { try { - const { debug, logLevel, maxRetries, retryBaseMs, retryMaxMs, persistTokenCache, defaultCooldownMs, maxWaitBeforeErrorMs, maxAccounts, accountSelection, rateLimitDedupWindowMs, maxConsecutiveFailures, extendedCooldownMs, capacityRetryDelayMs, maxCapacityRetries } = req.body; + const { debug, logLevel, maxRetries, retryBaseMs, retryMaxMs, persistTokenCache, defaultCooldownMs, maxWaitBeforeErrorMs, maxAccounts, accountSelection, rateLimitDedupWindowMs, maxConsecutiveFailures, extendedCooldownMs, maxCapacityRetries } = req.body; // Only allow updating specific fields (security) const updates = {}; @@ -435,9 +435,6 @@ export function mountWebUI(app, dirname, accountManager) { if (typeof extendedCooldownMs === 'number' && extendedCooldownMs >= 10000 && extendedCooldownMs <= 300000) { updates.extendedCooldownMs = extendedCooldownMs; } - if (typeof capacityRetryDelayMs === 'number' && capacityRetryDelayMs >= 500 && capacityRetryDelayMs <= 10000) { - updates.capacityRetryDelayMs = capacityRetryDelayMs; - } if (typeof maxCapacityRetries === 'number' && maxCapacityRetries >= 1 && maxCapacityRetries <= 10) { updates.maxCapacityRetries = maxCapacityRetries; } diff --git a/tests/stress-test.cjs b/tests/stress-test.cjs new file mode 100644 index 0000000..d493afe --- /dev/null +++ b/tests/stress-test.cjs @@ -0,0 +1,93 @@ +/** + * Stress Test - Send multiple concurrent requests to test rate limit handling + * + * Usage: node 
/**
 * Stress Test - Send multiple concurrent requests to test rate limit handling
 *
 * Usage: node tests/stress-test.cjs [count] [model]
 * Example: node tests/stress-test.cjs 10 gemini-3-flash
 */

const BASE_URL = process.env.ANTHROPIC_BASE_URL || 'http://localhost:8080';

// Parse CLI args with an explicit radix (avoids legacy octal surprises) and
// clamp the request count to at least 1 so the summary math never divides by
// zero / produces NaN when a zero or negative count is passed.
const count = Math.max(1, Number.parseInt(process.argv[2], 10) || 8);
const model = process.argv[3] || 'gemini-3-flash';

/**
 * Send one test request to the proxy and report its outcome on the console.
 *
 * Never rejects: HTTP failures and network errors are both captured in the
 * returned record so Promise.all in the driver cannot fail fast.
 *
 * @param {number} id - 1-based request index, echoed in the prompt and logs.
 * @returns {Promise<{id: number, success: boolean, status?: number, error?: string, elapsed: number}>}
 */
async function sendRequest(id) {
  const startTime = Date.now();
  try {
    const response = await fetch(`${BASE_URL}/v1/messages`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'x-api-key': 'test',
        'anthropic-version': '2023-06-01'
      },
      body: JSON.stringify({
        model: model,
        max_tokens: 100,
        messages: [
          { role: 'user', content: `Request ${id}: Say "Hello ${id}" and nothing else.` }
        ]
      })
    });

    const elapsed = Date.now() - startTime;

    if (!response.ok) {
      const errorText = await response.text();
      console.log(`[${id}] ❌ ${response.status} after ${elapsed}ms: ${errorText.substring(0, 100)}`);
      return { id, success: false, status: response.status, elapsed };
    }

    const data = await response.json();
    // Truncate the echoed completion so one request occupies one log line.
    const text = data.content?.[0]?.text?.substring(0, 50) || 'No text';
    console.log(`[${id}] ✅ 200 after ${elapsed}ms: "${text}..."`);
    return { id, success: true, status: 200, elapsed };
  } catch (error) {
    const elapsed = Date.now() - startTime;
    console.log(`[${id}] ❌ Error after ${elapsed}ms: ${error.message}`);
    return { id, success: false, error: error.message, elapsed };
  }
}

/**
 * Fire `count` requests concurrently and print a success/latency summary,
 * including a status-code breakdown when any request failed.
 *
 * @returns {Promise<void>}
 */
async function runStressTest() {
  console.log(`\n🚀 Stress Test: Sending ${count} concurrent requests to ${model}\n`);
  console.log(`Target: ${BASE_URL}/v1/messages\n`);
  console.log('─'.repeat(70));

  const startTime = Date.now();

  // Launch every request before awaiting any, so they truly run concurrently.
  const promises = [];
  for (let i = 1; i <= count; i++) {
    promises.push(sendRequest(i));
  }

  const results = await Promise.all(promises);

  const totalElapsed = Date.now() - startTime;
  console.log('─'.repeat(70));

  // Summary (count >= 1 is guaranteed above, so the average is well-defined)
  const successful = results.filter(r => r.success).length;
  const failed = results.filter(r => !r.success).length;
  const avgElapsed = Math.round(results.reduce((sum, r) => sum + r.elapsed, 0) / results.length);

  console.log(`\n📊 Summary:`);
  console.log(`  Total time: ${totalElapsed}ms`);
  console.log(`  Successful: ${successful}/${count}`);
  console.log(`  Failed: ${failed}/${count}`);
  console.log(`  Avg response time: ${avgElapsed}ms`);

  if (failed > 0) {
    // Group failures by HTTP status (or 'network' for transport errors).
    const errors = results.filter(r => !r.success);
    const statusCounts = {};
    errors.forEach(e => {
      const key = e.status || 'network';
      statusCounts[key] = (statusCounts[key] || 0) + 1;
    });
    console.log(`  Error breakdown: ${JSON.stringify(statusCounts)}`);
  }

  console.log('');
}

runStressTest().catch(console.error);