From 5a85f0cfccb92b79f0c93acf7f0f9c344d30cf4f Mon Sep 17 00:00:00 2001 From: Badri Narayanan S Date: Sat, 24 Jan 2026 22:43:53 +0530 Subject: [PATCH] feat: comprehensive rate limit handling overhaul (inspired by opencode-antigravity-auth) This commit addresses "Max retries exceeded" errors during stress testing where all accounts would become exhausted simultaneously due to short per-second rate limits triggering cascading failures. ## Rate Limit Parser (`rate-limit-parser.js`) - Remove 2s buffer enforcement that caused cascading failures when API returned short reset times (200-600ms). Now adds 200ms buffer for sub-500ms resets - Add `parseRateLimitReason()` for smart backoff based on error type: QUOTA_EXHAUSTED, RATE_LIMIT_EXCEEDED, MODEL_CAPACITY_EXHAUSTED, SERVER_ERROR ## Message/Streaming Handlers - Add per-account+model rate limit state tracking with exponential backoff - For short rate limits (< 1 second), wait and retry on same account instead of switching - prevents thundering herd when all accounts hit per-second limits - Add throttle wait support for fallback modes (emergency/lastResort) - Add `calculateSmartBackoff()` with progressive tiers by error type ## HybridStrategy (`hybrid-strategy.js`) - Refactor `#getCandidates()` to return 4 fallback levels: - `normal`: All filters pass (health, tokens, quota) - `quota`: Bypass critical quota check - `emergency`: Bypass health check when ALL accounts unhealthy - `lastResort`: Bypass BOTH health AND token bucket checks - Add throttle wait times: 500ms for lastResort, 250ms for emergency - Fix LRU calculation to use seconds (matches opencode-antigravity-auth) ## Health Tracker - Increase `recoveryPerHour` from 2 to 10 for faster recovery (1 hour vs 5 hours) ## Account Manager - Add consecutive failure tracking: `getConsecutiveFailures()`, `incrementConsecutiveFailures()`, `resetConsecutiveFailures()` - Add cooldown mechanism separate from rate limits with `CooldownReason` - Reset consecutive failures on 
successful request ## Base Strategy - Add `isAccountCoolingDown()` check in `isAccountUsable()` ## Constants - Replace fixed `CAPACITY_RETRY_DELAY_MS` with progressive `CAPACITY_BACKOFF_TIERS_MS` - Add `BACKOFF_BY_ERROR_TYPE` for smart backoff - Add `QUOTA_EXHAUSTED_BACKOFF_TIERS_MS` for progressive quota backoff - Add `MIN_BACKOFF_MS` floor to prevent "Available in 0s" loops - Increase `MAX_CAPACITY_RETRIES` from 3 to 5 - Reduce `RATE_LIMIT_DEDUP_WINDOW_MS` from 5s to 2s ## Frontend - Remove `capacityRetryDelayMs` config (replaced by progressive tiers) - Update default `maxCapacityRetries` display from 3 to 5 ## Testing - Add `tests/stress-test.cjs` for concurrent request stress testing Co-Authored-By: Claude --- public/js/components/server-config.js | 6 - public/js/config/constants.js | 4 - public/js/translations/en.js | 2 - public/js/translations/id.js | 2 - public/js/translations/pt.js | 2 - public/js/translations/tr.js | 2 - public/js/translations/zh.js | 2 - public/views/settings.html | 32 +-- src/account-manager/index.js | 83 +++++- src/account-manager/rate-limits.js | 132 +++++++++ .../strategies/base-strategy.js | 5 + .../strategies/hybrid-strategy.js | 96 +++++-- .../strategies/trackers/health-tracker.js | 2 +- src/cloudcode/message-handler.js | 272 +++++++++++++----- src/cloudcode/rate-limit-parser.js | 66 ++++- src/cloudcode/streaming-handler.js | 265 ++++++++++++----- src/config.js | 5 + src/constants.js | 37 ++- src/webui/index.js | 5 +- tests/stress-test.cjs | 93 ++++++ 20 files changed, 869 insertions(+), 244 deletions(-) create mode 100644 tests/stress-test.cjs diff --git a/public/js/components/server-config.js b/public/js/components/server-config.js index 9fd168b..5ff98fc 100644 --- a/public/js/components/server-config.js +++ b/public/js/components/server-config.js @@ -274,12 +274,6 @@ window.Components.serverConfig = () => ({ (v) => window.Validators.validateTimeout(v, EXTENDED_COOLDOWN_MIN, EXTENDED_COOLDOWN_MAX)); }, - 
toggleCapacityRetryDelayMs(value) { - const { CAPACITY_RETRY_DELAY_MIN, CAPACITY_RETRY_DELAY_MAX } = window.AppConstants.VALIDATION; - this.saveConfigField('capacityRetryDelayMs', value, 'Capacity Retry Delay', - (v) => window.Validators.validateTimeout(v, CAPACITY_RETRY_DELAY_MIN, CAPACITY_RETRY_DELAY_MAX)); - }, - toggleMaxCapacityRetries(value) { const { MAX_CAPACITY_RETRIES_MIN, MAX_CAPACITY_RETRIES_MAX } = window.AppConstants.VALIDATION; this.saveConfigField('maxCapacityRetries', value, 'Max Capacity Retries', diff --git a/public/js/config/constants.js b/public/js/config/constants.js index 258ceb3..f9eaeb9 100644 --- a/public/js/config/constants.js +++ b/public/js/config/constants.js @@ -85,10 +85,6 @@ window.AppConstants.VALIDATION = { EXTENDED_COOLDOWN_MIN: 10000, EXTENDED_COOLDOWN_MAX: 300000, - // Capacity retry delay (500ms - 10 seconds) - CAPACITY_RETRY_DELAY_MIN: 500, - CAPACITY_RETRY_DELAY_MAX: 10000, - // Capacity retries (1 - 10) MAX_CAPACITY_RETRIES_MIN: 1, MAX_CAPACITY_RETRIES_MAX: 10 diff --git a/public/js/translations/en.js b/public/js/translations/en.js index 91d1d67..324c499 100644 --- a/public/js/translations/en.js +++ b/public/js/translations/en.js @@ -245,8 +245,6 @@ window.translations.en = { maxConsecutiveFailuresDesc: "Number of consecutive failures before applying extended cooldown to an account.", extendedCooldown: "Extended Cooldown", extendedCooldownDesc: "Cooldown duration applied after max consecutive failures reached.", - capacityRetryDelay: "Capacity Retry Delay", - capacityRetryDelayDesc: "Delay before retrying when model capacity is exhausted (not quota).", maxCapacityRetries: "Max Capacity Retries", maxCapacityRetriesDesc: "Maximum retries for capacity exhaustion before switching accounts.", saveConfigServer: "Save Configuration", diff --git a/public/js/translations/id.js b/public/js/translations/id.js index c06bc1c..c9a8b6f 100644 --- a/public/js/translations/id.js +++ b/public/js/translations/id.js @@ -278,8 +278,6 @@ 
window.translations.id = { maxConsecutiveFailuresDesc: "Jumlah kegagalan berturut-turut sebelum menerapkan cooldown diperpanjang.", extendedCooldown: "Cooldown Diperpanjang", extendedCooldownDesc: "Durasi cooldown setelah mencapai maks. kegagalan berturut-turut.", - capacityRetryDelay: "Jeda Retry Kapasitas", - capacityRetryDelayDesc: "Jeda sebelum retry saat kapasitas model habis (bukan kuota).", maxCapacityRetries: "Maks. Retry Kapasitas", maxCapacityRetriesDesc: "Maksimum retry untuk kehabisan kapasitas sebelum ganti akun.", saveConfigServer: "Simpan Konfigurasi", diff --git a/public/js/translations/pt.js b/public/js/translations/pt.js index 29f73f4..9de8f46 100644 --- a/public/js/translations/pt.js +++ b/public/js/translations/pt.js @@ -223,8 +223,6 @@ window.translations.pt = { maxConsecutiveFailuresDesc: "Número de falhas consecutivas antes de aplicar resfriamento estendido.", extendedCooldown: "Resfriamento Estendido", extendedCooldownDesc: "Duração do resfriamento aplicado após atingir máx. de falhas consecutivas.", - capacityRetryDelay: "Atraso de Retry de Capacidade", - capacityRetryDelayDesc: "Atraso antes de tentar novamente quando capacidade do modelo está esgotada (não quota).", maxCapacityRetries: "Máx. Retries de Capacidade", maxCapacityRetriesDesc: "Máximo de retries para esgotamento de capacidade antes de trocar conta.", saveConfigServer: "Salvar Configuração", diff --git a/public/js/translations/tr.js b/public/js/translations/tr.js index 1d00cec..d7416da 100644 --- a/public/js/translations/tr.js +++ b/public/js/translations/tr.js @@ -227,8 +227,6 @@ window.translations.tr = { maxConsecutiveFailuresDesc: "Uzatılmış soğuma uygulamadan önce ardışık başarısızlık sayısı.", extendedCooldown: "Uzatılmış Soğuma", extendedCooldownDesc: "Maks. 
ardışık başarısızlık sonrası uygulanan soğuma süresi.", - capacityRetryDelay: "Kapasite Yeniden Deneme Gecikmesi", - capacityRetryDelayDesc: "Model kapasitesi tükendiğinde (kota değil) yeniden denemeden önceki gecikme.", maxCapacityRetries: "Maks. Kapasite Yeniden Denemesi", maxCapacityRetriesDesc: "Hesap değiştirmeden önce kapasite tükenmesi için maksimum yeniden deneme.", saveConfigServer: "Yapılandırmayı Kaydet", diff --git a/public/js/translations/zh.js b/public/js/translations/zh.js index dc924d6..de2c7e4 100644 --- a/public/js/translations/zh.js +++ b/public/js/translations/zh.js @@ -245,8 +245,6 @@ window.translations.zh = { maxConsecutiveFailuresDesc: "触发扩展冷却前允许的连续失败次数。", extendedCooldown: "扩展冷却时间", extendedCooldownDesc: "达到最大连续失败后应用的冷却时长。", - capacityRetryDelay: "容量重试延迟", - capacityRetryDelayDesc: "模型容量耗尽(非配额)时重试前的延迟。", maxCapacityRetries: "最大容量重试次数", maxCapacityRetriesDesc: "容量耗尽时在切换账号前的最大重试次数。", saveConfigServer: "保存配置", diff --git a/public/views/settings.html b/public/views/settings.html index 1a48fb3..6a8463f 100644 --- a/public/views/settings.html +++ b/public/views/settings.html @@ -1226,47 +1226,23 @@ x-text="$store.global.t('extendedCooldownDesc')">Applied after max consecutive failures.

-
- -
- - -
-

Delay for capacity (not quota) issues.

-
-
diff --git a/src/account-manager/index.js b/src/account-manager/index.js index e9988b3..f9aecb9 100644 --- a/src/account-manager/index.js +++ b/src/account-manager/index.js @@ -15,7 +15,15 @@ import { markRateLimited as markLimited, markInvalid as markAccountInvalid, getMinWaitTimeMs as getMinWait, - getRateLimitInfo as getLimitInfo + getRateLimitInfo as getLimitInfo, + getConsecutiveFailures as getFailures, + resetConsecutiveFailures as resetFailures, + incrementConsecutiveFailures as incrementFailures, + markAccountCoolingDown as markCoolingDown, + isAccountCoolingDown as checkCoolingDown, + clearAccountCooldown as clearCooldown, + getCooldownRemaining as getCooldownMs, + CooldownReason } from './rate-limits.js'; import { getTokenForAccount as fetchToken, @@ -182,6 +190,10 @@ export class AccountManager { if (this.#strategy) { this.#strategy.onSuccess(account, modelId); } + // Reset consecutive failures on success (matches opencode-antigravity-auth) + if (account?.email) { + resetFailures(this.#accounts, account.email); + } } /** @@ -206,6 +218,26 @@ export class AccountManager { } } + /** + * Get the consecutive failure count for an account + * Used for progressive backoff calculation + * @param {string} email - Account email + * @returns {number} Number of consecutive failures + */ + getConsecutiveFailures(email) { + return getFailures(this.#accounts, email); + } + + /** + * Increment the consecutive failure count without marking as rate limited + * Used for quick retries to track failures while staying on same account + * @param {string} email - Account email + * @returns {number} New consecutive failure count + */ + incrementConsecutiveFailures(email) { + return incrementFailures(this.#accounts, email); + } + /** * Get the current strategy name * @returns {string} Strategy name @@ -275,6 +307,52 @@ export class AccountManager { return getLimitInfo(this.#accounts, email, modelId); } + // 
============================================================================ + // Cooldown Methods (matches opencode-antigravity-auth) + // ============================================================================ + + /** + * Mark an account as cooling down for a specified duration + * Used for temporary backoff separate from rate limits + * @param {string} email - Email of the account + * @param {number} cooldownMs - Duration of cooldown in milliseconds + * @param {string} [reason] - Reason for the cooldown (use CooldownReason constants) + */ + markAccountCoolingDown(email, cooldownMs, reason = CooldownReason.RATE_LIMIT) { + markCoolingDown(this.#accounts, email, cooldownMs, reason); + } + + /** + * Check if an account is currently cooling down + * @param {string} email - Email of the account + * @returns {boolean} True if account is cooling down + */ + isAccountCoolingDown(email) { + const account = this.#accounts.find(a => a.email === email); + return account ? checkCoolingDown(account) : false; + } + + /** + * Clear the cooldown for an account + * @param {string} email - Email of the account + */ + clearAccountCooldown(email) { + const account = this.#accounts.find(a => a.email === email); + if (account) { + clearCooldown(account); + } + } + + /** + * Get time remaining until cooldown expires for an account + * @param {string} email - Email of the account + * @returns {number} Milliseconds until cooldown expires, 0 if not cooling down + */ + getCooldownRemaining(email) { + const account = this.#accounts.find(a => a.email === email); + return account ? 
getCooldownMs(account) : 0; + } + /** * Get OAuth token for an account * @param {Object} account - Account object with email and credentials @@ -378,4 +456,7 @@ export class AccountManager { } } +// Re-export CooldownReason for use by handlers +export { CooldownReason }; + export default AccountManager; diff --git a/src/account-manager/rate-limits.js b/src/account-manager/rate-limits.js index 0dca07a..b55c457 100644 --- a/src/account-manager/rate-limits.js +++ b/src/account-manager/rate-limits.js @@ -133,6 +133,9 @@ export function markRateLimited(accounts, email, resetMs = null, modelId) { actualResetMs: actualResetMs // Original duration from API }; + // Track consecutive failures for progressive backoff (matches opencode-antigravity-auth) + account.consecutiveFailures = (account.consecutiveFailures || 0) + 1; + // Log appropriately based on duration if (actualResetMs > DEFAULT_COOLDOWN_MS) { logger.warn( @@ -235,3 +238,132 @@ export function getRateLimitInfo(accounts, email, modelId) { waitMs }; } + +/** + * Get the consecutive failure count for an account + * Used for progressive backoff calculation (matches opencode-antigravity-auth) + * + * @param {Array} accounts - Array of account objects + * @param {string} email - Email of the account + * @returns {number} Number of consecutive failures + */ +export function getConsecutiveFailures(accounts, email) { + const account = accounts.find(a => a.email === email); + return account?.consecutiveFailures || 0; +} + +/** + * Reset the consecutive failure count for an account + * Called on successful request (matches opencode-antigravity-auth) + * + * @param {Array} accounts - Array of account objects + * @param {string} email - Email of the account + * @returns {boolean} True if account was found and reset + */ +export function resetConsecutiveFailures(accounts, email) { + const account = accounts.find(a => a.email === email); + if (!account) return false; + account.consecutiveFailures = 0; + return true; +} + +/** + 
* Increment the consecutive failure count for an account WITHOUT marking as rate limited + * Used for quick retries where we want to track failures but not skip the account + * (matches opencode-antigravity-auth behavior of always incrementing on 429) + * + * @param {Array} accounts - Array of account objects + * @param {string} email - Email of the account + * @returns {number} New consecutive failure count + */ +export function incrementConsecutiveFailures(accounts, email) { + const account = accounts.find(a => a.email === email); + if (!account) return 0; + account.consecutiveFailures = (account.consecutiveFailures || 0) + 1; + return account.consecutiveFailures; +} + +// ============================================================================ +// Cooldown Mechanism (matches opencode-antigravity-auth) +// Separate from rate limits - used for temporary backoff after failures +// ============================================================================ + +/** + * Cooldown reasons for debugging/logging + */ +export const CooldownReason = { + RATE_LIMIT: 'rate_limit', + AUTH_FAILURE: 'auth_failure', + CONSECUTIVE_FAILURES: 'consecutive_failures', + SERVER_ERROR: 'server_error' +}; + +/** + * Mark an account as cooling down for a specified duration + * Used for temporary backoff separate from rate limits + * + * @param {Array} accounts - Array of account objects + * @param {string} email - Email of the account + * @param {number} cooldownMs - Duration of cooldown in milliseconds + * @param {string} [reason] - Reason for the cooldown + * @returns {boolean} True if account was found and marked + */ +export function markAccountCoolingDown(accounts, email, cooldownMs, reason = CooldownReason.RATE_LIMIT) { + const account = accounts.find(a => a.email === email); + if (!account) return false; + + account.coolingDownUntil = Date.now() + cooldownMs; + account.cooldownReason = reason; + + logger.debug(`[AccountManager] Account ${email} cooling down for 
${formatDuration(cooldownMs)} (reason: ${reason})`); + return true; +} + +/** + * Check if an account is currently cooling down + * Automatically clears expired cooldowns + * + * @param {Object} account - Account object + * @returns {boolean} True if account is cooling down + */ +export function isAccountCoolingDown(account) { + if (!account || account.coolingDownUntil === undefined) { + return false; + } + + const now = Date.now(); + if (now >= account.coolingDownUntil) { + // Cooldown expired - clear it + clearAccountCooldown(account); + return false; + } + + return true; +} + +/** + * Clear the cooldown for an account + * + * @param {Object} account - Account object + */ +export function clearAccountCooldown(account) { + if (account) { + delete account.coolingDownUntil; + delete account.cooldownReason; + } +} + +/** + * Get time remaining until cooldown expires for an account + * + * @param {Object} account - Account object + * @returns {number} Milliseconds until cooldown expires, 0 if not cooling down + */ +export function getCooldownRemaining(account) { + if (!account || account.coolingDownUntil === undefined) { + return 0; + } + + const remaining = account.coolingDownUntil - Date.now(); + return remaining > 0 ? remaining : 0; +} diff --git a/src/account-manager/strategies/base-strategy.js b/src/account-manager/strategies/base-strategy.js index 57007cc..d11bcd5 100644 --- a/src/account-manager/strategies/base-strategy.js +++ b/src/account-manager/strategies/base-strategy.js @@ -5,6 +5,8 @@ * All strategies must implement the selectAccount method. 
*/ +import { isAccountCoolingDown } from '../rate-limits.js'; + /** * @typedef {Object} SelectionResult * @property {Object|null} account - The selected account or null if none available @@ -77,6 +79,9 @@ export class BaseStrategy { // Skip disabled accounts if (account.enabled === false) return false; + // Check if account is cooling down (matches opencode-antigravity-auth) + if (isAccountCoolingDown(account)) return false; + // Check model-specific rate limit if (modelId && account.modelRateLimits && account.modelRateLimits[modelId]) { const limit = account.modelRateLimits[modelId]; diff --git a/src/account-manager/strategies/hybrid-strategy.js b/src/account-manager/strategies/hybrid-strategy.js index 49a0b49..44fd1d1 100644 --- a/src/account-manager/strategies/hybrid-strategy.js +++ b/src/account-manager/strategies/hybrid-strategy.js @@ -65,7 +65,7 @@ export class HybridStrategy extends BaseStrategy { } // Get candidates that pass all filters - const candidates = this.#getCandidates(accounts, modelId); + const { candidates, fallbackLevel } = this.#getCandidates(accounts, modelId); if (candidates.length === 0) { // Diagnose why no candidates are available and compute wait time @@ -87,16 +87,30 @@ export class HybridStrategy extends BaseStrategy { const best = scored[0]; best.account.lastUsed = Date.now(); - // Consume a token from the bucket - this.#tokenBucketTracker.consume(best.account.email); + // Consume a token from the bucket (unless in lastResort mode where we bypassed token check) + if (fallbackLevel !== 'lastResort') { + this.#tokenBucketTracker.consume(best.account.email); + } if (onSave) onSave(); + // Calculate throttle wait time based on fallback level + // This prevents overwhelming the API when all accounts are stressed + let waitMs = 0; + if (fallbackLevel === 'lastResort') { + // All accounts exhausted - add significant delay to allow rate limits to clear + waitMs = 500; + } else if (fallbackLevel === 'emergency') { + // All accounts unhealthy - 
add moderate delay + waitMs = 250; + } + const position = best.index + 1; const total = accounts.length; - logger.info(`[HybridStrategy] Using account: ${best.account.email} (${position}/${total}, score: ${best.score.toFixed(1)})`); + const fallbackInfo = fallbackLevel !== 'normal' ? `, fallback: ${fallbackLevel}` : ''; + logger.info(`[HybridStrategy] Using account: ${best.account.email} (${position}/${total}, score: ${best.score.toFixed(1)}${fallbackInfo})`); - return { account: best.account, index: best.index, waitMs: 0 }; + return { account: best.account, index: best.index, waitMs }; } /** @@ -131,6 +145,8 @@ export class HybridStrategy extends BaseStrategy { /** * Get candidates that pass all filters * @private + * @returns {{candidates: Array, fallbackLevel: string}} Candidates and fallback level used + * fallbackLevel: 'normal' | 'quota' | 'emergency' | 'lastResort' */ #getCandidates(accounts, modelId) { const candidates = accounts @@ -160,24 +176,56 @@ export class HybridStrategy extends BaseStrategy { return true; }); - // If no candidates after quota filter, fall back to all usable accounts - // (better to use critical quota than fail entirely) - if (candidates.length === 0) { - const fallback = accounts - .map((account, index) => ({ account, index })) - .filter(({ account }) => { - if (!this.isAccountUsable(account, modelId)) return false; - if (!this.#healthTracker.isUsable(account.email)) return false; - if (!this.#tokenBucketTracker.hasTokens(account.email)) return false; - return true; - }); - if (fallback.length > 0) { - logger.warn('[HybridStrategy] All accounts have critical quota, using fallback'); - return fallback; - } + if (candidates.length > 0) { + return { candidates, fallbackLevel: 'normal' }; } - return candidates; + // If no candidates after quota filter, fall back to all usable accounts + // (better to use critical quota than fail entirely) + const fallback = accounts + .map((account, index) => ({ account, index })) + .filter(({ account 
}) => { + if (!this.isAccountUsable(account, modelId)) return false; + if (!this.#healthTracker.isUsable(account.email)) return false; + if (!this.#tokenBucketTracker.hasTokens(account.email)) return false; + return true; + }); + if (fallback.length > 0) { + logger.warn('[HybridStrategy] All accounts have critical quota, using fallback'); + return { candidates: fallback, fallbackLevel: 'quota' }; + } + + // Emergency fallback: bypass health check when ALL accounts are unhealthy + // This prevents "Max retries exceeded" when health scores are too low + const emergency = accounts + .map((account, index) => ({ account, index })) + .filter(({ account }) => { + if (!this.isAccountUsable(account, modelId)) return false; + if (!this.#tokenBucketTracker.hasTokens(account.email)) return false; + // Skip health check - use "least bad" account + return true; + }); + if (emergency.length > 0) { + logger.warn('[HybridStrategy] EMERGENCY: All accounts unhealthy, using least bad account'); + return { candidates: emergency, fallbackLevel: 'emergency' }; + } + + // Last resort: bypass BOTH health AND token bucket checks + // Only check basic usability (not rate-limited, not disabled) + const lastResort = accounts + .map((account, index) => ({ account, index })) + .filter(({ account }) => { + // Only check if account is usable (not rate-limited, not disabled) + if (!this.isAccountUsable(account, modelId)) return false; + // Skip health and token bucket checks entirely + return true; + }); + if (lastResort.length > 0) { + logger.warn('[HybridStrategy] LAST RESORT: All accounts exhausted, using any usable account'); + return { candidates: lastResort, fallbackLevel: 'lastResort' }; + } + + return { candidates: [], fallbackLevel: 'normal' }; } /** @@ -202,11 +250,11 @@ export class HybridStrategy extends BaseStrategy { const quotaComponent = quotaScore * this.#weights.quota; // LRU component (older = higher score) - // Use time since last use, capped at 1 hour for scoring + // Use time 
since last use in seconds, capped at 1 hour (matches opencode-antigravity-auth) const lastUsed = account.lastUsed || 0; const timeSinceLastUse = Math.min(Date.now() - lastUsed, 3600000); // Cap at 1 hour - const lruMinutes = timeSinceLastUse / 60000; - const lruComponent = lruMinutes * this.#weights.lru; + const lruSeconds = timeSinceLastUse / 1000; + const lruComponent = lruSeconds * this.#weights.lru; // 0-3600 * 0.1 = 0-360 max return healthComponent + tokenComponent + quotaComponent + lruComponent; } diff --git a/src/account-manager/strategies/trackers/health-tracker.js b/src/account-manager/strategies/trackers/health-tracker.js index a53274f..15d2da2 100644 --- a/src/account-manager/strategies/trackers/health-tracker.js +++ b/src/account-manager/strategies/trackers/health-tracker.js @@ -12,7 +12,7 @@ const DEFAULT_CONFIG = { successReward: 1, // Points on successful request rateLimitPenalty: -10, // Points on rate limit failurePenalty: -20, // Points on other failures - recoveryPerHour: 2, // Passive recovery rate + recoveryPerHour: 10, // Passive recovery rate (increased from 2 for faster recovery) minUsable: 50, // Minimum score to be selected maxScore: 100 // Maximum score cap }; diff --git a/src/cloudcode/message-handler.js b/src/cloudcode/message-handler.js index a283b46..312a974 100644 --- a/src/cloudcode/message-handler.js +++ b/src/cloudcode/message-handler.js @@ -11,63 +11,92 @@ import { MAX_WAIT_BEFORE_ERROR_MS, DEFAULT_COOLDOWN_MS, RATE_LIMIT_DEDUP_WINDOW_MS, + RATE_LIMIT_STATE_RESET_MS, + FIRST_RETRY_DELAY_MS, + SWITCH_ACCOUNT_DELAY_MS, MAX_CONSECUTIVE_FAILURES, EXTENDED_COOLDOWN_MS, - CAPACITY_RETRY_DELAY_MS, + CAPACITY_BACKOFF_TIERS_MS, MAX_CAPACITY_RETRIES, + BACKOFF_BY_ERROR_TYPE, + QUOTA_EXHAUSTED_BACKOFF_TIERS_MS, + MIN_BACKOFF_MS, isThinkingModel } from '../constants.js'; import { convertGoogleToAnthropic } from '../format/index.js'; import { isRateLimitError, isAuthError } from '../errors.js'; import { formatDuration, sleep, isNetworkError 
} from '../utils/helpers.js'; import { logger } from '../utils/logger.js'; -import { parseResetTime } from './rate-limit-parser.js'; +import { parseResetTime, parseRateLimitReason } from './rate-limit-parser.js'; import { buildCloudCodeRequest, buildHeaders } from './request-builder.js'; import { parseThinkingSSEResponse } from './sse-parser.js'; import { getFallbackModel } from '../fallback-config.js'; /** - * Gap 1: Rate limit deduplication - prevents thundering herd on concurrent rate limits - * Tracks last rate limit timestamp per model to skip duplicate retries + * Rate limit deduplication - prevents thundering herd on concurrent rate limits. + * Tracks rate limit state per account+model including consecutive429 count and timestamps. */ -const lastRateLimitTimestamps = new Map(); // modelId -> timestamp +const rateLimitStateByAccountModel = new Map(); // `${email}:${model}` -> { consecutive429, lastAt } /** - * Check if we should skip retry due to recent rate limit on this model + * Get deduplication key for rate limit tracking + * @param {string} email - Account email * @param {string} model - Model ID - * @returns {boolean} True if retry should be skipped (within dedup window) + * @returns {string} Dedup key */ -function shouldSkipRetryDueToDedup(model) { - const lastTimestamp = lastRateLimitTimestamps.get(model); - if (!lastTimestamp) return false; +function getDedupKey(email, model) { + return `${email}:${model}`; +} - const elapsed = Date.now() - lastTimestamp; - if (elapsed < RATE_LIMIT_DEDUP_WINDOW_MS) { - logger.debug(`[CloudCode] Rate limit on ${model} within dedup window (${elapsed}ms ago), skipping retry`); - return true; +/** + * Get rate limit backoff with deduplication and exponential backoff (matches opencode-antigravity-auth) + * @param {string} email - Account email + * @param {string} model - Model ID + * @param {number|null} serverRetryAfterMs - Server-provided retry time + * @returns {{attempt: number, delayMs: number, isDuplicate: 
boolean}} Backoff info + */ +function getRateLimitBackoff(email, model, serverRetryAfterMs) { + const now = Date.now(); + const stateKey = getDedupKey(email, model); + const previous = rateLimitStateByAccountModel.get(stateKey); + + // Check if within dedup window - return duplicate status + if (previous && (now - previous.lastAt < RATE_LIMIT_DEDUP_WINDOW_MS)) { + const baseDelay = serverRetryAfterMs ?? FIRST_RETRY_DELAY_MS; + const backoffDelay = Math.min(baseDelay * Math.pow(2, previous.consecutive429 - 1), 60000); + logger.debug(`[CloudCode] Rate limit on ${email}:${model} within dedup window, attempt=${previous.consecutive429}, isDuplicate=true`); + return { attempt: previous.consecutive429, delayMs: Math.max(baseDelay, backoffDelay), isDuplicate: true }; } - return false; + + // Determine attempt number - reset after RATE_LIMIT_STATE_RESET_MS of inactivity + const attempt = previous && (now - previous.lastAt < RATE_LIMIT_STATE_RESET_MS) + ? previous.consecutive429 + 1 + : 1; + + // Update state + rateLimitStateByAccountModel.set(stateKey, { consecutive429: attempt, lastAt: now }); + + // Calculate exponential backoff + const baseDelay = serverRetryAfterMs ?? 
FIRST_RETRY_DELAY_MS; + const backoffDelay = Math.min(baseDelay * Math.pow(2, attempt - 1), 60000); + + logger.debug(`[CloudCode] Rate limit backoff for ${email}:${model}: attempt=${attempt}, delayMs=${Math.max(baseDelay, backoffDelay)}`); + return { attempt, delayMs: Math.max(baseDelay, backoffDelay), isDuplicate: false }; } /** - * Record rate limit timestamp for deduplication + * Clear rate limit state after successful request + * @param {string} email - Account email * @param {string} model - Model ID */ -function recordRateLimitTimestamp(model) { - lastRateLimitTimestamps.set(model, Date.now()); +function clearRateLimitState(email, model) { + const key = getDedupKey(email, model); + rateLimitStateByAccountModel.delete(key); } /** - * Clear rate limit timestamp after successful retry - * @param {string} model - Model ID - */ -function clearRateLimitTimestamp(model) { - lastRateLimitTimestamps.delete(model); -} - -/** - * Gap 3: Detect permanent authentication failures that require re-authentication - * These should mark the account as invalid rather than just clearing cache + * Detect permanent authentication failures that require re-authentication. + * These should mark the account as invalid rather than just clearing cache. * @param {string} errorText - Error message from API * @returns {boolean} True if permanent auth failure */ @@ -82,8 +111,8 @@ function isPermanentAuthFailure(errorText) { } /** - * Gap 4: Detect if 429 error is due to model capacity (not user quota) - * Capacity issues should retry on same account with shorter delay + * Detect if 429 error is due to model capacity (not user quota). + * Capacity issues should retry on same account with shorter delay. 
* @param {string} errorText - Error message from API * @returns {boolean} True if capacity exhausted (not quota) */ @@ -95,16 +124,47 @@ function isModelCapacityExhausted(errorText) { lower.includes('service temporarily unavailable'); } -// Periodically clean up stale dedup timestamps (every 60 seconds) +// Periodically clean up stale rate limit state (every 60 seconds) setInterval(() => { - const cutoff = Date.now() - 60000; // 1 minute - for (const [model, timestamp] of lastRateLimitTimestamps.entries()) { - if (timestamp < cutoff) { - lastRateLimitTimestamps.delete(model); + const cutoff = Date.now() - RATE_LIMIT_STATE_RESET_MS; + for (const [key, state] of rateLimitStateByAccountModel.entries()) { + if (state.lastAt < cutoff) { + rateLimitStateByAccountModel.delete(key); } } }, 60000); +/** + * Calculate smart backoff based on error type (matches opencode-antigravity-auth) + * @param {string} errorText - Error message + * @param {number|null} serverResetMs - Reset time from server + * @param {number} consecutiveFailures - Number of consecutive failures + * @returns {number} Backoff time in milliseconds + */ +function calculateSmartBackoff(errorText, serverResetMs, consecutiveFailures = 0) { + // If server provides a reset time, use it (with minimum floor to prevent loops) + if (serverResetMs && serverResetMs > 0) { + return Math.max(serverResetMs, MIN_BACKOFF_MS); + } + + const reason = parseRateLimitReason(errorText); + + switch (reason) { + case 'QUOTA_EXHAUSTED': + // Progressive backoff: [60s, 5m, 30m, 2h] + const tierIndex = Math.min(consecutiveFailures, QUOTA_EXHAUSTED_BACKOFF_TIERS_MS.length - 1); + return QUOTA_EXHAUSTED_BACKOFF_TIERS_MS[tierIndex]; + case 'RATE_LIMIT_EXCEEDED': + return BACKOFF_BY_ERROR_TYPE.RATE_LIMIT_EXCEEDED; + case 'MODEL_CAPACITY_EXHAUSTED': + return BACKOFF_BY_ERROR_TYPE.MODEL_CAPACITY_EXHAUSTED; + case 'SERVER_ERROR': + return BACKOFF_BY_ERROR_TYPE.SERVER_ERROR; + default: + return BACKOFF_BY_ERROR_TYPE.UNKNOWN; + } +} + /** * 
Send a non-streaming request to Cloud Code with multi-account support * Uses SSE endpoint for thinking models (non-streaming doesn't return thinking blocks) @@ -174,7 +234,7 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab // Select account using configured strategy const { account, waitMs } = accountManager.selectAccount(model); - // If strategy returns a wait time, sleep and retry + // If strategy returns a wait time without an account, sleep and retry if (!account && waitMs > 0) { logger.info(`[CloudCode] Waiting ${formatDuration(waitMs)} for account...`); await sleep(waitMs + 500); @@ -182,6 +242,13 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab continue; } + // If strategy returns an account with throttle wait (fallback mode), apply delay + // This prevents overwhelming the API when using emergency/lastResort fallbacks + if (account && waitMs > 0) { + logger.debug(`[CloudCode] Throttling request (${waitMs}ms) - fallback mode active`); + await sleep(waitMs); + } + if (!account) { logger.warn(`[CloudCode] Strategy returned no account for ${model} (attempt ${attempt + 1}/${maxAttempts})`); continue; @@ -197,8 +264,7 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab // Try each endpoint with index-based loop for capacity retry support let lastError = null; - let retriedOnce = false; // Track if we've already retried for short rate limit - let capacityRetryCount = 0; // Gap 4: Track capacity exhaustion retries + let capacityRetryCount = 0; let endpointIndex = 0; while (endpointIndex < ANTIGRAVITY_ENDPOINT_FALLBACKS.length) { @@ -219,7 +285,7 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab logger.warn(`[CloudCode] Error at ${endpoint}: ${response.status} - ${errorText}`); if (response.status === 401) { - // Gap 3: Check for permanent auth failures + // Check for permanent auth failures if (isPermanentAuthFailure(errorText)) { 
logger.error(`[CloudCode] Permanent auth failure for ${account.email}: ${errorText.substring(0, 100)}`); accountManager.markInvalid(account.email, 'Token revoked - re-authentication required'); @@ -236,12 +302,17 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab if (response.status === 429) { const resetMs = parseResetTime(response, errorText); + const consecutiveFailures = accountManager.getConsecutiveFailures?.(account.email) || 0; - // Gap 4: Check if capacity issue (NOT quota) - retry SAME endpoint + // Check if capacity issue (NOT quota) - retry same endpoint with progressive backoff if (isModelCapacityExhausted(errorText)) { if (capacityRetryCount < MAX_CAPACITY_RETRIES) { + // Progressive capacity backoff tiers + const tierIndex = Math.min(capacityRetryCount, CAPACITY_BACKOFF_TIERS_MS.length - 1); + const waitMs = resetMs || CAPACITY_BACKOFF_TIERS_MS[tierIndex]; capacityRetryCount++; - const waitMs = resetMs || CAPACITY_RETRY_DELAY_MS; + // Track failures for progressive backoff escalation (matches opencode-antigravity-auth) + accountManager.incrementConsecutiveFailures(account.email); logger.info(`[CloudCode] Model capacity exhausted, retry ${capacityRetryCount}/${MAX_CAPACITY_RETRIES} after ${formatDuration(waitMs)}...`); await sleep(waitMs); // Don't increment endpointIndex - retry same endpoint @@ -251,39 +322,80 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab logger.warn(`[CloudCode] Max capacity retries (${MAX_CAPACITY_RETRIES}) exceeded, switching account`); } - // Gap 1: Check deduplication window to prevent thundering herd - if (shouldSkipRetryDueToDedup(model)) { - logger.info(`[CloudCode] Skipping retry due to recent rate limit, switching account...`); - accountManager.markRateLimited(account.email, resetMs || DEFAULT_COOLDOWN_MS, model); + // Get rate limit backoff with exponential backoff and state reset + const backoff = getRateLimitBackoff(account.email, model, resetMs); + + 
// For very short rate limits (< 1 second), always wait and retry + // Switching accounts won't help when all accounts have per-second rate limits + if (resetMs !== null && resetMs < 1000) { + const waitMs = resetMs; + logger.info(`[CloudCode] Short rate limit on ${account.email} (${resetMs}ms), waiting and retrying...`); + await sleep(waitMs); + // Don't increment endpointIndex - retry same endpoint + continue; + } + + // If within dedup window AND reset time is >= 1s, switch account + if (backoff.isDuplicate) { + const smartBackoffMs = calculateSmartBackoff(errorText, resetMs, consecutiveFailures); + logger.info(`[CloudCode] Skipping retry due to recent rate limit on ${account.email} (attempt ${backoff.attempt}), switching account...`); + accountManager.markRateLimited(account.email, smartBackoffMs, model); throw new Error(`RATE_LIMITED_DEDUP: ${errorText}`); } + // Calculate smart backoff based on error type + const smartBackoffMs = calculateSmartBackoff(errorText, resetMs, consecutiveFailures); + // Decision: wait and retry OR switch account - if (resetMs && resetMs > DEFAULT_COOLDOWN_MS) { - // Long-term quota exhaustion (> 10s) - switch to next account - logger.info(`[CloudCode] Quota exhausted for ${account.email} (${formatDuration(resetMs)}), switching account...`); - accountManager.markRateLimited(account.email, resetMs, model); + // First 429 gets a quick 1s retry (FIRST_RETRY_DELAY_MS) + if (backoff.attempt === 1 && smartBackoffMs <= DEFAULT_COOLDOWN_MS) { + // Quick 1s retry on first 429 (matches opencode-antigravity-auth) + const waitMs = backoff.delayMs; + // markRateLimited already increments consecutiveFailures internally + // This prevents concurrent retry storms and ensures progressive backoff escalation + accountManager.markRateLimited(account.email, waitMs, model); + logger.info(`[CloudCode] First rate limit on ${account.email}, quick retry after ${formatDuration(waitMs)}...`); + await sleep(waitMs); + // Don't increment endpointIndex - retry 
same endpoint + continue; + } else if (smartBackoffMs > DEFAULT_COOLDOWN_MS) { + // Long-term quota exhaustion (> 10s) - wait SWITCH_ACCOUNT_DELAY_MS then switch + logger.info(`[CloudCode] Quota exhausted for ${account.email} (${formatDuration(smartBackoffMs)}), switching account after ${formatDuration(SWITCH_ACCOUNT_DELAY_MS)} delay...`); + await sleep(SWITCH_ACCOUNT_DELAY_MS); + accountManager.markRateLimited(account.email, smartBackoffMs, model); throw new Error(`QUOTA_EXHAUSTED: ${errorText}`); } else { - // Short-term rate limit (<= 10s) - wait and retry once - const waitMs = resetMs || DEFAULT_COOLDOWN_MS; - - if (!retriedOnce) { - retriedOnce = true; - recordRateLimitTimestamp(model); // Gap 1: Record before retry - logger.info(`[CloudCode] Short rate limit (${formatDuration(waitMs)}), waiting and retrying...`); - await sleep(waitMs); - // Don't increment endpointIndex - retry same endpoint - continue; - } else { - // Already retried once, mark and switch - accountManager.markRateLimited(account.email, waitMs, model); - throw new Error(`RATE_LIMITED: ${errorText}`); - } + // Short-term rate limit but not first attempt - use exponential backoff delay + const waitMs = backoff.delayMs; + // markRateLimited already increments consecutiveFailures internally + accountManager.markRateLimited(account.email, waitMs, model); + logger.info(`[CloudCode] Rate limit on ${account.email} (attempt ${backoff.attempt}), waiting ${formatDuration(waitMs)}...`); + await sleep(waitMs); + // Don't increment endpointIndex - retry same endpoint + continue; } } if (response.status >= 400) { + // Check for 503 MODEL_CAPACITY_EXHAUSTED - use progressive backoff like 429 capacity + if (response.status === 503 && isModelCapacityExhausted(errorText)) { + if (capacityRetryCount < MAX_CAPACITY_RETRIES) { + // Progressive capacity backoff tiers (same as 429 capacity handling) + const tierIndex = Math.min(capacityRetryCount, CAPACITY_BACKOFF_TIERS_MS.length - 1); + const waitMs = 
CAPACITY_BACKOFF_TIERS_MS[tierIndex]; + capacityRetryCount++; + accountManager.incrementConsecutiveFailures(account.email); + logger.info(`[CloudCode] 503 Model capacity exhausted, retry ${capacityRetryCount}/${MAX_CAPACITY_RETRIES} after ${formatDuration(waitMs)}...`); + await sleep(waitMs); + // Don't increment endpointIndex - retry same endpoint + continue; + } + // Max capacity retries exceeded - switch account + logger.warn(`[CloudCode] Max capacity retries (${MAX_CAPACITY_RETRIES}) exceeded on 503, switching account`); + accountManager.markRateLimited(account.email, BACKOFF_BY_ERROR_TYPE.MODEL_CAPACITY_EXHAUSTED, model); + throw new Error(`CAPACITY_EXHAUSTED: ${errorText}`); + } + lastError = new Error(`API error ${response.status}: ${errorText}`); // Try next endpoint for 403/404/5xx errors (matches opencode-antigravity-auth behavior) if (response.status === 403 || response.status === 404) { @@ -300,8 +412,8 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab // For thinking models, parse SSE and accumulate all parts if (isThinking) { const result = await parseThinkingSSEResponse(response, anthropicRequest.model); - // Gap 1: Clear timestamp on success - clearRateLimitTimestamp(model); + // Clear rate limit state on success + clearRateLimitState(account.email, model); accountManager.notifySuccess(account, model); return result; } @@ -309,8 +421,8 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab // Non-thinking models use regular JSON const data = await response.json(); logger.debug('[CloudCode] Response received'); - // Gap 1: Clear timestamp on success - clearRateLimitTimestamp(model); + // Clear rate limit state on success + clearRateLimitState(account.email, model); accountManager.notifySuccess(account, model); return convertGoogleToAnthropic(data, anthropicRequest.model); @@ -350,13 +462,15 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab if 
(error.message.includes('API error 5') || error.message.includes('500') || error.message.includes('503')) { accountManager.notifyFailure(account, model); - // Gap 2: Check consecutive failures for extended cooldown - const consecutiveFailures = accountManager.getHealthTracker()?.getConsecutiveFailures(account.email) || 0; - if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) { - logger.warn(`[CloudCode] Account ${account.email} has ${consecutiveFailures} consecutive failures, applying extended cooldown (${formatDuration(EXTENDED_COOLDOWN_MS)})`); + // Track 5xx errors for extended cooldown + // Note: markRateLimited already increments consecutiveFailures internally + const currentFailures = accountManager.getConsecutiveFailures(account.email); + if (currentFailures + 1 >= MAX_CONSECUTIVE_FAILURES) { + logger.warn(`[CloudCode] Account ${account.email} has ${currentFailures + 1} consecutive failures, applying extended cooldown (${formatDuration(EXTENDED_COOLDOWN_MS)})`); accountManager.markRateLimited(account.email, EXTENDED_COOLDOWN_MS, model); } else { - logger.warn(`[CloudCode] Account ${account.email} failed with 5xx error, trying next...`); + accountManager.incrementConsecutiveFailures(account.email); + logger.warn(`[CloudCode] Account ${account.email} failed with 5xx error (${currentFailures + 1}/${MAX_CONSECUTIVE_FAILURES}), trying next...`); } continue; } @@ -364,13 +478,15 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab if (isNetworkError(error)) { accountManager.notifyFailure(account, model); - // Gap 2: Check consecutive failures for extended cooldown - const consecutiveFailures = accountManager.getHealthTracker()?.getConsecutiveFailures(account.email) || 0; - if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) { - logger.warn(`[CloudCode] Account ${account.email} has ${consecutiveFailures} consecutive network failures, applying extended cooldown (${formatDuration(EXTENDED_COOLDOWN_MS)})`); + // Track network errors 
for extended cooldown + // Note: markRateLimited already increments consecutiveFailures internally + const currentFailures = accountManager.getConsecutiveFailures(account.email); + if (currentFailures + 1 >= MAX_CONSECUTIVE_FAILURES) { + logger.warn(`[CloudCode] Account ${account.email} has ${currentFailures + 1} consecutive network failures, applying extended cooldown (${formatDuration(EXTENDED_COOLDOWN_MS)})`); accountManager.markRateLimited(account.email, EXTENDED_COOLDOWN_MS, model); } else { - logger.warn(`[CloudCode] Network error for ${account.email}, trying next account... (${error.message})`); + accountManager.incrementConsecutiveFailures(account.email); + logger.warn(`[CloudCode] Network error for ${account.email} (${currentFailures + 1}/${MAX_CONSECUTIVE_FAILURES}), trying next account... (${error.message})`); } await sleep(1000); continue; diff --git a/src/cloudcode/rate-limit-parser.js b/src/cloudcode/rate-limit-parser.js index 2fddb26..d6e1cb7 100644 --- a/src/cloudcode/rate-limit-parser.js +++ b/src/cloudcode/rate-limit-parser.js @@ -167,15 +167,69 @@ export function parseResetTime(responseOrError, errorText = '') { } } - // SANITY CHECK: Enforce strict minimums for found rate limits - // If we found a reset time, but it's very small (e.g. < 1s) or negative, - // explicitly bump it up to avoid "Available in 0s" loops. 
+ // SANITY CHECK: Handle very small or negative reset times + // For sub-second rate limits (common with per-second quotas), add a small buffer + // For negative or zero, use a reasonable minimum if (resetMs !== null) { - if (resetMs < 1000) { - logger.debug(`[CloudCode] Reset time too small (${resetMs}ms), enforcing 2s buffer`); - resetMs = 2000; + if (resetMs <= 0) { + logger.debug(`[CloudCode] Reset time invalid (${resetMs}ms), using 500ms default`); + resetMs = 500; + } else if (resetMs < 500) { + // Very short reset - add 200ms buffer for network latency + logger.debug(`[CloudCode] Short reset time (${resetMs}ms), adding 200ms buffer`); + resetMs = resetMs + 200; } + // Note: No longer enforcing 2s minimum - this was causing cascading failures + // when all accounts had short rate limits simultaneously } return resetMs; } + +/** + * Parse the rate limit reason from error text + * Used for smart backoff by error type (matches opencode-antigravity-auth) + * + * @param {string} errorText - Error message/body text + * @returns {'RATE_LIMIT_EXCEEDED' | 'QUOTA_EXHAUSTED' | 'MODEL_CAPACITY_EXHAUSTED' | 'SERVER_ERROR' | 'UNKNOWN'} Error reason + */ +export function parseRateLimitReason(errorText) { + const lower = (errorText || '').toLowerCase(); + + // Check for quota exhaustion (daily/hourly limits) + if (lower.includes('quota_exhausted') || + lower.includes('quotaresetdelay') || + lower.includes('quotaresettimestamp') || + lower.includes('resource_exhausted') || + lower.includes('daily limit') || + lower.includes('quota exceeded')) { + return 'QUOTA_EXHAUSTED'; + } + + // Check for model capacity issues (temporary, retry quickly) + if (lower.includes('model_capacity_exhausted') || + lower.includes('capacity_exhausted') || + lower.includes('model is currently overloaded') || + lower.includes('service temporarily unavailable')) { + return 'MODEL_CAPACITY_EXHAUSTED'; + } + + // Check for rate limiting (per-minute limits) + if (lower.includes('rate_limit_exceeded') || 
+ lower.includes('rate limit') || + lower.includes('too many requests') || + lower.includes('throttl')) { + return 'RATE_LIMIT_EXCEEDED'; + } + + // Check for server errors + if (lower.includes('internal server error') || + lower.includes('server error') || + lower.includes('503') || + lower.includes('502') || + lower.includes('504')) { + return 'SERVER_ERROR'; + } + + return 'UNKNOWN'; +} diff --git a/src/cloudcode/streaming-handler.js b/src/cloudcode/streaming-handler.js index 0cf0f3e..d32c363 100644 --- a/src/cloudcode/streaming-handler.js +++ b/src/cloudcode/streaming-handler.js @@ -12,61 +12,90 @@ import { MAX_WAIT_BEFORE_ERROR_MS, DEFAULT_COOLDOWN_MS, RATE_LIMIT_DEDUP_WINDOW_MS, + RATE_LIMIT_STATE_RESET_MS, + FIRST_RETRY_DELAY_MS, + SWITCH_ACCOUNT_DELAY_MS, MAX_CONSECUTIVE_FAILURES, EXTENDED_COOLDOWN_MS, - CAPACITY_RETRY_DELAY_MS, - MAX_CAPACITY_RETRIES + CAPACITY_BACKOFF_TIERS_MS, + MAX_CAPACITY_RETRIES, + BACKOFF_BY_ERROR_TYPE, + QUOTA_EXHAUSTED_BACKOFF_TIERS_MS, + MIN_BACKOFF_MS } from '../constants.js'; import { isRateLimitError, isAuthError, isEmptyResponseError } from '../errors.js'; import { formatDuration, sleep, isNetworkError } from '../utils/helpers.js'; import { logger } from '../utils/logger.js'; -import { parseResetTime } from './rate-limit-parser.js'; +import { parseResetTime, parseRateLimitReason } from './rate-limit-parser.js'; import { buildCloudCodeRequest, buildHeaders } from './request-builder.js'; import { streamSSEResponse } from './sse-streamer.js'; import { getFallbackModel } from '../fallback-config.js'; import crypto from 'crypto'; /** - * Gap 1: Rate limit deduplication - prevents thundering herd on concurrent rate limits - * Tracks last rate limit timestamp per model to skip duplicate retries + * Rate limit deduplication - prevents thundering herd on concurrent rate limits. + * Tracks rate limit state per account+model including consecutive429 count and timestamps. 
*/ -const lastRateLimitTimestamps = new Map(); // modelId -> timestamp +const rateLimitStateByAccountModel = new Map(); // `${email}:${model}` -> { consecutive429, lastAt } /** - * Check if we should skip retry due to recent rate limit on this model + * Get deduplication key for rate limit tracking + * @param {string} email - Account email * @param {string} model - Model ID - * @returns {boolean} True if retry should be skipped (within dedup window) + * @returns {string} Dedup key */ -function shouldSkipRetryDueToDedup(model) { - const lastTimestamp = lastRateLimitTimestamps.get(model); - if (!lastTimestamp) return false; +function getDedupKey(email, model) { + return `${email}:${model}`; +} - const elapsed = Date.now() - lastTimestamp; - if (elapsed < RATE_LIMIT_DEDUP_WINDOW_MS) { - logger.debug(`[CloudCode] Rate limit on ${model} within dedup window (${elapsed}ms ago), skipping retry`); - return true; +/** + * Get rate limit backoff with deduplication and exponential backoff (matches opencode-antigravity-auth) + * @param {string} email - Account email + * @param {string} model - Model ID + * @param {number|null} serverRetryAfterMs - Server-provided retry time + * @returns {{attempt: number, delayMs: number, isDuplicate: boolean}} Backoff info + */ +function getRateLimitBackoff(email, model, serverRetryAfterMs) { + const now = Date.now(); + const stateKey = getDedupKey(email, model); + const previous = rateLimitStateByAccountModel.get(stateKey); + + // Check if within dedup window - return duplicate status + if (previous && (now - previous.lastAt < RATE_LIMIT_DEDUP_WINDOW_MS)) { + const baseDelay = serverRetryAfterMs ?? 
FIRST_RETRY_DELAY_MS; + const backoffDelay = Math.min(baseDelay * Math.pow(2, previous.consecutive429 - 1), 60000); + logger.debug(`[CloudCode] Rate limit on ${email}:${model} within dedup window, attempt=${previous.consecutive429}, isDuplicate=true`); + return { attempt: previous.consecutive429, delayMs: Math.max(baseDelay, backoffDelay), isDuplicate: true }; } - return false; + + // Determine attempt number - reset after RATE_LIMIT_STATE_RESET_MS of inactivity + const attempt = previous && (now - previous.lastAt < RATE_LIMIT_STATE_RESET_MS) + ? previous.consecutive429 + 1 + : 1; + + // Update state + rateLimitStateByAccountModel.set(stateKey, { consecutive429: attempt, lastAt: now }); + + // Calculate exponential backoff + const baseDelay = serverRetryAfterMs ?? FIRST_RETRY_DELAY_MS; + const backoffDelay = Math.min(baseDelay * Math.pow(2, attempt - 1), 60000); + + logger.debug(`[CloudCode] Rate limit backoff for ${email}:${model}: attempt=${attempt}, delayMs=${Math.max(baseDelay, backoffDelay)}`); + return { attempt, delayMs: Math.max(baseDelay, backoffDelay), isDuplicate: false }; } /** - * Record rate limit timestamp for deduplication + * Clear rate limit state after successful request + * @param {string} email - Account email * @param {string} model - Model ID */ -function recordRateLimitTimestamp(model) { - lastRateLimitTimestamps.set(model, Date.now()); +function clearRateLimitState(email, model) { + const key = getDedupKey(email, model); + rateLimitStateByAccountModel.delete(key); } /** - * Clear rate limit timestamp after successful retry - * @param {string} model - Model ID - */ -function clearRateLimitTimestamp(model) { - lastRateLimitTimestamps.delete(model); -} - -/** - * Gap 3: Detect permanent authentication failures that require re-authentication + * Detect permanent authentication failures that require re-authentication. 
* @param {string} errorText - Error message from API * @returns {boolean} True if permanent auth failure */ @@ -81,7 +110,7 @@ function isPermanentAuthFailure(errorText) { } /** - * Gap 4: Detect if 429 error is due to model capacity (not user quota) + * Detect if 429 error is due to model capacity (not user quota). * @param {string} errorText - Error message from API * @returns {boolean} True if capacity exhausted (not quota) */ @@ -93,16 +122,47 @@ function isModelCapacityExhausted(errorText) { lower.includes('service temporarily unavailable'); } -// Periodically clean up stale dedup timestamps (every 60 seconds) +// Periodically clean up stale rate limit state (every 60 seconds) setInterval(() => { - const cutoff = Date.now() - 60000; // 1 minute - for (const [model, timestamp] of lastRateLimitTimestamps.entries()) { - if (timestamp < cutoff) { - lastRateLimitTimestamps.delete(model); + const cutoff = Date.now() - RATE_LIMIT_STATE_RESET_MS; + for (const [key, state] of rateLimitStateByAccountModel.entries()) { + if (state.lastAt < cutoff) { + rateLimitStateByAccountModel.delete(key); } } }, 60000); +/** + * Calculate smart backoff based on error type (matches opencode-antigravity-auth) + * @param {string} errorText - Error message + * @param {number|null} serverResetMs - Reset time from server + * @param {number} consecutiveFailures - Number of consecutive failures + * @returns {number} Backoff time in milliseconds + */ +function calculateSmartBackoff(errorText, serverResetMs, consecutiveFailures = 0) { + // If server provides a reset time, use it (with minimum floor to prevent loops) + if (serverResetMs && serverResetMs > 0) { + return Math.max(serverResetMs, MIN_BACKOFF_MS); + } + + const reason = parseRateLimitReason(errorText); + + switch (reason) { + case 'QUOTA_EXHAUSTED': + // Progressive backoff: [60s, 5m, 30m, 2h] + const tierIndex = Math.min(consecutiveFailures, QUOTA_EXHAUSTED_BACKOFF_TIERS_MS.length - 1); + return 
QUOTA_EXHAUSTED_BACKOFF_TIERS_MS[tierIndex]; + case 'RATE_LIMIT_EXCEEDED': + return BACKOFF_BY_ERROR_TYPE.RATE_LIMIT_EXCEEDED; + case 'MODEL_CAPACITY_EXHAUSTED': + return BACKOFF_BY_ERROR_TYPE.MODEL_CAPACITY_EXHAUSTED; + case 'SERVER_ERROR': + return BACKOFF_BY_ERROR_TYPE.SERVER_ERROR; + default: + return BACKOFF_BY_ERROR_TYPE.UNKNOWN; + } +} + /** * Send a streaming request to Cloud Code with multi-account support * Streams events in real-time as they arrive from the server @@ -172,7 +232,7 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb // Select account using configured strategy const { account, waitMs } = accountManager.selectAccount(model); - // If strategy returns a wait time, sleep and retry + // If strategy returns a wait time without an account, sleep and retry if (!account && waitMs > 0) { logger.info(`[CloudCode] Waiting ${formatDuration(waitMs)} for account...`); await sleep(waitMs + 500); @@ -180,6 +240,13 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb continue; } + // If strategy returns an account with throttle wait (fallback mode), apply delay + // This prevents overwhelming the API when using emergency/lastResort fallbacks + if (account && waitMs > 0) { + logger.debug(`[CloudCode] Throttling request (${waitMs}ms) - fallback mode active`); + await sleep(waitMs); + } + if (!account) { logger.warn(`[CloudCode] Strategy returned no account for ${model} (attempt ${attempt + 1}/${maxAttempts})`); continue; @@ -195,8 +262,7 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb // Try each endpoint with index-based loop for capacity retry support let lastError = null; - let retriedOnce = false; // Track if we've already retried for short rate limit - let capacityRetryCount = 0; // Gap 4: Track capacity exhaustion retries + let capacityRetryCount = 0; let endpointIndex = 0; while (endpointIndex < ANTIGRAVITY_ENDPOINT_FALLBACKS.length) { @@ -215,7 +281,7 @@ 
export async function* sendMessageStream(anthropicRequest, accountManager, fallb logger.warn(`[CloudCode] Stream error at ${endpoint}: ${response.status} - ${errorText}`); if (response.status === 401) { - // Gap 3: Check for permanent auth failures + // Check for permanent auth failures if (isPermanentAuthFailure(errorText)) { logger.error(`[CloudCode] Permanent auth failure for ${account.email}: ${errorText.substring(0, 100)}`); accountManager.markInvalid(account.email, 'Token revoked - re-authentication required'); @@ -231,12 +297,17 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb if (response.status === 429) { const resetMs = parseResetTime(response, errorText); + const consecutiveFailures = accountManager.getConsecutiveFailures?.(account.email) || 0; - // Gap 4: Check if capacity issue (NOT quota) - retry SAME endpoint + // Check if capacity issue (NOT quota) - retry same endpoint with progressive backoff if (isModelCapacityExhausted(errorText)) { if (capacityRetryCount < MAX_CAPACITY_RETRIES) { + // Progressive capacity backoff tiers + const tierIndex = Math.min(capacityRetryCount, CAPACITY_BACKOFF_TIERS_MS.length - 1); + const waitMs = resetMs || CAPACITY_BACKOFF_TIERS_MS[tierIndex]; capacityRetryCount++; - const waitMs = resetMs || CAPACITY_RETRY_DELAY_MS; + // Track failures for progressive backoff escalation (matches opencode-antigravity-auth) + accountManager.incrementConsecutiveFailures(account.email); logger.info(`[CloudCode] Model capacity exhausted, retry ${capacityRetryCount}/${MAX_CAPACITY_RETRIES} after ${formatDuration(waitMs)}...`); await sleep(waitMs); // Don't increment endpointIndex - retry same endpoint @@ -246,38 +317,78 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb logger.warn(`[CloudCode] Max capacity retries (${MAX_CAPACITY_RETRIES}) exceeded, switching account`); } - // Gap 1: Check deduplication window to prevent thundering herd - if 
(shouldSkipRetryDueToDedup(model)) { - logger.info(`[CloudCode] Skipping retry due to recent rate limit, switching account...`); - accountManager.markRateLimited(account.email, resetMs || DEFAULT_COOLDOWN_MS, model); + // Get rate limit backoff with exponential backoff and state reset + const backoff = getRateLimitBackoff(account.email, model, resetMs); + + // For very short rate limits (< 1 second), always wait and retry + // Switching accounts won't help when all accounts have per-second rate limits + if (resetMs !== null && resetMs < 1000) { + const waitMs = resetMs; + logger.info(`[CloudCode] Short rate limit on ${account.email} (${resetMs}ms), waiting and retrying...`); + await sleep(waitMs); + // Don't increment endpointIndex - retry same endpoint + continue; + } + + // If within dedup window AND reset time is >= 1s, switch account + if (backoff.isDuplicate) { + const smartBackoffMs = calculateSmartBackoff(errorText, resetMs, consecutiveFailures); + logger.info(`[CloudCode] Skipping retry due to recent rate limit on ${account.email} (attempt ${backoff.attempt}), switching account...`); + accountManager.markRateLimited(account.email, smartBackoffMs, model); throw new Error(`RATE_LIMITED_DEDUP: ${errorText}`); } + // Calculate smart backoff based on error type + const smartBackoffMs = calculateSmartBackoff(errorText, resetMs, consecutiveFailures); + // Decision: wait and retry OR switch account - if (resetMs && resetMs > DEFAULT_COOLDOWN_MS) { - // Long-term quota exhaustion (> 10s) - switch to next account - logger.info(`[CloudCode] Quota exhausted for ${account.email} (${formatDuration(resetMs)}), switching account...`); - accountManager.markRateLimited(account.email, resetMs, model); + // First 429 gets a quick 1s retry (FIRST_RETRY_DELAY_MS) + if (backoff.attempt === 1 && smartBackoffMs <= DEFAULT_COOLDOWN_MS) { + // Quick 1s retry on first 429 (matches opencode-antigravity-auth) + const waitMs = backoff.delayMs; + // markRateLimited already increments 
consecutiveFailures internally + accountManager.markRateLimited(account.email, waitMs, model); + logger.info(`[CloudCode] First rate limit on ${account.email}, quick retry after ${formatDuration(waitMs)}...`); + await sleep(waitMs); + // Don't increment endpointIndex - retry same endpoint + continue; + } else if (smartBackoffMs > DEFAULT_COOLDOWN_MS) { + // Long-term quota exhaustion (> 10s) - wait SWITCH_ACCOUNT_DELAY_MS then switch + logger.info(`[CloudCode] Quota exhausted for ${account.email} (${formatDuration(smartBackoffMs)}), switching account after ${formatDuration(SWITCH_ACCOUNT_DELAY_MS)} delay...`); + await sleep(SWITCH_ACCOUNT_DELAY_MS); + accountManager.markRateLimited(account.email, smartBackoffMs, model); throw new Error(`QUOTA_EXHAUSTED: ${errorText}`); } else { - // Short-term rate limit (<= 10s) - wait and retry once - const waitMs = resetMs || DEFAULT_COOLDOWN_MS; - - if (!retriedOnce) { - retriedOnce = true; - recordRateLimitTimestamp(model); // Gap 1: Record before retry - logger.info(`[CloudCode] Short rate limit (${formatDuration(waitMs)}), waiting and retrying...`); - await sleep(waitMs); - // Don't increment endpointIndex - retry same endpoint - continue; - } else { - // Already retried once, mark and switch - accountManager.markRateLimited(account.email, waitMs, model); - throw new Error(`RATE_LIMITED: ${errorText}`); - } + // Short-term rate limit but not first attempt - use exponential backoff delay + const waitMs = backoff.delayMs; + // markRateLimited already increments consecutiveFailures internally + accountManager.markRateLimited(account.email, waitMs, model); + logger.info(`[CloudCode] Rate limit on ${account.email} (attempt ${backoff.attempt}), waiting ${formatDuration(waitMs)}...`); + await sleep(waitMs); + // Don't increment endpointIndex - retry same endpoint + continue; } } + // Check for 503 MODEL_CAPACITY_EXHAUSTED - use progressive backoff like 429 capacity + if (response.status === 503 && 
isModelCapacityExhausted(errorText)) { + if (capacityRetryCount < MAX_CAPACITY_RETRIES) { + // Progressive capacity backoff tiers (same as 429 capacity handling) + const tierIndex = Math.min(capacityRetryCount, CAPACITY_BACKOFF_TIERS_MS.length - 1); + const waitMs = CAPACITY_BACKOFF_TIERS_MS[tierIndex]; + capacityRetryCount++; + accountManager.incrementConsecutiveFailures(account.email); + logger.info(`[CloudCode] 503 Model capacity exhausted, retry ${capacityRetryCount}/${MAX_CAPACITY_RETRIES} after ${formatDuration(waitMs)}...`); + await sleep(waitMs); + // Don't increment endpointIndex - retry same endpoint + continue; + } + // Max capacity retries exceeded - switch account + logger.warn(`[CloudCode] Max capacity retries (${MAX_CAPACITY_RETRIES}) exceeded on 503, switching account`); + accountManager.markRateLimited(account.email, BACKOFF_BY_ERROR_TYPE.MODEL_CAPACITY_EXHAUSTED, model); + throw new Error(`CAPACITY_EXHAUSTED: ${errorText}`); + } + lastError = new Error(`API error ${response.status}: ${errorText}`); // Try next endpoint for 403/404/5xx errors (matches opencode-antigravity-auth behavior) @@ -299,8 +410,8 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb try { yield* streamSSEResponse(currentResponse, anthropicRequest.model); logger.debug('[CloudCode] Stream completed'); - // Gap 1: Clear timestamp on success - clearRateLimitTimestamp(model); + // Clear rate limit state on success + clearRateLimitState(account.email, model); accountManager.notifySuccess(account, model); return; } catch (streamError) { @@ -409,13 +520,15 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb if (error.message.includes('API error 5') || error.message.includes('500') || error.message.includes('503')) { accountManager.notifyFailure(account, model); - // Gap 2: Check consecutive failures for extended cooldown - const consecutiveFailures = accountManager.getHealthTracker()?.getConsecutiveFailures(account.email) 
|| 0; - if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) { - logger.warn(`[CloudCode] Account ${account.email} has ${consecutiveFailures} consecutive failures, applying extended cooldown (${formatDuration(EXTENDED_COOLDOWN_MS)})`); + // Track 5xx errors for extended cooldown + // Note: markRateLimited already increments consecutiveFailures internally + const currentFailures = accountManager.getConsecutiveFailures(account.email); + if (currentFailures + 1 >= MAX_CONSECUTIVE_FAILURES) { + logger.warn(`[CloudCode] Account ${account.email} has ${currentFailures + 1} consecutive failures, applying extended cooldown (${formatDuration(EXTENDED_COOLDOWN_MS)})`); accountManager.markRateLimited(account.email, EXTENDED_COOLDOWN_MS, model); } else { - logger.warn(`[CloudCode] Account ${account.email} failed with 5xx stream error, trying next...`); + accountManager.incrementConsecutiveFailures(account.email); + logger.warn(`[CloudCode] Account ${account.email} failed with 5xx stream error (${currentFailures + 1}/${MAX_CONSECUTIVE_FAILURES}), trying next...`); } continue; } @@ -423,13 +536,15 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb if (isNetworkError(error)) { accountManager.notifyFailure(account, model); - // Gap 2: Check consecutive failures for extended cooldown - const consecutiveFailures = accountManager.getHealthTracker()?.getConsecutiveFailures(account.email) || 0; - if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) { - logger.warn(`[CloudCode] Account ${account.email} has ${consecutiveFailures} consecutive network failures, applying extended cooldown (${formatDuration(EXTENDED_COOLDOWN_MS)})`); + // Track network errors for extended cooldown + // Note: markRateLimited already increments consecutiveFailures internally + const currentFailures = accountManager.getConsecutiveFailures(account.email); + if (currentFailures + 1 >= MAX_CONSECUTIVE_FAILURES) { + logger.warn(`[CloudCode] Account ${account.email} has 
${currentFailures + 1} consecutive network failures, applying extended cooldown (${formatDuration(EXTENDED_COOLDOWN_MS)})`); accountManager.markRateLimited(account.email, EXTENDED_COOLDOWN_MS, model); } else { - logger.warn(`[CloudCode] Network error for ${account.email} (stream), trying next account... (${error.message})`); + accountManager.incrementConsecutiveFailures(account.email); + logger.warn(`[CloudCode] Network error for ${account.email} (stream) (${currentFailures + 1}/${MAX_CONSECUTIVE_FAILURES}), trying next account... (${error.message})`); } await sleep(1000); continue; diff --git a/src/config.js b/src/config.js index d2c6008..7d23a36 100644 --- a/src/config.js +++ b/src/config.js @@ -16,6 +16,11 @@ const DEFAULT_CONFIG = { defaultCooldownMs: 10000, // 10 seconds maxWaitBeforeErrorMs: 120000, // 2 minutes maxAccounts: 10, // Maximum number of accounts allowed + // Rate limit handling (matches opencode-antigravity-auth) + rateLimitDedupWindowMs: 2000, // 2 seconds - prevents concurrent retry storms + maxConsecutiveFailures: 3, // Before applying extended cooldown + extendedCooldownMs: 60000, // 1 minute extended cooldown + maxCapacityRetries: 5, // Max retries for capacity exhaustion modelMapping: {}, // Account selection strategy configuration accountSelection: { diff --git a/src/constants.js b/src/constants.js index ad20ccf..1bf8a2d 100644 --- a/src/constants.js +++ b/src/constants.js @@ -103,16 +103,33 @@ export const MAX_ACCOUNTS = config?.maxAccounts || 10; // From config or 10 // Rate limit wait thresholds export const MAX_WAIT_BEFORE_ERROR_MS = config?.maxWaitBeforeErrorMs || 120000; // From config or 2 minutes -// Gap 1: Retry deduplication - prevents thundering herd on concurrent rate limits -export const RATE_LIMIT_DEDUP_WINDOW_MS = config?.rateLimitDedupWindowMs || 5000; // 5 seconds +// Retry deduplication - prevents thundering herd on concurrent rate limits +export const RATE_LIMIT_DEDUP_WINDOW_MS = config?.rateLimitDedupWindowMs || 2000; 
// 2 seconds +export const RATE_LIMIT_STATE_RESET_MS = config?.rateLimitStateResetMs || 120000; // 2 minutes - reset consecutive429 after inactivity +export const FIRST_RETRY_DELAY_MS = config?.firstRetryDelayMs || 1000; // Quick 1s retry on first 429 +export const SWITCH_ACCOUNT_DELAY_MS = config?.switchAccountDelayMs || 5000; // Delay before switching accounts -// Gap 2: Consecutive failure tracking - extended cooldown after repeated failures +// Consecutive failure tracking - extended cooldown after repeated failures export const MAX_CONSECUTIVE_FAILURES = config?.maxConsecutiveFailures || 3; export const EXTENDED_COOLDOWN_MS = config?.extendedCooldownMs || 60000; // 1 minute -// Gap 4: Capacity exhaustion - shorter retry for model capacity issues (not quota) -export const CAPACITY_RETRY_DELAY_MS = config?.capacityRetryDelayMs || 2000; // 2 seconds -export const MAX_CAPACITY_RETRIES = config?.maxCapacityRetries || 3; +// Capacity exhaustion - progressive backoff tiers for model capacity issues +export const CAPACITY_BACKOFF_TIERS_MS = config?.capacityBackoffTiersMs || [5000, 10000, 20000, 30000, 60000]; +export const MAX_CAPACITY_RETRIES = config?.maxCapacityRetries || 5; + +// Smart backoff by error type +export const BACKOFF_BY_ERROR_TYPE = { + RATE_LIMIT_EXCEEDED: 30000, // 30 seconds + MODEL_CAPACITY_EXHAUSTED: 15000, // 15 seconds + SERVER_ERROR: 20000, // 20 seconds + UNKNOWN: 60000 // 1 minute +}; + +// Progressive backoff tiers for QUOTA_EXHAUSTED (60s, 5m, 30m, 2h) +export const QUOTA_EXHAUSTED_BACKOFF_TIERS_MS = [60000, 300000, 1800000, 7200000]; + +// Minimum backoff floor to prevent "Available in 0s" loops (matches opencode-antigravity-auth) +export const MIN_BACKOFF_MS = 2000; // Thinking model constants export const MIN_SIGNATURE_LENGTH = 50; // Minimum valid thinking signature length @@ -258,10 +275,16 @@ export default { MAX_ACCOUNTS, MAX_WAIT_BEFORE_ERROR_MS, RATE_LIMIT_DEDUP_WINDOW_MS, + RATE_LIMIT_STATE_RESET_MS, + FIRST_RETRY_DELAY_MS, + 
SWITCH_ACCOUNT_DELAY_MS, MAX_CONSECUTIVE_FAILURES, EXTENDED_COOLDOWN_MS, - CAPACITY_RETRY_DELAY_MS, + CAPACITY_BACKOFF_TIERS_MS, MAX_CAPACITY_RETRIES, + BACKOFF_BY_ERROR_TYPE, + QUOTA_EXHAUSTED_BACKOFF_TIERS_MS, + MIN_BACKOFF_MS, MIN_SIGNATURE_LENGTH, GEMINI_MAX_OUTPUT_TOKENS, GEMINI_SKIP_SIGNATURE, diff --git a/src/webui/index.js b/src/webui/index.js index 2326a1d..dbaa09a 100644 --- a/src/webui/index.js +++ b/src/webui/index.js @@ -397,7 +397,7 @@ export function mountWebUI(app, dirname, accountManager) { */ app.post('/api/config', (req, res) => { try { - const { debug, logLevel, maxRetries, retryBaseMs, retryMaxMs, persistTokenCache, defaultCooldownMs, maxWaitBeforeErrorMs, maxAccounts, accountSelection, rateLimitDedupWindowMs, maxConsecutiveFailures, extendedCooldownMs, capacityRetryDelayMs, maxCapacityRetries } = req.body; + const { debug, logLevel, maxRetries, retryBaseMs, retryMaxMs, persistTokenCache, defaultCooldownMs, maxWaitBeforeErrorMs, maxAccounts, accountSelection, rateLimitDedupWindowMs, maxConsecutiveFailures, extendedCooldownMs, maxCapacityRetries } = req.body; // Only allow updating specific fields (security) const updates = {}; @@ -435,9 +435,6 @@ export function mountWebUI(app, dirname, accountManager) { if (typeof extendedCooldownMs === 'number' && extendedCooldownMs >= 10000 && extendedCooldownMs <= 300000) { updates.extendedCooldownMs = extendedCooldownMs; } - if (typeof capacityRetryDelayMs === 'number' && capacityRetryDelayMs >= 500 && capacityRetryDelayMs <= 10000) { - updates.capacityRetryDelayMs = capacityRetryDelayMs; - } if (typeof maxCapacityRetries === 'number' && maxCapacityRetries >= 1 && maxCapacityRetries <= 10) { updates.maxCapacityRetries = maxCapacityRetries; } diff --git a/tests/stress-test.cjs b/tests/stress-test.cjs new file mode 100644 index 0000000..d493afe --- /dev/null +++ b/tests/stress-test.cjs @@ -0,0 +1,93 @@ +/** + * Stress Test - Send multiple concurrent requests to test rate limit handling + * + * Usage: node 
/**
 * Stress Test - Send multiple concurrent requests to test rate limit handling
 *
 * Usage: node tests/stress-test.cjs [count] [model]
 * Example: node tests/stress-test.cjs 10 gemini-3-flash
 */

const BASE_URL = process.env.ANTHROPIC_BASE_URL || 'http://localhost:8080';

// Parse CLI args with an explicit radix (avoids legacy octal surprises) and
// clamp the request count to at least 1 so the summary math never divides by
// zero / produces NaN when a zero or negative count is passed.
const count = Math.max(1, Number.parseInt(process.argv[2], 10) || 8);
const model = process.argv[3] || 'gemini-3-flash';

/**
 * Send one test request to the proxy and report its outcome on the console.
 *
 * Never rejects: HTTP failures and network errors are both captured in the
 * returned record so Promise.all in the driver cannot fail fast.
 *
 * @param {number} id - 1-based request index, echoed in the prompt and logs.
 * @returns {Promise<{id: number, success: boolean, status?: number, error?: string, elapsed: number}>}
 */
async function sendRequest(id) {
  const startTime = Date.now();
  try {
    const response = await fetch(`${BASE_URL}/v1/messages`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'x-api-key': 'test',
        'anthropic-version': '2023-06-01'
      },
      body: JSON.stringify({
        model: model,
        max_tokens: 100,
        messages: [
          { role: 'user', content: `Request ${id}: Say "Hello ${id}" and nothing else.` }
        ]
      })
    });

    const elapsed = Date.now() - startTime;

    if (!response.ok) {
      const errorText = await response.text();
      console.log(`[${id}] ❌ ${response.status} after ${elapsed}ms: ${errorText.substring(0, 100)}`);
      return { id, success: false, status: response.status, elapsed };
    }

    const data = await response.json();
    // Truncate the echoed completion so one request occupies one log line.
    const text = data.content?.[0]?.text?.substring(0, 50) || 'No text';
    console.log(`[${id}] ✅ 200 after ${elapsed}ms: "${text}..."`);
    return { id, success: true, status: 200, elapsed };
  } catch (error) {
    const elapsed = Date.now() - startTime;
    console.log(`[${id}] ❌ Error after ${elapsed}ms: ${error.message}`);
    return { id, success: false, error: error.message, elapsed };
  }
}

/**
 * Fire `count` requests concurrently and print a success/latency summary,
 * including a status-code breakdown when any request failed.
 *
 * @returns {Promise<void>}
 */
async function runStressTest() {
  console.log(`\n🚀 Stress Test: Sending ${count} concurrent requests to ${model}\n`);
  console.log(`Target: ${BASE_URL}/v1/messages\n`);
  console.log('─'.repeat(70));

  const startTime = Date.now();

  // Launch every request before awaiting any, so they truly run concurrently.
  const promises = [];
  for (let i = 1; i <= count; i++) {
    promises.push(sendRequest(i));
  }

  const results = await Promise.all(promises);

  const totalElapsed = Date.now() - startTime;
  console.log('─'.repeat(70));

  // Summary (count >= 1 is guaranteed above, so the average is well-defined)
  const successful = results.filter(r => r.success).length;
  const failed = results.filter(r => !r.success).length;
  const avgElapsed = Math.round(results.reduce((sum, r) => sum + r.elapsed, 0) / results.length);

  console.log(`\n📊 Summary:`);
  console.log(`  Total time: ${totalElapsed}ms`);
  console.log(`  Successful: ${successful}/${count}`);
  console.log(`  Failed: ${failed}/${count}`);
  console.log(`  Avg response time: ${avgElapsed}ms`);

  if (failed > 0) {
    // Group failures by HTTP status (or 'network' for transport errors).
    const errors = results.filter(r => !r.success);
    const statusCounts = {};
    errors.forEach(e => {
      const key = e.status || 'network';
      statusCounts[key] = (statusCounts[key] || 0) + 1;
    });
    console.log(`  Error breakdown: ${JSON.stringify(statusCounts)}`);
  }

  console.log('');
}

runStressTest().catch(console.error);