diff --git a/src/account-manager/index.js b/src/account-manager/index.js
index 79212ea..8db296d 100644
--- a/src/account-manager/index.js
+++ b/src/account-manager/index.js
@@ -14,7 +14,8 @@ import {
     resetAllRateLimits as resetLimits,
     markRateLimited as markLimited,
     markInvalid as markAccountInvalid,
-    getMinWaitTimeMs as getMinWait
+    getMinWaitTimeMs as getMinWait,
+    getRateLimitInfo as getLimitInfo
 } from './rate-limits.js';
 import {
     getTokenForAccount as fetchToken,
@@ -214,6 +215,16 @@ export class AccountManager {
         return getMinWait(this.#accounts, modelId);
     }
 
+    /**
+     * Get rate limit info for a specific account and model
+     * @param {string} email - Email of the account
+     * @param {string} modelId - Model ID to check
+     * @returns {{isRateLimited: boolean, actualResetMs: number|null, waitMs: number}} Rate limit info
+     */
+    getRateLimitInfo(email, modelId) {
+        return getLimitInfo(this.#accounts, email, modelId);
+    }
+
     /**
      * Get OAuth token for an account
      * @param {Object} account - Account object with email and credentials
diff --git a/src/account-manager/rate-limits.js b/src/account-manager/rate-limits.js
index ffe75e6..0dca07a 100644
--- a/src/account-manager/rate-limits.js
+++ b/src/account-manager/rate-limits.js
@@ -22,6 +22,7 @@ export function isAllRateLimited(accounts, modelId) {
 
     return accounts.every(acc => {
         if (acc.isInvalid) return true; // Invalid accounts count as unavailable
+        if (acc.enabled === false) return true; // Disabled accounts count as unavailable
         const modelLimits = acc.modelRateLimits || {};
         const limit = modelLimits[modelId];
         return limit && limit.isRateLimited && limit.resetTime > Date.now();
@@ -118,18 +119,9 @@ export function markRateLimited(accounts, email, resetMs = null, modelId) {
     const account = accounts.find(a => a.email === email);
     if (!account) return false;
 
-    // Use configured cooldown as the maximum wait time
-    // If API returns a reset time, cap it at DEFAULT_COOLDOWN_MS
-    // If API doesn't return a reset time, use DEFAULT_COOLDOWN_MS
-    let cooldownMs;
-    if (resetMs && resetMs > 0) {
-        // API provided a reset time - cap it at configured maximum
-        cooldownMs = Math.min(resetMs, DEFAULT_COOLDOWN_MS);
-    } else {
-        // No reset time from API - use configured default
-        cooldownMs = DEFAULT_COOLDOWN_MS;
-    }
-    const resetTime = Date.now() + cooldownMs;
+    // Store the ACTUAL reset time from the API
+    // This is used to decide whether to wait (short) or switch accounts (long)
+    const actualResetMs = (resetMs && resetMs > 0) ? resetMs : DEFAULT_COOLDOWN_MS;
 
     if (!account.modelRateLimits) {
         account.modelRateLimits = {};
@@ -137,12 +129,20 @@ export function markRateLimited(accounts, email, resetMs = null, modelId) {
 
     account.modelRateLimits[modelId] = {
         isRateLimited: true,
-        resetTime: resetTime
+        resetTime: Date.now() + actualResetMs,  // Actual reset time for decisions
+        actualResetMs: actualResetMs             // Original duration from API
     };
 
-    logger.warn(
-        `[AccountManager] Rate limited: ${email} (model: ${modelId}). Available in ${formatDuration(cooldownMs)}`
-    );
+    // Log appropriately based on duration
+    if (actualResetMs > DEFAULT_COOLDOWN_MS) {
+        logger.warn(
+            `[AccountManager] Quota exhausted: ${email} (model: ${modelId}). Resets in ${formatDuration(actualResetMs)}`
+        );
+    } else {
+        logger.warn(
+            `[AccountManager] Rate limited: ${email} (model: ${modelId}). Available in ${formatDuration(actualResetMs)}`
+        );
+    }
 
     return true;
 }
@@ -209,3 +209,29 @@ export function getMinWaitTimeMs(accounts, modelId) {
 
     return minWait === Infinity ? DEFAULT_COOLDOWN_MS : minWait;
 }
+
+/**
+ * Get the rate limit info for a specific account and model
+ * Returns the actual reset time from API, not capped
+ *
+ * @param {Array} accounts - Array of account objects
+ * @param {string} email - Email of the account
+ * @param {string} modelId - Model ID to check
+ * @returns {{isRateLimited: boolean, actualResetMs: number|null, waitMs: number}} Rate limit info
+ */
+export function getRateLimitInfo(accounts, email, modelId) {
+    const account = accounts.find(a => a.email === email);
+    if (!account || !account.modelRateLimits || !account.modelRateLimits[modelId]) {
+        return { isRateLimited: false, actualResetMs: null, waitMs: 0 };
+    }
+
+    const limit = account.modelRateLimits[modelId];
+    const now = Date.now();
+    const waitMs = limit.resetTime ? Math.max(0, limit.resetTime - now) : 0;
+
+    return {
+        isRateLimited: limit.isRateLimited && waitMs > 0,
+        actualResetMs: limit.actualResetMs || null,
+        waitMs
+    };
+}
diff --git a/src/cloudcode/message-handler.js b/src/cloudcode/message-handler.js
index 596229d..70598b3 100644
--- a/src/cloudcode/message-handler.js
+++ b/src/cloudcode/message-handler.js
@@ -9,6 +9,7 @@ import {
     ANTIGRAVITY_ENDPOINT_FALLBACKS,
     MAX_RETRIES,
     MAX_WAIT_BEFORE_ERROR_MS,
+    DEFAULT_COOLDOWN_MS,
     isThinkingModel
 } from '../constants.js';
 import { convertGoogleToAnthropic } from '../format/index.js';
@@ -39,67 +40,56 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab
 
     // Retry loop with account failover
     // Ensure we try at least as many times as there are accounts to cycle through everyone
-    // +1 to ensure we hit the "all accounts rate-limited" check at the start of the next loop
     const maxAttempts = Math.max(MAX_RETRIES, accountManager.getAccountCount() + 1);
 
     for (let attempt = 0; attempt < maxAttempts; attempt++) {
-        // Use sticky account selection for cache continuity
-        const { account: stickyAccount, waitMs } = accountManager.pickStickyAccount(model);
-        let account = stickyAccount;
+        // Clear any expired rate limits before picking
+        accountManager.clearExpiredLimits();
 
-        // Handle waiting for sticky account
-        if (!account && waitMs > 0) {
-            logger.info(`[CloudCode] Waiting ${formatDuration(waitMs)} for sticky account...`);
-            await sleep(waitMs);
-            accountManager.clearExpiredLimits();
-            account = accountManager.getCurrentStickyAccount(model);
-        }
+        // Get available accounts for this model
+        const availableAccounts = accountManager.getAvailableAccounts(model);
 
-        // Handle all accounts rate-limited
-        if (!account) {
+        // If no accounts available, check if we should wait or throw error
+        if (availableAccounts.length === 0) {
             if (accountManager.isAllRateLimited(model)) {
-                const allWaitMs = accountManager.getMinWaitTimeMs(model);
-                const resetTime = new Date(Date.now() + allWaitMs).toISOString();
+                const minWaitMs = accountManager.getMinWaitTimeMs(model);
+                const resetTime = new Date(Date.now() + minWaitMs).toISOString();
 
                 // If wait time is too long (> 2 minutes), throw error immediately
-                if (allWaitMs > MAX_WAIT_BEFORE_ERROR_MS) {
+                if (minWaitMs > MAX_WAIT_BEFORE_ERROR_MS) {
                     throw new Error(
-                        `RESOURCE_EXHAUSTED: Rate limited on ${model}. Quota will reset after ${formatDuration(allWaitMs)}. Next available: ${resetTime}`
+                        `RESOURCE_EXHAUSTED: Rate limited on ${model}. Quota will reset after ${formatDuration(minWaitMs)}. Next available: ${resetTime}`
                     );
                 }
 
-                // Wait for reset (applies to both single and multi-account modes)
+                // Wait for shortest reset time
                 const accountCount = accountManager.getAccountCount();
-                logger.warn(`[CloudCode] All ${accountCount} account(s) rate-limited. Waiting ${formatDuration(allWaitMs)}...`);
-                await sleep(allWaitMs);
-
-                // Add small buffer after waiting to ensure rate limits have truly expired
-                await sleep(500);
+                logger.warn(`[CloudCode] All ${accountCount} account(s) rate-limited. Waiting ${formatDuration(minWaitMs)}...`);
+                await sleep(minWaitMs + 500); // Add 500ms buffer
                 accountManager.clearExpiredLimits();
-                account = accountManager.pickNext(model);
-
-                // If still no account after waiting, try optimistic reset
-                // This handles cases where the API rate limit is transient
-                if (!account) {
-                    logger.warn('[CloudCode] No account available after wait, attempting optimistic reset...');
-                    accountManager.resetAllRateLimits();
-                    account = accountManager.pickNext(model);
-                }
+                continue; // Retry the loop
             }
 
-            if (!account) {
-                // Check if fallback is enabled and available
-                if (fallbackEnabled) {
-                    const fallbackModel = getFallbackModel(model);
-                    if (fallbackModel) {
-                        logger.warn(`[CloudCode] All accounts exhausted for ${model}. Attempting fallback to ${fallbackModel}`);
-                        // Retry with fallback model
-                        const fallbackRequest = { ...anthropicRequest, model: fallbackModel };
-                        return await sendMessage(fallbackRequest, accountManager, false); // Disable fallback for recursive call
-                    }
+            // Check if fallback is enabled and available
+            if (fallbackEnabled) {
+                const fallbackModel = getFallbackModel(model);
+                if (fallbackModel) {
+                    logger.warn(`[CloudCode] All accounts exhausted for ${model}. Attempting fallback to ${fallbackModel}`);
+                    const fallbackRequest = { ...anthropicRequest, model: fallbackModel };
+                    return await sendMessage(fallbackRequest, accountManager, false);
                 }
-                throw new Error('No accounts available');
             }
+            throw new Error('No accounts available');
+        }
+
+        // Pick sticky account (prefers current for cache continuity)
+        let account = accountManager.getCurrentStickyAccount(model);
+        if (!account) {
+            account = accountManager.pickNext(model);
+        }
+
+        if (!account) {
+            continue; // Shouldn't happen, but safety check
         }
 
         try {
@@ -112,6 +102,8 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab
 
             // Try each endpoint
             let lastError = null;
+            let retriedOnce = false; // Track if we've already retried for short rate limit
+
             for (const endpoint of ANTIGRAVITY_ENDPOINT_FALLBACKS) {
                 try {
                     const url = isThinking
@@ -137,14 +129,51 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab
                         }
 
                         if (response.status === 429) {
-                            // Rate limited on this endpoint - try next endpoint first (DAILY → PROD)
-                            logger.debug(`[CloudCode] Rate limited at ${endpoint}, trying next endpoint...`);
                             const resetMs = parseResetTime(response, errorText);
-                            // Keep minimum reset time across all 429 responses
-                            if (!lastError?.is429 || (resetMs && (!lastError.resetMs || resetMs < lastError.resetMs))) {
-                                lastError = { is429: true, response, errorText, resetMs };
+
+                            // Decision: wait and retry OR switch account
+                            if (resetMs && resetMs > DEFAULT_COOLDOWN_MS) {
+                                // Long-term quota exhaustion (> 10s) - switch to next account
+                                logger.info(`[CloudCode] Quota exhausted for ${account.email} (${formatDuration(resetMs)}), switching account...`);
+                                accountManager.markRateLimited(account.email, resetMs, model);
+                                throw new Error(`QUOTA_EXHAUSTED: ${errorText}`);
+                            } else {
+                                // Short-term rate limit (<= 10s) - wait and retry once
+                                const waitMs = resetMs || DEFAULT_COOLDOWN_MS;
+
+                                if (!retriedOnce) {
+                                    retriedOnce = true;
+                                    logger.info(`[CloudCode] Short rate limit (${formatDuration(waitMs)}), waiting and retrying...`);
+                                    await sleep(waitMs);
+                                    // Retry same endpoint
+                                    const retryResponse = await fetch(url, {
+                                        method: 'POST',
+                                        headers: buildHeaders(token, model, isThinking ? 'text/event-stream' : 'application/json'),
+                                        body: JSON.stringify(payload)
+                                    });
+
+                                    if (retryResponse.ok) {
+                                        // Process retry response
+                                        if (isThinking) {
+                                            return await parseThinkingSSEResponse(retryResponse, anthropicRequest.model);
+                                        }
+                                        const data = await retryResponse.json();
+                                        logger.debug('[CloudCode] Response received after retry');
+                                        return convertGoogleToAnthropic(data, anthropicRequest.model);
+                                    }
+
+                                    // Retry also failed - parse new reset time
+                                    const retryErrorText = await retryResponse.text();
+                                    const retryResetMs = parseResetTime(retryResponse, retryErrorText);
+                                    logger.warn(`[CloudCode] Retry also failed, marking and switching...`);
+                                    accountManager.markRateLimited(account.email, retryResetMs || waitMs, model);
+                                    throw new Error(`RATE_LIMITED_AFTER_RETRY: ${retryErrorText}`);
+                                } else {
+                                    // Already retried once, mark and switch
+                                    accountManager.markRateLimited(account.email, waitMs, model);
+                                    throw new Error(`RATE_LIMITED: ${errorText}`);
+                                }
                             }
-                            continue;
                         }
 
                         if (response.status >= 400) {
@@ -179,7 +208,6 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab
 
             // If all endpoints failed for this account
             if (lastError) {
-                // If all endpoints returned 429, mark account as rate-limited
                 if (lastError.is429) {
                     logger.warn(`[CloudCode] All endpoints rate-limited for ${account.email}`);
                     accountManager.markRateLimited(account.email, lastError.resetMs, model);
@@ -199,18 +227,17 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab
                 logger.warn(`[CloudCode] Account ${account.email} has invalid credentials, trying next...`);
                 continue;
             }
-            // Non-rate-limit error: throw immediately
-            // UNLESS it's a 500 error, then we treat it as a "soft" failure for this account and try the next one
+            // Handle 5xx errors
             if (error.message.includes('API error 5') || error.message.includes('500') || error.message.includes('503')) {
                 logger.warn(`[CloudCode] Account ${account.email} failed with 5xx error, trying next...`);
-                accountManager.pickNext(model); // Force advance to next account
+                accountManager.pickNext(model);
                 continue;
             }
 
             if (isNetworkError(error)) {
                 logger.warn(`[CloudCode] Network error for ${account.email}, trying next account... (${error.message})`);
-                await sleep(1000); // Brief pause before retry
-                accountManager.pickNext(model); // Advance to next account
+                await sleep(1000);
+                accountManager.pickNext(model);
                 continue;
             }
 
@@ -224,7 +251,7 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab
         if (fallbackModel) {
             logger.warn(`[CloudCode] All retries exhausted for ${model}. Attempting fallback to ${fallbackModel}`);
             const fallbackRequest = { ...anthropicRequest, model: fallbackModel };
-            return await sendMessage(fallbackRequest, accountManager, false); // Disable fallback for recursive call
+            return await sendMessage(fallbackRequest, accountManager, false);
         }
     }
 
diff --git a/src/cloudcode/model-api.js b/src/cloudcode/model-api.js
index 4ade5ee..35f5e2b 100644
--- a/src/cloudcode/model-api.js
+++ b/src/cloudcode/model-api.js
@@ -57,22 +57,26 @@ export async function listModels(token) {
  * Returns model quotas including remaining fraction and reset time
  *
  * @param {string} token - OAuth access token
+ * @param {string} [projectId] - Optional project ID for accurate quota info
  * @returns {Promise<Object>} Raw response from fetchAvailableModels API
  */
-export async function fetchAvailableModels(token) {
+export async function fetchAvailableModels(token, projectId = null) {
     const headers = {
         'Authorization': `Bearer ${token}`,
         'Content-Type': 'application/json',
         ...ANTIGRAVITY_HEADERS
     };
 
+    // Include project ID in body for accurate quota info (per Quotio implementation)
+    const body = projectId ? { project: projectId } : {};
+
     for (const endpoint of ANTIGRAVITY_ENDPOINT_FALLBACKS) {
         try {
             const url = `${endpoint}/v1internal:fetchAvailableModels`;
             const response = await fetch(url, {
                 method: 'POST',
                 headers,
-                body: JSON.stringify({})
+                body: JSON.stringify(body)
             });
 
             if (!response.ok) {
@@ -95,10 +99,11 @@ export async function fetchAvailableModels(token) {
  * Extracts quota info (remaining fraction and reset time) for each model
  *
  * @param {string} token - OAuth access token
+ * @param {string} [projectId] - Optional project ID for accurate quota info
  * @returns {Promise<Object>} Map of modelId -> { remainingFraction, resetTime }
  */
-export async function getModelQuotas(token) {
-    const data = await fetchAvailableModels(token);
+export async function getModelQuotas(token, projectId = null) {
+    const data = await fetchAvailableModels(token, projectId);
     if (!data || !data.models) return {};
 
     const quotas = {};
@@ -108,7 +113,8 @@ export async function getModelQuotas(token) {
 
         if (modelData.quotaInfo) {
             quotas[modelId] = {
-                remainingFraction: modelData.quotaInfo.remainingFraction ?? null,
+                // When remainingFraction is missing but resetTime is present, quota is exhausted (0%)
+                remainingFraction: modelData.quotaInfo.remainingFraction ?? (modelData.quotaInfo.resetTime ? 0 : null),
                 resetTime: modelData.quotaInfo.resetTime ?? null
             };
         }
diff --git a/src/cloudcode/rate-limit-parser.js b/src/cloudcode/rate-limit-parser.js
index 6e53b15..2fddb26 100644
--- a/src/cloudcode/rate-limit-parser.js
+++ b/src/cloudcode/rate-limit-parser.js
@@ -78,7 +78,7 @@ export function parseResetTime(responseOrError, errorText = '') {
 
         // Try to extract "quotaResetDelay" first (e.g. "754.431528ms" or "1.5s")
         // This is Google's preferred format for rate limit reset delay
-        const quotaDelayMatch = msg.match(/quotaResetDelay[:\s"]+(\\d+(?:\\.\\d+)?)(ms|s)/i);
+        const quotaDelayMatch = msg.match(/quotaResetDelay[:\s"]+(\d+(?:\.\d+)?)(ms|s)/i);
         if (quotaDelayMatch) {
             const value = parseFloat(quotaDelayMatch[1]);
             const unit = quotaDelayMatch[2].toLowerCase();
@@ -103,7 +103,7 @@ export function parseResetTime(responseOrError, errorText = '') {
         // Try to extract "retry-after-ms" or "retryDelay" - check seconds format first (e.g. "7739.23s")
         // Added stricter regex to avoid partial matches
         if (!resetMs) {
-             const secMatch = msg.match(/(?:retry[-_]?after[-_]?ms|retryDelay)[:\s"]+([\\d\\.]+)(?:s\b|s")/i);
+             const secMatch = msg.match(/(?:retry[-_]?after[-_]?ms|retryDelay)[:\s"]+([\d.]+)(?:s\b|s")/i);
              if (secMatch) {
                  resetMs = Math.ceil(parseFloat(secMatch[1]) * 1000);
                  logger.debug(`[CloudCode] Parsed retry seconds from body (precise): ${resetMs}ms`);
diff --git a/src/cloudcode/streaming-handler.js b/src/cloudcode/streaming-handler.js
index 3f0dc55..188c6de 100644
--- a/src/cloudcode/streaming-handler.js
+++ b/src/cloudcode/streaming-handler.js
@@ -9,7 +9,8 @@ import {
     ANTIGRAVITY_ENDPOINT_FALLBACKS,
     MAX_RETRIES,
     MAX_EMPTY_RESPONSE_RETRIES,
-    MAX_WAIT_BEFORE_ERROR_MS
+    MAX_WAIT_BEFORE_ERROR_MS,
+    DEFAULT_COOLDOWN_MS
 } from '../constants.js';
 import { isRateLimitError, isAuthError, isEmptyResponseError } from '../errors.js';
 import { formatDuration, sleep, isNetworkError } from '../utils/helpers.js';
@@ -38,68 +39,57 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb
 
     // Retry loop with account failover
     // Ensure we try at least as many times as there are accounts to cycle through everyone
-    // +1 to ensure we hit the "all accounts rate-limited" check at the start of the next loop
     const maxAttempts = Math.max(MAX_RETRIES, accountManager.getAccountCount() + 1);
 
     for (let attempt = 0; attempt < maxAttempts; attempt++) {
-        // Use sticky account selection for cache continuity
-        const { account: stickyAccount, waitMs } = accountManager.pickStickyAccount(model);
-        let account = stickyAccount;
+        // Clear any expired rate limits before picking
+        accountManager.clearExpiredLimits();
 
-        // Handle waiting for sticky account
-        if (!account && waitMs > 0) {
-            logger.info(`[CloudCode] Waiting ${formatDuration(waitMs)} for sticky account...`);
-            await sleep(waitMs);
-            accountManager.clearExpiredLimits();
-            account = accountManager.getCurrentStickyAccount(model);
-        }
+        // Get available accounts for this model
+        const availableAccounts = accountManager.getAvailableAccounts(model);
 
-        // Handle all accounts rate-limited
-        if (!account) {
+        // If no accounts available, check if we should wait or throw error
+        if (availableAccounts.length === 0) {
             if (accountManager.isAllRateLimited(model)) {
-                const allWaitMs = accountManager.getMinWaitTimeMs(model);
-                const resetTime = new Date(Date.now() + allWaitMs).toISOString();
+                const minWaitMs = accountManager.getMinWaitTimeMs(model);
+                const resetTime = new Date(Date.now() + minWaitMs).toISOString();
 
                 // If wait time is too long (> 2 minutes), throw error immediately
-                if (allWaitMs > MAX_WAIT_BEFORE_ERROR_MS) {
+                if (minWaitMs > MAX_WAIT_BEFORE_ERROR_MS) {
                     throw new Error(
-                        `RESOURCE_EXHAUSTED: Rate limited on ${model}. Quota will reset after ${formatDuration(allWaitMs)}. Next available: ${resetTime}`
+                        `RESOURCE_EXHAUSTED: Rate limited on ${model}. Quota will reset after ${formatDuration(minWaitMs)}. Next available: ${resetTime}`
                     );
                 }
 
-                // Wait for reset (applies to both single and multi-account modes)
+                // Wait for shortest reset time
                 const accountCount = accountManager.getAccountCount();
-                logger.warn(`[CloudCode] All ${accountCount} account(s) rate-limited. Waiting ${formatDuration(allWaitMs)}...`);
-                await sleep(allWaitMs);
-
-                // Add small buffer after waiting to ensure rate limits have truly expired
-                await sleep(500);
+                logger.warn(`[CloudCode] All ${accountCount} account(s) rate-limited. Waiting ${formatDuration(minWaitMs)}...`);
+                await sleep(minWaitMs + 500); // Add 500ms buffer
                 accountManager.clearExpiredLimits();
-                account = accountManager.pickNext(model);
-
-                // If still no account after waiting, try optimistic reset
-                // This handles cases where the API rate limit is transient
-                if (!account) {
-                    logger.warn('[CloudCode] No account available after wait, attempting optimistic reset...');
-                    accountManager.resetAllRateLimits();
-                    account = accountManager.pickNext(model);
-                }
+                continue; // Retry the loop
             }
 
-            if (!account) {
-                // Check if fallback is enabled and available
-                if (fallbackEnabled) {
-                    const fallbackModel = getFallbackModel(model);
-                    if (fallbackModel) {
-                        logger.warn(`[CloudCode] All accounts exhausted for ${model}. Attempting fallback to ${fallbackModel} (streaming)`);
-                        // Retry with fallback model
-                        const fallbackRequest = { ...anthropicRequest, model: fallbackModel };
-                        yield* sendMessageStream(fallbackRequest, accountManager, false); // Disable fallback for recursive call
-                        return;
-                    }
+            // Check if fallback is enabled and available
+            if (fallbackEnabled) {
+                const fallbackModel = getFallbackModel(model);
+                if (fallbackModel) {
+                    logger.warn(`[CloudCode] All accounts exhausted for ${model}. Attempting fallback to ${fallbackModel} (streaming)`);
+                    const fallbackRequest = { ...anthropicRequest, model: fallbackModel };
+                    yield* sendMessageStream(fallbackRequest, accountManager, false);
+                    return;
                 }
-                throw new Error('No accounts available');
             }
+            throw new Error('No accounts available');
+        }
+
+        // Pick sticky account (prefers current for cache continuity)
+        let account = accountManager.getCurrentStickyAccount(model);
+        if (!account) {
+            account = accountManager.pickNext(model);
+        }
+
+        if (!account) {
+            continue; // Shouldn't happen, but safety check
         }
 
         try {
@@ -112,6 +102,8 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb
 
             // Try each endpoint for streaming
             let lastError = null;
+            let retriedOnce = false; // Track if we've already retried for short rate limit
+
             for (const endpoint of ANTIGRAVITY_ENDPOINT_FALLBACKS) {
                 try {
                     const url = `${endpoint}/v1internal:streamGenerateContent?alt=sse`;
@@ -134,14 +126,48 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb
                         }
 
                         if (response.status === 429) {
-                            // Rate limited on this endpoint - try next endpoint first (DAILY → PROD)
-                            logger.debug(`[CloudCode] Stream rate limited at ${endpoint}, trying next endpoint...`);
                             const resetMs = parseResetTime(response, errorText);
-                            // Keep minimum reset time across all 429 responses
-                            if (!lastError?.is429 || (resetMs && (!lastError.resetMs || resetMs < lastError.resetMs))) {
-                                lastError = { is429: true, response, errorText, resetMs };
+
+                            // Decision: wait and retry OR switch account
+                            if (resetMs && resetMs > DEFAULT_COOLDOWN_MS) {
+                                // Long-term quota exhaustion (> 10s) - switch to next account
+                                logger.info(`[CloudCode] Quota exhausted for ${account.email} (${formatDuration(resetMs)}), switching account...`);
+                                accountManager.markRateLimited(account.email, resetMs, model);
+                                throw new Error(`QUOTA_EXHAUSTED: ${errorText}`);
+                            } else {
+                                // Short-term rate limit (<= 10s) - wait and retry once
+                                const waitMs = resetMs || DEFAULT_COOLDOWN_MS;
+
+                                if (!retriedOnce) {
+                                    retriedOnce = true;
+                                    logger.info(`[CloudCode] Short rate limit (${formatDuration(waitMs)}), waiting and retrying...`);
+                                    await sleep(waitMs);
+                                    // Retry same endpoint
+                                    const retryResponse = await fetch(url, {
+                                        method: 'POST',
+                                        headers: buildHeaders(token, model, 'text/event-stream'),
+                                        body: JSON.stringify(payload)
+                                    });
+
+                                    if (retryResponse.ok) {
+                                        // Stream the retry response
+                                        yield* streamSSEResponse(retryResponse, anthropicRequest.model);
+                                        logger.debug('[CloudCode] Stream completed after retry');
+                                        return;
+                                    }
+
+                                    // Retry also failed - parse new reset time
+                                    const retryErrorText = await retryResponse.text();
+                                    const retryResetMs = parseResetTime(retryResponse, retryErrorText);
+                                    logger.warn(`[CloudCode] Retry also failed, marking and switching...`);
+                                    accountManager.markRateLimited(account.email, retryResetMs || waitMs, model);
+                                    throw new Error(`RATE_LIMITED_AFTER_RETRY: ${retryErrorText}`);
+                                } else {
+                                    // Already retried once, mark and switch
+                                    accountManager.markRateLimited(account.email, waitMs, model);
+                                    throw new Error(`RATE_LIMITED: ${errorText}`);
+                                }
                             }
-                            continue;
                         }
 
                         lastError = new Error(`API error ${response.status}: ${errorText}`);
@@ -156,7 +182,6 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb
                     }
 
                     // Stream the response with retry logic for empty responses
-                    // Uses a for-loop for clearer retry semantics
                     let currentResponse = response;
 
                     for (let emptyRetries = 0; emptyRetries <= MAX_EMPTY_RESPONSE_RETRIES; emptyRetries++) {
@@ -207,28 +232,22 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb
                                     throw new Error(`401 AUTH_INVALID during retry: ${retryErrorText}`);
                                 }
 
-                                // For 5xx errors, don't pass to streamer - just continue to next retry
+                                // For 5xx errors, continue retrying
                                 if (currentResponse.status >= 500) {
                                     logger.warn(`[CloudCode] Retry got ${currentResponse.status}, will retry...`);
-                                    // Don't continue here - let the loop increment and refetch
-                                    // Set currentResponse to null to force refetch at loop start
-                                    emptyRetries--; // Compensate for loop increment since we didn't actually try
                                     await sleep(1000);
-                                    // Refetch immediately for 5xx
                                     currentResponse = await fetch(url, {
                                         method: 'POST',
                                         headers: buildHeaders(token, model, 'text/event-stream'),
                                         body: JSON.stringify(payload)
                                     });
                                     if (currentResponse.ok) {
-                                        continue; // Try streaming with new response
+                                        continue;
                                     }
-                                    // If still failing, let it fall through to throw
                                 }
 
                                 throw new Error(`Empty response retry failed: ${currentResponse.status} - ${retryErrorText}`);
                             }
-                            // Response is OK, loop will continue to try streamSSEResponse
                         }
                     }
 
@@ -237,7 +256,7 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb
                         throw endpointError; // Re-throw to trigger account switch
                     }
                     if (isEmptyResponseError(endpointError)) {
-                        throw endpointError; // Re-throw empty response errors to outer handler
+                        throw endpointError;
                     }
                     logger.warn(`[CloudCode] Stream error at ${endpoint}:`, endpointError.message);
                     lastError = endpointError;
@@ -246,7 +265,6 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb
 
             // If all endpoints failed for this account
             if (lastError) {
-                // If all endpoints returned 429, mark account as rate-limited
                 if (lastError.is429) {
                     logger.warn(`[CloudCode] All endpoints rate-limited for ${account.email}`);
                     accountManager.markRateLimited(account.email, lastError.resetMs, model);
@@ -266,18 +284,17 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb
                 logger.warn(`[CloudCode] Account ${account.email} has invalid credentials, trying next...`);
                 continue;
             }
-            // Non-rate-limit error: throw immediately
-            // UNLESS it's a 500 error, then we treat it as a "soft" failure for this account and try the next one
+            // Handle 5xx errors
             if (error.message.includes('API error 5') || error.message.includes('500') || error.message.includes('503')) {
                 logger.warn(`[CloudCode] Account ${account.email} failed with 5xx stream error, trying next...`);
-                accountManager.pickNext(model); // Force advance to next account
+                accountManager.pickNext(model);
                 continue;
             }
 
             if (isNetworkError(error)) {
                 logger.warn(`[CloudCode] Network error for ${account.email} (stream), trying next account... (${error.message})`);
-                await sleep(1000); // Brief pause before retry
-                accountManager.pickNext(model); // Advance to next account
+                await sleep(1000);
+                accountManager.pickNext(model);
                 continue;
             }
 
@@ -291,7 +308,7 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb
         if (fallbackModel) {
             logger.warn(`[CloudCode] All retries exhausted for ${model}. Attempting fallback to ${fallbackModel} (streaming)`);
             const fallbackRequest = { ...anthropicRequest, model: fallbackModel };
-            yield* sendMessageStream(fallbackRequest, accountManager, false); // Disable fallback for recursive call
+            yield* sendMessageStream(fallbackRequest, accountManager, false);
             return;
         }
     }
diff --git a/src/constants.js b/src/constants.js
index 022ea48..8fadc89 100644
--- a/src/constants.js
+++ b/src/constants.js
@@ -69,15 +69,16 @@ export const ONBOARD_USER_ENDPOINTS = ANTIGRAVITY_ENDPOINT_FALLBACKS;
 
 // Hybrid headers specifically for loadCodeAssist
 // Uses google-api-nodejs-client User-Agent (required for project discovery on some accounts)
-export const LOAD_CODE_ASSIST_HEADERS = {
-    'User-Agent': 'google-api-nodejs-client/9.15.1',
-    'X-Goog-Api-Client': 'google-cloud-sdk vscode_cloudshelleditor/0.1',
-    'Client-Metadata': JSON.stringify({
-        ideType: 'IDE_UNSPECIFIED',
-        platform: 'PLATFORM_UNSPECIFIED',
-        pluginType: 'GEMINI'
-    })
-};
+// export const LOAD_CODE_ASSIST_HEADERS = {
+//     'User-Agent': 'google-api-nodejs-client/9.15.1',
+//     'X-Goog-Api-Client': 'google-cloud-sdk vscode_cloudshelleditor/0.1',
+//     'Client-Metadata': JSON.stringify({
+//         ideType: 'IDE_UNSPECIFIED',
+//         platform: 'PLATFORM_UNSPECIFIED',
+//         pluginType: 'GEMINI'
+//     })
+// };
+export const LOAD_CODE_ASSIST_HEADERS = ANTIGRAVITY_HEADERS;
 
 // Default project ID if none can be discovered
 export const DEFAULT_PROJECT_ID = 'rising-fact-p41fc';
diff --git a/src/server.js b/src/server.js
index c498da5..43e2ccb 100644
--- a/src/server.js
+++ b/src/server.js
@@ -214,7 +214,8 @@ app.get('/health', async (req, res) => {
 
                 try {
                     const token = await accountManager.getTokenForAccount(account);
-                    const quotas = await getModelQuotas(token);
+                    const projectId = account.subscription?.projectId || null;
+                    const quotas = await getModelQuotas(token, projectId);
 
                     // Format quotas for readability
                     const formattedQuotas = {};
@@ -309,11 +310,11 @@ app.get('/account-limits', async (req, res) => {
                 try {
                     const token = await accountManager.getTokenForAccount(account);
 
-                    // Fetch both quotas and subscription tier in parallel
-                    const [quotas, subscription] = await Promise.all([
-                        getModelQuotas(token),
-                        getSubscriptionTier(token)
-                    ]);
+                    // Fetch subscription tier first to get project ID
+                    const subscription = await getSubscriptionTier(token);
+
+                    // Then fetch quotas with project ID for accurate quota info
+                    const quotas = await getModelQuotas(token, subscription.projectId);
 
                     // Update account object with fresh data
                     account.subscription = {