feat: add configurable account selection strategies

Refactor account selection into a strategy pattern with three options: - Sticky: cache-optimized, stays on same account until rate-limited - Round-robin: load-balanced, rotates every request - Hybrid (default): smart distribution using health scores, token buckets, and LRU The hybrid strategy uses multiple signals for optimal account selection: health tracking for reliability, client-side token buckets for rate limiting, and LRU freshness to prefer rested accounts. Includes WebUI settings for strategy selection and unit tests. Co-Authored-By: Claude <noreply@anthropic.com>
2026-01-18 03:48:43 +05:30
parent 973234372b
commit 5ae19a5b72
31 changed files with 2721 additions and 353 deletions
--- a/src/cloudcode/message-handler.js
+++ b/src/cloudcode/message-handler.js
@@ -10,6 +10,11 @@ import {
    MAX_RETRIES,
    MAX_WAIT_BEFORE_ERROR_MS,
    DEFAULT_COOLDOWN_MS,
+    RATE_LIMIT_DEDUP_WINDOW_MS,
+    MAX_CONSECUTIVE_FAILURES,
+    EXTENDED_COOLDOWN_MS,
+    CAPACITY_RETRY_DELAY_MS,
+    MAX_CAPACITY_RETRIES,
    isThinkingModel
 } from '../constants.js';
 import { convertGoogleToAnthropic } from '../format/index.js';
@@ -21,6 +26,85 @@ import { buildCloudCodeRequest, buildHeaders } from './request-builder.js';
 import { parseThinkingSSEResponse } from './sse-parser.js';
 import { getFallbackModel } from '../fallback-config.js';

+/**
+ * Gap 1: Rate limit deduplication - prevents thundering herd on concurrent rate limits
+ * Tracks last rate limit timestamp per model to skip duplicate retries
+ */
+const lastRateLimitTimestamps = new Map(); // modelId -> timestamp
+
+/**
+ * Check if we should skip retry due to recent rate limit on this model
+ * @param {string} model - Model ID
+ * @returns {boolean} True if retry should be skipped (within dedup window)
+ */
+function shouldSkipRetryDueToDedup(model) {
+    const lastTimestamp = lastRateLimitTimestamps.get(model);
+    if (!lastTimestamp) return false;
+
+    const elapsed = Date.now() - lastTimestamp;
+    if (elapsed < RATE_LIMIT_DEDUP_WINDOW_MS) {
+        logger.debug(`[CloudCode] Rate limit on ${model} within dedup window (${elapsed}ms ago), skipping retry`);
+        return true;
+    }
+    return false;
+}
+
+/**
+ * Record rate limit timestamp for deduplication
+ * @param {string} model - Model ID
+ */
+function recordRateLimitTimestamp(model) {
+    lastRateLimitTimestamps.set(model, Date.now());
+}
+
+/**
+ * Clear rate limit timestamp after successful retry
+ * @param {string} model - Model ID
+ */
+function clearRateLimitTimestamp(model) {
+    lastRateLimitTimestamps.delete(model);
+}
+
+/**
+ * Gap 3: Detect permanent authentication failures that require re-authentication
+ * These should mark the account as invalid rather than just clearing cache
+ * @param {string} errorText - Error message from API
+ * @returns {boolean} True if permanent auth failure
+ */
+function isPermanentAuthFailure(errorText) {
+    const lower = (errorText || '').toLowerCase();
+    return lower.includes('invalid_grant') ||
+        lower.includes('token revoked') ||
+        lower.includes('token has been expired or revoked') ||
+        lower.includes('token_revoked') ||
+        lower.includes('invalid_client') ||
+        lower.includes('credentials are invalid');
+}
+
+/**
+ * Gap 4: Detect if 429 error is due to model capacity (not user quota)
+ * Capacity issues should retry on same account with shorter delay
+ * @param {string} errorText - Error message from API
+ * @returns {boolean} True if capacity exhausted (not quota)
+ */
+function isModelCapacityExhausted(errorText) {
+    const lower = (errorText || '').toLowerCase();
+    return lower.includes('model_capacity_exhausted') ||
+        lower.includes('capacity_exhausted') ||
+        lower.includes('model is currently overloaded') ||
+        lower.includes('service temporarily unavailable');
+}
+
+// Periodically clean up stale dedup timestamps (every 60 seconds)
+setInterval(() => {
+    const cutoff = Date.now() - 60000; // 1 minute
+    for (const [model, timestamp] of lastRateLimitTimestamps.entries()) {
+        if (timestamp < cutoff) {
+            lastRateLimitTimestamps.delete(model);
+        }
+    }
+}, 60000);
+
 /**
 * Send a non-streaming request to Cloud Code with multi-account support
 * Uses SSE endpoint for thinking models (non-streaming doesn't return thinking blocks)
@@ -83,10 +167,14 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab
            throw new Error('No accounts available');
        }

-        // Pick sticky account (prefers current for cache continuity)
-        let account = accountManager.getCurrentStickyAccount(model);
-        if (!account) {
-            account = accountManager.pickNext(model);
+        // Select account using configured strategy
+        const { account, waitMs } = accountManager.selectAccount(model);
+
+        // If strategy returns a wait time, sleep and retry
+        if (!account && waitMs > 0) {
+            logger.info(`[CloudCode] Waiting ${formatDuration(waitMs)} for account...`);
+            await sleep(waitMs + 500);
+            continue;
        }

        if (!account) {
@@ -101,11 +189,14 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab

            logger.debug(`[CloudCode] Sending request for model: ${model}`);

-            // Try each endpoint
+            // Try each endpoint with index-based loop for capacity retry support
            let lastError = null;
            let retriedOnce = false; // Track if we've already retried for short rate limit
+            let capacityRetryCount = 0; // Gap 4: Track capacity exhaustion retries
+            let endpointIndex = 0;

-            for (const endpoint of ANTIGRAVITY_ENDPOINT_FALLBACKS) {
+            while (endpointIndex < ANTIGRAVITY_ENDPOINT_FALLBACKS.length) {
+                const endpoint = ANTIGRAVITY_ENDPOINT_FALLBACKS[endpointIndex];
                try {
                    const url = isThinking
                        ? `${endpoint}/v1internal:streamGenerateContent?alt=sse`
@@ -122,16 +213,45 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab
                        logger.warn(`[CloudCode] Error at ${endpoint}: ${response.status} - ${errorText}`);

                        if (response.status === 401) {
-                            // Auth error - clear caches and retry with fresh token
-                            logger.warn('[CloudCode] Auth error, refreshing token...');
+                            // Gap 3: Check for permanent auth failures
+                            if (isPermanentAuthFailure(errorText)) {
+                                logger.error(`[CloudCode] Permanent auth failure for ${account.email}: ${errorText.substring(0, 100)}`);
+                                accountManager.markInvalid(account.email, 'Token revoked - re-authentication required');
+                                throw new Error(`AUTH_INVALID_PERMANENT: ${errorText}`);
+                            }
+
+                            // Transient auth error - clear caches and retry with fresh token
+                            logger.warn('[CloudCode] Transient auth error, refreshing token...');
                            accountManager.clearTokenCache(account.email);
                            accountManager.clearProjectCache(account.email);
+                            endpointIndex++;
                            continue;
                        }

                        if (response.status === 429) {
                            const resetMs = parseResetTime(response, errorText);

+                            // Gap 4: Check if capacity issue (NOT quota) - retry SAME endpoint
+                            if (isModelCapacityExhausted(errorText)) {
+                                if (capacityRetryCount < MAX_CAPACITY_RETRIES) {
+                                    capacityRetryCount++;
+                                    const waitMs = resetMs || CAPACITY_RETRY_DELAY_MS;
+                                    logger.info(`[CloudCode] Model capacity exhausted, retry ${capacityRetryCount}/${MAX_CAPACITY_RETRIES} after ${formatDuration(waitMs)}...`);
+                                    await sleep(waitMs);
+                                    // Don't increment endpointIndex - retry same endpoint
+                                    continue;
+                                }
+                                // Max capacity retries exceeded - treat as quota exhaustion
+                                logger.warn(`[CloudCode] Max capacity retries (${MAX_CAPACITY_RETRIES}) exceeded, switching account`);
+                            }
+
+                            // Gap 1: Check deduplication window to prevent thundering herd
+                            if (shouldSkipRetryDueToDedup(model)) {
+                                logger.info(`[CloudCode] Skipping retry due to recent rate limit, switching account...`);
+                                accountManager.markRateLimited(account.email, resetMs || DEFAULT_COOLDOWN_MS, model);
+                                throw new Error(`RATE_LIMITED_DEDUP: ${errorText}`);
+                            }
+
                            // Decision: wait and retry OR switch account
                            if (resetMs && resetMs > DEFAULT_COOLDOWN_MS) {
                                // Long-term quota exhaustion (> 10s) - switch to next account
@@ -144,31 +264,11 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab

                                if (!retriedOnce) {
                                    retriedOnce = true;
+                                    recordRateLimitTimestamp(model); // Gap 1: Record before retry
                                    logger.info(`[CloudCode] Short rate limit (${formatDuration(waitMs)}), waiting and retrying...`);
                                    await sleep(waitMs);
-                                    // Retry same endpoint
-                                    const retryResponse = await fetch(url, {
-                                        method: 'POST',
-                                        headers: buildHeaders(token, model, isThinking ? 'text/event-stream' : 'application/json'),
-                                        body: JSON.stringify(payload)
-                                    });
-
-                                    if (retryResponse.ok) {
-                                        // Process retry response
-                                        if (isThinking) {
-                                            return await parseThinkingSSEResponse(retryResponse, anthropicRequest.model);
-                                        }
-                                        const data = await retryResponse.json();
-                                        logger.debug('[CloudCode] Response received after retry');
-                                        return convertGoogleToAnthropic(data, anthropicRequest.model);
-                                    }
-
-                                    // Retry also failed - parse new reset time
-                                    const retryErrorText = await retryResponse.text();
-                                    const retryResetMs = parseResetTime(retryResponse, retryErrorText);
-                                    logger.warn(`[CloudCode] Retry also failed, marking and switching...`);
-                                    accountManager.markRateLimited(account.email, retryResetMs || waitMs, model);
-                                    throw new Error(`RATE_LIMITED_AFTER_RETRY: ${retryErrorText}`);
+                                    // Don't increment endpointIndex - retry same endpoint
+                                    continue;
                                } else {
                                    // Already retried once, mark and switch
                                    accountManager.markRateLimited(account.email, waitMs, model);
@@ -184,18 +284,26 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab
                                logger.warn(`[CloudCode] ${response.status} error, waiting 1s before retry...`);
                                await sleep(1000);
                            }
+                            endpointIndex++;
                            continue;
                        }
                    }

                    // For thinking models, parse SSE and accumulate all parts
                    if (isThinking) {
-                        return await parseThinkingSSEResponse(response, anthropicRequest.model);
+                        const result = await parseThinkingSSEResponse(response, anthropicRequest.model);
+                        // Gap 1: Clear timestamp on success
+                        clearRateLimitTimestamp(model);
+                        accountManager.notifySuccess(account, model);
+                        return result;
                    }

                    // Non-thinking models use regular JSON
                    const data = await response.json();
                    logger.debug('[CloudCode] Response received');
+                    // Gap 1: Clear timestamp on success
+                    clearRateLimitTimestamp(model);
+                    accountManager.notifySuccess(account, model);
                    return convertGoogleToAnthropic(data, anthropicRequest.model);

                } catch (endpointError) {
@@ -204,6 +312,7 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab
                    }
                    logger.warn(`[CloudCode] Error at ${endpoint}:`, endpointError.message);
                    lastError = endpointError;
+                    endpointIndex++;
                }
            }

@@ -219,7 +328,8 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab

        } catch (error) {
            if (isRateLimitError(error)) {
-                // Rate limited - already marked, continue to next account
+                // Rate limited - already marked, notify strategy and continue to next account
+                accountManager.notifyRateLimit(account, model);
                logger.info(`[CloudCode] Account ${account.email} rate-limited, trying next...`);
                continue;
            }
@@ -230,15 +340,31 @@ export async function sendMessage(anthropicRequest, accountManager, fallbackEnab
            }
            // Handle 5xx errors
            if (error.message.includes('API error 5') || error.message.includes('500') || error.message.includes('503')) {
-                logger.warn(`[CloudCode] Account ${account.email} failed with 5xx error, trying next...`);
-                accountManager.pickNext(model);
+                accountManager.notifyFailure(account, model);
+
+                // Gap 2: Check consecutive failures for extended cooldown
+                const consecutiveFailures = accountManager.getHealthTracker()?.getConsecutiveFailures(account.email) || 0;
+                if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
+                    logger.warn(`[CloudCode] Account ${account.email} has ${consecutiveFailures} consecutive failures, applying extended cooldown (${formatDuration(EXTENDED_COOLDOWN_MS)})`);
+                    accountManager.markRateLimited(account.email, EXTENDED_COOLDOWN_MS, model);
+                } else {
+                    logger.warn(`[CloudCode] Account ${account.email} failed with 5xx error, trying next...`);
+                }
                continue;
            }

            if (isNetworkError(error)) {
-                logger.warn(`[CloudCode] Network error for ${account.email}, trying next account... (${error.message})`);
+                accountManager.notifyFailure(account, model);
+
+                // Gap 2: Check consecutive failures for extended cooldown
+                const consecutiveFailures = accountManager.getHealthTracker()?.getConsecutiveFailures(account.email) || 0;
+                if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
+                    logger.warn(`[CloudCode] Account ${account.email} has ${consecutiveFailures} consecutive network failures, applying extended cooldown (${formatDuration(EXTENDED_COOLDOWN_MS)})`);
+                    accountManager.markRateLimited(account.email, EXTENDED_COOLDOWN_MS, model);
+                } else {
+                    logger.warn(`[CloudCode] Network error for ${account.email}, trying next account... (${error.message})`);
+                }
                await sleep(1000);
-                accountManager.pickNext(model);
                continue;
            }

--- a/src/cloudcode/streaming-handler.js
+++ b/src/cloudcode/streaming-handler.js
@@ -10,7 +10,12 @@ import {
    MAX_RETRIES,
    MAX_EMPTY_RESPONSE_RETRIES,
    MAX_WAIT_BEFORE_ERROR_MS,
-    DEFAULT_COOLDOWN_MS
+    DEFAULT_COOLDOWN_MS,
+    RATE_LIMIT_DEDUP_WINDOW_MS,
+    MAX_CONSECUTIVE_FAILURES,
+    EXTENDED_COOLDOWN_MS,
+    CAPACITY_RETRY_DELAY_MS,
+    MAX_CAPACITY_RETRIES
 } from '../constants.js';
 import { isRateLimitError, isAuthError, isEmptyResponseError } from '../errors.js';
 import { formatDuration, sleep, isNetworkError } from '../utils/helpers.js';
@@ -21,6 +26,83 @@ import { streamSSEResponse } from './sse-streamer.js';
 import { getFallbackModel } from '../fallback-config.js';
 import crypto from 'crypto';

+/**
+ * Gap 1: Rate limit deduplication - prevents thundering herd on concurrent rate limits
+ * Tracks last rate limit timestamp per model to skip duplicate retries
+ */
+const lastRateLimitTimestamps = new Map(); // modelId -> timestamp
+
+/**
+ * Check if we should skip retry due to recent rate limit on this model
+ * @param {string} model - Model ID
+ * @returns {boolean} True if retry should be skipped (within dedup window)
+ */
+function shouldSkipRetryDueToDedup(model) {
+    const lastTimestamp = lastRateLimitTimestamps.get(model);
+    if (!lastTimestamp) return false;
+
+    const elapsed = Date.now() - lastTimestamp;
+    if (elapsed < RATE_LIMIT_DEDUP_WINDOW_MS) {
+        logger.debug(`[CloudCode] Rate limit on ${model} within dedup window (${elapsed}ms ago), skipping retry`);
+        return true;
+    }
+    return false;
+}
+
+/**
+ * Record rate limit timestamp for deduplication
+ * @param {string} model - Model ID
+ */
+function recordRateLimitTimestamp(model) {
+    lastRateLimitTimestamps.set(model, Date.now());
+}
+
+/**
+ * Clear rate limit timestamp after successful retry
+ * @param {string} model - Model ID
+ */
+function clearRateLimitTimestamp(model) {
+    lastRateLimitTimestamps.delete(model);
+}
+
+/**
+ * Gap 3: Detect permanent authentication failures that require re-authentication
+ * @param {string} errorText - Error message from API
+ * @returns {boolean} True if permanent auth failure
+ */
+function isPermanentAuthFailure(errorText) {
+    const lower = (errorText || '').toLowerCase();
+    return lower.includes('invalid_grant') ||
+        lower.includes('token revoked') ||
+        lower.includes('token has been expired or revoked') ||
+        lower.includes('token_revoked') ||
+        lower.includes('invalid_client') ||
+        lower.includes('credentials are invalid');
+}
+
+/**
+ * Gap 4: Detect if 429 error is due to model capacity (not user quota)
+ * @param {string} errorText - Error message from API
+ * @returns {boolean} True if capacity exhausted (not quota)
+ */
+function isModelCapacityExhausted(errorText) {
+    const lower = (errorText || '').toLowerCase();
+    return lower.includes('model_capacity_exhausted') ||
+        lower.includes('capacity_exhausted') ||
+        lower.includes('model is currently overloaded') ||
+        lower.includes('service temporarily unavailable');
+}
+
+// Periodically clean up stale dedup timestamps (every 60 seconds)
+setInterval(() => {
+    const cutoff = Date.now() - 60000; // 1 minute
+    for (const [model, timestamp] of lastRateLimitTimestamps.entries()) {
+        if (timestamp < cutoff) {
+            lastRateLimitTimestamps.delete(model);
+        }
+    }
+}, 60000);
+
 /**
 * Send a streaming request to Cloud Code with multi-account support
 * Streams events in real-time as they arrive from the server
@@ -83,10 +165,14 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb
            throw new Error('No accounts available');
        }

-        // Pick sticky account (prefers current for cache continuity)
-        let account = accountManager.getCurrentStickyAccount(model);
-        if (!account) {
-            account = accountManager.pickNext(model);
+        // Select account using configured strategy
+        const { account, waitMs } = accountManager.selectAccount(model);
+
+        // If strategy returns a wait time, sleep and retry
+        if (!account && waitMs > 0) {
+            logger.info(`[CloudCode] Waiting ${formatDuration(waitMs)} for account...`);
+            await sleep(waitMs + 500);
+            continue;
        }

        if (!account) {
@@ -101,11 +187,14 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb

            logger.debug(`[CloudCode] Starting stream for model: ${model}`);

-            // Try each endpoint for streaming
+            // Try each endpoint with index-based loop for capacity retry support
            let lastError = null;
            let retriedOnce = false; // Track if we've already retried for short rate limit
+            let capacityRetryCount = 0; // Gap 4: Track capacity exhaustion retries
+            let endpointIndex = 0;

-            for (const endpoint of ANTIGRAVITY_ENDPOINT_FALLBACKS) {
+            while (endpointIndex < ANTIGRAVITY_ENDPOINT_FALLBACKS.length) {
+                const endpoint = ANTIGRAVITY_ENDPOINT_FALLBACKS[endpointIndex];
                try {
                    const url = `${endpoint}/v1internal:streamGenerateContent?alt=sse`;

@@ -120,15 +209,44 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb
                        logger.warn(`[CloudCode] Stream error at ${endpoint}: ${response.status} - ${errorText}`);

                        if (response.status === 401) {
-                            // Auth error - clear caches and retry
+                            // Gap 3: Check for permanent auth failures
+                            if (isPermanentAuthFailure(errorText)) {
+                                logger.error(`[CloudCode] Permanent auth failure for ${account.email}: ${errorText.substring(0, 100)}`);
+                                accountManager.markInvalid(account.email, 'Token revoked - re-authentication required');
+                                throw new Error(`AUTH_INVALID_PERMANENT: ${errorText}`);
+                            }
+
+                            // Transient auth error - clear caches and retry
                            accountManager.clearTokenCache(account.email);
                            accountManager.clearProjectCache(account.email);
+                            endpointIndex++;
                            continue;
                        }

                        if (response.status === 429) {
                            const resetMs = parseResetTime(response, errorText);

+                            // Gap 4: Check if capacity issue (NOT quota) - retry SAME endpoint
+                            if (isModelCapacityExhausted(errorText)) {
+                                if (capacityRetryCount < MAX_CAPACITY_RETRIES) {
+                                    capacityRetryCount++;
+                                    const waitMs = resetMs || CAPACITY_RETRY_DELAY_MS;
+                                    logger.info(`[CloudCode] Model capacity exhausted, retry ${capacityRetryCount}/${MAX_CAPACITY_RETRIES} after ${formatDuration(waitMs)}...`);
+                                    await sleep(waitMs);
+                                    // Don't increment endpointIndex - retry same endpoint
+                                    continue;
+                                }
+                                // Max capacity retries exceeded - treat as quota exhaustion
+                                logger.warn(`[CloudCode] Max capacity retries (${MAX_CAPACITY_RETRIES}) exceeded, switching account`);
+                            }
+
+                            // Gap 1: Check deduplication window to prevent thundering herd
+                            if (shouldSkipRetryDueToDedup(model)) {
+                                logger.info(`[CloudCode] Skipping retry due to recent rate limit, switching account...`);
+                                accountManager.markRateLimited(account.email, resetMs || DEFAULT_COOLDOWN_MS, model);
+                                throw new Error(`RATE_LIMITED_DEDUP: ${errorText}`);
+                            }
+
                            // Decision: wait and retry OR switch account
                            if (resetMs && resetMs > DEFAULT_COOLDOWN_MS) {
                                // Long-term quota exhaustion (> 10s) - switch to next account
@@ -141,28 +259,11 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb

                                if (!retriedOnce) {
                                    retriedOnce = true;
+                                    recordRateLimitTimestamp(model); // Gap 1: Record before retry
                                    logger.info(`[CloudCode] Short rate limit (${formatDuration(waitMs)}), waiting and retrying...`);
                                    await sleep(waitMs);
-                                    // Retry same endpoint
-                                    const retryResponse = await fetch(url, {
-                                        method: 'POST',
-                                        headers: buildHeaders(token, model, 'text/event-stream'),
-                                        body: JSON.stringify(payload)
-                                    });
-
-                                    if (retryResponse.ok) {
-                                        // Stream the retry response
-                                        yield* streamSSEResponse(retryResponse, anthropicRequest.model);
-                                        logger.debug('[CloudCode] Stream completed after retry');
-                                        return;
-                                    }
-
-                                    // Retry also failed - parse new reset time
-                                    const retryErrorText = await retryResponse.text();
-                                    const retryResetMs = parseResetTime(retryResponse, retryErrorText);
-                                    logger.warn(`[CloudCode] Retry also failed, marking and switching...`);
-                                    accountManager.markRateLimited(account.email, retryResetMs || waitMs, model);
-                                    throw new Error(`RATE_LIMITED_AFTER_RETRY: ${retryErrorText}`);
+                                    // Don't increment endpointIndex - retry same endpoint
+                                    continue;
                                } else {
                                    // Already retried once, mark and switch
                                    accountManager.markRateLimited(account.email, waitMs, model);
@@ -179,6 +280,7 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb
                            await sleep(1000);
                        }

+                        endpointIndex++;
                        continue;
                    }

@@ -189,6 +291,9 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb
                        try {
                            yield* streamSSEResponse(currentResponse, anthropicRequest.model);
                            logger.debug('[CloudCode] Stream completed');
+                            // Gap 1: Clear timestamp on success
+                            clearRateLimitTimestamp(model);
+                            accountManager.notifySuccess(account, model);
                            return;
                        } catch (streamError) {
                            // Only retry on EmptyResponseError
@@ -226,8 +331,13 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb
                                    throw new Error(`429 RESOURCE_EXHAUSTED during retry: ${retryErrorText}`);
                                }

-                                // Auth error - clear caches and throw with recognizable message
+                                // Auth error - check for permanent failure
                                if (currentResponse.status === 401) {
+                                    if (isPermanentAuthFailure(retryErrorText)) {
+                                        logger.error(`[CloudCode] Permanent auth failure during retry for ${account.email}`);
+                                        accountManager.markInvalid(account.email, 'Token revoked - re-authentication required');
+                                        throw new Error(`AUTH_INVALID_PERMANENT: ${retryErrorText}`);
+                                    }
                                    accountManager.clearTokenCache(account.email);
                                    accountManager.clearProjectCache(account.email);
                                    throw new Error(`401 AUTH_INVALID during retry: ${retryErrorText}`);
@@ -261,6 +371,7 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb
                    }
                    logger.warn(`[CloudCode] Stream error at ${endpoint}:`, endpointError.message);
                    lastError = endpointError;
+                    endpointIndex++;
                }
            }

@@ -276,7 +387,8 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb

        } catch (error) {
            if (isRateLimitError(error)) {
-                // Rate limited - already marked, continue to next account
+                // Rate limited - already marked, notify strategy and continue to next account
+                accountManager.notifyRateLimit(account, model);
                logger.info(`[CloudCode] Account ${account.email} rate-limited, trying next...`);
                continue;
            }
@@ -287,15 +399,31 @@ export async function* sendMessageStream(anthropicRequest, accountManager, fallb
            }
            // Handle 5xx errors
            if (error.message.includes('API error 5') || error.message.includes('500') || error.message.includes('503')) {
-                logger.warn(`[CloudCode] Account ${account.email} failed with 5xx stream error, trying next...`);
-                accountManager.pickNext(model);
+                accountManager.notifyFailure(account, model);
+
+                // Gap 2: Check consecutive failures for extended cooldown
+                const consecutiveFailures = accountManager.getHealthTracker()?.getConsecutiveFailures(account.email) || 0;
+                if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
+                    logger.warn(`[CloudCode] Account ${account.email} has ${consecutiveFailures} consecutive failures, applying extended cooldown (${formatDuration(EXTENDED_COOLDOWN_MS)})`);
+                    accountManager.markRateLimited(account.email, EXTENDED_COOLDOWN_MS, model);
+                } else {
+                    logger.warn(`[CloudCode] Account ${account.email} failed with 5xx stream error, trying next...`);
+                }
                continue;
            }

            if (isNetworkError(error)) {
-                logger.warn(`[CloudCode] Network error for ${account.email} (stream), trying next account... (${error.message})`);
+                accountManager.notifyFailure(account, model);
+
+                // Gap 2: Check consecutive failures for extended cooldown
+                const consecutiveFailures = accountManager.getHealthTracker()?.getConsecutiveFailures(account.email) || 0;
+                if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
+                    logger.warn(`[CloudCode] Account ${account.email} has ${consecutiveFailures} consecutive network failures, applying extended cooldown (${formatDuration(EXTENDED_COOLDOWN_MS)})`);
+                    accountManager.markRateLimited(account.email, EXTENDED_COOLDOWN_MS, model);
+                } else {
+                    logger.warn(`[CloudCode] Network error for ${account.email} (stream), trying next account... (${error.message})`);
+                }
                await sleep(1000);
-                accountManager.pickNext(model);
                continue;
            }