feat: comprehensive rate limit handling overhaul (inspired by opencode-antigravity-auth)
This commit addresses "Max retries exceeded" errors seen during stress testing, where all accounts became exhausted simultaneously because short per-second rate limits triggered cascading failures.

## Rate Limit Parser (`rate-limit-parser.js`)

- Remove the 2s buffer enforcement that caused cascading failures when the API returned short reset times (200-600ms). A 200ms buffer is now added only for sub-500ms resets (sketched below)
- Add `parseRateLimitReason()` for smart backoff based on error type: QUOTA_EXHAUSTED, RATE_LIMIT_EXCEEDED, MODEL_CAPACITY_EXHAUSTED, SERVER_ERROR

## Message/Streaming Handlers

- Add per-account+model rate limit state tracking with exponential backoff
- For short rate limits (< 1 second), wait and retry on the same account instead of switching; this prevents a thundering herd when all accounts hit per-second limits (sketch below)
- Add throttle wait support for fallback modes (emergency/lastResort)
- Add `calculateSmartBackoff()` with progressive tiers by error type (see sketch below)

## HybridStrategy (`hybrid-strategy.js`)

- Refactor `#getCandidates()` to return 4 fallback levels (sketched below):
  - `normal`: all filters pass (health, tokens, quota)
  - `quota`: bypass the critical quota check
  - `emergency`: bypass the health check when ALL accounts are unhealthy
  - `lastResort`: bypass BOTH the health AND token bucket checks
- Add throttle wait times: 500ms for lastResort, 250ms for emergency
- Fix the LRU calculation to use seconds (matches opencode-antigravity-auth)

## Health Tracker

- Increase `recoveryPerHour` from 2 to 10 for faster recovery (1 hour instead of 5 hours)

## Account Manager

- Add consecutive failure tracking: `getConsecutiveFailures()`, `incrementConsecutiveFailures()`, `resetConsecutiveFailures()` (sketch below)
- Add a cooldown mechanism, separate from rate limits, with a `CooldownReason`
- Reset consecutive failures on each successful request

## Base Strategy

- Add an `isAccountCoolingDown()` check to `isAccountUsable()`

## Constants

- Replace the fixed `CAPACITY_RETRY_DELAY_MS` with progressive `CAPACITY_BACKOFF_TIERS_MS`
- Add `BACKOFF_BY_ERROR_TYPE` for smart backoff
- Add `QUOTA_EXHAUSTED_BACKOFF_TIERS_MS` for progressive quota backoff
- Add a `MIN_BACKOFF_MS` floor to prevent "Available in 0s" loops
- Increase `MAX_CAPACITY_RETRIES` from 3 to 5
- Reduce `RATE_LIMIT_DEDUP_WINDOW_MS` from 5s to 2s

## Frontend

- Remove the `capacityRetryDelayMs` config (replaced by the progressive tiers)
- Update the default `maxCapacityRetries` display from 3 to 5

## Testing

- Add `tests/stress-test.cjs` for concurrent-request stress testing

Co-Authored-By: Claude <noreply@anthropic.com>
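A minimal sketch of the parser's buffer change described above; `bufferedResetMs` is a hypothetical helper name, not the actual function in `rate-limit-parser.js`:

```js
// Sketch only: the helper name is illustrative; thresholds come from the
// commit description (200ms buffer for sub-500ms resets).
function bufferedResetMs(resetMs) {
  // The old 2s minimum turned a 300ms reset into a 2s stall on every account,
  // so all accounts came back at the same moment and failed together.
  return resetMs < 500 ? resetMs + 200 : resetMs;
}
```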
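A sketch of the progressive, error-type-aware backoff. `calculateSmartBackoff()`, `BACKOFF_BY_ERROR_TYPE`, and `MIN_BACKOFF_MS` are named in this commit, but the tier values below are assumptions for illustration:

```js
// Sketch only: tier values are assumptions, not the committed constants.
const BACKOFF_BY_ERROR_TYPE = {
  QUOTA_EXHAUSTED: [60_000, 300_000, 900_000],      // quota refills slowly
  RATE_LIMIT_EXCEEDED: [1_000, 2_000, 5_000],       // per-second limits clear fast
  MODEL_CAPACITY_EXHAUSTED: [2_000, 5_000, 15_000], // capacity fluctuates
  SERVER_ERROR: [500, 1_000, 2_000],                // transient 5xx
};
const MIN_BACKOFF_MS = 250; // floor that prevents "Available in 0s" retry loops

function calculateSmartBackoff(errorType, attempt) {
  const tiers = BACKOFF_BY_ERROR_TYPE[errorType] ?? BACKOFF_BY_ERROR_TYPE.SERVER_ERROR;
  const delayMs = tiers[Math.min(attempt, tiers.length - 1)];
  return Math.max(delayMs, MIN_BACKOFF_MS);
}
```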
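A sketch of the same-account retry path for sub-second limits; `send` and the `{ rateLimited, resetMs }` result shape are stand-ins for the real handler internals:

```js
// Sketch only: the result shape and retry cap are assumptions.
async function sendWithShortLimitRetry(account, request, send, maxRetries = 5) {
  for (let attempt = 0; attempt < maxRetries; attempt++) {
    const result = await send(account, request);
    if (!result.rateLimited) return result;
    if (result.resetMs >= 1000) return result; // long limit: let the strategy switch accounts
    // Sub-second limit: waiting here beats switching, which would just push
    // the thundering herd onto the next account's per-second limit.
    await new Promise(resolve => setTimeout(resolve, result.resetMs + 200));
  }
  throw new Error('Max retries exceeded');
}
```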
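A sketch of the four-level candidate selection in `#getCandidates()`; the filter predicates are assumptions standing in for the real health, token bucket, and quota checks:

```js
// Sketch only: isHealthy/hasTokens/hasQuota stand in for the real checks.
function getCandidates(accounts, { isHealthy, hasTokens, hasQuota }) {
  const levels = {
    normal: accounts.filter(a => isHealthy(a) && hasTokens(a) && hasQuota(a)),
    quota: accounts.filter(a => isHealthy(a) && hasTokens(a)), // bypass critical quota
    emergency: accounts.filter(a => hasTokens(a)),             // bypass health
    lastResort: [...accounts],                                 // bypass health AND tokens
  };
  // Return the first level that still has candidates.
  for (const [level, candidates] of Object.entries(levels)) {
    if (candidates.length > 0) return { level, candidates };
  }
  return { level: 'lastResort', candidates: [] };
}

// Wider fallbacks are throttled so desperate retries don't hammer the API.
const THROTTLE_WAIT_MS = { normal: 0, quota: 0, emergency: 250, lastResort: 500 };
```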
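A sketch of the consecutive-failure and cooldown bookkeeping in the account manager. The method names appear in this commit; the 3-failure threshold, the 30s duration, and the class name are assumptions:

```js
// Sketch only: threshold, duration, and class name are assumptions.
class AccountCooldowns {
  #failures = new Map();  // accountId -> consecutive failure count
  #cooldowns = new Map(); // accountId -> { until, reason }

  getConsecutiveFailures(id) { return this.#failures.get(id) ?? 0; }

  incrementConsecutiveFailures(id) {
    const n = this.getConsecutiveFailures(id) + 1;
    this.#failures.set(id, n);
    if (n >= 3) {
      this.#cooldowns.set(id, { until: Date.now() + 30_000, reason: 'CONSECUTIVE_FAILURES' });
    }
  }

  resetConsecutiveFailures(id) { this.#failures.delete(id); } // on success

  // Checked by isAccountUsable() in the base strategy.
  isAccountCoolingDown(id) {
    const cd = this.#cooldowns.get(id);
    return cd !== undefined && Date.now() < cd.until;
  }
}
```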
tests/stress-test.cjs (new file, 93 lines)
@@ -0,0 +1,93 @@
/**
 * Stress Test - Send multiple concurrent requests to test rate limit handling
 *
 * Usage: node tests/stress-test.cjs [count] [model]
 * Example: node tests/stress-test.cjs 10 gemini-3-flash
 */

const BASE_URL = process.env.ANTHROPIC_BASE_URL || 'http://localhost:8080';

const count = parseInt(process.argv[2], 10) || 8;
const model = process.argv[3] || 'gemini-3-flash';

async function sendRequest(id) {
  const startTime = Date.now();
  try {
    const response = await fetch(`${BASE_URL}/v1/messages`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'x-api-key': 'test',
        'anthropic-version': '2023-06-01'
      },
      body: JSON.stringify({
        model: model,
        max_tokens: 100,
        messages: [
          { role: 'user', content: `Request ${id}: Say "Hello ${id}" and nothing else.` }
        ]
      })
    });

    const elapsed = Date.now() - startTime;

    if (!response.ok) {
      const errorText = await response.text();
      console.log(`[${id}] ❌ ${response.status} after ${elapsed}ms: ${errorText.substring(0, 100)}`);
      return { id, success: false, status: response.status, elapsed };
    }

    const data = await response.json();
    const text = data.content?.[0]?.text?.substring(0, 50) || 'No text';
    console.log(`[${id}] ✅ 200 after ${elapsed}ms: "${text}..."`);
    return { id, success: true, status: 200, elapsed };
  } catch (error) {
    const elapsed = Date.now() - startTime;
    console.log(`[${id}] ❌ Error after ${elapsed}ms: ${error.message}`);
    return { id, success: false, error: error.message, elapsed };
  }
}

async function runStressTest() {
  console.log(`\n🚀 Stress Test: Sending ${count} concurrent requests to ${model}\n`);
  console.log(`Target: ${BASE_URL}/v1/messages\n`);
  console.log('─'.repeat(70));

  const startTime = Date.now();

  // Send all requests concurrently
  const promises = [];
  for (let i = 1; i <= count; i++) {
    promises.push(sendRequest(i));
  }

  const results = await Promise.all(promises);

  const totalElapsed = Date.now() - startTime;
  console.log('─'.repeat(70));

  // Summary
  const successful = results.filter(r => r.success).length;
  const failed = results.filter(r => !r.success).length;
  const avgElapsed = Math.round(results.reduce((sum, r) => sum + r.elapsed, 0) / results.length);

  console.log(`\n📊 Summary:`);
  console.log(`  Total time: ${totalElapsed}ms`);
  console.log(`  Successful: ${successful}/${count}`);
  console.log(`  Failed: ${failed}/${count}`);
  console.log(`  Avg response time: ${avgElapsed}ms`);

  if (failed > 0) {
    const errors = results.filter(r => !r.success);
    const statusCounts = {};
    errors.forEach(e => {
      const key = e.status || 'network';
      statusCounts[key] = (statusCounts[key] || 0) + 1;
    });
    console.log(`  Error breakdown: ${JSON.stringify(statusCounts)}`);
  }

  console.log('');
}

runStressTest().catch(console.error);