diff --git a/CLAUDE.md b/CLAUDE.md index b1ae463..eed2a46 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Project Overview -Antigravity Claude Proxy is a Node.js proxy server that exposes an Anthropic-compatible API backed by Antigravity's Cloud Code service. It enables using Claude models (`claude-sonnet-4-5-thinking`, `claude-opus-4-5-thinking`) with Claude Code CLI. +Antigravity Claude Proxy is a Node.js proxy server that exposes an Anthropic-compatible API backed by Antigravity's Cloud Code service. It enables using Claude models (`claude-sonnet-4-5-thinking`, `claude-opus-4-5-thinking`) and Gemini models (`gemini-3-flash`, `gemini-3-pro-low`, `gemini-3-pro-high`) with Claude Code CLI. The proxy translates requests from Anthropic Messages API format → Google Generative AI format → Antigravity Cloud Code API, then converts responses back to Anthropic format with full thinking/streaming support. 
@@ -80,11 +80,17 @@ Claude Code CLI → Express Server (server.js) → CloudCode Client → Antigrav **Constants:** All configuration values are centralized in `src/constants.js`: - API endpoints and headers -- Model mappings +- Model mappings and model family detection (`getModelFamily()`, `isThinkingModel()`) - OAuth configuration - Rate limit thresholds - Thinking model settings +**Model Family Handling:** +- `getModelFamily(model)` returns `'claude'`, `'gemini'`, or `'unknown'` based on model name +- Claude models use `signature` field on thinking blocks +- Gemini models use `thoughtSignature` field on functionCall parts +- When Claude Code strips `thoughtSignature`, the proxy uses Google's `skip_thought_signature_validator` sentinel value + **Error Handling:** Use custom error classes from `src/errors.js`: - `RateLimitError` - 429/RESOURCE_EXHAUSTED errors - `AuthError` - Authentication failures diff --git a/README.md b/README.md index 4eefebe..b744cb0 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ Buy Me A Coffee -A proxy server that exposes an **Anthropic-compatible API** backed by **Antigravity's Cloud Code**, letting you use Claude models like sonnet and opus with **Claude Code CLI**. +A proxy server that exposes an **Anthropic-compatible API** backed by **Antigravity's Cloud Code**, letting you use Claude and Gemini models with **Claude Code CLI**. 
![Antigravity Claude Proxy Banner](images/banner.png) @@ -145,7 +145,23 @@ Add this configuration: "ANTHROPIC_DEFAULT_OPUS_MODEL": "claude-opus-4-5-thinking", "ANTHROPIC_DEFAULT_SONNET_MODEL": "claude-sonnet-4-5-thinking", "ANTHROPIC_DEFAULT_HAIKU_MODEL": "claude-sonnet-4-5", - "CLAUDE_CODE_SUBAGENT_MODEL": "claude-opus-4-5-thinking" + "CLAUDE_CODE_SUBAGENT_MODEL": "claude-sonnet-4-5" + } +} +``` + +Or to use Gemini models: + +```json +{ + "env": { + "ANTHROPIC_AUTH_TOKEN": "test", + "ANTHROPIC_BASE_URL": "http://localhost:8080", + "ANTHROPIC_MODEL": "gemini-3-pro-high", + "ANTHROPIC_DEFAULT_OPUS_MODEL": "gemini-3-pro-high", + "ANTHROPIC_DEFAULT_SONNET_MODEL": "gemini-3-flash", + "ANTHROPIC_DEFAULT_HAIKU_MODEL": "gemini-3-flash", + "CLAUDE_CODE_SUBAGENT_MODEL": "gemini-2.5-flash-lite" } } ``` @@ -164,6 +180,8 @@ claude ## Available Models +### Claude Models + | Model ID | Description | |----------|-------------| | `claude-sonnet-4-5-thinking` | Claude Sonnet 4.5 with extended thinking | @@ -174,6 +192,16 @@ Standard Anthropic model names are automatically mapped: - `claude-sonnet-4-5-20250514` → `claude-sonnet-4-5-thinking` - `claude-opus-4-5-20250514` → `claude-opus-4-5-thinking` +### Gemini Models + +| Model ID | Description | +|----------|-------------| +| `gemini-3-flash` | Gemini 3 Flash with thinking | +| `gemini-3-pro-low` | Gemini 3 Pro Low with thinking | +| `gemini-3-pro-high` | Gemini 3 Pro High with thinking | + +Gemini models include full thinking support with `thoughtSignature` handling for multi-turn conversations. 
+ --- ## Multi-Account Load Balancing diff --git a/src/cloudcode-client.js b/src/cloudcode-client.js index 98f5e5b..b75a2dc 100644 --- a/src/cloudcode-client.js +++ b/src/cloudcode-client.js @@ -15,7 +15,9 @@ import { ANTIGRAVITY_HEADERS, MAX_RETRIES, MAX_WAIT_BEFORE_ERROR_MS, - MIN_SIGNATURE_LENGTH + MIN_SIGNATURE_LENGTH, + getModelFamily, + isThinkingModel } from './constants.js'; import { convertAnthropicToGoogle, @@ -244,9 +246,10 @@ function buildHeaders(token, model, accept = 'application/json') { ...ANTIGRAVITY_HEADERS }; - // Add interleaved thinking header for Claude thinking models - const isThinkingModel = model.toLowerCase().includes('claude') && model.toLowerCase().includes('thinking'); - if (isThinkingModel) { + const modelFamily = getModelFamily(model); + + // Add interleaved thinking header only for Claude thinking models + if (modelFamily === 'claude' && isThinkingModel(model)) { headers['anthropic-beta'] = 'interleaved-thinking-2025-05-14'; } @@ -272,7 +275,7 @@ function buildHeaders(token, model, accept = 'application/json') { */ export async function sendMessage(anthropicRequest, accountManager) { const model = anthropicRequest.model; - const isThinkingModel = model.toLowerCase().includes('claude') && model.toLowerCase().includes('thinking'); + const isThinking = isThinkingModel(model); // Retry loop with account failover // Ensure we try at least as many times as there are accounts to cycle through everyone @@ -330,13 +333,13 @@ export async function sendMessage(anthropicRequest, accountManager) { let lastError = null; for (const endpoint of ANTIGRAVITY_ENDPOINT_FALLBACKS) { try { - const url = isThinkingModel + const url = isThinking ? `${endpoint}/v1internal:streamGenerateContent?alt=sse` : `${endpoint}/v1internal:generateContent`; const response = await fetch(url, { method: 'POST', - headers: buildHeaders(token, model, isThinkingModel ? 'text/event-stream' : 'application/json'), + headers: buildHeaders(token, model, isThinking ? 
'text/event-stream' : 'application/json'), body: JSON.stringify(payload) }); @@ -370,7 +373,7 @@ export async function sendMessage(anthropicRequest, accountManager) { } // For thinking models, parse SSE and accumulate all parts - if (isThinkingModel) { + if (isThinking) { return await parseThinkingSSEResponse(response, anthropicRequest.model); } @@ -812,6 +815,10 @@ async function* streamSSEResponse(response, originalModel) { } else if (part.functionCall) { // Handle tool use + // For Gemini 3+, capture thoughtSignature from the functionCall part + // The signature is a sibling to functionCall, not inside it + const functionCallSignature = part.thoughtSignature || ''; + if (currentBlockType === 'thinking' && currentThinkingSignature) { yield { type: 'content_block_delta', @@ -829,15 +836,24 @@ async function* streamSSEResponse(response, originalModel) { const toolId = part.functionCall.id || `toolu_${crypto.randomBytes(12).toString('hex')}`; + // For Gemini, include the thoughtSignature in the tool_use block + // so it can be sent back in subsequent requests + const toolUseBlock = { + type: 'tool_use', + id: toolId, + name: part.functionCall.name, + input: {} + }; + + // Store the signature in the tool_use block for later retrieval + if (functionCallSignature && functionCallSignature.length >= MIN_SIGNATURE_LENGTH) { + toolUseBlock.thoughtSignature = functionCallSignature; + } + yield { type: 'content_block_start', index: blockIndex, - content_block: { - type: 'tool_use', - id: toolId, - name: part.functionCall.name, - input: {} - } + content_block: toolUseBlock }; yield { diff --git a/src/constants.js b/src/constants.js index 1358ea3..8806531 100644 --- a/src/constants.js +++ b/src/constants.js @@ -84,6 +84,40 @@ export const MAX_WAIT_BEFORE_ERROR_MS = 120000; // 2 minutes - throw error if wa // Thinking model constants export const MIN_SIGNATURE_LENGTH = 50; // Minimum valid thinking signature length +// Gemini-specific limits +export const 
GEMINI_MAX_OUTPUT_TOKENS = 16384; + +/** + * Get the model family from model name (dynamic detection, no hardcoded list). + * @param {string} modelName - The model name from the request + * @returns {'claude' | 'gemini' | 'unknown'} The model family + */ +export function getModelFamily(modelName) { + const lower = (modelName || '').toLowerCase(); + if (lower.includes('claude')) return 'claude'; + if (lower.includes('gemini')) return 'gemini'; + return 'unknown'; +} + +/** + * Check if a model supports thinking/reasoning output. + * @param {string} modelName - The model name from the request + * @returns {boolean} True if the model supports thinking blocks + */ +export function isThinkingModel(modelName) { + const lower = (modelName || '').toLowerCase(); + // Claude thinking models have "thinking" in the name + if (lower.includes('claude') && lower.includes('thinking')) return true; + // Gemini thinking models: explicit "thinking" in name, OR gemini version 3+ + if (lower.includes('gemini')) { + if (lower.includes('thinking')) return true; + // Check for gemini-3 or higher (e.g., gemini-3, gemini-3.5, gemini-4, etc.) 
+ const versionMatch = lower.match(/gemini-(\d+)/); + if (versionMatch && parseInt(versionMatch[1], 10) >= 3) return true; + } + return false; +} + // Google OAuth configuration (from opencode-antigravity-auth) export const OAUTH_CONFIG = { clientId: '1071006060591-tmhssin2h21lcre235vtolojh4g403ep.apps.googleusercontent.com', @@ -117,6 +151,9 @@ export default { MAX_ACCOUNTS, MAX_WAIT_BEFORE_ERROR_MS, MIN_SIGNATURE_LENGTH, + GEMINI_MAX_OUTPUT_TOKENS, + getModelFamily, + isThinkingModel, OAUTH_CONFIG, OAUTH_REDIRECT_URI }; diff --git a/src/format-converter.js b/src/format-converter.js index 2f274ab..3551298 100644 --- a/src/format-converter.js +++ b/src/format-converter.js @@ -9,9 +9,20 @@ import crypto from 'crypto'; import { - MIN_SIGNATURE_LENGTH + MIN_SIGNATURE_LENGTH, + GEMINI_MAX_OUTPUT_TOKENS, + getModelFamily, + isThinkingModel } from './constants.js'; +/** + * Sentinel value to skip thought signature validation for Gemini models. + * Per Google documentation, this value can be used when Claude Code strips + * the thoughtSignature field from tool_use blocks in multi-turn requests. 
+ * See: https://ai.google.dev/gemini-api/docs/thought-signatures + */ +const GEMINI_SKIP_SIGNATURE = 'skip_thought_signature_validator'; + /** * Check if a part is a thinking block * @param {Object} part - Content part to check @@ -272,7 +283,7 @@ export function reorderAssistantContent(content) { /** * Convert Anthropic message content to Google Generative AI parts */ -function convertContentToParts(content, isClaudeModel = false) { +function convertContentToParts(content, isClaudeModel = false, isGeminiModel = false) { if (typeof content === 'string') { return [{ text: content }]; } @@ -337,7 +348,19 @@ function convertContentToParts(content, isClaudeModel = false) { functionCall.id = block.id; } - parts.push({ functionCall }); + // Build the part with functionCall + const part = { functionCall }; + + // For Gemini models, include thoughtSignature at the part level + // This is required by Gemini 3+ for tool calls to work correctly + if (isGeminiModel) { + // Use thoughtSignature from the block if Claude Code preserved it + // Otherwise, use the sentinel value to skip validation (Claude Code strips non-standard fields) + // See: https://ai.google.dev/gemini-api/docs/thought-signatures + part.thoughtSignature = block.thoughtSignature || GEMINI_SKIP_SIGNATURE; + } + + parts.push(part); } else if (block.type === 'tool_result') { // Convert tool_result to functionResponse (Google format) let responseContent = block.content; @@ -400,8 +423,10 @@ function convertRole(role) { export function convertAnthropicToGoogle(anthropicRequest) { const { messages, system, max_tokens, temperature, top_p, top_k, stop_sequences, tools, tool_choice, thinking } = anthropicRequest; const modelName = anthropicRequest.model || ''; - const isClaudeModel = modelName.toLowerCase().includes('claude'); - const isClaudeThinkingModel = isClaudeModel && modelName.toLowerCase().includes('thinking'); + const modelFamily = getModelFamily(modelName); + const isClaudeModel = modelFamily === 'claude'; 
+ const isGeminiModel = modelFamily === 'gemini'; + const isThinking = isThinkingModel(modelName); const googleRequest = { contents: [], @@ -429,7 +454,7 @@ export function convertAnthropicToGoogle(anthropicRequest) { } // Add interleaved thinking hint for Claude thinking models with tools - if (isClaudeThinkingModel && tools && tools.length > 0) { + if (isClaudeModel && isThinking && tools && tools.length > 0) { const hint = 'Interleaved thinking is enabled. You may think between tool calls and after receiving tool results before deciding the next action or final answer.'; if (!googleRequest.systemInstruction) { googleRequest.systemInstruction = { parts: [{ text: hint }] }; @@ -458,7 +483,7 @@ export function convertAnthropicToGoogle(anthropicRequest) { msgContent = reorderAssistantContent(msgContent); } - const parts = convertContentToParts(msgContent, isClaudeModel); + const parts = convertContentToParts(msgContent, isClaudeModel, isGeminiModel); const content = { role: convertRole(msg.role), parts: parts @@ -488,22 +513,34 @@ export function convertAnthropicToGoogle(anthropicRequest) { googleRequest.generationConfig.stopSequences = stop_sequences; } - // Enable thinking for Claude thinking models - if (isClaudeThinkingModel) { - const thinkingConfig = { - include_thoughts: true - }; + // Enable thinking for thinking models (Claude and Gemini 3+) + if (isThinking) { + if (isClaudeModel) { + // Claude thinking config + const thinkingConfig = { + include_thoughts: true + }; - // Only set thinking_budget if explicitly provided - const thinkingBudget = thinking?.budget_tokens; - if (thinkingBudget) { - thinkingConfig.thinking_budget = thinkingBudget; - console.log('[FormatConverter] Thinking enabled with budget:', thinkingBudget); - } else { - console.log('[FormatConverter] Thinking enabled (no budget specified)'); + // Only set thinking_budget if explicitly provided + const thinkingBudget = thinking?.budget_tokens; + if (thinkingBudget) { + 
thinkingConfig.thinking_budget = thinkingBudget; + console.log('[FormatConverter] Claude thinking enabled with budget:', thinkingBudget); + } else { + console.log('[FormatConverter] Claude thinking enabled (no budget specified)'); + } + + googleRequest.generationConfig.thinkingConfig = thinkingConfig; + } else if (isGeminiModel) { + // Gemini thinking config (uses camelCase) + const thinkingConfig = { + includeThoughts: true, + thinkingBudget: thinking?.budget_tokens || 16000 + }; + console.log('[FormatConverter] Gemini thinking enabled with budget:', thinkingConfig.thinkingBudget); + + googleRequest.generationConfig.thinkingConfig = thinkingConfig; } - - googleRequest.generationConfig.thinkingConfig = thinkingConfig; } // Convert tools to Google format @@ -523,10 +560,18 @@ export function convertAnthropicToGoogle(anthropicRequest) { || tool.parameters || { type: 'object' }; + // Sanitize schema for general compatibility + let parameters = sanitizeSchema(schema); + + // For Gemini models, apply additional cleaning for VALIDATED mode + if (isGeminiModel) { + parameters = cleanSchemaForGemini(parameters); + } + return { name: String(name).replace(/[^a-zA-Z0-9_-]/g, '_').slice(0, 64), description: description, - parameters: sanitizeSchema(schema) + parameters }; }); @@ -534,6 +579,12 @@ export function convertAnthropicToGoogle(anthropicRequest) { console.log('[FormatConverter] Tools:', JSON.stringify(googleRequest.tools).substring(0, 300)); } + // Cap max tokens for Gemini models + if (isGeminiModel && googleRequest.generationConfig.maxOutputTokens > GEMINI_MAX_OUTPUT_TOKENS) { + console.log(`[FormatConverter] Capping Gemini max_tokens from ${googleRequest.generationConfig.maxOutputTokens} to ${GEMINI_MAX_OUTPUT_TOKENS}`); + googleRequest.generationConfig.maxOutputTokens = GEMINI_MAX_OUTPUT_TOKENS; + } + return googleRequest; } @@ -620,6 +671,63 @@ function sanitizeSchema(schema) { return sanitized; } +/** + * Cleans JSON schema for Gemini API compatibility. 
+ * Removes unsupported fields that cause VALIDATED mode errors. + * + * Gemini's VALIDATED mode rejects schemas with certain JSON Schema keywords + * that are not supported by the Gemini API. + * + * @param {Object} schema - The JSON schema to clean + * @returns {Object} Cleaned schema safe for Gemini API + */ +function cleanSchemaForGemini(schema) { + if (!schema || typeof schema !== 'object') return schema; + if (Array.isArray(schema)) return schema.map(cleanSchemaForGemini); + + const result = { ...schema }; + + // Remove unsupported keywords that cause VALIDATED mode errors + const unsupported = [ + 'additionalProperties', 'default', '$schema', '$defs', + 'definitions', '$ref', '$id', '$comment', 'title', + 'minLength', 'maxLength', 'pattern', 'format', + 'minItems', 'maxItems', 'examples' + ]; + + for (const key of unsupported) { + delete result[key]; + } + + // Check for unsupported 'format' in string types + if (result.type === 'string' && result.format) { + const allowed = ['enum', 'date-time']; + if (!allowed.includes(result.format)) { + delete result.format; + } + } + + // Recursively clean nested schemas + for (const [key, value] of Object.entries(result)) { + if (typeof value === 'object' && value !== null) { + result[key] = cleanSchemaForGemini(value); + } + } + + // Validate that required array only contains properties that exist + // Gemini's VALIDATED mode requires this + if (result.required && Array.isArray(result.required) && result.properties) { + const definedProps = new Set(Object.keys(result.properties)); + result.required = result.required.filter(prop => definedProps.has(prop)); + // If required is now empty, remove it + if (result.required.length === 0) { + delete result.required; + } + } + + return result; +} + /** * Convert Google Generative AI response to Anthropic Messages API format * @@ -661,12 +769,20 @@ export function convertGoogleToAnthropic(googleResponse, model) { } else if (part.functionCall) { // Convert functionCall to 
tool_use // Use the id from the response if available, otherwise generate one - anthropicContent.push({ + const toolId = part.functionCall.id || `toolu_${crypto.randomBytes(12).toString('hex')}`; + const toolUseBlock = { type: 'tool_use', - id: part.functionCall.id || `toolu_${crypto.randomBytes(12).toString('hex')}`, + id: toolId, name: part.functionCall.name, input: part.functionCall.args || {} - }); + }; + + // For Gemini 3+, include thoughtSignature from the part level + if (part.thoughtSignature && part.thoughtSignature.length >= MIN_SIGNATURE_LENGTH) { + toolUseBlock.thoughtSignature = part.thoughtSignature; + } + + anthropicContent.push(toolUseBlock); hasToolCalls = true; } } diff --git a/tests/helpers/http-client.cjs b/tests/helpers/http-client.cjs index e8e7fc3..bdbbc79 100644 --- a/tests/helpers/http-client.cjs +++ b/tests/helpers/http-client.cjs @@ -147,6 +147,12 @@ function analyzeContent(content) { const toolUse = content.filter(b => b.type === 'tool_use'); const text = content.filter(b => b.type === 'text'); + // Check for signatures in thinking blocks (Claude style) + const thinkingHasSignature = thinking.some(t => t.signature && t.signature.length >= 50); + + // Check for signatures in tool_use blocks (Gemini 3+ style) + const toolUseHasSignature = toolUse.some(t => t.thoughtSignature && t.thoughtSignature.length >= 50); + return { thinking, toolUse, @@ -154,7 +160,10 @@ function analyzeContent(content) { hasThinking: thinking.length > 0, hasToolUse: toolUse.length > 0, hasText: text.length > 0, - thinkingHasSignature: thinking.some(t => t.signature && t.signature.length >= 50) + thinkingHasSignature: thinkingHasSignature, + toolUseHasSignature: toolUseHasSignature, + // Combined check: signature exists somewhere (thinking or tool_use) + hasSignature: thinkingHasSignature || toolUseHasSignature }; } diff --git a/tests/helpers/test-models.cjs b/tests/helpers/test-models.cjs new file mode 100644 index 0000000..b6c6c26 --- /dev/null +++ 
b/tests/helpers/test-models.cjs @@ -0,0 +1,87 @@ +/** + * Test Models Configuration + * + * Provides model configuration for parameterized testing across + * multiple model families (Claude and Gemini). + */ + +// Default test models for each family +const TEST_MODELS = { + claude: 'claude-sonnet-4-5-thinking', + gemini: 'gemini-3-flash' +}; + +// Default thinking model for each family +const THINKING_MODELS = { + claude: 'claude-sonnet-4-5-thinking', + gemini: 'gemini-3-flash' +}; + +/** + * Get models to test, optionally excluding certain families. + * @param {string[]} excludeFamilies - Array of family names to exclude (e.g., ['gemini']) + * @returns {Array<{family: string, model: string}>} Array of model configs to test + */ +function getTestModels(excludeFamilies = []) { + const models = []; + for (const [family, model] of Object.entries(TEST_MODELS)) { + if (!excludeFamilies.includes(family)) { + models.push({ family, model }); + } + } + return models; +} + +/** + * Get thinking models to test, optionally excluding certain families. + * @param {string[]} excludeFamilies - Array of family names to exclude + * @returns {Array<{family: string, model: string}>} Array of thinking model configs + */ +function getThinkingModels(excludeFamilies = []) { + const models = []; + for (const [family, model] of Object.entries(THINKING_MODELS)) { + if (!excludeFamilies.includes(family)) { + models.push({ family, model }); + } + } + return models; +} + +/** + * Check if a model family requires thinking features. + * Both Claude thinking models and Gemini 3+ support thinking. + * @param {string} family - Model family name + * @returns {boolean} True if thinking is expected + */ +function familySupportsThinking(family) { + // Both Claude thinking models and Gemini 3+ support thinking + return family === 'claude' || family === 'gemini'; +} + +/** + * Get model-specific configuration overrides. 
+ * @param {string} family - Model family name + * @returns {Object} Configuration overrides for the model family + */ +function getModelConfig(family) { + if (family === 'gemini') { + return { + // Gemini has lower max output tokens + max_tokens: 8000, + thinking: { type: 'enabled', budget_tokens: 10000 } + }; + } + return { + max_tokens: 16000, + thinking: { type: 'enabled', budget_tokens: 10000 } + }; +} + +module.exports = { + TEST_MODELS, + THINKING_MODELS, + getTestModels, + getThinkingModels, + familySupportsThinking, + getModelConfig +}; diff --git a/tests/test-caching-streaming.cjs b/tests/test-caching-streaming.cjs index a0ba865..834159b 100644 --- a/tests/test-caching-streaming.cjs +++ b/tests/test-caching-streaming.cjs @@ -5,23 +5,28 @@ * - Session ID is stable across turns (derived from first user message) * - cache_read_input_tokens is returned in usage metadata * - Second turn in same conversation should hit cache + * + * Runs for both Claude and Gemini model families. */ const { streamRequest, analyzeContent, extractUsage } = require('./helpers/http-client.cjs'); +const { getTestModels, getModelConfig } = require('./helpers/test-models.cjs'); // Large system prompt to exceed 1024 token minimum for caching // This matches the format used in the working direct API test (~36KB) const LARGE_SYSTEM_PROMPT = 'You are an expert software engineer. 
Here is important context:\n' + '// Large codebase file content line\n'.repeat(1000); -async function runTests() { +async function runTestsForModel(family, model) { console.log('='.repeat(60)); - console.log('PROMPT CACHING TEST (STREAMING)'); + console.log(`PROMPT CACHING TEST [${family.toUpperCase()}]`); + console.log(`Model: ${model}`); console.log('Verifies session ID stability and cache token reporting'); console.log('='.repeat(60)); console.log(''); let allPassed = true; const results = []; + const modelConfig = getModelConfig(family); // ===== TURN 1: Initial request ===== console.log('TURN 1: Initial request (establishes cache)'); @@ -35,11 +40,11 @@ async function runTests() { ]; const turn1 = await streamRequest({ - model: 'claude-sonnet-4-5-thinking', - max_tokens: 16000, + model, + max_tokens: modelConfig.max_tokens, stream: true, system: LARGE_SYSTEM_PROMPT, - thinking: { type: 'enabled', budget_tokens: 5000 }, + thinking: modelConfig.thinking, messages: turn1Messages }); @@ -89,11 +94,11 @@ async function runTests() { ]; const turn2 = await streamRequest({ - model: 'claude-sonnet-4-5-thinking', - max_tokens: 16000, + model, + max_tokens: modelConfig.max_tokens, stream: true, system: LARGE_SYSTEM_PROMPT, - thinking: { type: 'enabled', budget_tokens: 5000 }, + thinking: modelConfig.thinking, messages: turn2Messages }); @@ -143,7 +148,7 @@ async function runTests() { // ===== Summary ===== console.log('\n' + '='.repeat(60)); - console.log('SUMMARY'); + console.log(`SUMMARY [${family.toUpperCase()}]`); console.log('='.repeat(60)); for (const result of results) { @@ -156,7 +161,7 @@ async function runTests() { } console.log('\n' + '='.repeat(60)); - console.log(`OVERALL: ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`); + console.log(`[${family.toUpperCase()}] ${allPassed ? 
'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`); console.log('='.repeat(60)); console.log('\nNote: Cache effectiveness depends on:'); @@ -164,6 +169,25 @@ async function runTests() { console.log(' 2. Sticky account selection (same account across turns)'); console.log(' 3. API-side cache availability (may take time to populate)'); + return allPassed; +} + +async function runTests() { + const models = getTestModels(); + let allPassed = true; + + for (const { family, model } of models) { + console.log('\n'); + const passed = await runTestsForModel(family, model); + if (!passed) allPassed = false; + } + + console.log('\n' + '='.repeat(60)); + console.log('FINAL RESULT'); + console.log('='.repeat(60)); + console.log(`Overall: ${allPassed ? 'ALL MODEL FAMILIES PASSED' : 'SOME MODEL FAMILIES FAILED'}`); + console.log('='.repeat(60)); + process.exit(allPassed ? 0 : 1); } diff --git a/tests/test-images.cjs b/tests/test-images.cjs index 10740b9..0393709 100644 --- a/tests/test-images.cjs +++ b/tests/test-images.cjs @@ -3,34 +3,40 @@ * * Tests that images can be sent to the API with thinking models. * Simulates Claude Code sending screenshots or images for analysis. + * + * Runs for both Claude and Gemini model families. 
*/ const fs = require('fs'); const path = require('path'); -const { streamRequest } = require('./helpers/http-client.cjs'); +const { streamRequest, analyzeContent } = require('./helpers/http-client.cjs'); +const { getTestModels, getModelConfig, familySupportsThinking } = require('./helpers/test-models.cjs'); // Load test image from disk const TEST_IMAGE_PATH = path.join(__dirname, 'utils', 'test_image.jpeg'); const TEST_IMAGE_BASE64 = fs.readFileSync(TEST_IMAGE_PATH).toString('base64'); -async function runTests() { +async function runTestsForModel(family, model) { console.log('='.repeat(60)); - console.log('IMAGE SUPPORT TEST'); + console.log(`IMAGE SUPPORT TEST [${family.toUpperCase()}]`); + console.log(`Model: ${model}`); console.log('Tests image processing with thinking models'); console.log('='.repeat(60)); console.log(''); let allPassed = true; const results = []; + const modelConfig = getModelConfig(family); + const expectThinking = familySupportsThinking(family); // ===== TEST 1: Single image with question ===== console.log('TEST 1: Single image with question'); console.log('-'.repeat(40)); const result1 = await streamRequest({ - model: 'claude-sonnet-4-5-thinking', - max_tokens: 16000, + model, + max_tokens: modelConfig.max_tokens, stream: true, - thinking: { type: 'enabled', budget_tokens: 8000 }, + thinking: modelConfig.thinking, messages: [{ role: 'user', content: [ @@ -55,20 +61,22 @@ async function runTests() { allPassed = false; results.push({ name: 'Single image processing', passed: false }); } else { - const thinking = result1.content.filter(b => b.type === 'thinking'); - const text = result1.content.filter(b => b.type === 'text'); + const content = analyzeContent(result1.content); - console.log(` Thinking: ${thinking.length > 0 ? 'YES' : 'NO'}`); - console.log(` Text response: ${text.length > 0 ? 'YES' : 'NO'}`); + console.log(` Thinking: ${content.hasThinking ? 'YES' : 'NO'}`); + console.log(` Text response: ${content.hasText ? 
'YES' : 'NO'}`); - if (thinking.length > 0) { - console.log(` Thinking: "${thinking[0].thinking?.substring(0, 60)}..."`); + if (content.hasThinking && content.thinking[0].thinking) { + console.log(` Thinking: "${content.thinking[0].thinking.substring(0, 60)}..."`); } - if (text.length > 0) { - console.log(` Response: "${text[0].text?.substring(0, 100)}..."`); + if (content.hasText && content.text[0].text) { + console.log(` Response: "${content.text[0].text.substring(0, 100)}..."`); } - const passed = thinking.length > 0 && text.length > 0; + // For thinking models, expect thinking + text. For others, just text. + const passed = expectThinking + ? (content.hasThinking && content.hasText) + : content.hasText; results.push({ name: 'Single image processing', passed }); if (!passed) allPassed = false; } @@ -78,10 +86,10 @@ async function runTests() { console.log('-'.repeat(40)); const result2 = await streamRequest({ - model: 'claude-sonnet-4-5-thinking', - max_tokens: 16000, + model, + max_tokens: modelConfig.max_tokens, stream: true, - thinking: { type: 'enabled', budget_tokens: 8000 }, + thinking: modelConfig.thinking, messages: [ { role: 'user', @@ -119,24 +127,23 @@ async function runTests() { allPassed = false; results.push({ name: 'Image in multi-turn', passed: false }); } else { - const thinking = result2.content.filter(b => b.type === 'thinking'); - const text = result2.content.filter(b => b.type === 'text'); + const content = analyzeContent(result2.content); - console.log(` Thinking: ${thinking.length > 0 ? 'YES' : 'NO'}`); - console.log(` Text response: ${text.length > 0 ? 'YES' : 'NO'}`); + console.log(` Thinking: ${content.hasThinking ? 'YES' : 'NO'}`); + console.log(` Text response: ${content.hasText ? 
'YES' : 'NO'}`); - if (text.length > 0) { - console.log(` Response: "${text[0].text?.substring(0, 80)}..."`); + if (content.hasText && content.text[0].text) { + console.log(` Response: "${content.text[0].text.substring(0, 80)}..."`); } - const passed = text.length > 0; + const passed = content.hasText; results.push({ name: 'Image in multi-turn', passed }); if (!passed) allPassed = false; } // ===== Summary ===== console.log('\n' + '='.repeat(60)); - console.log('SUMMARY'); + console.log(`SUMMARY [${family.toUpperCase()}]`); console.log('='.repeat(60)); for (const result of results) { @@ -145,7 +152,26 @@ async function runTests() { } console.log('\n' + '='.repeat(60)); - console.log(`OVERALL: ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`); + console.log(`[${family.toUpperCase()}] ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`); + console.log('='.repeat(60)); + + return allPassed; +} + +async function runTests() { + const models = getTestModels(); + let allPassed = true; + + for (const { family, model } of models) { + console.log('\n'); + const passed = await runTestsForModel(family, model); + if (!passed) allPassed = false; + } + + console.log('\n' + '='.repeat(60)); + console.log('FINAL RESULT'); + console.log('='.repeat(60)); + console.log(`Overall: ${allPassed ? 'ALL MODEL FAMILIES PASSED' : 'SOME MODEL FAMILIES FAILED'}`); console.log('='.repeat(60)); process.exit(allPassed ? 0 : 1); diff --git a/tests/test-interleaved-thinking.cjs b/tests/test-interleaved-thinking.cjs index 5216c7a..620480f 100644 --- a/tests/test-interleaved-thinking.cjs +++ b/tests/test-interleaved-thinking.cjs @@ -8,32 +8,38 @@ * * This simulates complex Claude Code scenarios where the model * thinks multiple times during a single turn. + * + * NOTE: This test is Claude-only. Interleaved thinking requires + * the anthropic-beta header which is specific to Claude thinking models. 
*/ const { streamRequest, commonTools } = require('./helpers/http-client.cjs'); +const { getThinkingModels, getModelConfig } = require('./helpers/test-models.cjs'); // Multiple tools to encourage interleaved thinking const tools = [commonTools.readFile, commonTools.writeFile, commonTools.runTests]; -async function runTests() { +async function runTestsForModel(family, model) { console.log('='.repeat(60)); - console.log('INTERLEAVED THINKING TEST'); + console.log(`INTERLEAVED THINKING TEST [${family.toUpperCase()}]`); + console.log(`Model: ${model}`); console.log('Tests complex multi-step reasoning with tools'); console.log('='.repeat(60)); console.log(''); let allPassed = true; const results = []; + const modelConfig = getModelConfig(family); // ===== TEST 1: Complex task requiring multiple steps ===== console.log('TEST 1: Complex task - read, modify, write, test'); console.log('-'.repeat(40)); const result = await streamRequest({ - model: 'claude-opus-4-5-thinking', - max_tokens: 32000, + model, + max_tokens: modelConfig.max_tokens, stream: true, tools, - thinking: { type: 'enabled', budget_tokens: 16000 }, + thinking: modelConfig.thinking, messages: [{ role: 'user', content: `I need you to: @@ -92,11 +98,11 @@ Please do this step by step, reading each file before modifying.` const toolUseBlock = result.content.find(b => b.type === 'tool_use'); const result2 = await streamRequest({ - model: 'claude-opus-4-5-thinking', - max_tokens: 32000, + model, + max_tokens: modelConfig.max_tokens, stream: true, tools, - thinking: { type: 'enabled', budget_tokens: 16000 }, + thinking: modelConfig.thinking, messages: [ { role: 'user', @@ -147,7 +153,7 @@ Please do this step by step, reading each file before modifying.` // ===== Summary ===== console.log('\n' + '='.repeat(60)); - console.log('SUMMARY'); + console.log(`SUMMARY [${family.toUpperCase()}]`); console.log('='.repeat(60)); for (const result of results) { @@ -156,7 +162,27 @@ Please do this step by step, reading each file 
before modifying.` } console.log('\n' + '='.repeat(60)); - console.log(`OVERALL: ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`); + console.log(`[${family.toUpperCase()}] ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`); + console.log('='.repeat(60)); + + return allPassed; +} + +async function runTests() { + // Interleaved thinking is Claude-only (requires anthropic-beta header) + const models = getThinkingModels(['gemini']); + let allPassed = true; + + for (const { family, model } of models) { + console.log('\n'); + const passed = await runTestsForModel(family, model); + if (!passed) allPassed = false; + } + + console.log('\n' + '='.repeat(60)); + console.log('FINAL RESULT'); + console.log('='.repeat(60)); + console.log(`Overall: ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`); console.log('='.repeat(60)); process.exit(allPassed ? 0 : 1); diff --git a/tests/test-multiturn-thinking-tools-streaming.cjs b/tests/test-multiturn-thinking-tools-streaming.cjs index 2aa53d0..6f14dd3 100644 --- a/tests/test-multiturn-thinking-tools-streaming.cjs +++ b/tests/test-multiturn-thinking-tools-streaming.cjs @@ -6,14 +6,18 @@ * - SSE events are properly formatted * - signature_delta events are present * - Thinking blocks accumulate correctly across deltas + * + * Runs for both Claude and Gemini model families. 
*/ const { streamRequest, analyzeContent, analyzeEvents, commonTools } = require('./helpers/http-client.cjs'); +const { getTestModels, getModelConfig } = require('./helpers/test-models.cjs'); const tools = [commonTools.executeCommand]; -async function runTests() { +async function runTestsForModel(family, model) { console.log('='.repeat(60)); - console.log('MULTI-TURN TOOL CALL TEST (STREAMING)'); + console.log(`MULTI-TURN TOOL CALL TEST [${family.toUpperCase()}]`); + console.log(`Model: ${model}`); console.log('Simulates Claude Code streaming conversation'); console.log('='.repeat(60)); console.log(''); @@ -21,6 +25,7 @@ async function runTests() { let messages = []; let allPassed = true; const results = []; + const modelConfig = getModelConfig(family); // ===== TURN 1: Initial request ===== console.log('TURN 1: User asks to run a command'); @@ -32,11 +37,11 @@ async function runTests() { }); const turn1 = await streamRequest({ - model: 'claude-sonnet-4-5-thinking', - max_tokens: 16000, + model, + max_tokens: modelConfig.max_tokens, stream: true, tools, - thinking: { type: 'enabled', budget_tokens: 10000 }, + thinking: modelConfig.thinking, messages }); @@ -50,7 +55,7 @@ async function runTests() { console.log(' Content:'); console.log(` Thinking: ${content.hasThinking ? 'YES' : 'NO'} (${content.thinking.length} blocks)`); - console.log(` Signature: ${content.thinkingHasSignature ? 'YES' : 'NO'}`); + console.log(` Signature: ${content.hasSignature ? 'YES' : 'NO'}`); console.log(` Tool Use: ${content.hasToolUse ? 
'YES' : 'NO'} (${content.toolUse.length} calls)`); console.log(' Events:'); @@ -67,9 +72,11 @@ async function runTests() { console.log(` Tool: ${content.toolUse[0].name}(${JSON.stringify(content.toolUse[0].input)})`); } - const passed = content.hasThinking && content.thinkingHasSignature && - events.signatureDeltas > 0 && content.hasToolUse; - results.push({ name: 'Turn 1: Thinking + Signature + Tool Use + Events', passed }); + // For Claude: signature is on thinking block and comes via signature_delta events + // For Gemini: signature is on tool_use block (no signature_delta events) + const hasSignature = content.hasSignature || events.signatureDeltas > 0; + const passed = content.hasThinking && hasSignature && content.hasToolUse; + results.push({ name: 'Turn 1: Thinking + Signature + Tool Use', passed }); if (!passed) allPassed = false; if (content.hasToolUse) { @@ -101,11 +108,11 @@ drwxr-xr-x 4 user staff 128 Dec 19 10:00 tests` }); const turn2 = await streamRequest({ - model: 'claude-sonnet-4-5-thinking', - max_tokens: 16000, + model, + max_tokens: modelConfig.max_tokens, stream: true, tools, - thinking: { type: 'enabled', budget_tokens: 10000 }, + thinking: modelConfig.thinking, messages }); @@ -119,7 +126,7 @@ drwxr-xr-x 4 user staff 128 Dec 19 10:00 tests` console.log(' Content:'); console.log(` Thinking: ${content.hasThinking ? 'YES' : 'NO'} (${content.thinking.length} blocks)`); - console.log(` Signature: ${content.thinkingHasSignature ? 'YES' : 'NO'}`); + console.log(` Signature: ${content.hasSignature ? 'YES' : 'NO'}`); console.log(` Text: ${content.hasText ? 
'YES' : 'NO'}`); console.log(' Events:'); @@ -139,7 +146,7 @@ drwxr-xr-x 4 user staff 128 Dec 19 10:00 tests` // ===== Summary ===== console.log('\n' + '='.repeat(60)); - console.log('SUMMARY'); + console.log(`SUMMARY [${family.toUpperCase()}]`); console.log('='.repeat(60)); for (const result of results) { @@ -148,7 +155,26 @@ drwxr-xr-x 4 user staff 128 Dec 19 10:00 tests` } console.log('\n' + '='.repeat(60)); - console.log(`OVERALL: ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`); + console.log(`[${family.toUpperCase()}] ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`); + console.log('='.repeat(60)); + + return allPassed; +} + +async function runTests() { + const models = getTestModels(); + let allPassed = true; + + for (const { family, model } of models) { + console.log('\n'); + const passed = await runTestsForModel(family, model); + if (!passed) allPassed = false; + } + + console.log('\n' + '='.repeat(60)); + console.log('FINAL RESULT'); + console.log('='.repeat(60)); + console.log(`Overall: ${allPassed ? 'ALL MODEL FAMILIES PASSED' : 'SOME MODEL FAMILIES FAILED'}`); console.log('='.repeat(60)); process.exit(allPassed ? 0 : 1); diff --git a/tests/test-multiturn-thinking-tools.cjs b/tests/test-multiturn-thinking-tools.cjs index e479d91..0a38adb 100644 --- a/tests/test-multiturn-thinking-tools.cjs +++ b/tests/test-multiturn-thinking-tools.cjs @@ -11,14 +11,18 @@ * - Thinking blocks with signatures are preserved across turns * - Tool use/result flow works correctly * - Interleaved thinking with tools + * + * Runs for both Claude and Gemini model families. 
*/ const { makeRequest, analyzeContent, commonTools } = require('./helpers/http-client.cjs'); +const { getTestModels, getModelConfig, familySupportsThinking } = require('./helpers/test-models.cjs'); const tools = [commonTools.searchFiles, commonTools.readFile]; -async function runTests() { +async function runTestsForModel(family, model) { console.log('='.repeat(60)); - console.log('MULTI-TURN TOOL CALL TEST (NON-STREAMING)'); + console.log(`MULTI-TURN TOOL CALL TEST [${family.toUpperCase()}]`); + console.log(`Model: ${model}`); console.log('Simulates Claude Code conversation pattern'); console.log('='.repeat(60)); console.log(''); @@ -26,6 +30,8 @@ async function runTests() { let messages = []; let allPassed = true; const results = []; + const modelConfig = getModelConfig(family); + const expectThinking = familySupportsThinking(family); // ===== TURN 1: Initial request ===== console.log('TURN 1: User asks to find and read a config file'); @@ -37,11 +43,11 @@ async function runTests() { }); const turn1 = await makeRequest({ - model: 'claude-sonnet-4-5-thinking', - max_tokens: 16000, + model, + max_tokens: modelConfig.max_tokens, stream: false, tools, - thinking: { type: 'enabled', budget_tokens: 10000 }, + thinking: modelConfig.thinking, messages }); @@ -52,7 +58,7 @@ async function runTests() { } else { const analysis = analyzeContent(turn1.content || []); console.log(` Thinking: ${analysis.hasThinking ? 'YES' : 'NO'} (${analysis.thinking.length} blocks)`); - console.log(` Signature: ${analysis.thinkingHasSignature ? 'YES' : 'NO'}`); + console.log(` Signature: ${analysis.hasSignature ? 'YES' : 'NO'}`); console.log(` Tool Use: ${analysis.hasToolUse ? 'YES' : 'NO'} (${analysis.toolUse.length} calls)`); console.log(` Text: ${analysis.hasText ? 
'YES' : 'NO'}`); @@ -63,7 +69,11 @@ async function runTests() { console.log(` Tool: ${analysis.toolUse[0].name}(${JSON.stringify(analysis.toolUse[0].input)})`); } - const passed = analysis.hasThinking && analysis.thinkingHasSignature && analysis.hasToolUse; + // For thinking models, expect thinking + signature + tool use + // For non-thinking models, just expect tool use + const passed = expectThinking + ? (analysis.hasThinking && analysis.hasSignature && analysis.hasToolUse) + : analysis.hasToolUse; results.push({ name: 'Turn 1: Thinking + Signature + Tool Use', passed }); if (!passed) allPassed = false; @@ -91,11 +101,11 @@ async function runTests() { }); const turn2 = await makeRequest({ - model: 'claude-sonnet-4-5-thinking', - max_tokens: 16000, + model, + max_tokens: modelConfig.max_tokens, stream: false, tools, - thinking: { type: 'enabled', budget_tokens: 10000 }, + thinking: modelConfig.thinking, messages }); @@ -106,7 +116,7 @@ async function runTests() { } else { const analysis = analyzeContent(turn2.content || []); console.log(` Thinking: ${analysis.hasThinking ? 'YES' : 'NO'} (${analysis.thinking.length} blocks)`); - console.log(` Signature: ${analysis.thinkingHasSignature ? 'YES' : 'NO'}`); + console.log(` Signature: ${analysis.hasSignature ? 'YES' : 'NO'}`); console.log(` Tool Use: ${analysis.hasToolUse ? 'YES' : 'NO'} (${analysis.toolUse.length} calls)`); console.log(` Text: ${analysis.hasText ? 'YES' : 'NO'}`); @@ -118,7 +128,9 @@ async function runTests() { } // Either tool use (to read file) or text response is acceptable - const passed = analysis.hasThinking && (analysis.hasToolUse || analysis.hasText); + const passed = expectThinking + ? 
(analysis.hasThinking && (analysis.hasToolUse || analysis.hasText)) + : (analysis.hasToolUse || analysis.hasText); results.push({ name: 'Turn 2: Thinking + (Tool or Text)', passed }); if (!passed) allPassed = false; @@ -155,11 +167,11 @@ async function runTests() { }); const turn3 = await makeRequest({ - model: 'claude-sonnet-4-5-thinking', - max_tokens: 16000, + model, + max_tokens: modelConfig.max_tokens, stream: false, tools, - thinking: { type: 'enabled', budget_tokens: 10000 }, + thinking: modelConfig.thinking, messages }); @@ -168,19 +180,23 @@ async function runTests() { allPassed = false; results.push({ name: 'Turn 3: Final response', passed: false }); } else { - const analysis = analyzeContent(turn3.content || []); + const analysis = analyzeContent(turn3.content || []); console.log(` Thinking: ${analysis.hasThinking ? 'YES' : 'NO'} (${analysis.thinking.length} blocks)`); - console.log(` Signature: ${analysis.thinkingHasSignature ? 'YES' : 'NO'}`); + console.log(` Signature: ${analysis.hasSignature ? 'YES' : 'NO'}`); + console.log(` Tool Use: ${analysis.hasToolUse ? 'YES' : 'NO'} (${analysis.toolUse.length} calls)`); console.log(` Text: ${analysis.hasText ? 'YES' : 'NO'}`); if (analysis.hasText && analysis.text[0].text) { console.log(` Response: "${analysis.text[0].text.substring(0, 100)}..."`); } + if (analysis.hasToolUse) { + console.log(` Tool: ${analysis.toolUse[0].name}(${JSON.stringify(analysis.toolUse[0].input)})`); + } - // Thinking is optional for final responses - model may skip it for simple tasks - const passed = analysis.hasText; - const thinkingNote = analysis.hasThinking ? ' (with thinking)' : ' (no thinking - normal for simple tasks)'; - results.push({ name: 'Turn 3: Text response' + thinkingNote, passed }); + // For final turn: expect text OR another tool call (model may need more info) + const passed = analysis.hasText || analysis.hasToolUse; + const responseType = analysis.hasText ? 'text' : (analysis.hasToolUse ? 
'tool_use' : 'none'); + results.push({ name: `Turn 3: Response (${responseType})`, passed }); if (!passed) allPassed = false; } } @@ -188,7 +204,7 @@ async function runTests() { // ===== Summary ===== console.log('\n' + '='.repeat(60)); - console.log('SUMMARY'); + console.log(`SUMMARY [${family.toUpperCase()}]`); console.log('='.repeat(60)); for (const result of results) { @@ -197,7 +213,26 @@ async function runTests() { } console.log('\n' + '='.repeat(60)); - console.log(`OVERALL: ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`); + console.log(`[${family.toUpperCase()}] ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`); + console.log('='.repeat(60)); + + return allPassed; +} + +async function runTests() { + const models = getTestModels(); + let allPassed = true; + + for (const { family, model } of models) { + console.log('\n'); + const passed = await runTestsForModel(family, model); + if (!passed) allPassed = false; + } + + console.log('\n' + '='.repeat(60)); + console.log('FINAL RESULT'); + console.log('='.repeat(60)); + console.log(`Overall: ${allPassed ? 'ALL MODEL FAMILIES PASSED' : 'SOME MODEL FAMILIES FAILED'}`); console.log('='.repeat(60)); process.exit(allPassed ? 0 : 1); diff --git a/tests/test-thinking-signatures.cjs b/tests/test-thinking-signatures.cjs index fb1fcb1..606f4ee 100644 --- a/tests/test-thinking-signatures.cjs +++ b/tests/test-thinking-signatures.cjs @@ -6,20 +6,28 @@ * * Claude Code sends assistant messages with thinking blocks that include signatures. * These signatures must be preserved and sent back to the API. + * + * Note: Claude puts signatures on thinking blocks, Gemini 3+ puts them on tool_use blocks. + * + * Runs for both Claude and Gemini model families. 
*/ -const { streamRequest, commonTools } = require('./helpers/http-client.cjs'); +const { streamRequest, analyzeContent, commonTools } = require('./helpers/http-client.cjs'); +const { getThinkingModels, getModelConfig, familySupportsThinking } = require('./helpers/test-models.cjs'); const tools = [commonTools.getWeather]; -async function runTests() { +async function runTestsForModel(family, model) { console.log('='.repeat(60)); - console.log('THINKING SIGNATURE TEST'); + console.log(`THINKING SIGNATURE TEST [${family.toUpperCase()}]`); + console.log(`Model: ${model}`); console.log('Simulates Claude Code multi-turn with thinking blocks'); console.log('='.repeat(60)); console.log(''); let allPassed = true; const results = []; + const modelConfig = getModelConfig(family); + const expectThinking = familySupportsThinking(family); // ===== TEST 1: First turn - get thinking block with signature ===== console.log('TEST 1: Initial request with thinking model'); @@ -30,35 +38,34 @@ async function runTests() { ]; const turn1Result = await streamRequest({ - model: 'claude-sonnet-4-5-thinking', - max_tokens: 16000, + model, + max_tokens: modelConfig.max_tokens, stream: true, tools, - thinking: { type: 'enabled', budget_tokens: 10000 }, + thinking: modelConfig.thinking, messages: turn1Messages }); - const turn1Thinking = turn1Result.content.filter(b => b.type === 'thinking'); - const turn1ToolUse = turn1Result.content.filter(b => b.type === 'tool_use'); - const turn1Text = turn1Result.content.filter(b => b.type === 'text'); + const content = analyzeContent(turn1Result.content); - console.log(` Thinking blocks: ${turn1Thinking.length}`); - console.log(` Tool use blocks: ${turn1ToolUse.length}`); - console.log(` Text blocks: ${turn1Text.length}`); + console.log(` Thinking blocks: ${content.thinking.length}`); + console.log(` Tool use blocks: ${content.toolUse.length}`); + console.log(` Text blocks: ${content.text.length}`); - // Check thinking has signature - let turn1HasSignature 
= false; - if (turn1Thinking.length > 0) { - const sig = turn1Thinking[0].signature || ''; - turn1HasSignature = sig.length >= 50; - console.log(` Signature length: ${sig.length} chars`); - console.log(` Signature present: ${turn1HasSignature ? 'YES' : 'NO'}`); - if (turn1Thinking[0].thinking) { - console.log(` Thinking preview: "${turn1Thinking[0].thinking.substring(0, 80)}..."`); - } + // Check signatures - Claude puts them on thinking blocks, Gemini on tool_use blocks + console.log(` Thinking signature: ${content.thinkingHasSignature ? 'YES' : 'NO'}`); + console.log(` Tool use signature: ${content.toolUseHasSignature ? 'YES' : 'NO'}`); + console.log(` Has signature (combined): ${content.hasSignature ? 'YES' : 'NO'}`); + + if (content.hasThinking && content.thinking[0].thinking) { + console.log(` Thinking preview: "${content.thinking[0].thinking.substring(0, 80)}..."`); } - const test1Pass = turn1Thinking.length > 0 && turn1HasSignature && turn1ToolUse.length > 0; + // For models that support thinking, expect thinking + signature (somewhere) + tool use + // For models that don't, accept either tool use or a text response + const test1Pass = expectThinking + ? (content.hasThinking && content.hasSignature && content.hasToolUse) + : (content.hasToolUse || content.hasText); results.push({ name: 'Turn 1: Thinking + Signature + Tool Use', passed: test1Pass }); console.log(` Result: ${test1Pass ? 
'PASS' : 'FAIL'}`); if (!test1Pass) allPassed = false; @@ -67,18 +74,22 @@ async function runTests() { console.log('\nTEST 2: Multi-turn with thinking signature in assistant message'); console.log('-'.repeat(40)); - if (!turn1ToolUse.length) { + if (!content.hasToolUse) { console.log(' SKIPPED - No tool use in turn 1'); results.push({ name: 'Turn 2: Multi-turn with signature', passed: false, skipped: true }); } else { // Build assistant message with thinking (including signature) - this is how Claude Code sends it const assistantContent = turn1Result.content; - // Verify the thinking block has signature before sending + // Log what we're sending back const thinkingInAssistant = assistantContent.find(b => b.type === 'thinking'); + const toolUseInAssistant = assistantContent.find(b => b.type === 'tool_use'); if (thinkingInAssistant) { console.log(` Sending thinking with signature: ${(thinkingInAssistant.signature || '').length} chars`); } + if (toolUseInAssistant && toolUseInAssistant.thoughtSignature) { + console.log(` Sending tool_use with thoughtSignature: ${toolUseInAssistant.thoughtSignature.length} chars`); + } const turn2Messages = [ ...turn1Messages, @@ -87,26 +98,25 @@ async function runTests() { role: 'user', content: [{ type: 'tool_result', - tool_use_id: turn1ToolUse[0].id, + tool_use_id: content.toolUse[0].id, content: 'The weather in Paris is 18°C and sunny.' 
}] } ]; const turn2Result = await streamRequest({ - model: 'claude-sonnet-4-5-thinking', - max_tokens: 16000, + model, + max_tokens: modelConfig.max_tokens, stream: true, tools, - thinking: { type: 'enabled', budget_tokens: 10000 }, + thinking: modelConfig.thinking, messages: turn2Messages }); - const turn2Thinking = turn2Result.content.filter(b => b.type === 'thinking'); - const turn2Text = turn2Result.content.filter(b => b.type === 'text'); + const turn2Content = analyzeContent(turn2Result.content); - console.log(` Thinking blocks: ${turn2Thinking.length}`); - console.log(` Text blocks: ${turn2Text.length}`); + console.log(` Thinking blocks: ${turn2Content.thinking.length}`); + console.log(` Text blocks: ${turn2Content.text.length}`); // Check for errors const hasError = turn2Result.events.some(e => e.type === 'error'); @@ -115,26 +125,22 @@ async function runTests() { console.log(` ERROR: ${errorEvent?.data?.error?.message || 'Unknown error'}`); } - if (turn2Thinking.length > 0) { - const sig = turn2Thinking[0].signature || ''; - console.log(` New signature length: ${sig.length} chars`); - if (turn2Thinking[0].thinking) { - console.log(` Thinking preview: "${turn2Thinking[0].thinking.substring(0, 80)}..."`); - } + if (turn2Content.hasThinking && turn2Content.thinking[0].thinking) { + console.log(` Thinking preview: "${turn2Content.thinking[0].thinking.substring(0, 80)}..."`); } - if (turn2Text.length > 0 && turn2Text[0].text) { - console.log(` Response: "${turn2Text[0].text.substring(0, 100)}..."`); + if (turn2Content.hasText && turn2Content.text[0].text) { + console.log(` Response: "${turn2Content.text[0].text.substring(0, 100)}..."`); } - const test2Pass = !hasError && (turn2Thinking.length > 0 || turn2Text.length > 0); + const test2Pass = !hasError && (turn2Content.hasThinking || turn2Content.hasText); results.push({ name: 'Turn 2: Multi-turn with signature', passed: test2Pass }); console.log(` Result: ${test2Pass ? 
'PASS' : 'FAIL'}`); if (!test2Pass) allPassed = false; } // ===== TEST 3: Verify signature_delta events in stream ===== - console.log('\nTEST 3: Verify signature_delta events in stream'); + console.log('\nTEST 3: Verify signature events in stream'); console.log('-'.repeat(40)); const signatureDeltas = turn1Result.events.filter( @@ -147,14 +153,18 @@ async function runTests() { console.log(` Total signature length from deltas: ${totalSigLength} chars`); } - const test3Pass = signatureDeltas.length > 0; - results.push({ name: 'signature_delta events present', passed: test3Pass }); + // For Claude: signature_delta events should be present + // For Gemini: signature is attached to tool_use block directly, may not have signature_delta events + const test3Pass = expectThinking + ? (signatureDeltas.length > 0 || content.toolUseHasSignature) + : true; + results.push({ name: 'Signature present (delta or on tool_use)', passed: test3Pass }); console.log(` Result: ${test3Pass ? 'PASS' : 'FAIL'}`); if (!test3Pass) allPassed = false; // ===== Summary ===== console.log('\n' + '='.repeat(60)); - console.log('SUMMARY'); + console.log(`SUMMARY [${family.toUpperCase()}]`); console.log('='.repeat(60)); for (const result of results) { @@ -163,7 +173,26 @@ async function runTests() { } console.log('\n' + '='.repeat(60)); - console.log(`OVERALL: ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`); + console.log(`[${family.toUpperCase()}] ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`); + console.log('='.repeat(60)); + + return allPassed; +} + +async function runTests() { + const models = getThinkingModels(); + let allPassed = true; + + for (const { family, model } of models) { + console.log('\n'); + const passed = await runTestsForModel(family, model); + if (!passed) allPassed = false; + } + + console.log('\n' + '='.repeat(60)); + console.log('FINAL RESULT'); + console.log('='.repeat(60)); + console.log(`Overall: ${allPassed ? 
'ALL MODEL FAMILIES PASSED' : 'SOME MODEL FAMILIES FAILED'}`); console.log('='.repeat(60)); process.exit(allPassed ? 0 : 1);