Added support for Gemini models

This commit is contained in:
Badri Narayanan S
2025-12-27 14:09:20 +05:30
parent 9b7dcf3a6c
commit c1e1dbb0ef
13 changed files with 641 additions and 176 deletions

View File

@@ -147,6 +147,12 @@ function analyzeContent(content) {
const toolUse = content.filter(b => b.type === 'tool_use');
const text = content.filter(b => b.type === 'text');
// Check for signatures in thinking blocks (Claude style)
const thinkingHasSignature = thinking.some(t => t.signature && t.signature.length >= 50);
// Check for signatures in tool_use blocks (Gemini 3+ style)
const toolUseHasSignature = toolUse.some(t => t.thoughtSignature && t.thoughtSignature.length >= 50);
return {
thinking,
toolUse,
@@ -154,7 +160,10 @@ function analyzeContent(content) {
hasThinking: thinking.length > 0,
hasToolUse: toolUse.length > 0,
hasText: text.length > 0,
thinkingHasSignature: thinking.some(t => t.signature && t.signature.length >= 50)
thinkingHasSignature: thinkingHasSignature,
toolUseHasSignature: toolUseHasSignature,
// Combined check: signature exists somewhere (thinking or tool_use)
hasSignature: thinkingHasSignature || toolUseHasSignature
};
}

View File

@@ -0,0 +1,87 @@
/**
* Test Models Configuration
*
* Provides model configuration for parameterized testing across
* multiple model families (Claude and Gemini).
*/
// Default model to exercise for each model family, keyed by family name.
// getTestModels() walks these entries in key order, so the order here is the
// order in which families are tested.
const TEST_MODELS = {
claude: 'claude-sonnet-4-5-thinking',
gemini: 'gemini-3-flash'
};
// Default thinking model for each family, keyed by family name.
// Read by getThinkingModels(); the values currently match TEST_MODELS.
const THINKING_MODELS = {
claude: 'claude-sonnet-4-5-thinking',
gemini: 'gemini-3-flash'
};
/**
 * List the default test model of every family that was not excluded.
 *
 * Results follow the key order of TEST_MODELS.
 * @param {string[]} excludeFamilies - Family names to leave out (e.g., ['gemini'])
 * @returns {Array<{family: string, model: string}>} One { family, model } pair per remaining family
 */
function getTestModels(excludeFamilies = []) {
  return Object.entries(TEST_MODELS)
    .filter(([family]) => !excludeFamilies.includes(family))
    .map(([family, model]) => ({ family, model }));
}
/**
 * List the default thinking model of every family that was not excluded.
 *
 * Results follow the key order of THINKING_MODELS.
 * @param {string[]} excludeFamilies - Family names to leave out
 * @returns {Array<{family: string, model: string}>} One { family, model } pair per remaining family
 */
function getThinkingModels(excludeFamilies = []) {
  const selected = [];
  for (const family of Object.keys(THINKING_MODELS)) {
    // Guard clause: skip any family the caller asked to exclude.
    if (excludeFamilies.includes(family)) continue;
    selected.push({ family, model: THINKING_MODELS[family] });
  }
  return selected;
}
/**
 * Report whether tests should expect thinking output from a model family.
 *
 * Only the exact family names 'claude' and 'gemini' count; any other value
 * (including different casing or undefined) yields false.
 * @param {string} family - Model family name
 * @returns {boolean} True if thinking is expected
 */
function familySupportsThinking(family) {
  switch (family) {
    case 'claude':
    case 'gemini':
      return true;
    default:
      return false;
  }
}
/**
 * Get model-specific request overrides for a model family.
 *
 * A fresh object is returned on every call, so callers may mutate the result.
 * In every configuration the thinking budget stays below max_tokens: the
 * budget is spent out of the output token allowance, so a budget at or above
 * the cap is an invalid request.
 * @param {string} family - Model family name
 * @returns {{max_tokens: number, thinking: {type: string, budget_tokens: number}}}
 *   Configuration overrides for the model family; any family other than
 *   'gemini' gets the default (Claude) settings.
 */
function getModelConfig(family) {
  if (family === 'gemini') {
    return {
      // Gemini runs use a smaller output cap than the default config.
      max_tokens: 8000,
      // Budget reduced from 10000 so it fits inside the 8000-token cap.
      thinking: { type: 'enabled', budget_tokens: 4000 }
    };
  }

  return {
    max_tokens: 16000,
    thinking: { type: 'enabled', budget_tokens: 10000 }
  };
}
// CommonJS exports: the per-family model tables plus the helpers that read them.
module.exports = {
TEST_MODELS,
THINKING_MODELS,
getTestModels,
getThinkingModels,
familySupportsThinking,
getModelConfig
};

View File

@@ -5,23 +5,28 @@
* - Session ID is stable across turns (derived from first user message)
* - cache_read_input_tokens is returned in usage metadata
* - Second turn in same conversation should hit cache
*
* Runs for both Claude and Gemini model families.
*/
const { streamRequest, analyzeContent, extractUsage } = require('./helpers/http-client.cjs');
const { getTestModels, getModelConfig } = require('./helpers/test-models.cjs');
// Large system prompt to exceed 1024 token minimum for caching
// This matches the format used in the working direct API test (~36KB)
const LARGE_SYSTEM_PROMPT = 'You are an expert software engineer. Here is important context:\n' +
'// Large codebase file content line\n'.repeat(1000);
async function runTests() {
async function runTestsForModel(family, model) {
console.log('='.repeat(60));
console.log('PROMPT CACHING TEST (STREAMING)');
console.log(`PROMPT CACHING TEST [${family.toUpperCase()}]`);
console.log(`Model: ${model}`);
console.log('Verifies session ID stability and cache token reporting');
console.log('='.repeat(60));
console.log('');
let allPassed = true;
const results = [];
const modelConfig = getModelConfig(family);
// ===== TURN 1: Initial request =====
console.log('TURN 1: Initial request (establishes cache)');
@@ -35,11 +40,11 @@ async function runTests() {
];
const turn1 = await streamRequest({
model: 'claude-sonnet-4-5-thinking',
max_tokens: 16000,
model,
max_tokens: modelConfig.max_tokens,
stream: true,
system: LARGE_SYSTEM_PROMPT,
thinking: { type: 'enabled', budget_tokens: 5000 },
thinking: modelConfig.thinking,
messages: turn1Messages
});
@@ -89,11 +94,11 @@ async function runTests() {
];
const turn2 = await streamRequest({
model: 'claude-sonnet-4-5-thinking',
max_tokens: 16000,
model,
max_tokens: modelConfig.max_tokens,
stream: true,
system: LARGE_SYSTEM_PROMPT,
thinking: { type: 'enabled', budget_tokens: 5000 },
thinking: modelConfig.thinking,
messages: turn2Messages
});
@@ -143,7 +148,7 @@ async function runTests() {
// ===== Summary =====
console.log('\n' + '='.repeat(60));
console.log('SUMMARY');
console.log(`SUMMARY [${family.toUpperCase()}]`);
console.log('='.repeat(60));
for (const result of results) {
@@ -156,7 +161,7 @@ async function runTests() {
}
console.log('\n' + '='.repeat(60));
console.log(`OVERALL: ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`);
console.log(`[${family.toUpperCase()}] ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`);
console.log('='.repeat(60));
console.log('\nNote: Cache effectiveness depends on:');
@@ -164,6 +169,25 @@ async function runTests() {
console.log(' 2. Sticky account selection (same account across turns)');
console.log(' 3. API-side cache availability (may take time to populate)');
return allPassed;
}
/**
 * Run the per-model test suite once for every configured model family,
 * sequentially, then print a final banner and terminate the process.
 *
 * Exits with code 0 when every family passed and 1 otherwise.
 */
async function runTests() {
  const separator = '='.repeat(60);
  let everyFamilyPassed = true;

  // Families are awaited one at a time so their console output never interleaves.
  for (const entry of getTestModels()) {
    console.log('\n');
    const familyPassed = await runTestsForModel(entry.family, entry.model);
    if (!familyPassed) {
      everyFamilyPassed = false;
    }
  }

  const verdict = everyFamilyPassed ? 'ALL MODEL FAMILIES PASSED' : 'SOME MODEL FAMILIES FAILED';
  console.log('\n' + separator);
  console.log('FINAL RESULT');
  console.log(separator);
  console.log(`Overall: ${verdict}`);
  console.log(separator);

  process.exit(everyFamilyPassed ? 0 : 1);
}

View File

@@ -3,34 +3,40 @@
*
* Tests that images can be sent to the API with thinking models.
* Simulates Claude Code sending screenshots or images for analysis.
*
* Runs for both Claude and Gemini model families.
*/
const fs = require('fs');
const path = require('path');
const { streamRequest } = require('./helpers/http-client.cjs');
const { streamRequest, analyzeContent } = require('./helpers/http-client.cjs');
const { getTestModels, getModelConfig, familySupportsThinking } = require('./helpers/test-models.cjs');
// Load test image from disk
const TEST_IMAGE_PATH = path.join(__dirname, 'utils', 'test_image.jpeg');
const TEST_IMAGE_BASE64 = fs.readFileSync(TEST_IMAGE_PATH).toString('base64');
async function runTests() {
async function runTestsForModel(family, model) {
console.log('='.repeat(60));
console.log('IMAGE SUPPORT TEST');
console.log(`IMAGE SUPPORT TEST [${family.toUpperCase()}]`);
console.log(`Model: ${model}`);
console.log('Tests image processing with thinking models');
console.log('='.repeat(60));
console.log('');
let allPassed = true;
const results = [];
const modelConfig = getModelConfig(family);
const expectThinking = familySupportsThinking(family);
// ===== TEST 1: Single image with question =====
console.log('TEST 1: Single image with question');
console.log('-'.repeat(40));
const result1 = await streamRequest({
model: 'claude-sonnet-4-5-thinking',
max_tokens: 16000,
model,
max_tokens: modelConfig.max_tokens,
stream: true,
thinking: { type: 'enabled', budget_tokens: 8000 },
thinking: modelConfig.thinking,
messages: [{
role: 'user',
content: [
@@ -55,20 +61,22 @@ async function runTests() {
allPassed = false;
results.push({ name: 'Single image processing', passed: false });
} else {
const thinking = result1.content.filter(b => b.type === 'thinking');
const text = result1.content.filter(b => b.type === 'text');
const content = analyzeContent(result1.content);
console.log(` Thinking: ${thinking.length > 0 ? 'YES' : 'NO'}`);
console.log(` Text response: ${text.length > 0 ? 'YES' : 'NO'}`);
console.log(` Thinking: ${content.hasThinking ? 'YES' : 'NO'}`);
console.log(` Text response: ${content.hasText ? 'YES' : 'NO'}`);
if (thinking.length > 0) {
console.log(` Thinking: "${thinking[0].thinking?.substring(0, 60)}..."`);
if (content.hasThinking && content.thinking[0].thinking) {
console.log(` Thinking: "${content.thinking[0].thinking.substring(0, 60)}..."`);
}
if (text.length > 0) {
console.log(` Response: "${text[0].text?.substring(0, 100)}..."`);
if (content.hasText && content.text[0].text) {
console.log(` Response: "${content.text[0].text.substring(0, 100)}..."`);
}
const passed = thinking.length > 0 && text.length > 0;
// For thinking models, expect thinking + text. For others, just text.
const passed = expectThinking
? (content.hasThinking && content.hasText)
: content.hasText;
results.push({ name: 'Single image processing', passed });
if (!passed) allPassed = false;
}
@@ -78,10 +86,10 @@ async function runTests() {
console.log('-'.repeat(40));
const result2 = await streamRequest({
model: 'claude-sonnet-4-5-thinking',
max_tokens: 16000,
model,
max_tokens: modelConfig.max_tokens,
stream: true,
thinking: { type: 'enabled', budget_tokens: 8000 },
thinking: modelConfig.thinking,
messages: [
{
role: 'user',
@@ -119,24 +127,23 @@ async function runTests() {
allPassed = false;
results.push({ name: 'Image in multi-turn', passed: false });
} else {
const thinking = result2.content.filter(b => b.type === 'thinking');
const text = result2.content.filter(b => b.type === 'text');
const content = analyzeContent(result2.content);
console.log(` Thinking: ${thinking.length > 0 ? 'YES' : 'NO'}`);
console.log(` Text response: ${text.length > 0 ? 'YES' : 'NO'}`);
console.log(` Thinking: ${content.hasThinking ? 'YES' : 'NO'}`);
console.log(` Text response: ${content.hasText ? 'YES' : 'NO'}`);
if (text.length > 0) {
console.log(` Response: "${text[0].text?.substring(0, 80)}..."`);
if (content.hasText && content.text[0].text) {
console.log(` Response: "${content.text[0].text.substring(0, 80)}..."`);
}
const passed = text.length > 0;
const passed = content.hasText;
results.push({ name: 'Image in multi-turn', passed });
if (!passed) allPassed = false;
}
// ===== Summary =====
console.log('\n' + '='.repeat(60));
console.log('SUMMARY');
console.log(`SUMMARY [${family.toUpperCase()}]`);
console.log('='.repeat(60));
for (const result of results) {
@@ -145,7 +152,26 @@ async function runTests() {
}
console.log('\n' + '='.repeat(60));
console.log(`OVERALL: ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`);
console.log(`[${family.toUpperCase()}] ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`);
console.log('='.repeat(60));
return allPassed;
}
async function runTests() {
const models = getTestModels();
let allPassed = true;
for (const { family, model } of models) {
console.log('\n');
const passed = await runTestsForModel(family, model);
if (!passed) allPassed = false;
}
console.log('\n' + '='.repeat(60));
console.log('FINAL RESULT');
console.log('='.repeat(60));
console.log(`Overall: ${allPassed ? 'ALL MODEL FAMILIES PASSED' : 'SOME MODEL FAMILIES FAILED'}`);
console.log('='.repeat(60));
process.exit(allPassed ? 0 : 1);

View File

@@ -8,32 +8,38 @@
*
* This simulates complex Claude Code scenarios where the model
* thinks multiple times during a single turn.
*
* NOTE: This test is Claude-only. Interleaved thinking requires
* the anthropic-beta header which is specific to Claude thinking models.
*/
const { streamRequest, commonTools } = require('./helpers/http-client.cjs');
const { getThinkingModels, getModelConfig } = require('./helpers/test-models.cjs');
// Multiple tools to encourage interleaved thinking
const tools = [commonTools.readFile, commonTools.writeFile, commonTools.runTests];
async function runTests() {
async function runTestsForModel(family, model) {
console.log('='.repeat(60));
console.log('INTERLEAVED THINKING TEST');
console.log(`INTERLEAVED THINKING TEST [${family.toUpperCase()}]`);
console.log(`Model: ${model}`);
console.log('Tests complex multi-step reasoning with tools');
console.log('='.repeat(60));
console.log('');
let allPassed = true;
const results = [];
const modelConfig = getModelConfig(family);
// ===== TEST 1: Complex task requiring multiple steps =====
console.log('TEST 1: Complex task - read, modify, write, test');
console.log('-'.repeat(40));
const result = await streamRequest({
model: 'claude-opus-4-5-thinking',
max_tokens: 32000,
model,
max_tokens: modelConfig.max_tokens,
stream: true,
tools,
thinking: { type: 'enabled', budget_tokens: 16000 },
thinking: modelConfig.thinking,
messages: [{
role: 'user',
content: `I need you to:
@@ -92,11 +98,11 @@ Please do this step by step, reading each file before modifying.`
const toolUseBlock = result.content.find(b => b.type === 'tool_use');
const result2 = await streamRequest({
model: 'claude-opus-4-5-thinking',
max_tokens: 32000,
model,
max_tokens: modelConfig.max_tokens,
stream: true,
tools,
thinking: { type: 'enabled', budget_tokens: 16000 },
thinking: modelConfig.thinking,
messages: [
{
role: 'user',
@@ -147,7 +153,7 @@ Please do this step by step, reading each file before modifying.`
// ===== Summary =====
console.log('\n' + '='.repeat(60));
console.log('SUMMARY');
console.log(`SUMMARY [${family.toUpperCase()}]`);
console.log('='.repeat(60));
for (const result of results) {
@@ -156,7 +162,27 @@ Please do this step by step, reading each file before modifying.`
}
console.log('\n' + '='.repeat(60));
console.log(`OVERALL: ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`);
console.log(`[${family.toUpperCase()}] ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`);
console.log('='.repeat(60));
return allPassed;
}
async function runTests() {
// Interleaved thinking is Claude-only (requires anthropic-beta header)
const models = getThinkingModels(['gemini']);
let allPassed = true;
for (const { family, model } of models) {
console.log('\n');
const passed = await runTestsForModel(family, model);
if (!passed) allPassed = false;
}
console.log('\n' + '='.repeat(60));
console.log('FINAL RESULT');
console.log('='.repeat(60));
console.log(`Overall: ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`);
console.log('='.repeat(60));
process.exit(allPassed ? 0 : 1);

View File

@@ -6,14 +6,18 @@
* - SSE events are properly formatted
* - signature_delta events are present
* - Thinking blocks accumulate correctly across deltas
*
* Runs for both Claude and Gemini model families.
*/
const { streamRequest, analyzeContent, analyzeEvents, commonTools } = require('./helpers/http-client.cjs');
const { getTestModels, getModelConfig } = require('./helpers/test-models.cjs');
const tools = [commonTools.executeCommand];
async function runTests() {
async function runTestsForModel(family, model) {
console.log('='.repeat(60));
console.log('MULTI-TURN TOOL CALL TEST (STREAMING)');
console.log(`MULTI-TURN TOOL CALL TEST [${family.toUpperCase()}]`);
console.log(`Model: ${model}`);
console.log('Simulates Claude Code streaming conversation');
console.log('='.repeat(60));
console.log('');
@@ -21,6 +25,7 @@ async function runTests() {
let messages = [];
let allPassed = true;
const results = [];
const modelConfig = getModelConfig(family);
// ===== TURN 1: Initial request =====
console.log('TURN 1: User asks to run a command');
@@ -32,11 +37,11 @@ async function runTests() {
});
const turn1 = await streamRequest({
model: 'claude-sonnet-4-5-thinking',
max_tokens: 16000,
model,
max_tokens: modelConfig.max_tokens,
stream: true,
tools,
thinking: { type: 'enabled', budget_tokens: 10000 },
thinking: modelConfig.thinking,
messages
});
@@ -50,7 +55,7 @@ async function runTests() {
console.log(' Content:');
console.log(` Thinking: ${content.hasThinking ? 'YES' : 'NO'} (${content.thinking.length} blocks)`);
console.log(` Signature: ${content.thinkingHasSignature ? 'YES' : 'NO'}`);
console.log(` Signature: ${content.hasSignature ? 'YES' : 'NO'}`);
console.log(` Tool Use: ${content.hasToolUse ? 'YES' : 'NO'} (${content.toolUse.length} calls)`);
console.log(' Events:');
@@ -67,9 +72,11 @@ async function runTests() {
console.log(` Tool: ${content.toolUse[0].name}(${JSON.stringify(content.toolUse[0].input)})`);
}
const passed = content.hasThinking && content.thinkingHasSignature &&
events.signatureDeltas > 0 && content.hasToolUse;
results.push({ name: 'Turn 1: Thinking + Signature + Tool Use + Events', passed });
// For Claude: signature is on thinking block and comes via signature_delta events
// For Gemini: signature is on tool_use block (no signature_delta events)
const hasSignature = content.hasSignature || events.signatureDeltas > 0;
const passed = content.hasThinking && hasSignature && content.hasToolUse;
results.push({ name: 'Turn 1: Thinking + Signature + Tool Use', passed });
if (!passed) allPassed = false;
if (content.hasToolUse) {
@@ -101,11 +108,11 @@ drwxr-xr-x 4 user staff 128 Dec 19 10:00 tests`
});
const turn2 = await streamRequest({
model: 'claude-sonnet-4-5-thinking',
max_tokens: 16000,
model,
max_tokens: modelConfig.max_tokens,
stream: true,
tools,
thinking: { type: 'enabled', budget_tokens: 10000 },
thinking: modelConfig.thinking,
messages
});
@@ -119,7 +126,7 @@ drwxr-xr-x 4 user staff 128 Dec 19 10:00 tests`
console.log(' Content:');
console.log(` Thinking: ${content.hasThinking ? 'YES' : 'NO'} (${content.thinking.length} blocks)`);
console.log(` Signature: ${content.thinkingHasSignature ? 'YES' : 'NO'}`);
console.log(` Signature: ${content.hasSignature ? 'YES' : 'NO'}`);
console.log(` Text: ${content.hasText ? 'YES' : 'NO'}`);
console.log(' Events:');
@@ -139,7 +146,7 @@ drwxr-xr-x 4 user staff 128 Dec 19 10:00 tests`
// ===== Summary =====
console.log('\n' + '='.repeat(60));
console.log('SUMMARY');
console.log(`SUMMARY [${family.toUpperCase()}]`);
console.log('='.repeat(60));
for (const result of results) {
@@ -148,7 +155,26 @@ drwxr-xr-x 4 user staff 128 Dec 19 10:00 tests`
}
console.log('\n' + '='.repeat(60));
console.log(`OVERALL: ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`);
console.log(`[${family.toUpperCase()}] ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`);
console.log('='.repeat(60));
return allPassed;
}
async function runTests() {
const models = getTestModels();
let allPassed = true;
for (const { family, model } of models) {
console.log('\n');
const passed = await runTestsForModel(family, model);
if (!passed) allPassed = false;
}
console.log('\n' + '='.repeat(60));
console.log('FINAL RESULT');
console.log('='.repeat(60));
console.log(`Overall: ${allPassed ? 'ALL MODEL FAMILIES PASSED' : 'SOME MODEL FAMILIES FAILED'}`);
console.log('='.repeat(60));
process.exit(allPassed ? 0 : 1);

View File

@@ -11,14 +11,18 @@
* - Thinking blocks with signatures are preserved across turns
* - Tool use/result flow works correctly
* - Interleaved thinking with tools
*
* Runs for both Claude and Gemini model families.
*/
const { makeRequest, analyzeContent, commonTools } = require('./helpers/http-client.cjs');
const { getTestModels, getModelConfig, familySupportsThinking } = require('./helpers/test-models.cjs');
const tools = [commonTools.searchFiles, commonTools.readFile];
async function runTests() {
async function runTestsForModel(family, model) {
console.log('='.repeat(60));
console.log('MULTI-TURN TOOL CALL TEST (NON-STREAMING)');
console.log(`MULTI-TURN TOOL CALL TEST [${family.toUpperCase()}]`);
console.log(`Model: ${model}`);
console.log('Simulates Claude Code conversation pattern');
console.log('='.repeat(60));
console.log('');
@@ -26,6 +30,8 @@ async function runTests() {
let messages = [];
let allPassed = true;
const results = [];
const modelConfig = getModelConfig(family);
const expectThinking = familySupportsThinking(family);
// ===== TURN 1: Initial request =====
console.log('TURN 1: User asks to find and read a config file');
@@ -37,11 +43,11 @@ async function runTests() {
});
const turn1 = await makeRequest({
model: 'claude-sonnet-4-5-thinking',
max_tokens: 16000,
model,
max_tokens: modelConfig.max_tokens,
stream: false,
tools,
thinking: { type: 'enabled', budget_tokens: 10000 },
thinking: modelConfig.thinking,
messages
});
@@ -52,7 +58,7 @@ async function runTests() {
} else {
const analysis = analyzeContent(turn1.content || []);
console.log(` Thinking: ${analysis.hasThinking ? 'YES' : 'NO'} (${analysis.thinking.length} blocks)`);
console.log(` Signature: ${analysis.thinkingHasSignature ? 'YES' : 'NO'}`);
console.log(` Signature: ${analysis.hasSignature ? 'YES' : 'NO'}`);
console.log(` Tool Use: ${analysis.hasToolUse ? 'YES' : 'NO'} (${analysis.toolUse.length} calls)`);
console.log(` Text: ${analysis.hasText ? 'YES' : 'NO'}`);
@@ -63,7 +69,11 @@ async function runTests() {
console.log(` Tool: ${analysis.toolUse[0].name}(${JSON.stringify(analysis.toolUse[0].input)})`);
}
const passed = analysis.hasThinking && analysis.thinkingHasSignature && analysis.hasToolUse;
// For thinking models, expect thinking + signature + tool use
// For non-thinking models, just expect tool use
const passed = expectThinking
? (analysis.hasThinking && analysis.hasSignature && analysis.hasToolUse)
: analysis.hasToolUse;
results.push({ name: 'Turn 1: Thinking + Signature + Tool Use', passed });
if (!passed) allPassed = false;
@@ -91,11 +101,11 @@ async function runTests() {
});
const turn2 = await makeRequest({
model: 'claude-sonnet-4-5-thinking',
max_tokens: 16000,
model,
max_tokens: modelConfig.max_tokens,
stream: false,
tools,
thinking: { type: 'enabled', budget_tokens: 10000 },
thinking: modelConfig.thinking,
messages
});
@@ -106,7 +116,7 @@ async function runTests() {
} else {
const analysis = analyzeContent(turn2.content || []);
console.log(` Thinking: ${analysis.hasThinking ? 'YES' : 'NO'} (${analysis.thinking.length} blocks)`);
console.log(` Signature: ${analysis.thinkingHasSignature ? 'YES' : 'NO'}`);
console.log(` Signature: ${analysis.hasSignature ? 'YES' : 'NO'}`);
console.log(` Tool Use: ${analysis.hasToolUse ? 'YES' : 'NO'} (${analysis.toolUse.length} calls)`);
console.log(` Text: ${analysis.hasText ? 'YES' : 'NO'}`);
@@ -118,7 +128,9 @@ async function runTests() {
}
// Either tool use (to read file) or text response is acceptable
const passed = analysis.hasThinking && (analysis.hasToolUse || analysis.hasText);
const passed = expectThinking
? (analysis.hasThinking && (analysis.hasToolUse || analysis.hasText))
: (analysis.hasToolUse || analysis.hasText);
results.push({ name: 'Turn 2: Thinking + (Tool or Text)', passed });
if (!passed) allPassed = false;
@@ -155,11 +167,11 @@ async function runTests() {
});
const turn3 = await makeRequest({
model: 'claude-sonnet-4-5-thinking',
max_tokens: 16000,
model,
max_tokens: modelConfig.max_tokens,
stream: false,
tools,
thinking: { type: 'enabled', budget_tokens: 10000 },
thinking: modelConfig.thinking,
messages
});
@@ -168,19 +180,23 @@ async function runTests() {
allPassed = false;
results.push({ name: 'Turn 3: Final response', passed: false });
} else {
const analysis = analyzeContent(turn3.content || []);
const analysis = analyzeContent(turn3.content || []);
console.log(` Thinking: ${analysis.hasThinking ? 'YES' : 'NO'} (${analysis.thinking.length} blocks)`);
console.log(` Signature: ${analysis.thinkingHasSignature ? 'YES' : 'NO'}`);
console.log(` Signature: ${analysis.hasSignature ? 'YES' : 'NO'}`);
console.log(` Tool Use: ${analysis.hasToolUse ? 'YES' : 'NO'} (${analysis.toolUse.length} calls)`);
console.log(` Text: ${analysis.hasText ? 'YES' : 'NO'}`);
if (analysis.hasText && analysis.text[0].text) {
console.log(` Response: "${analysis.text[0].text.substring(0, 100)}..."`);
}
if (analysis.hasToolUse) {
console.log(` Tool: ${analysis.toolUse[0].name}(${JSON.stringify(analysis.toolUse[0].input)})`);
}
// Thinking is optional for final responses - model may skip it for simple tasks
const passed = analysis.hasText;
const thinkingNote = analysis.hasThinking ? ' (with thinking)' : ' (no thinking - normal for simple tasks)';
results.push({ name: 'Turn 3: Text response' + thinkingNote, passed });
// For final turn: expect text OR another tool call (model may need more info)
const passed = analysis.hasText || analysis.hasToolUse;
const responseType = analysis.hasText ? 'text' : (analysis.hasToolUse ? 'tool_use' : 'none');
results.push({ name: `Turn 3: Response (${responseType})`, passed });
if (!passed) allPassed = false;
}
}
@@ -188,7 +204,7 @@ async function runTests() {
// ===== Summary =====
console.log('\n' + '='.repeat(60));
console.log('SUMMARY');
console.log(`SUMMARY [${family.toUpperCase()}]`);
console.log('='.repeat(60));
for (const result of results) {
@@ -197,7 +213,26 @@ async function runTests() {
}
console.log('\n' + '='.repeat(60));
console.log(`OVERALL: ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`);
console.log(`[${family.toUpperCase()}] ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`);
console.log('='.repeat(60));
return allPassed;
}
async function runTests() {
const models = getTestModels();
let allPassed = true;
for (const { family, model } of models) {
console.log('\n');
const passed = await runTestsForModel(family, model);
if (!passed) allPassed = false;
}
console.log('\n' + '='.repeat(60));
console.log('FINAL RESULT');
console.log('='.repeat(60));
console.log(`Overall: ${allPassed ? 'ALL MODEL FAMILIES PASSED' : 'SOME MODEL FAMILIES FAILED'}`);
console.log('='.repeat(60));
process.exit(allPassed ? 0 : 1);

View File

@@ -6,20 +6,28 @@
*
* Claude Code sends assistant messages with thinking blocks that include signatures.
* These signatures must be preserved and sent back to the API.
*
* Note: Claude puts signatures on thinking blocks, Gemini 3+ puts them on tool_use blocks.
*
* Runs for both Claude and Gemini model families.
*/
const { streamRequest, commonTools } = require('./helpers/http-client.cjs');
const { streamRequest, analyzeContent, commonTools } = require('./helpers/http-client.cjs');
const { getThinkingModels, getModelConfig, familySupportsThinking } = require('./helpers/test-models.cjs');
const tools = [commonTools.getWeather];
async function runTests() {
async function runTestsForModel(family, model) {
console.log('='.repeat(60));
console.log('THINKING SIGNATURE TEST');
console.log(`THINKING SIGNATURE TEST [${family.toUpperCase()}]`);
console.log(`Model: ${model}`);
console.log('Simulates Claude Code multi-turn with thinking blocks');
console.log('='.repeat(60));
console.log('');
let allPassed = true;
const results = [];
const modelConfig = getModelConfig(family);
const expectThinking = familySupportsThinking(family);
// ===== TEST 1: First turn - get thinking block with signature =====
console.log('TEST 1: Initial request with thinking model');
@@ -30,35 +38,34 @@ async function runTests() {
];
const turn1Result = await streamRequest({
model: 'claude-sonnet-4-5-thinking',
max_tokens: 16000,
model,
max_tokens: modelConfig.max_tokens,
stream: true,
tools,
thinking: { type: 'enabled', budget_tokens: 10000 },
thinking: modelConfig.thinking,
messages: turn1Messages
});
const turn1Thinking = turn1Result.content.filter(b => b.type === 'thinking');
const turn1ToolUse = turn1Result.content.filter(b => b.type === 'tool_use');
const turn1Text = turn1Result.content.filter(b => b.type === 'text');
const content = analyzeContent(turn1Result.content);
console.log(` Thinking blocks: ${turn1Thinking.length}`);
console.log(` Tool use blocks: ${turn1ToolUse.length}`);
console.log(` Text blocks: ${turn1Text.length}`);
console.log(` Thinking blocks: ${content.thinking.length}`);
console.log(` Tool use blocks: ${content.toolUse.length}`);
console.log(` Text blocks: ${content.text.length}`);
// Check thinking has signature
let turn1HasSignature = false;
if (turn1Thinking.length > 0) {
const sig = turn1Thinking[0].signature || '';
turn1HasSignature = sig.length >= 50;
console.log(` Signature length: ${sig.length} chars`);
console.log(` Signature present: ${turn1HasSignature ? 'YES' : 'NO'}`);
if (turn1Thinking[0].thinking) {
console.log(` Thinking preview: "${turn1Thinking[0].thinking.substring(0, 80)}..."`);
}
// Check signatures - Claude puts them on thinking blocks, Gemini on tool_use blocks
console.log(` Thinking signature: ${content.thinkingHasSignature ? 'YES' : 'NO'}`);
console.log(` Tool use signature: ${content.toolUseHasSignature ? 'YES' : 'NO'}`);
console.log(` Has signature (combined): ${content.hasSignature ? 'YES' : 'NO'}`);
if (content.hasThinking && content.thinking[0].thinking) {
console.log(` Thinking preview: "${content.thinking[0].thinking.substring(0, 80)}..."`);
}
const test1Pass = turn1Thinking.length > 0 && turn1HasSignature && turn1ToolUse.length > 0;
// For models that support thinking, expect thinking + signature (somewhere) + tool use
// For models that don't, just expect tool use
const test1Pass = expectThinking
? (content.hasThinking && content.hasSignature && content.hasToolUse)
: (content.hasToolUse || content.hasText);
results.push({ name: 'Turn 1: Thinking + Signature + Tool Use', passed: test1Pass });
console.log(` Result: ${test1Pass ? 'PASS' : 'FAIL'}`);
if (!test1Pass) allPassed = false;
@@ -67,18 +74,22 @@ async function runTests() {
console.log('\nTEST 2: Multi-turn with thinking signature in assistant message');
console.log('-'.repeat(40));
if (!turn1ToolUse.length) {
if (!content.hasToolUse) {
console.log(' SKIPPED - No tool use in turn 1');
results.push({ name: 'Turn 2: Multi-turn with signature', passed: false, skipped: true });
} else {
// Build assistant message with thinking (including signature) - this is how Claude Code sends it
const assistantContent = turn1Result.content;
// Verify the thinking block has signature before sending
// Log what we're sending back
const thinkingInAssistant = assistantContent.find(b => b.type === 'thinking');
const toolUseInAssistant = assistantContent.find(b => b.type === 'tool_use');
if (thinkingInAssistant) {
console.log(` Sending thinking with signature: ${(thinkingInAssistant.signature || '').length} chars`);
}
if (toolUseInAssistant && toolUseInAssistant.thoughtSignature) {
console.log(` Sending tool_use with thoughtSignature: ${toolUseInAssistant.thoughtSignature.length} chars`);
}
const turn2Messages = [
...turn1Messages,
@@ -87,26 +98,25 @@ async function runTests() {
role: 'user',
content: [{
type: 'tool_result',
tool_use_id: turn1ToolUse[0].id,
tool_use_id: content.toolUse[0].id,
content: 'The weather in Paris is 18°C and sunny.'
}]
}
];
const turn2Result = await streamRequest({
model: 'claude-sonnet-4-5-thinking',
max_tokens: 16000,
model,
max_tokens: modelConfig.max_tokens,
stream: true,
tools,
thinking: { type: 'enabled', budget_tokens: 10000 },
thinking: modelConfig.thinking,
messages: turn2Messages
});
const turn2Thinking = turn2Result.content.filter(b => b.type === 'thinking');
const turn2Text = turn2Result.content.filter(b => b.type === 'text');
const turn2Content = analyzeContent(turn2Result.content);
console.log(` Thinking blocks: ${turn2Thinking.length}`);
console.log(` Text blocks: ${turn2Text.length}`);
console.log(` Thinking blocks: ${turn2Content.thinking.length}`);
console.log(` Text blocks: ${turn2Content.text.length}`);
// Check for errors
const hasError = turn2Result.events.some(e => e.type === 'error');
@@ -115,26 +125,22 @@ async function runTests() {
console.log(` ERROR: ${errorEvent?.data?.error?.message || 'Unknown error'}`);
}
if (turn2Thinking.length > 0) {
const sig = turn2Thinking[0].signature || '';
console.log(` New signature length: ${sig.length} chars`);
if (turn2Thinking[0].thinking) {
console.log(` Thinking preview: "${turn2Thinking[0].thinking.substring(0, 80)}..."`);
}
if (turn2Content.hasThinking && turn2Content.thinking[0].thinking) {
console.log(` Thinking preview: "${turn2Content.thinking[0].thinking.substring(0, 80)}..."`);
}
if (turn2Text.length > 0 && turn2Text[0].text) {
console.log(` Response: "${turn2Text[0].text.substring(0, 100)}..."`);
if (turn2Content.hasText && turn2Content.text[0].text) {
console.log(` Response: "${turn2Content.text[0].text.substring(0, 100)}..."`);
}
const test2Pass = !hasError && (turn2Thinking.length > 0 || turn2Text.length > 0);
const test2Pass = !hasError && (turn2Content.hasThinking || turn2Content.hasText);
results.push({ name: 'Turn 2: Multi-turn with signature', passed: test2Pass });
console.log(` Result: ${test2Pass ? 'PASS' : 'FAIL'}`);
if (!test2Pass) allPassed = false;
}
// ===== TEST 3: Verify signature_delta events in stream =====
console.log('\nTEST 3: Verify signature_delta events in stream');
console.log('\nTEST 3: Verify signature events in stream');
console.log('-'.repeat(40));
const signatureDeltas = turn1Result.events.filter(
@@ -147,14 +153,18 @@ async function runTests() {
console.log(` Total signature length from deltas: ${totalSigLength} chars`);
}
const test3Pass = signatureDeltas.length > 0;
results.push({ name: 'signature_delta events present', passed: test3Pass });
// For Claude: signature_delta events should be present
// For Gemini: signature is attached to tool_use block directly, may not have signature_delta events
const test3Pass = expectThinking
? (signatureDeltas.length > 0 || content.toolUseHasSignature)
: true;
results.push({ name: 'Signature present (delta or on tool_use)', passed: test3Pass });
console.log(` Result: ${test3Pass ? 'PASS' : 'FAIL'}`);
if (!test3Pass) allPassed = false;
// ===== Summary =====
console.log('\n' + '='.repeat(60));
console.log('SUMMARY');
console.log(`SUMMARY [${family.toUpperCase()}]`);
console.log('='.repeat(60));
for (const result of results) {
@@ -163,7 +173,26 @@ async function runTests() {
}
console.log('\n' + '='.repeat(60));
console.log(`OVERALL: ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`);
console.log(`[${family.toUpperCase()}] ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`);
console.log('='.repeat(60));
return allPassed;
}
async function runTests() {
const models = getThinkingModels();
let allPassed = true;
for (const { family, model } of models) {
console.log('\n');
const passed = await runTestsForModel(family, model);
if (!passed) allPassed = false;
}
console.log('\n' + '='.repeat(60));
console.log('FINAL RESULT');
console.log('='.repeat(60));
console.log(`Overall: ${allPassed ? 'ALL MODEL FAMILIES PASSED' : 'SOME MODEL FAMILIES FAILED'}`);
console.log('='.repeat(60));
process.exit(allPassed ? 0 : 1);