- Move TEST_MODELS and DEFAULT_PRESETS to src/constants.js as single source of truth - Update test-models.cjs helper to use dynamic import from constants - Make getTestModels() and getModels() async functions - Update all test files to await async model config loading - Remove duplicate THINKING_MODELS and getThinkingModels() from test helper - Make thinking tests more lenient for Gemini (doesn't always produce thinking blocks) Co-Authored-By: Claude <noreply@anthropic.com>
206 lines
8.2 KiB
JavaScript
206 lines
8.2 KiB
JavaScript
/**
 * Thinking Signature Test
 *
 * Tests that thinking blocks with signatures are properly handled in multi-turn
 * conversations, simulating how Claude Code sends requests.
 *
 * Claude Code sends assistant messages with thinking blocks that include signatures.
 * These signatures must be preserved and sent back to the API.
 *
 * Note: Claude puts signatures on thinking blocks, Gemini 3+ puts them on tool_use blocks.
 *
 * Runs for both Claude and Gemini model families.
 */
|
|
const { streamRequest, analyzeContent, commonTools } = require('./helpers/http-client.cjs');
|
|
const { getTestModels, getModelConfig, familySupportsThinking } = require('./helpers/test-models.cjs');
|
|
|
|
// Tool definitions attached to every request in this file; only the weather tool is exercised.
const tools = [commonTools.getWeather];
|
|
|
|
/**
 * Runs the thinking-signature checks against a single model and prints a report.
 *
 * Three checks are performed:
 *   1. Turn 1 - a request that asks for a tool call. When
 *      familySupportsThinking(family) is true the response must contain a
 *      signature and a tool_use block; otherwise tool use or text is enough.
 *      An `error` event in the stream fails the check.
 *   2. Turn 2 - the turn-1 assistant content is sent back unmodified together
 *      with a tool_result for every tool_use block. The response must contain
 *      thinking or text and no `error` event. Skipped (and not counted as a
 *      failure) when turn 1 produced no tool use.
 *   3. Turn 1's stream must contain signature_delta events, or the tool_use
 *      block must carry the signature (only enforced for thinking families).
 *
 * @param {string} family - Model family label; also used to look up the request config.
 * @param {string} model - Model identifier sent with each request.
 * @returns {Promise<boolean>} true when no executed check failed.
 */
async function runTestsForModel(family, model) {
    console.log('='.repeat(60));
    console.log(`THINKING SIGNATURE TEST [${family.toUpperCase()}]`);
    console.log(`Model: ${model}`);
    console.log('Simulates Claude Code multi-turn with thinking blocks');
    console.log('='.repeat(60));
    console.log('');

    let allPassed = true;
    const results = [];
    const modelConfig = getModelConfig(family);
    const expectThinking = familySupportsThinking(family);

    // ===== TEST 1: First turn - get thinking block with signature =====
    console.log('TEST 1: Initial request with thinking model');
    console.log('-'.repeat(40));

    const turn1Messages = [
        { role: 'user', content: 'What is the weather in Paris? Use the get_weather tool.' }
    ];

    const turn1Result = await streamRequest({
        model,
        max_tokens: modelConfig.max_tokens,
        stream: true,
        tools,
        thinking: modelConfig.thinking,
        messages: turn1Messages
    });

    const content = analyzeContent(turn1Result.content);

    console.log(` Thinking blocks: ${content.thinking.length}`);
    console.log(` Tool use blocks: ${content.toolUse.length}`);
    console.log(` Text blocks: ${content.text.length}`);

    // Check signatures - Claude puts them on thinking blocks, Gemini on tool_use blocks
    console.log(` Thinking signature: ${content.thinkingHasSignature ? 'YES' : 'NO'}`);
    console.log(` Tool use signature: ${content.toolUseHasSignature ? 'YES' : 'NO'}`);
    console.log(` Has signature (combined): ${content.hasSignature ? 'YES' : 'NO'}`);

    // Surface stream errors in turn 1 the same way turn 2 does, so a failing
    // first request is diagnosable from the log instead of only showing "FAIL".
    const turn1Error = turn1Result.events.find(e => e.type === 'error');
    if (turn1Error) {
        console.log(` ERROR: ${turn1Error?.data?.error?.message || 'Unknown error'}`);
    }

    if (content.hasThinking && content.thinking[0].thinking) {
        console.log(` Thinking preview: "${content.thinking[0].thinking.substring(0, 80)}..."`);
    }

    // For models that support thinking, expect signature (somewhere) + tool use
    // Note: Gemini doesn't always produce thinking blocks, but does put signatures on tool_use
    // Claude always produces thinking blocks with signatures
    const turn1ContentOk = expectThinking
        ? (content.hasSignature && content.hasToolUse) // Signature required, thinking optional for Gemini
        : (content.hasToolUse || content.hasText);
    const test1Pass = !turn1Error && turn1ContentOk;
    results.push({ name: 'Turn 1: Thinking + Signature + Tool Use', passed: test1Pass });
    console.log(` Result: ${test1Pass ? 'PASS' : 'FAIL'}`);
    if (!test1Pass) allPassed = false;

    // ===== TEST 2: Second turn - send back thinking with signature =====
    console.log('\nTEST 2: Multi-turn with thinking signature in assistant message');
    console.log('-'.repeat(40));

    if (!content.hasToolUse) {
        // Without a tool call there is nothing to answer; record the skip
        // without affecting the overall verdict.
        console.log(' SKIPPED - No tool use in turn 1');
        results.push({ name: 'Turn 2: Multi-turn with signature', passed: false, skipped: true });
    } else {
        // Build assistant message with thinking (including signature) - this is how Claude Code sends it
        const assistantContent = turn1Result.content;

        // Log what we're sending back
        const thinkingInAssistant = assistantContent.find(b => b.type === 'thinking');
        const toolUseInAssistant = assistantContent.find(b => b.type === 'tool_use');
        if (thinkingInAssistant) {
            console.log(` Sending thinking with signature: ${(thinkingInAssistant.signature || '').length} chars`);
        }
        if (toolUseInAssistant && toolUseInAssistant.thoughtSignature) {
            console.log(` Sending tool_use with thoughtSignature: ${toolUseInAssistant.thoughtSignature.length} chars`);
        }

        // Answer every tool_use block, not just the first one: the replayed
        // assistant message contains all of them, and an unanswered tool call
        // makes the follow-up request invalid.
        const toolResults = content.toolUse.map(block => ({
            type: 'tool_result',
            tool_use_id: block.id,
            content: 'The weather in Paris is 18°C and sunny.'
        }));

        const turn2Messages = [
            ...turn1Messages,
            { role: 'assistant', content: assistantContent },
            { role: 'user', content: toolResults }
        ];

        const turn2Result = await streamRequest({
            model,
            max_tokens: modelConfig.max_tokens,
            stream: true,
            tools,
            thinking: modelConfig.thinking,
            messages: turn2Messages
        });

        const turn2Content = analyzeContent(turn2Result.content);

        console.log(` Thinking blocks: ${turn2Content.thinking.length}`);
        console.log(` Text blocks: ${turn2Content.text.length}`);

        // Check for errors
        const errorEvent = turn2Result.events.find(e => e.type === 'error');
        const hasError = Boolean(errorEvent);
        if (hasError) {
            console.log(` ERROR: ${errorEvent?.data?.error?.message || 'Unknown error'}`);
        }

        if (turn2Content.hasThinking && turn2Content.thinking[0].thinking) {
            console.log(` Thinking preview: "${turn2Content.thinking[0].thinking.substring(0, 80)}..."`);
        }

        if (turn2Content.hasText && turn2Content.text[0].text) {
            console.log(` Response: "${turn2Content.text[0].text.substring(0, 100)}..."`);
        }

        const test2Pass = !hasError && (turn2Content.hasThinking || turn2Content.hasText);
        results.push({ name: 'Turn 2: Multi-turn with signature', passed: test2Pass });
        console.log(` Result: ${test2Pass ? 'PASS' : 'FAIL'}`);
        if (!test2Pass) allPassed = false;
    }

    // ===== TEST 3: Verify signature_delta events in stream =====
    console.log('\nTEST 3: Verify signature events in stream');
    console.log('-'.repeat(40));

    const signatureDeltas = turn1Result.events.filter(
        e => e.type === 'content_block_delta' && e.data?.delta?.type === 'signature_delta'
    );
    console.log(` signature_delta events: ${signatureDeltas.length}`);

    if (signatureDeltas.length > 0) {
        const totalSigLength = signatureDeltas.reduce((sum, e) => sum + (e.data.delta.signature?.length || 0), 0);
        console.log(` Total signature length from deltas: ${totalSigLength} chars`);
    }

    // For Claude: signature_delta events should be present
    // For Gemini: signature is attached to tool_use block directly, may not have signature_delta events
    const test3Pass = expectThinking
        ? (signatureDeltas.length > 0 || content.toolUseHasSignature)
        : true;
    results.push({ name: 'Signature present (delta or on tool_use)', passed: test3Pass });
    console.log(` Result: ${test3Pass ? 'PASS' : 'FAIL'}`);
    if (!test3Pass) allPassed = false;

    // ===== Summary =====
    console.log('\n' + '='.repeat(60));
    console.log(`SUMMARY [${family.toUpperCase()}]`);
    console.log('='.repeat(60));

    for (const result of results) {
        const status = result.skipped ? 'SKIP' : (result.passed ? 'PASS' : 'FAIL');
        console.log(` [${status}] ${result.name}`);
    }

    console.log('\n' + '='.repeat(60));
    console.log(`[${family.toUpperCase()}] ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`);
    console.log('='.repeat(60));

    return allPassed;
}
|
|
|
|
/**
 * Runs the signature checks for every entry returned by getTestModels(), one
 * model at a time, prints the combined verdict and then terminates the process
 * with exit code 0 when every family passed and 1 otherwise.
 */
async function runTests() {
    const rule = '='.repeat(60);
    const targets = await getTestModels();
    let failedFamilies = 0;

    // Sequential on purpose: each model's report is printed as one contiguous section.
    for (const target of targets) {
        console.log('\n');
        const familyPassed = await runTestsForModel(target.family, target.model);
        if (!familyPassed) {
            failedFamilies += 1;
        }
    }

    const everyFamilyPassed = failedFamilies === 0;

    let verdict;
    if (everyFamilyPassed) {
        verdict = 'ALL MODEL FAMILIES PASSED';
    } else {
        verdict = 'SOME MODEL FAMILIES FAILED';
    }

    console.log(`\n${rule}`);
    console.log('FINAL RESULT');
    console.log(rule);
    console.log(`Overall: ${verdict}`);
    console.log(rule);

    const exitCode = everyFamilyPassed ? 0 : 1;
    process.exit(exitCode);
}
|
|
|
|
// Script entry point: start the suite and turn any rejection that escapes
// runTests() into a logged error plus a non-zero exit code.
runTests().catch((error) => {
    console.error('Test failed with error:', error);
    process.exit(1);
});
|