antigravity-claude-proxy/tests/test-thinking-signatures.cjs

/**
 * Thinking Signature Test
 *
 * Tests that thinking blocks with signatures are properly handled in multi-turn
 * conversations, simulating how Claude Code sends requests.
 *
 * Claude Code sends assistant messages with thinking blocks that include signatures.
 * These signatures must be preserved and sent back to the API.
 *
 * Note: Claude puts signatures on thinking blocks, Gemini 3+ puts them on tool_use blocks.
 *
 * Runs for both Claude and Gemini model families.
 */
const { streamRequest, analyzeContent, commonTools } = require('./helpers/http-client.cjs');
const { getTestModels, getModelConfig, familySupportsThinking } = require('./helpers/test-models.cjs');

const tools = [commonTools.getWeather];

async function runTestsForModel(family, model) {
    console.log('='.repeat(60));
    console.log(`THINKING SIGNATURE TEST [${family.toUpperCase()}]`);
    console.log(`Model: ${model}`);
    console.log('Simulates Claude Code multi-turn with thinking blocks');
    console.log('='.repeat(60));
    console.log('');

    let allPassed = true;
    const results = [];
    const modelConfig = getModelConfig(family);
    const expectThinking = familySupportsThinking(family);

    // ===== TEST 1: First turn - get thinking block with signature =====
    console.log('TEST 1: Initial request with thinking model');
    console.log('-'.repeat(40));

    const turn1Messages = [
        { role: 'user', content: 'What is the weather in Paris? Use the get_weather tool.' }
    ];

    const turn1Result = await streamRequest({
        model,
        max_tokens: modelConfig.max_tokens,
        stream: true,
        tools,
        thinking: modelConfig.thinking,
        messages: turn1Messages
    });

    const content = analyzeContent(turn1Result.content);

    console.log(`  Thinking blocks: ${content.thinking.length}`);
    console.log(`  Tool use blocks: ${content.toolUse.length}`);
    console.log(`  Text blocks: ${content.text.length}`);

    // Check signatures - Claude puts them on thinking blocks, Gemini on tool_use blocks
    console.log(`  Thinking signature: ${content.thinkingHasSignature ? 'YES' : 'NO'}`);
    console.log(`  Tool use signature: ${content.toolUseHasSignature ? 'YES' : 'NO'}`);
    console.log(`  Has signature (combined): ${content.hasSignature ? 'YES' : 'NO'}`);

    if (content.hasThinking && content.thinking[0].thinking) {
        console.log(`  Thinking preview: "${content.thinking[0].thinking.substring(0, 80)}..."`);
    }

    // For models that support thinking, expect signature (somewhere) + tool use
    // Note: Gemini doesn't always produce thinking blocks, but does put signatures on tool_use
    // Claude always produces thinking blocks with signatures
    const test1Pass = expectThinking
        ? (content.hasSignature && content.hasToolUse)  // Signature required, thinking optional for Gemini
        : (content.hasToolUse || content.hasText);
    results.push({ name: 'Turn 1: Thinking + Signature + Tool Use', passed: test1Pass });
    console.log(`  Result: ${test1Pass ? 'PASS' : 'FAIL'}`);
    if (!test1Pass) allPassed = false;

    // ===== TEST 2: Second turn - send back thinking with signature =====
    console.log('\nTEST 2: Multi-turn with thinking signature in assistant message');
    console.log('-'.repeat(40));

    if (!content.hasToolUse) {
        console.log('  SKIPPED - No tool use in turn 1');
        results.push({ name: 'Turn 2: Multi-turn with signature', passed: false, skipped: true });
    } else {
        // Build assistant message with thinking (including signature) - this is how Claude Code sends it
        const assistantContent = turn1Result.content;

        // Log what we're sending back
        const thinkingInAssistant = assistantContent.find(b => b.type === 'thinking');
        const toolUseInAssistant = assistantContent.find(b => b.type === 'tool_use');
        if (thinkingInAssistant) {
            console.log(`  Sending thinking with signature: ${(thinkingInAssistant.signature || '').length} chars`);
        }
        if (toolUseInAssistant && toolUseInAssistant.thoughtSignature) {
            console.log(`  Sending tool_use with thoughtSignature: ${toolUseInAssistant.thoughtSignature.length} chars`);
        }

        const turn2Messages = [
            ...turn1Messages,
            { role: 'assistant', content: assistantContent },
            {
                role: 'user',
                content: [{
                    type: 'tool_result',
                    tool_use_id: content.toolUse[0].id,
                    content: 'The weather in Paris is 18°C and sunny.'
                }]
            }
        ];

        const turn2Result = await streamRequest({
            model,
            max_tokens: modelConfig.max_tokens,
            stream: true,
            tools,
            thinking: modelConfig.thinking,
            messages: turn2Messages
        });

        const turn2Content = analyzeContent(turn2Result.content);

        console.log(`  Thinking blocks: ${turn2Content.thinking.length}`);
        console.log(`  Text blocks: ${turn2Content.text.length}`);

        // Check for errors
        const hasError = turn2Result.events.some(e => e.type === 'error');
        if (hasError) {
            const errorEvent = turn2Result.events.find(e => e.type === 'error');
            console.log(`  ERROR: ${errorEvent?.data?.error?.message || 'Unknown error'}`);
        }

        if (turn2Content.hasThinking && turn2Content.thinking[0].thinking) {
            console.log(`  Thinking preview: "${turn2Content.thinking[0].thinking.substring(0, 80)}..."`);
        }

        if (turn2Content.hasText && turn2Content.text[0].text) {
            console.log(`  Response: "${turn2Content.text[0].text.substring(0, 100)}..."`);
        }

        const test2Pass = !hasError && (turn2Content.hasThinking || turn2Content.hasText);
        results.push({ name: 'Turn 2: Multi-turn with signature', passed: test2Pass });
        console.log(`  Result: ${test2Pass ? 'PASS' : 'FAIL'}`);
        if (!test2Pass) allPassed = false;
    }

    // ===== TEST 3: Verify signature_delta events in stream =====
    console.log('\nTEST 3: Verify signature events in stream');
    console.log('-'.repeat(40));

    const signatureDeltas = turn1Result.events.filter(
        e => e.type === 'content_block_delta' && e.data?.delta?.type === 'signature_delta'
    );
    console.log(`  signature_delta events: ${signatureDeltas.length}`);

    if (signatureDeltas.length > 0) {
        const totalSigLength = signatureDeltas.reduce((sum, e) => sum + (e.data.delta.signature?.length || 0), 0);
        console.log(`  Total signature length from deltas: ${totalSigLength} chars`);
    }

    // For Claude: signature_delta events should be present
    // For Gemini: signature is attached to tool_use block directly, may not have signature_delta events
    const test3Pass = expectThinking
        ? (signatureDeltas.length > 0 || content.toolUseHasSignature)
        : true;
    results.push({ name: 'Signature present (delta or on tool_use)', passed: test3Pass });
    console.log(`  Result: ${test3Pass ? 'PASS' : 'FAIL'}`);
    if (!test3Pass) allPassed = false;

    // ===== Summary =====
    console.log('\n' + '='.repeat(60));
    console.log(`SUMMARY [${family.toUpperCase()}]`);
    console.log('='.repeat(60));

    for (const result of results) {
        const status = result.skipped ? 'SKIP' : (result.passed ? 'PASS' : 'FAIL');
        console.log(`  [${status}] ${result.name}`);
    }

    console.log('\n' + '='.repeat(60));
    console.log(`[${family.toUpperCase()}] ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`);
    console.log('='.repeat(60));

    return allPassed;
}

async function runTests() {
    const models = await getTestModels();
    let allPassed = true;

    for (const { family, model } of models) {
        console.log('\n');
        const passed = await runTestsForModel(family, model);
        if (!passed) allPassed = false;
    }

    console.log('\n' + '='.repeat(60));
    console.log('FINAL RESULT');
    console.log('='.repeat(60));
    console.log(`Overall: ${allPassed ? 'ALL MODEL FAMILIES PASSED' : 'SOME MODEL FAMILIES FAILED'}`);
    console.log('='.repeat(60));

    process.exit(allPassed ? 0 : 1);
}

runTests().catch(err => {
    console.error('Test failed with error:', err);
    process.exit(1);
});