// antigravity-claude-proxy/tests/test-interleaved-thinking.cjs

/**
 * Interleaved Thinking Test
 *
 * Tests that interleaved thinking works correctly:
 * - Multiple thinking blocks can appear in a single response
 * - Thinking blocks between tool calls
 * - Thinking after tool results
 *
 * This simulates complex Claude Code scenarios where the model
 * thinks multiple times during a single turn.
 */
const http = require('http');

// Host name of the proxy under test. It is passed as `host` to http.request,
// so it must be a bare host name without a scheme.
const BASE_URL = 'localhost';
// TCP port used for every request issued by streamRequest.
const PORT = 8080;

/**
 * Split a raw Server-Sent Events payload into `{ type, data }` records.
 *
 * Frames are separated by a blank line; both LF and CRLF line endings are
 * accepted. Frames lacking an `event:` or `data:` field are ignored, and
 * frames whose data is not valid JSON are skipped with a warning.
 *
 * @param {string} raw - Complete response body.
 * @returns {Array<{type: string, data: any}>} Parsed events in arrival order.
 */
function parseSseEvents(raw) {
    const events = [];

    for (const frame of raw.split(/\r?\n\r?\n/)) {
        if (!frame.trim()) continue;

        const lines = frame.split(/\r?\n/);
        const eventLine = lines.find(line => line.startsWith('event:'));
        const dataLines = lines.filter(line => line.startsWith('data:'));
        if (!eventLine || dataLines.length === 0) continue;

        const eventType = eventLine.slice('event:'.length).trim();
        // Multiple data lines within one frame form a single payload joined by newlines.
        const payload = dataLines
            .map(line => line.slice('data:'.length).trim())
            .join('\n');

        try {
            events.push({ type: eventType, data: JSON.parse(payload) });
        } catch (err) {
            console.warn(`  WARNING: skipping malformed "${eventType}" event: ${err.message}`);
        }
    }

    return events;
}

/**
 * Rebuild the message content blocks from content_block_* stream events.
 *
 * A block is opened by `content_block_start`, extended by each
 * `content_block_delta`, and appended to the result on `content_block_stop`.
 * Deltas that arrive while no block is open are ignored.
 *
 * @param {Array<{type: string, data: any}>} events - Parsed stream events.
 * @returns {Array<object>} Completed content blocks in stream order.
 */
function assembleContentBlocks(events) {
    const content = [];
    let currentBlock = null;

    for (const event of events) {
        if (event.type === 'content_block_start') {
            currentBlock = { ...event.data.content_block };
            if (currentBlock.type === 'thinking') {
                currentBlock.thinking = '';
                currentBlock.signature = '';
            }
            if (currentBlock.type === 'text') currentBlock.text = '';
        } else if (event.type === 'content_block_delta') {
            const delta = event.data.delta;
            if (!currentBlock || !delta) continue;

            if (delta.type === 'thinking_delta') {
                currentBlock.thinking = (currentBlock.thinking || '') + (delta.thinking || '');
            } else if (delta.type === 'signature_delta') {
                currentBlock.signature = (currentBlock.signature || '') + (delta.signature || '');
            } else if (delta.type === 'text_delta') {
                currentBlock.text = (currentBlock.text || '') + (delta.text || '');
            } else if (delta.type === 'input_json_delta') {
                currentBlock.partial_json = (currentBlock.partial_json || '') + (delta.partial_json || '');
            }
        } else if (event.type === 'content_block_stop') {
            if (!currentBlock) continue;

            if (currentBlock.type === 'tool_use') {
                if (currentBlock.partial_json) {
                    try {
                        currentBlock.input = JSON.parse(currentBlock.partial_json);
                    } catch (err) {
                        console.warn(`  WARNING: could not parse input for tool "${currentBlock.name}": ${err.message}`);
                    }
                }
                // The accumulator is an internal detail; always drop it so it is
                // never echoed back when the block is reused in a later request.
                delete currentBlock.partial_json;
            }

            content.push(currentBlock);
            currentBlock = null;
        }
    }

    return content;
}

/**
 * Build an error object for a failed HTTP response that carried no SSE
 * `error` event.
 *
 * @param {string} raw - Complete response body.
 * @param {number} statusCode - HTTP status code of the response.
 * @returns {object} The body's `error` object when the body is JSON and has
 *   one, otherwise a generic `{ type, message }` describing the status and
 *   the start of the body.
 */
function describeHttpFailure(raw, statusCode) {
    try {
        const parsed = JSON.parse(raw);
        if (parsed && typeof parsed.error === 'object' && parsed.error !== null) {
            return parsed.error;
        }
    } catch (err) {
        // Body is not JSON; fall through and report the raw text instead.
    }
    const snippet = raw.trim().slice(0, 200) || 'empty response body';
    return { type: 'http_error', message: `HTTP ${statusCode}: ${snippet}` };
}

/**
 * POST `body` to `/v1/messages` on the proxy and collect the streamed reply.
 *
 * The whole response is buffered, parsed as Server-Sent Events, and folded
 * into content blocks.
 *
 * @param {object} body - Request payload; serialised with JSON.stringify.
 * @returns {Promise<{content: Array<object>, events: Array<object>, statusCode: number, error?: object}>}
 *   Resolves once the response ends. `error` is present when the stream
 *   contained an `error` event or the HTTP status was 400 or above.
 *   Rejects on request/response transport errors or if the response cannot
 *   be processed.
 */
function streamRequest(body) {
    return new Promise((resolve, reject) => {
        const data = JSON.stringify(body);
        const req = http.request({
            host: BASE_URL,
            port: PORT,
            path: '/v1/messages',
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
                'x-api-key': 'test',
                'anthropic-version': '2023-06-01',
                'anthropic-beta': 'interleaved-thinking-2025-05-14',
                'Content-Length': Buffer.byteLength(data)
            }
        }, res => {
            // Decode at the stream level so a multi-byte UTF-8 character that
            // straddles two chunks is not corrupted by per-chunk decoding.
            res.setEncoding('utf8');

            let fullData = '';
            res.on('data', chunk => {
                fullData += chunk;
            });

            res.on('error', reject);

            res.on('end', () => {
                // Anything thrown here would otherwise escape as an uncaught
                // exception and leave the promise pending.
                try {
                    const events = parseSseEvents(fullData);
                    const content = assembleContentBlocks(events);
                    const result = { content, events, statusCode: res.statusCode };

                    const errorEvent = events.find(e => e.type === 'error');
                    if (errorEvent) {
                        result.error = errorEvent.data.error
                            ?? { message: 'Stream reported an error event without details' };
                    } else if (res.statusCode >= 400) {
                        result.error = describeHttpFailure(fullData, res.statusCode);
                    }

                    resolve(result);
                } catch (err) {
                    reject(err);
                }
            });
        });
        req.on('error', reject);
        req.write(data);
        req.end();
    });
}

/**
 * Build an object schema in which every listed field is a required string.
 *
 * @param {...string} fields - Property names, in the order they should appear.
 * @returns {{type: string, properties: object, required: string[]}}
 */
const stringParamsSchema = (...fields) => ({
    type: 'object',
    properties: Object.fromEntries(fields.map(field => [field, { type: 'string' }])),
    required: fields
});

// Tool catalogue sent with every request; several tools are offered so the
// model has reason to think between calls.
const tools = [
    {
        name: 'read_file',
        description: 'Read a file',
        input_schema: stringParamsSchema('path')
    },
    {
        name: 'write_file',
        description: 'Write to a file',
        input_schema: stringParamsSchema('path', 'content')
    },
    {
        name: 'run_tests',
        description: 'Run test suite',
        input_schema: stringParamsSchema('pattern')
    }
];

/**
 * Run the interleaved-thinking scenarios against the proxy, print a report,
 * and terminate the process.
 *
 * Test 1 sends a multi-step task and passes when the reply contains at least
 * one thinking block, at least one thinking block with a signature of 50+
 * characters, and at least one tool_use block.
 *
 * Test 2 replays Test 1's assistant content followed by tool results and
 * passes when the reply contains thinking plus either text or a tool call.
 * It is skipped when Test 1 produced no tool_use block.
 *
 * Exits with code 0 when nothing failed, 1 otherwise. Does not return.
 */
async function runTests() {
    console.log('='.repeat(60));
    console.log('INTERLEAVED THINKING TEST');
    console.log('Tests complex multi-step reasoning with tools');
    console.log('='.repeat(60));
    console.log('');

    let allPassed = true;
    const results = [];

    // Options shared by both requests. NOTE(review): budget_tokens is larger
    // than max_tokens; the upstream API documents this as allowed only with
    // interleaved thinking, which streamRequest requests via its beta header.
    const baseRequest = {
        model: 'claude-opus-4-5-thinking',
        max_tokens: 8192,
        stream: true,
        tools,
        thinking: { type: 'enabled', budget_tokens: 16000 }
    };

    // ===== TEST 1: Complex task requiring multiple steps =====
    console.log('TEST 1: Complex task - read, modify, write, test');
    console.log('-'.repeat(40));

    const result = await streamRequest({
        ...baseRequest,
        messages: [{
            role: 'user',
            content: `I need you to:
1. Read the file src/config.js
2. Add a new config option "debug: true"
3. Write the updated file
4. Run the tests to make sure nothing broke

Please do this step by step, reading each file before modifying.`
        }]
    });

    if (result.error) {
        console.log(`  ERROR: ${result.error.message}`);
        allPassed = false;
        results.push({ name: 'Complex multi-step task', passed: false });
    } else {
        const thinking = result.content.filter(b => b.type === 'thinking');
        const toolUse = result.content.filter(b => b.type === 'tool_use');
        const text = result.content.filter(b => b.type === 'text');

        console.log(`  Thinking blocks: ${thinking.length}`);
        console.log(`  Tool use blocks: ${toolUse.length}`);
        console.log(`  Text blocks: ${text.length}`);

        // Check signatures
        const signedThinking = thinking.filter(t => t.signature && t.signature.length >= 50);
        console.log(`  Signed thinking blocks: ${signedThinking.length}`);

        // Analyze block order
        const blockOrder = result.content.map(b => b.type).join(' -> ');
        console.log(`  Block order: ${blockOrder}`);

        // Show thinking previews
        thinking.forEach((t, i) => {
            console.log(`  Thinking ${i + 1}: "${(t.thinking || '').substring(0, 50)}..."`);
        });

        // Show tool calls. JSON.stringify(undefined) is undefined, so fall back
        // to an empty object when a block carries no input.
        toolUse.forEach((t, i) => {
            console.log(`  Tool ${i + 1}: ${t.name}(${JSON.stringify(t.input ?? {}).substring(0, 50)}...)`);
        });

        // Expect at least one thinking block (ideally multiple for complex task)
        const passed = thinking.length >= 1 && signedThinking.length >= 1 && toolUse.length >= 1;
        results.push({ name: 'Thinking + Tools in complex task', passed });
        if (!passed) allPassed = false;
    }

    // ===== TEST 2: Tool result followed by more thinking =====
    console.log('\nTEST 2: Tool result followed by more thinking');
    console.log('-'.repeat(40));

    // Continue from the previous reply by answering its tool calls.
    const toolUseBlocks = (result.content || []).filter(b => b.type === 'tool_use');
    if (toolUseBlocks.length > 0) {
        const configFixture = `module.exports = {
    port: 3000,
    host: 'localhost',
    debug: false
};`;

        // Every tool_use in the assistant turn needs a matching tool_result in
        // the next user turn; leaving any unanswered makes the request invalid.
        // The first call receives the config fixture, the rest a placeholder.
        const toolResults = toolUseBlocks.map((block, index) => ({
            type: 'tool_result',
            tool_use_id: block.id,
            content: index === 0 ? configFixture : `Tool ${block.name} completed successfully.`
        }));

        const result2 = await streamRequest({
            ...baseRequest,
            messages: [
                {
                    role: 'user',
                    content: `Read src/config.js and tell me if debug mode is enabled.`
                },
                { role: 'assistant', content: result.content },
                { role: 'user', content: toolResults }
            ]
        });

        if (result2.error) {
            console.log(`  ERROR: ${result2.error.message}`);
            allPassed = false;
            results.push({ name: 'Thinking after tool result', passed: false });
        } else {
            const thinking2 = result2.content.filter(b => b.type === 'thinking');
            const text2 = result2.content.filter(b => b.type === 'text');
            const toolUse2 = result2.content.filter(b => b.type === 'tool_use');

            console.log(`  Thinking blocks: ${thinking2.length}`);
            console.log(`  Text blocks: ${text2.length}`);
            console.log(`  Tool use blocks: ${toolUse2.length}`);

            if (text2.length > 0) {
                console.log(`  Response: "${text2[0].text?.substring(0, 80)}..."`);
            }

            // Should have thinking after receiving tool result
            const passed = thinking2.length >= 1 && (text2.length > 0 || toolUse2.length > 0);
            results.push({ name: 'Thinking after tool result', passed });
            if (!passed) allPassed = false;
        }
    } else {
        console.log('  SKIPPED - No tool use in previous test');
        results.push({ name: 'Thinking after tool result', passed: false, skipped: true });
    }

    // ===== Summary =====
    console.log('\n' + '='.repeat(60));
    console.log('SUMMARY');
    console.log('='.repeat(60));

    for (const entry of results) {
        const status = entry.skipped ? 'SKIP' : (entry.passed ? 'PASS' : 'FAIL');
        console.log(`  [${status}] ${entry.name}`);
    }

    console.log('\n' + '='.repeat(60));
    console.log(`OVERALL: ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`);
    console.log('='.repeat(60));

    process.exit(allPassed ? 0 : 1);
}

// Script entry point: any rejection escaping runTests is reported on stderr
// and turned into a non-zero exit status.
const reportFatalError = (error) => {
    console.error('Test failed with error:', error);
    process.exit(1);
};

runTests().catch(reportFatalError);