Files
antigravity-claude-proxy/tests/test-interleaved-thinking.cjs

169 lines
6.1 KiB
JavaScript

/**
* Interleaved Thinking Test
*
* Tests that interleaved thinking works correctly:
* - Multiple thinking blocks can appear in a single response
* - Thinking blocks between tool calls
* - Thinking after tool results
*
* This simulates complex Claude Code scenarios where the model
* thinks multiple times during a single turn.
*/
const { streamRequest, commonTools } = require('./helpers/http-client.cjs');
// Offer several tools at once to encourage interleaved thinking between calls
const tools = ['readFile', 'writeFile', 'runTests'].map((toolKey) => commonTools[toolKey]);
/**
 * Runs the interleaved-thinking scenarios and exits the process with the outcome.
 *
 * Test 1 sends a multi-step task with tools and thinking enabled. It passes when
 * the response contains at least one thinking block, at least one thinking block
 * whose signature is 50+ characters long, and at least one tool_use block.
 *
 * Test 2 replays the Test 1 assistant turn, supplies a tool_result for every
 * tool_use block in that turn, and passes when the follow-up response contains
 * a thinking block plus either text or another tool call. It is reported as
 * SKIP when Test 1 produced no tool_use block.
 *
 * Prints a summary, then calls process.exit(0) if nothing failed, 1 otherwise.
 *
 * @returns {Promise<void>}
 */
async function runTests() {
  const banner = '='.repeat(60);
  const divider = '-'.repeat(40);

  // Fresh request object per call so the two requests never share nested state.
  const buildRequest = (messages) => ({
    model: 'claude-opus-4-5-thinking',
    max_tokens: 32000,
    stream: true,
    tools,
    thinking: { type: 'enabled', budget_tokens: 16000 },
    messages
  });

  console.log(banner);
  console.log('INTERLEAVED THINKING TEST');
  console.log('Tests complex multi-step reasoning with tools');
  console.log(banner);
  console.log('');

  let allPassed = true;
  const results = [];

  // ===== TEST 1: Complex task requiring multiple steps =====
  console.log('TEST 1: Complex task - read, modify, write, test');
  console.log(divider);

  const result = await streamRequest(buildRequest([{
    role: 'user',
    content: [
      'I need you to:',
      '1. Read the file src/config.js',
      '2. Add a new config option "debug: true"',
      '3. Write the updated file',
      '4. Run the tests to make sure nothing broke',
      'Please do this step by step, reading each file before modifying.'
    ].join('\n')
  }]));

  // A response without a content array is treated as empty instead of throwing.
  const firstContent = Array.isArray(result.content) ? result.content : [];

  if (result.error) {
    console.log(` ERROR: ${result.error.message}`);
    allPassed = false;
    results.push({ name: 'Complex multi-step task', passed: false });
  } else {
    const thinking = firstContent.filter(b => b.type === 'thinking');
    const toolUse = firstContent.filter(b => b.type === 'tool_use');
    const text = firstContent.filter(b => b.type === 'text');

    console.log(` Thinking blocks: ${thinking.length}`);
    console.log(` Tool use blocks: ${toolUse.length}`);
    console.log(` Text blocks: ${text.length}`);

    // Check signatures
    const signedThinking = thinking.filter(t => t.signature && t.signature.length >= 50);
    console.log(` Signed thinking blocks: ${signedThinking.length}`);

    // Analyze block order
    const blockOrder = firstContent.map(b => b.type).join(' -> ');
    console.log(` Block order: ${blockOrder}`);

    // Show thinking previews
    thinking.forEach((t, i) => {
      console.log(` Thinking ${i + 1}: "${(t.thinking || '').substring(0, 50)}..."`);
    });

    // Show tool calls; JSON.stringify(undefined) is not a string, so default the input.
    toolUse.forEach((t, i) => {
      console.log(` Tool ${i + 1}: ${t.name}(${JSON.stringify(t.input ?? {}).substring(0, 50)}...)`);
    });

    // Expect at least one thinking block (ideally multiple for complex task)
    const passed = thinking.length >= 1 && signedThinking.length >= 1 && toolUse.length >= 1;
    results.push({ name: 'Thinking + Tools in complex task', passed });
    if (!passed) allPassed = false;
  }

  // ===== TEST 2: Multiple tool calls in sequence =====
  console.log('\nTEST 2: Tool result followed by more thinking');
  console.log(divider);

  // Start with previous result and add tool results
  const toolCalls = firstContent.filter(b => b.type === 'tool_use');
  if (toolCalls.length > 0) {
    const configSource = [
      'module.exports = {',
      'port: 3000,',
      "host: 'localhost',",
      'debug: false',
      '};'
    ].join('\n');

    // The whole assistant turn is replayed, so every tool_use in it needs a
    // matching tool_result. The first call receives the file contents; any
    // additional calls receive a generic acknowledgement.
    const toolResults = toolCalls.map((call, index) => ({
      type: 'tool_result',
      tool_use_id: call.id,
      content: index === 0 ? configSource : 'OK'
    }));

    const result2 = await streamRequest(buildRequest([
      {
        role: 'user',
        content: `Read src/config.js and tell me if debug mode is enabled.`
      },
      { role: 'assistant', content: result.content },
      { role: 'user', content: toolResults }
    ]));

    if (result2.error) {
      console.log(` ERROR: ${result2.error.message}`);
      allPassed = false;
      results.push({ name: 'Thinking after tool result', passed: false });
    } else {
      const secondContent = Array.isArray(result2.content) ? result2.content : [];
      const thinking2 = secondContent.filter(b => b.type === 'thinking');
      const text2 = secondContent.filter(b => b.type === 'text');
      const toolUse2 = secondContent.filter(b => b.type === 'tool_use');

      console.log(` Thinking blocks: ${thinking2.length}`);
      console.log(` Text blocks: ${text2.length}`);
      console.log(` Tool use blocks: ${toolUse2.length}`);
      if (text2.length > 0) {
        console.log(` Response: "${(text2[0].text || '').substring(0, 80)}..."`);
      }

      // Should have thinking after receiving tool result
      const passed = thinking2.length >= 1 && (text2.length > 0 || toolUse2.length > 0);
      results.push({ name: 'Thinking after tool result', passed });
      if (!passed) allPassed = false;
    }
  } else {
    console.log(' SKIPPED - No tool use in previous test');
    results.push({ name: 'Thinking after tool result', passed: false, skipped: true });
  }

  // ===== Summary =====
  console.log('\n' + banner);
  console.log('SUMMARY');
  console.log(banner);
  for (const entry of results) {
    const status = entry.skipped ? 'SKIP' : (entry.passed ? 'PASS' : 'FAIL');
    console.log(` [${status}] ${entry.name}`);
  }
  console.log('\n' + banner);
  console.log(`OVERALL: ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`);
  console.log(banner);

  process.exit(allPassed ? 0 : 1);
}
// Report any unexpected rejection from the test run and exit non-zero
runTests().then(undefined, (failure) => {
  console.error('Test failed with error:', failure);
  process.exit(1);
});