/**
 * Interleaved Thinking Test
 *
 * Tests that interleaved thinking works correctly:
 * - Multiple thinking blocks can appear in a single response
 * - Thinking blocks between tool calls
 * - Thinking after tool results
 *
 * This simulates complex Claude Code scenarios where the model
 * thinks multiple times during a single turn.
 */
const http = require('http');
// Host of the server under test. Despite the name this is a bare host name,
// not a URL: streamRequest passes it as the `host` option of http.request.
const BASE_URL = 'localhost';

// TCP port of the server under test (the `port` option of http.request).
const PORT = 8080;

/**
 * POST `body` to `/v1/messages` on the server under test and collect the
 * streamed (Server-Sent Events) reply.
 *
 * The whole response is buffered, split into SSE events, and the
 * `content_block_*` events are folded back into complete content blocks.
 *
 * @param {object} body - Request payload; sent as JSON.
 * @returns {Promise<{content: object[], events: {type: string, data: *}[], statusCode: number, error?: *}>}
 *   `content` holds the reassembled blocks in stream order, `events` every
 *   parsed SSE event. `error` is present when the stream contained an `error`
 *   event, or when the server answered with an HTTP status >= 400.
 *   The promise rejects on request/response transport errors.
 */
function streamRequest(body) {
    return new Promise((resolve, reject) => {
        const data = JSON.stringify(body);
        const req = http.request({
            host: BASE_URL,
            port: PORT,
            path: '/v1/messages',
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
                'x-api-key': 'test',
                'anthropic-version': '2023-06-01',
                'anthropic-beta': 'interleaved-thinking-2025-05-14',
                'Content-Length': Buffer.byteLength(data)
            }
        }, res => {
            let fullData = '';

            // Decode at the stream level so a multi-byte UTF-8 character that
            // straddles two network chunks is not turned into U+FFFD.
            res.setEncoding('utf8');

            res.on('data', chunk => {
                fullData += chunk;
            });

            // Without this a connection reset mid-stream would never settle
            // the promise (or would surface as an uncaught 'error' event).
            res.on('error', reject);

            res.on('end', () => {
                try {
                    const events = parseSseEvents(fullData);
                    const content = assembleContentBlocks(events);

                    const errorEvent = events.find(e => e.type === 'error');
                    let error = errorEvent?.data?.error;
                    if (error === undefined && res.statusCode >= 400) {
                        // Failures reported before streaming starts arrive as a
                        // plain (non-SSE) body; surface them instead of
                        // returning an empty "successful" result.
                        error = describeHttpError(res.statusCode, fullData);
                    }

                    const result = { content, events, statusCode: res.statusCode };
                    if (error !== undefined) result.error = error;
                    resolve(result);
                } catch (err) {
                    // Keep parsing problems inside the promise rather than
                    // throwing from an event handler.
                    reject(err);
                }
            });
        });

        req.on('error', reject);
        req.write(data);
        req.end();
    });
}

/**
 * Split a raw SSE body into `{ type, data }` events.
 *
 * Frames are separated by a blank line (LF or CRLF). A frame is kept only if
 * it has both an `event:` and a `data:` line and the data parses as JSON;
 * frames with unparseable data are skipped with a warning.
 */
function parseSseEvents(raw) {
    const events = [];
    for (const frame of raw.split(/\r?\n\r?\n/)) {
        if (!frame.trim()) continue;

        const lines = frame.split(/\r?\n/);
        const eventLine = lines.find(l => l.startsWith('event:'));
        const dataLine = lines.find(l => l.startsWith('data:'));
        if (!eventLine || !dataLine) continue;

        const type = eventLine.slice('event:'.length).trim();
        const payload = dataLine.slice('data:'.length).trim();
        try {
            events.push({ type, data: JSON.parse(payload) });
        } catch (err) {
            console.warn(`Skipping SSE event "${type}" with unparseable data: ${err.message}`);
        }
    }
    return events;
}

/**
 * Fold `content_block_start` / `content_block_delta` / `content_block_stop`
 * events into complete content blocks, in the order their stop events arrive.
 */
function assembleContentBlocks(events) {
    const content = [];
    let current = null;

    for (const { type, data } of events) {
        if (type === 'content_block_start') {
            current = { ...data?.content_block };
            if (current.type === 'thinking') {
                current.thinking = '';
                current.signature = '';
            }
            if (current.type === 'text') current.text = '';
        } else if (type === 'content_block_delta') {
            const delta = data?.delta;
            if (!current || !delta) continue;

            switch (delta.type) {
                case 'thinking_delta':
                    current.thinking += delta.thinking || '';
                    break;
                case 'signature_delta':
                    current.signature += delta.signature || '';
                    break;
                case 'text_delta':
                    current.text += delta.text || '';
                    break;
                case 'input_json_delta':
                    current.partial_json = (current.partial_json || '') + (delta.partial_json || '');
                    break;
                default:
                    break;
            }
        } else if (type === 'content_block_stop') {
            if (!current) continue;

            if (current.type === 'tool_use' && current.partial_json !== undefined) {
                if (current.partial_json) {
                    try {
                        current.input = JSON.parse(current.partial_json);
                    } catch (err) {
                        // Best effort: keep whatever `input` the start event carried.
                        console.warn(`Could not parse input JSON for tool "${current.name}": ${err.message}`);
                    }
                }
                // Always drop the scratch field (even when it is an empty
                // string) so it never leaks into a replayed assistant turn.
                delete current.partial_json;
            }

            content.push(current);
            current = null;
        }
    }

    return content;
}

/**
 * Build an error object for a non-streamed HTTP failure: the `error` object of
 * a JSON body when there is one, otherwise a generic object quoting the body.
 */
function describeHttpError(statusCode, rawBody) {
    try {
        const parsed = JSON.parse(rawBody);
        if (parsed?.error && typeof parsed.error === 'object') return parsed.error;
    } catch {
        // Body is not JSON; fall through to the generic description.
    }
    const snippet = rawBody.trim().substring(0, 200);
    return {
        type: 'http_error',
        message: snippet ? `HTTP ${statusCode}: ${snippet}` : `HTTP ${statusCode}`
    };
}

// Multiple tools to encourage interleaved thinking.
// Each entry is a tool definition (name, description, JSON-schema for the
// input) that runTests sends in the `tools` field of every request.
const tools = [{
    name: 'read_file',
    description: 'Read a file',
    input_schema: {
        type: 'object',
        properties: { path: { type: 'string' } },
        required: ['path']
    }
}, {
    name: 'write_file',
    description: 'Write to a file',
    input_schema: {
        type: 'object',
        properties: {
            path: { type: 'string' },
            content: { type: 'string' }
        },
        required: ['path', 'content']
    }
}, {
    name: 'run_tests',
    description: 'Run test suite',
    input_schema: {
        type: 'object',
        properties: { pattern: { type: 'string' } },
        required: ['pattern']
    }
}];

/**
 * Run the interleaved-thinking scenarios against the server under test,
 * print a report to stdout, and terminate the process.
 *
 * Scenario 1 sends a multi-step task with tools and thinking enabled and
 * expects at least one signed thinking block plus at least one tool call.
 * Scenario 2 replays the assistant turn from scenario 1, answers its tool
 * call(s) with simulated results, and expects further thinking followed by
 * text or another tool call. It is skipped when scenario 1 produced no
 * tool call.
 *
 * Exits with status 0 when every executed check passed, 1 otherwise.
 * Rejections from streamRequest (transport errors) propagate to the caller.
 *
 * @returns {Promise<void>} Never resolves normally; the process exits first.
 */
async function runTests() {
    console.log('='.repeat(60));
    console.log('INTERLEAVED THINKING TEST');
    console.log('Tests complex multi-step reasoning with tools');
    console.log('='.repeat(60));
    console.log('');

    let allPassed = true;
    const results = [];

    // Request options shared by both scenarios.
    const baseRequest = {
        model: 'claude-opus-4-5-thinking',
        max_tokens: 8192,
        stream: true,
        tools,
        thinking: { type: 'enabled', budget_tokens: 16000 }
    };

    // Prefer the server-provided message, but never print a bare "undefined".
    const describeError = error => error.message ?? JSON.stringify(error);

    // ===== TEST 1: Complex task requiring multiple steps =====
    console.log('TEST 1: Complex task - read, modify, write, test');
    console.log('-'.repeat(40));

    // Built line by line so source indentation can never leak into the prompt.
    const multiStepPrompt = [
        'I need you to:',
        '1. Read the file src/config.js',
        '2. Add a new config option "debug: true"',
        '3. Write the updated file',
        '4. Run the tests to make sure nothing broke',
        '',
        'Please do this step by step, reading each file before modifying.'
    ].join('\n');

    const result = await streamRequest({
        ...baseRequest,
        messages: [{ role: 'user', content: multiStepPrompt }]
    });

    if (result.error) {
        console.log(`  ERROR: ${describeError(result.error)}`);
        allPassed = false;
        results.push({ name: 'Complex multi-step task', passed: false });
    } else {
        const thinking = result.content.filter(b => b.type === 'thinking');
        const toolUse = result.content.filter(b => b.type === 'tool_use');
        const text = result.content.filter(b => b.type === 'text');

        console.log(`  Thinking blocks: ${thinking.length}`);
        console.log(`  Tool use blocks: ${toolUse.length}`);
        console.log(`  Text blocks: ${text.length}`);

        // Check signatures: a block counts as signed only with a signature
        // of at least 50 characters.
        const signedThinking = thinking.filter(t => t.signature && t.signature.length >= 50);
        console.log(`  Signed thinking blocks: ${signedThinking.length}`);

        // Analyze block order
        const blockOrder = result.content.map(b => b.type).join(' -> ');
        console.log(`  Block order: ${blockOrder}`);

        // Show thinking previews
        thinking.forEach((t, i) => {
            console.log(`  Thinking ${i + 1}: "${(t.thinking || '').substring(0, 50)}..."`);
        });

        // Show tool calls. `input` may be missing when the streamed JSON
        // could not be parsed, so fall back to an empty object for display.
        toolUse.forEach((t, i) => {
            console.log(`  Tool ${i + 1}: ${t.name}(${JSON.stringify(t.input ?? {}).substring(0, 50)}...)`);
        });

        // Expect at least one thinking block (ideally multiple for complex task)
        const passed = thinking.length >= 1 && signedThinking.length >= 1 && toolUse.length >= 1;
        results.push({ name: 'Thinking + Tools in complex task', passed });
        if (!passed) allPassed = false;
    }

    // ===== TEST 2: Multiple tool calls in sequence =====
    console.log('\nTEST 2: Tool result followed by more thinking');
    console.log('-'.repeat(40));

    // Start with previous result and add tool result
    const toolUseBlocks = (result.content ?? []).filter(b => b.type === 'tool_use');

    if (toolUseBlocks.length > 0) {
        // Simulated contents of src/config.js returned for the first tool call.
        const configSource = [
            'module.exports = {',
            '    port: 3000,',
            "    host: 'localhost',",
            '    debug: false',
            '};'
        ].join('\n');

        // Every tool_use block in the replayed assistant turn needs a matching
        // tool_result, otherwise the follow-up request is invalid. The first
        // call gets the simulated file; any additional (parallel) calls get a
        // generic acknowledgement.
        const toolResults = toolUseBlocks.map((block, index) => ({
            type: 'tool_result',
            tool_use_id: block.id,
            content: index === 0 ? configSource : 'OK'
        }));

        const result2 = await streamRequest({
            ...baseRequest,
            messages: [
                {
                    role: 'user',
                    // NOTE(review): this prompt differs from the one that
                    // produced result.content in TEST 1 - confirm that
                    // replaying the turn under a different prompt is intended.
                    content: `Read src/config.js and tell me if debug mode is enabled.`
                },
                { role: 'assistant', content: result.content },
                { role: 'user', content: toolResults }
            ]
        });

        if (result2.error) {
            console.log(`  ERROR: ${describeError(result2.error)}`);
            allPassed = false;
            results.push({ name: 'Thinking after tool result', passed: false });
        } else {
            const thinking2 = result2.content.filter(b => b.type === 'thinking');
            const text2 = result2.content.filter(b => b.type === 'text');
            const toolUse2 = result2.content.filter(b => b.type === 'tool_use');

            console.log(`  Thinking blocks: ${thinking2.length}`);
            console.log(`  Text blocks: ${text2.length}`);
            console.log(`  Tool use blocks: ${toolUse2.length}`);

            if (text2.length > 0) {
                console.log(`  Response: "${text2[0].text?.substring(0, 80)}..."`);
            }

            // Should have thinking after receiving tool result
            const passed = thinking2.length >= 1 && (text2.length > 0 || toolUse2.length > 0);
            results.push({ name: 'Thinking after tool result', passed });
            if (!passed) allPassed = false;
        }
    } else {
        console.log('  SKIPPED - No tool use in previous test');
        results.push({ name: 'Thinking after tool result', passed: false, skipped: true });
    }

    // ===== Summary =====
    console.log('\n' + '='.repeat(60));
    console.log('SUMMARY');
    console.log('='.repeat(60));

    for (const entry of results) {
        const status = entry.skipped ? 'SKIP' : (entry.passed ? 'PASS' : 'FAIL');
        console.log(`  [${status}] ${entry.name}`);
    }

    console.log('\n' + '='.repeat(60));
    console.log(`OVERALL: ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`);
    console.log('='.repeat(60));

    process.exit(allPassed ? 0 : 1);
}

// Entry point. A rejection that escapes runTests (for example a connection
// error surfaced by streamRequest) is reported and mapped to exit status 1.
runTests().catch(err => {
    console.error('Test failed with error:', err);
    process.exit(1);
});