initial commit

This commit is contained in:
Badri Narayanan S
2025-12-19 19:20:28 +05:30
parent 52d72b7bff
commit 5ae29947b1
18 changed files with 3925 additions and 494 deletions

View File

@@ -0,0 +1,285 @@
/**
* Interleaved Thinking Test
*
* Tests that interleaved thinking works correctly:
* - Multiple thinking blocks can appear in a single response
* - Thinking blocks between tool calls
* - Thinking after tool results
*
* This simulates complex Claude Code scenarios where the model
* thinks multiple times during a single turn.
*/
const http = require('http');
// Host that streamRequest sends its requests to. Despite the name this is a
// bare hostname (it is passed as the `host` option of http.request), not a URL.
const BASE_URL = 'localhost';
// TCP port that streamRequest sends its requests to (the `port` option of http.request).
const PORT = 8080;
/**
 * Parse a raw Server-Sent Events payload into `{ type, data }` records.
 *
 * Frames are separated by a blank line. LF, CRLF and bare CR line endings are
 * all accepted. A frame is kept only when it has both an `event:` line and at
 * least one `data:` line whose (joined) payload is valid JSON; anything else
 * is skipped so that one malformed frame does not discard the whole stream.
 *
 * @param {string} raw - Complete response body.
 * @returns {Array<{type: string, data: any}>} Parsed events in stream order.
 */
function parseSseEvents(raw) {
  const events = [];
  // Raw CR/LF cannot occur inside a JSON payload (they must be escaped), so
  // normalising line endings up front cannot change any event's data.
  const normalized = raw.replace(/\r\n?/g, '\n');
  for (const frame of normalized.split('\n\n')) {
    if (!frame.trim()) continue;
    const lines = frame.split('\n');
    const eventLine = lines.find(line => line.startsWith('event:'));
    const dataLines = lines.filter(line => line.startsWith('data:'));
    if (!eventLine || dataLines.length === 0) continue;
    try {
      const type = eventLine.slice('event:'.length).trim();
      // Multiple data lines in one frame form a single payload joined by "\n".
      const payload = dataLines
        .map(line => line.slice('data:'.length).trim())
        .join('\n');
      events.push({ type, data: JSON.parse(payload) });
    } catch {
      // Deliberate best effort: a frame whose data is not JSON is ignored.
    }
  }
  return events;
}

/**
 * Rebuild the list of content blocks from streaming events.
 *
 * A block is opened by `content_block_start`, grown by `content_block_delta`
 * (thinking / signature / text / tool-input JSON) and appended to the result
 * on `content_block_stop`. Deltas that arrive while no block is open, or that
 * carry no `delta` object, are ignored.
 *
 * @param {Array<{type: string, data: any}>} events - Output of parseSseEvents.
 * @returns {Array<object>} Completed content blocks in stream order.
 */
function assembleContentBlocks(events) {
  const content = [];
  let current = null;
  for (const { type, data } of events) {
    if (type === 'content_block_start') {
      current = { ...data?.content_block };
      if (current.type === 'thinking') {
        current.thinking = '';
        current.signature = '';
      }
      if (current.type === 'text') current.text = '';
    } else if (type === 'content_block_delta') {
      const delta = data?.delta;
      if (!current || !delta) continue;
      if (delta.type === 'thinking_delta') {
        current.thinking = (current.thinking || '') + (delta.thinking || '');
      } else if (delta.type === 'signature_delta') {
        current.signature = (current.signature || '') + (delta.signature || '');
      } else if (delta.type === 'text_delta') {
        current.text = (current.text || '') + (delta.text || '');
      } else if (delta.type === 'input_json_delta') {
        current.partial_json = (current.partial_json || '') + (delta.partial_json || '');
      }
    } else if (type === 'content_block_stop') {
      if (!current) continue;
      if (current.type === 'tool_use' && current.partial_json) {
        try {
          current.input = JSON.parse(current.partial_json);
        } catch {
          // Keep whatever `input` the start event carried if the streamed JSON is broken.
        }
        delete current.partial_json;
      }
      content.push(current);
      current = null;
    }
  }
  return content;
}

/**
 * Work out whether the response represents a failure.
 *
 * An `error` event in the stream wins. Otherwise an HTTP status >= 400 is a
 * failure too: the body is tried as a JSON error document and, failing that,
 * a generic `HTTP <status>` error is synthesised. The returned value always
 * has a `message` when it was synthesised here.
 *
 * @param {Array<{type: string, data: any}>} events - Parsed stream events.
 * @param {number} statusCode - HTTP status of the response.
 * @param {string} raw - Complete response body.
 * @returns {object|undefined} Error description, or undefined on success.
 */
function extractStreamError(events, statusCode, raw) {
  const errorEvent = events.find(e => e.type === 'error');
  if (errorEvent) {
    return errorEvent.data?.error ?? errorEvent.data ?? { message: 'Unknown stream error' };
  }
  if (statusCode >= 400) {
    try {
      const parsed = JSON.parse(raw);
      if (parsed?.error) {
        return typeof parsed.error === 'string' ? { message: parsed.error } : parsed.error;
      }
    } catch {
      // Body is not JSON; fall through to the generic error below.
    }
    const snippet = raw.trim().slice(0, 200);
    return {
      type: 'http_error',
      message: snippet ? `HTTP ${statusCode}: ${snippet}` : `HTTP ${statusCode}`
    };
  }
  return undefined;
}

/**
 * POST a Messages request and collect the streamed response.
 *
 * The whole body is buffered, decoded as UTF-8 once (so multi-byte characters
 * split across network chunks survive), parsed as Server-Sent Events and
 * folded into content blocks.
 *
 * @param {object} body - JSON-serialisable request body.
 * @param {object} [target] - Optional override of the destination.
 * @param {string} [target.host=BASE_URL] - Hostname to connect to.
 * @param {number} [target.port=PORT] - Port to connect to.
 * @returns {Promise<{content: object[], events: object[], statusCode: number, error?: object}>}
 *   Resolves once the response ends; `error` is present only on failure
 *   (an `error` stream event or an HTTP status >= 400). Rejects on
 *   request/response transport errors.
 */
function streamRequest(body, { host = BASE_URL, port = PORT } = {}) {
  return new Promise((resolve, reject) => {
    const data = JSON.stringify(body);
    const req = http.request({
      host,
      port,
      path: '/v1/messages',
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'x-api-key': 'test',
        'anthropic-version': '2023-06-01',
        'anthropic-beta': 'interleaved-thinking-2025-05-14',
        'Content-Length': Buffer.byteLength(data)
      }
    }, res => {
      const chunks = [];
      res.on('data', chunk => {
        chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
      });
      res.on('error', reject);
      res.on('end', () => {
        try {
          const raw = Buffer.concat(chunks).toString('utf8');
          const events = parseSseEvents(raw);
          const content = assembleContentBlocks(events);
          const error = extractStreamError(events, res.statusCode, raw);
          if (error) {
            resolve({ content, events, error, statusCode: res.statusCode });
          } else {
            resolve({ content, events, statusCode: res.statusCode });
          }
        } catch (err) {
          // Never leave the promise pending if post-processing fails.
          reject(err);
        }
      });
    });
    req.on('error', reject);
    req.write(data);
    req.end();
  });
}
// Tool catalogue offered to the model. Several tools are advertised so that a
// multi-step task gives the model a reason to think between tool calls.
// Each row is [tool name, description, names of its string-typed inputs];
// every listed input is required.
const tools = [
  ['read_file', 'Read a file', ['path']],
  ['write_file', 'Write to a file', ['path', 'content']],
  ['run_tests', 'Run test suite', ['pattern']]
].map(([name, description, fields]) => ({
  name,
  description,
  input_schema: {
    type: 'object',
    properties: Object.fromEntries(fields.map(field => [field, { type: 'string' }])),
    required: [...fields]
  }
}));
/**
 * Run the interleaved-thinking scenarios against the server and exit.
 *
 * TEST 1 sends a multi-step task and expects at least one thinking block with
 * a signature of 50+ characters plus at least one tool call.
 * TEST 2 replays TEST 1's assistant turn, answers its tool calls and expects
 * thinking followed by text or a further tool call. It is skipped when
 * TEST 1 produced no tool call.
 *
 * Prints a per-test report and a summary, then terminates the process with
 * exit code 0 when no test failed (skipped tests do not count as failures)
 * and 1 otherwise.
 *
 * @returns {Promise<void>} Never resolves normally; the process exits first.
 */
async function runTests() {
  const heavyRule = '='.repeat(60);
  const lightRule = '-'.repeat(40);
  // TEST 1 is reported under one name regardless of how it failed.
  const TEST_1_NAME = 'Thinking + Tools in complex task';
  const TEST_2_NAME = 'Thinking after tool result';

  // Parameters shared by both requests.
  const baseRequest = {
    model: 'claude-opus-4-5-thinking',
    max_tokens: 8192,
    stream: true,
    tools,
    thinking: { type: 'enabled', budget_tokens: 16000 }
  };
  // Errors normally carry a message; fall back to the raw object otherwise.
  const describeError = error => error?.message ?? JSON.stringify(error);

  console.log(heavyRule);
  console.log('INTERLEAVED THINKING TEST');
  console.log('Tests complex multi-step reasoning with tools');
  console.log(heavyRule);
  console.log('');

  const results = [];

  // ===== TEST 1: Complex task requiring multiple steps =====
  console.log('TEST 1: Complex task - read, modify, write, test');
  console.log(lightRule);
  const result = await streamRequest({
    ...baseRequest,
    messages: [{
      role: 'user',
      content: [
        'I need you to:',
        '1. Read the file src/config.js',
        '2. Add a new config option "debug: true"',
        '3. Write the updated file',
        '4. Run the tests to make sure nothing broke',
        'Please do this step by step, reading each file before modifying.'
      ].join('\n')
    }]
  });
  const firstContent = result.content ?? [];

  if (result.error) {
    console.log(` ERROR: ${describeError(result.error)}`);
    results.push({ name: TEST_1_NAME, passed: false });
  } else {
    const thinking = firstContent.filter(b => b.type === 'thinking');
    const toolUse = firstContent.filter(b => b.type === 'tool_use');
    const text = firstContent.filter(b => b.type === 'text');
    console.log(` Thinking blocks: ${thinking.length}`);
    console.log(` Tool use blocks: ${toolUse.length}`);
    console.log(` Text blocks: ${text.length}`);

    // Check signatures
    const signedThinking = thinking.filter(t => t.signature && t.signature.length >= 50);
    console.log(` Signed thinking blocks: ${signedThinking.length}`);

    // Analyze block order
    const blockOrder = firstContent.map(b => b.type).join(' -> ');
    console.log(` Block order: ${blockOrder}`);

    // Show thinking previews
    thinking.forEach((t, i) => {
      console.log(` Thinking ${i + 1}: "${(t.thinking || '').substring(0, 50)}..."`);
    });

    // Show tool calls (a tool call may legitimately arrive without parsed input)
    toolUse.forEach((t, i) => {
      console.log(` Tool ${i + 1}: ${t.name}(${JSON.stringify(t.input ?? {}).substring(0, 50)}...)`);
    });

    // Expect at least one thinking block (ideally multiple for complex task)
    const passed = thinking.length >= 1 && signedThinking.length >= 1 && toolUse.length >= 1;
    results.push({ name: TEST_1_NAME, passed });
  }

  // ===== TEST 2: Multiple tool calls in sequence =====
  console.log('\nTEST 2: Tool result followed by more thinking');
  console.log(lightRule);

  // Start with previous result and add tool results
  const pendingToolCalls = firstContent.filter(b => b.type === 'tool_use');
  if (pendingToolCalls.length > 0) {
    const configSource = [
      'module.exports = {',
      'port: 3000,',
      "host: 'localhost',",
      'debug: false',
      '};'
    ].join('\n');
    // The whole assistant turn is replayed, so every tool call in it needs a
    // matching tool_result; otherwise the follow-up request is invalid when
    // the model issued more than one call. The first call receives the file
    // contents, any further call a generic acknowledgement.
    const toolResults = pendingToolCalls.map((block, index) => ({
      type: 'tool_result',
      tool_use_id: block.id,
      content: index === 0 ? configSource : 'Tool executed successfully.'
    }));

    const result2 = await streamRequest({
      ...baseRequest,
      messages: [
        {
          role: 'user',
          content: `Read src/config.js and tell me if debug mode is enabled.`
        },
        { role: 'assistant', content: firstContent },
        { role: 'user', content: toolResults }
      ]
    });

    if (result2.error) {
      console.log(` ERROR: ${describeError(result2.error)}`);
      results.push({ name: TEST_2_NAME, passed: false });
    } else {
      const thinking2 = result2.content.filter(b => b.type === 'thinking');
      const text2 = result2.content.filter(b => b.type === 'text');
      const toolUse2 = result2.content.filter(b => b.type === 'tool_use');
      console.log(` Thinking blocks: ${thinking2.length}`);
      console.log(` Text blocks: ${text2.length}`);
      console.log(` Tool use blocks: ${toolUse2.length}`);
      if (text2.length > 0) {
        console.log(` Response: "${text2[0].text?.substring(0, 80)}..."`);
      }

      // Should have thinking after receiving tool result
      const passed = thinking2.length >= 1 && (text2.length > 0 || toolUse2.length > 0);
      results.push({ name: TEST_2_NAME, passed });
    }
  } else {
    console.log(' SKIPPED - No tool use in previous test');
    results.push({ name: TEST_2_NAME, passed: false, skipped: true });
  }

  // ===== Summary =====
  console.log('\n' + heavyRule);
  console.log('SUMMARY');
  console.log(heavyRule);
  for (const entry of results) {
    const status = entry.skipped ? 'SKIP' : (entry.passed ? 'PASS' : 'FAIL');
    console.log(` [${status}] ${entry.name}`);
  }

  // A skipped test is not a failure; everything else must have passed.
  const allPassed = results.every(entry => entry.skipped || entry.passed);
  console.log('\n' + heavyRule);
  console.log(`OVERALL: ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`);
  console.log(heavyRule);
  process.exit(allPassed ? 0 : 1);
}
// Script entry point: run the suite; any error that escapes runTests is
// logged and turns into a non-zero exit code.
void (async () => {
  try {
    await runTests();
  } catch (failure) {
    console.error('Test failed with error:', failure);
    process.exit(1);
  }
})();