- Move TEST_MODELS and DEFAULT_PRESETS to src/constants.js as single source of truth
- Update test-models.cjs helper to use dynamic import from constants
- Make getTestModels() and getModels() async functions
- Update all test files to await async model config loading
- Remove duplicate THINKING_MODELS and getThinkingModels() from test helper
- Make thinking tests more lenient for Gemini (doesn't always produce thinking blocks)

Co-Authored-By: Claude <noreply@anthropic.com>
190 lines
7.1 KiB
JavaScript
190 lines
7.1 KiB
JavaScript
/**
 * Multi-Turn Tool Call Test (Streaming)
 *
 * Simulates Claude Code's streaming multi-turn conversation pattern.
 * Same flow as non-streaming but verifies:
 * - SSE events are properly formatted
 * - signature_delta events are present
 * - Thinking blocks accumulate correctly across deltas
 *
 * Runs for both Claude and Gemini model families.
 */
|
|
const { streamRequest, analyzeContent, analyzeEvents, commonTools } = require('./helpers/http-client.cjs');
|
|
const { getTestModels, getModelConfig } = require('./helpers/test-models.cjs');
|
|
|
|
// Tool definitions sent with every request in this test: only the
// executeCommand tool from the http-client helper's commonTools is offered.
const tools = [commonTools.executeCommand];
|
|
|
|
/**
 * Runs the two-turn streaming tool-call scenario against one model and
 * prints a per-turn report followed by a summary.
 *
 * Turn 1 asks the model to run a shell command and passes when the response
 * carries a signature (on a content block or via signature_delta events) and
 * a tool_use block. Turn 2 is attempted only when Turn 1 produced a tool
 * call; it feeds back a canned `ls -la` listing as the tool result and passes
 * when the model answers with streamed text or another tool call.
 *
 * @param {string} family - Model family label; upper-cased for banners and
 *   passed to getModelConfig() to obtain max_tokens / thinking settings.
 * @param {string} model - Model identifier sent with each request
 *   (presumably a string id; it is only interpolated and forwarded here).
 * @returns {Promise<boolean>} true when every recorded check passed.
 */
async function runTestsForModel(family, model) {
  const heavyRule = '='.repeat(60);
  const lightRule = '-'.repeat(40);
  const familyTag = family.toUpperCase();

  console.log(heavyRule);
  console.log(`MULTI-TURN TOOL CALL TEST [${familyTag}]`);
  console.log(`Model: ${model}`);
  console.log('Simulates Claude Code streaming conversation');
  console.log(heavyRule);
  console.log('');

  const messages = [];
  const results = [];
  let allPassed = true;
  const modelConfig = getModelConfig(family);

  // Single place where a check outcome is stored and folded into the verdict.
  const record = (name, passed) => {
    results.push({ name, passed });
    if (!passed) allPassed = false;
  };

  // Both turns send the same payload; only the (shared, growing) message
  // history differs between calls.
  const sendTurn = () => streamRequest({
    model,
    max_tokens: modelConfig.max_tokens,
    stream: true,
    tools,
    thinking: modelConfig.thinking,
    messages
  });

  // ===== TURN 1: Initial request =====
  console.log('TURN 1: User asks to run a command');
  console.log(lightRule);

  messages.push({
    role: 'user',
    content: 'Run "ls -la" in the current directory and tell me what files exist.'
  });

  const turn1 = await sendTurn();

  if (turn1.error) {
    console.log(` ERROR: ${turn1.error.message}`);
    record('Turn 1: Streaming request', false);
  } else {
    const content = analyzeContent(turn1.content);
    const events = analyzeEvents(turn1.events);

    console.log(' Content:');
    console.log(` Thinking: ${content.hasThinking ? 'YES' : 'NO'} (${content.thinking.length} blocks)`);
    console.log(` Signature: ${content.hasSignature ? 'YES' : 'NO'}`);
    console.log(` Tool Use: ${content.hasToolUse ? 'YES' : 'NO'} (${content.toolUse.length} calls)`);

    console.log(' Events:');
    console.log(` message_start: ${events.messageStart}`);
    console.log(` content_block_start/stop: ${events.blockStart}/${events.blockStop}`);
    console.log(` thinking_delta: ${events.thinkingDeltas}`);
    console.log(` signature_delta: ${events.signatureDeltas}`);
    console.log(` input_json_delta: ${events.inputJsonDeltas}`);

    if (content.hasThinking && content.thinking[0].thinking) {
      console.log(` Thinking: "${content.thinking[0].thinking.substring(0, 60)}..."`);
    }
    if (content.hasToolUse) {
      console.log(` Tool: ${content.toolUse[0].name}(${JSON.stringify(content.toolUse[0].input)})`);
    }

    // For Claude: signature is on thinking block and comes via signature_delta events
    // For Gemini: signature is on tool_use block (no signature_delta events)
    // Note: Some models may skip thinking on simple first requests - signature + tool use is key
    const hasSignature = content.hasSignature || events.signatureDeltas > 0;
    record('Turn 1: Signature + Tool Use', hasSignature && content.hasToolUse);

    // Only a reply containing a tool call can be continued in Turn 2.
    if (content.hasToolUse) {
      messages.push({ role: 'assistant', content: turn1.content });
    }
  }

  // ===== TURN 2: Provide tool result =====
  // Two messages in the history means Turn 1 appended an assistant tool call.
  if (messages.length >= 2) {
    console.log('\nTURN 2: Provide command output, expect summary');
    console.log(lightRule);

    const lastAssistant = messages[messages.length - 1];
    const toolUseBlock = Array.isArray(lastAssistant.content)
      ? lastAssistant.content.find((block) => block.type === 'tool_use')
      : undefined;

    if (!toolUseBlock) {
      // Without an id there is nothing to attach the tool_result to; record a
      // failure instead of crashing the whole run on `undefined.id`.
      console.log(' ERROR: No tool_use block found in Turn 1 response');
      record('Turn 2: After tool result', false);
    } else {
      // Canned output standing in for the real command execution.
      const fakeLsOutput = [
        'total 32',
        'drwxr-xr-x 10 user staff 320 Dec 19 10:00 .',
        'drwxr-xr-x 5 user staff 160 Dec 19 09:00 ..',
        '-rw-r--r-- 1 user staff 1024 Dec 19 10:00 package.json',
        '-rw-r--r-- 1 user staff 2048 Dec 19 10:00 README.md',
        'drwxr-xr-x 8 user staff 256 Dec 19 10:00 src',
        'drwxr-xr-x 4 user staff 128 Dec 19 10:00 tests'
      ].join('\n');

      messages.push({
        role: 'user',
        content: [{
          type: 'tool_result',
          tool_use_id: toolUseBlock.id,
          content: fakeLsOutput
        }]
      });

      const turn2 = await sendTurn();

      if (turn2.error) {
        console.log(` ERROR: ${turn2.error.message}`);
        record('Turn 2: After tool result', false);
      } else {
        const content = analyzeContent(turn2.content);
        const events = analyzeEvents(turn2.events);

        console.log(' Content:');
        console.log(` Thinking: ${content.hasThinking ? 'YES' : 'NO'} (${content.thinking.length} blocks)`);
        console.log(` Signature: ${content.hasSignature ? 'YES' : 'NO'}`);
        console.log(` Text: ${content.hasText ? 'YES' : 'NO'}`);

        console.log(' Events:');
        console.log(` thinking_delta: ${events.thinkingDeltas}`);
        console.log(` signature_delta: ${events.signatureDeltas}`);
        console.log(` text_delta: ${events.textDeltas}`);

        if (content.hasText && content.text[0].text) {
          console.log(` Response: "${content.text[0].text.substring(0, 100)}..."`);
        }

        // Text or tool use response is acceptable
        // Note: Models may skip thinking on obvious responses - this is valid behavior
        const passed = (content.hasText && events.textDeltas > 0) || content.hasToolUse;
        record('Turn 2: Text or Tool response', passed);
      }
    }
  }

  // ===== Summary =====
  console.log('\n' + heavyRule);
  console.log(`SUMMARY [${familyTag}]`);
  console.log(heavyRule);

  for (const result of results) {
    const status = result.passed ? 'PASS' : 'FAIL';
    console.log(` [${status}] ${result.name}`);
  }

  console.log('\n' + heavyRule);
  console.log(`[${familyTag}] ${allPassed ? 'ALL TESTS PASSED' : 'SOME TESTS FAILED'}`);
  console.log(heavyRule);

  return allPassed;
}
|
|
|
|
/**
 * Runs the multi-turn scenario for every model returned by getTestModels(),
 * one after another, prints the overall verdict, and terminates the process
 * with exit code 0 when all families passed or 1 otherwise.
 */
async function runTests() {
  const banner = '='.repeat(60);
  const targets = await getTestModels();
  const failedFamilies = [];

  // Sequential on purpose: each family's report is printed as one
  // uninterrupted section.
  for (const target of targets) {
    console.log('\n');
    const familyOk = await runTestsForModel(target.family, target.model);
    if (!familyOk) {
      failedFamilies.push(target.family);
    }
  }

  const everythingPassed = failedFamilies.length === 0;
  const verdict = everythingPassed ? 'ALL MODEL FAMILIES PASSED' : 'SOME MODEL FAMILIES FAILED';

  console.log(`\n${banner}`);
  console.log('FINAL RESULT');
  console.log(banner);
  console.log(`Overall: ${verdict}`);
  console.log(banner);

  process.exit(everythingPassed ? 0 : 1);
}
|
|
|
|
// Script entry point: start the run and turn any rejection from runTests()
// into a logged error plus a non-zero exit code.
runTests().catch((failure) => {
  console.error('Test failed with error:', failure);
  process.exit(1);
});