chore: add empty response retry test and fix flaky tests

- Add test:emptyretry script and include in test suite
- Fix test-interleaved-thinking: use complex prompt to force thinking
- Fix test-multiturn-thinking-tools: make Turn 2 lenient (thinking optional)
- Fix test-multiturn-thinking-tools-streaming: same lenient approach
- Use TEST_MODELS helper instead of hardcoded model ID

Models may skip thinking on obvious next steps - this is valid behavior.
Tests now only require thinking on first turn to verify signatures work.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Badri Narayanan S
2026-01-08 17:54:48 +05:30
parent a696ed0872
commit 7375a2ef6d
6 changed files with 19 additions and 13 deletions

View File

@@ -96,7 +96,7 @@ async function runTestsForModel(family, model) {
content: [{
type: 'tool_result',
tool_use_id: toolUseBlock.id,
content: 'Found files:\n- /project/package.json\n- /project/packages/core/package.json'
content: 'Found files:\n- /project/package.json (root, 2.3KB, modified 2 days ago)\n- /project/packages/core/package.json (workspace, 1.1KB, modified 1 hour ago)\n- /project/packages/legacy/package.json (deprecated, 0.8KB, modified 1 year ago)\n- /project/node_modules/lodash/package.json (dependency, 3.2KB)\n\nIMPORTANT: Before proceeding, reason through which files are most relevant. Consider: Are node_modules relevant? Should deprecated packages be included? Which workspace packages matter for the user\'s question about dependencies?'
}]
});
@@ -128,10 +128,10 @@ async function runTestsForModel(family, model) {
}
// Either tool use (to read file) or text response is acceptable
const passed = expectThinking
? (analysis.hasThinking && (analysis.hasToolUse || analysis.hasText))
: (analysis.hasToolUse || analysis.hasText);
results.push({ name: 'Turn 2: Thinking + (Tool or Text)', passed });
// Note: Claude may skip thinking on obvious next steps - this is valid behavior
// We only require thinking on the first turn to verify signatures work
const passed = analysis.hasToolUse || analysis.hasText;
results.push({ name: 'Turn 2: Tool or Text response', passed });
if (!passed) allPassed = false;
if (analysis.hasToolUse) {