Quick test mode for simulation tests

Fixed o4-mini name, OpenAI removed o4-mini-high
Add max_output_tokens property to ModelCapabilities
Fixed tests after refactor
This commit is contained in:
Fahad
2025-06-23 18:56:47 +04:00
parent ce6c1fd7ea
commit 3b250c95df
6 changed files with 49 additions and 32 deletions

View File

@@ -206,11 +206,14 @@ if __name__ == "__main__":
response2, continuation_id2 = self.call_mcp_tool(
"analyze",
{
"prompt": "Analyze the performance implications of these recursive functions.",
"files": [file1_path],
"step": "Analyze the performance implications of these recursive functions.",
"step_number": 1,
"total_steps": 1,
"next_step_required": False,
"findings": "Continuing from chat conversation to analyze performance implications of recursive functions.",
"relevant_files": [file1_path],
"continuation_id": continuation_id1, # Continue the chat conversation
"model": "flash",
"temperature": 0.7,
},
)
@@ -221,10 +224,14 @@ if __name__ == "__main__":
self.logger.info(f" ✅ Step 2 completed with continuation_id: {continuation_id2[:8]}...")
continuation_ids.append(continuation_id2)
# Validate that we got a different continuation ID
if continuation_id2 == continuation_id1:
self.logger.error(" ❌ Step 2: Got same continuation ID as Step 1 - continuation not working")
return False
# Validate continuation ID behavior for workflow tools
# Workflow tools reuse the same continuation_id when continuing within a workflow session
# This is expected behavior and different from simple tools
if continuation_id2 != continuation_id1:
self.logger.info(" ✅ Step 2: Got new continuation ID (workflow behavior)")
else:
self.logger.info(" ✅ Step 2: Reused continuation ID (workflow session continuation)")
# Both behaviors are valid - what matters is that we got a continuation_id
# Validate that Step 2 is building on Step 1's conversation
# Check if the response references the previous conversation
@@ -276,17 +283,16 @@ if __name__ == "__main__":
all_have_continuation_ids = bool(continuation_id1 and continuation_id2 and continuation_id3)
criteria.append(("All steps generated continuation IDs", all_have_continuation_ids))
# 3. Each continuation ID is unique
unique_continuation_ids = len(set(continuation_ids)) == len(continuation_ids)
criteria.append(("Each response generated unique continuation ID", unique_continuation_ids))
# 3. Continuation behavior validation (handles both simple and workflow tools)
# Simple tools create new IDs each time, workflow tools may reuse IDs within sessions
has_valid_continuation_pattern = len(continuation_ids) == 3
criteria.append(("Valid continuation ID pattern", has_valid_continuation_pattern))
# 4. Continuation IDs follow the expected pattern
step_ids_different = (
len(continuation_ids) == 3
and continuation_ids[0] != continuation_ids[1]
and continuation_ids[1] != continuation_ids[2]
# 4. Check for conversation continuity (more important than ID uniqueness)
conversation_has_continuity = len(continuation_ids) == 3 and all(
cid is not None for cid in continuation_ids
)
criteria.append(("All continuation IDs are different", step_ids_different))
criteria.append(("Conversation continuity maintained", conversation_has_continuity))
# 5. Check responses build on each other (content validation)
step1_has_function_analysis = "fibonacci" in response1.lower() or "factorial" in response1.lower()