Quick test mode for simulation tests

Fixed o4-mini name, OpenAI removed o4-mini-high
Add max_output_tokens property to ModelCapabilities
Fixed tests after refactor
This commit is contained in:
Fahad
2025-06-23 18:56:47 +04:00
parent ce6c1fd7ea
commit 3b250c95df
6 changed files with 49 additions and 32 deletions

View File

@@ -182,6 +182,10 @@ class ConversationBaseTest(BaseSimulatorTest):
# Look for continuation_id in various places
if isinstance(response_data, dict):
# Check top-level continuation_id (workflow tools)
if "continuation_id" in response_data:
return response_data["continuation_id"]
# Check metadata
metadata = response_data.get("metadata", {})
if "thread_id" in metadata:

View File

@@ -91,11 +91,14 @@ class TestClass:
response_a2, continuation_id_a2 = self.call_mcp_tool(
"analyze",
{
"prompt": "Now analyze the code quality and suggest improvements.",
"files": [test_file_path],
"step": "Now analyze the code quality and suggest improvements.",
"step_number": 1,
"total_steps": 2,
"next_step_required": False,
"findings": "Continuing analysis from previous chat conversation to analyze code quality.",
"relevant_files": [test_file_path],
"continuation_id": continuation_id_a1,
"model": "flash",
"temperature": 0.7,
},
)
@@ -154,10 +157,14 @@ class TestClass:
response_b2, continuation_id_b2 = self.call_mcp_tool(
"analyze",
{
"prompt": "Analyze the previous greeting and suggest improvements.",
"step": "Analyze the previous greeting and suggest improvements.",
"step_number": 1,
"total_steps": 1,
"next_step_required": False,
"findings": "Analyzing the greeting from previous conversation and suggesting improvements.",
"relevant_files": [test_file_path],
"continuation_id": continuation_id_b1,
"model": "flash",
"temperature": 0.7,
},
)

View File

@@ -206,11 +206,14 @@ if __name__ == "__main__":
response2, continuation_id2 = self.call_mcp_tool(
"analyze",
{
"prompt": "Analyze the performance implications of these recursive functions.",
"files": [file1_path],
"step": "Analyze the performance implications of these recursive functions.",
"step_number": 1,
"total_steps": 1,
"next_step_required": False,
"findings": "Continuing from chat conversation to analyze performance implications of recursive functions.",
"relevant_files": [file1_path],
"continuation_id": continuation_id1, # Continue the chat conversation
"model": "flash",
"temperature": 0.7,
},
)
@@ -221,10 +224,14 @@ if __name__ == "__main__":
self.logger.info(f" ✅ Step 2 completed with continuation_id: {continuation_id2[:8]}...")
continuation_ids.append(continuation_id2)
# Validate that we got a different continuation ID
if continuation_id2 == continuation_id1:
self.logger.error(" ❌ Step 2: Got same continuation ID as Step 1 - continuation not working")
return False
# Validate continuation ID behavior for workflow tools
# Workflow tools reuse the same continuation_id when continuing within a workflow session
# This is expected behavior and different from simple tools
if continuation_id2 != continuation_id1:
self.logger.info(" ✅ Step 2: Got new continuation ID (workflow behavior)")
else:
self.logger.info(" ✅ Step 2: Reused continuation ID (workflow session continuation)")
# Both behaviors are valid - what matters is that we got a continuation_id
# Validate that Step 2 is building on Step 1's conversation
# Check if the response references the previous conversation
@@ -276,17 +283,16 @@ if __name__ == "__main__":
all_have_continuation_ids = bool(continuation_id1 and continuation_id2 and continuation_id3)
criteria.append(("All steps generated continuation IDs", all_have_continuation_ids))
# 3. Each continuation ID is unique
unique_continuation_ids = len(set(continuation_ids)) == len(continuation_ids)
criteria.append(("Each response generated unique continuation ID", unique_continuation_ids))
# 3. Continuation behavior validation (handles both simple and workflow tools)
# Simple tools create new IDs each time, workflow tools may reuse IDs within sessions
has_valid_continuation_pattern = len(continuation_ids) == 3
criteria.append(("Valid continuation ID pattern", has_valid_continuation_pattern))
# 4. Continuation IDs follow the expected pattern
step_ids_different = (
len(continuation_ids) == 3
and continuation_ids[0] != continuation_ids[1]
and continuation_ids[1] != continuation_ids[2]
# 4. Check for conversation continuity (more important than ID uniqueness)
conversation_has_continuity = len(continuation_ids) == 3 and all(
cid is not None for cid in continuation_ids
)
criteria.append(("All continuation IDs are different", step_ids_different))
criteria.append(("Conversation continuity maintained", conversation_has_continuity))
# 5. Check responses build on each other (content validation)
step1_has_function_analysis = "fibonacci" in response1.lower() or "factorial" in response1.lower()