Improved consensus to treat a step properly as both a request and a response; the initial step now includes Claude's assessment.
Improved the prompt to not request code when the question is a general business decision.
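
For reference, a rough sketch of the step-based exchange these changes introduce. The field names are taken from the tests in this commit; the variable names and concrete values are illustrative only:

```python
# Hypothetical step-1 request: each model to consult gets its own step.
step_1_request = {
    "step": "Should we implement OAuth2 or stick with session-based auth?",
    "step_number": 1,
    "total_steps": 2,  # one step per model to consult
    "next_step_required": True,
    "findings": "Initial analysis of the authentication approaches",
    "models": [{"model": "flash", "stance": "neutral"}],
}

# Expected step-1 response: Claude's own assessment plus the first model
# consultation, instead of a single synthesized "consensus_success" payload.
step_1_response = {
    "status": "analysis_and_first_model_consulted",
    "step_number": 1,
    "next_step_required": True,
    "claude_analysis": {"initial_analysis": "..."},
    "model_response": {"model": "flash", "stance": "neutral", "verdict": "..."},
    "metadata": {"model_name": "flash", "model_used": "flash"},
}
```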
@@ -14,7 +14,7 @@ import os
 # These values are used in server responses and for tracking releases
 # IMPORTANT: This is the single source of truth for version and author info
 # Semantic versioning: MAJOR.MINOR.PATCH
-__version__ = "5.5.6"
+__version__ = "5.5.7"
 # Last update date in ISO format
 __updated__ = "2025-06-22"
 # Primary maintainer
@@ -78,7 +78,11 @@ class TestConsensusConversation(ConversationBaseTest):
         consensus_response, _ = self.call_mcp_tool(
             "consensus",
             {
-                "prompt": "Based on our previous discussion about authentication, I need expert consensus: Should we implement OAuth2 or stick with simple session-based auth?",
+                "step": "Based on our previous discussion about authentication, I need expert consensus: Should we implement OAuth2 or stick with simple session-based auth?",
+                "step_number": 1,
+                "total_steps": 2,
+                "next_step_required": True,
+                "findings": "Initial analysis needed on OAuth2 vs session-based authentication approaches for our web application",
                 "models": [
                     {
                         "model": "flash",
@@ -115,8 +119,10 @@ class TestConsensusConversation(ConversationBaseTest):
             self.logger.error(f"Failed to parse consensus response as JSON. Full response: {consensus_response}")
             return False

-        if consensus_data.get("status") != "consensus_success":
-            self.logger.error(f"Consensus failed with status: {consensus_data.get('status')}")
+        # Check for step 1 status (Claude analysis + first model consultation)
+        expected_status = "analysis_and_first_model_consulted"
+        if consensus_data.get("status") != expected_status:
+            self.logger.error(f"Consensus step 1 failed with status: {consensus_data.get('status')}, expected: {expected_status}")
             if "error" in consensus_data:
                 self.logger.error(f"Error: {consensus_data['error']}")
             return False
@@ -172,26 +178,29 @@ class TestConsensusConversation(ConversationBaseTest):
         # Phase 4: Verify response structure
         self.logger.info("Phase 4: Verifying consensus response structure")

-        # Check that consensus has proper models_used
-        models_used = consensus_data.get("models_used", [])
-        if not models_used:
-            self.logger.error("Consensus response missing models_used")
+        # Check that we have model response from step 1
+        model_response = consensus_data.get("model_response")
+        if not model_response:
+            self.logger.error("Consensus step 1 response missing model_response")
             return False

-        # Check that we have responses
-        responses = consensus_data.get("responses", [])
-        if not responses:
-            self.logger.error("Consensus response missing responses")
+        # Check that model response has expected structure
+        if not model_response.get("model") or not model_response.get("verdict"):
+            self.logger.error("Model response missing required fields (model or verdict)")
             return False

-        # Verify at least one successful response
-        successful_responses = [r for r in responses if r.get("status") == "success"]
-        if not successful_responses:
-            self.logger.error("No successful responses in consensus")
+        # Check step information
+        if consensus_data.get("step_number") != 1:
+            self.logger.error(f"Expected step_number 1, got: {consensus_data.get('step_number')}")
             return False

-        self.logger.info(f"Consensus used models: {models_used}")
-        self.logger.info(f"Consensus had {len(successful_responses)} successful responses")
+        if not consensus_data.get("next_step_required"):
+            self.logger.error("Expected next_step_required=True for step 1")
+            return False
+
+        self.logger.info(f"Consensus step 1 consulted model: {model_response.get('model')}")
+        self.logger.info(f"Model stance: {model_response.get('stance', 'neutral')}")
+        self.logger.info(f"Response status: {model_response.get('status', 'unknown')}")

         # Phase 5: Cross-tool continuation test
         self.logger.info("Phase 5: Testing cross-tool continuation from consensus")
@@ -23,11 +23,15 @@ class TestConsensusThreeModels(BaseSimulatorTest):
         try:
             self.logger.info("Testing consensus tool with three models: flash:against, flash:for, local-llama:neutral")

-            # Send request with three ModelConfig objects
+            # Send request with three ModelConfig objects using new workflow parameters
             response, continuation_id = self.call_mcp_tool(
                 "consensus",
                 {
-                    "prompt": "Is a sync manager class a good idea for my CoolTodos app?",
+                    "step": "Is a sync manager class a good idea for my CoolTodos app?",
+                    "step_number": 1,
+                    "total_steps": 3,  # 3 models = 3 steps
+                    "next_step_required": True,
+                    "findings": "Initial analysis needed on sync manager class architecture decision for CoolTodos app",
                     "models": [
                         {
                             "model": "flash",
@@ -45,8 +49,7 @@ class TestConsensusThreeModels(BaseSimulatorTest):
                             "stance_prompt": "You are a pragmatic software engineer. Provide a balanced analysis considering both the benefits and drawbacks. Focus on the specific context of a CoolTodos app and what factors would determine if this is the right choice.",
                         },
                     ],
-                    "model": "flash",  # Default model for Claude's synthesis
-                    "focus_areas": ["architecture", "maintainability", "complexity", "scalability"],
+                    "model": "flash",  # Default model for Claude's execution
                 },
             )

@@ -69,8 +72,10 @@ class TestConsensusThreeModels(BaseSimulatorTest):
                 self.logger.error("Missing 'status' field in three-model consensus response")
                 return False

-            if consensus_data["status"] != "consensus_success":
-                self.logger.error(f"Three-model consensus failed with status: {consensus_data['status']}")
+            # Check for step 1 status (Claude analysis + first model consultation)
+            expected_status = "analysis_and_first_model_consulted"
+            if consensus_data["status"] != expected_status:
+                self.logger.error(f"Three-model consensus step 1 failed with status: {consensus_data['status']}, expected: {expected_status}")

                 # Log additional error details for debugging
                 if "error" in consensus_data:
@@ -84,67 +89,52 @@ class TestConsensusThreeModels(BaseSimulatorTest):

                 return False

-            # Check that models were used correctly
-            if "models_used" not in consensus_data:
-                self.logger.error("Missing 'models_used' field in three-model consensus response")
+            # Check that we have model response from step 1
+            model_response = consensus_data.get("model_response")
+            if not model_response:
+                self.logger.error("Three-model consensus step 1 response missing model_response")
                 return False

-            models_used = consensus_data["models_used"]
-            self.logger.info(f"Models used in three-model test: {models_used}")
-
-            # Validate we got the expected models (allowing for some to fail)
-            expected_models = ["flash:against", "flash:for", "local-llama"]
-            successful_models = [m for m in expected_models if m in models_used]
-
-            if len(successful_models) == 0:
-                self.logger.error("No models succeeded in three-model consensus test")
+            # Check that model response has expected structure
+            if not model_response.get("model") or not model_response.get("verdict"):
+                self.logger.error("Model response missing required fields (model or verdict)")
                 return False

-            self.logger.info(f"Successful models in three-model test: {successful_models}")
-
-            # Validate responses structure
-            if "responses" not in consensus_data:
-                self.logger.error("Missing 'responses' field in three-model consensus response")
+            # Check step information
+            if consensus_data.get("step_number") != 1:
+                self.logger.error(f"Expected step_number 1, got: {consensus_data.get('step_number')}")
                 return False

-            responses = consensus_data["responses"]
-            if len(responses) == 0:
-                self.logger.error("No responses received in three-model consensus test")
+            if not consensus_data.get("next_step_required"):
+                self.logger.error("Expected next_step_required=True for step 1")
                 return False

-            self.logger.info(f"Received {len(responses)} responses in three-model test")
+            self.logger.info(f"Consensus step 1 consulted model: {model_response.get('model')}")
+            self.logger.info(f"Model stance: {model_response.get('stance', 'neutral')}")
+            self.logger.info(f"Response status: {model_response.get('status', 'unknown')}")

-            # Count successful responses by stance
-            stance_counts = {"for": 0, "against": 0, "neutral": 0}
-            for resp in responses:
-                if resp.get("status") == "success":
-                    stance = resp.get("stance", "neutral")
-                    stance_counts[stance] = stance_counts.get(stance, 0) + 1
-
-            self.logger.info(f"Stance distribution: {stance_counts}")
-
-            # Verify we have at least one successful response
-            total_successful = sum(stance_counts.values())
-            if total_successful == 0:
-                self.logger.error("No successful responses in three-model consensus test")
+            # Check metadata contains model name
+            metadata = consensus_data.get("metadata", {})
+            if not metadata.get("model_name"):
+                self.logger.error("Missing model_name in metadata")
                 return False

-            # Check for sequential processing indication (>2 models should use sequential)
-            if len(consensus_data["models_used"]) > 2:
-                self.logger.info("✓ Sequential processing was correctly used for >2 models")
-            else:
-                self.logger.info("✓ Concurrent processing was used (≤2 models)")
+            self.logger.info(f"Model name in metadata: {metadata.get('model_name')}")

-            # Verify synthesis guidance is present
-            if "next_steps" not in consensus_data:
-                self.logger.error("Missing 'next_steps' field in three-model consensus response")
+            # Verify we have analysis from Claude
+            claude_analysis = consensus_data.get("claude_analysis")
+            if not claude_analysis:
+                self.logger.error("Missing Claude's analysis in step 1")
                 return False

+            analysis_text = claude_analysis.get("initial_analysis", "")
+            self.logger.info(f"Claude analysis length: {len(analysis_text)} characters")
+
             self.logger.info("✓ Three-model consensus tool test completed successfully")
-            self.logger.info(f"✓ Total successful responses: {total_successful}")
-            self.logger.info(
-                f"✓ Stance diversity achieved: {len([s for s in stance_counts.values() if s > 0])} different stances"
-            )
+            self.logger.info(f"✓ Step 1 completed with model: {model_response.get('model')}")
+            self.logger.info(f"✓ Analysis provided: {len(analysis_text)} characters")
+            self.logger.info(f"✓ Model metadata properly included: {metadata.get('model_name')}")
+            self.logger.info("✓ Ready for step 2 continuation")

             return True

@@ -1,16 +0,0 @@
-{
-    "database": {
-        "host": "localhost",
-        "port": 5432,
-        "name": "testdb",
-        "ssl": true
-    },
-    "cache": {
-        "redis_url": "redis://localhost:6379",
-        "ttl": 3600
-    },
-    "logging": {
-        "level": "INFO",
-        "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-    }
-}
@@ -1,32 +0,0 @@
-"""
-Sample Python module for testing MCP conversation continuity
-"""
-
-def fibonacci(n):
-    """Calculate fibonacci number recursively"""
-    if n <= 1:
-        return n
-    return fibonacci(n-1) + fibonacci(n-2)
-
-def factorial(n):
-    """Calculate factorial iteratively"""
-    result = 1
-    for i in range(1, n + 1):
-        result *= i
-    return result
-
-class Calculator:
-    """Simple calculator class"""
-
-    def __init__(self):
-        self.history = []
-
-    def add(self, a, b):
-        result = a + b
-        self.history.append(f"{a} + {b} = {result}")
-        return result
-
-    def multiply(self, a, b):
-        result = a * b
-        self.history.append(f"{a} * {b} = {result}")
-        return result
@@ -502,6 +502,16 @@ of the evidence, even when it strongly points in one direction.""",
             # Add accumulated responses for tracking
             response_data["accumulated_responses"] = self.accumulated_responses

+            # Add metadata (since we're bypassing the base class metadata addition)
+            model_name = self.get_request_model_name(request)
+            provider = self.get_model_provider(model_name)
+            response_data["metadata"] = {
+                "tool_name": self.get_name(),
+                "model_name": model_name,
+                "model_used": model_name,
+                "provider_used": provider.get_provider_type().value,
+            }
+
             return [TextContent(type="text", text=json.dumps(response_data, indent=2))]

         # Otherwise, use standard workflow execution