diff --git a/config.py b/config.py
index da5ccc9..f51acc4 100644
--- a/config.py
+++ b/config.py
@@ -14,7 +14,7 @@ import os
 # These values are used in server responses and for tracking releases
 # IMPORTANT: This is the single source of truth for version and author info
 # Semantic versioning: MAJOR.MINOR.PATCH
-__version__ = "5.5.6"
+__version__ = "5.5.7"
 # Last update date in ISO format
 __updated__ = "2025-06-22"
 # Primary maintainer
diff --git a/simulator_tests/test_consensus_conversation.py b/simulator_tests/test_consensus_conversation.py
index a078342..44eba24 100644
--- a/simulator_tests/test_consensus_conversation.py
+++ b/simulator_tests/test_consensus_conversation.py
@@ -78,7 +78,11 @@ class TestConsensusConversation(ConversationBaseTest):
             consensus_response, _ = self.call_mcp_tool(
                 "consensus",
                 {
-                    "prompt": "Based on our previous discussion about authentication, I need expert consensus: Should we implement OAuth2 or stick with simple session-based auth?",
+                    "step": "Based on our previous discussion about authentication, I need expert consensus: Should we implement OAuth2 or stick with simple session-based auth?",
+                    "step_number": 1,
+                    "total_steps": 2,
+                    "next_step_required": True,
+                    "findings": "Initial analysis needed on OAuth2 vs session-based authentication approaches for our web application",
                     "models": [
                         {
                             "model": "flash",
@@ -115,8 +119,10 @@ class TestConsensusConversation(ConversationBaseTest):
                 self.logger.error(f"Failed to parse consensus response as JSON. Full response: {consensus_response}")
                 return False

-            if consensus_data.get("status") != "consensus_success":
-                self.logger.error(f"Consensus failed with status: {consensus_data.get('status')}")
+            # Check for step 1 status (Claude analysis + first model consultation)
+            expected_status = "analysis_and_first_model_consulted"
+            if consensus_data.get("status") != expected_status:
+                self.logger.error(f"Consensus step 1 failed with status: {consensus_data.get('status')}, expected: {expected_status}")
                 if "error" in consensus_data:
                     self.logger.error(f"Error: {consensus_data['error']}")
                 return False
@@ -172,26 +178,29 @@ class TestConsensusConversation(ConversationBaseTest):
             # Phase 4: Verify response structure
             self.logger.info("Phase 4: Verifying consensus response structure")

-            # Check that consensus has proper models_used
-            models_used = consensus_data.get("models_used", [])
-            if not models_used:
-                self.logger.error("Consensus response missing models_used")
+            # Check that we have model response from step 1
+            model_response = consensus_data.get("model_response")
+            if not model_response:
+                self.logger.error("Consensus step 1 response missing model_response")
                 return False

-            # Check that we have responses
-            responses = consensus_data.get("responses", [])
-            if not responses:
-                self.logger.error("Consensus response missing responses")
+            # Check that model response has expected structure
+            if not model_response.get("model") or not model_response.get("verdict"):
+                self.logger.error("Model response missing required fields (model or verdict)")
                 return False

-            # Verify at least one successful response
-            successful_responses = [r for r in responses if r.get("status") == "success"]
-            if not successful_responses:
-                self.logger.error("No successful responses in consensus")
+            # Check step information
+            if consensus_data.get("step_number") != 1:
+                self.logger.error(f"Expected step_number 1, got: {consensus_data.get('step_number')}")
                 return False

-            self.logger.info(f"Consensus used models: {models_used}")
-            self.logger.info(f"Consensus had {len(successful_responses)} successful responses")
+            if not consensus_data.get("next_step_required"):
+                self.logger.error("Expected next_step_required=True for step 1")
+                return False
+
+            self.logger.info(f"Consensus step 1 consulted model: {model_response.get('model')}")
+            self.logger.info(f"Model stance: {model_response.get('stance', 'neutral')}")
+            self.logger.info(f"Response status: {model_response.get('status', 'unknown')}")

             # Phase 5: Cross-tool continuation test
             self.logger.info("Phase 5: Testing cross-tool continuation from consensus")
diff --git a/simulator_tests/test_consensus_three_models.py b/simulator_tests/test_consensus_three_models.py
index 3cd4773..67b24ed 100644
--- a/simulator_tests/test_consensus_three_models.py
+++ b/simulator_tests/test_consensus_three_models.py
@@ -23,11 +23,15 @@ class TestConsensusThreeModels(BaseSimulatorTest):
         try:
             self.logger.info("Testing consensus tool with three models: flash:against, flash:for, local-llama:neutral")

-            # Send request with three ModelConfig objects
+            # Send request with three ModelConfig objects using new workflow parameters
             response, continuation_id = self.call_mcp_tool(
                 "consensus",
                 {
-                    "prompt": "Is a sync manager class a good idea for my CoolTodos app?",
+                    "step": "Is a sync manager class a good idea for my CoolTodos app?",
+                    "step_number": 1,
+                    "total_steps": 3,  # 3 models = 3 steps
+                    "next_step_required": True,
+                    "findings": "Initial analysis needed on sync manager class architecture decision for CoolTodos app",
                     "models": [
                         {
                             "model": "flash",
@@ -45,8 +49,7 @@ class TestConsensusThreeModels(BaseSimulatorTest):
                             "stance_prompt": "You are a pragmatic software engineer. Provide a balanced analysis considering both the benefits and drawbacks. Focus on the specific context of a CoolTodos app and what factors would determine if this is the right choice.",
                         },
                     ],
-                    "model": "flash",  # Default model for Claude's synthesis
-                    "focus_areas": ["architecture", "maintainability", "complexity", "scalability"],
+                    "model": "flash",  # Default model for Claude's execution
                 },
             )

@@ -69,8 +72,10 @@ class TestConsensusThreeModels(BaseSimulatorTest):
                 self.logger.error("Missing 'status' field in three-model consensus response")
                 return False

-            if consensus_data["status"] != "consensus_success":
-                self.logger.error(f"Three-model consensus failed with status: {consensus_data['status']}")
+            # Check for step 1 status (Claude analysis + first model consultation)
+            expected_status = "analysis_and_first_model_consulted"
+            if consensus_data["status"] != expected_status:
+                self.logger.error(f"Three-model consensus step 1 failed with status: {consensus_data['status']}, expected: {expected_status}")

                 # Log additional error details for debugging
                 if "error" in consensus_data:
@@ -84,67 +89,52 @@

                 return False

-            # Check that models were used correctly
-            if "models_used" not in consensus_data:
-                self.logger.error("Missing 'models_used' field in three-model consensus response")
+            # Check that we have model response from step 1
+            model_response = consensus_data.get("model_response")
+            if not model_response:
+                self.logger.error("Three-model consensus step 1 response missing model_response")
                 return False

-            models_used = consensus_data["models_used"]
-            self.logger.info(f"Models used in three-model test: {models_used}")
-
-            # Validate we got the expected models (allowing for some to fail)
-            expected_models = ["flash:against", "flash:for", "local-llama"]
-            successful_models = [m for m in expected_models if m in models_used]
-
-            if len(successful_models) == 0:
-                self.logger.error("No models succeeded in three-model consensus test")
+            # Check that model response has expected structure
+            if not model_response.get("model") or not model_response.get("verdict"):
+                self.logger.error("Model response missing required fields (model or verdict)")
                 return False

-            self.logger.info(f"Successful models in three-model test: {successful_models}")
-
-            # Validate responses structure
-            if "responses" not in consensus_data:
-                self.logger.error("Missing 'responses' field in three-model consensus response")
+            # Check step information
+            if consensus_data.get("step_number") != 1:
+                self.logger.error(f"Expected step_number 1, got: {consensus_data.get('step_number')}")
                 return False

-            responses = consensus_data["responses"]
-            if len(responses) == 0:
-                self.logger.error("No responses received in three-model consensus test")
+            if not consensus_data.get("next_step_required"):
+                self.logger.error("Expected next_step_required=True for step 1")
                 return False

-            self.logger.info(f"Received {len(responses)} responses in three-model test")
+            self.logger.info(f"Consensus step 1 consulted model: {model_response.get('model')}")
+            self.logger.info(f"Model stance: {model_response.get('stance', 'neutral')}")
+            self.logger.info(f"Response status: {model_response.get('status', 'unknown')}")

-            # Count successful responses by stance
-            stance_counts = {"for": 0, "against": 0, "neutral": 0}
-            for resp in responses:
-                if resp.get("status") == "success":
-                    stance = resp.get("stance", "neutral")
-                    stance_counts[stance] = stance_counts.get(stance, 0) + 1
-
-            self.logger.info(f"Stance distribution: {stance_counts}")
-
-            # Verify we have at least one successful response
-            total_successful = sum(stance_counts.values())
-            if total_successful == 0:
-                self.logger.error("No successful responses in three-model consensus test")
+            # Check metadata contains model name
+            metadata = consensus_data.get("metadata", {})
+            if not metadata.get("model_name"):
+                self.logger.error("Missing model_name in metadata")
                 return False

-            # Check for sequential processing indication (>2 models should use sequential)
-            if len(consensus_data["models_used"]) > 2:
-                self.logger.info("✓ Sequential processing was correctly used for >2 models")
-            else:
-                self.logger.info("✓ Concurrent processing was used (≤2 models)")
+            self.logger.info(f"Model name in metadata: {metadata.get('model_name')}")

-            # Verify synthesis guidance is present
-            if "next_steps" not in consensus_data:
-                self.logger.error("Missing 'next_steps' field in three-model consensus response")
+            # Verify we have analysis from Claude
+            claude_analysis = consensus_data.get("claude_analysis")
+            if not claude_analysis:
+                self.logger.error("Missing Claude's analysis in step 1")
                 return False

+            analysis_text = claude_analysis.get("initial_analysis", "")
+            self.logger.info(f"Claude analysis length: {len(analysis_text)} characters")
+
             self.logger.info("✓ Three-model consensus tool test completed successfully")
-            self.logger.info(f"✓ Total successful responses: {total_successful}")
-            self.logger.info(
-                f"✓ Stance diversity achieved: {len([s for s in stance_counts.values() if s > 0])} different stances"
-            )
+            self.logger.info(f"✓ Step 1 completed with model: {model_response.get('model')}")
+            self.logger.info(f"✓ Analysis provided: {len(analysis_text)} characters")
+            self.logger.info(f"✓ Model metadata properly included: {metadata.get('model_name')}")
+            self.logger.info("✓ Ready for step 2 continuation")

             return True

diff --git a/test_simulation_files/config.json b/test_simulation_files/config.json
deleted file mode 100644
index c066b27..0000000
--- a/test_simulation_files/config.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-  "database": {
-    "host": "localhost",
-    "port": 5432,
-    "name": "testdb",
-    "ssl": true
-  },
-  "cache": {
-    "redis_url": "redis://localhost:6379",
-    "ttl": 3600
-  },
-  "logging": {
-    "level": "INFO",
-    "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-  }
-}
\ No newline at end of file
diff --git a/test_simulation_files/test_module.py b/test_simulation_files/test_module.py
deleted file mode 100644
index 5defb99..0000000
--- a/test_simulation_files/test_module.py
+++ /dev/null
@@ -1,32 +0,0 @@
-"""
-Sample Python module for testing MCP conversation continuity
-"""
-
-def fibonacci(n):
-    """Calculate fibonacci number recursively"""
-    if n <= 1:
-        return n
-    return fibonacci(n-1) + fibonacci(n-2)
-
-def factorial(n):
-    """Calculate factorial iteratively"""
-    result = 1
-    for i in range(1, n + 1):
-        result *= i
-    return result
-
-class Calculator:
-    """Simple calculator class"""
-
-    def __init__(self):
-        self.history = []
-
-    def add(self, a, b):
-        result = a + b
-        self.history.append(f"{a} + {b} = {result}")
-        return result
-
-    def multiply(self, a, b):
-        result = a * b
-        self.history.append(f"{a} * {b} = {result}")
-        return result
diff --git a/tools/consensus.py b/tools/consensus.py
index 2d9146e..874c300 100644
--- a/tools/consensus.py
+++ b/tools/consensus.py
@@ -502,6 +502,16 @@ of the evidence, even when it strongly points in one direction.""",
             # Add accumulated responses for tracking
             response_data["accumulated_responses"] = self.accumulated_responses

+            # Add metadata (since we're bypassing the base class metadata addition)
+            model_name = self.get_request_model_name(request)
+            provider = self.get_model_provider(model_name)
+            response_data["metadata"] = {
+                "tool_name": self.get_name(),
+                "model_name": model_name,
+                "model_used": model_name,
+                "provider_used": provider.get_provider_type().value,
+            }
+
             return [TextContent(type="text", text=json.dumps(response_data, indent=2))]

         # Otherwise, use standard workflow execution