Improved consensus to treat a step properly as both a request + response, and initial step includes Claude's assessment.

Improved prompt to not request for code when it's a general business decision
2025-06-22 13:21:09 +04:00
parent 355331d141
commit 18f6f16ac6
8 changed files with 478 additions and 312 deletions
--- a/simulator_tests/init.py
+++ b/simulator_tests/init.py
@@ -11,8 +11,8 @@ from .test_basic_conversation import BasicConversationTest
 from .test_chat_simple_validation import ChatSimpleValidationTest
 from .test_codereview_validation import CodeReviewValidationTest
 from .test_consensus_conversation import TestConsensusConversation
 from .test_consensus_stance import TestConsensusStance
 from .test_consensus_three_models import TestConsensusThreeModels
 from .test_consensus_workflow_accurate import TestConsensusWorkflowAccurate
 from .test_content_validation import ContentValidationTest
 from .test_conversation_chain_validation import ConversationChainValidationTest
 from .test_cross_tool_comprehensive import CrossToolComprehensiveTest
@@ -71,7 +71,7 @@ TEST_REGISTRY = {
    "vision_capability": VisionCapabilityTest,
    "xai_models": XAIModelsTest,
    "consensus_conversation": TestConsensusConversation,
-    "consensus_stance": TestConsensusStance,
+    "consensus_workflow_accurate": TestConsensusWorkflowAccurate,
    "consensus_three_models": TestConsensusThreeModels,
    "analyze_validation": AnalyzeValidationTest,
    "prompt_size_limit_bug": PromptSizeLimitBugTest,
@@ -108,7 +108,7 @@ __all__ = [
    "VisionCapabilityTest",
    "XAIModelsTest",
    "TestConsensusConversation",
-    "TestConsensusStance",
+    "TestConsensusWorkflowAccurate",
    "TestConsensusThreeModels",
    "AnalyzeValidationTest",
    "PromptSizeLimitBugTest",
--- a/simulator_tests/test_consensus_stance.py
+++ b/simulator_tests/test_consensus_stance.py
@@ -1,156 +0,0 @@
 """
 Test consensus tool with explicit stance arguments
 """
 import json
 from .base_test import BaseSimulatorTest
 class TestConsensusStance(BaseSimulatorTest):
    """Test consensus tool functionality with stance steering"""
    @property
    def test_name(self) -> str:
        return "consensus_stance"
    @property
    def test_description(self) -> str:
        return "Test consensus tool with stance steering (for/against/neutral)"
    def run_test(self) -> bool:
        """Run consensus stance test"""
        try:
            self.logger.info("Testing consensus tool with ModelConfig objects and custom stance prompts")
            # Send request with full two-model consensus
            response, continuation_id = self.call_mcp_tool(
                "consensus",
                {
                    "prompt": "Add pizza button: good idea?",
                    "models": [
                        {
                            "model": "flash",
                            "stance": "for",
                            "stance_prompt": "Focus on user engagement benefits.",
                        },
                        {
                            "model": "flash",
                            "stance": "against",
                            "stance_prompt": "Focus on technical complexity issues.",
                        },
                    ],
                    "model": "flash",
                },
            )
            # Validate response
            if not response:
                self.logger.error("Failed to get response from consensus tool")
                return False
            self.logger.info(f"Consensus response preview: {response[:500]}...")
            # Parse the JSON response
            try:
                consensus_data = json.loads(response)
            except json.JSONDecodeError:
                self.logger.error(f"Failed to parse consensus response as JSON: {response}")
                return False
            # Validate consensus structure
            if "status" not in consensus_data:
                self.logger.error("Missing 'status' field in consensus response")
                return False
            if consensus_data["status"] != "consensus_success":
                self.logger.error(f"Consensus failed with status: {consensus_data['status']}")
                # Log additional error details for debugging
                if "error" in consensus_data:
                    self.logger.error(f"Error message: {consensus_data['error']}")
                if "models_errored" in consensus_data:
                    self.logger.error(f"Models that errored: {consensus_data['models_errored']}")
                if "models_skipped" in consensus_data:
                    self.logger.error(f"Models skipped: {consensus_data['models_skipped']}")
                if "next_steps" in consensus_data:
                    self.logger.error(f"Suggested next steps: {consensus_data['next_steps']}")
                return False
            # Check that both models were used with their stances
            if "models_used" not in consensus_data:
                self.logger.error("Missing 'models_used' field in consensus response")
                return False
            models_used = consensus_data["models_used"]
            if len(models_used) != 2:
                self.logger.error(f"Expected 2 models, got {len(models_used)}")
                return False
            if "flash:for" not in models_used:
                self.logger.error("Missing 'flash:for' in models_used")
                return False
            if "flash:against" not in models_used:
                self.logger.error("Missing 'flash:against' in models_used")
                return False
            # Validate responses structure
            if "responses" not in consensus_data:
                self.logger.error("Missing 'responses' field in consensus response")
                return False
            responses = consensus_data["responses"]
            if len(responses) != 2:
                self.logger.error(f"Expected 2 responses, got {len(responses)}")
                return False
            # Check each response has the correct stance
            for_response = None
            against_response = None
            for resp in responses:
                if "stance" not in resp:
                    self.logger.error("Missing 'stance' field in response")
                    return False
                if resp["stance"] == "for":
                    for_response = resp
                elif resp["stance"] == "against":
                    against_response = resp
            # Verify we got both stances
            if not for_response:
                self.logger.error("Missing 'for' stance response")
                return False
            if not against_response:
                self.logger.error("Missing 'against' stance response")
                return False
            # Check that successful responses have verdicts
            if for_response.get("status") == "success":
                if "verdict" not in for_response:
                    self.logger.error("Missing 'verdict' in for_response")
                    return False
                self.logger.info(f"FOR stance verdict preview: {for_response['verdict'][:200]}...")
            if against_response.get("status") == "success":
                if "verdict" not in against_response:
                    self.logger.error("Missing 'verdict' in against_response")
                    return False
                self.logger.info(f"AGAINST stance verdict preview: {against_response['verdict'][:200]}...")
            # Verify synthesis guidance is present
            if "next_steps" not in consensus_data:
                self.logger.error("Missing 'next_steps' field in consensus response")
                return False
            self.logger.info("✓ Consensus tool successfully processed two-model consensus with stance steering")
            return True
        except Exception as e:
            self.logger.error(f"Test failed with exception: {str(e)}")
            return False
--- a/simulator_tests/test_consensus_workflow_accurate.py
+++ b/simulator_tests/test_consensus_workflow_accurate.py
@@ -0,0 +1,226 @@
 """
 Accurate Consensus Workflow Test
 This test validates the complete consensus workflow step-by-step to ensure:
 1. Step 1: Claude provides its own analysis
 2. Step 2: Tool consults first model and returns response to Claude
 3. Step 3: Tool consults second model and returns response to Claude
 4. Step 4: Claude synthesizes all perspectives
 This replaces the old faulty test that used non-workflow parameters.
 """
 import json
 from .conversation_base_test import ConversationBaseTest
 class TestConsensusWorkflowAccurate(ConversationBaseTest):
    """Test complete consensus workflow with accurate step-by-step behavior"""
    @property
    def test_name(self) -> str:
        return "consensus_workflow_accurate"
    @property
    def test_description(self) -> str:
        return "Test NEW efficient consensus workflow: 2 models = 2 steps (Claude+model1, model2+synthesis)"
    def run_test(self) -> bool:
        """Run complete consensus workflow test"""
        # Set up the test environment
        self.setUp()
        try:
            self.logger.info("Testing complete consensus workflow step-by-step")
            self.logger.info("Expected NEW flow: Step1(Claude+Model1) -> Step2(Model2+Synthesis)")
            # ============================================================================
            # STEP 1: Claude analysis + first model consultation
            # ============================================================================
            self.logger.info("=== STEP 1: Claude analysis + flash:for consultation ===")
            step1_response, continuation_id = self.call_mcp_tool_direct(
                "consensus",
                {
                    "step": "Should we add a new AI-powered search feature to our application? Please analyze the technical feasibility, user value, and implementation complexity.",
                    "step_number": 1,
                    "total_steps": 2,  # 2 models (each step includes consultation + analysis)
                    "next_step_required": True,
                    "findings": "Initial assessment of AI search feature proposal considering user needs, technical constraints, and business value.",
                    "models": [
                        {
                            "model": "flash",
                            "stance": "for",
                            "stance_prompt": "Focus on innovation benefits and competitive advantages.",
                        },
                        {
                            "model": "flash",
                            "stance": "against",
                            "stance_prompt": "Focus on implementation complexity and resource requirements.",
                        },
                    ],
                    "model": "flash",  # Claude's execution model
                },
            )
            if not step1_response:
                self.logger.error("Step 1 failed - no response")
                return False
            step1_data = json.loads(step1_response)
            self.logger.info(f"Step 1 status: {step1_data.get('status')}")
            # Validate step 1 response (should include Claude's analysis + first model consultation)
            if step1_data.get("status") != "analysis_and_first_model_consulted":
                self.logger.error(
                    f"Expected status 'analysis_and_first_model_consulted', got: {step1_data.get('status')}"
                )
                return False
            if step1_data.get("step_number") != 1:
                self.logger.error(f"Expected step_number 1, got: {step1_data.get('step_number')}")
                return False
            if not step1_data.get("next_step_required"):
                self.logger.error("Expected next_step_required=True for step 1")
                return False
            # Verify Claude's analysis is included
            if "claude_analysis" not in step1_data:
                self.logger.error("Expected claude_analysis in step 1 response")
                return False
            # Verify first model response is included
            if "model_response" not in step1_data:
                self.logger.error("Expected model_response in step 1 response")
                return False
            model1_response = step1_data["model_response"]
            if model1_response.get("model") != "flash" or model1_response.get("stance") != "for":
                self.logger.error(
                    f"Expected flash:for model response in step 1, got: {model1_response.get('model')}:{model1_response.get('stance')}"
                )
                return False
            self.logger.info("✓ Step 1 completed - Claude analysis + first model (flash:for) consulted")
            # ============================================================================
            # STEP 2: Final step - second model consultation + synthesis
            # ============================================================================
            self.logger.info("=== STEP 2: Final step - second model (flash:against) + synthesis ===")
            step2_response, _ = self.call_mcp_tool_direct(
                "consensus",
                {
                    "step": "I need to review the second model's perspective and provide final synthesis.",
                    "step_number": 2,
                    "total_steps": 2,
                    "next_step_required": False,  # Final step
                    "findings": "Analyzed first model's 'for' perspective. Now ready for second model's 'against' stance and final synthesis.",
                    "continuation_id": continuation_id,
                    "model": "flash",
                },
            )
            if not step2_response:
                self.logger.error("Step 2 failed - no response")
                return False
            self.logger.info(f"Step 2 raw response: {step2_response[:500]}...")
            step2_data = json.loads(step2_response)
            self.logger.info(f"Step 2 status: {step2_data.get('status')}")
            # Validate step 2 - should show consensus completion
            if step2_data.get("status") != "consensus_workflow_complete":
                self.logger.error(f"Expected status 'consensus_workflow_complete', got: {step2_data.get('status')}")
                return False
            if step2_data.get("model_consulted") != "flash":
                self.logger.error(f"Expected model_consulted 'flash', got: {step2_data.get('model_consulted')}")
                return False
            if step2_data.get("model_stance") != "against":
                self.logger.error(f"Expected model_stance 'against', got: {step2_data.get('model_stance')}")
                return False
            # Verify model response is included
            if "model_response" not in step2_data:
                self.logger.error("Expected model_response in step 2")
                return False
            model2_response = step2_data["model_response"]
            if model2_response.get("model") != "flash":
                self.logger.error(f"Expected model_response.model 'flash', got: {model2_response.get('model')}")
                return False
            # Verify consensus completion data
            if not step2_data.get("consensus_complete"):
                self.logger.error("Expected consensus_complete=True in final step")
                return False
            if "complete_consensus" not in step2_data:
                self.logger.error("Expected complete_consensus data in final step")
                return False
            self.logger.info("✓ Step 2 completed - Second model (flash:against) consulted and consensus complete")
            self.logger.info(f"Model 2 verdict preview: {model2_response.get('verdict', 'No verdict')[:100]}...")
            # Validate final consensus completion data
            complete_consensus = step2_data["complete_consensus"]
            if complete_consensus.get("total_responses") != 2:
                self.logger.error(f"Expected 2 model responses, got: {complete_consensus.get('total_responses')}")
                return False
            models_consulted = complete_consensus.get("models_consulted", [])
            expected_models = ["flash:for", "flash:against"]
            if models_consulted != expected_models:
                self.logger.error(f"Expected models {expected_models}, got: {models_consulted}")
                return False
            # ============================================================================
            # VALIDATION: Check accumulated responses are available
            # ============================================================================
            self.logger.info("=== VALIDATION: Checking accumulated responses ===")
            if "accumulated_responses" not in step2_data:
                self.logger.error("Expected accumulated_responses in final step")
                return False
            accumulated = step2_data["accumulated_responses"]
            if len(accumulated) != 2:
                self.logger.error(f"Expected 2 accumulated responses, got: {len(accumulated)}")
                return False
            # Verify first response (flash:for)
            response1 = accumulated[0]
            if response1.get("model") != "flash" or response1.get("stance") != "for":
                self.logger.error(f"First response incorrect: {response1}")
                return False
            # Verify second response (flash:against)
            response2 = accumulated[1]
            if response2.get("model") != "flash" or response2.get("stance") != "against":
                self.logger.error(f"Second response incorrect: {response2}")
                return False
            self.logger.info("✓ All accumulated responses validated")
            # ============================================================================
            # SUCCESS
            # ============================================================================
            self.logger.info("🎉 CONSENSUS WORKFLOW TEST PASSED")
            self.logger.info("✓ Step 1: Claude analysis + first model (flash:for) consulted")
            self.logger.info("✓ Step 2: Second model (flash:against) consulted + synthesis completed")
            self.logger.info("✓ All model responses accumulated correctly")
            self.logger.info("✓ New efficient workflow: 2 models = 2 steps (not 4)")
            self.logger.info("✓ Workflow progression validated at each step")
            return True
        except Exception as e:
            self.logger.error(f"Consensus workflow test failed with exception: {str(e)}")
            import traceback
            self.logger.error(f"Traceback: {traceback.format_exc()}")
            return False
--- a/systemprompts/consensus_prompt.py
+++ b/systemprompts/consensus_prompt.py
@@ -23,15 +23,22 @@ PERSPECTIVE FRAMEWORK
 {stance_prompt}
 IF MORE INFORMATION IS NEEDED
-If you need additional context (e.g., related files, system architecture, requirements, code snippets) to provide thorough
+IMPORTANT: Only request files for TECHNICAL IMPLEMENTATION questions where you need to see actual code, architecture,
-analysis or response, you MUST ONLY respond with this exact JSON (and nothing else). Do NOT ask for the same file you've
+or technical specifications. For business strategy, product decisions, or conceptual questions, provide analysis based
-been provided unless for some reason its content is missing or incomplete:
+on the information given rather than requesting technical files.
 If you need additional technical context (e.g., related files, system architecture, requirements, code snippets) to
 provide thorough analysis of TECHNICAL IMPLEMENTATION details, you MUST ONLY respond with this exact JSON (and nothing else).
 Do NOT ask for the same file you've been provided unless for some reason its content is missing or incomplete:
 {
  "status": "files_required_to_continue",
  "mandatory_instructions": "<your critical instructions for Claude>",
  "files_needed": ["[file name here]", "[or some folder/]"]
 }
 For business strategy, product planning, or conceptual questions, proceed with analysis using your expertise and the
 context provided, even if specific technical details are not available.
 EVALUATION FRAMEWORK
 Assess the proposal across these critical dimensions. Your stance influences HOW you present findings, not WHETHER you
 acknowledge fundamental truths about feasibility, safety, or value:
--- a/test_simulation_files/config.json
+++ b/test_simulation_files/config.json
@@ -0,0 +1,16 @@
 {
  "database": {
    "host": "localhost",
    "port": 5432,
    "name": "testdb",
    "ssl": true
  },
  "cache": {
    "redis_url": "redis://localhost:6379",
    "ttl": 3600
  },
  "logging": {
    "level": "INFO",
    "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  }
 }
--- a/test_simulation_files/test_module.py
+++ b/test_simulation_files/test_module.py
@@ -0,0 +1,32 @@
 """
 Sample Python module for testing MCP conversation continuity
 """
 def fibonacci(n):
    """Calculate fibonacci number recursively"""
    if n <= 1:
        return n
    return fibonacci(n-1) + fibonacci(n-2)
 def factorial(n):
    """Calculate factorial iteratively"""
    result = 1
    for i in range(1, n + 1):
        result *= i
    return result
 class Calculator:
    """Simple calculator class"""
    def __init__(self):
        self.history = []
    def add(self, a, b):
        result = a + b
        self.history.append(f"{a} + {b} = {result}")
        return result
    def multiply(self, a, b):
        result = a * b
        self.history.append(f"{a} * {b} = {result}")
        return result
--- a/tests/test_consensus.py
+++ b/tests/test_consensus.py
@@ -2,8 +2,7 @@
 Tests for the Consensus tool using WorkflowTool architecture.
 """
-import json
+from unittest.mock import Mock
 from unittest.mock import Mock, patch
 import pytest
@@ -219,180 +218,112 @@ class TestConsensusTool:
        assert tool.should_call_expert_analysis({}) is False
        assert tool.requires_expert_analysis() is False
-    @pytest.mark.asyncio
+    def test_execute_workflow_step1_basic(self):
-    async def test_execute_workflow_step1(self):
+        """Test basic workflow validation for step 1."""
        """Test workflow execution for step 1."""
        tool = ConsensusTool()
        # Test that step 1 sets up the workflow correctly
        arguments = {
            "step": "Initial analysis of proposal",
            "step_number": 1,
-            "total_steps": 4,
+            "total_steps": 2,
            "next_step_required": True,
            "findings": "Found pros and cons",
            "confidence": "medium",
            "models": [{"model": "flash", "stance": "neutral"}, {"model": "o3-mini", "stance": "for"}],
            "relevant_files": ["/proposal.md"],
        }
-        with patch.object(tool, "is_effective_auto_mode", return_value=False):
+        # Verify models_to_consult is set correctly from step 1
-            with patch.object(tool, "get_model_provider", return_value=Mock()):
+        request = tool.get_workflow_request_model()(**arguments)
-                result = await tool.execute_workflow(arguments)
+        assert len(request.models) == 2
        assert request.models[0]["model"] == "flash"
        assert request.models[1]["model"] == "o3-mini"
-        assert len(result) == 1
+    def test_execute_workflow_total_steps_calculation(self):
-        response_text = result[0].text
+        """Test that total_steps is calculated correctly from models."""
        response_data = json.loads(response_text)
        # Verify step 1 response structure
        assert response_data["status"] == "consulting_models"
        assert response_data["step_number"] == 1
        assert "continuation_id" in response_data
    @pytest.mark.asyncio
    async def test_execute_workflow_model_consultation(self):
        """Test workflow execution for model consultation steps."""
        tool = ConsensusTool()
        tool.models_to_consult = [{"model": "flash", "stance": "neutral"}, {"model": "o3-mini", "stance": "for"}]
        tool.initial_prompt = "Test prompt"
        # Test with 2 models
        arguments = {
-            "step": "Processing model response",
+            "step": "Initial analysis",
-            "step_number": 2,
+            "step_number": 1,
-            "total_steps": 4,
+            "total_steps": 4,  # This should be corrected to 2
            "next_step_required": True,
-            "findings": "Model provided perspective",
+            "findings": "Analysis complete",
-            "confidence": "medium",
+            "models": [{"model": "flash", "stance": "neutral"}, {"model": "o3-mini", "stance": "for"}],
            "continuation_id": "test-id",
            "current_model_index": 0,
        }
-        # Mock the _consult_model method instead to return a proper dict
+        request = tool.get_workflow_request_model()(**arguments)
-        mock_model_response = {
+        # The tool should set total_steps = len(models) = 2
-            "model": "flash",
+        assert len(request.models) == 2
-            "stance": "neutral",
+
-            "status": "success",
+    def test_consult_model_basic_structure(self):
-            "verdict": "Model analysis response",
+        """Test basic model consultation structure."""
-            "metadata": {"provider": "gemini"},
+        tool = ConsensusTool()
        # Test that _get_stance_enhanced_prompt works
        for_prompt = tool._get_stance_enhanced_prompt("for")
        against_prompt = tool._get_stance_enhanced_prompt("against")
        neutral_prompt = tool._get_stance_enhanced_prompt("neutral")
        assert "SUPPORTIVE PERSPECTIVE" in for_prompt
        assert "CRITICAL PERSPECTIVE" in against_prompt
        assert "BALANCED PERSPECTIVE" in neutral_prompt
    def test_model_configuration_validation(self):
        """Test model configuration validation."""
        tool = ConsensusTool()
        # Test single model config
        models = [{"model": "flash", "stance": "neutral"}]
        arguments = {
            "step": "Test",
            "step_number": 1,
            "total_steps": 1,
            "next_step_required": False,
            "findings": "Test findings",
            "models": models,
        }
-        with patch.object(tool, "_consult_model", return_value=mock_model_response):
+        request = tool.get_workflow_request_model()(**arguments)
-            result = await tool.execute_workflow(arguments)
+        assert len(request.models) == 1
-
+        assert request.models[0]["model"] == "flash"
-        assert len(result) == 1
+        assert request.models[0]["stance"] == "neutral"
        response_text = result[0].text
        response_data = json.loads(response_text)
        # Verify model consultation response
        assert response_data["status"] == "model_consulted"
        assert response_data["model_consulted"] == "flash"
        assert response_data["model_stance"] == "neutral"
        assert "model_response" in response_data
        assert response_data["model_response"]["status"] == "success"
    @pytest.mark.asyncio
    async def test_consult_model_error_handling(self):
        """Test error handling in model consultation."""
        tool = ConsensusTool()
        tool.initial_prompt = "Test prompt"
        # Mock provider to raise an error
        mock_provider = Mock()
        mock_provider.generate_content.side_effect = Exception("Model error")
        with patch.object(tool, "get_model_provider", return_value=mock_provider):
            result = await tool._consult_model(
                {"model": "test-model", "stance": "neutral"}, Mock(relevant_files=[], continuation_id=None, images=None)
            )
        assert result["status"] == "error"
        assert result["error"] == "Model error"
        assert result["model"] == "test-model"
    @pytest.mark.asyncio
    async def test_consult_model_with_images(self):
        """Test model consultation with images."""
        tool = ConsensusTool()
        tool.initial_prompt = "Test prompt"
        # Mock provider
        mock_provider = Mock()
        mock_response = Mock(content="Model response with image analysis")
        mock_provider.generate_content.return_value = mock_response
        mock_provider.get_provider_type.return_value = Mock(value="gemini")
        test_images = ["/path/to/image1.png", "/path/to/image2.jpg"]
        with patch.object(tool, "get_model_provider", return_value=mock_provider):
            result = await tool._consult_model(
                {"model": "test-model", "stance": "neutral"},
                Mock(relevant_files=[], continuation_id=None, images=test_images),
            )
        # Verify that images were passed to generate_content
        mock_provider.generate_content.assert_called_once()
        call_args = mock_provider.generate_content.call_args
        assert call_args.kwargs.get("images") == test_images
        assert result["status"] == "success"
        assert result["model"] == "test-model"
    @pytest.mark.asyncio
    async def test_handle_work_completion(self):
        """Test work completion handling for consensus workflow."""
        tool = ConsensusTool()
        tool.initial_prompt = "Test prompt"
        tool.accumulated_responses = [{"model": "flash", "stance": "neutral"}, {"model": "o3-mini", "stance": "for"}]
        request = Mock(confidence="high")
        response_data = {}
        result = await tool.handle_work_completion(response_data, request, {})
        assert result["consensus_complete"] is True
        assert result["status"] == "consensus_workflow_complete"
        assert "complete_consensus" in result
        assert result["complete_consensus"]["models_consulted"] == ["flash:neutral", "o3-mini:for"]
        assert result["complete_consensus"]["total_responses"] == 2
    def test_handle_work_continuation(self):
-        """Test work continuation handling between steps."""
+        """Test work continuation handling - legacy method for compatibility."""
        tool = ConsensusTool()
        tool.models_to_consult = [{"model": "flash", "stance": "neutral"}, {"model": "o3-mini", "stance": "for"}]
        # Note: In the new workflow, model consultation happens DURING steps in execute_workflow
        # This method is kept for compatibility but not actively used in the step-by-step flow
        # Test after step 1
        request = Mock(step_number=1, current_model_index=0)
        response_data = {}
        result = tool.handle_work_continuation(response_data, request)
-        assert result["status"] == "consulting_models"
+        # The method still exists but returns legacy status for compatibility
-        assert result["next_model"] == {"model": "flash", "stance": "neutral"}
+        assert "status" in result
        # Test between model consultations
        request = Mock(step_number=2, current_model_index=1)
        response_data = {}
        result = tool.handle_work_continuation(response_data, request)
-        assert result["status"] == "consulting_next_model"
+        assert "status" in result
        assert result["next_model"] == {"model": "o3-mini", "stance": "for"}
        assert result["models_remaining"] == 1
    def test_customize_workflow_response(self):
        """Test response customization for consensus workflow."""
        tool = ConsensusTool()
        tool.accumulated_responses = [{"model": "test", "response": "data"}]
-        # Test different step numbers
+        # Test different step numbers (new workflow: 2 models = 2 steps)
-        request = Mock(step_number=1, total_steps=4)
+        request = Mock(step_number=1, total_steps=2)
        response_data = {}
        result = tool.customize_workflow_response(response_data, request)
        assert result["consensus_workflow_status"] == "initial_analysis_complete"
-        request = Mock(step_number=2, total_steps=4)
+        request = Mock(step_number=2, total_steps=2)
        response_data = {}
        result = tool.customize_workflow_response(response_data, request)
        assert result["consensus_workflow_status"] == "consulting_models"
        request = Mock(step_number=4, total_steps=4)
        response_data = {}
        result = tool.customize_workflow_response(response_data, request)
        assert result["consensus_workflow_status"] == "ready_for_synthesis"
--- a/tools/consensus.py
+++ b/tools/consensus.py
@@ -48,8 +48,9 @@ CONSENSUS_WORKFLOW_FIELD_DESCRIPTIONS = {
        "steps 2+ are for processing individual model responses."
    ),
    "total_steps": (
-        "Total number of steps needed. This equals 1 (your analysis) + number of models to consult + "
+        "Total number of steps needed. This equals the number of models to consult. "
-        "1 (final synthesis)."
+        "Step 1 includes your analysis + first model consultation on return of the call. Final step includes "
        "last model consultation + synthesis."
    ),
    "next_step_required": ("Set to true if more models need to be consulted. False when ready for final synthesis."),
    "findings": (
@@ -182,7 +183,7 @@ class ConsensusTool(WorkflowTool):
            "IMPORTANT: This workflow enforces sequential model consultation:\\n"
            "- Step 1 is always your independent analysis\\n"
            "- Each subsequent step processes one model response\\n"
-            "- Total steps = 1 (your analysis) + number of models + 1 (synthesis)\\n"
+            "- Total steps = number of models (each step includes consultation + response)\\n"
            "- Models can have stances (for/against/neutral) for structured debate\\n"
            "- Same model can be used multiple times with different stances\\n"
            "- Each model + stance combination must be unique\\n\\n"
@@ -435,15 +436,16 @@ of the evidence, even when it strongly points in one direction.""",
            self.initial_prompt = request.step
            self.models_to_consult = request.models or []
            self.accumulated_responses = []
-            # Set total steps: 1 (Claude) + len(models) + 1 (synthesis)
+            # Set total steps: len(models) (each step includes consultation + response)
-            request.total_steps = 1 + len(self.models_to_consult) + 1
+            request.total_steps = len(self.models_to_consult)
        # For all steps (1 through total_steps), consult the corresponding model
        if request.step_number <= request.total_steps:
            # Calculate which model to consult for this step
            model_idx = request.step_number - 1  # 0-based index
        # If this is a model consultation step (2 through total_steps-1)
        elif request.step_number > 1 and request.step_number < request.total_steps:
            # Get the current model to consult
            model_idx = request.current_model_index or 0
            if model_idx < len(self.models_to_consult):
-                # Consult the model
+                # Consult the model for this step
                model_response = await self._consult_model(self.models_to_consult[model_idx], request)
                # Add to accumulated responses
@@ -458,23 +460,48 @@ of the evidence, even when it strongly points in one direction.""",
                    "model_stance": model_response.get("stance", "neutral"),
                    "model_response": model_response,
                    "current_model_index": model_idx + 1,
-                    "next_step_required": request.step_number < request.total_steps - 1,
+                    "next_step_required": request.step_number < request.total_steps,
                }
-                if request.step_number < request.total_steps - 1:
+                # Add Claude's analysis to step 1
                if request.step_number == 1:
                    response_data["claude_analysis"] = {
                        "initial_analysis": request.step,
                        "findings": request.findings,
                    }
                    response_data["status"] = "analysis_and_first_model_consulted"
                # Check if this is the final step
                if request.step_number == request.total_steps:
                    response_data["status"] = "consensus_workflow_complete"
                    response_data["consensus_complete"] = True
                    response_data["complete_consensus"] = {
                        "initial_prompt": self.initial_prompt,
                        "models_consulted": [
                            f"{m['model']}:{m.get('stance', 'neutral')}" for m in self.accumulated_responses
                        ],
                        "total_responses": len(self.accumulated_responses),
                        "consensus_confidence": "high",
                    }
                    response_data["next_steps"] = (
                        "CONSENSUS GATHERING IS COMPLETE. Synthesize all perspectives and present:\n"
                        "1. Key points of AGREEMENT across models\n"
                        "2. Key points of DISAGREEMENT and why they differ\n"
                        "3. Your final consolidated recommendation\n"
                        "4. Specific, actionable next steps for implementation\n"
                        "5. Critical risks or concerns that must be addressed"
                    )
                else:
                    response_data["next_steps"] = (
                        f"Model {model_response['model']} has provided its {model_response.get('stance', 'neutral')} "
                        f"perspective. Please analyze this response and call {self.get_name()} again with:\n"
                        f"- step_number: {request.step_number + 1}\n"
-                        f"- findings: Summarize key points from this model's response\n"
+                        f"- findings: Summarize key points from this model's response"
                        f"- current_model_index: {model_idx + 1}\n"
                        f"- model_responses: (append this response to the list)"
                    )
                else:
                    response_data["next_steps"] = (
                        "All models have been consulted. For the final step, synthesize all perspectives."
                    )
                # Add accumulated responses for tracking
                response_data["accumulated_responses"] = self.accumulated_responses
                return [TextContent(type="text", text=json.dumps(response_data, indent=2))]
        # Otherwise, use standard workflow execution
@@ -520,6 +547,7 @@ of the evidence, even when it strongly points in one direction.""",
                "verdict": response.content,
                "metadata": {
                    "provider": provider.get_provider_type().value,
                    "model_name": model_name,
                },
            }
@@ -627,8 +655,90 @@ of the evidence, even when it strongly points in one direction.""",
        else:
            response_data["consensus_workflow_status"] = "ready_for_synthesis"
        # Customize metadata for consensus workflow
        self._customize_consensus_metadata(response_data, request)
        return response_data
    def _customize_consensus_metadata(self, response_data: dict, request) -> None:
        """
        Customize metadata for consensus workflow to accurately reflect multi-model nature.
        The default workflow metadata shows the model running Claude's analysis steps,
        but consensus is a multi-model tool that consults different models. We need
        to provide accurate metadata that reflects this.
        """
        if "metadata" not in response_data:
            response_data["metadata"] = {}
        metadata = response_data["metadata"]
        # Always preserve tool_name
        metadata["tool_name"] = self.get_name()
        if request.step_number == request.total_steps:
            # Final step - show comprehensive consensus metadata
            models_consulted = []
            if self.models_to_consult:
                models_consulted = [f"{m['model']}:{m.get('stance', 'neutral')}" for m in self.models_to_consult]
            metadata.update(
                {
                    "workflow_type": "multi_model_consensus",
                    "models_consulted": models_consulted,
                    "consensus_complete": True,
                    "total_models": len(self.models_to_consult) if self.models_to_consult else 0,
                }
            )
            # Remove the misleading single model metadata
            metadata.pop("model_used", None)
            metadata.pop("provider_used", None)
        else:
            # Intermediate steps - show consensus workflow in progress
            models_to_consult = []
            if self.models_to_consult:
                models_to_consult = [f"{m['model']}:{m.get('stance', 'neutral')}" for m in self.models_to_consult]
            metadata.update(
                {
                    "workflow_type": "multi_model_consensus",
                    "models_to_consult": models_to_consult,
                    "consultation_step": request.step_number,
                    "total_consultation_steps": request.total_steps,
                }
            )
            # Remove the misleading single model metadata that shows Claude's execution model
            # instead of the models being consulted
            metadata.pop("model_used", None)
            metadata.pop("provider_used", None)
    def _add_workflow_metadata(self, response_data: dict, arguments: dict[str, Any]) -> None:
        """
        Override workflow metadata addition for consensus tool.
        The consensus tool doesn't use single model metadata because it's a multi-model
        workflow. Instead, we provide consensus-specific metadata that accurately
        reflects the models being consulted.
        """
        # Initialize metadata if not present
        if "metadata" not in response_data:
            response_data["metadata"] = {}
        # Add basic tool metadata
        response_data["metadata"]["tool_name"] = self.get_name()
        # The consensus-specific metadata is already added by _customize_consensus_metadata
        # which is called from customize_workflow_response. We don't add the standard
        # single-model metadata (model_used, provider_used) because it's misleading
        # for a multi-model consensus workflow.
        logger.debug(
            f"[CONSENSUS_METADATA] {self.get_name()}: Using consensus-specific metadata instead of single-model metadata"
        )
    def store_initial_issue(self, step_description: str):
        """Store initial prompt for model consultations."""
        self.initial_prompt = step_description