Improvements to consensus

2025-08-08 12:59:41 +05:00
parent b212cae5de
commit e29deb23db
5 changed files with 32 additions and 22 deletions
--- a/config.py
+++ b/config.py
@@ -14,7 +14,7 @@ import os
 # These values are used in server responses and for tracking releases
 # IMPORTANT: This is the single source of truth for version and author info
 # Semantic versioning: MAJOR.MINOR.PATCH
-__version__ = "5.8.4"
+__version__ = "5.8.5"
 # Last update date in ISO format
 __updated__ = "2025-08-08"
 # Primary maintainer
--- a/tests/test_auto_model_planner_fix.py
+++ b/tests/test_auto_model_planner_fix.py
@@ -100,7 +100,7 @@ class TestAutoModelPlannerFix:
        import json

        response_data = json.loads(result[0].text)
-        assert response_data["status"] == "planner_complete"
+        assert response_data["status"] == "planning_complete"
        assert response_data["step_number"] == 1

    @patch("config.DEFAULT_MODEL", "auto")
@@ -172,7 +172,7 @@ class TestAutoModelPlannerFix:
        import json

        response1 = json.loads(result1[0].text)
-        assert response1["status"] == "pause_for_planner"
+        assert response1["status"] == "pause_for_planning"
        assert response1["next_step_required"] is True
        assert "continuation_id" in response1

@@ -190,7 +190,7 @@ class TestAutoModelPlannerFix:
        assert len(result2) > 0

        response2 = json.loads(result2[0].text)
-        assert response2["status"] == "pause_for_planner"
+        assert response2["status"] == "pause_for_planning"
        assert response2["step_number"] == 2

    def test_other_tools_still_require_models(self):
--- a/tests/test_planner.py
+++ b/tests/test_planner.py
@@ -226,7 +226,7 @@ class TestPlannerTool:
        parsed_response = json.loads(response_text)

        # Check final step structure
-        assert parsed_response["status"] == "planner_complete"
+        assert parsed_response["status"] == "planning_complete"
        assert parsed_response["step_number"] == 10
        assert parsed_response["planning_complete"] is True
        assert "plan_summary" in parsed_response
@@ -329,7 +329,7 @@ class TestPlannerTool:
        # Total steps should be adjusted to match current step
        assert parsed_response["total_steps"] == 8
        assert parsed_response["step_number"] == 8
-        assert parsed_response["status"] == "pause_for_planner"
+        assert parsed_response["status"] == "pause_for_planning"

    @pytest.mark.asyncio
    async def test_execute_error_handling(self):
@@ -457,6 +457,6 @@ class TestPlannerToolIntegration:
        assert parsed_response["total_steps"] == 3
        assert parsed_response["continuation_id"] == "test-simple-uuid"
        # For simple plans (< 5 steps), expect normal flow without deep thinking pause
-        assert parsed_response["status"] == "pause_for_planner"
+        assert parsed_response["status"] == "pause_for_planning"
        assert "thinking_required" not in parsed_response
        assert "Continue with step 2" in parsed_response["next_steps"]
--- a/tools/consensus.py
+++ b/tools/consensus.py
@@ -37,11 +37,12 @@ logger = logging.getLogger(__name__)
 # Tool-specific field descriptions for consensus workflow
 CONSENSUS_WORKFLOW_FIELD_DESCRIPTIONS = {
    "step": (
-        "Describe your current consensus analysis step. In step 1, provide your own neutral, balanced analysis "
-        "of the proposal/idea/plan after thinking carefully about all aspects. Consider technical feasibility, "
-        "user value, implementation complexity, and alternatives. In subsequent steps (2+), you will receive "
-        "individual model responses to synthesize. CRITICAL: Be thorough and balanced in your initial assessment, "
-        "considering both benefits and risks, opportunities and challenges."
+        "In step 1: Provide the EXACT question or proposal that ALL models will evaluate. This should be phrased as a clear "
+        "question or problem statement, NOT as 'I will analyze...' or 'Let me examine...'. For example: 'Should we build a "
+        "search component in SwiftUI for use in an AppKit app?' or 'Evaluate the proposal to migrate our database from MySQL "
+        "to PostgreSQL'. This exact text will be sent to all models for their independent evaluation. "
+        "In subsequent steps (2+): This field is for internal tracking only - you can provide notes about the model response "
+        "you just received. This will NOT be sent to other models (they all receive the original proposal from step 1)."
    ),
    "step_number": (
        "The index of the current step in the consensus workflow, beginning at 1. Step 1 is your analysis, "
@@ -54,8 +55,11 @@ CONSENSUS_WORKFLOW_FIELD_DESCRIPTIONS = {
    ),
    "next_step_required": ("Set to true if more models need to be consulted. False when ready for final synthesis."),
    "findings": (
-        "In step 1, provide your comprehensive analysis of the proposal. In steps 2+, summarize the key points "
-        "from the model response received, noting agreements and disagreements with previous analyses."
+        "In step 1: Provide YOUR OWN comprehensive analysis of the proposal/question. This is where you share your "
+        "independent evaluation, considering technical feasibility, risks, benefits, and alternatives. This analysis "
+        "is NOT sent to other models - it's recorded for the final synthesis. "
+        "In steps 2+: Summarize the key points from the model response received, noting agreements and disagreements "
+        "with previous analyses."
    ),
    "relevant_files": (
        "Files that are relevant to the consensus analysis. Include files that help understand the proposal, "
@@ -161,6 +165,7 @@ class ConsensusTool(WorkflowTool):
    def __init__(self):
        super().__init__()
        self.initial_prompt: str | None = None
+        self.original_proposal: str | None = None  # Store the original proposal separately
        self.models_to_consult: list[dict] = []
        self.accumulated_responses: list[dict] = []
        self._current_arguments: dict[str, Any] = {}
@@ -394,7 +399,7 @@ of the evidence, even when it strongly points in one direction.""",

        # Prepare final synthesis data
        response_data["complete_consensus"] = {
-            "initial_prompt": self.initial_prompt,
+            "initial_prompt": self.original_proposal if self.original_proposal else self.initial_prompt,
            "models_consulted": [m["model"] + ":" + m.get("stance", "neutral") for m in self.accumulated_responses],
            "total_responses": len(self.accumulated_responses),
            "consensus_confidence": "high",  # Consensus complete
@@ -445,7 +450,9 @@ of the evidence, even when it strongly points in one direction.""",

        # On first step, store the models to consult
        if request.step_number == 1:
-            self.initial_prompt = request.step
+            # Store the original proposal from step 1 - this is what all models should see
+            self.original_proposal = request.step
+            self.initial_prompt = request.step  # Keep for backward compatibility
            self.models_to_consult = request.models or []
            self.accumulated_responses = []
            # Set total steps: len(models) (each step includes consultation + response)
@@ -488,7 +495,7 @@ of the evidence, even when it strongly points in one direction.""",
                    response_data["status"] = "consensus_workflow_complete"
                    response_data["consensus_complete"] = True
                    response_data["complete_consensus"] = {
-                        "initial_prompt": self.initial_prompt,
+                        "initial_prompt": self.original_proposal if self.original_proposal else self.initial_prompt,
                        "models_consulted": [
                            f"{m['model']}:{m.get('stance', 'neutral')}" for m in self.accumulated_responses
                        ],
@@ -539,7 +546,9 @@ of the evidence, even when it strongly points in one direction.""",
            # Prepare the prompt with any relevant files
            # Use continuation_id=None for blinded consensus - each model should only see
            # original prompt + files, not conversation history or other model responses
-            prompt = self.initial_prompt
+            # CRITICAL: Use the original proposal from step 1, NOT what's in request.step for steps 2+!
+            # Steps 2+ contain summaries/notes that must NEVER be sent to other models
+            prompt = self.original_proposal if self.original_proposal else self.initial_prompt
            if request.relevant_files:
                file_content, _ = self._prepare_file_content_for_prompt(
                    request.relevant_files,
@@ -761,7 +770,8 @@ of the evidence, even when it strongly points in one direction.""",

    def store_initial_issue(self, step_description: str):
        """Store initial prompt for model consultations."""
-        self.initial_prompt = step_description
+        self.original_proposal = step_description
+        self.initial_prompt = step_description  # Keep for backward compatibility

    # Required abstract methods from BaseTool
    def get_request_model(self):
--- a/tools/tracer.py
+++ b/tools/tracer.py
@@ -535,9 +535,9 @@ class TracerTool(WorkflowTool):
        tool_name = self.get_name()
        status_mapping = {
            f"{tool_name}_in_progress": "tracing_in_progress",
-            f"pause_for_{tool_name}": f"pause_for_tracing",
-            f"{tool_name}_required": f"tracing_required",
-            f"{tool_name}_complete": f"tracing_complete",
+            f"pause_for_{tool_name}": "pause_for_tracing",
+            f"{tool_name}_required": "tracing_required",
+            f"{tool_name}_complete": "tracing_complete",
        }

        if response_data["status"] in status_mapping: