Add DocGen tool with comprehensive documentation generation capabilities (#109)

* WIP: new workflow architecture

* WIP: further improvements and cleanup

* WIP: cleanup and docs, replace old tool with new

* WIP: new planner implementation using workflow

* WIP: precommit tool working as a workflow instead of a basic tool
Support for passing use_assistant_model=False to skip external models entirely and use Claude only
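For instance, a precommit step could opt out of the external model like this (a rough sketch only; the surrounding workflow fields are illustrative and the exact argument schema may differ):

```python
# Hypothetical tool arguments; only "use_assistant_model" is the point here.
arguments = {
    "step": "Validate the staged changes before committing",
    "step_number": 1,
    "total_steps": 1,
    "next_step_required": False,
    "findings": "Reviewed the staged diff locally",
    "use_assistant_model": False,  # skip external models entirely; Claude completes the step alone
}
```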

* WIP: precommit workflow version swapped with old

* WIP: codereview

* WIP: replaced codereview

* WIP: replaced refactor

* WIP: workflow for thinkdeep

* WIP: ensure files get embedded correctly

* WIP: thinkdeep replaced with workflow version

* WIP: improved messaging when an external model's response is received

* WIP: analyze tool swapped

* WIP: updated tests
* Extract only the content when building history
* Use "relevant_files" for workflow tools only

* WIP: updated tests
* Extract only the content when building history
* Use "relevant_files" for workflow tools only

* WIP: fixed get_completion_next_steps_message missing param

* Fixed tests
Request files consistently

* New testgen workflow tool
Updated docs

* Swap testgen workflow

* Fix CI test failures by excluding API-dependent tests

- Update GitHub Actions workflow to exclude simulation tests that require API keys
- Fix collaboration tests to properly mock workflow tool expert analysis calls (see the sketch below)
- Update test assertions to handle new workflow tool response format
- Ensure unit tests run without external API dependencies in CI
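A minimal sketch of that mocking pattern (the tool import path and the patched method name are assumptions for illustration, not the exact ones in the suite):

```python
from unittest.mock import patch

from tools.codereview import CodeReviewTool  # assumed import path, for illustration only


def test_codereview_metadata_without_api_calls():
    # Patch the assumed expert-analysis hook so no provider API key is needed in CI.
    with patch.object(CodeReviewTool, "_call_expert_analysis", return_value={"status": "analysis_complete"}):
        tool = CodeReviewTool()
        assert tool.get_name() == "codereview"
```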

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

* WIP - Update tests to match new tools

* Should help with https://github.com/BeehiveInnovations/zen-mcp-server/issues/97
Clear the Python cache when running the script (see the sketch below): https://github.com/BeehiveInnovations/zen-mcp-server/issues/96
Improved retry error logging
Cleanup
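A minimal sketch of the cache-clearing step, assuming it simply walks the checkout and deletes bytecode caches (the run script may implement this differently):

```python
import shutil
from pathlib import Path

# Remove stale bytecode caches under the project root so a fresh run never
# imports modules compiled from an older checkout.
for cache_dir in Path(".").rglob("__pycache__"):
    shutil.rmtree(cache_dir, ignore_errors=True)
```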

* WIP - chat tool using new architecture and improved code sharing

* Removed todo

* Cleanup old name

* Tweak wordings
Migrate old tests

* Support for Flash 2.0 and Flash Lite 2.0
Fixed test

* Improved consensus to use the workflow base class

* Allow images

* Replaced old consensus tool

* Cleanup tests

* Tests for prompt size

* New tool: docgen
Tests for prompt size
Fixes: https://github.com/BeehiveInnovations/zen-mcp-server/issues/107
Use available token size limits: https://github.com/BeehiveInnovations/zen-mcp-server/issues/105
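The token-limit change is roughly in this spirit (a sketch only: the helper names, the ~4-characters-per-token heuristic, and the reserved response budget are assumptions, not the server's actual code):

```python
def estimate_tokens(text: str) -> int:
    # Rough heuristic: roughly 4 characters per token.
    return max(1, len(text) // 4)


def fit_to_budget(content: str, max_context_tokens: int, reserved_for_response: int = 8000) -> str:
    # Trim embedded content to the model's advertised context window instead of a fixed cap.
    budget = max(0, max_context_tokens - reserved_for_response)
    estimated = estimate_tokens(content)
    if estimated <= budget:
        return content
    # Truncate proportionally; a real implementation would cut on file boundaries.
    keep_chars = int(len(content) * (budget / estimated))
    return content[:keep_chars]
```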

* Improved docgen prompt
Exclude TestGen from pytest inclusion
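The pytest exclusion likely boils down to marking the tool class as non-collectable, since its name matches pytest's default Test* pattern (a sketch using the standard `__test__` attribute; the real fix may instead adjust the pytest configuration):

```python
class TestGenTool:
    """Test-generation tool; the class name matches pytest's default Test* pattern."""

    __test__ = False  # tell pytest this is a production class, not a test case
```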

* Updated errors

* Lint

* DocGen instructed not to fix bugs but to surface them and stick to documentation

* WIP

* Stop Claude from being lazy and documenting only a small handful of functions

* More style rules

---------

Co-authored-by: Claude <noreply@anthropic.com>
Commit: c960bcb720 (parent: 0655590a51)
Author: Beehive Innovations
Date: 2025-06-21 23:21:19 -07:00
Committed by: GitHub
58 changed files with 5492 additions and 5558 deletions

@@ -1,220 +1,401 @@
"""
Tests for the Consensus tool
Tests for the Consensus tool using WorkflowTool architecture.
"""
import json
from unittest.mock import patch
from unittest.mock import Mock, patch
import pytest
from tools.consensus import ConsensusTool, ModelConfig
from tools.consensus import ConsensusRequest, ConsensusTool
from tools.models import ToolModelCategory
class TestConsensusTool:
"""Test cases for the Consensus tool"""
def setup_method(self):
"""Set up test fixtures"""
self.tool = ConsensusTool()
"""Test suite for ConsensusTool using WorkflowTool architecture."""
def test_tool_metadata(self):
"""Test tool metadata is correct"""
assert self.tool.get_name() == "consensus"
assert "MULTI-MODEL CONSENSUS" in self.tool.get_description()
assert self.tool.get_default_temperature() == 0.2
"""Test basic tool metadata and configuration."""
tool = ConsensusTool()
def test_input_schema(self):
"""Test input schema is properly defined"""
schema = self.tool.get_input_schema()
assert schema["type"] == "object"
assert "prompt" in schema["properties"]
assert tool.get_name() == "consensus"
assert "COMPREHENSIVE CONSENSUS WORKFLOW" in tool.get_description()
assert tool.get_default_temperature() == 0.2 # TEMPERATURE_ANALYTICAL
assert tool.get_model_category() == ToolModelCategory.EXTENDED_REASONING
assert tool.requires_model() is True

    def test_request_validation_step1(self):
        """Test Pydantic request model validation for step 1."""
        # Valid step 1 request with models
        step1_request = ConsensusRequest(
            step="Analyzing the real-time collaboration proposal",
            step_number=1,
            total_steps=4,  # 1 (Claude) + 2 models + 1 (synthesis)
            next_step_required=True,
            findings="Initial assessment shows strong value but technical complexity",
            confidence="medium",
            models=[{"model": "flash", "stance": "neutral"}, {"model": "o3-mini", "stance": "for"}],
            relevant_files=["/proposal.md"],
        )

        assert step1_request.step_number == 1
        assert step1_request.confidence == "medium"
        assert len(step1_request.models) == 2
        assert step1_request.models[0]["model"] == "flash"

    def test_request_validation_missing_models_step1(self):
        """Test that step 1 requires models field."""
        with pytest.raises(ValueError, match="Step 1 requires 'models' field"):
            ConsensusRequest(
                step="Test step",
                step_number=1,
                total_steps=3,
                next_step_required=True,
                findings="Test findings",
                # Missing models field
            )

    def test_request_validation_later_steps(self):
        """Test request validation for steps 2+."""
        # Step 2+ doesn't require models field
        step2_request = ConsensusRequest(
            step="Processing first model response",
            step_number=2,
            total_steps=4,
            next_step_required=True,
            findings="Model provided supportive perspective",
            confidence="medium",
            continuation_id="test-id",
            current_model_index=1,
        )

        assert step2_request.step_number == 2
        assert step2_request.models is None  # Not required after step 1

    def test_request_validation_duplicate_model_stance(self):
        """Test that duplicate model+stance combinations are rejected."""
        # Valid: same model with different stances
        valid_request = ConsensusRequest(
            step="Analyze this proposal",
            step_number=1,
            total_steps=1,
            next_step_required=True,
            findings="Initial analysis",
            models=[
                {"model": "o3", "stance": "for"},
                {"model": "o3", "stance": "against"},
                {"model": "flash", "stance": "neutral"},
            ],
            continuation_id="test-id",
        )
        assert len(valid_request.models) == 3

        # Invalid: duplicate model+stance combination
        with pytest.raises(ValueError, match="Duplicate model \\+ stance combination"):
            ConsensusRequest(
                step="Analyze this proposal",
                step_number=1,
                total_steps=1,
                next_step_required=True,
                findings="Initial analysis",
                models=[
                    {"model": "o3", "stance": "for"},
                    {"model": "flash", "stance": "neutral"},
                    {"model": "o3", "stance": "for"},  # Duplicate!
                ],
                continuation_id="test-id",
            )

    def test_input_schema_generation(self):
        """Test that input schema is generated correctly."""
        tool = ConsensusTool()
        schema = tool.get_input_schema()

        # Verify consensus workflow fields are present
        assert "step" in schema["properties"]
        assert "step_number" in schema["properties"]
        assert "total_steps" in schema["properties"]
        assert "next_step_required" in schema["properties"]
        assert "findings" in schema["properties"]
        # confidence field should be excluded
        assert "confidence" not in schema["properties"]
        assert "models" in schema["properties"]
        # relevant_files should also be excluded
        assert "relevant_files" not in schema["properties"]

        # Check that schema includes model configuration information
        models_desc = schema["properties"]["models"]["description"]
        # Check description includes object format
        assert "model configurations" in models_desc
        assert "specific stance and custom instructions" in models_desc
        # Check example shows new format
        assert "'model': 'o3'" in models_desc
        assert "'stance': 'for'" in models_desc
        assert "'stance_prompt'" in models_desc

        # Verify workflow fields that should NOT be present
        assert "files_checked" not in schema["properties"]
        assert "hypothesis" not in schema["properties"]
        assert "issues_found" not in schema["properties"]
        assert "temperature" not in schema["properties"]
        assert "thinking_mode" not in schema["properties"]
        assert "use_websearch" not in schema["properties"]

        # Images should be present now
        assert "images" in schema["properties"]
        assert schema["properties"]["images"]["type"] == "array"
        assert schema["properties"]["images"]["items"]["type"] == "string"

        # Verify field types
        assert schema["properties"]["step"]["type"] == "string"
        assert schema["properties"]["step_number"]["type"] == "integer"
        assert schema["properties"]["models"]["type"] == "array"

        # Verify models array structure
        models_items = schema["properties"]["models"]["items"]
        assert models_items["type"] == "object"
        assert "model" in models_items["properties"]
        assert "stance" in models_items["properties"]
        assert "stance_prompt" in models_items["properties"]

    def test_get_required_actions(self):
        """Test required actions for different consensus phases."""
        tool = ConsensusTool()

        # Step 1: Claude's initial analysis
        actions = tool.get_required_actions(1, "exploring", "Initial findings", 4)
        assert any("initial analysis" in action for action in actions)
        assert any("consult other models" in action for action in actions)

        # Step 2-3: Model consultations
        actions = tool.get_required_actions(2, "medium", "Model findings", 4)
        assert any("Review the model response" in action for action in actions)

        # Final step: Synthesis
        actions = tool.get_required_actions(4, "high", "All findings", 4)
        assert any("All models have been consulted" in action for action in actions)
        assert any("Synthesize all perspectives" in action for action in actions)

    def test_prepare_step_data(self):
        """Test step data preparation for consensus workflow."""
        tool = ConsensusTool()
        request = ConsensusRequest(
            step="Test step",
            step_number=1,
            total_steps=3,
            next_step_required=True,
            findings="Test findings",
            confidence="medium",
            models=[{"model": "test"}],
            relevant_files=["/test.py"],
        )

        step_data = tool.prepare_step_data(request)

        # Verify consensus-specific fields
        assert step_data["step"] == "Test step"
        assert step_data["findings"] == "Test findings"
        assert step_data["relevant_files"] == ["/test.py"]

        # Verify unused workflow fields are empty
        assert step_data["files_checked"] == []
        assert step_data["relevant_context"] == []
        assert step_data["issues_found"] == []
        assert step_data["hypothesis"] is None

    def test_stance_enhanced_prompt_generation(self):
        """Test stance-enhanced prompt generation."""
        tool = ConsensusTool()

        # Test different stances
        for_prompt = tool._get_stance_enhanced_prompt("for")
        assert "SUPPORTIVE PERSPECTIVE" in for_prompt

        against_prompt = tool._get_stance_enhanced_prompt("against")
        assert "CRITICAL PERSPECTIVE" in against_prompt

        neutral_prompt = tool._get_stance_enhanced_prompt("neutral")
        assert "BALANCED PERSPECTIVE" in neutral_prompt

        # Test custom stance prompt
        custom = "Focus on specific aspects"
        custom_prompt = tool._get_stance_enhanced_prompt("for", custom)
        assert custom in custom_prompt
        assert "SUPPORTIVE PERSPECTIVE" not in custom_prompt

    def test_should_call_expert_analysis(self):
        """Test that consensus workflow doesn't use expert analysis."""
        tool = ConsensusTool()
        assert tool.should_call_expert_analysis({}) is False
        assert tool.requires_expert_analysis() is False

    @pytest.mark.asyncio
    async def test_execute_workflow_step1(self):
        """Test workflow execution for step 1."""
        tool = ConsensusTool()

        arguments = {
            "step": "Initial analysis of proposal",
            "step_number": 1,
            "total_steps": 4,
            "next_step_required": True,
            "findings": "Found pros and cons",
            "confidence": "medium",
            "models": [{"model": "flash", "stance": "neutral"}, {"model": "o3-mini", "stance": "for"}],
            "relevant_files": ["/proposal.md"],
        }

        with patch.object(tool, "is_effective_auto_mode", return_value=False):
            with patch.object(tool, "get_model_provider", return_value=Mock()):
                result = await tool.execute_workflow(arguments)

        assert len(result) == 1
        response_text = result[0].text
        response_data = json.loads(response_text)

        # Verify step 1 response structure
        assert response_data["status"] == "consulting_models"
        assert response_data["step_number"] == 1
        assert "continuation_id" in response_data

    @pytest.mark.asyncio
    async def test_execute_workflow_model_consultation(self):
        """Test workflow execution for model consultation steps."""
        tool = ConsensusTool()
        tool.models_to_consult = [{"model": "flash", "stance": "neutral"}, {"model": "o3-mini", "stance": "for"}]
        tool.initial_prompt = "Test prompt"

        arguments = {
            "step": "Processing model response",
            "step_number": 2,
            "total_steps": 4,
            "next_step_required": True,
            "findings": "Model provided perspective",
            "confidence": "medium",
            "continuation_id": "test-id",
            "current_model_index": 0,
        }

        # Mock the _consult_model method instead to return a proper dict
        mock_model_response = {
            "model": "flash",
            "stance": "neutral",
            "status": "success",
            "verdict": "Model analysis response",
            "metadata": {"provider": "gemini"},
        }

        with patch.object(tool, "_consult_model", return_value=mock_model_response):
            result = await tool.execute_workflow(arguments)

        assert len(result) == 1
        response_text = result[0].text
        response_data = json.loads(response_text)

        # Verify model consultation response
        assert response_data["status"] == "model_consulted"
        assert response_data["model_consulted"] == "flash"
        assert response_data["model_stance"] == "neutral"
        assert "model_response" in response_data
        assert response_data["model_response"]["status"] == "success"

    @pytest.mark.asyncio
    async def test_consult_model_error_handling(self):
        """Test error handling in model consultation."""
        tool = ConsensusTool()
        tool.initial_prompt = "Test prompt"

        # Mock provider to raise an error
        mock_provider = Mock()
        mock_provider.generate_content.side_effect = Exception("Model error")

        with patch.object(tool, "get_model_provider", return_value=mock_provider):
            result = await tool._consult_model(
                {"model": "test-model", "stance": "neutral"}, Mock(relevant_files=[], continuation_id=None, images=None)
            )

        assert result["status"] == "error"
        assert result["error"] == "Model error"
        assert result["model"] == "test-model"

    @pytest.mark.asyncio
    async def test_consult_model_with_images(self):
        """Test model consultation with images."""
        tool = ConsensusTool()
        tool.initial_prompt = "Test prompt"

        # Mock provider
        mock_provider = Mock()
        mock_response = Mock(content="Model response with image analysis")
        mock_provider.generate_content.return_value = mock_response
        mock_provider.get_provider_type.return_value = Mock(value="gemini")

        test_images = ["/path/to/image1.png", "/path/to/image2.jpg"]

        with patch.object(tool, "get_model_provider", return_value=mock_provider):
            result = await tool._consult_model(
                {"model": "test-model", "stance": "neutral"},
                Mock(relevant_files=[], continuation_id=None, images=test_images),
            )

        # Verify that images were passed to generate_content
        mock_provider.generate_content.assert_called_once()
        call_args = mock_provider.generate_content.call_args
        assert call_args.kwargs.get("images") == test_images

        assert result["status"] == "success"
        assert result["model"] == "test-model"

    @pytest.mark.asyncio
    async def test_handle_work_completion(self):
        """Test work completion handling for consensus workflow."""
        tool = ConsensusTool()
        tool.initial_prompt = "Test prompt"
        tool.accumulated_responses = [{"model": "flash", "stance": "neutral"}, {"model": "o3-mini", "stance": "for"}]

        request = Mock(confidence="high")
        response_data = {}

        result = await tool.handle_work_completion(response_data, request, {})

        assert result["consensus_complete"] is True
        assert result["status"] == "consensus_workflow_complete"
        assert "complete_consensus" in result
        assert result["complete_consensus"]["models_consulted"] == ["flash:neutral", "o3-mini:for"]
        assert result["complete_consensus"]["total_responses"] == 2

    def test_handle_work_continuation(self):
        """Test work continuation handling between steps."""
        tool = ConsensusTool()
        tool.models_to_consult = [{"model": "flash", "stance": "neutral"}, {"model": "o3-mini", "stance": "for"}]

        # Test after step 1
        request = Mock(step_number=1, current_model_index=0)
        response_data = {}
        result = tool.handle_work_continuation(response_data, request)
        assert result["status"] == "consulting_models"
        assert result["next_model"] == {"model": "flash", "stance": "neutral"}

        # Test between model consultations
        request = Mock(step_number=2, current_model_index=1)
        response_data = {}
        result = tool.handle_work_continuation(response_data, request)
        assert result["status"] == "consulting_next_model"
        assert result["next_model"] == {"model": "o3-mini", "stance": "for"}
        assert result["models_remaining"] == 1

    def test_customize_workflow_response(self):
        """Test response customization for consensus workflow."""
        tool = ConsensusTool()
        tool.accumulated_responses = [{"model": "test", "response": "data"}]

        # Test different step numbers
        request = Mock(step_number=1, total_steps=4)
        response_data = {}
        result = tool.customize_workflow_response(response_data, request)
        assert result["consensus_workflow_status"] == "initial_analysis_complete"

        request = Mock(step_number=2, total_steps=4)
        response_data = {}
        result = tool.customize_workflow_response(response_data, request)
        assert result["consensus_workflow_status"] == "consulting_models"

        request = Mock(step_number=4, total_steps=4)
        response_data = {}
        result = tool.customize_workflow_response(response_data, request)
        assert result["consensus_workflow_status"] == "ready_for_synthesis"

if __name__ == "__main__":