🚀 Major Enhancement: Workflow-Based Tool Architecture v5.5.0 (#95)

* WIP: new workflow architecture

* WIP: further improvements and cleanup

* WIP: cleanup and docs, replace old tool with new

* WIP: new planner implementation using workflow

* WIP: precommit tool now works as a workflow instead of a basic tool
Support for passing use_assistant_model=False to skip external models entirely and use Claude only (see the sketch below)
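
A minimal sketch of how a caller might opt out of the external model. Everything here except the use_assistant_model flag is an assumption, not the precommit workflow's actual schema:

    # Illustrative only: field names besides "use_assistant_model" are guesses.
    precommit_arguments = {
        "step": "Review staged changes before committing",
        "step_number": 1,
        "total_steps": 1,
        "next_step_required": False,
        "findings": "Initial pass over the staged diff",
        "use_assistant_model": False,  # skip external models entirely; Claude handles the step alone
    }
    # result = await PrecommitTool().execute(precommit_arguments)  # hypothetical call, shown for shape only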

* WIP: precommit workflow version swapped with old

* WIP: codereview

* WIP: replaced codereview

* WIP: replaced refactor

* WIP: workflow for thinkdeep

* WIP: ensure files get embedded correctly

* WIP: thinkdeep replaced with workflow version

* WIP: improved messaging when an external model's response is received

* WIP: analyze tool swapped

* WIP: updated tests
* Extract only the content when building conversation history
* Use "relevant_files" for workflow tools only (see the sketch below)
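
A rough sketch of the intent behind these two points; the helper and field names are invented for illustration and are not the project's actual code:

    # Hypothetical helper, assuming each prior turn is stored as a dict.
    def build_history(turns, is_workflow_tool):
        parts = []
        for turn in turns:
            parts.append(turn.get("content", ""))  # extract only the content, not tool metadata
            if is_workflow_tool and turn.get("relevant_files"):
                # "relevant_files" is surfaced for workflow tools only
                parts.append("Relevant files: " + ", ".join(turn["relevant_files"]))
        return "\n".join(parts)

    print(build_history([{"content": "Step 1 findings", "relevant_files": ["/api/session_manager.py"]}], True))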

* WIP: fixed get_completion_next_steps_message missing param

* Fixed tests
Request files consistently

* Fixed tests

* New testgen workflow tool
Updated docs

* Swap testgen workflow

* Fix CI test failures by excluding API-dependent tests

- Update GitHub Actions workflow to exclude simulation tests that require API keys
- Fix collaboration tests to properly mock workflow tool expert analysis calls
- Update test assertions to handle new workflow tool response format
- Ensure unit tests run without external API dependencies in CI (minimal sketch below)
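
One plausible shape for the exclusion, assuming pytest and an API-key environment variable; the marker and variable names are assumptions, not necessarily what this repo uses:

    import os

    import pytest

    # Skip simulation tests whenever no real API key is configured (e.g. in CI).
    requires_api_key = pytest.mark.skipif(
        not os.environ.get("GEMINI_API_KEY"),
        reason="simulation test needs a live API key; excluded from CI runs",
    )

    @requires_api_key
    def test_end_to_end_workflow_with_live_model():
        ...  # would exercise a real provider; unit tests stay independent of it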

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

* WIP - Update tests to match new tools

---------

Co-authored-by: Claude <noreply@anthropic.com>
Beehive Innovations authored 2025-06-21 00:08:11 +04:00; committed by GitHub
parent 4dae6e457e
commit 69a3121452
76 changed files with 17111 additions and 7725 deletions

@@ -1,17 +1,13 @@
"""
Tests for the debug tool using new WorkflowTool architecture.
"""
from unittest.mock import patch
import pytest
from tools.debug import DebugInvestigationRequest, DebugIssueTool
from tools.models import ToolModelCategory
class TestDebugTool:
"""Test suite for DebugIssueTool."""
"""Test suite for DebugIssueTool using new WorkflowTool architecture."""
def test_tool_metadata(self):
"""Test basic tool metadata and configuration."""
@@ -21,7 +17,7 @@ class TestDebugTool:
assert "DEBUG & ROOT CAUSE ANALYSIS" in tool.get_description()
assert tool.get_default_temperature() == 0.2 # TEMPERATURE_ANALYTICAL
assert tool.get_model_category() == ToolModelCategory.EXTENDED_REASONING
assert tool.requires_model() is True # Requires model resolution for expert analysis
def test_request_validation(self):
"""Test Pydantic request model validation."""
@@ -29,622 +25,62 @@ class TestDebugTool:
step_request = DebugInvestigationRequest(
step="Investigating null pointer exception in UserService",
step_number=1,
total_steps=3,
next_step_required=True,
findings="Found that UserService.getUser() is called with null ID",
)
assert step_request.step == "Investigating null pointer exception in UserService"
assert step_request.step_number == 1
assert step_request.next_step_required is True
assert step_request.confidence == "low" # default
# Request with optional fields
detailed_request = DebugInvestigationRequest(
step="Deep dive into getUser method implementation",
step_number=2,
total_steps=5,
next_step_required=True,
findings="Method doesn't validate input parameters",
files_checked=["/src/UserService.java", "/src/UserController.java"],
findings="Found potential null reference in user authentication flow",
files_checked=["/src/UserService.java"],
relevant_files=["/src/UserService.java"],
relevant_methods=["UserService.getUser", "UserController.handleRequest"],
hypothesis="Null ID passed from controller without validation",
relevant_methods=["authenticate", "validateUser"],
confidence="medium",
hypothesis="Null pointer occurs when user object is not properly validated",
)
assert len(detailed_request.files_checked) == 2
assert len(detailed_request.relevant_files) == 1
assert detailed_request.confidence == "medium"
# Missing required fields should fail
with pytest.raises(ValueError):
DebugInvestigationRequest() # Missing all required fields
with pytest.raises(ValueError):
DebugInvestigationRequest(step="test") # Missing other required fields
assert step_request.step_number == 1
assert step_request.confidence == "medium"
assert len(step_request.relevant_methods) == 2
assert len(step_request.relevant_context) == 2 # Should be mapped from relevant_methods
def test_input_schema_generation(self):
"""Test JSON schema generation for MCP client."""
"""Test that input schema is generated correctly."""
tool = DebugIssueTool()
schema = tool.get_input_schema()
assert schema["type"] == "object"
# Verify required investigation fields are present
assert "step" in schema["properties"]
assert "step_number" in schema["properties"]
assert "total_steps" in schema["properties"]
assert "next_step_required" in schema["properties"]
assert "findings" in schema["properties"]
assert "files_checked" in schema["properties"]
assert "relevant_files" in schema["properties"]
assert "relevant_methods" in schema["properties"]
assert "hypothesis" in schema["properties"]
assert "confidence" in schema["properties"]
assert "backtrack_from_step" in schema["properties"]
assert "continuation_id" in schema["properties"]
assert "images" in schema["properties"] # Now supported for visual debugging
# Check model field is present (fixed from previous bug)
assert "model" in schema["properties"]
# Check excluded fields are NOT present
assert "temperature" not in schema["properties"]
assert "thinking_mode" not in schema["properties"]
assert "use_websearch" not in schema["properties"]
# Check required fields
assert "step" in schema["required"]
assert "step_number" in schema["required"]
assert "total_steps" in schema["required"]
assert "next_step_required" in schema["required"]
assert "findings" in schema["required"]
# Verify field types
assert schema["properties"]["step"]["type"] == "string"
assert schema["properties"]["step_number"]["type"] == "integer"
assert schema["properties"]["next_step_required"]["type"] == "boolean"
assert schema["properties"]["relevant_methods"]["type"] == "array"
def test_model_category_for_debugging(self):
"""Test that debug uses extended reasoning category."""
"""Test that debug tool correctly identifies as extended reasoning category."""
tool = DebugIssueTool()
# Debugging needs deep thinking
assert tool.get_model_category() == ToolModelCategory.EXTENDED_REASONING
def test_field_mapping_relevant_methods_to_context(self):
"""Test that relevant_methods maps to relevant_context internally."""
request = DebugInvestigationRequest(
step="Test investigation",
step_number=1,
total_steps=2,
next_step_required=True,
findings="Test findings",
relevant_methods=["method1", "method2"],
)
@pytest.mark.asyncio
async def test_execute_first_investigation_step(self):
"""Test execute method for first investigation step."""
# External API should have relevant_methods
assert request.relevant_methods == ["method1", "method2"]
# Internal processing should map to relevant_context
assert request.relevant_context == ["method1", "method2"]
# Test step data preparation
tool = DebugIssueTool()
arguments = {
"step": "Investigating intermittent session validation failures in production",
"step_number": 1,
"total_steps": 5,
"next_step_required": True,
"findings": "Users report random session invalidation, occurs more during high traffic",
"files_checked": ["/api/session_manager.py"],
"relevant_files": ["/api/session_manager.py"],
}
# Mock conversation memory functions
with patch("utils.conversation_memory.create_thread", return_value="debug-uuid-123"):
with patch("utils.conversation_memory.add_turn"):
result = await tool.execute(arguments)
# Should return a list with TextContent
assert len(result) == 1
assert result[0].type == "text"
# Parse the JSON response
import json
parsed_response = json.loads(result[0].text)
# Debug tool now returns "pause_for_investigation" for ongoing steps
assert parsed_response["status"] == "pause_for_investigation"
assert parsed_response["step_number"] == 1
assert parsed_response["total_steps"] == 5
assert parsed_response["next_step_required"] is True
assert parsed_response["continuation_id"] == "debug-uuid-123"
assert parsed_response["investigation_status"]["files_checked"] == 1
assert parsed_response["investigation_status"]["relevant_files"] == 1
assert parsed_response["investigation_required"] is True
assert "required_actions" in parsed_response
@pytest.mark.asyncio
async def test_execute_subsequent_investigation_step(self):
"""Test execute method for subsequent investigation step."""
tool = DebugIssueTool()
# Set up initial state
tool.initial_issue = "Session validation failures"
tool.consolidated_findings["files_checked"].add("/api/session_manager.py")
arguments = {
"step": "Examining session cleanup method for concurrent modification issues",
"step_number": 2,
"total_steps": 5,
"next_step_required": True,
"findings": "Found dictionary modification during iteration in cleanup_expired_sessions",
"files_checked": ["/api/session_manager.py", "/api/utils.py"],
"relevant_files": ["/api/session_manager.py"],
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
"hypothesis": "Dictionary modified during iteration causing RuntimeError",
"confidence": "high",
"continuation_id": "debug-uuid-123",
}
# Mock conversation memory functions
with patch("utils.conversation_memory.add_turn"):
result = await tool.execute(arguments)
# Should return a list with TextContent
assert len(result) == 1
assert result[0].type == "text"
# Parse the JSON response
import json
parsed_response = json.loads(result[0].text)
assert parsed_response["step_number"] == 2
assert parsed_response["next_step_required"] is True
assert parsed_response["continuation_id"] == "debug-uuid-123"
assert parsed_response["investigation_status"]["files_checked"] == 2 # Cumulative
assert parsed_response["investigation_status"]["relevant_methods"] == 1
assert parsed_response["investigation_status"]["current_confidence"] == "high"
@pytest.mark.asyncio
async def test_execute_final_investigation_step(self):
"""Test execute method for final investigation step with expert analysis."""
tool = DebugIssueTool()
# Set up investigation history
tool.initial_issue = "Session validation failures"
tool.investigation_history = [
{
"step_number": 1,
"step": "Initial investigation of session validation failures",
"findings": "Initial investigation",
"files_checked": ["/api/utils.py"],
},
{
"step_number": 2,
"step": "Deeper analysis of session manager",
"findings": "Found dictionary issue",
"files_checked": ["/api/session_manager.py"],
},
]
tool.consolidated_findings = {
"files_checked": {"/api/session_manager.py", "/api/utils.py"},
"relevant_files": {"/api/session_manager.py"},
"relevant_methods": {"SessionManager.cleanup_expired_sessions"},
"findings": ["Step 1: Initial investigation", "Step 2: Found dictionary issue"],
"hypotheses": [{"step": 2, "hypothesis": "Dictionary modified during iteration", "confidence": "high"}],
"images": [],
}
arguments = {
"step": "Confirmed the root cause and identified fix",
"step_number": 3,
"total_steps": 3,
"next_step_required": False, # Final step
"findings": "Root cause confirmed: dictionary modification during iteration in cleanup method",
"files_checked": ["/api/session_manager.py"],
"relevant_files": ["/api/session_manager.py"],
"relevant_methods": ["SessionManager.cleanup_expired_sessions"],
"hypothesis": "Dictionary modification during iteration causes intermittent RuntimeError",
"confidence": "high",
"continuation_id": "debug-uuid-123",
}
# Mock the expert analysis call
mock_expert_response = {
"status": "analysis_complete",
"summary": "Dictionary modification during iteration bug identified",
"hypotheses": [
{
"name": "CONCURRENT_MODIFICATION",
"confidence": "High",
"root_cause": "Modifying dictionary while iterating",
"minimal_fix": "Create list of keys to delete first",
}
],
}
# Mock conversation memory and file reading
with patch("utils.conversation_memory.add_turn"):
with patch.object(tool, "_call_expert_analysis", return_value=mock_expert_response):
with patch.object(tool, "_prepare_file_content_for_prompt", return_value=("file content", 100)):
result = await tool.execute(arguments)
# Should return a list with TextContent
assert len(result) == 1
response_text = result[0].text
# Parse the JSON response
import json
parsed_response = json.loads(response_text)
# Check final step structure
assert parsed_response["status"] == "calling_expert_analysis"
assert parsed_response["investigation_complete"] is True
assert parsed_response["expert_analysis"]["status"] == "analysis_complete"
assert "complete_investigation" in parsed_response
assert parsed_response["complete_investigation"]["steps_taken"] == 3 # All steps including current
@pytest.mark.asyncio
async def test_execute_with_backtracking(self):
"""Test execute method with backtracking to revise findings."""
tool = DebugIssueTool()
# Set up some investigation history with all required fields
tool.investigation_history = [
{
"step": "Initial investigation",
"step_number": 1,
"findings": "Initial findings",
"files_checked": ["file1.py"],
"relevant_files": [],
"relevant_methods": [],
"hypothesis": None,
"confidence": "low",
},
{
"step": "Wrong direction",
"step_number": 2,
"findings": "Wrong path",
"files_checked": ["file2.py"],
"relevant_files": [],
"relevant_methods": [],
"hypothesis": None,
"confidence": "low",
},
]
tool.consolidated_findings = {
"files_checked": {"file1.py", "file2.py"},
"relevant_files": set(),
"relevant_methods": set(),
"findings": ["Step 1: Initial findings", "Step 2: Wrong path"],
"hypotheses": [],
"images": [],
}
arguments = {
"step": "Backtracking to revise approach",
"step_number": 3,
"total_steps": 5,
"next_step_required": True,
"findings": "Taking a different investigation approach",
"files_checked": ["file3.py"],
"backtrack_from_step": 2, # Backtrack from step 2
"continuation_id": "debug-uuid-123",
}
# Mock conversation memory functions
with patch("utils.conversation_memory.add_turn"):
result = await tool.execute(arguments)
# Should return a list with TextContent
# Debug tool now returns "pause_for_investigation" for ongoing steps
assert len(result) == 1
response_text = result[0].text
# Parse the JSON response
import json
parsed_response = json.loads(response_text)
assert parsed_response["status"] == "pause_for_investigation"
# After backtracking from step 2, history should have step 1 plus the new step
assert len(tool.investigation_history) == 2 # Step 1 + new step 3
assert tool.investigation_history[0]["step_number"] == 1
assert tool.investigation_history[1]["step_number"] == 3 # The new step that triggered backtrack
@pytest.mark.asyncio
async def test_execute_adjusts_total_steps(self):
"""Test execute method adjusts total steps when current step exceeds estimate."""
tool = DebugIssueTool()
arguments = {
"step": "Additional investigation needed",
"step_number": 8,
"total_steps": 5, # Current step exceeds total
"next_step_required": True,
"findings": "More complexity discovered",
"continuation_id": "debug-uuid-123",
}
# Mock conversation memory functions
with patch("utils.conversation_memory.add_turn"):
result = await tool.execute(arguments)
# Should return a list with TextContent
assert len(result) == 1
response_text = result[0].text
# Parse the JSON response
import json
parsed_response = json.loads(response_text)
# Total steps should be adjusted to match current step
assert parsed_response["total_steps"] == 8
assert parsed_response["step_number"] == 8
@pytest.mark.asyncio
async def test_execute_error_handling(self):
"""Test execute method error handling."""
tool = DebugIssueTool()
# Invalid arguments - missing required fields
arguments = {
"step": "Invalid request"
# Missing required fields
}
result = await tool.execute(arguments)
# Should return error response
assert len(result) == 1
response_text = result[0].text
# Parse the JSON response
import json
parsed_response = json.loads(response_text)
assert parsed_response["status"] == "investigation_failed"
assert "error" in parsed_response
@pytest.mark.asyncio
async def test_execute_with_string_instead_of_list_fields(self):
"""Test execute method handles string inputs for list fields gracefully."""
tool = DebugIssueTool()
arguments = {
"step": "Investigating issue with string inputs",
"step_number": 1,
"total_steps": 3,
"next_step_required": True,
"findings": "Testing string input handling",
# These should be lists but passing strings to test the fix
"files_checked": "relevant_files", # String instead of list
"relevant_files": "some_string", # String instead of list
"relevant_methods": "another_string", # String instead of list
}
# Mock conversation memory functions
with patch("utils.conversation_memory.create_thread", return_value="debug-string-test"):
with patch("utils.conversation_memory.add_turn"):
# Should handle gracefully without crashing
result = await tool.execute(arguments)
# Should return a valid response
assert len(result) == 1
assert result[0].type == "text"
# Parse the JSON response
import json
parsed_response = json.loads(result[0].text)
# Should complete successfully with empty lists
assert parsed_response["status"] == "pause_for_investigation"
assert parsed_response["step_number"] == 1
assert parsed_response["investigation_status"]["files_checked"] == 0 # Empty due to string conversion
assert parsed_response["investigation_status"]["relevant_files"] == 0
assert parsed_response["investigation_status"]["relevant_methods"] == 0
# Verify internal state - should have empty sets, not individual characters
assert tool.consolidated_findings["files_checked"] == set()
assert tool.consolidated_findings["relevant_files"] == set()
assert tool.consolidated_findings["relevant_methods"] == set()
# Should NOT have individual characters like {'r', 'e', 'l', 'e', 'v', 'a', 'n', 't', '_', 'f', 'i', 'l', 'e', 's'}
def test_prepare_investigation_summary(self):
"""Test investigation summary preparation."""
tool = DebugIssueTool()
tool.consolidated_findings = {
"files_checked": {"file1.py", "file2.py", "file3.py"},
"relevant_files": {"file1.py", "file2.py"},
"relevant_methods": {"Class1.method1", "Class2.method2"},
"findings": [
"Step 1: Initial investigation findings",
"Step 2: Discovered potential issue",
"Step 3: Confirmed root cause",
],
"hypotheses": [
{"step": 1, "hypothesis": "Initial hypothesis", "confidence": "low"},
{"step": 2, "hypothesis": "Refined hypothesis", "confidence": "medium"},
{"step": 3, "hypothesis": "Final hypothesis", "confidence": "high"},
],
"images": [],
}
summary = tool._prepare_investigation_summary()
assert "SYSTEMATIC INVESTIGATION SUMMARY" in summary
assert "Files examined: 3" in summary
assert "Relevant files identified: 2" in summary
assert "Methods/functions involved: 2" in summary
assert "INVESTIGATION PROGRESSION" in summary
assert "Step 1:" in summary
assert "Step 2:" in summary
assert "Step 3:" in summary
assert "HYPOTHESIS EVOLUTION" in summary
assert "low confidence" in summary
assert "medium confidence" in summary
assert "high confidence" in summary
def test_extract_error_context(self):
"""Test error context extraction from findings."""
tool = DebugIssueTool()
tool.consolidated_findings = {
"findings": [
"Step 1: Found no issues initially",
"Step 2: Discovered ERROR: Dictionary size changed during iteration",
"Step 3: Stack trace shows RuntimeError in cleanup method",
"Step 4: Exception occurs intermittently",
],
}
error_context = tool._extract_error_context()
assert error_context is not None
assert "ERROR: Dictionary size changed" in error_context
assert "Stack trace shows RuntimeError" in error_context
assert "Exception occurs intermittently" in error_context
assert "Found no issues initially" not in error_context # Should not include non-error findings
def test_reprocess_consolidated_findings(self):
"""Test reprocessing of consolidated findings after backtracking."""
tool = DebugIssueTool()
tool.investigation_history = [
{
"step_number": 1,
"findings": "Initial findings",
"files_checked": ["file1.py"],
"relevant_files": ["file1.py"],
"relevant_methods": ["method1"],
"hypothesis": "Initial hypothesis",
"confidence": "low",
},
{
"step_number": 2,
"findings": "Second findings",
"files_checked": ["file2.py"],
"relevant_files": [],
"relevant_methods": ["method2"],
},
]
tool._reprocess_consolidated_findings()
assert tool.consolidated_findings["files_checked"] == {"file1.py", "file2.py"}
assert tool.consolidated_findings["relevant_files"] == {"file1.py"}
assert tool.consolidated_findings["relevant_methods"] == {"method1", "method2"}
assert len(tool.consolidated_findings["findings"]) == 2
assert len(tool.consolidated_findings["hypotheses"]) == 1
assert tool.consolidated_findings["hypotheses"][0]["hypothesis"] == "Initial hypothesis"
# Integration test
class TestDebugToolIntegration:
"""Integration tests for debug tool."""
def setup_method(self):
"""Set up model context for integration tests."""
from utils.model_context import ModelContext
self.tool = DebugIssueTool()
self.tool._model_context = ModelContext("flash") # Test model
@pytest.mark.asyncio
async def test_complete_investigation_flow(self):
"""Test complete investigation flow from start to expert analysis."""
# Step 1: Initial investigation
arguments = {
"step": "Investigating memory leak in data processing pipeline",
"step_number": 1,
"total_steps": 3,
"next_step_required": True,
"findings": "High memory usage observed during batch processing",
"files_checked": ["/processor/main.py"],
}
# Mock conversation memory and expert analysis
with patch("utils.conversation_memory.create_thread", return_value="debug-flow-uuid"):
with patch("utils.conversation_memory.add_turn"):
result = await self.tool.execute(arguments)
# Verify response structure
# Debug tool now returns "pause_for_investigation" for ongoing steps
assert len(result) == 1
response_text = result[0].text
# Parse the JSON response
import json
parsed_response = json.loads(response_text)
assert parsed_response["status"] == "pause_for_investigation"
assert parsed_response["step_number"] == 1
assert parsed_response["continuation_id"] == "debug-flow-uuid"
@pytest.mark.asyncio
async def test_model_context_initialization_in_expert_analysis(self):
"""Real integration test that model context is properly initialized when expert analysis is called."""
tool = DebugIssueTool()
# Do NOT manually set up model context - let the method do it itself
# Set up investigation state for final step
tool.initial_issue = "Memory leak investigation"
tool.investigation_history = [
{
"step_number": 1,
"step": "Initial investigation",
"findings": "Found memory issues",
"files_checked": [],
}
]
tool.consolidated_findings = {
"files_checked": set(),
"relevant_files": set(), # No files to avoid file I/O in this test
"relevant_methods": {"process_data"},
"findings": ["Step 1: Found memory issues"],
"hypotheses": [],
"images": [],
}
# Test the _call_expert_analysis method directly to verify ModelContext is properly handled
# This is the real test - we're testing that the method can be called without the ModelContext error
try:
# Only mock the API call itself, not the model resolution infrastructure
from unittest.mock import MagicMock
mock_provider = MagicMock()
mock_response = MagicMock()
mock_response.content = '{"status": "analysis_complete", "summary": "Test completed"}'
mock_provider.generate_content.return_value = mock_response
# Use the real get_model_provider method but override its result to avoid API calls
original_get_provider = tool.get_model_provider
tool.get_model_provider = lambda model_name: mock_provider
try:
# Create mock arguments and request for model resolution
from tools.debug import DebugInvestigationRequest
mock_arguments = {"model": None} # No model specified, should fall back to DEFAULT_MODEL
mock_request = DebugInvestigationRequest(
step="Test step", step_number=1, total_steps=1, next_step_required=False, findings="Test findings"
)
# This should NOT raise a ModelContext error - the method should set up context itself
result = await tool._call_expert_analysis(
initial_issue="Test issue",
investigation_summary="Test summary",
relevant_files=[], # Empty to avoid file operations
relevant_methods=["test_method"],
final_hypothesis="Test hypothesis",
error_context=None,
images=[],
model_info=None, # No pre-resolved model info
arguments=mock_arguments, # Provide arguments for model resolution
request=mock_request, # Provide request for model resolution
)
# Should complete without ModelContext error
assert "error" not in result
assert result["status"] == "analysis_complete"
# Verify the model context was actually set up
assert hasattr(tool, "_model_context")
assert hasattr(tool, "_current_model_name")
# Should use DEFAULT_MODEL when no model specified
from config import DEFAULT_MODEL
assert tool._current_model_name == DEFAULT_MODEL
finally:
# Restore original method
tool.get_model_provider = original_get_provider
except RuntimeError as e:
if "ModelContext not initialized" in str(e):
pytest.fail("ModelContext error still occurs - the fix is not working properly")
else:
raise # Re-raise other RuntimeErrors
step_data = tool.prepare_step_data(request)
assert step_data["relevant_context"] == ["method1", "method2"]