🚀 Major Enhancement: Workflow-Based Tool Architecture v5.5.0 (#95)

* WIP: new workflow architecture * WIP: further improvements and cleanup * WIP: cleanup and docs, replace old tool with new * WIP: cleanup and docs, replace old tool with new * WIP: new planner implementation using workflow * WIP: precommit tool working as a workflow instead of a basic tool Support for passing False to use_assistant_model to skip external models completely and use Claude only * WIP: precommit workflow version swapped with old * WIP: codereview * WIP: replaced codereview * WIP: replaced codereview * WIP: replaced refactor * WIP: workflow for thinkdeep * WIP: ensure files get embedded correctly * WIP: thinkdeep replaced with workflow version * WIP: improved messaging when an external model's response is received * WIP: analyze tool swapped * WIP: updated tests * Extract only the content when building history * Use "relevant_files" for workflow tools only * WIP: updated tests * Extract only the content when building history * Use "relevant_files" for workflow tools only * WIP: fixed get_completion_next_steps_message missing param * Fixed tests Request for files consistently * Fixed tests Request for files consistently * Fixed tests * New testgen workflow tool Updated docs * Swap testgen workflow * Fix CI test failures by excluding API-dependent tests - Update GitHub Actions workflow to exclude simulation tests that require API keys - Fix collaboration tests to properly mock workflow tool expert analysis calls - Update test assertions to handle new workflow tool response format - Ensure unit tests run without external API dependencies in CI 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com> * WIP - Update tests to match new tools * WIP - Update tests to match new tools --------- Co-authored-by: Claude <noreply@anthropic.com>
2025-06-21 00:08:11 +04:00
parent 4dae6e457e
commit 69a3121452
76 changed files with 17111 additions and 7725 deletions
--- a/simulator_tests/__init__.py
+++ b/simulator_tests/__init__.py
@@ -6,7 +6,9 @@ Each test is in its own file for better organization and maintainability.
 """

 from .base_test import BaseSimulatorTest
+from .test_analyze_validation import AnalyzeValidationTest
 from .test_basic_conversation import BasicConversationTest
+from .test_codereview_validation import CodeReviewValidationTest
 from .test_consensus_conversation import TestConsensusConversation
 from .test_consensus_stance import TestConsensusStance
 from .test_consensus_three_models import TestConsensusThreeModels
@@ -27,10 +29,12 @@ from .test_openrouter_models import OpenRouterModelsTest
 from .test_per_tool_deduplication import PerToolDeduplicationTest
 from .test_planner_continuation_history import PlannerContinuationHistoryTest
 from .test_planner_validation import PlannerValidationTest
+from .test_precommitworkflow_validation import PrecommitWorkflowValidationTest

 # Redis validation test removed - no longer needed for standalone server
 from .test_refactor_validation import RefactorValidationTest
 from .test_testgen_validation import TestGenValidationTest
+from .test_thinkdeep_validation import ThinkDeepWorkflowValidationTest
 from .test_token_allocation_validation import TokenAllocationValidationTest
 from .test_vision_capability import VisionCapabilityTest
 from .test_xai_models import XAIModelsTest
@@ -38,6 +42,7 @@ from .test_xai_models import XAIModelsTest
 # Test registry for dynamic loading
 TEST_REGISTRY = {
    "basic_conversation": BasicConversationTest,
+    "codereview_validation": CodeReviewValidationTest,
    "content_validation": ContentValidationTest,
    "per_tool_deduplication": PerToolDeduplicationTest,
    "cross_tool_continuation": CrossToolContinuationTest,
@@ -52,8 +57,10 @@ TEST_REGISTRY = {
    "openrouter_models": OpenRouterModelsTest,
    "planner_validation": PlannerValidationTest,
    "planner_continuation_history": PlannerContinuationHistoryTest,
+    "precommit_validation": PrecommitWorkflowValidationTest,
    "token_allocation_validation": TokenAllocationValidationTest,
    "testgen_validation": TestGenValidationTest,
+    "thinkdeep_validation": ThinkDeepWorkflowValidationTest,
    "refactor_validation": RefactorValidationTest,
    "debug_validation": DebugValidationTest,
    "debug_certain_confidence": DebugCertainConfidenceTest,
@@ -63,19 +70,20 @@ TEST_REGISTRY = {
    "consensus_conversation": TestConsensusConversation,
    "consensus_stance": TestConsensusStance,
    "consensus_three_models": TestConsensusThreeModels,
+    "analyze_validation": AnalyzeValidationTest,
    # "o3_pro_expensive": O3ProExpensiveTest,  # COMMENTED OUT - too expensive to run by default
 }

 __all__ = [
    "BaseSimulatorTest",
    "BasicConversationTest",
+    "CodeReviewValidationTest",
    "ContentValidationTest",
    "PerToolDeduplicationTest",
    "CrossToolContinuationTest",
    "CrossToolComprehensiveTest",
    "LineNumberValidationTest",
    "LogsValidationTest",
-    # "RedisValidationTest",  # Removed - no longer needed for standalone server
    "TestModelThinkingConfig",
    "O3ModelSelectionTest",
    "O3ProExpensiveTest",
@@ -84,8 +92,10 @@ __all__ = [
    "OpenRouterModelsTest",
    "PlannerValidationTest",
    "PlannerContinuationHistoryTest",
+    "PrecommitWorkflowValidationTest",
    "TokenAllocationValidationTest",
    "TestGenValidationTest",
+    "ThinkDeepWorkflowValidationTest",
    "RefactorValidationTest",
    "DebugValidationTest",
    "DebugCertainConfidenceTest",
@@ -95,5 +105,6 @@ __all__ = [
    "TestConsensusConversation",
    "TestConsensusStance",
    "TestConsensusThreeModels",
+    "AnalyzeValidationTest",
    "TEST_REGISTRY",
 ]