🚀 Major Enhancement: Workflow-Based Tool Architecture v5.5.0 (#95)

* WIP: new workflow architecture

* WIP: further improvements and cleanup

* WIP: cleanup and docs, replace old tool with new

* WIP: cleanup and docs, replace old tool with new

* WIP: new planner implementation using workflow

* WIP: precommit tool working as a workflow instead of a basic tool
Supports passing use_assistant_model=False to skip external models entirely and use Claude only
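For example, a step-1 call that opts out of the external model could pass arguments along these lines (only `use_assistant_model` is the documented flag; the other fields follow the shared workflow request shape and the values are illustrative):

```python
# Illustrative arguments for a precommit workflow step that skips the external model.
precommit_args = {
    "step": "Review the staged changes for regressions before committing",
    "step_number": 1,
    "total_steps": 2,
    "next_step_required": True,
    "findings": "Initial survey of the staged diff",
    "use_assistant_model": False,  # skip external models entirely; Claude-only run
}
```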

* WIP: precommit workflow version swapped with old

* WIP: codereview

* WIP: replaced codereview

* WIP: replaced codereview

* WIP: replaced refactor

* WIP: workflow for thinkdeep

* WIP: ensure files get embedded correctly

* WIP: thinkdeep replaced with workflow version

* WIP: improved messaging when an external model's response is received

* WIP: analyze tool swapped

* WIP: updated tests
* Extract only the content when building history
* Use "relevant_files" for workflow tools only

* WIP: updated tests
* Extract only the content when building history
* Use "relevant_files" for workflow tools only

* WIP: fixed get_completion_next_steps_message missing param

* Fixed tests
Request files consistently

* Fixed tests
Request files consistently

* Fixed tests

* New testgen workflow tool
Updated docs

* Swap testgen workflow

* Fix CI test failures by excluding API-dependent tests

- Update GitHub Actions workflow to exclude simulation tests that require API keys
- Fix collaboration tests to properly mock workflow tool expert analysis calls
- Update test assertions to handle new workflow tool response format
- Ensure unit tests run without external API dependencies in CI
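For instance, a collaboration test can stub the expert-analysis hook that workflow tools expose, so the suite runs without any API key (a sketch; the import path is assumed):

```python
from unittest.mock import patch

from tools.analyze import AnalyzeTool  # assumed import path


def test_analyze_workflow_runs_without_external_api():
    # Force the workflow to skip expert analysis so no external model call is made in CI.
    with patch.object(AnalyzeTool, "should_call_expert_analysis", return_value=False):
        tool = AnalyzeTool()
        assert tool.get_name() == "analyze"
```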

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

* WIP - Update tests to match new tools

* WIP - Update tests to match new tools

---------

Co-authored-by: Claude <noreply@anthropic.com>
Author: Beehive Innovations
Date: 2025-06-21 00:08:11 +04:00
Committed by: GitHub
Parent: 4dae6e457e
Commit: 69a3121452
76 changed files with 17111 additions and 7725 deletions

View File

@@ -9,9 +9,9 @@ from .consensus import ConsensusTool
from .debug import DebugIssueTool
from .listmodels import ListModelsTool
from .planner import PlannerTool
from .precommit import Precommit
from .precommit import PrecommitTool
from .refactor import RefactorTool
from .testgen import TestGenerationTool
from .testgen import TestGenTool
from .thinkdeep import ThinkDeepTool
from .tracer import TracerTool
@@ -24,8 +24,8 @@ __all__ = [
"ConsensusTool",
"ListModelsTool",
"PlannerTool",
"Precommit",
"PrecommitTool",
"RefactorTool",
"TestGenerationTool",
"TestGenTool",
"TracerTool",
]
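Callers that import the renamed classes change accordingly, for example:

```python
# Before this change:
# from tools import Precommit, TestGenerationTool
# After:
from tools import PrecommitTool, TestGenTool
```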

View File

@@ -1,116 +1,198 @@
"""
Analyze tool - General-purpose code and file analysis
AnalyzeWorkflow tool - Step-by-step code analysis with systematic investigation
This tool provides a structured workflow for comprehensive code and file analysis.
It guides Claude through systematic investigation steps with forced pauses between each step
to ensure thorough code examination, pattern identification, and architectural assessment before proceeding.
The tool supports complex analysis scenarios including architectural review, performance analysis,
security assessment, and maintainability evaluation.
Key features:
- Step-by-step analysis workflow with progress tracking
- Context-aware file embedding (references during investigation, full content for analysis)
- Automatic pattern and insight tracking with categorization
- Expert analysis integration with external models
- Support for focused analysis (architecture, performance, security, quality)
- Confidence-based workflow optimization
"""
from typing import TYPE_CHECKING, Any, Optional
import logging
from typing import TYPE_CHECKING, Any, Literal, Optional
from pydantic import Field
from pydantic import Field, model_validator
if TYPE_CHECKING:
from tools.models import ToolModelCategory
from config import TEMPERATURE_ANALYTICAL
from systemprompts import ANALYZE_PROMPT
from tools.shared.base_models import WorkflowRequest
from .base import BaseTool, ToolRequest
from .workflow.base import WorkflowTool
# Field descriptions to avoid duplication between Pydantic and JSON schema
ANALYZE_FIELD_DESCRIPTIONS = {
"files": "Files or directories to analyze (must be FULL absolute paths to real files / folders - DO NOT SHORTEN)",
"prompt": "What to analyze or look for",
"analysis_type": "Type of analysis to perform",
"output_format": "How to format the output",
logger = logging.getLogger(__name__)
# Tool-specific field descriptions for analyze workflow
ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS = {
"step": (
"What to analyze or look for in this step. In step 1, describe what you want to analyze and begin forming "
"an analytical approach after thinking carefully about what needs to be examined. Consider code quality, "
"performance implications, architectural patterns, and design decisions. Map out the codebase structure, "
"understand the business logic, and identify areas requiring deeper analysis. In later steps, continue "
"exploring with precision and adapt your understanding as you uncover more insights."
),
"step_number": (
"The index of the current step in the analysis sequence, beginning at 1. Each step should build upon or "
"revise the previous one."
),
"total_steps": (
"Your current estimate for how many steps will be needed to complete the analysis. "
"Adjust as new findings emerge."
),
"next_step_required": (
"Set to true if you plan to continue the investigation with another step. False means you believe the "
"analysis is complete and ready for expert validation."
),
"findings": (
"Summarize everything discovered in this step about the code being analyzed. Include analysis of architectural "
"patterns, design decisions, tech stack assessment, scalability characteristics, performance implications, "
"maintainability factors, security posture, and strategic improvement opportunities. Be specific and avoid "
"vague language—document what you now know about the codebase and how it affects your assessment. "
"IMPORTANT: Document both strengths (good patterns, solid architecture, well-designed components) and "
"concerns (tech debt, scalability risks, overengineering, unnecessary complexity). In later steps, confirm "
"or update past findings with additional evidence."
),
"files_checked": (
"List all files (as absolute paths, do not clip or shrink file names) examined during the analysis "
"investigation so far. Include even files ruled out or found to be unrelated, as this tracks your "
"exploration path."
),
"relevant_files": (
"Subset of files_checked (as full absolute paths) that contain code directly relevant to the analysis or "
"contain significant patterns, architectural decisions, or examples worth highlighting. Only list those that are "
"directly tied to important findings, architectural insights, performance characteristics, or strategic "
"improvement opportunities. This could include core implementation files, configuration files, or files "
"demonstrating key patterns."
),
"relevant_context": (
"List methods, functions, classes, or modules that are central to the analysis findings, in the format "
"'ClassName.methodName', 'functionName', or 'module.ClassName'. Prioritize those that demonstrate important "
"patterns, represent key architectural decisions, show performance characteristics, or highlight strategic "
"improvement opportunities."
),
"backtrack_from_step": (
"If an earlier finding or assessment needs to be revised or discarded, specify the step number from which to "
"start over. Use this to acknowledge investigative dead ends and correct the course."
),
"images": (
"Optional list of absolute paths to architecture diagrams, design documents, or visual references "
"that help with analysis context. Only include if they materially assist understanding or assessment."
),
"confidence": (
"Your confidence level in the current analysis findings: exploring (early investigation), "
"low (some insights but more needed), medium (solid understanding), high (comprehensive insights), "
"certain (complete analysis ready for expert validation)"
),
"analysis_type": "Type of analysis to perform (architecture, performance, security, quality, general)",
"output_format": "How to format the output (summary, detailed, actionable)",
}
class AnalyzeRequest(ToolRequest):
"""Request model for analyze tool"""
class AnalyzeWorkflowRequest(WorkflowRequest):
"""Request model for analyze workflow investigation steps"""
files: list[str] = Field(..., description=ANALYZE_FIELD_DESCRIPTIONS["files"])
prompt: str = Field(..., description=ANALYZE_FIELD_DESCRIPTIONS["prompt"])
analysis_type: Optional[str] = Field(None, description=ANALYZE_FIELD_DESCRIPTIONS["analysis_type"])
output_format: Optional[str] = Field("detailed", description=ANALYZE_FIELD_DESCRIPTIONS["output_format"])
# Required fields for each investigation step
step: str = Field(..., description=ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["step"])
step_number: int = Field(..., description=ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["step_number"])
total_steps: int = Field(..., description=ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["total_steps"])
next_step_required: bool = Field(..., description=ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["next_step_required"])
# Investigation tracking fields
findings: str = Field(..., description=ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["findings"])
files_checked: list[str] = Field(
default_factory=list, description=ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["files_checked"]
)
relevant_files: list[str] = Field(
default_factory=list, description=ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["relevant_files"]
)
relevant_context: list[str] = Field(
default_factory=list, description=ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["relevant_context"]
)
# Issues found during analysis (structured with severity)
issues_found: list[dict] = Field(
default_factory=list,
description="Issues or concerns identified during analysis, each with severity level (critical, high, medium, low)",
)
# Optional backtracking field
backtrack_from_step: Optional[int] = Field(
None, description=ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["backtrack_from_step"]
)
# Optional images for visual context
images: Optional[list[str]] = Field(default=None, description=ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["images"])
# Analyze-specific fields (only used in step 1 to initialize)
# Note: Use relevant_files field instead of files for consistency across workflow tools
analysis_type: Optional[Literal["architecture", "performance", "security", "quality", "general"]] = Field(
"general", description=ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["analysis_type"]
)
output_format: Optional[Literal["summary", "detailed", "actionable"]] = Field(
"detailed", description=ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["output_format"]
)
# Keep thinking_mode and use_websearch from original analyze tool
# temperature is inherited from WorkflowRequest
@model_validator(mode="after")
def validate_step_one_requirements(self):
"""Ensure step 1 has required relevant_files."""
if self.step_number == 1:
if not self.relevant_files:
raise ValueError("Step 1 requires 'relevant_files' field to specify files or directories to analyze")
return self
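For illustration, the validator above rejects a step-1 payload that omits the file list (a sketch relying on pydantic's standard ValidationError wrapping):

```python
from pydantic import ValidationError

try:
    AnalyzeWorkflowRequest(  # the model defined above
        step="Begin the analysis",
        step_number=1,
        total_steps=3,
        next_step_required=True,
        findings="Initial plan",
        # relevant_files omitted, so step 1 is rejected
    )
except ValidationError as exc:
    print(exc)  # includes: Step 1 requires 'relevant_files' field ...
```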
class AnalyzeTool(BaseTool):
"""General-purpose file and code analysis tool"""
class AnalyzeTool(WorkflowTool):
"""
Analyze workflow tool for step-by-step code analysis and expert validation.
This tool implements a structured analysis workflow that guides users through
methodical investigation steps, ensuring thorough code examination, pattern identification,
and architectural assessment before reaching conclusions. It supports complex analysis scenarios
including architectural review, performance analysis, security assessment, and maintainability evaluation.
"""
def __init__(self):
super().__init__()
self.initial_request = None
self.analysis_config = {}
def get_name(self) -> str:
return "analyze"
def get_description(self) -> str:
return (
"ANALYZE FILES & CODE - General-purpose analysis for understanding code. "
"Supports both individual files and entire directories. "
"Use this when you need to analyze files, examine code, or understand specific aspects of a codebase. "
"Perfect for: codebase exploration, dependency analysis, pattern detection. "
"Always uses file paths for clean terminal output. "
"Note: If you're not currently using a top-tier model such as Opus 4 or above, these tools can provide enhanced capabilities."
"COMPREHENSIVE ANALYSIS WORKFLOW - Step-by-step code analysis with expert validation. "
"This tool guides you through a systematic investigation process where you:\\n\\n"
"1. Start with step 1: describe your analysis investigation plan\\n"
"2. STOP and investigate code structure, patterns, and architectural decisions\\n"
"3. Report findings in step 2 with concrete evidence from actual code analysis\\n"
"4. Continue investigating between each step\\n"
"5. Track findings, relevant files, and insights throughout\\n"
"6. Update assessments as understanding evolves\\n"
"7. Once investigation is complete, always receive expert validation\\n\\n"
"IMPORTANT: This tool enforces investigation between steps:\\n"
"- After each call, you MUST investigate before calling again\\n"
"- Each step must include NEW evidence from code examination\\n"
"- No recursive calls without actual investigation work\\n"
"- The tool will specify which step number to use next\\n"
"- Follow the required_actions list for investigation guidance\\n\\n"
"Perfect for: comprehensive code analysis, architectural assessment, performance evaluation, "
"security analysis, maintainability review, pattern detection, strategic planning."
)
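Concretely, a two-step run of this workflow might pass arguments like the following (field names follow AnalyzeWorkflowRequest; all values are illustrative):

```python
# Step 1: declare the plan; relevant_files is required at this step.
step_one = {
    "step": "Plan: assess the architecture of the payments service",
    "step_number": 1,
    "total_steps": 2,
    "next_step_required": True,
    "findings": "Starting the investigation",
    "relevant_files": ["/repo/payments/service.py"],
    "analysis_type": "architecture",
}

# ...investigate the code between calls, then report concrete evidence...

# Final step: next_step_required=False hands off to expert validation.
step_two = {
    "step": "Report architectural findings and remaining risks",
    "step_number": 2,
    "total_steps": 2,
    "next_step_required": False,
    "findings": "Layered design overall, but persistence is tightly coupled to the API layer",
    "relevant_files": ["/repo/payments/service.py"],
}
```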
def get_input_schema(self) -> dict[str, Any]:
schema = {
"type": "object",
"properties": {
"files": {
"type": "array",
"items": {"type": "string"},
"description": ANALYZE_FIELD_DESCRIPTIONS["files"],
},
"model": self.get_model_field_schema(),
"prompt": {
"type": "string",
"description": ANALYZE_FIELD_DESCRIPTIONS["prompt"],
},
"analysis_type": {
"type": "string",
"enum": [
"architecture",
"performance",
"security",
"quality",
"general",
],
"description": ANALYZE_FIELD_DESCRIPTIONS["analysis_type"],
},
"output_format": {
"type": "string",
"enum": ["summary", "detailed", "actionable"],
"default": "detailed",
"description": ANALYZE_FIELD_DESCRIPTIONS["output_format"],
},
"temperature": {
"type": "number",
"description": "Temperature (0-1, default 0.2)",
"minimum": 0,
"maximum": 1,
},
"thinking_mode": {
"type": "string",
"enum": ["minimal", "low", "medium", "high", "max"],
"description": "Thinking depth: minimal (0.5% of model max), low (8%), medium (33%), high (67%), max (100% of model max)",
},
"use_websearch": {
"type": "boolean",
"description": (
"Enable web search for documentation, best practices, and current information. "
"Particularly useful for: brainstorming sessions, architectural design discussions, "
"exploring industry best practices, working with specific frameworks/technologies, "
"researching solutions to complex problems, or when current documentation and "
"community insights would enhance the analysis."
),
"default": True,
},
"continuation_id": {
"type": "string",
"description": "Thread continuation ID for multi-turn conversations. Can be used to continue conversations across different tools. Only provide this if continuing a previous conversation thread.",
},
},
"required": ["files", "prompt"] + (["model"] if self.is_effective_auto_mode() else []),
}
return schema
def get_system_prompt(self) -> str:
return ANALYZE_PROMPT
@@ -118,88 +200,425 @@ class AnalyzeTool(BaseTool):
return TEMPERATURE_ANALYTICAL
def get_model_category(self) -> "ToolModelCategory":
"""Analyze requires deep understanding and reasoning"""
"""Analyze workflow requires thorough analysis and reasoning"""
from tools.models import ToolModelCategory
return ToolModelCategory.EXTENDED_REASONING
def get_request_model(self):
return AnalyzeRequest
def get_workflow_request_model(self):
"""Return the analyze workflow-specific request model."""
return AnalyzeWorkflowRequest
async def prepare_prompt(self, request: AnalyzeRequest) -> str:
"""Prepare the analysis prompt"""
# Check for prompt.txt in files
prompt_content, updated_files = self.handle_prompt_file(request.files)
def get_input_schema(self) -> dict[str, Any]:
"""Generate input schema using WorkflowSchemaBuilder with analyze-specific overrides."""
from .workflow.schema_builders import WorkflowSchemaBuilder
# If prompt.txt was found, use it as the prompt
if prompt_content:
request.prompt = prompt_content
# Fields to exclude from analyze workflow (inherited from WorkflowRequest but not used)
excluded_fields = {"hypothesis", "confidence"}
# Check user input size at MCP transport boundary (before adding internal content)
size_check = self.check_prompt_size(request.prompt)
if size_check:
from tools.models import ToolOutput
# Analyze workflow-specific field overrides
analyze_field_overrides = {
"step": {
"type": "string",
"description": ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["step"],
},
"step_number": {
"type": "integer",
"minimum": 1,
"description": ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["step_number"],
},
"total_steps": {
"type": "integer",
"minimum": 1,
"description": ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["total_steps"],
},
"next_step_required": {
"type": "boolean",
"description": ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["next_step_required"],
},
"findings": {
"type": "string",
"description": ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["findings"],
},
"files_checked": {
"type": "array",
"items": {"type": "string"},
"description": ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["files_checked"],
},
"relevant_files": {
"type": "array",
"items": {"type": "string"},
"description": ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["relevant_files"],
},
"confidence": {
"type": "string",
"enum": ["exploring", "low", "medium", "high", "certain"],
"description": ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["confidence"],
},
"backtrack_from_step": {
"type": "integer",
"minimum": 1,
"description": ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["backtrack_from_step"],
},
"images": {
"type": "array",
"items": {"type": "string"},
"description": ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["images"],
},
"issues_found": {
"type": "array",
"items": {"type": "object"},
"description": "Issues or concerns identified during analysis, each with severity level (critical, high, medium, low)",
},
"analysis_type": {
"type": "string",
"enum": ["architecture", "performance", "security", "quality", "general"],
"default": "general",
"description": ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["analysis_type"],
},
"output_format": {
"type": "string",
"enum": ["summary", "detailed", "actionable"],
"default": "detailed",
"description": ANALYZE_WORKFLOW_FIELD_DESCRIPTIONS["output_format"],
},
}
raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**size_check).model_dump_json()}")
# Update request files list
if updated_files is not None:
request.files = updated_files
# File size validation happens at MCP boundary in server.py
# Use centralized file processing logic
continuation_id = getattr(request, "continuation_id", None)
file_content, processed_files = self._prepare_file_content_for_prompt(request.files, continuation_id, "Files")
self._actually_processed_files = processed_files
# Build analysis instructions
analysis_focus = []
if request.analysis_type:
type_focus = {
"architecture": "Focus on architectural patterns, structure, and design decisions",
"performance": "Focus on performance characteristics and optimization opportunities",
"security": "Focus on security implications and potential vulnerabilities",
"quality": "Focus on code quality, maintainability, and best practices",
"general": "Provide a comprehensive general analysis",
}
analysis_focus.append(type_focus.get(request.analysis_type, ""))
if request.output_format == "summary":
analysis_focus.append("Provide a concise summary of key findings")
elif request.output_format == "actionable":
analysis_focus.append("Focus on actionable insights and specific recommendations")
focus_instruction = "\n".join(analysis_focus) if analysis_focus else ""
# Add web search instruction if enabled
websearch_instruction = self.get_websearch_instruction(
request.use_websearch,
"""When analyzing code, consider if searches for these would help:
- Documentation for technologies or frameworks found in the code
- Best practices and design patterns relevant to the analysis
- API references and usage examples
- Known issues or solutions for patterns you identify""",
# Use WorkflowSchemaBuilder with analyze-specific tool fields
return WorkflowSchemaBuilder.build_schema(
tool_specific_fields=analyze_field_overrides,
model_field_schema=self.get_model_field_schema(),
auto_mode=self.is_effective_auto_mode(),
tool_name=self.get_name(),
excluded_workflow_fields=list(excluded_fields),
)
# Combine everything
full_prompt = f"""{self.get_system_prompt()}
def get_required_actions(self, step_number: int, confidence: str, findings: str, total_steps: int) -> list[str]:
"""Define required actions for each investigation phase."""
if step_number == 1:
# Initial analysis investigation tasks
return [
"Read and understand the code files specified for analysis",
"Map the tech stack, frameworks, and overall architecture",
"Identify the main components, modules, and their relationships",
"Understand the business logic and intended functionality",
"Examine architectural patterns and design decisions used",
"Look for strengths, risks, and strategic improvement areas",
]
elif step_number < total_steps:
# Need deeper investigation
return [
"Examine specific architectural patterns and design decisions in detail",
"Analyze scalability characteristics and performance implications",
"Assess maintainability factors: module cohesion, coupling, tech debt",
"Identify security posture and potential systemic vulnerabilities",
"Look for overengineering, unnecessary complexity, or missing abstractions",
"Evaluate how well the architecture serves business and scaling goals",
]
else:
# Close to completion - need final verification
return [
"Verify all significant architectural insights have been documented",
"Confirm strategic improvement opportunities are comprehensively captured",
"Ensure both strengths and risks are properly identified with evidence",
"Validate that findings align with the analysis type and goals specified",
"Check that recommendations are actionable and proportional to the codebase",
"Confirm the analysis provides clear guidance for strategic decisions",
]
{focus_instruction}{websearch_instruction}
def should_call_expert_analysis(self, consolidated_findings, request=None) -> bool:
"""
Always call expert analysis for comprehensive validation.
=== USER QUESTION ===
{request.prompt}
=== END QUESTION ===
Analysis benefits from a second opinion to ensure completeness.
"""
# Check if user explicitly requested to skip assistant model
if request and not self.get_request_use_assistant_model(request):
return False
=== FILES TO ANALYZE ===
{file_content}
=== END FILES ===
# For analysis, we always want expert validation if we have any meaningful data
return len(consolidated_findings.relevant_files) > 0 or len(consolidated_findings.findings) >= 1
Please analyze these files to answer the user's question."""
def prepare_expert_analysis_context(self, consolidated_findings) -> str:
"""Prepare context for external model call for final analysis validation."""
context_parts = [
f"=== ANALYSIS REQUEST ===\\n{self.initial_request or 'Code analysis workflow initiated'}\\n=== END REQUEST ==="
]
return full_prompt
# Add investigation summary
investigation_summary = self._build_analysis_summary(consolidated_findings)
context_parts.append(
f"\\n=== CLAUDE'S ANALYSIS INVESTIGATION ===\\n{investigation_summary}\\n=== END INVESTIGATION ==="
)
def format_response(self, response: str, request: AnalyzeRequest, model_info: Optional[dict] = None) -> str:
"""Format the analysis response"""
return f"{response}\n\n---\n\n**Next Steps:** Use this analysis to actively continue your task. Investigate deeper into any findings, implement solutions based on these insights, and carry out the necessary work. Only pause to ask the user if you need their explicit approval for major changes or if critical decisions require their input."
# Add analysis configuration context if available
if self.analysis_config:
config_text = "\\n".join(f"- {key}: {value}" for key, value in self.analysis_config.items() if value)
context_parts.append(f"\\n=== ANALYSIS CONFIGURATION ===\\n{config_text}\\n=== END CONFIGURATION ===")
# Add relevant code elements if available
if consolidated_findings.relevant_context:
methods_text = "\\n".join(f"- {method}" for method in consolidated_findings.relevant_context)
context_parts.append(f"\\n=== RELEVANT CODE ELEMENTS ===\\n{methods_text}\\n=== END CODE ELEMENTS ===")
# Add assessment evolution if available
if consolidated_findings.hypotheses:
assessments_text = "\\n".join(
f"Step {h['step']}: {h['hypothesis']}" for h in consolidated_findings.hypotheses
)
context_parts.append(f"\\n=== ASSESSMENT EVOLUTION ===\\n{assessments_text}\\n=== END ASSESSMENTS ===")
# Add images if available
if consolidated_findings.images:
images_text = "\\n".join(f"- {img}" for img in consolidated_findings.images)
context_parts.append(
f"\\n=== VISUAL ANALYSIS INFORMATION ===\\n{images_text}\\n=== END VISUAL INFORMATION ==="
)
return "\\n".join(context_parts)
def _build_analysis_summary(self, consolidated_findings) -> str:
"""Prepare a comprehensive summary of the analysis investigation."""
summary_parts = [
"=== SYSTEMATIC ANALYSIS INVESTIGATION SUMMARY ===",
f"Total steps: {len(consolidated_findings.findings)}",
f"Files examined: {len(consolidated_findings.files_checked)}",
f"Relevant files identified: {len(consolidated_findings.relevant_files)}",
f"Code elements analyzed: {len(consolidated_findings.relevant_context)}",
"",
"=== INVESTIGATION PROGRESSION ===",
]
for finding in consolidated_findings.findings:
summary_parts.append(finding)
return "\\n".join(summary_parts)
def should_include_files_in_expert_prompt(self) -> bool:
"""Include files in expert analysis for comprehensive validation."""
return True
def should_embed_system_prompt(self) -> bool:
"""Embed system prompt in expert analysis for proper context."""
return True
def get_expert_thinking_mode(self) -> str:
"""Use high thinking mode for thorough analysis."""
return "high"
def get_expert_analysis_instruction(self) -> str:
"""Get specific instruction for analysis expert validation."""
return (
"Please provide comprehensive analysis validation based on the investigation findings. "
"Focus on identifying any remaining architectural insights, validating the completeness of the analysis, "
"and providing final strategic recommendations following the structured format specified in the system prompt."
)
# Hook method overrides for analyze-specific behavior
def prepare_step_data(self, request) -> dict:
"""
Map analyze-specific fields for internal processing.
"""
step_data = {
"step": request.step,
"step_number": request.step_number,
"findings": request.findings,
"files_checked": request.files_checked,
"relevant_files": request.relevant_files,
"relevant_context": request.relevant_context,
"issues_found": request.issues_found, # Analyze workflow uses issues_found for structured problem tracking
"confidence": "medium", # Fixed value for workflow compatibility
"hypothesis": request.findings, # Map findings to hypothesis for compatibility
"images": request.images or [],
}
return step_data
def should_skip_expert_analysis(self, request, consolidated_findings) -> bool:
"""
Analyze workflow always uses expert analysis for comprehensive validation.
Analysis benefits from a second opinion to ensure completeness and catch
any missed insights or alternative perspectives.
"""
return False
def store_initial_issue(self, step_description: str):
"""Store initial request for expert analysis."""
self.initial_request = step_description
# Override inheritance hooks for analyze-specific behavior
def get_completion_status(self) -> str:
"""Analyze tools use analysis-specific status."""
return "analysis_complete_ready_for_implementation"
def get_completion_data_key(self) -> str:
"""Analyze uses 'complete_analysis' key."""
return "complete_analysis"
def get_final_analysis_from_request(self, request):
"""Analyze tools use 'findings' field."""
return request.findings
def get_confidence_level(self, request) -> str:
"""Analyze tools use fixed confidence for consistency."""
return "medium"
def get_completion_message(self) -> str:
"""Analyze-specific completion message."""
return (
"Analysis complete. You have identified all significant patterns, "
"architectural insights, and strategic opportunities. MANDATORY: Present the user with the complete "
"analysis results organized by strategic impact, and IMMEDIATELY proceed with implementing the "
"highest priority recommendations or provide specific guidance for improvements. Focus on actionable "
"strategic insights."
)
def get_skip_reason(self) -> str:
"""Analyze-specific skip reason."""
return "Claude completed comprehensive analysis"
def get_skip_expert_analysis_status(self) -> str:
"""Analyze-specific expert analysis skip status."""
return "skipped_due_to_complete_analysis"
def prepare_work_summary(self) -> str:
"""Analyze-specific work summary."""
return self._build_analysis_summary(self.consolidated_findings)
def get_completion_next_steps_message(self, expert_analysis_used: bool = False) -> str:
"""
Analyze-specific completion message.
"""
base_message = (
"ANALYSIS IS COMPLETE. You MUST now summarize and present ALL analysis findings organized by "
"strategic impact (Critical → High → Medium → Low), specific architectural insights with code references, "
"and exact recommendations for improvement. Clearly prioritize the top 3 strategic opportunities that need "
"immediate attention. Provide concrete, actionable guidance for each finding—make it easy for a developer "
"to understand exactly what strategic improvements to implement and how to approach them."
)
# Add expert analysis guidance only when expert analysis was actually used
if expert_analysis_used:
expert_guidance = self.get_expert_analysis_guidance()
if expert_guidance:
return f"{base_message}\n\n{expert_guidance}"
return base_message
def get_expert_analysis_guidance(self) -> str:
"""
Provide specific guidance for handling expert analysis in code analysis.
"""
return (
"IMPORTANT: Analysis from an assistant model has been provided above. You MUST thoughtfully evaluate and validate "
"the expert insights rather than treating them as definitive conclusions. Cross-reference the expert "
"analysis with your own systematic investigation, verify that architectural recommendations are "
"appropriate for this codebase's scale and context, and ensure suggested improvements align with "
"the project's goals and constraints. Present a comprehensive synthesis that combines your detailed "
"analysis with validated expert perspectives, clearly distinguishing between patterns you've "
"independently identified and additional strategic insights from expert validation."
)
def get_step_guidance_message(self, request) -> str:
"""
Analyze-specific step guidance with detailed investigation instructions.
"""
step_guidance = self.get_analyze_step_guidance(request.step_number, request)
return step_guidance["next_steps"]
def get_analyze_step_guidance(self, step_number: int, request) -> dict[str, Any]:
"""
Provide step-specific guidance for analyze workflow.
"""
# Generate the next steps instruction based on required actions
required_actions = self.get_required_actions(step_number, "medium", request.findings, request.total_steps)
if step_number == 1:
next_steps = (
f"MANDATORY: DO NOT call the {self.get_name()} tool again immediately. You MUST first examine "
f"the code files thoroughly using appropriate tools. CRITICAL AWARENESS: You need to understand "
f"the architectural patterns, assess scalability and performance characteristics, identify strategic "
f"improvement areas, and look for systemic risks, overengineering, and missing abstractions. "
f"Use file reading tools, code analysis, and systematic examination to gather comprehensive information. "
f"Only call {self.get_name()} again AFTER completing your investigation. When you call "
f"{self.get_name()} next time, use step_number: {step_number + 1} and report specific "
f"files examined, architectural insights found, and strategic assessment discoveries."
)
elif step_number < request.total_steps:
next_steps = (
f"STOP! Do NOT call {self.get_name()} again yet. Based on your findings, you've identified areas that need "
f"deeper analysis. MANDATORY ACTIONS before calling {self.get_name()} step {step_number + 1}:\\n"
+ "\\n".join(f"{i+1}. {action}" for i, action in enumerate(required_actions))
+ f"\\n\\nOnly call {self.get_name()} again with step_number: {step_number + 1} AFTER "
+ "completing these analysis tasks."
)
else:
next_steps = (
f"WAIT! Your analysis needs final verification. DO NOT call {self.get_name()} immediately. REQUIRED ACTIONS:\\n"
+ "\\n".join(f"{i+1}. {action}" for i, action in enumerate(required_actions))
+ f"\\n\\nREMEMBER: Ensure you have identified all significant architectural insights and strategic "
f"opportunities across all areas. Document findings with specific file references and "
f"code examples where applicable, then call {self.get_name()} with step_number: {step_number + 1}."
)
return {"next_steps": next_steps}
def customize_workflow_response(self, response_data: dict, request) -> dict:
"""
Customize response to match analyze workflow format.
"""
# Store initial request on first step
if request.step_number == 1:
self.initial_request = request.step
# Store analysis configuration for expert analysis
if request.relevant_files:
self.analysis_config = {
"relevant_files": request.relevant_files,
"analysis_type": request.analysis_type,
"output_format": request.output_format,
}
# Convert generic status names to analyze-specific ones
tool_name = self.get_name()
status_mapping = {
f"{tool_name}_in_progress": "analysis_in_progress",
f"pause_for_{tool_name}": "pause_for_analysis",
f"{tool_name}_required": "analysis_required",
f"{tool_name}_complete": "analysis_complete",
}
if response_data["status"] in status_mapping:
response_data["status"] = status_mapping[response_data["status"]]
# Rename status field to match analyze workflow
if f"{tool_name}_status" in response_data:
response_data["analysis_status"] = response_data.pop(f"{tool_name}_status")
# Add analyze-specific status fields
response_data["analysis_status"]["insights_by_severity"] = {}
for insight in self.consolidated_findings.issues_found:
severity = insight.get("severity", "unknown")
if severity not in response_data["analysis_status"]["insights_by_severity"]:
response_data["analysis_status"]["insights_by_severity"][severity] = 0
response_data["analysis_status"]["insights_by_severity"][severity] += 1
response_data["analysis_status"]["analysis_confidence"] = self.get_request_confidence(request)
# Map complete_analyze to complete_analysis
if f"complete_{tool_name}" in response_data:
response_data["complete_analysis"] = response_data.pop(f"complete_{tool_name}")
# Map the completion flag to match analyze workflow
if f"{tool_name}_complete" in response_data:
response_data["analysis_complete"] = response_data.pop(f"{tool_name}_complete")
return response_data
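With get_name() returning "analyze", the generic-to-specific renaming above resolves to:

```python
status_mapping = {
    "analyze_in_progress": "analysis_in_progress",
    "pause_for_analyze": "pause_for_analysis",
    "analyze_required": "analysis_required",
    "analyze_complete": "analysis_complete",
}
```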
# Required abstract methods from BaseTool
def get_request_model(self):
"""Return the analyze workflow-specific request model."""
return AnalyzeWorkflowRequest
async def prepare_prompt(self, request) -> str:
"""Not used - workflow tools use execute_workflow()."""
return "" # Workflow tools use execute_workflow() directly

View File

@@ -691,6 +691,65 @@ class BaseTool(ABC):
return parts
def _extract_clean_content_for_history(self, formatted_content: str) -> str:
"""
Extract clean content suitable for conversation history storage.
This method removes internal metadata, continuation offers, and other
tool-specific formatting that should not appear in conversation history
when passed to expert models or other tools.
Args:
formatted_content: The full formatted response from the tool
Returns:
str: Clean content suitable for conversation history storage
"""
try:
# Try to parse as JSON first (for structured responses)
import json
response_data = json.loads(formatted_content)
# If it's a ToolOutput-like structure, extract just the content
if isinstance(response_data, dict) and "content" in response_data:
# Remove continuation_offer and other metadata fields
clean_data = {
"content": response_data.get("content", ""),
"status": response_data.get("status", "success"),
"content_type": response_data.get("content_type", "text"),
}
return json.dumps(clean_data, indent=2)
else:
# For non-ToolOutput JSON, return as-is but ensure no continuation_offer
if "continuation_offer" in response_data:
clean_data = {k: v for k, v in response_data.items() if k != "continuation_offer"}
return json.dumps(clean_data, indent=2)
return formatted_content
except (json.JSONDecodeError, TypeError):
# Not JSON, treat as plain text
# Remove any lines that contain continuation metadata
lines = formatted_content.split("\n")
clean_lines = []
for line in lines:
# Skip lines containing internal metadata patterns
if any(
pattern in line.lower()
for pattern in [
"continuation_id",
"remaining_turns",
"suggested_tool_params",
"if you'd like to continue",
"continuation available",
]
):
continue
clean_lines.append(line)
return "\n".join(clean_lines).strip()
def _prepare_file_content_for_prompt(
self,
request_files: list[str],
@@ -972,6 +1031,26 @@ When recommending searches, be specific about what information you need and why
f"Please provide the full absolute path starting with '/' (must be FULL absolute paths to real files / folders - DO NOT SHORTEN)"
)
# Check if request has 'files_checked' attribute (used by workflow tools)
if hasattr(request, "files_checked") and request.files_checked:
for file_path in request.files_checked:
if not os.path.isabs(file_path):
return (
f"Error: All file paths must be FULL absolute paths to real files / folders - DO NOT SHORTEN. "
f"Received relative path: {file_path}\n"
f"Please provide the full absolute path starting with '/' (must be FULL absolute paths to real files / folders - DO NOT SHORTEN)"
)
# Check if request has 'relevant_files' attribute (used by workflow tools)
if hasattr(request, "relevant_files") and request.relevant_files:
for file_path in request.relevant_files:
if not os.path.isabs(file_path):
return (
f"Error: All file paths must be FULL absolute paths to real files / folders - DO NOT SHORTEN. "
f"Received relative path: {file_path}\n"
f"Please provide the full absolute path starting with '/' (must be FULL absolute paths to real files / folders - DO NOT SHORTEN)"
)
# Check if request has 'path' attribute (used by review_changes tool)
if hasattr(request, "path") and request.path:
if not os.path.isabs(request.path):
@@ -1605,10 +1684,13 @@ When recommending searches, be specific about what information you need and why
if model_response:
model_metadata = {"usage": model_response.usage, "metadata": model_response.metadata}
# CRITICAL: Store clean content for conversation history (exclude internal metadata)
clean_content = self._extract_clean_content_for_history(formatted_content)
success = add_turn(
continuation_id,
"assistant",
formatted_content,
clean_content, # Use cleaned content instead of full formatted response
files=request_files,
images=request_images,
tool_name=self.name,
@@ -1728,10 +1810,13 @@ When recommending searches, be specific about what information you need and why
if model_response:
model_metadata = {"usage": model_response.usage, "metadata": model_response.metadata}
# CRITICAL: Store clean content for conversation history (exclude internal metadata)
clean_content = self._extract_clean_content_for_history(content)
add_turn(
thread_id,
"assistant",
content,
clean_content, # Use cleaned content instead of full formatted response
files=request_files,
images=request_images,
tool_name=self.name,

View File

@@ -1,316 +1,671 @@
"""
Code Review tool - Comprehensive code analysis and review
CodeReview Workflow tool - Systematic code review with step-by-step analysis
This tool provides professional-grade code review capabilities using
the chosen model's understanding of code patterns, best practices, and common issues.
It can analyze individual files or entire codebases, providing actionable
feedback categorized by severity.
This tool provides a structured workflow for comprehensive code review and analysis.
It guides Claude through systematic investigation steps with forced pauses between each step
to ensure thorough code examination, issue identification, and quality assessment before proceeding.
The tool supports complex review scenarios including security analysis, performance evaluation,
and architectural assessment.
Key Features:
- Multi-file and directory support
- Configurable review types (full, security, performance, quick)
- Severity-based issue filtering
- Custom focus areas and coding standards
- Structured output with specific remediation steps
Key features:
- Step-by-step code review workflow with progress tracking
- Context-aware file embedding (references during investigation, full content for analysis)
- Automatic issue tracking with severity classification
- Expert analysis integration with external models
- Support for focused reviews (security, performance, architecture)
- Confidence-based workflow optimization
"""
from typing import Any, Optional
import logging
from typing import TYPE_CHECKING, Any, Literal, Optional
from pydantic import Field
from pydantic import Field, model_validator
if TYPE_CHECKING:
from tools.models import ToolModelCategory
from config import TEMPERATURE_ANALYTICAL
from systemprompts import CODEREVIEW_PROMPT
from tools.shared.base_models import WorkflowRequest
from .base import BaseTool, ToolRequest
from .workflow.base import WorkflowTool
# Field descriptions to avoid duplication between Pydantic and JSON schema
CODEREVIEW_FIELD_DESCRIPTIONS = {
"files": "Code files or directories to review that are relevant to the code that needs review or are closely "
"related to the code or component that needs to be reviewed (must be FULL absolute paths to real files / folders - DO NOT SHORTEN)."
"Validate that these files exist on disk before sharing and only share code that is relevant.",
"prompt": (
"User's summary of what the code does, expected behavior, constraints, and review objectives. "
"IMPORTANT: Before using this tool, you should first perform its own preliminary review - "
"examining the code structure, identifying potential issues, understanding the business logic, "
"and noting areas of concern. Include your initial observations about code quality, potential "
"bugs, architectural patterns, and specific areas that need deeper scrutiny. This dual-perspective "
"approach (your analysis + external model's review) provides more comprehensive feedback and "
"catches issues that either reviewer might miss alone."
logger = logging.getLogger(__name__)
# Tool-specific field descriptions for code review workflow
CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS = {
"step": (
"Describe what you're currently investigating for code review by thinking deeply about the code structure, "
"patterns, and potential issues. In step 1, clearly state your review plan and begin forming a systematic "
"approach after thinking carefully about what needs to be analyzed. CRITICAL: Remember to thoroughly examine "
"code quality, security implications, performance concerns, and architectural patterns. Consider not only "
"obvious bugs and issues but also subtle concerns like over-engineering, unnecessary complexity, design "
"patterns that could be simplified, areas where architecture might not scale well, missing abstractions, "
"and ways to reduce complexity while maintaining functionality. Map out the codebase structure, understand "
"the business logic, and identify areas requiring deeper analysis. In all later steps, continue exploring "
"with precision: trace dependencies, verify assumptions, and adapt your understanding as you uncover more evidence."
),
"step_number": (
"The index of the current step in the code review sequence, beginning at 1. Each step should build upon or "
"revise the previous one."
),
"total_steps": (
"Your current estimate for how many steps will be needed to complete the code review. "
"Adjust as new findings emerge."
),
"next_step_required": (
"Set to true if you plan to continue the investigation with another step. False means you believe the "
"code review analysis is complete and ready for expert validation."
),
"findings": (
"Summarize everything discovered in this step about the code being reviewed. Include analysis of code quality, "
"security concerns, performance issues, architectural patterns, design decisions, potential bugs, code smells, "
"and maintainability considerations. Be specific and avoid vague language—document what you now know about "
"the code and how it affects your assessment. IMPORTANT: Document both positive findings (good patterns, "
"proper implementations, well-designed components) and concerns (potential issues, anti-patterns, security "
"risks, performance bottlenecks). In later steps, confirm or update past findings with additional evidence."
),
"files_checked": (
"List all files (as absolute paths, do not clip or shrink file names) examined during the code review "
"investigation so far. Include even files ruled out or found to be unrelated, as this tracks your "
"exploration path."
),
"relevant_files": (
"Subset of files_checked (as full absolute paths) that contain code directly relevant to the review or "
"contain significant issues, patterns, or examples worth highlighting. Only list those that are directly "
"tied to important findings, security concerns, performance issues, or architectural decisions. This could "
"include core implementation files, configuration files, or files with notable patterns."
),
"relevant_context": (
"List methods, functions, classes, or modules that are central to the code review findings, in the format "
"'ClassName.methodName', 'functionName', or 'module.ClassName'. Prioritize those that contain issues, "
"demonstrate patterns, show security concerns, or represent key architectural decisions."
),
"issues_found": (
"List of issues identified during the investigation. Each issue should be a dictionary with 'severity' "
"(critical, high, medium, low) and 'description' fields. Include security vulnerabilities, performance "
"bottlenecks, code quality issues, architectural concerns, maintainability problems, over-engineering, "
"unnecessary complexity, etc."
),
"confidence": (
"Indicate your current confidence in the code review assessment. Use: 'exploring' (starting analysis), 'low' "
"(early investigation), 'medium' (some evidence gathered), 'high' (strong evidence), 'certain' (only when "
"the code review is thoroughly complete and all significant issues are identified). Do NOT use 'certain' "
"unless the code review is comprehensively complete, use 'high' instead not 100% sure. Using 'certain' "
"prevents additional expert analysis."
),
"backtrack_from_step": (
"If an earlier finding or assessment needs to be revised or discarded, specify the step number from which to "
"start over. Use this to acknowledge investigative dead ends and correct the course."
),
"images": (
"Optional images of architecture diagrams, UI mockups, design documents, or visual references "
"for code review context"
"Optional list of absolute paths to architecture diagrams, UI mockups, design documents, or visual references "
"that help with code review context. Only include if they materially assist understanding or assessment."
),
"review_type": "Type of review to perform",
"focus_on": "Specific aspects to focus on, or additional context that would help understand areas of concern",
"standards": "Coding standards to enforce",
"severity_filter": "Minimum severity level to report",
"review_type": "Type of review to perform (full, security, performance, quick)",
"focus_on": "Specific aspects to focus on or additional context that would help understand areas of concern",
"standards": "Coding standards to enforce during the review",
"severity_filter": "Minimum severity level to report on the issues found",
}
class CodeReviewRequest(ToolRequest):
"""
Request model for the code review tool.
class CodeReviewRequest(WorkflowRequest):
"""Request model for code review workflow investigation steps"""
This model defines all parameters that can be used to customize
the code review process, from selecting files to specifying
review focus and standards.
# Required fields for each investigation step
step: str = Field(..., description=CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["step"])
step_number: int = Field(..., description=CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["step_number"])
total_steps: int = Field(..., description=CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["total_steps"])
next_step_required: bool = Field(..., description=CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["next_step_required"])
# Investigation tracking fields
findings: str = Field(..., description=CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["findings"])
files_checked: list[str] = Field(
default_factory=list, description=CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["files_checked"]
)
relevant_files: list[str] = Field(
default_factory=list, description=CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["relevant_files"]
)
relevant_context: list[str] = Field(
default_factory=list, description=CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["relevant_context"]
)
issues_found: list[dict] = Field(
default_factory=list, description=CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["issues_found"]
)
confidence: Optional[str] = Field("low", description=CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["confidence"])
# Optional backtracking field
backtrack_from_step: Optional[int] = Field(
None, description=CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["backtrack_from_step"]
)
# Optional images for visual context
images: Optional[list[str]] = Field(default=None, description=CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["images"])
# Code review-specific fields (only used in step 1 to initialize)
review_type: Optional[Literal["full", "security", "performance", "quick"]] = Field(
"full", description=CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["review_type"]
)
focus_on: Optional[str] = Field(None, description=CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["focus_on"])
standards: Optional[str] = Field(None, description=CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["standards"])
severity_filter: Optional[Literal["critical", "high", "medium", "low", "all"]] = Field(
"all", description=CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["severity_filter"]
)
# Override inherited fields to exclude them from schema (except model which needs to be available)
temperature: Optional[float] = Field(default=None, exclude=True)
thinking_mode: Optional[str] = Field(default=None, exclude=True)
use_websearch: Optional[bool] = Field(default=None, exclude=True)
@model_validator(mode="after")
def validate_step_one_requirements(self):
"""Ensure step 1 has required relevant_files field."""
if self.step_number == 1 and not self.relevant_files:
raise ValueError("Step 1 requires 'relevant_files' field to specify code files or directories to review")
return self
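As with the analyze workflow, step 1 must name the code under review; a minimal illustrative request (values are made up):

```python
CodeReviewRequest(  # the model defined above
    step="Plan a security-focused review of the auth handlers",
    step_number=1,
    total_steps=3,
    next_step_required=True,
    findings="Initial review plan drafted",
    relevant_files=["/repo/app/auth/handlers.py"],  # required on step 1
    review_type="security",
    severity_filter="high",
)
```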
class CodeReviewTool(WorkflowTool):
"""
Code Review workflow tool for step-by-step code review and expert analysis.
This tool implements a structured code review workflow that guides users through
methodical investigation steps, ensuring thorough code examination, issue identification,
and quality assessment before reaching conclusions. It supports complex review scenarios
including security audits, performance analysis, architectural review, and maintainability assessment.
"""
files: list[str] = Field(..., description=CODEREVIEW_FIELD_DESCRIPTIONS["files"])
prompt: str = Field(..., description=CODEREVIEW_FIELD_DESCRIPTIONS["prompt"])
images: Optional[list[str]] = Field(None, description=CODEREVIEW_FIELD_DESCRIPTIONS["images"])
review_type: str = Field("full", description=CODEREVIEW_FIELD_DESCRIPTIONS["review_type"])
focus_on: Optional[str] = Field(None, description=CODEREVIEW_FIELD_DESCRIPTIONS["focus_on"])
standards: Optional[str] = Field(None, description=CODEREVIEW_FIELD_DESCRIPTIONS["standards"])
severity_filter: str = Field("all", description=CODEREVIEW_FIELD_DESCRIPTIONS["severity_filter"])
class CodeReviewTool(BaseTool):
"""
Professional code review tool implementation.
This tool analyzes code for bugs, security vulnerabilities, performance
issues, and code quality problems. It provides detailed feedback with
severity ratings and specific remediation steps.
"""
def __init__(self):
super().__init__()
self.initial_request = None
self.review_config = {}
def get_name(self) -> str:
return "codereview"
def get_description(self) -> str:
return (
"PROFESSIONAL CODE REVIEW - Comprehensive analysis for bugs, security, and quality. "
"Supports both individual files and entire directories/projects. "
"Use this when you need to review code, check for issues, find bugs, or perform security audits. "
"ALSO use this to validate claims about code, verify code flow and logic, confirm assertions, "
"cross-check functionality, or investigate how code actually behaves when you need to be certain. "
"I'll identify issues by severity (Critical→High→Medium→Low) with specific fixes. "
"Supports focused reviews: security, performance, or quick checks. "
"Choose thinking_mode based on review scope: 'low' for small code snippets, "
"'medium' for standard files/modules (default), 'high' for complex systems/architectures, "
"'max' for critical security audits or large codebases requiring deepest analysis. "
"Note: If you're not currently using a top-tier model such as Opus 4 or above, these tools "
"can provide enhanced capabilities."
"COMPREHENSIVE CODE REVIEW WORKFLOW - Step-by-step code review with expert analysis. "
"This tool guides you through a systematic investigation process where you:\\n\\n"
"1. Start with step 1: describe your code review investigation plan\\n"
"2. STOP and investigate code structure, patterns, and potential issues\\n"
"3. Report findings in step 2 with concrete evidence from actual code analysis\\n"
"4. Continue investigating between each step\\n"
"5. Track findings, relevant files, and issues throughout\\n"
"6. Update assessments as understanding evolves\\n"
"7. Once investigation is complete, receive expert analysis\\n\\n"
"IMPORTANT: This tool enforces investigation between steps:\\n"
"- After each call, you MUST investigate before calling again\\n"
"- Each step must include NEW evidence from code examination\\n"
"- No recursive calls without actual investigation work\\n"
"- The tool will specify which step number to use next\\n"
"- Follow the required_actions list for investigation guidance\\n\\n"
"Perfect for: comprehensive code review, security audits, performance analysis, "
"architectural assessment, code quality evaluation, anti-pattern detection."
)
def get_input_schema(self) -> dict[str, Any]:
schema = {
"type": "object",
"properties": {
"files": {
"type": "array",
"items": {"type": "string"},
"description": CODEREVIEW_FIELD_DESCRIPTIONS["files"],
},
"model": self.get_model_field_schema(),
"prompt": {
"type": "string",
"description": CODEREVIEW_FIELD_DESCRIPTIONS["prompt"],
},
"images": {
"type": "array",
"items": {"type": "string"},
"description": CODEREVIEW_FIELD_DESCRIPTIONS["images"],
},
"review_type": {
"type": "string",
"enum": ["full", "security", "performance", "quick"],
"default": "full",
"description": CODEREVIEW_FIELD_DESCRIPTIONS["review_type"],
},
"focus_on": {
"type": "string",
"description": CODEREVIEW_FIELD_DESCRIPTIONS["focus_on"],
},
"standards": {
"type": "string",
"description": CODEREVIEW_FIELD_DESCRIPTIONS["standards"],
},
"severity_filter": {
"type": "string",
"enum": ["critical", "high", "medium", "low", "all"],
"default": "all",
"description": CODEREVIEW_FIELD_DESCRIPTIONS["severity_filter"],
},
"temperature": {
"type": "number",
"description": "Temperature (0-1, default 0.2 for consistency)",
"minimum": 0,
"maximum": 1,
},
"thinking_mode": {
"type": "string",
"enum": ["minimal", "low", "medium", "high", "max"],
"description": (
"Thinking depth: minimal (0.5% of model max), low (8%), medium (33%), high (67%), "
"max (100% of model max)"
),
},
"use_websearch": {
"type": "boolean",
"description": (
"Enable web search for documentation, best practices, and current information. "
"Particularly useful for: brainstorming sessions, architectural design discussions, "
"exploring industry best practices, working with specific frameworks/technologies, "
"researching solutions to complex problems, or when current documentation and community "
"insights would enhance the analysis."
),
"default": True,
},
"continuation_id": {
"type": "string",
"description": (
"Thread continuation ID for multi-turn conversations. Can be used to continue "
"conversations across different tools. Only provide this if continuing a previous "
"conversation thread."
),
},
},
"required": ["files", "prompt"] + (["model"] if self.is_effective_auto_mode() else []),
}
return schema
def get_system_prompt(self) -> str:
return CODEREVIEW_PROMPT
def get_default_temperature(self) -> float:
return TEMPERATURE_ANALYTICAL
# Line numbers are enabled by default from base class for precise feedback
def get_model_category(self) -> "ToolModelCategory":
"""Code review requires thorough analysis and reasoning"""
from tools.models import ToolModelCategory
def get_request_model(self):
return ToolModelCategory.EXTENDED_REASONING
def get_workflow_request_model(self):
"""Return the code review workflow-specific request model."""
return CodeReviewRequest
async def prepare_prompt(self, request: CodeReviewRequest) -> str:
"""
Prepare the code review prompt with customized instructions.
def get_input_schema(self) -> dict[str, Any]:
"""Generate input schema using WorkflowSchemaBuilder with code review-specific overrides."""
from .workflow.schema_builders import WorkflowSchemaBuilder
This method reads the requested files, validates token limits,
and constructs a detailed prompt based on the review parameters.
# Code review workflow-specific field overrides
codereview_field_overrides = {
"step": {
"type": "string",
"description": CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["step"],
},
"step_number": {
"type": "integer",
"minimum": 1,
"description": CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["step_number"],
},
"total_steps": {
"type": "integer",
"minimum": 1,
"description": CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["total_steps"],
},
"next_step_required": {
"type": "boolean",
"description": CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["next_step_required"],
},
"findings": {
"type": "string",
"description": CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["findings"],
},
"files_checked": {
"type": "array",
"items": {"type": "string"},
"description": CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["files_checked"],
},
"relevant_files": {
"type": "array",
"items": {"type": "string"},
"description": CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["relevant_files"],
},
"confidence": {
"type": "string",
"enum": ["exploring", "low", "medium", "high", "certain"],
"description": CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["confidence"],
},
"backtrack_from_step": {
"type": "integer",
"minimum": 1,
"description": CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["backtrack_from_step"],
},
"issues_found": {
"type": "array",
"items": {"type": "object"},
"description": CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["issues_found"],
},
"images": {
"type": "array",
"items": {"type": "string"},
"description": CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["images"],
},
# Code review-specific fields (for step 1)
"review_type": {
"type": "string",
"enum": ["full", "security", "performance", "quick"],
"default": "full",
"description": CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["review_type"],
},
"focus_on": {
"type": "string",
"description": CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["focus_on"],
},
"standards": {
"type": "string",
"description": CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["standards"],
},
"severity_filter": {
"type": "string",
"enum": ["critical", "high", "medium", "low", "all"],
"default": "all",
"description": CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS["severity_filter"],
},
}
Args:
request: The validated review request
Returns:
str: Complete prompt for the model
Raises:
ValueError: If the code exceeds token limits
"""
# Check for prompt.txt in files
prompt_content, updated_files = self.handle_prompt_file(request.files)
# If prompt.txt was found, incorporate it into the prompt
if prompt_content:
request.prompt = prompt_content + "\n\n" + request.prompt
# Update request files list
if updated_files is not None:
request.files = updated_files
# File size validation happens at MCP boundary in server.py
# Check user input size at MCP transport boundary (before adding internal content)
user_content = request.prompt
size_check = self.check_prompt_size(user_content)
if size_check:
from tools.models import ToolOutput
raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**size_check).model_dump_json()}")
# Also check focus_on field if provided (user input)
if request.focus_on:
focus_size_check = self.check_prompt_size(request.focus_on)
if focus_size_check:
from tools.models import ToolOutput
raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**focus_size_check).model_dump_json()}")
# Use centralized file processing logic
continuation_id = getattr(request, "continuation_id", None)
file_content, processed_files = self._prepare_file_content_for_prompt(request.files, continuation_id, "Code")
self._actually_processed_files = processed_files
# Build customized review instructions based on review type
review_focus = []
if request.review_type == "security":
review_focus.append("Focus on security vulnerabilities and authentication issues")
elif request.review_type == "performance":
review_focus.append("Focus on performance bottlenecks and optimization opportunities")
elif request.review_type == "quick":
review_focus.append("Provide a quick review focusing on critical issues only")
# Add any additional focus areas specified by the user
if request.focus_on:
review_focus.append(f"Pay special attention to: {request.focus_on}")
# Include custom coding standards if provided
if request.standards:
review_focus.append(f"Enforce these standards: {request.standards}")
# Apply severity filtering to reduce noise if requested
if request.severity_filter != "all":
review_focus.append(f"Only report issues of {request.severity_filter} severity or higher")
focus_instruction = "\n".join(review_focus) if review_focus else ""
# Add web search instruction if enabled
websearch_instruction = self.get_websearch_instruction(
request.use_websearch,
"""When reviewing code, consider if searches for these would help:
- Security vulnerabilities and CVEs for libraries/frameworks used
- Best practices for the languages and frameworks in the code
- Common anti-patterns and their solutions
- Performance optimization techniques
- Recent updates or deprecations in APIs used""",
# Use WorkflowSchemaBuilder with code review-specific tool fields
return WorkflowSchemaBuilder.build_schema(
tool_specific_fields=codereview_field_overrides,
model_field_schema=self.get_model_field_schema(),
auto_mode=self.is_effective_auto_mode(),
tool_name=self.get_name(),
)
# Construct the complete prompt with system instructions and code
full_prompt = f"""{self.get_system_prompt()}{websearch_instruction}
def get_required_actions(self, step_number: int, confidence: str, findings: str, total_steps: int) -> list[str]:
"""Define required actions for each investigation phase."""
if step_number == 1:
# Initial code review investigation tasks
return [
"Read and understand the code files specified for review",
"Examine the overall structure, architecture, and design patterns used",
"Identify the main components, classes, and functions in the codebase",
"Understand the business logic and intended functionality",
"Look for obvious issues: bugs, security concerns, performance problems",
"Note any code smells, anti-patterns, or areas of concern",
]
elif confidence in ["exploring", "low"]:
# Need deeper investigation
return [
"Examine specific code sections you've identified as concerning",
"Analyze security implications: input validation, authentication, authorization",
"Check for performance issues: algorithmic complexity, resource usage, inefficiencies",
"Look for architectural problems: tight coupling, missing abstractions, scalability issues",
"Identify code quality issues: readability, maintainability, error handling",
"Search for over-engineering, unnecessary complexity, or design patterns that could be simplified",
]
elif confidence in ["medium", "high"]:
# Close to completion - need final verification
return [
"Verify all identified issues have been properly documented with severity levels",
"Check for any missed critical security vulnerabilities or performance bottlenecks",
"Confirm that architectural concerns and code quality issues are comprehensively captured",
"Ensure positive aspects and well-implemented patterns are also noted",
"Validate that your assessment aligns with the review type and focus areas specified",
"Double-check that findings are actionable and provide clear guidance for improvements",
]
else:
# General investigation needed
return [
"Continue examining the codebase for additional patterns and potential issues",
"Gather more evidence using appropriate code analysis techniques",
"Test your assumptions about code behavior and design decisions",
"Look for patterns that confirm or refute your current assessment",
"Focus on areas that haven't been thoroughly examined yet",
]
=== USER CONTEXT ===
{request.prompt}
=== END CONTEXT ===
{focus_instruction}
=== CODE TO REVIEW ===
{file_content}
=== END CODE ===
Please provide a code review aligned with the user's context and expectations, following the format specified """
"in the system prompt." ""
return full_prompt
def format_response(self, response: str, request: CodeReviewRequest, model_info: Optional[dict] = None) -> str:
def should_call_expert_analysis(self, consolidated_findings, request=None) -> bool:
"""
Format the review response.
Decide when to call external model based on investigation completeness.
Args:
response: The raw review from the model
request: The original request for context
model_info: Optional dict with model metadata
Returns:
str: Formatted response with next steps
Don't call expert analysis if Claude's confidence is "certain" - trust that judgment.
"""
return f"""{response}
# Check if user requested to skip assistant model
if request and not self.get_request_use_assistant_model(request):
return False
---
# Check if we have meaningful investigation data
return (
len(consolidated_findings.relevant_files) > 0
or len(consolidated_findings.findings) >= 2
or len(consolidated_findings.issues_found) > 0
)
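# For example (illustrative): a run that recorded two findings but no relevant files or
# issues still qualifies for expert analysis, while a single finding with nothing else
# does not.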
**Your Next Steps:**
def prepare_expert_analysis_context(self, consolidated_findings) -> str:
"""Prepare context for external model call for final code review validation."""
context_parts = [
f"=== CODE REVIEW REQUEST ===\\n{self.initial_request or 'Code review workflow initiated'}\\n=== END REQUEST ==="
]
1. **Understand the Context**: First examine the specific functions, files, and code sections mentioned in """
"""the review to understand each issue thoroughly.
# Add investigation summary
investigation_summary = self._build_code_review_summary(consolidated_findings)
context_parts.append(
f"\\n=== CLAUDE'S CODE REVIEW INVESTIGATION ===\\n{investigation_summary}\\n=== END INVESTIGATION ==="
)
2. **Present Options to User**: After understanding the issues, ask the user which specific improvements """
"""they would like to implement, presenting them as a clear list of options.
# Add review configuration context if available
if self.review_config:
config_text = "\\n".join(f"- {key}: {value}" for key, value in self.review_config.items() if value)
context_parts.append(f"\\n=== REVIEW CONFIGURATION ===\\n{config_text}\\n=== END CONFIGURATION ===")
3. **Implement Selected Fixes**: Only implement the fixes the user chooses, ensuring each change is made """
"""correctly and maintains code quality.
# Add relevant code elements if available
if consolidated_findings.relevant_context:
methods_text = "\\n".join(f"- {method}" for method in consolidated_findings.relevant_context)
context_parts.append(f"\\n=== RELEVANT CODE ELEMENTS ===\\n{methods_text}\\n=== END CODE ELEMENTS ===")
Remember: Always understand the code context before suggesting fixes, and let the user decide which """
"""improvements to implement."""
# Add issues found if available
if consolidated_findings.issues_found:
issues_text = "\\n".join(
f"[{issue.get('severity', 'unknown').upper()}] {issue.get('description', 'No description')}"
for issue in consolidated_findings.issues_found
)
context_parts.append(f"\\n=== ISSUES IDENTIFIED ===\\n{issues_text}\\n=== END ISSUES ===")
# Add assessment evolution if available
if consolidated_findings.hypotheses:
assessments_text = "\n".join(
f"Step {h['step']} ({h['confidence']} confidence): {h['hypothesis']}"
for h in consolidated_findings.hypotheses
)
context_parts.append(f"\n=== ASSESSMENT EVOLUTION ===\n{assessments_text}\n=== END ASSESSMENTS ===")
# Add images if available
if consolidated_findings.images:
images_text = "\\n".join(f"- {img}" for img in consolidated_findings.images)
context_parts.append(
f"\\n=== VISUAL REVIEW INFORMATION ===\\n{images_text}\\n=== END VISUAL INFORMATION ==="
)
return "\\n".join(context_parts)
def _build_code_review_summary(self, consolidated_findings) -> str:
"""Prepare a comprehensive summary of the code review investigation."""
summary_parts = [
"=== SYSTEMATIC CODE REVIEW INVESTIGATION SUMMARY ===",
f"Total steps: {len(consolidated_findings.findings)}",
f"Files examined: {len(consolidated_findings.files_checked)}",
f"Relevant files identified: {len(consolidated_findings.relevant_files)}",
f"Code elements analyzed: {len(consolidated_findings.relevant_context)}",
f"Issues identified: {len(consolidated_findings.issues_found)}",
"",
"=== INVESTIGATION PROGRESSION ===",
]
for finding in consolidated_findings.findings:
summary_parts.append(finding)
return "\\n".join(summary_parts)
def should_include_files_in_expert_prompt(self) -> bool:
"""Include files in expert analysis for comprehensive code review."""
return True
def should_embed_system_prompt(self) -> bool:
"""Embed system prompt in expert analysis for proper context."""
return True
def get_expert_thinking_mode(self) -> str:
"""Use high thinking mode for thorough code review analysis."""
return "high"
def get_expert_analysis_instruction(self) -> str:
"""Get specific instruction for code review expert analysis."""
return (
"Please provide comprehensive code review analysis based on the investigation findings. "
"Focus on identifying any remaining issues, validating the completeness of the analysis, "
"and providing final recommendations for code improvements, following the severity-based "
"format specified in the system prompt."
)
# Hook method overrides for code review-specific behavior
def prepare_step_data(self, request) -> dict:
"""
Map code review-specific fields for internal processing.
"""
step_data = {
"step": request.step,
"step_number": request.step_number,
"findings": request.findings,
"files_checked": request.files_checked,
"relevant_files": request.relevant_files,
"relevant_context": request.relevant_context,
"issues_found": request.issues_found,
"confidence": request.confidence,
"hypothesis": request.findings, # Map findings to hypothesis for compatibility
"images": request.images or [],
}
return step_data
def should_skip_expert_analysis(self, request, consolidated_findings) -> bool:
"""
Code review workflow skips expert analysis when Claude has "certain" confidence.
"""
return request.confidence == "certain" and not request.next_step_required
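# For example (illustrative): a final step submitted with confidence="certain" and
# next_step_required=False completes without consulting the external model; any lower
# confidence still routes through expert analysis.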
def store_initial_issue(self, step_description: str):
"""Store initial request for expert analysis."""
self.initial_request = step_description
# Override inheritance hooks for code review-specific behavior
def get_completion_status(self) -> str:
"""Code review tools use review-specific status."""
return "code_review_complete_ready_for_implementation"
def get_completion_data_key(self) -> str:
"""Code review uses 'complete_code_review' key."""
return "complete_code_review"
def get_final_analysis_from_request(self, request):
"""Code review tools use 'findings' field."""
return request.findings
def get_confidence_level(self, request) -> str:
"""Code review tools use 'certain' for high confidence."""
return "certain"
def get_completion_message(self) -> str:
"""Code review-specific completion message."""
return (
"Code review complete with CERTAIN confidence. You have identified all significant issues "
"and provided comprehensive analysis. MANDATORY: Present the user with the complete review results "
"categorized by severity, and IMMEDIATELY proceed with implementing the highest priority fixes "
"or provide specific guidance for improvements. Focus on actionable recommendations."
)
def get_skip_reason(self) -> str:
"""Code review-specific skip reason."""
return "Claude completed comprehensive code review with full confidence"
def get_skip_expert_analysis_status(self) -> str:
"""Code review-specific expert analysis skip status."""
return "skipped_due_to_certain_review_confidence"
def prepare_work_summary(self) -> str:
"""Code review-specific work summary."""
return self._build_code_review_summary(self.consolidated_findings)
def get_completion_next_steps_message(self, expert_analysis_used: bool = False) -> str:
"""
Code review-specific completion message.
"""
base_message = (
"CODE REVIEW IS COMPLETE. You MUST now summarize and present ALL review findings organized by "
"severity (Critical → High → Medium → Low), specific code locations with line numbers, and exact "
"recommendations for improvement. Clearly prioritize the top 3 issues that need immediate attention. "
"Provide concrete, actionable guidance for each issue—make it easy for a developer to understand "
"exactly what needs to be fixed and how to implement the improvements."
)
# Add expert analysis guidance only when expert analysis was actually used
if expert_analysis_used:
expert_guidance = self.get_expert_analysis_guidance()
if expert_guidance:
return f"{base_message}\n\n{expert_guidance}"
return base_message
def get_expert_analysis_guidance(self) -> str:
"""
Provide specific guidance for handling expert analysis in code reviews.
"""
return (
"IMPORTANT: Analysis from an assistant model has been provided above. You MUST critically evaluate and validate "
"the expert findings rather than accepting them blindly. Cross-reference the expert analysis with "
"your own investigation findings, verify that suggested improvements are appropriate for this "
"codebase's context and patterns, and ensure recommendations align with the project's standards. "
"Present a synthesis that combines your systematic review with validated expert insights, clearly "
"distinguishing between findings you've independently confirmed and additional insights from expert analysis."
)
def get_step_guidance_message(self, request) -> str:
"""
Code review-specific step guidance with detailed investigation instructions.
"""
step_guidance = self.get_code_review_step_guidance(request.step_number, request.confidence, request)
return step_guidance["next_steps"]
def get_code_review_step_guidance(self, step_number: int, confidence: str, request) -> dict[str, Any]:
"""
Provide step-specific guidance for code review workflow.
"""
# Generate the next steps instruction based on required actions
required_actions = self.get_required_actions(step_number, confidence, request.findings, request.total_steps)
if step_number == 1:
next_steps = (
f"MANDATORY: DO NOT call the {self.get_name()} tool again immediately. You MUST first examine "
f"the code files thoroughly using appropriate tools. CRITICAL AWARENESS: You need to understand "
f"the code structure, identify potential issues across security, performance, and quality dimensions, "
f"and look for architectural concerns, over-engineering, unnecessary complexity, and scalability issues. "
f"Use file reading tools, code analysis, and systematic examination to gather comprehensive information. "
f"Only call {self.get_name()} again AFTER completing your investigation. When you call "
f"{self.get_name()} next time, use step_number: {step_number + 1} and report specific "
f"files examined, issues found, and code quality assessments discovered."
)
elif confidence in ["exploring", "low"]:
next_steps = (
f"STOP! Do NOT call {self.get_name()} again yet. Based on your findings, you've identified areas that need "
f"deeper analysis. MANDATORY ACTIONS before calling {self.get_name()} step {step_number + 1}:\\n"
+ "\\n".join(f"{i+1}. {action}" for i, action in enumerate(required_actions))
+ f"\\n\\nOnly call {self.get_name()} again with step_number: {step_number + 1} AFTER "
+ "completing these code review tasks."
)
elif confidence in ["medium", "high"]:
next_steps = (
f"WAIT! Your code review needs final verification. DO NOT call {self.get_name()} immediately. REQUIRED ACTIONS:\\n"
+ "\\n".join(f"{i+1}. {action}" for i, action in enumerate(required_actions))
+ f"\\n\\nREMEMBER: Ensure you have identified all significant issues across all severity levels and "
f"verified the completeness of your review. Document findings with specific file references and "
f"line numbers where applicable, then call {self.get_name()} with step_number: {step_number + 1}."
)
else:
next_steps = (
f"PAUSE REVIEW. Before calling {self.get_name()} step {step_number + 1}, you MUST examine more code thoroughly. "
+ "Required: "
+ ", ".join(required_actions[:2])
+ ". "
+ f"Your next {self.get_name()} call (step_number: {step_number + 1}) must include "
f"NEW evidence from actual code analysis, not just theories. NO recursive {self.get_name()} calls "
f"without investigation work!"
)
return {"next_steps": next_steps}
def customize_workflow_response(self, response_data: dict, request) -> dict:
"""
Customize response to match code review workflow format.
"""
# Store initial request on first step
if request.step_number == 1:
self.initial_request = request.step
# Store review configuration for expert analysis
if request.relevant_files:
self.review_config = {
"relevant_files": request.relevant_files,
"review_type": request.review_type,
"focus_on": request.focus_on,
"standards": request.standards,
"severity_filter": request.severity_filter,
}
# Convert generic status names to code review-specific ones
tool_name = self.get_name()
status_mapping = {
f"{tool_name}_in_progress": "code_review_in_progress",
f"pause_for_{tool_name}": "pause_for_code_review",
f"{tool_name}_required": "code_review_required",
f"{tool_name}_complete": "code_review_complete",
}
if response_data["status"] in status_mapping:
response_data["status"] = status_mapping[response_data["status"]]
# Rename status field to match code review workflow
if f"{tool_name}_status" in response_data:
response_data["code_review_status"] = response_data.pop(f"{tool_name}_status")
# Add code review-specific status fields
response_data["code_review_status"]["issues_by_severity"] = {}
for issue in self.consolidated_findings.issues_found:
severity = issue.get("severity", "unknown")
if severity not in response_data["code_review_status"]["issues_by_severity"]:
response_data["code_review_status"]["issues_by_severity"][severity] = 0
response_data["code_review_status"]["issues_by_severity"][severity] += 1
response_data["code_review_status"]["review_confidence"] = self.get_request_confidence(request)
# Map complete_codereviewworkflow to complete_code_review
if f"complete_{tool_name}" in response_data:
response_data["complete_code_review"] = response_data.pop(f"complete_{tool_name}")
# Map the completion flag to match code review workflow
if f"{tool_name}_complete" in response_data:
response_data["code_review_complete"] = response_data.pop(f"{tool_name}_complete")
return response_data
# Required abstract methods from BaseTool
def get_request_model(self):
"""Return the code review workflow-specific request model."""
return CodeReviewRequest
async def prepare_prompt(self, request) -> str:
"""Not used - workflow tools use execute_workflow()."""
return "" # Workflow tools use execute_workflow() directly

File diff suppressed because it is too large

View File

@@ -1,80 +1,43 @@
"""
Planner tool
Interactive Sequential Planner - Break down complex tasks through step-by-step planning
This tool helps you break down complex ideas, problems, or projects into multiple
manageable steps. It enables Claude to think through larger problems sequentially, creating
detailed action plans with clear dependencies and alternatives where applicable.
This tool enables structured planning through an interactive, step-by-step process that builds
plans incrementally with the ability to revise, branch, and adapt as understanding deepens.
=== CONTINUATION FLOW LOGIC ===
The planner guides users through sequential thinking with forced pauses between steps to ensure
thorough consideration of alternatives, dependencies, and strategic decisions before moving to
tactical implementation details.
The tool implements sophisticated continuation logic that enables multi-session planning:
Key features:
- Sequential planning with full context awareness
- Forced deep reflection for complex plans (≥5 steps) in early stages
- Branching capabilities for exploring alternative approaches
- Revision capabilities to update earlier decisions
- Dynamic step count adjustment as plans evolve
- Self-contained completion without external expert analysis
RULE 1: No continuation_id + step_number=1
→ Creates NEW planning thread
→ NO previous context loaded
→ Returns continuation_id for future steps
RULE 2: continuation_id provided + step_number=1
→ Loads PREVIOUS COMPLETE PLAN as context
→ Starts NEW planning session with historical context
→ Claude sees summary of previous completed plan
RULE 3: continuation_id provided + step_number>1
→ NO previous context loaded (middle of current planning session)
→ Continues current planning without historical interference
RULE 4: next_step_required=false (final step)
→ Stores COMPLETE PLAN summary in conversation memory
→ Returns continuation_id for future planning sessions
=== CONCRETE EXAMPLE ===
FIRST PLANNING SESSION (Feature A):
Call 1: planner(step="Plan user authentication", step_number=1, total_steps=3, next_step_required=true)
→ NEW thread created: "uuid-abc123"
→ Response: {"step_number": 1, "continuation_id": "uuid-abc123"}
Call 2: planner(step="Design login flow", step_number=2, total_steps=3, next_step_required=true, continuation_id="uuid-abc123")
→ Middle of current plan - NO context loading
→ Response: {"step_number": 2, "continuation_id": "uuid-abc123"}
Call 3: planner(step="Security implementation", step_number=3, total_steps=3, next_step_required=FALSE, continuation_id="uuid-abc123")
→ FINAL STEP: Stores "COMPLETE PLAN: Security implementation (3 steps completed)"
→ Response: {"step_number": 3, "planning_complete": true, "continuation_id": "uuid-abc123"}
LATER PLANNING SESSION (Feature B):
Call 1: planner(step="Plan dashboard system", step_number=1, total_steps=2, next_step_required=true, continuation_id="uuid-abc123")
→ Loads previous complete plan as context
→ Response includes: "=== PREVIOUS COMPLETE PLAN CONTEXT === Security implementation..."
→ Claude sees previous work and can build upon it
Call 2: planner(step="Dashboard widgets", step_number=2, total_steps=2, next_step_required=FALSE, continuation_id="uuid-abc123")
→ FINAL STEP: Stores new complete plan summary
→ Both planning sessions now available for future continuations
This enables Claude to say: "Continue planning feature C using the authentication and dashboard work"
and the tool will provide context from both previous completed planning sessions.
Perfect for: complex project planning, system design with unknowns, migration strategies,
architectural decisions, and breaking down large problems into manageable steps.
"""
import json
import logging
from typing import TYPE_CHECKING, Any, Optional
from pydantic import Field
from pydantic import Field, field_validator
if TYPE_CHECKING:
from tools.models import ToolModelCategory
from config import TEMPERATURE_BALANCED
from systemprompts import PLANNER_PROMPT
from tools.shared.base_models import WorkflowRequest
from .base import BaseTool, ToolRequest
from .workflow.base import WorkflowTool
logger = logging.getLogger(__name__)
# Field descriptions to avoid duplication between Pydantic and JSON schema
# Tool-specific field descriptions matching original planner tool
PLANNER_FIELD_DESCRIPTIONS = {
# Interactive planning fields for step-by-step planning
"step": (
"Your current planning step. For the first step, describe the task/problem to plan and be extremely expressive "
"so that subsequent steps can break this down into simpler steps. "
@@ -91,25 +54,11 @@ PLANNER_FIELD_DESCRIPTIONS = {
"branch_from_step": "If is_branch_point is true, which step number is the branching point",
"branch_id": "Identifier for the current branch (e.g., 'approach-A', 'microservices-path')",
"more_steps_needed": "True if more steps are needed beyond the initial estimate",
"continuation_id": "Thread continuation ID for multi-turn planning sessions (useful for seeding new plans with prior context)",
}
class PlanStep:
"""Represents a single step in the planning process."""
def __init__(
self, step_number: int, content: str, branch_id: Optional[str] = None, parent_step: Optional[int] = None
):
self.step_number = step_number
self.content = content
self.branch_id = branch_id or "main"
self.parent_step = parent_step
self.children = []
class PlannerRequest(ToolRequest):
"""Request model for the planner tool - interactive step-by-step planning."""
class PlannerRequest(WorkflowRequest):
"""Request model for planner workflow tool matching original planner exactly"""
# Required fields for each planning step
step: str = Field(..., description=PLANNER_FIELD_DESCRIPTIONS["step"])
@@ -117,7 +66,7 @@ class PlannerRequest(ToolRequest):
total_steps: int = Field(..., description=PLANNER_FIELD_DESCRIPTIONS["total_steps"])
next_step_required: bool = Field(..., description=PLANNER_FIELD_DESCRIPTIONS["next_step_required"])
# Optional revision/branching fields
# Optional revision/branching fields (planning-specific)
is_step_revision: Optional[bool] = Field(False, description=PLANNER_FIELD_DESCRIPTIONS["is_step_revision"])
revises_step_number: Optional[int] = Field(None, description=PLANNER_FIELD_DESCRIPTIONS["revises_step_number"])
is_branch_point: Optional[bool] = Field(False, description=PLANNER_FIELD_DESCRIPTIONS["is_branch_point"])
@@ -125,23 +74,58 @@ class PlannerRequest(ToolRequest):
branch_id: Optional[str] = Field(None, description=PLANNER_FIELD_DESCRIPTIONS["branch_id"])
more_steps_needed: Optional[bool] = Field(False, description=PLANNER_FIELD_DESCRIPTIONS["more_steps_needed"])
# Optional continuation field
continuation_id: Optional[str] = Field(None, description=PLANNER_FIELD_DESCRIPTIONS["continuation_id"])
# Exclude all investigation/analysis fields that aren't relevant to planning
findings: str = Field(
default="", exclude=True, description="Not used for planning - step content serves as findings"
)
files_checked: list[str] = Field(default_factory=list, exclude=True, description="Planning doesn't examine files")
relevant_files: list[str] = Field(default_factory=list, exclude=True, description="Planning doesn't use files")
relevant_context: list[str] = Field(
default_factory=list, exclude=True, description="Planning doesn't track code context"
)
issues_found: list[dict] = Field(default_factory=list, exclude=True, description="Planning doesn't find issues")
confidence: str = Field(default="planning", exclude=True, description="Planning uses different confidence model")
hypothesis: Optional[str] = Field(default=None, exclude=True, description="Planning doesn't use hypothesis")
backtrack_from_step: Optional[int] = Field(default=None, exclude=True, description="Planning uses revision instead")
# Override inherited fields to exclude them from schema
model: Optional[str] = Field(default=None, exclude=True)
# Exclude other non-planning fields
temperature: Optional[float] = Field(default=None, exclude=True)
thinking_mode: Optional[str] = Field(default=None, exclude=True)
use_websearch: Optional[bool] = Field(default=None, exclude=True)
images: Optional[list] = Field(default=None, exclude=True)
use_assistant_model: Optional[bool] = Field(default=False, exclude=True, description="Planning is self-contained")
images: Optional[list] = Field(default=None, exclude=True, description="Planning doesn't use images")
@field_validator("step_number")
@classmethod
def validate_step_number(cls, v):
if v < 1:
raise ValueError("step_number must be at least 1")
return v
@field_validator("total_steps")
@classmethod
def validate_total_steps(cls, v):
if v < 1:
raise ValueError("total_steps must be at least 1")
return v
class PlannerTool(BaseTool):
"""Sequential planning tool with step-by-step breakdown and refinement."""
class PlannerTool(WorkflowTool):
"""
Planner workflow tool for step-by-step planning using the workflow architecture.
This tool provides the same planning capabilities as the original planner tool
but uses the new workflow architecture for consistency with other workflow tools.
It maintains all the original functionality including:
- Sequential step-by-step planning
- Branching and revision capabilities
- Deep thinking pauses for complex plans
- Conversation memory integration
- Self-contained operation (no expert analysis)
"""
def __init__(self):
super().__init__()
self.step_history = []
self.branches = {}
def get_name(self) -> str:
@@ -172,351 +156,381 @@ class PlannerTool(BaseTool):
"migration strategies, architectural decisions, problem decomposition."
)
def get_input_schema(self) -> dict[str, Any]:
schema = {
"type": "object",
"properties": {
# Interactive planning fields
"step": {
"type": "string",
"description": PLANNER_FIELD_DESCRIPTIONS["step"],
},
"step_number": {
"type": "integer",
"description": PLANNER_FIELD_DESCRIPTIONS["step_number"],
"minimum": 1,
},
"total_steps": {
"type": "integer",
"description": PLANNER_FIELD_DESCRIPTIONS["total_steps"],
"minimum": 1,
},
"next_step_required": {
"type": "boolean",
"description": PLANNER_FIELD_DESCRIPTIONS["next_step_required"],
},
"is_step_revision": {
"type": "boolean",
"description": PLANNER_FIELD_DESCRIPTIONS["is_step_revision"],
},
"revises_step_number": {
"type": "integer",
"description": PLANNER_FIELD_DESCRIPTIONS["revises_step_number"],
"minimum": 1,
},
"is_branch_point": {
"type": "boolean",
"description": PLANNER_FIELD_DESCRIPTIONS["is_branch_point"],
},
"branch_from_step": {
"type": "integer",
"description": PLANNER_FIELD_DESCRIPTIONS["branch_from_step"],
"minimum": 1,
},
"branch_id": {
"type": "string",
"description": PLANNER_FIELD_DESCRIPTIONS["branch_id"],
},
"more_steps_needed": {
"type": "boolean",
"description": PLANNER_FIELD_DESCRIPTIONS["more_steps_needed"],
},
"continuation_id": {
"type": "string",
"description": PLANNER_FIELD_DESCRIPTIONS["continuation_id"],
},
},
# Required fields for interactive planning
"required": ["step", "step_number", "total_steps", "next_step_required"],
}
return schema
def get_system_prompt(self) -> str:
return PLANNER_PROMPT
def get_request_model(self):
return PlannerRequest
def get_default_temperature(self) -> float:
return TEMPERATURE_BALANCED
def get_model_category(self) -> "ToolModelCategory":
"""Planner requires deep analysis and reasoning"""
from tools.models import ToolModelCategory
return ToolModelCategory.EXTENDED_REASONING # Planning benefits from deep thinking
def get_default_thinking_mode(self) -> str:
return "high" # Default to high thinking for comprehensive planning
return ToolModelCategory.EXTENDED_REASONING
def requires_model(self) -> bool:
"""
Planner tool doesn't require AI model access - it's pure data processing.
Planner tool doesn't require model resolution at the MCP boundary.
This prevents the server from trying to resolve model names like "auto"
when the planner tool is used, since it overrides execute() and doesn't
make any AI API calls.
The planner is a pure data processing tool that organizes planning steps
and provides structured guidance without calling external AI models.
Returns:
bool: False - planner doesn't need AI model access
"""
return False
async def execute(self, arguments: dict[str, Any]) -> list:
def get_workflow_request_model(self):
"""Return the planner-specific request model."""
return PlannerRequest
def get_tool_fields(self) -> dict[str, dict[str, Any]]:
"""Return planning-specific field definitions beyond the standard workflow fields."""
return {
# Planning-specific optional fields
"is_step_revision": {
"type": "boolean",
"description": PLANNER_FIELD_DESCRIPTIONS["is_step_revision"],
},
"revises_step_number": {
"type": "integer",
"minimum": 1,
"description": PLANNER_FIELD_DESCRIPTIONS["revises_step_number"],
},
"is_branch_point": {
"type": "boolean",
"description": PLANNER_FIELD_DESCRIPTIONS["is_branch_point"],
},
"branch_from_step": {
"type": "integer",
"minimum": 1,
"description": PLANNER_FIELD_DESCRIPTIONS["branch_from_step"],
},
"branch_id": {
"type": "string",
"description": PLANNER_FIELD_DESCRIPTIONS["branch_id"],
},
"more_steps_needed": {
"type": "boolean",
"description": PLANNER_FIELD_DESCRIPTIONS["more_steps_needed"],
},
}
def get_input_schema(self) -> dict[str, Any]:
"""Generate input schema using WorkflowSchemaBuilder with field exclusion."""
from .workflow.schema_builders import WorkflowSchemaBuilder
# Exclude investigation-specific fields that planning doesn't need
excluded_workflow_fields = [
"findings", # Planning uses step content instead
"files_checked", # Planning doesn't examine files
"relevant_files", # Planning doesn't use files
"relevant_context", # Planning doesn't track code context
"issues_found", # Planning doesn't find issues
"confidence", # Planning uses different confidence model
"hypothesis", # Planning doesn't use hypothesis
"backtrack_from_step", # Planning uses revision instead
]
# Exclude common fields that planning doesn't need
excluded_common_fields = [
"temperature", # Planning doesn't need temperature control
"thinking_mode", # Planning doesn't need thinking mode
"use_websearch", # Planning doesn't need web search
"images", # Planning doesn't use images
"files", # Planning doesn't use files
]
return WorkflowSchemaBuilder.build_schema(
tool_specific_fields=self.get_tool_fields(),
required_fields=[], # No additional required fields beyond workflow defaults
model_field_schema=self.get_model_field_schema(),
auto_mode=self.is_effective_auto_mode(),
tool_name=self.get_name(),
excluded_workflow_fields=excluded_workflow_fields,
excluded_common_fields=excluded_common_fields,
)
# ================================================================================
# Abstract Methods - Required Implementation from BaseWorkflowMixin
# ================================================================================
def get_required_actions(self, step_number: int, confidence: str, findings: str, total_steps: int) -> list[str]:
"""Define required actions for each planning phase."""
if step_number == 1:
# Initial planning tasks
return [
"Think deeply about the complete scope and complexity of what needs to be planned",
"Consider multiple approaches and their trade-offs",
"Identify key constraints, dependencies, and potential challenges",
"Think about stakeholders, success criteria, and critical requirements",
]
elif step_number <= 3 and total_steps >= 5:
# Complex plan early stages - force deep thinking
if step_number == 2:
return [
"Evaluate the approach from step 1 - are there better alternatives?",
"Break down the major phases and identify critical decision points",
"Consider resource requirements and potential bottlenecks",
"Think about how different parts interconnect and affect each other",
]
else: # step_number == 3
return [
"Validate that the emerging plan addresses the original requirements",
"Identify any gaps or assumptions that need clarification",
"Consider how to validate progress and adjust course if needed",
"Think about what the first concrete steps should be",
]
else:
# Later steps or simple plans
return [
"Continue developing the plan with concrete, actionable steps",
"Consider implementation details and practical considerations",
"Think about how to sequence and coordinate different activities",
"Prepare for execution planning and resource allocation",
]
def should_call_expert_analysis(self, consolidated_findings, request=None) -> bool:
"""Planner is self-contained and doesn't need expert analysis."""
return False
def prepare_expert_analysis_context(self, consolidated_findings) -> str:
"""Planner doesn't use expert analysis."""
return ""
def requires_expert_analysis(self) -> bool:
"""Planner is self-contained like the original planner tool."""
return False
# ================================================================================
# Workflow Customization - Match Original Planner Behavior
# ================================================================================
def prepare_step_data(self, request) -> dict:
"""
Override execute to work like original TypeScript tool - no AI calls, just data processing.
This method implements the core continuation logic that enables multi-session planning:
CONTINUATION LOGIC:
1. If no continuation_id + step_number=1: Create new planning thread
2. If continuation_id + step_number=1: Load previous complete plan as context for NEW planning
3. If continuation_id + step_number>1: Continue current plan (no context loading)
4. If next_step_required=false: Mark complete and store plan summary for future use
CONVERSATION MEMORY INTEGRATION:
- Each step is stored in conversation memory for cross-tool continuation
- Final steps store COMPLETE PLAN summaries that can be loaded as context
- Only step 1 with continuation_id loads previous context (new planning session)
- Steps 2+ with continuation_id continue current session without context interference
Prepare step data from request with planner-specific fields.
"""
from mcp.types import TextContent
step_data = {
"step": request.step,
"step_number": request.step_number,
"findings": f"Planning step {request.step_number}: {request.step}", # Use step content as findings
"files_checked": [], # Planner doesn't check files
"relevant_files": [], # Planner doesn't use files
"relevant_context": [], # Planner doesn't track context like debug
"issues_found": [], # Planner doesn't track issues
"confidence": "planning", # Planning confidence is different from investigation
"hypothesis": None, # Planner doesn't use hypothesis
"images": [], # Planner doesn't use images
# Planner-specific fields
"is_step_revision": request.is_step_revision or False,
"revises_step_number": request.revises_step_number,
"is_branch_point": request.is_branch_point or False,
"branch_from_step": request.branch_from_step,
"branch_id": request.branch_id,
"more_steps_needed": request.more_steps_needed or False,
}
return step_data
from utils.conversation_memory import add_turn, create_thread, get_thread
def build_base_response(self, request, continuation_id: str = None) -> dict:
"""
Build the base response structure with planner-specific fields.
"""
# Use work_history from workflow mixin for consistent step tracking
# Add 1 to account for current step being processed
current_step_count = len(self.work_history) + 1
try:
# Validate request like the original
request_model = self.get_request_model()
request = request_model(**arguments)
# Process step like original TypeScript tool
if request.step_number > request.total_steps:
request.total_steps = request.step_number
# === CONTINUATION LOGIC IMPLEMENTATION ===
# This implements the 4 rules documented in the module docstring
continuation_id = request.continuation_id
previous_plan_context = ""
# RULE 1: No continuation_id + step_number=1 → Create NEW planning thread
if not continuation_id and request.step_number == 1:
# Filter arguments to only include serializable data for conversation memory
serializable_args = {
k: v
for k, v in arguments.items()
if not hasattr(v, "__class__") or v.__class__.__module__ != "utils.model_context"
}
continuation_id = create_thread("planner", serializable_args)
# Result: New thread created, no previous context, returns continuation_id
# RULE 2: continuation_id + step_number=1 → Load PREVIOUS COMPLETE PLAN as context
elif continuation_id and request.step_number == 1:
thread = get_thread(continuation_id)
if thread:
# Search for most recent COMPLETE PLAN from previous planning sessions
for turn in reversed(thread.turns): # Newest first
if turn.tool_name == "planner" and turn.role == "assistant":
# Try to parse as JSON first (new format)
try:
turn_data = json.loads(turn.content)
if isinstance(turn_data, dict) and turn_data.get("planning_complete"):
# New JSON format
plan_summary = turn_data.get("plan_summary", "")
if plan_summary:
previous_plan_context = plan_summary[:500]
break
except (json.JSONDecodeError, ValueError):
# Fallback to old text format
if "planning_complete" in turn.content:
try:
if "COMPLETE PLAN:" in turn.content:
plan_start = turn.content.find("COMPLETE PLAN:")
previous_plan_context = turn.content[plan_start : plan_start + 500] + "..."
else:
previous_plan_context = turn.content[:300] + "..."
break
except Exception:
pass
if previous_plan_context:
previous_plan_context = f"\\n\\n=== PREVIOUS COMPLETE PLAN CONTEXT ===\\n{previous_plan_context}\\n=== END CONTEXT ===\\n"
# Result: NEW planning session with previous complete plan as context
# RULE 3: continuation_id + step_number>1 → Continue current plan (no context loading)
# This case is handled by doing nothing - we're in the middle of current planning
# Result: Current planning continues without historical interference
step_data = {
"step": request.step,
"step_number": request.step_number,
"total_steps": request.total_steps,
"next_step_required": request.next_step_required,
"is_step_revision": request.is_step_revision,
response_data = {
"status": f"{self.get_name()}_in_progress",
"step_number": request.step_number,
"total_steps": request.total_steps,
"next_step_required": request.next_step_required,
"step_content": request.step,
f"{self.get_name()}_status": {
"files_checked": len(self.consolidated_findings.files_checked),
"relevant_files": len(self.consolidated_findings.relevant_files),
"relevant_context": len(self.consolidated_findings.relevant_context),
"issues_found": len(self.consolidated_findings.issues_found),
"images_collected": len(self.consolidated_findings.images),
"current_confidence": self.get_request_confidence(request),
"step_history_length": current_step_count, # Use work_history + current step
},
"metadata": {
"branches": list(self.branches.keys()),
"step_history_length": current_step_count, # Use work_history + current step
"is_step_revision": request.is_step_revision or False,
"revises_step_number": request.revises_step_number,
"is_branch_point": request.is_branch_point,
"is_branch_point": request.is_branch_point or False,
"branch_from_step": request.branch_from_step,
"branch_id": request.branch_id,
"more_steps_needed": request.more_steps_needed,
"continuation_id": request.continuation_id,
}
"more_steps_needed": request.more_steps_needed or False,
},
}
# Store in local history like original
self.step_history.append(step_data)
if continuation_id:
response_data["continuation_id"] = continuation_id
# Handle branching like original
if request.is_branch_point and request.branch_from_step and request.branch_id:
if request.branch_id not in self.branches:
self.branches[request.branch_id] = []
self.branches[request.branch_id].append(step_data)
return response_data
# Build structured JSON response like other tools (consensus, refactor)
response_data = {
"status": "planning_success",
"step_number": request.step_number,
"total_steps": request.total_steps,
"next_step_required": request.next_step_required,
"step_content": request.step,
"metadata": {
"branches": list(self.branches.keys()),
"step_history_length": len(self.step_history),
"is_step_revision": request.is_step_revision or False,
"revises_step_number": request.revises_step_number,
"is_branch_point": request.is_branch_point or False,
"branch_from_step": request.branch_from_step,
"branch_id": request.branch_id,
"more_steps_needed": request.more_steps_needed or False,
},
"output": {
"instructions": "This is a structured planning response. Present the step_content as the main planning analysis. If next_step_required is true, continue with the next step. If planning_complete is true, present the complete plan in a well-structured format with clear sections, headings, numbered steps, and visual elements like ASCII charts for phases/dependencies. Use bullet points, sub-steps, sequences, and visual organization to make complex plans easy to understand and follow. IMPORTANT: Do NOT use emojis - use clear text formatting and ASCII characters only. Do NOT mention time estimates or costs unless explicitly requested.",
"format": "step_by_step_planning",
"presentation_guidelines": {
"completed_plans": "Use clear headings, numbered phases, ASCII diagrams for workflows/dependencies, bullet points for sub-tasks, and visual sequences where helpful. No emojis. No time/cost estimates unless requested.",
"step_content": "Present as main analysis with clear structure and actionable insights. No emojis. No time/cost estimates unless requested.",
"continuation": "Use continuation_id for related planning sessions or implementation planning",
},
},
}
def handle_work_continuation(self, response_data: dict, request) -> dict:
"""
Handle work continuation with planner-specific deep thinking pauses.
"""
response_data["status"] = f"pause_for_{self.get_name()}"
response_data[f"{self.get_name()}_required"] = True
# Always include continuation_id if we have one (enables step chaining within session)
if continuation_id:
response_data["continuation_id"] = continuation_id
# Get planner-specific required actions
required_actions = self.get_required_actions(request.step_number, "planning", request.step, request.total_steps)
response_data["required_actions"] = required_actions
# Add previous plan context if available
if previous_plan_context:
response_data["previous_plan_context"] = previous_plan_context.strip()
# Enhanced deep thinking pauses for complex plans
if request.total_steps >= 5 and request.step_number <= 3:
response_data["status"] = "pause_for_deep_thinking"
response_data["thinking_required"] = True
response_data["required_thinking"] = required_actions
# RULE 4: next_step_required=false → Mark complete and store plan summary
if not request.next_step_required:
response_data["planning_complete"] = True
response_data["plan_summary"] = (
f"COMPLETE PLAN: {request.step} (Total {request.total_steps} steps completed)"
)
if request.step_number == 1:
response_data["next_steps"] = (
"Planning complete. Present the complete plan to the user in a well-structured format with clear sections, "
"numbered steps, visual elements (ASCII charts/diagrams where helpful), sub-step breakdowns, and implementation guidance. "
"Use headings, bullet points, and visual organization to make the plan easy to follow. "
"If there are phases, dependencies, or parallel tracks, show these relationships visually. "
"IMPORTANT: Do NOT use emojis - use clear text formatting and ASCII characters only. "
"Do NOT mention time estimates or costs unless explicitly requested. "
"After presenting the plan, offer to either help implement specific parts or use the continuation_id to start related planning sessions."
f"MANDATORY: DO NOT call the {self.get_name()} tool again immediately. This is a complex plan ({request.total_steps} steps) "
f"that requires deep thinking. You MUST first spend time reflecting on the planning challenge:\n\n"
f"REQUIRED DEEP THINKING before calling {self.get_name()} step {request.step_number + 1}:\n"
f"1. Analyze the FULL SCOPE: What exactly needs to be accomplished?\n"
f"2. Consider MULTIPLE APPROACHES: What are 2-3 different ways to tackle this?\n"
f"3. Identify CONSTRAINTS & DEPENDENCIES: What limits our options?\n"
f"4. Think about SUCCESS CRITERIA: How will we know we've succeeded?\n"
f"5. Consider RISKS & MITIGATION: What could go wrong early vs late?\n\n"
f"Only call {self.get_name()} again with step_number: {request.step_number + 1} AFTER this deep analysis."
)
# Result: Planning marked complete, summary stored for future context loading
else:
response_data["planning_complete"] = False
remaining_steps = request.total_steps - request.step_number
# ENHANCED: Add deep thinking pauses for complex plans in early stages
# Only for complex plans (>=5 steps) and first 3 steps - force deep reflection
if request.total_steps >= 5 and request.step_number <= 3:
response_data["status"] = "pause_for_deep_thinking"
response_data["thinking_required"] = True
if request.step_number == 1:
# Initial deep thinking - understand the full scope
response_data["required_thinking"] = [
"Analyze the complete scope and complexity of what needs to be planned",
"Consider multiple approaches and their trade-offs",
"Identify key constraints, dependencies, and potential challenges",
"Think about stakeholders, success criteria, and critical requirements",
"Consider what could go wrong and how to mitigate risks early",
]
response_data["next_steps"] = (
f"MANDATORY: DO NOT call the planner tool again immediately. This is a complex plan ({request.total_steps} steps) "
f"that requires deep thinking. You MUST first spend time reflecting on the planning challenge:\n\n"
f"REQUIRED DEEP THINKING before calling planner step {request.step_number + 1}:\n"
f"1. Analyze the FULL SCOPE: What exactly needs to be accomplished?\n"
f"2. Consider MULTIPLE APPROACHES: What are 2-3 different ways to tackle this?\n"
f"3. Identify CONSTRAINTS & DEPENDENCIES: What limits our options?\n"
f"4. Think about SUCCESS CRITERIA: How will we know we've succeeded?\n"
f"5. Consider RISKS & MITIGATION: What could go wrong early vs late?\n\n"
f"Only call planner again with step_number: {request.step_number + 1} AFTER this deep analysis."
)
elif request.step_number == 2:
# Refine approach - dig deeper into the chosen direction
response_data["required_thinking"] = [
"Evaluate the approach from step 1 - are there better alternatives?",
"Break down the major phases and identify critical decision points",
"Consider resource requirements and potential bottlenecks",
"Think about how different parts interconnect and affect each other",
"Identify areas that need the most careful planning vs quick wins",
]
response_data["next_steps"] = (
f"STOP! Complex planning requires reflection between steps. DO NOT call planner immediately.\n\n"
f"MANDATORY REFLECTION before planner step {request.step_number + 1}:\n"
f"1. EVALUATE YOUR APPROACH: Is the direction from step 1 still the best?\n"
f"2. IDENTIFY MAJOR PHASES: What are the 3-5 main chunks of work?\n"
f"3. SPOT DEPENDENCIES: What must happen before what?\n"
f"4. CONSIDER RESOURCES: What skills, tools, or access do we need?\n"
f"5. FIND CRITICAL PATHS: Where could delays hurt the most?\n\n"
f"Think deeply about these aspects, then call planner with step_number: {request.step_number + 1}."
)
elif request.step_number == 3:
# Final deep thinking - validate and prepare for execution planning
response_data["required_thinking"] = [
"Validate that the emerging plan addresses the original requirements",
"Identify any gaps or assumptions that need clarification",
"Consider how to validate progress and adjust course if needed",
"Think about what the first concrete steps should be",
"Prepare for transition from strategic to tactical planning",
]
response_data["next_steps"] = (
f"PAUSE for final strategic reflection. DO NOT call planner yet.\n\n"
f"FINAL DEEP THINKING before planner step {request.step_number + 1}:\n"
f"1. VALIDATE COMPLETENESS: Does this plan address all original requirements?\n"
f"2. CHECK FOR GAPS: What assumptions need validation? What's unclear?\n"
f"3. PLAN FOR ADAPTATION: How will we know if we need to change course?\n"
f"4. DEFINE FIRST STEPS: What are the first 2-3 concrete actions?\n"
f"5. TRANSITION MINDSET: Ready to shift from strategic to tactical planning?\n\n"
f"After this reflection, call planner with step_number: {request.step_number + 1} to continue with tactical details."
)
else:
# Normal flow for simple plans or later steps of complex plans
response_data["next_steps"] = (
f"Continue with step {request.step_number + 1}. Approximately {remaining_steps} steps remaining."
)
# Result: Intermediate step, planning continues (with optional deep thinking pause)
# Convert to clean JSON response
response_content = json.dumps(response_data, indent=2)
# Store this step in conversation memory
if continuation_id:
add_turn(
thread_id=continuation_id,
role="assistant",
content=response_content,
tool_name="planner",
model_name="claude-planner",
elif request.step_number == 2:
response_data["next_steps"] = (
f"STOP! Complex planning requires reflection between steps. DO NOT call {self.get_name()} immediately.\n\n"
f"MANDATORY REFLECTION before {self.get_name()} step {request.step_number + 1}:\n"
f"1. EVALUATE YOUR APPROACH: Is the direction from step 1 still the best?\n"
f"2. IDENTIFY MAJOR PHASES: What are the 3-5 main chunks of work?\n"
f"3. SPOT DEPENDENCIES: What must happen before what?\n"
f"4. CONSIDER RESOURCES: What skills, tools, or access do we need?\n"
f"5. FIND CRITICAL PATHS: Where could delays hurt the most?\n\n"
f"Think deeply about these aspects, then call {self.get_name()} with step_number: {request.step_number + 1}."
)
elif request.step_number == 3:
response_data["next_steps"] = (
f"PAUSE for final strategic reflection. DO NOT call {self.get_name()} yet.\n\n"
f"FINAL DEEP THINKING before {self.get_name()} step {request.step_number + 1}:\n"
f"1. VALIDATE COMPLETENESS: Does this plan address all original requirements?\n"
f"2. CHECK FOR GAPS: What assumptions need validation? What's unclear?\n"
f"3. PLAN FOR ADAPTATION: How will we know if we need to change course?\n"
f"4. DEFINE FIRST STEPS: What are the first 2-3 concrete actions?\n"
f"5. TRANSITION MINDSET: Ready to shift from strategic to tactical planning?\n\n"
f"After this reflection, call {self.get_name()} with step_number: {request.step_number + 1} to continue with tactical details."
)
else:
# Normal flow for simple plans or later steps
remaining_steps = request.total_steps - request.step_number
response_data["next_steps"] = (
f"Continue with step {request.step_number + 1}. Approximately {remaining_steps} steps remaining."
)
# Return the JSON response directly as text content, like consensus tool
return [TextContent(type="text", text=response_content)]
return response_data
except Exception as e:
# Error handling - return JSON directly like consensus tool
error_data = {"error": str(e), "status": "planning_failed"}
return [TextContent(type="text", text=json.dumps(error_data, indent=2))]
def customize_workflow_response(self, response_data: dict, request) -> dict:
"""
Customize response to match original planner tool format.
"""
# No need to append to step_history since workflow mixin already manages work_history
# and we calculate step counts from work_history
# Stub implementations for abstract methods (not used since we override execute)
async def prepare_prompt(self, request: PlannerRequest) -> str:
return "" # Not used - execute() is overridden
# Handle branching like original planner
if request.is_branch_point and request.branch_from_step and request.branch_id:
if request.branch_id not in self.branches:
self.branches[request.branch_id] = []
step_data = self.prepare_step_data(request)
self.branches[request.branch_id].append(step_data)
def format_response(self, response: str, request: PlannerRequest, model_info: dict = None) -> str:
return response # Not used - execute() is overridden
# Update metadata to reflect the new branch
if "metadata" in response_data:
response_data["metadata"]["branches"] = list(self.branches.keys())
# Add planner-specific output instructions for final steps
if not request.next_step_required:
response_data["planning_complete"] = True
response_data["plan_summary"] = (
f"COMPLETE PLAN: {request.step} (Total {request.total_steps} steps completed)"
)
response_data["output"] = {
"instructions": "This is a structured planning response. Present the step_content as the main planning analysis. If next_step_required is true, continue with the next step. If planning_complete is true, present the complete plan in a well-structured format with clear sections, headings, numbered steps, and visual elements like ASCII charts for phases/dependencies. Use bullet points, sub-steps, sequences, and visual organization to make complex plans easy to understand and follow. IMPORTANT: Do NOT use emojis - use clear text formatting and ASCII characters only. Do NOT mention time estimates or costs unless explicitly requested.",
"format": "step_by_step_planning",
"presentation_guidelines": {
"completed_plans": "Use clear headings, numbered phases, ASCII diagrams for workflows/dependencies, bullet points for sub-tasks, and visual sequences where helpful. No emojis. No time/cost estimates unless requested.",
"step_content": "Present as main analysis with clear structure and actionable insights. No emojis. No time/cost estimates unless requested.",
"continuation": "Use continuation_id for related planning sessions or implementation planning",
},
}
response_data["next_steps"] = (
"Planning complete. Present the complete plan to the user in a well-structured format with clear sections, "
"numbered steps, visual elements (ASCII charts/diagrams where helpful), sub-step breakdowns, and implementation guidance. "
"Use headings, bullet points, and visual organization to make the plan easy to follow. "
"If there are phases, dependencies, or parallel tracks, show these relationships visually. "
"IMPORTANT: Do NOT use emojis - use clear text formatting and ASCII characters only. "
"Do NOT mention time estimates or costs unless explicitly requested. "
"After presenting the plan, offer to either help implement specific parts or use the continuation_id to start related planning sessions."
)
# Convert generic status names to planner-specific ones
tool_name = self.get_name()
status_mapping = {
f"{tool_name}_in_progress": "planning_success",
f"pause_for_{tool_name}": f"pause_for_{tool_name}", # Keep the full tool name for workflow consistency
f"{tool_name}_required": f"{tool_name}_required", # Keep the full tool name for workflow consistency
f"{tool_name}_complete": f"{tool_name}_complete", # Keep the full tool name for workflow consistency
}
if response_data["status"] in status_mapping:
response_data["status"] = status_mapping[response_data["status"]]
return response_data
# ================================================================================
# Hook Method Overrides for Planner-Specific Behavior
# ================================================================================
def get_completion_status(self) -> str:
"""Planner uses planning-specific status."""
return "planning_complete"
def get_completion_data_key(self) -> str:
"""Planner uses 'complete_planning' key."""
return "complete_planning"
def get_completion_message(self) -> str:
"""Planner-specific completion message."""
return (
"Planning complete. Present the complete plan to the user in a well-structured format "
"and offer to help implement specific parts or start related planning sessions."
)
def get_skip_reason(self) -> str:
"""Planner-specific skip reason."""
return "Planner is self-contained and completes planning without external analysis"
def get_skip_expert_analysis_status(self) -> str:
"""Planner-specific expert analysis skip status."""
return "skipped_by_tool_design"
def store_initial_issue(self, step_description: str):
"""Store initial planning description."""
self.initial_planning_description = step_description
def get_initial_request(self, fallback_step: str) -> str:
"""Get initial planning description."""
try:
return self.initial_planning_description
except AttributeError:
return fallback_step
# Required abstract methods from BaseTool
def get_request_model(self):
"""Return the planner-specific request model."""
return PlannerRequest
async def prepare_prompt(self, request) -> str:
"""Not used - workflow tools use execute_workflow()."""
return "" # Workflow tools use execute_workflow() directly

File diff suppressed because it is too large

File diff suppressed because it is too large

tools/shared/__init__.py (new file)

@@ -0,0 +1,19 @@
"""
Shared infrastructure for Zen MCP tools.
This module contains the core base classes and utilities that are shared
across all tool types. It provides the foundation for the tool architecture.
"""
from .base_models import BaseWorkflowRequest, ConsolidatedFindings, ToolRequest, WorkflowRequest
from .base_tool import BaseTool
from .schema_builders import SchemaBuilder
__all__ = [
"BaseTool",
"ToolRequest",
"BaseWorkflowRequest",
"WorkflowRequest",
"ConsolidatedFindings",
"SchemaBuilder",
]

tools/shared/base_models.py (new file)

@@ -0,0 +1,188 @@
"""
Base models for Zen MCP tools.
This module contains the shared Pydantic models used across all tools,
extracted to avoid circular imports and promote code reuse.
Key Models:
- ToolRequest: Base request model for all tools
- WorkflowRequest: Extended request model for workflow-based tools
- ConsolidatedFindings: Model for tracking workflow progress
"""
import logging
from typing import Optional
from pydantic import BaseModel, Field, field_validator
logger = logging.getLogger(__name__)
# Shared field descriptions to avoid duplication
COMMON_FIELD_DESCRIPTIONS = {
"model": (
"Model to use. See tool's input schema for available models and their capabilities. "
"Use 'auto' to let Claude select the best model for the task."
),
"temperature": (
"Temperature for response (0.0 to 1.0). Lower values are more focused and deterministic, "
"higher values are more creative. Tool-specific defaults apply if not specified."
),
"thinking_mode": (
"Thinking depth: minimal (0.5% of model max), low (8%), medium (33%), high (67%), "
"max (100% of model max). Higher modes enable deeper reasoning at the cost of speed."
),
"use_websearch": (
"Enable web search for documentation, best practices, and current information. "
"When enabled, the model can request Claude to perform web searches and share results back "
"during conversations. Particularly useful for: brainstorming sessions, architectural design "
"discussions, exploring industry best practices, working with specific frameworks/technologies, "
"researching solutions to complex problems, or when current documentation and community insights "
"would enhance the analysis."
),
"continuation_id": (
"Thread continuation ID for multi-turn conversations. When provided, the complete conversation "
"history is automatically embedded as context. Your response should build upon this history "
"without repeating previous analysis or instructions. Focus on providing only new insights, "
"additional findings, or answers to follow-up questions. Can be used across different tools."
),
"images": (
"Optional image(s) for visual context. Accepts absolute file paths or "
"base64 data URLs. Only provide when user explicitly mentions images. "
"When including images, please describe what you believe each image contains "
"to aid with contextual understanding. Useful for UI discussions, diagrams, "
"visual problems, error screens, architecture mockups, and visual analysis tasks."
),
"files": ("Optional files for context (must be FULL absolute paths to real files / folders - DO NOT SHORTEN)"),
}
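# Illustrative sketch (not part of this diff): how the thinking_mode fractions described
# above translate into token budgets. The 32,000-token maximum is a hypothetical figure
# chosen only for the arithmetic; real limits come from the selected model's capabilities.
THINKING_MODE_FRACTIONS = {"minimal": 0.005, "low": 0.08, "medium": 0.33, "high": 0.67, "max": 1.0}
hypothetical_model_max_thinking_tokens = 32_000
thinking_budgets = {
    mode: int(hypothetical_model_max_thinking_tokens * fraction)
    for mode, fraction in THINKING_MODE_FRACTIONS.items()
}
# -> {'minimal': 160, 'low': 2560, 'medium': 10560, 'high': 21440, 'max': 32000}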
# Workflow-specific field descriptions
WORKFLOW_FIELD_DESCRIPTIONS = {
"step": "Current work step content and findings from your overall work",
"step_number": "Current step number in the work sequence (starts at 1)",
"total_steps": "Estimated total steps needed to complete the work",
"next_step_required": "Whether another work step is needed after this one",
"findings": "Important findings, evidence and insights discovered in this step of the work",
"files_checked": "List of files examined during this work step",
"relevant_files": "Files identified as relevant to the issue/goal",
"relevant_context": "Methods/functions identified as involved in the issue",
"issues_found": "Issues identified with severity levels during work",
"confidence": "Confidence level in findings: exploring, low, medium, high, certain",
"hypothesis": "Current theory about the issue/goal based on work",
"backtrack_from_step": "Step number to backtrack from if work needs revision",
"use_assistant_model": (
"Whether to use assistant model for expert analysis after completing the workflow steps. "
"Set to False to skip expert analysis and rely solely on Claude's investigation. "
"Defaults to True for comprehensive validation."
),
}
class ToolRequest(BaseModel):
"""
Base request model for all Zen MCP tools.
This model defines common fields that all tools accept, including
model selection, temperature control, and conversation threading.
Tool-specific request models should inherit from this class.
"""
# Model configuration
model: Optional[str] = Field(None, description=COMMON_FIELD_DESCRIPTIONS["model"])
temperature: Optional[float] = Field(None, ge=0.0, le=1.0, description=COMMON_FIELD_DESCRIPTIONS["temperature"])
thinking_mode: Optional[str] = Field(None, description=COMMON_FIELD_DESCRIPTIONS["thinking_mode"])
# Features
use_websearch: Optional[bool] = Field(True, description=COMMON_FIELD_DESCRIPTIONS["use_websearch"])
# Conversation support
continuation_id: Optional[str] = Field(None, description=COMMON_FIELD_DESCRIPTIONS["continuation_id"])
# Visual context
images: Optional[list[str]] = Field(None, description=COMMON_FIELD_DESCRIPTIONS["images"])
class BaseWorkflowRequest(ToolRequest):
"""
Minimal base request model for workflow tools.
This provides only the essential fields that ALL workflow tools need,
allowing for maximum flexibility in tool-specific implementations.
"""
# Core workflow fields that ALL workflow tools need
step: str = Field(..., description=WORKFLOW_FIELD_DESCRIPTIONS["step"])
step_number: int = Field(..., ge=1, description=WORKFLOW_FIELD_DESCRIPTIONS["step_number"])
total_steps: int = Field(..., ge=1, description=WORKFLOW_FIELD_DESCRIPTIONS["total_steps"])
next_step_required: bool = Field(..., description=WORKFLOW_FIELD_DESCRIPTIONS["next_step_required"])
class WorkflowRequest(BaseWorkflowRequest):
"""
Extended request model for workflow-based tools.
This model extends BaseWorkflowRequest with fields specific to the workflow
pattern, where tools perform multi-step work with forced pauses between steps.
Used by: debug, precommit, codereview, refactor, thinkdeep, analyze
"""
# Required workflow fields
step: str = Field(..., description=WORKFLOW_FIELD_DESCRIPTIONS["step"])
step_number: int = Field(..., ge=1, description=WORKFLOW_FIELD_DESCRIPTIONS["step_number"])
total_steps: int = Field(..., ge=1, description=WORKFLOW_FIELD_DESCRIPTIONS["total_steps"])
next_step_required: bool = Field(..., description=WORKFLOW_FIELD_DESCRIPTIONS["next_step_required"])
# Work tracking fields
findings: str = Field(..., description=WORKFLOW_FIELD_DESCRIPTIONS["findings"])
files_checked: list[str] = Field(default_factory=list, description=WORKFLOW_FIELD_DESCRIPTIONS["files_checked"])
relevant_files: list[str] = Field(default_factory=list, description=WORKFLOW_FIELD_DESCRIPTIONS["relevant_files"])
relevant_context: list[str] = Field(
default_factory=list, description=WORKFLOW_FIELD_DESCRIPTIONS["relevant_context"]
)
issues_found: list[dict] = Field(default_factory=list, description=WORKFLOW_FIELD_DESCRIPTIONS["issues_found"])
confidence: str = Field("low", description=WORKFLOW_FIELD_DESCRIPTIONS["confidence"])
# Optional workflow fields
hypothesis: Optional[str] = Field(None, description=WORKFLOW_FIELD_DESCRIPTIONS["hypothesis"])
backtrack_from_step: Optional[int] = Field(
None, ge=1, description=WORKFLOW_FIELD_DESCRIPTIONS["backtrack_from_step"]
)
use_assistant_model: Optional[bool] = Field(True, description=WORKFLOW_FIELD_DESCRIPTIONS["use_assistant_model"])
@field_validator("files_checked", "relevant_files", "relevant_context", mode="before")
@classmethod
def convert_string_to_list(cls, v):
"""Convert string inputs to empty lists to handle malformed inputs gracefully."""
if isinstance(v, str):
logger.warning(f"Field received string '{v}' instead of list, converting to empty list")
return []
return v
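# Illustrative sketch (not part of this diff): constructing a first workflow step with the
# required fields above. The step text, findings, and file paths are hypothetical values
# chosen only to show the shape of a request.
from tools.shared.base_models import WorkflowRequest

first_step = WorkflowRequest(
    step="Investigate the login failure reported in the auth module",
    step_number=1,
    total_steps=3,
    next_step_required=True,
    findings="Initial pass: the token refresh path looks suspicious, no evidence gathered yet",
    files_checked=["/abs/path/auth/login.py"],
    relevant_files=["/abs/path/auth/login.py"],
    confidence="exploring",
)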
class ConsolidatedFindings(BaseModel):
"""
Model for tracking consolidated findings across workflow steps.
This model accumulates findings, files, methods, and issues
discovered during multi-step work. It's used by
BaseWorkflowMixin to track progress across workflow steps.
"""
files_checked: set[str] = Field(default_factory=set, description="All files examined across all steps")
relevant_files: set[str] = Field(
default_factory=set,
description="A subset of files_checked that have been identified as relevant for the work at hand",
)
relevant_context: set[str] = Field(
default_factory=set, description="All methods/functions identified during overall work being performed"
)
findings: list[str] = Field(default_factory=list, description="Chronological list of findings from each work step")
hypotheses: list[dict] = Field(default_factory=list, description="Evolution of hypotheses across work steps")
issues_found: list[dict] = Field(default_factory=list, description="All issues found with severity levels")
images: list[str] = Field(default_factory=list, description="Images collected during overall work")
confidence: str = Field("low", description="Latest confidence level from work steps")
# Tool-specific field descriptions are now declared in each tool file
# This keeps concerns separated and makes each tool self-contained
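# Illustrative sketch (not part of this diff): how a workflow mixin might accumulate step
# data into ConsolidatedFindings across steps. Paths and finding text are hypothetical.
from tools.shared.base_models import ConsolidatedFindings

consolidated = ConsolidatedFindings()
consolidated.files_checked.update({"/abs/path/auth/login.py", "/abs/path/auth/session.py"})
consolidated.relevant_files.add("/abs/path/auth/login.py")
consolidated.relevant_context.add("LoginManager.authenticate")
consolidated.findings.append("Step 1: token refresh is never retried after an expired session")
consolidated.confidence = "medium"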

tools/shared/base_tool.py (new file)

File diff suppressed because it is too large

tools/shared/schema_builders.py (new file)

@@ -0,0 +1,163 @@
"""
Core schema building functionality for Zen MCP tools.
This module provides base schema generation functionality for simple tools.
Workflow-specific schema building is located in workflow/schema_builders.py
to maintain proper separation of concerns.
"""
from typing import Any
from .base_models import COMMON_FIELD_DESCRIPTIONS
class SchemaBuilder:
"""
Base schema builder for simple MCP tools.
This class provides static methods to build consistent schemas for simple tools.
Workflow tools use WorkflowSchemaBuilder in workflow/schema_builders.py.
"""
# Common field schemas that can be reused across all tool types
COMMON_FIELD_SCHEMAS = {
"temperature": {
"type": "number",
"description": COMMON_FIELD_DESCRIPTIONS["temperature"],
"minimum": 0.0,
"maximum": 1.0,
},
"thinking_mode": {
"type": "string",
"enum": ["minimal", "low", "medium", "high", "max"],
"description": COMMON_FIELD_DESCRIPTIONS["thinking_mode"],
},
"use_websearch": {
"type": "boolean",
"description": COMMON_FIELD_DESCRIPTIONS["use_websearch"],
"default": True,
},
"continuation_id": {
"type": "string",
"description": COMMON_FIELD_DESCRIPTIONS["continuation_id"],
},
"images": {
"type": "array",
"items": {"type": "string"},
"description": COMMON_FIELD_DESCRIPTIONS["images"],
},
}
# Simple tool-specific field schemas (workflow tools use relevant_files instead)
SIMPLE_FIELD_SCHEMAS = {
"files": {
"type": "array",
"items": {"type": "string"},
"description": COMMON_FIELD_DESCRIPTIONS["files"],
},
}
@staticmethod
def build_schema(
tool_specific_fields: dict[str, dict[str, Any]] = None,
required_fields: list[str] = None,
model_field_schema: dict[str, Any] = None,
auto_mode: bool = False,
) -> dict[str, Any]:
"""
Build complete schema for simple tools.
Args:
tool_specific_fields: Additional fields specific to the tool
required_fields: List of required field names
model_field_schema: Schema for the model field
auto_mode: Whether the tool is in auto mode (affects model requirement)
Returns:
Complete JSON schema for the tool
"""
properties = {}
# Add common fields (temperature, thinking_mode, etc.)
properties.update(SchemaBuilder.COMMON_FIELD_SCHEMAS)
# Add simple tool-specific fields (files field for simple tools)
properties.update(SchemaBuilder.SIMPLE_FIELD_SCHEMAS)
# Add model field if provided
if model_field_schema:
properties["model"] = model_field_schema
# Add tool-specific fields if provided
if tool_specific_fields:
properties.update(tool_specific_fields)
# Build required fields list
required = required_fields or []
if auto_mode and "model" not in required:
required.append("model")
# Build the complete schema
schema = {
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": properties,
"additionalProperties": False,
}
if required:
schema["required"] = required
return schema
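# Illustrative sketch (not part of this diff): calling build_schema() for a hypothetical
# simple tool with one required "prompt" field. The field contents and model description
# are assumptions for illustration only.
from tools.shared.schema_builders import SchemaBuilder

schema = SchemaBuilder.build_schema(
    tool_specific_fields={
        "prompt": {"type": "string", "description": "The user's question or request"},
    },
    required_fields=["prompt"],
    model_field_schema={"type": "string", "description": "Model to use for this request"},
    auto_mode=False,
)
# schema["required"] == ["prompt"]; common fields such as "temperature", "thinking_mode",
# and "continuation_id" are merged in automatically alongside the simple-tool "files" field.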
@staticmethod
def get_common_fields() -> dict[str, dict[str, Any]]:
"""Get the standard field schemas for simple tools."""
return SchemaBuilder.COMMON_FIELD_SCHEMAS.copy()
@staticmethod
def create_field_schema(
field_type: str,
description: str,
enum_values: list[str] = None,
minimum: float = None,
maximum: float = None,
items_type: str = None,
default: Any = None,
) -> dict[str, Any]:
"""
Helper method to create field schemas with common patterns.
Args:
field_type: JSON schema type ("string", "number", "array", etc.)
description: Human-readable description of the field
enum_values: For enum fields, list of allowed values
minimum: For numeric fields, minimum value
maximum: For numeric fields, maximum value
items_type: For array fields, type of array items
default: Default value for the field
Returns:
JSON schema object for the field
"""
schema = {
"type": field_type,
"description": description,
}
if enum_values:
schema["enum"] = enum_values
if minimum is not None:
schema["minimum"] = minimum
if maximum is not None:
schema["maximum"] = maximum
if items_type and field_type == "array":
schema["items"] = {"type": items_type}
if default is not None:
schema["default"] = default
return schema
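# Illustrative sketch (not part of this diff): building individual field schemas with
# create_field_schema(). The field names and values are hypothetical.
from tools.shared.schema_builders import SchemaBuilder

severity_field = SchemaBuilder.create_field_schema(
    "string",
    "Severity of the reported issue",
    enum_values=["low", "medium", "high", "critical"],
)
timeout_field = SchemaBuilder.create_field_schema(
    "number",
    "Request timeout in seconds",
    minimum=0.0,
    maximum=300.0,
    default=30.0,
)
# severity_field -> {"type": "string", "description": "...", "enum": ["low", "medium", "high", "critical"]}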

tools/simple/__init__.py (new file)

@@ -0,0 +1,18 @@
"""
Simple tools for Zen MCP.
Simple tools follow a basic request → AI model → response pattern.
They inherit from SimpleTool which provides streamlined functionality
for tools that don't need multi-step workflows.
Available simple tools:
- chat: General chat and collaborative thinking
- consensus: Multi-perspective analysis
- listmodels: Model listing and information
- testgen: Test generation
- tracer: Execution tracing
"""
from .base import SimpleTool
__all__ = ["SimpleTool"]

tools/simple/base.py (new file)

@@ -0,0 +1,232 @@
"""
Base class for simple MCP tools.
Simple tools follow a straightforward pattern:
1. Receive request
2. Prepare prompt (with files, context, etc.)
3. Call AI model
4. Format and return response
They use the shared SchemaBuilder for consistent schema generation
and inherit all the conversation, file processing, and model handling
capabilities from BaseTool.
"""
from abc import abstractmethod
from typing import Any, Optional
from tools.shared.base_models import ToolRequest
from tools.shared.base_tool import BaseTool
from tools.shared.schema_builders import SchemaBuilder
class SimpleTool(BaseTool):
"""
Base class for simple (non-workflow) tools.
Simple tools are request/response tools that don't require multi-step workflows.
They benefit from:
- Automatic schema generation using SchemaBuilder
- Inherited conversation handling and file processing
- Standardized model integration
- Consistent error handling and response formatting
To create a simple tool:
1. Inherit from SimpleTool
2. Implement get_tool_fields() to define tool-specific fields
3. Implement prepare_prompt() for prompt preparation
4. Optionally override format_response() for custom formatting
5. Optionally override get_required_fields() for custom requirements
Example:
class ChatTool(SimpleTool):
def get_name(self) -> str:
return "chat"
def get_tool_fields(self) -> Dict[str, Dict[str, Any]]:
return {
"prompt": {
"type": "string",
"description": "Your question or idea...",
},
"files": SimpleTool.FILES_FIELD,
}
def get_required_fields(self) -> List[str]:
return ["prompt"]
"""
# Common field definitions that simple tools can reuse
FILES_FIELD = SchemaBuilder.SIMPLE_FIELD_SCHEMAS["files"]
IMAGES_FIELD = SchemaBuilder.COMMON_FIELD_SCHEMAS["images"]
@abstractmethod
def get_tool_fields(self) -> dict[str, dict[str, Any]]:
"""
Return tool-specific field definitions.
This method should return a dictionary mapping field names to their
JSON schema definitions. Common fields (model, temperature, etc.)
are added automatically by the base class.
Returns:
Dict mapping field names to JSON schema objects
Example:
return {
"prompt": {
"type": "string",
"description": "The user's question or request",
},
"files": SimpleTool.FILES_FIELD, # Reuse common field
"max_tokens": {
"type": "integer",
"minimum": 1,
"description": "Maximum tokens for response",
}
}
"""
pass
def get_required_fields(self) -> list[str]:
"""
Return list of required field names.
Override this to specify which fields are required for your tool.
The model field is automatically added if in auto mode.
Returns:
List of required field names
"""
return []
def get_input_schema(self) -> dict[str, Any]:
"""
Generate the complete input schema using SchemaBuilder.
This method automatically combines:
- Tool-specific fields from get_tool_fields()
- Common fields (temperature, thinking_mode, etc.)
- Model field with proper auto-mode handling
- Required fields from get_required_fields()
Returns:
Complete JSON schema for the tool
"""
return SchemaBuilder.build_schema(
tool_specific_fields=self.get_tool_fields(),
required_fields=self.get_required_fields(),
model_field_schema=self.get_model_field_schema(),
auto_mode=self.is_effective_auto_mode(),
)
def get_request_model(self):
"""
Return the request model class.
Simple tools use the base ToolRequest by default.
Override this if your tool needs a custom request model.
"""
return ToolRequest
# Convenience methods for common tool patterns
def build_standard_prompt(
self, system_prompt: str, user_content: str, request, file_context_title: str = "CONTEXT FILES"
) -> str:
"""
Build a standard prompt with system prompt, user content, and optional files.
This is a convenience method that handles the common pattern of:
1. Adding file content if present
2. Checking token limits
3. Adding web search instructions
4. Combining everything into a well-formatted prompt
Args:
system_prompt: The system prompt for the tool
user_content: The main user request/content
request: The validated request object
file_context_title: Title for the file context section
Returns:
Complete formatted prompt ready for the AI model
"""
# Add context files if provided
if hasattr(request, "files") and request.files:
file_content, processed_files = self._prepare_file_content_for_prompt(
request.files, request.continuation_id, "Context files"
)
self._actually_processed_files = processed_files
if file_content:
user_content = f"{user_content}\n\n=== {file_context_title} ===\n{file_content}\n=== END CONTEXT ===="
# Check token limits
self._validate_token_limit(user_content, "Content")
# Add web search instruction if enabled
websearch_instruction = ""
if hasattr(request, "use_websearch") and request.use_websearch:
websearch_instruction = self.get_websearch_instruction(request.use_websearch, self.get_websearch_guidance())
# Combine system prompt with user content
full_prompt = f"""{system_prompt}{websearch_instruction}
=== USER REQUEST ===
{user_content}
=== END REQUEST ===
Please provide a thoughtful, comprehensive response:"""
return full_prompt
def get_websearch_guidance(self) -> Optional[str]:
"""
Return tool-specific web search guidance.
Override this to provide tool-specific guidance for when web searches
would be helpful. Return None to use the default guidance.
Returns:
Tool-specific web search guidance or None for default
"""
return None
def handle_prompt_file_with_fallback(self, request) -> str:
"""
Handle prompt.txt files with fallback to request field.
This is a convenience method for tools that accept prompts either
as a field or as a prompt.txt file. It handles the extraction
and validation automatically.
Args:
request: The validated request object
Returns:
The effective prompt content
Raises:
ValueError: If prompt is too large for MCP transport
"""
# Check for prompt.txt in files
if hasattr(request, "files"):
prompt_content, updated_files = self.handle_prompt_file(request.files)
# Update request files list
if updated_files is not None:
request.files = updated_files
else:
prompt_content = None
# Use prompt.txt content if available, otherwise use the prompt field
user_content = prompt_content if prompt_content else getattr(request, "prompt", "")
# Check user input size at MCP transport boundary
size_check = self.check_prompt_size(user_content)
if size_check:
from tools.models import ToolOutput
raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**size_check).model_dump_json()}")
return user_content
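# Illustrative sketch (not part of this diff): a minimal SimpleTool subclass following the
# pattern documented above. The tool name, field definitions, and system prompt text are
# hypothetical, and BaseTool may require further abstract methods that are not shown here
# because tools/shared/base_tool.py is suppressed in this diff.
from typing import Any

from tools.simple.base import SimpleTool


class EchoTool(SimpleTool):
    def get_name(self) -> str:
        return "echo"

    def get_tool_fields(self) -> dict[str, dict[str, Any]]:
        return {
            "prompt": {"type": "string", "description": "Text to echo back with brief commentary"},
            "files": SimpleTool.FILES_FIELD,  # reuse the shared files field schema
        }

    def get_required_fields(self) -> list[str]:
        return ["prompt"]

    async def prepare_prompt(self, request) -> str:
        # build_standard_prompt() appends file context, checks token limits, and adds
        # web-search guidance before combining everything into the final prompt.
        return self.build_standard_prompt("You are a concise assistant.", request.prompt, request)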

tools/testgen.py

@@ -1,67 +1,155 @@
"""
TestGen tool - Comprehensive test suite generation with edge case coverage
TestGen Workflow tool - Step-by-step test generation with expert validation
This tool generates comprehensive test suites by analyzing code paths,
identifying edge cases, and producing test scaffolding that follows
project conventions when test examples are provided.
This tool provides a structured workflow for comprehensive test generation.
It guides Claude through systematic investigation steps with forced pauses between each step
to ensure thorough code examination, test planning, and pattern identification before proceeding.
The tool supports backtracking, finding updates, and expert analysis integration for
comprehensive test suite generation.
Key Features:
- Multi-file and directory support
- Framework detection from existing tests
- Edge case identification (nulls, boundaries, async issues, etc.)
- Test pattern following when examples provided
- Deterministic test example sampling for large test suites
Key features:
- Step-by-step test generation workflow with progress tracking
- Context-aware file embedding (references during investigation, full content for analysis)
- Automatic test pattern detection and framework identification
- Expert analysis integration with external models for additional test suggestions
- Support for edge case identification and comprehensive coverage
- Confidence-based workflow optimization
"""
import logging
import os
from typing import Any, Optional
from typing import TYPE_CHECKING, Any, Optional
from pydantic import Field
from pydantic import Field, model_validator
if TYPE_CHECKING:
from tools.models import ToolModelCategory
from config import TEMPERATURE_ANALYTICAL
from systemprompts import TESTGEN_PROMPT
from tools.shared.base_models import WorkflowRequest
from .base import BaseTool, ToolRequest
from .workflow.base import WorkflowTool
logger = logging.getLogger(__name__)
# Field descriptions to avoid duplication between Pydantic and JSON schema
TESTGEN_FIELD_DESCRIPTIONS = {
"files": "Code files or directories to generate tests for (must be FULL absolute paths to real files / folders - DO NOT SHORTEN)",
"prompt": "Description of what to test, testing objectives, and specific scope/focus areas. Be specific about any "
"particular component, module, class of function you would like to generate tests for.",
"test_examples": (
"Optional existing test files or directories to use as style/pattern reference (must be FULL absolute paths to real files / folders - DO NOT SHORTEN). "
"If not provided, the tool will determine the best testing approach based on the code structure. "
"For large test directories, only the smallest representative tests should be included to determine testing patterns. "
"If similar tests exist for the code being tested, include those for the most relevant patterns."
# Tool-specific field descriptions for test generation workflow
TESTGEN_WORKFLOW_FIELD_DESCRIPTIONS = {
"step": (
"What to analyze or look for in this step. In step 1, describe what you want to test and begin forming an "
"analytical approach after thinking carefully about what needs to be examined. Consider code structure, "
"business logic, critical paths, edge cases, and potential failure modes. Map out the codebase structure, "
"understand the functionality, and identify areas requiring test coverage. In later steps, continue exploring "
"with precision and adapt your understanding as you uncover more insights about testable behaviors."
),
"step_number": (
"The index of the current step in the test generation sequence, beginning at 1. Each step should build upon or "
"revise the previous one."
),
"total_steps": (
"Your current estimate for how many steps will be needed to complete the test generation analysis. "
"Adjust as new findings emerge."
),
"next_step_required": (
"Set to true if you plan to continue the investigation with another step. False means you believe the "
"test generation analysis is complete and ready for expert validation."
),
"findings": (
"Summarize everything discovered in this step about the code being tested. Include analysis of functionality, "
"critical paths, edge cases, boundary conditions, error handling, async behavior, state management, and "
"integration points. Be specific and avoid vague language—document what you now know about the code and "
"what test scenarios are needed. IMPORTANT: Document both the happy paths and potential failure modes. "
"Identify existing test patterns if examples were provided. In later steps, confirm or update past findings "
"with additional evidence."
),
"files_checked": (
"List all files (as absolute paths, do not clip or shrink file names) examined during the test generation "
"investigation so far. Include even files ruled out or found to be unrelated, as this tracks your "
"exploration path."
),
"relevant_files": (
"Subset of files_checked (as full absolute paths) that contain code directly needing tests or are essential "
"for understanding test requirements. Only list those that are directly tied to the functionality being tested. "
"This could include implementation files, interfaces, dependencies, or existing test examples."
),
"relevant_context": (
"List methods, functions, classes, or modules that need test coverage, in the format "
"'ClassName.methodName', 'functionName', or 'module.ClassName'. Prioritize critical business logic, "
"public APIs, complex algorithms, and error-prone code paths."
),
"confidence": (
"Indicate your current confidence in the test generation assessment. Use: 'exploring' (starting analysis), "
"'low' (early investigation), 'medium' (some patterns identified), 'high' (strong understanding), 'certain' "
"(only when the test plan is thoroughly complete and all test scenarios are identified). Do NOT use 'certain' "
"unless the test generation analysis is comprehensively complete, use 'high' instead not 100% sure. Using "
"'certain' prevents additional expert analysis."
),
"backtrack_from_step": (
"If an earlier finding or assessment needs to be revised or discarded, specify the step number from which to "
"start over. Use this to acknowledge investigative dead ends and correct the course."
),
"images": (
"Optional list of absolute paths to architecture diagrams, flow charts, or visual documentation that help "
"understand the code structure and test requirements. Only include if they materially assist test planning."
),
}
class TestGenerationRequest(ToolRequest):
"""
Request model for the test generation tool.
class TestGenRequest(WorkflowRequest):
"""Request model for test generation workflow investigation steps"""
This model defines all parameters that can be used to customize
the test generation process, from selecting code files to providing
test examples for style consistency.
# Required fields for each investigation step
step: str = Field(..., description=TESTGEN_WORKFLOW_FIELD_DESCRIPTIONS["step"])
step_number: int = Field(..., description=TESTGEN_WORKFLOW_FIELD_DESCRIPTIONS["step_number"])
total_steps: int = Field(..., description=TESTGEN_WORKFLOW_FIELD_DESCRIPTIONS["total_steps"])
next_step_required: bool = Field(..., description=TESTGEN_WORKFLOW_FIELD_DESCRIPTIONS["next_step_required"])
# Investigation tracking fields
findings: str = Field(..., description=TESTGEN_WORKFLOW_FIELD_DESCRIPTIONS["findings"])
files_checked: list[str] = Field(
default_factory=list, description=TESTGEN_WORKFLOW_FIELD_DESCRIPTIONS["files_checked"]
)
relevant_files: list[str] = Field(
default_factory=list, description=TESTGEN_WORKFLOW_FIELD_DESCRIPTIONS["relevant_files"]
)
relevant_context: list[str] = Field(
default_factory=list, description=TESTGEN_WORKFLOW_FIELD_DESCRIPTIONS["relevant_context"]
)
confidence: Optional[str] = Field("low", description=TESTGEN_WORKFLOW_FIELD_DESCRIPTIONS["confidence"])
# Optional backtracking field
backtrack_from_step: Optional[int] = Field(
None, description=TESTGEN_WORKFLOW_FIELD_DESCRIPTIONS["backtrack_from_step"]
)
# Optional images for visual context
images: Optional[list[str]] = Field(default=None, description=TESTGEN_WORKFLOW_FIELD_DESCRIPTIONS["images"])
# Override inherited fields to exclude them from schema (except model which needs to be available)
temperature: Optional[float] = Field(default=None, exclude=True)
thinking_mode: Optional[str] = Field(default=None, exclude=True)
use_websearch: Optional[bool] = Field(default=None, exclude=True)
@model_validator(mode="after")
def validate_step_one_requirements(self):
"""Ensure step 1 has required relevant_files field."""
if self.step_number == 1 and not self.relevant_files:
raise ValueError("Step 1 requires 'relevant_files' field to specify code files to generate tests for")
return self
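# Illustrative sketch (not part of this diff): a valid step-1 request. Step 1 must include
# relevant_files, otherwise validate_step_one_requirements() raises a ValueError. The step
# text, findings, and file path are hypothetical.
step_one = TestGenRequest(
    step="Map the public API of the payment module and identify critical paths needing tests",
    step_number=1,
    total_steps=3,
    next_step_required=True,
    findings="Initial survey: charge() and refund() hold the core business logic",
    relevant_files=["/abs/path/payments/processor.py"],
    confidence="exploring",
)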
class TestGenTool(WorkflowTool):
"""
Test Generation workflow tool for step-by-step test planning and expert validation.
This tool implements a structured test generation workflow that guides users through
methodical investigation steps, ensuring thorough code examination, pattern identification,
and test scenario planning before reaching conclusions. It supports complex testing scenarios
including edge case identification, framework detection, and comprehensive coverage planning.
"""
files: list[str] = Field(..., description=TESTGEN_FIELD_DESCRIPTIONS["files"])
prompt: str = Field(..., description=TESTGEN_FIELD_DESCRIPTIONS["prompt"])
test_examples: Optional[list[str]] = Field(None, description=TESTGEN_FIELD_DESCRIPTIONS["test_examples"])
class TestGenerationTool(BaseTool):
"""
Test generation tool implementation.
This tool analyzes code to generate comprehensive test suites with
edge case coverage, following existing test patterns when examples
are provided.
"""
def __init__(self):
super().__init__()
self.initial_request = None
def get_name(self) -> str:
return "testgen"
@@ -75,390 +163,406 @@ class TestGenerationTool(BaseTool):
"'Create tests for authentication error handling'. If user request is vague, either ask for "
"clarification about specific components to test, or make focused scope decisions and explain them. "
"Analyzes code paths, identifies realistic failure modes, and generates framework-specific tests. "
"Supports test pattern following when examples are provided. "
"Choose thinking_mode based on code complexity: 'low' for simple functions, "
"'medium' for standard modules (default), 'high' for complex systems with many interactions, "
"'max' for critical systems requiring exhaustive test coverage. "
"Note: If you're not currently using a top-tier model such as Opus 4 or above, these tools can provide enhanced capabilities."
"Supports test pattern following when examples are provided. Choose thinking_mode based on "
"code complexity: 'low' for simple functions, 'medium' for standard modules (default), "
"'high' for complex systems with many interactions, 'max' for critical systems requiring "
"exhaustive test coverage. Note: If you're not currently using a top-tier model such as "
"Opus 4 or above, these tools can provide enhanced capabilities."
)
def get_input_schema(self) -> dict[str, Any]:
schema = {
"type": "object",
"properties": {
"files": {
"type": "array",
"items": {"type": "string"},
"description": TESTGEN_FIELD_DESCRIPTIONS["files"],
},
"model": self.get_model_field_schema(),
"prompt": {
"type": "string",
"description": TESTGEN_FIELD_DESCRIPTIONS["prompt"],
},
"test_examples": {
"type": "array",
"items": {"type": "string"},
"description": TESTGEN_FIELD_DESCRIPTIONS["test_examples"],
},
"thinking_mode": {
"type": "string",
"enum": ["minimal", "low", "medium", "high", "max"],
"description": "Thinking depth: minimal (0.5% of model max), low (8%), medium (33%), high (67%), max (100% of model max)",
},
"continuation_id": {
"type": "string",
"description": (
"Thread continuation ID for multi-turn conversations. Can be used to continue conversations "
"across different tools. Only provide this if continuing a previous conversation thread."
),
},
},
"required": ["files", "prompt"] + (["model"] if self.is_effective_auto_mode() else []),
}
return schema
def get_system_prompt(self) -> str:
return TESTGEN_PROMPT
def get_default_temperature(self) -> float:
return TEMPERATURE_ANALYTICAL
# Line numbers are enabled by default from base class for precise targeting
def get_model_category(self):
"""TestGen requires extended reasoning for comprehensive test analysis"""
def get_model_category(self) -> "ToolModelCategory":
"""Test generation requires thorough analysis and reasoning"""
from tools.models import ToolModelCategory
return ToolModelCategory.EXTENDED_REASONING
def get_request_model(self):
return TestGenerationRequest
def get_workflow_request_model(self):
"""Return the test generation workflow-specific request model."""
return TestGenRequest
def _process_test_examples(
self, test_examples: list[str], continuation_id: Optional[str], available_tokens: int = None
) -> tuple[str, str]:
"""
Process test example files using available token budget for optimal sampling.
def get_input_schema(self) -> dict[str, Any]:
"""Generate input schema using WorkflowSchemaBuilder with test generation-specific overrides."""
from .workflow.schema_builders import WorkflowSchemaBuilder
Args:
test_examples: List of test file paths
continuation_id: Continuation ID for filtering already embedded files
available_tokens: Available token budget for test examples
# Test generation workflow-specific field overrides
testgen_field_overrides = {
"step": {
"type": "string",
"description": TESTGEN_WORKFLOW_FIELD_DESCRIPTIONS["step"],
},
"step_number": {
"type": "integer",
"minimum": 1,
"description": TESTGEN_WORKFLOW_FIELD_DESCRIPTIONS["step_number"],
},
"total_steps": {
"type": "integer",
"minimum": 1,
"description": TESTGEN_WORKFLOW_FIELD_DESCRIPTIONS["total_steps"],
},
"next_step_required": {
"type": "boolean",
"description": TESTGEN_WORKFLOW_FIELD_DESCRIPTIONS["next_step_required"],
},
"findings": {
"type": "string",
"description": TESTGEN_WORKFLOW_FIELD_DESCRIPTIONS["findings"],
},
"files_checked": {
"type": "array",
"items": {"type": "string"},
"description": TESTGEN_WORKFLOW_FIELD_DESCRIPTIONS["files_checked"],
},
"relevant_files": {
"type": "array",
"items": {"type": "string"},
"description": TESTGEN_WORKFLOW_FIELD_DESCRIPTIONS["relevant_files"],
},
"confidence": {
"type": "string",
"enum": ["exploring", "low", "medium", "high", "certain"],
"description": TESTGEN_WORKFLOW_FIELD_DESCRIPTIONS["confidence"],
},
"backtrack_from_step": {
"type": "integer",
"minimum": 1,
"description": TESTGEN_WORKFLOW_FIELD_DESCRIPTIONS["backtrack_from_step"],
},
"images": {
"type": "array",
"items": {"type": "string"},
"description": TESTGEN_WORKFLOW_FIELD_DESCRIPTIONS["images"],
},
}
Returns:
tuple: (formatted_content, summary_note)
"""
logger.debug(f"[TESTGEN] Processing {len(test_examples)} test examples")
if not test_examples:
logger.debug("[TESTGEN] No test examples provided")
return "", ""
# Use existing file filtering to avoid duplicates in continuation
examples_to_process = self.filter_new_files(test_examples, continuation_id)
logger.debug(f"[TESTGEN] After filtering: {len(examples_to_process)} new test examples to process")
if not examples_to_process:
logger.info(f"[TESTGEN] All {len(test_examples)} test examples already in conversation history")
return "", ""
logger.debug(f"[TESTGEN] Processing {len(examples_to_process)} file paths")
# Calculate token budget for test examples (25% of available tokens, or fallback)
if available_tokens:
test_examples_budget = int(available_tokens * 0.25) # 25% for test examples
logger.debug(
f"[TESTGEN] Allocating {test_examples_budget:,} tokens (25% of {available_tokens:,}) for test examples"
)
else:
test_examples_budget = 30000 # Fallback if no budget provided
logger.debug(f"[TESTGEN] Using fallback budget of {test_examples_budget:,} tokens for test examples")
original_count = len(examples_to_process)
logger.debug(
f"[TESTGEN] Processing {original_count} test example files with {test_examples_budget:,} token budget"
# Use WorkflowSchemaBuilder with test generation-specific tool fields
return WorkflowSchemaBuilder.build_schema(
tool_specific_fields=testgen_field_overrides,
model_field_schema=self.get_model_field_schema(),
auto_mode=self.is_effective_auto_mode(),
tool_name=self.get_name(),
)
# Sort by file size (smallest first) for pattern-focused selection
file_sizes = []
for file_path in examples_to_process:
try:
size = os.path.getsize(file_path)
file_sizes.append((file_path, size))
logger.debug(f"[TESTGEN] Test example {os.path.basename(file_path)}: {size:,} bytes")
except (OSError, FileNotFoundError) as e:
# If we can't get size, put it at the end
logger.warning(f"[TESTGEN] Could not get size for {file_path}: {e}")
file_sizes.append((file_path, float("inf")))
# Sort by size and take smallest files for pattern reference
file_sizes.sort(key=lambda x: x[1])
examples_to_process = [f[0] for f in file_sizes] # All files, sorted by size
logger.debug(
f"[TESTGEN] Sorted test examples by size (smallest first): {[os.path.basename(f) for f in examples_to_process]}"
)
# Use standard file content preparation with dynamic token budget
try:
logger.debug(f"[TESTGEN] Preparing file content for {len(examples_to_process)} test examples")
content, processed_files = self._prepare_file_content_for_prompt(
examples_to_process,
continuation_id,
"Test examples",
max_tokens=test_examples_budget,
reserve_tokens=1000,
)
# Store processed files for tracking - test examples are tracked separately from main code files
# Determine how many files were actually included
if content:
from utils.token_utils import estimate_tokens
used_tokens = estimate_tokens(content)
logger.info(
f"[TESTGEN] Successfully embedded test examples: {used_tokens:,} tokens used ({test_examples_budget:,} available)"
)
if original_count > 1:
truncation_note = f"Note: Used {used_tokens:,} tokens ({test_examples_budget:,} available) for test examples from {original_count} files to determine testing patterns."
else:
truncation_note = ""
else:
logger.warning("[TESTGEN] No content generated for test examples")
truncation_note = ""
return content, truncation_note
except Exception as e:
# If test example processing fails, continue without examples rather than failing
logger.error(f"[TESTGEN] Failed to process test examples: {type(e).__name__}: {e}")
return "", f"Warning: Could not process test examples: {str(e)}"
async def prepare_prompt(self, request: TestGenerationRequest) -> str:
"""
Prepare the test generation prompt with code analysis and optional test examples.
This method reads the requested files, processes any test examples,
and constructs a detailed prompt for comprehensive test generation.
Args:
request: The validated test generation request
Returns:
str: Complete prompt for the model
Raises:
ValueError: If the code exceeds token limits
"""
logger.debug(f"[TESTGEN] Preparing prompt for {len(request.files)} code files")
if request.test_examples:
logger.debug(f"[TESTGEN] Including {len(request.test_examples)} test examples for pattern reference")
# Check for prompt.txt in files
prompt_content, updated_files = self.handle_prompt_file(request.files)
# If prompt.txt was found, incorporate it into the prompt
if prompt_content:
logger.debug("[TESTGEN] Found prompt.txt file, incorporating content")
request.prompt = prompt_content + "\n\n" + request.prompt
# Update request files list
if updated_files is not None:
logger.debug(f"[TESTGEN] Updated files list after prompt.txt processing: {len(updated_files)} files")
request.files = updated_files
# Check user input size at MCP transport boundary (before adding internal content)
user_content = request.prompt
size_check = self.check_prompt_size(user_content)
if size_check:
from tools.models import ToolOutput
raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**size_check).model_dump_json()}")
# Calculate available token budget for dynamic allocation
continuation_id = getattr(request, "continuation_id", None)
# Get model context for token budget calculation
available_tokens = None
if hasattr(self, "_model_context") and self._model_context:
try:
capabilities = self._model_context.capabilities
# Use 75% of context for content (code + test examples), 25% for response
available_tokens = int(capabilities.context_window * 0.75)
logger.debug(
f"[TESTGEN] Token budget calculation: {available_tokens:,} tokens (75% of {capabilities.context_window:,}) for model {self._model_context.model_name}"
)
except Exception as e:
# Fallback to conservative estimate
logger.warning(f"[TESTGEN] Could not get model capabilities: {e}")
available_tokens = 120000 # Conservative fallback
logger.debug(f"[TESTGEN] Using fallback token budget: {available_tokens:,} tokens")
def get_required_actions(self, step_number: int, confidence: str, findings: str, total_steps: int) -> list[str]:
"""Define required actions for each investigation phase."""
if step_number == 1:
# Initial test generation investigation tasks
return [
"Read and understand the code files specified for test generation",
"Analyze the overall structure, public APIs, and main functionality",
"Identify critical business logic and complex algorithms that need testing",
"Look for existing test patterns or examples if provided",
"Understand dependencies, external interactions, and integration points",
"Note any potential testability issues or areas that might be hard to test",
]
elif confidence in ["exploring", "low"]:
# Need deeper investigation
return [
"Examine specific functions and methods to understand their behavior",
"Trace through code paths to identify all possible execution flows",
"Identify edge cases, boundary conditions, and error scenarios",
"Check for async operations, state management, and side effects",
"Look for non-deterministic behavior or external dependencies",
"Analyze error handling and exception cases that need testing",
]
elif confidence in ["medium", "high"]:
# Close to completion - need final verification
return [
"Verify all critical paths have been identified for testing",
"Confirm edge cases and boundary conditions are comprehensive",
"Check that test scenarios cover both success and failure cases",
"Ensure async behavior and concurrency issues are addressed",
"Validate that the testing strategy aligns with code complexity",
"Double-check that findings include actionable test scenarios",
]
else:
# No model context available (shouldn't happen in normal flow)
available_tokens = 120000 # Conservative fallback
logger.debug(f"[TESTGEN] No model context, using fallback token budget: {available_tokens:,} tokens")
# Process test examples first to determine token allocation
test_examples_content = ""
test_examples_note = ""
if request.test_examples:
logger.debug(f"[TESTGEN] Processing {len(request.test_examples)} test examples")
test_examples_content, test_examples_note = self._process_test_examples(
request.test_examples, continuation_id, available_tokens
)
if test_examples_content:
logger.info("[TESTGEN] Test examples processed successfully for pattern reference")
else:
logger.info("[TESTGEN] No test examples content after processing")
# Remove files that appear in both 'files' and 'test_examples' to avoid duplicate embedding
# Files in test_examples take precedence as they're used for pattern reference
code_files_to_process = request.files.copy()
if request.test_examples:
# Normalize paths for comparison (resolve any relative paths, handle case sensitivity)
test_example_set = {os.path.normpath(os.path.abspath(f)) for f in request.test_examples}
original_count = len(code_files_to_process)
code_files_to_process = [
f for f in code_files_to_process if os.path.normpath(os.path.abspath(f)) not in test_example_set
# General investigation needed
return [
"Continue examining the codebase for additional test scenarios",
"Gather more evidence about code behavior and dependencies",
"Test your assumptions about how the code should be tested",
"Look for patterns that confirm your testing strategy",
"Focus on areas that haven't been thoroughly examined yet",
]
duplicates_removed = original_count - len(code_files_to_process)
if duplicates_removed > 0:
logger.info(
f"[TESTGEN] Removed {duplicates_removed} duplicate files from code files list "
f"(already included in test examples for pattern reference)"
)
def should_call_expert_analysis(self, consolidated_findings, request=None) -> bool:
"""
Decide when to call external model based on investigation completeness.
# Calculate remaining tokens for main code after test examples
if test_examples_content and available_tokens:
from utils.token_utils import estimate_tokens
Always call expert analysis for test generation to get additional test ideas.
"""
# Check if user requested to skip assistant model
if request and not self.get_request_use_assistant_model(request):
return False
test_tokens = estimate_tokens(test_examples_content)
remaining_tokens = available_tokens - test_tokens - 5000 # Reserve for prompt structure
logger.debug(
f"[TESTGEN] Token allocation: {test_tokens:,} for examples, {remaining_tokens:,} remaining for code files"
# Always benefit from expert analysis for comprehensive test coverage
return len(consolidated_findings.relevant_files) > 0 or len(consolidated_findings.findings) >= 1
def prepare_expert_analysis_context(self, consolidated_findings) -> str:
"""Prepare context for external model call for test generation validation."""
context_parts = [
f"=== TEST GENERATION REQUEST ===\\n{self.initial_request or 'Test generation workflow initiated'}\\n=== END REQUEST ==="
]
# Add investigation summary
investigation_summary = self._build_test_generation_summary(consolidated_findings)
context_parts.append(
f"\\n=== CLAUDE'S TEST PLANNING INVESTIGATION ===\\n{investigation_summary}\\n=== END INVESTIGATION ==="
)
# Add relevant code elements if available
if consolidated_findings.relevant_context:
methods_text = "\\n".join(f"- {method}" for method in consolidated_findings.relevant_context)
context_parts.append(f"\\n=== CODE ELEMENTS TO TEST ===\\n{methods_text}\\n=== END CODE ELEMENTS ===")
# Add images if available
if consolidated_findings.images:
images_text = "\\n".join(f"- {img}" for img in consolidated_findings.images)
context_parts.append(f"\\n=== VISUAL DOCUMENTATION ===\\n{images_text}\\n=== END VISUAL DOCUMENTATION ===")
return "\\n".join(context_parts)
def _build_test_generation_summary(self, consolidated_findings) -> str:
"""Prepare a comprehensive summary of the test generation investigation."""
summary_parts = [
"=== SYSTEMATIC TEST GENERATION INVESTIGATION SUMMARY ===",
f"Total steps: {len(consolidated_findings.findings)}",
f"Files examined: {len(consolidated_findings.files_checked)}",
f"Relevant files identified: {len(consolidated_findings.relevant_files)}",
f"Code elements to test: {len(consolidated_findings.relevant_context)}",
"",
"=== INVESTIGATION PROGRESSION ===",
]
for finding in consolidated_findings.findings:
summary_parts.append(finding)
return "\\n".join(summary_parts)
def should_include_files_in_expert_prompt(self) -> bool:
"""Include files in expert analysis for comprehensive test generation."""
return True
def should_embed_system_prompt(self) -> bool:
"""Embed system prompt in expert analysis for proper context."""
return True
def get_expert_thinking_mode(self) -> str:
"""Use high thinking mode for thorough test generation analysis."""
return "high"
def get_expert_analysis_instruction(self) -> str:
"""Get specific instruction for test generation expert analysis."""
return (
"Please provide comprehensive test generation guidance based on the investigation findings. "
"Focus on identifying additional test scenarios, edge cases not yet covered, framework-specific "
"best practices, and providing concrete test implementation examples following the multi-agent "
"workflow specified in the system prompt."
)
# Hook method overrides for test generation-specific behavior
def prepare_step_data(self, request) -> dict:
"""
Map test generation-specific fields for internal processing.
"""
step_data = {
"step": request.step,
"step_number": request.step_number,
"findings": request.findings,
"files_checked": request.files_checked,
"relevant_files": request.relevant_files,
"relevant_context": request.relevant_context,
"confidence": request.confidence,
"images": request.images or [],
}
return step_data
def should_skip_expert_analysis(self, request, consolidated_findings) -> bool:
"""
Test generation workflow skips expert analysis when Claude has "certain" confidence.
"""
return request.confidence == "certain" and not request.next_step_required
def store_initial_issue(self, step_description: str):
"""Store initial request for expert analysis."""
self.initial_request = step_description
# Override inheritance hooks for test generation-specific behavior
def get_completion_status(self) -> str:
"""Test generation tools use test-specific status."""
return "test_generation_complete_ready_for_implementation"
def get_completion_data_key(self) -> str:
"""Test generation uses 'complete_test_generation' key."""
return "complete_test_generation"
def get_final_analysis_from_request(self, request):
"""Test generation tools use findings for final analysis."""
return request.findings
def get_confidence_level(self, request) -> str:
"""Test generation tools use 'certain' for high confidence."""
return "certain"
def get_completion_message(self) -> str:
"""Test generation-specific completion message."""
return (
"Test generation analysis complete with CERTAIN confidence. You have identified all test scenarios "
"and provided comprehensive coverage strategy. MANDATORY: Present the user with the complete test plan "
"and IMMEDIATELY proceed with creating the test files following the identified patterns and framework. "
"Focus on implementing concrete, runnable tests with proper assertions."
)
def get_skip_reason(self) -> str:
"""Test generation-specific skip reason."""
return "Claude completed comprehensive test planning with full confidence"
def get_skip_expert_analysis_status(self) -> str:
"""Test generation-specific expert analysis skip status."""
return "skipped_due_to_certain_test_confidence"
def prepare_work_summary(self) -> str:
"""Test generation-specific work summary."""
return self._build_test_generation_summary(self.consolidated_findings)
def get_completion_next_steps_message(self, expert_analysis_used: bool = False) -> str:
"""
Test generation-specific completion message.
"""
base_message = (
"TEST GENERATION ANALYSIS IS COMPLETE. You MUST now implement ALL identified test scenarios, "
"creating comprehensive test files that cover happy paths, edge cases, error conditions, and "
"boundary scenarios. Organize tests by functionality, use appropriate assertions, and follow "
"the identified framework patterns. Provide concrete, executable test code—make it easy for "
"a developer to run the tests and understand what each test validates."
)
# Add expert analysis guidance only when expert analysis was actually used
if expert_analysis_used:
expert_guidance = self.get_expert_analysis_guidance()
if expert_guidance:
return f"{base_message}\\n\\n{expert_guidance}"
return base_message
def get_expert_analysis_guidance(self) -> str:
"""
Provide specific guidance for handling expert analysis in test generation.
"""
return (
"IMPORTANT: Additional test scenarios and edge cases have been provided by the expert analysis above. "
"You MUST incorporate these suggestions into your test implementation, ensuring comprehensive coverage. "
"Validate that the expert's test ideas are practical and align with the codebase structure. Combine "
"your systematic investigation findings with the expert's additional scenarios to create a thorough "
"test suite that catches real-world bugs before they reach production."
)
def get_step_guidance_message(self, request) -> str:
"""
Test generation-specific step guidance with detailed investigation instructions.
"""
step_guidance = self.get_test_generation_step_guidance(request.step_number, request.confidence, request)
return step_guidance["next_steps"]
def get_test_generation_step_guidance(self, step_number: int, confidence: str, request) -> dict[str, Any]:
"""
Provide step-specific guidance for test generation workflow.
"""
# Generate the next steps instruction based on required actions
required_actions = self.get_required_actions(step_number, confidence, request.findings, request.total_steps)
if step_number == 1:
next_steps = (
f"MANDATORY: DO NOT call the {self.get_name()} tool again immediately. You MUST first analyze "
f"the code thoroughly using appropriate tools. CRITICAL AWARENESS: You need to understand "
f"the code structure, identify testable behaviors, find edge cases and boundary conditions, "
f"and determine the appropriate testing strategy. Use file reading tools, code analysis, and "
f"systematic examination to gather comprehensive information about what needs to be tested. "
f"Only call {self.get_name()} again AFTER completing your investigation. When you call "
f"{self.get_name()} next time, use step_number: {step_number + 1} and report specific "
f"code paths examined, test scenarios identified, and testing patterns discovered."
)
elif confidence in ["exploring", "low"]:
next_steps = (
f"STOP! Do NOT call {self.get_name()} again yet. Based on your findings, you've identified areas that need "
f"deeper analysis for test generation. MANDATORY ACTIONS before calling {self.get_name()} step {step_number + 1}:\\n"
+ "\\n".join(f"{i+1}. {action}" for i, action in enumerate(required_actions))
+ f"\\n\\nOnly call {self.get_name()} again with step_number: {step_number + 1} AFTER "
+ "completing these test planning tasks."
)
elif confidence in ["medium", "high"]:
next_steps = (
f"WAIT! Your test generation analysis needs final verification. DO NOT call {self.get_name()} immediately. REQUIRED ACTIONS:\\n"
+ "\\n".join(f"{i+1}. {action}" for i, action in enumerate(required_actions))
+ f"\\n\\nREMEMBER: Ensure you have identified all test scenarios including edge cases and error conditions. "
f"Document findings with specific test cases to implement, then call {self.get_name()} "
f"with step_number: {step_number + 1}."
)
else:
remaining_tokens = available_tokens - 10000 if available_tokens else None
if remaining_tokens:
logger.debug(
f"[TESTGEN] Token allocation: {remaining_tokens:,} tokens available for code files (no test examples)"
)
# Use centralized file processing logic for main code files (after deduplication)
logger.debug(f"[TESTGEN] Preparing {len(code_files_to_process)} code files for analysis")
code_content, processed_files = self._prepare_file_content_for_prompt(
code_files_to_process, continuation_id, "Code to test", max_tokens=remaining_tokens, reserve_tokens=2000
)
self._actually_processed_files = processed_files
if code_content:
from utils.token_utils import estimate_tokens
code_tokens = estimate_tokens(code_content)
logger.info(f"[TESTGEN] Code files embedded successfully: {code_tokens:,} tokens")
else:
logger.warning("[TESTGEN] No code content after file processing")
# Test generation is based on code analysis, no web search needed
logger.debug("[TESTGEN] Building complete test generation prompt")
# Build the complete prompt
prompt_parts = []
# Add system prompt
prompt_parts.append(self.get_system_prompt())
# Add user context
prompt_parts.append("=== USER CONTEXT ===")
prompt_parts.append(request.prompt)
prompt_parts.append("=== END CONTEXT ===")
# Add test examples if provided
if test_examples_content:
prompt_parts.append("\n=== TEST EXAMPLES FOR STYLE REFERENCE ===")
if test_examples_note:
prompt_parts.append(f"// {test_examples_note}")
prompt_parts.append(test_examples_content)
prompt_parts.append("=== END TEST EXAMPLES ===")
# Add main code to test
prompt_parts.append("\n=== CODE TO TEST ===")
prompt_parts.append(code_content)
prompt_parts.append("=== END CODE ===")
# Add generation instructions
prompt_parts.append(
"\nPlease analyze the code and generate comprehensive tests following the multi-agent workflow specified in the system prompt."
)
if test_examples_content:
prompt_parts.append(
"Use the provided test examples as a reference for style, framework, and testing patterns."
next_steps = (
f"PAUSE ANALYSIS. Before calling {self.get_name()} step {step_number + 1}, you MUST examine more code thoroughly. "
+ "Required: "
+ ", ".join(required_actions[:2])
+ ". "
+ f"Your next {self.get_name()} call (step_number: {step_number + 1}) must include "
f"NEW test scenarios from actual code analysis, not just theories. NO recursive {self.get_name()} calls "
f"without investigation work!"
)
full_prompt = "\n".join(prompt_parts)
return {"next_steps": next_steps}
# Log final prompt statistics
from utils.token_utils import estimate_tokens
total_tokens = estimate_tokens(full_prompt)
logger.info(f"[TESTGEN] Complete prompt prepared: {total_tokens:,} tokens, {len(full_prompt):,} characters")
return full_prompt
def format_response(self, response: str, request: TestGenerationRequest, model_info: Optional[dict] = None) -> str:
def customize_workflow_response(self, response_data: dict, request) -> dict:
"""
Format the test generation response.
Args:
response: The raw test generation from the model
request: The original request for context
model_info: Optional dict with model metadata
Returns:
str: Formatted response with next steps
Customize response to match test generation workflow format.
"""
return f"""{response}
# Store initial request on first step
if request.step_number == 1:
self.initial_request = request.step
---
# Convert generic status names to test generation-specific ones
tool_name = self.get_name()
status_mapping = {
f"{tool_name}_in_progress": "test_generation_in_progress",
f"pause_for_{tool_name}": "pause_for_test_analysis",
f"{tool_name}_required": "test_analysis_required",
f"{tool_name}_complete": "test_generation_complete",
}
Claude, you are now in EXECUTION MODE. Take immediate action:
if response_data["status"] in status_mapping:
response_data["status"] = status_mapping[response_data["status"]]
## Step 1: THINK & CREATE TESTS
ULTRATHINK while creating these in order to verify that every code reference, import, function name, and logic path is
100% accurate before saving.
# Rename status field to match test generation workflow
if f"{tool_name}_status" in response_data:
response_data["test_generation_status"] = response_data.pop(f"{tool_name}_status")
# Add test generation-specific status fields
response_data["test_generation_status"]["test_scenarios_identified"] = len(
self.consolidated_findings.relevant_context
)
response_data["test_generation_status"]["analysis_confidence"] = self.get_request_confidence(request)
- CREATE all test files in the correct project structure
- SAVE each test using proper naming conventions
- VALIDATE all imports, references, and dependencies are correct as required by the current framework / project / file
# Map complete_testgen to complete_test_generation
if f"complete_{tool_name}" in response_data:
response_data["complete_test_generation"] = response_data.pop(f"complete_{tool_name}")
## Step 2: DISPLAY RESULTS TO USER
After creating each test file, MUST show the user:
```
✅ Created: path/to/test_file.py
- test_function_name(): Brief description of what it tests
- test_another_function(): Brief description
- [Total: X test functions]
```
# Map the completion flag to match test generation workflow
if f"{tool_name}_complete" in response_data:
response_data["test_generation_complete"] = response_data.pop(f"{tool_name}_complete")
## Step 3: VALIDATE BY EXECUTION
CRITICAL: Run the tests immediately to confirm they work:
- Install any missing dependencies first or request user to perform step if this cannot be automated
- Execute the test suite
- Fix any failures or errors
- Confirm 100% pass rate. If there's a failure, re-iterate, go over each test, validate and understand why it's failing
return response_data
## Step 4: INTEGRATION VERIFICATION
- Verify tests integrate with existing test infrastructure
- Confirm test discovery works
- Validate test naming and organization
# Required abstract methods from BaseTool
def get_request_model(self):
"""Return the test generation workflow-specific request model."""
return TestGenRequest
## Step 5: MOVE TO NEXT ACTION
Once tests are confirmed working, immediately proceed to the next logical step for the project.
MANDATORY: Do NOT stop after generating - you MUST create, validate, run, and confirm the tests work and all of the
steps listed above are carried out correctly. Take full ownership of the testing implementation and move to your
next work. If you were supplied a more_work_required request in the response above, you MUST honor it."""
async def prepare_prompt(self, request) -> str:
"""Not used - workflow tools use execute_workflow()."""
return "" # Workflow tools use execute_workflow() directly

View File

@@ -1,7 +1,19 @@
"""
ThinkDeep tool - Extended reasoning and problem-solving
ThinkDeep Workflow Tool - Extended Reasoning with Systematic Investigation
This tool provides step-by-step deep thinking capabilities using a systematic workflow approach.
It enables comprehensive analysis of complex problems with expert validation at completion.
Key Features:
- Systematic step-by-step thinking process
- Multi-step analysis with evidence gathering
- Confidence-based investigation flow
- Expert analysis integration with external models
- Support for focused analysis areas (architecture, performance, security, etc.)
- Confidence-based workflow optimization
"""
import logging
from typing import TYPE_CHECKING, Any, Optional
from pydantic import Field
@@ -11,224 +23,544 @@ if TYPE_CHECKING:
from config import TEMPERATURE_CREATIVE
from systemprompts import THINKDEEP_PROMPT
from tools.shared.base_models import WorkflowRequest
from .base import BaseTool, ToolRequest
from .workflow.base import WorkflowTool
# Field descriptions to avoid duplication between Pydantic and JSON schema
THINKDEEP_FIELD_DESCRIPTIONS = {
"prompt": (
"MANDATORY: you MUST first think hard and establish a deep understanding of the topic and question by thinking through all "
"relevant details, context, constraints, and implications. Provide your thought-partner all of your current thinking/analysis "
"to extend and validate. Share these extended thoughts and ideas in "
"the prompt so your assistant has comprehensive information to work with for the best analysis."
),
"problem_context": "Provate additional context about the problem or goal. Be as expressive as possible. More information will "
"be very helpful to your thought-partner.",
"focus_areas": "Specific aspects to focus on (architecture, performance, security, etc.)",
"files": "Optional absolute file paths or directories for additional context (must be FULL absolute paths to real files / folders - DO NOT SHORTEN)",
"images": "Optional images for visual analysis - diagrams, charts, system architectures, or any visual information to analyze. "
"(must be FULL absolute paths to real files / folders - DO NOT SHORTEN)",
}
logger = logging.getLogger(__name__)
class ThinkDeepRequest(ToolRequest):
"""Request model for thinkdeep tool"""
class ThinkDeepWorkflowRequest(WorkflowRequest):
"""Request model for thinkdeep workflow tool with comprehensive investigation capabilities"""
prompt: str = Field(..., description=THINKDEEP_FIELD_DESCRIPTIONS["prompt"])
problem_context: Optional[str] = Field(None, description=THINKDEEP_FIELD_DESCRIPTIONS["problem_context"])
focus_areas: Optional[list[str]] = Field(None, description=THINKDEEP_FIELD_DESCRIPTIONS["focus_areas"])
files: Optional[list[str]] = Field(None, description=THINKDEEP_FIELD_DESCRIPTIONS["files"])
images: Optional[list[str]] = Field(None, description=THINKDEEP_FIELD_DESCRIPTIONS["images"])
# Core workflow parameters
step: str = Field(description="Current work step content and findings from your overall work")
step_number: int = Field(description="Current step number in the work sequence (starts at 1)", ge=1)
total_steps: int = Field(description="Estimated total steps needed to complete the work", ge=1)
next_step_required: bool = Field(description="Whether another work step is needed after this one")
findings: str = Field(
description="Summarize everything discovered in this step about the problem/goal. Include new insights, "
"connections made, implications considered, alternative approaches, potential issues identified, "
"and evidence from thinking. Be specific and avoid vague language—document what you now know "
"and how it affects your hypothesis or understanding. IMPORTANT: If you find compelling evidence "
"that contradicts earlier assumptions, document this clearly. In later steps, confirm or update "
"past findings with additional reasoning."
)
# Investigation tracking
files_checked: list[str] = Field(
default_factory=list,
description="List all files (as absolute paths) examined during the investigation so far. "
"Include even files ruled out or found unrelated, as this tracks your exploration path.",
)
relevant_files: list[str] = Field(
default_factory=list,
description="Subset of files_checked (as full absolute paths) that contain information directly "
"relevant to the problem or goal. Only list those directly tied to the root cause, "
"solution, or key insights. This could include the source of the issue, documentation "
"that explains the expected behavior, configuration files that affect the outcome, or "
"examples that illustrate the concept being analyzed.",
)
relevant_context: list[str] = Field(
default_factory=list,
description="Key concepts, methods, or principles that are central to the thinking analysis, "
"in the format 'concept_name' or 'ClassName.methodName'. Focus on those that drive "
"the core insights, represent critical decision points, or define the scope of the analysis.",
)
hypothesis: Optional[str] = Field(
default=None,
description="Current theory or understanding about the problem/goal based on evidence gathered. "
"This should be a concrete theory that can be validated or refined through further analysis. "
"You are encouraged to revise or abandon hypotheses in later steps based on new evidence.",
)
# Analysis metadata
issues_found: list[dict] = Field(
default_factory=list,
description="Issues identified during work with severity levels - each as a dict with "
"'severity' (critical, high, medium, low) and 'description' fields.",
)
confidence: str = Field(
default="low",
description="Indicate your current confidence in the analysis. Use: 'exploring' (starting analysis), "
"'low' (early thinking), 'medium' (some insights gained), 'high' (strong understanding), "
"'certain' (only when the analysis is complete and conclusions are definitive). "
"Do NOT use 'certain' unless the thinking is comprehensively complete, use 'high' instead when in doubt. "
"Using 'certain' prevents additional expert analysis to save time and money.",
)
# Advanced workflow features
backtrack_from_step: Optional[int] = Field(
default=None,
description="If an earlier finding or hypothesis needs to be revised or discarded, "
"specify the step number from which to start over. Use this to acknowledge analytical "
"dead ends and correct the course.",
ge=1,
)
# Expert analysis configuration - keep these fields available for configuring the final assistant model
# in expert analysis (commented out exclude=True)
temperature: Optional[float] = Field(
default=None,
description="Temperature for creative thinking (0-1, default 0.7)",
ge=0.0,
le=1.0,
# exclude=True # Excluded from MCP schema but available for internal use
)
thinking_mode: Optional[str] = Field(
default=None,
description="Thinking depth: minimal (0.5% of model max), low (8%), medium (33%), high (67%), max (100% of model max). Defaults to 'high' if not specified.",
# exclude=True # Excluded from MCP schema but available for internal use
)
use_websearch: Optional[bool] = Field(
default=None,
description="Enable web search for documentation, best practices, and current information. Particularly useful for: brainstorming sessions, architectural design discussions, exploring industry best practices, working with specific frameworks/technologies, researching solutions to complex problems, or when current documentation and community insights would enhance the analysis.",
# exclude=True # Excluded from MCP schema but available for internal use
)
# Context files and investigation scope
problem_context: Optional[str] = Field(
default=None,
description="Provide additional context about the problem or goal. Be as expressive as possible. More information will be very helpful for the analysis.",
)
focus_areas: Optional[list[str]] = Field(
default=None,
description="Specific aspects to focus on (architecture, performance, security, etc.)",
)
class ThinkDeepTool(BaseTool):
"""Extended thinking and reasoning tool"""
class ThinkDeepTool(WorkflowTool):
"""
ThinkDeep Workflow Tool - Systematic Deep Thinking Analysis
Provides comprehensive step-by-step thinking capabilities with expert validation.
Uses workflow architecture for systematic investigation and analysis.
"""
name = "thinkdeep"
description = (
"EXTENDED THINKING & REASONING - Your deep thinking partner for complex problems. "
"Use this when you need to think deeper about a problem, extend your analysis, explore alternatives, "
"or validate approaches. Perfect for: architecture decisions, complex bugs, performance challenges, "
"security analysis. I'll challenge assumptions, find edge cases, and provide alternative solutions. "
"IMPORTANT: Choose the appropriate thinking_mode based on task complexity - 'low' for quick analysis, "
"'medium' for standard problems, 'high' for complex issues (default), 'max' for extremely complex "
"challenges requiring deepest analysis. When in doubt, err on the side of a higher mode for truly "
"deep thought and evaluation. Note: If you're not currently using a top-tier model such as Opus 4 or above, "
"these tools can provide enhanced capabilities."
)
def __init__(self):
"""Initialize the ThinkDeep workflow tool"""
super().__init__()
# Storage for request parameters to use in expert analysis
self.stored_request_params = {}
def get_name(self) -> str:
return "thinkdeep"
"""Return the tool name"""
return self.name
def get_description(self) -> str:
return (
"EXTENDED THINKING & REASONING - Your deep thinking partner for complex problems. "
"Use this when you need to think deeper about a problem, extend your analysis, explore alternatives, or validate approaches. "
"Perfect for: architecture decisions, complex bugs, performance challenges, security analysis. "
"I'll challenge assumptions, find edge cases, and provide alternative solutions. "
"IMPORTANT: Choose the appropriate thinking_mode based on task complexity - "
"'low' for quick analysis, 'medium' for standard problems, 'high' for complex issues (default), "
"'max' for extremely complex challenges requiring deepest analysis. "
"When in doubt, err on the side of a higher mode for truly deep thought and evaluation. "
"Note: If you're not currently using a top-tier model such as Opus 4 or above, these tools can provide enhanced capabilities."
)
def get_input_schema(self) -> dict[str, Any]:
schema = {
"type": "object",
"properties": {
"prompt": {
"type": "string",
"description": THINKDEEP_FIELD_DESCRIPTIONS["prompt"],
},
"model": self.get_model_field_schema(),
"problem_context": {
"type": "string",
"description": THINKDEEP_FIELD_DESCRIPTIONS["problem_context"],
},
"focus_areas": {
"type": "array",
"items": {"type": "string"},
"description": THINKDEEP_FIELD_DESCRIPTIONS["focus_areas"],
},
"files": {
"type": "array",
"items": {"type": "string"},
"description": THINKDEEP_FIELD_DESCRIPTIONS["files"],
},
"images": {
"type": "array",
"items": {"type": "string"},
"description": THINKDEEP_FIELD_DESCRIPTIONS["images"],
},
"temperature": {
"type": "number",
"description": "Temperature for creative thinking (0-1, default 0.7)",
"minimum": 0,
"maximum": 1,
},
"thinking_mode": {
"type": "string",
"enum": ["minimal", "low", "medium", "high", "max"],
"description": f"Thinking depth: minimal (0.5% of model max), low (8%), medium (33%), high (67%), max (100% of model max). Defaults to '{self.get_default_thinking_mode()}' if not specified.",
},
"use_websearch": {
"type": "boolean",
"description": "Enable web search for documentation, best practices, and current information. Particularly useful for: brainstorming sessions, architectural design discussions, exploring industry best practices, working with specific frameworks/technologies, researching solutions to complex problems, or when current documentation and community insights would enhance the analysis.",
"default": True,
},
"continuation_id": {
"type": "string",
"description": "Thread continuation ID for multi-turn conversations. Can be used to continue conversations across different tools. Only provide this if continuing a previous conversation thread.",
},
},
"required": ["prompt"] + (["model"] if self.is_effective_auto_mode() else []),
}
return schema
def get_system_prompt(self) -> str:
return THINKDEEP_PROMPT
def get_default_temperature(self) -> float:
return TEMPERATURE_CREATIVE
def get_default_thinking_mode(self) -> str:
"""ThinkDeep uses configurable thinking mode, defaults to high"""
from config import DEFAULT_THINKING_MODE_THINKDEEP
return DEFAULT_THINKING_MODE_THINKDEEP
"""Return the tool description"""
return self.description
def get_model_category(self) -> "ToolModelCategory":
"""ThinkDeep requires extended reasoning capabilities"""
"""Return the model category for this tool"""
from tools.models import ToolModelCategory
return ToolModelCategory.EXTENDED_REASONING
def get_request_model(self):
return ThinkDeepRequest
def get_workflow_request_model(self):
"""Return the workflow request model for this tool"""
return ThinkDeepWorkflowRequest
async def prepare_prompt(self, request: ThinkDeepRequest) -> str:
"""Prepare the full prompt for extended thinking"""
# Check for prompt.txt in files
prompt_content, updated_files = self.handle_prompt_file(request.files)
def get_input_schema(self) -> dict[str, Any]:
"""Generate input schema using WorkflowSchemaBuilder with thinkdeep-specific overrides."""
from .workflow.schema_builders import WorkflowSchemaBuilder
# Use prompt.txt content if available, otherwise use the prompt field
current_analysis = prompt_content if prompt_content else request.prompt
# ThinkDeep workflow-specific field overrides
thinkdeep_field_overrides = {
"problem_context": {
"type": "string",
"description": "Provide additional context about the problem or goal. Be as expressive as possible. More information will be very helpful for the analysis.",
},
"focus_areas": {
"type": "array",
"items": {"type": "string"},
"description": "Specific aspects to focus on (architecture, performance, security, etc.)",
},
}
# Check user input size at MCP transport boundary (before adding internal content)
size_check = self.check_prompt_size(current_analysis)
if size_check:
from tools.models import ToolOutput
raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**size_check).model_dump_json()}")
# Update request files list
if updated_files is not None:
request.files = updated_files
# File size validation happens at MCP boundary in server.py
# Build context parts
context_parts = [f"=== CLAUDE'S CURRENT ANALYSIS ===\n{current_analysis}\n=== END ANALYSIS ==="]
if request.problem_context:
context_parts.append(f"\n=== PROBLEM CONTEXT ===\n{request.problem_context}\n=== END CONTEXT ===")
# Add reference files if provided
if request.files:
# Use centralized file processing logic
continuation_id = getattr(request, "continuation_id", None)
file_content, processed_files = self._prepare_file_content_for_prompt(
request.files, continuation_id, "Reference files"
)
self._actually_processed_files = processed_files
if file_content:
context_parts.append(f"\n=== REFERENCE FILES ===\n{file_content}\n=== END FILES ===")
full_context = "\n".join(context_parts)
# Check token limits
self._validate_token_limit(full_context, "Context")
# Add focus areas instruction if specified
focus_instruction = ""
if request.focus_areas:
areas = ", ".join(request.focus_areas)
focus_instruction = f"\n\nFOCUS AREAS: Please pay special attention to {areas} aspects."
# Add web search instruction if enabled
websearch_instruction = self.get_websearch_instruction(
request.use_websearch,
"""When analyzing complex problems, consider if searches for these would help:
- Current documentation for specific technologies, frameworks, or APIs mentioned
- Known issues, workarounds, or community solutions for similar problems
- Recent updates, deprecations, or best practices that might affect the approach
- Official sources to verify assumptions or clarify technical details""",
# Use WorkflowSchemaBuilder with thinkdeep-specific tool fields
return WorkflowSchemaBuilder.build_schema(
tool_specific_fields=thinkdeep_field_overrides,
model_field_schema=self.get_model_field_schema(),
auto_mode=self.is_effective_auto_mode(),
tool_name=self.get_name(),
)
# Combine system prompt with context
full_prompt = f"""{self.get_system_prompt()}{focus_instruction}{websearch_instruction}
def get_system_prompt(self) -> str:
"""Return the system prompt for this workflow tool"""
return THINKDEEP_PROMPT
{full_context}
def get_default_temperature(self) -> float:
"""Return default temperature for deep thinking"""
return TEMPERATURE_CREATIVE
Please provide deep analysis that extends Claude's thinking with:
1. Alternative approaches and solutions
2. Edge cases and potential failure modes
3. Critical evaluation of assumptions
4. Concrete implementation suggestions
5. Risk assessment and mitigation strategies"""
def get_default_thinking_mode(self) -> str:
"""Return default thinking mode for thinkdeep"""
from config import DEFAULT_THINKING_MODE_THINKDEEP
return full_prompt
return DEFAULT_THINKING_MODE_THINKDEEP
def format_response(self, response: str, request: ThinkDeepRequest, model_info: Optional[dict] = None) -> str:
"""Format the response with clear attribution and critical thinking prompt"""
# Get the friendly model name
model_name = "your fellow developer"
if model_info and model_info.get("model_response"):
model_name = model_info["model_response"].friendly_name or "your fellow developer"
def customize_workflow_response(self, response_data: dict, request, **kwargs) -> dict:
"""
Customize the workflow response for thinkdeep-specific needs
"""
# Store request parameters for later use in expert analysis
self.stored_request_params = {
"temperature": getattr(request, "temperature", None),
"thinking_mode": getattr(request, "thinking_mode", None),
"use_websearch": getattr(request, "use_websearch", None),
}
return f"""{response}
# Add thinking-specific context to response
response_data.update(
{
"thinking_status": {
"current_step": request.step_number,
"total_steps": request.total_steps,
"files_checked": len(request.files_checked),
"relevant_files": len(request.relevant_files),
"thinking_confidence": request.confidence,
"analysis_focus": request.focus_areas or ["general"],
}
}
)
---
# Add thinking_complete field for final steps (test expects this)
if not request.next_step_required:
response_data["thinking_complete"] = True
## Critical Evaluation Required
# Add complete_thinking summary (test expects this)
response_data["complete_thinking"] = {
"steps_completed": len(self.work_history),
"final_confidence": request.confidence,
"relevant_context": list(self.consolidated_findings.relevant_context),
"key_findings": self.consolidated_findings.findings,
"issues_identified": self.consolidated_findings.issues_found,
"files_analyzed": list(self.consolidated_findings.relevant_files),
}
Claude, please critically evaluate {model_name}'s analysis by thinking hard about the following:
# Add thinking-specific completion message based on confidence
if request.confidence == "certain":
response_data["completion_message"] = (
"Deep thinking analysis is complete with high certainty. "
"All aspects have been thoroughly considered and conclusions are definitive."
)
elif not request.next_step_required:
response_data["completion_message"] = (
"Deep thinking analysis phase complete. Expert validation will provide additional insights and recommendations."
)
1. **Technical merit** - Which suggestions are valuable vs. have limitations?
2. **Constraints** - Fit with codebase patterns, performance, security, architecture
3. **Risks** - Hidden complexities, edge cases, potential failure modes
4. **Final recommendation** - Synthesize both perspectives, then ultrathink on your own to explore additional
considerations and arrive at the best technical solution. Feel free to use zen's chat tool for a follow-up discussion
if needed.
return response_data
Remember: Use {model_name}'s insights to enhance, not replace, your analysis."""
def should_skip_expert_analysis(self, request, consolidated_findings) -> bool:
"""
ThinkDeep tool skips expert analysis when Claude has "certain" confidence.
"""
return request.confidence == "certain" and not request.next_step_required
def get_completion_status(self) -> str:
"""ThinkDeep tools use thinking-specific status."""
return "deep_thinking_complete_ready_for_implementation"
def get_completion_data_key(self) -> str:
"""ThinkDeep uses 'complete_thinking' key."""
return "complete_thinking"
def get_final_analysis_from_request(self, request):
"""ThinkDeep tools use 'findings' field."""
return request.findings
def get_skip_expert_analysis_status(self) -> str:
"""Status when skipping expert analysis for certain confidence."""
return "skipped_due_to_certain_thinking_confidence"
def get_skip_reason(self) -> str:
"""Reason for skipping expert analysis."""
return "Claude expressed certain confidence in the deep thinking analysis - no additional validation needed"
def get_completion_message(self) -> str:
"""Message for completion without expert analysis."""
return "Deep thinking analysis complete with certain confidence. Proceed with implementation based on the analysis."
def customize_expert_analysis_prompt(self, base_prompt: str, request, file_content: str = "") -> str:
"""
Customize the expert analysis prompt for deep thinking validation
"""
thinking_context = f"""
DEEP THINKING ANALYSIS VALIDATION
You are reviewing a comprehensive deep thinking analysis completed through systematic investigation.
Your role is to validate the thinking process, identify any gaps, challenge assumptions, and provide
additional insights or alternative perspectives.
ANALYSIS SCOPE:
- Problem Context: {getattr(request, 'problem_context', 'General analysis')}
- Focus Areas: {', '.join(getattr(request, 'focus_areas', ['comprehensive analysis']))}
- Investigation Confidence: {request.confidence}
- Steps Completed: {request.step_number} of {request.total_steps}
THINKING SUMMARY:
{request.findings}
KEY INSIGHTS AND CONTEXT:
{', '.join(request.relevant_context) if request.relevant_context else 'No specific context identified'}
VALIDATION OBJECTIVES:
1. Assess the depth and quality of the thinking process
2. Identify any logical gaps, missing considerations, or flawed assumptions
3. Suggest alternative approaches or perspectives not considered
4. Validate the conclusions and recommendations
5. Provide actionable next steps for implementation
Be thorough but constructive in your analysis. Challenge the thinking where appropriate,
but also acknowledge strong insights and valid conclusions.
"""
if file_content:
thinking_context += f"\n\nFILE CONTEXT:\n{file_content}"
return f"{thinking_context}\n\n{base_prompt}"
def get_expert_analysis_instructions(self) -> str:
"""
Return instructions for expert analysis specific to deep thinking validation
"""
return (
"DEEP THINKING ANALYSIS IS COMPLETE. You MUST now summarize and present ALL thinking insights, "
"alternative approaches considered, risks and trade-offs identified, and final recommendations. "
"Clearly prioritize the top solutions or next steps that emerged from the analysis. "
"Provide concrete, actionable guidance based on the deep thinking—make it easy for the user to "
"understand exactly what to do next and how to implement the best solution."
)
# Override hook methods to use stored request parameters for expert analysis
def get_request_temperature(self, request) -> float:
"""Use stored temperature from initial request."""
if hasattr(self, "stored_request_params") and self.stored_request_params.get("temperature") is not None:
return self.stored_request_params["temperature"]
return super().get_request_temperature(request)
def get_request_thinking_mode(self, request) -> str:
"""Use stored thinking mode from initial request."""
if hasattr(self, "stored_request_params") and self.stored_request_params.get("thinking_mode") is not None:
return self.stored_request_params["thinking_mode"]
return super().get_request_thinking_mode(request)
def get_request_use_websearch(self, request) -> bool:
"""Use stored use_websearch from initial request."""
if hasattr(self, "stored_request_params") and self.stored_request_params.get("use_websearch") is not None:
return self.stored_request_params["use_websearch"]
return super().get_request_use_websearch(request)
def get_required_actions(self, step_number: int, confidence: str, findings: str, total_steps: int) -> list[str]:
"""
Return required actions for the current thinking step.
"""
actions = []
if step_number == 1:
actions.extend(
[
"Begin systematic thinking analysis",
"Identify key aspects and assumptions to explore",
"Establish initial investigation approach",
]
)
elif confidence == "low":
actions.extend(
[
"Continue gathering evidence and insights",
"Test initial hypotheses",
"Explore alternative perspectives",
]
)
elif confidence == "medium":
actions.extend(
[
"Deepen analysis of promising approaches",
"Validate key assumptions",
"Consider implementation challenges",
]
)
elif confidence == "high":
actions.extend(
[
"Synthesize findings into cohesive recommendations",
"Validate conclusions against evidence",
"Prepare for expert analysis",
]
)
else: # certain
actions.append("Analysis complete - ready for implementation")
return actions
def should_call_expert_analysis(self, consolidated_findings, request=None) -> bool:
"""
Determine if expert analysis should be called based on confidence and completion.
"""
if request and hasattr(request, "confidence"):
# Don't call expert analysis if confidence is "certain"
if request.confidence == "certain":
return False
# Call expert analysis if investigation is complete (when next_step_required is False)
if request and hasattr(request, "next_step_required"):
return not request.next_step_required
# Fallback: call expert analysis if we have meaningful findings
return (
len(consolidated_findings.relevant_files) > 0
or len(consolidated_findings.findings) >= 2
or len(consolidated_findings.issues_found) > 0
)
def prepare_expert_analysis_context(self, consolidated_findings) -> str:
"""
Prepare context for expert analysis specific to deep thinking.
"""
context_parts = []
context_parts.append("DEEP THINKING ANALYSIS SUMMARY:")
context_parts.append(f"Steps completed: {len(consolidated_findings.findings)}")
context_parts.append(f"Final confidence: {consolidated_findings.confidence}")
if consolidated_findings.findings:
context_parts.append("\nKEY FINDINGS:")
for i, finding in enumerate(consolidated_findings.findings, 1):
context_parts.append(f"{i}. {finding}")
if consolidated_findings.relevant_context:
context_parts.append(f"\nRELEVANT CONTEXT:\n{', '.join(consolidated_findings.relevant_context)}")
# Get hypothesis from latest hypotheses entry if available
if consolidated_findings.hypotheses:
latest_hypothesis = consolidated_findings.hypotheses[-1].get("hypothesis", "")
if latest_hypothesis:
context_parts.append(f"\nFINAL HYPOTHESIS:\n{latest_hypothesis}")
if consolidated_findings.issues_found:
context_parts.append(f"\nISSUES IDENTIFIED: {len(consolidated_findings.issues_found)} issues")
for issue in consolidated_findings.issues_found:
context_parts.append(
f"- {issue.get('severity', 'unknown')}: {issue.get('description', 'No description')}"
)
return "\n".join(context_parts)
def get_step_guidance_message(self, request) -> str:
"""
Generate guidance for the next step in thinking analysis
"""
if request.next_step_required:
next_step_number = request.step_number + 1
if request.confidence == "certain":
guidance = (
f"Your thinking analysis confidence is CERTAIN. Consider if you truly need step {next_step_number} "
f"or if you should complete the analysis now with expert validation."
)
elif request.confidence == "high":
guidance = (
f"Your thinking analysis confidence is HIGH. For step {next_step_number}, consider: "
f"validation of conclusions, stress-testing assumptions, or exploring edge cases."
)
elif request.confidence == "medium":
guidance = (
f"Your thinking analysis confidence is MEDIUM. For step {next_step_number}, focus on: "
f"deepening insights, exploring alternative approaches, or gathering additional evidence."
)
else: # low or exploring
guidance = (
f"Your thinking analysis confidence is {request.confidence.upper()}. For step {next_step_number}, "
f"continue investigating: gather more evidence, test hypotheses, or explore different angles."
)
# Add specific thinking guidance based on progress
if request.step_number == 1:
guidance += (
" Consider: What are the key assumptions? What evidence supports or contradicts initial theories? "
"What alternative approaches exist?"
)
elif request.step_number >= request.total_steps // 2:
guidance += (
" Consider: Synthesis of findings, validation of conclusions, identification of implementation "
"challenges, and preparation for expert analysis."
)
return guidance
else:
return "Thinking analysis is ready for expert validation and final recommendations."
def format_final_response(self, assistant_response: str, request, **kwargs) -> dict:
"""
Format the final response from the assistant for thinking analysis
"""
response_data = {
"thinking_analysis": assistant_response,
"analysis_metadata": {
"total_steps_completed": request.step_number,
"final_confidence": request.confidence,
"files_analyzed": len(request.relevant_files),
"key_insights": len(request.relevant_context),
"issues_identified": len(request.issues_found),
},
}
# Add completion status
if request.confidence == "certain":
response_data["completion_status"] = "analysis_complete_with_certainty"
else:
response_data["completion_status"] = "analysis_complete_pending_validation"
return response_data
def format_step_response(
self,
assistant_response: str,
request,
status: str = "pause_for_thinkdeep",
continuation_id: Optional[str] = None,
**kwargs,
) -> dict:
"""
Format intermediate step responses for thinking workflow
"""
response_data = super().format_step_response(assistant_response, request, status, continuation_id, **kwargs)
# Add thinking-specific step guidance
step_guidance = self.get_step_guidance_message(request)
response_data["thinking_guidance"] = step_guidance
# Add analysis progress indicators
response_data["analysis_progress"] = {
"step_completed": request.step_number,
"remaining_steps": max(0, request.total_steps - request.step_number),
"confidence_trend": request.confidence,
"investigation_depth": "expanding" if request.next_step_required else "finalizing",
}
return response_data
# Required abstract methods from BaseTool
def get_request_model(self):
"""Return the thinkdeep workflow-specific request model."""
return ThinkDeepWorkflowRequest
async def prepare_prompt(self, request) -> str:
"""Not used - workflow tools use execute_workflow()."""
return "" # Workflow tools use execute_workflow() directly

View File

@@ -0,0 +1,22 @@
"""
Workflow tools for Zen MCP.
Workflow tools follow a multi-step pattern with forced pauses between steps
to encourage thorough investigation and analysis. They inherit from WorkflowTool
which combines BaseTool with BaseWorkflowMixin.
Available workflow tools:
- debug: Systematic investigation and root cause analysis
- planner: Sequential planning (special case - no AI calls)
- analyze: Code analysis workflow
- codereview: Code review workflow
- precommit: Pre-commit validation workflow
- refactor: Refactoring analysis workflow
- thinkdeep: Deep thinking workflow
"""
from .base import WorkflowTool
from .schema_builders import WorkflowSchemaBuilder
from .workflow_mixin import BaseWorkflowMixin
__all__ = ["WorkflowTool", "WorkflowSchemaBuilder", "BaseWorkflowMixin"]

tools/workflow/base.py Normal file (399 lines)
View File

@@ -0,0 +1,399 @@
"""
Base class for workflow MCP tools.
Workflow tools follow a multi-step pattern:
1. Claude calls tool with work step data
2. Tool tracks findings and progress
3. Tool forces Claude to pause and investigate between steps
4. Once work is complete, tool calls external AI model for expert analysis
5. Tool returns structured response combining investigation + expert analysis
They combine BaseTool's capabilities with BaseWorkflowMixin's workflow functionality
and use SchemaBuilder for consistent schema generation.
"""
from abc import abstractmethod
from typing import Any, Optional
from tools.shared.base_models import WorkflowRequest
from tools.shared.base_tool import BaseTool
from .schema_builders import WorkflowSchemaBuilder
from .workflow_mixin import BaseWorkflowMixin
class WorkflowTool(BaseTool, BaseWorkflowMixin):
"""
Base class for workflow (multi-step) tools.
Workflow tools perform systematic multi-step work with expert analysis.
They benefit from:
- Automatic workflow orchestration from BaseWorkflowMixin
- Automatic schema generation using SchemaBuilder
- Inherited conversation handling and file processing from BaseTool
- Progress tracking with ConsolidatedFindings
- Expert analysis integration
To create a workflow tool:
1. Inherit from WorkflowTool
2. Tool name is automatically provided by get_name() method
3. Implement get_required_actions() for step guidance
4. Implement should_call_expert_analysis() for completion criteria
5. Implement prepare_expert_analysis_context() for expert prompts
6. Optionally implement get_tool_fields() for additional fields
7. Optionally override workflow behavior methods
Example:
class DebugTool(WorkflowTool):
# get_name() is inherited from BaseTool
def get_tool_fields(self) -> Dict[str, Dict[str, Any]]:
return {
"hypothesis": {
"type": "string",
"description": "Current theory about the issue",
}
}
def get_required_actions(
self, step_number: int, confidence: str, findings: str, total_steps: int
) -> List[str]:
return ["Examine relevant code files", "Trace execution flow", "Check error logs"]
def should_call_expert_analysis(self, consolidated_findings) -> bool:
return len(consolidated_findings.relevant_files) > 0
"""
def __init__(self):
"""Initialize WorkflowTool with proper multiple inheritance."""
BaseTool.__init__(self)
BaseWorkflowMixin.__init__(self)
def get_tool_fields(self) -> dict[str, dict[str, Any]]:
"""
Return tool-specific field definitions beyond the standard workflow fields.
Workflow tools automatically get all standard workflow fields:
- step, step_number, total_steps, next_step_required
- findings, files_checked, relevant_files, relevant_context
- issues_found, confidence, hypothesis, backtrack_from_step
- plus common fields (model, temperature, etc.)
Override this method to add additional tool-specific fields.
Returns:
Dict mapping field names to JSON schema objects
Example:
return {
"severity_filter": {
"type": "string",
"enum": ["low", "medium", "high"],
"description": "Minimum severity level to report",
}
}
"""
return {}
def get_required_fields(self) -> list[str]:
"""
Return additional required fields beyond the standard workflow requirements.
Workflow tools automatically require:
- step, step_number, total_steps, next_step_required, findings
- model (if in auto mode)
Override this to add additional required fields.
Returns:
List of additional required field names
"""
return []
def get_input_schema(self) -> dict[str, Any]:
"""
Generate the complete input schema using SchemaBuilder.
This method automatically combines:
- Standard workflow fields (step, findings, etc.)
- Common fields (temperature, thinking_mode, etc.)
- Model field with proper auto-mode handling
- Tool-specific fields from get_tool_fields()
- Required fields from get_required_fields()
Returns:
Complete JSON schema for the workflow tool
"""
return WorkflowSchemaBuilder.build_schema(
tool_specific_fields=self.get_tool_fields(),
required_fields=self.get_required_fields(),
model_field_schema=self.get_model_field_schema(),
auto_mode=self.is_effective_auto_mode(),
tool_name=self.get_name(),
)
def get_workflow_request_model(self):
"""
Return the workflow request model class.
Workflow tools use WorkflowRequest by default, which includes
all the standard workflow fields. Override this if your tool
needs a custom request model.
"""
return WorkflowRequest
# Implement the abstract method from BaseWorkflowMixin
def get_work_steps(self, request) -> list[str]:
"""
Default implementation - workflow tools typically don't need predefined steps.
The workflow is driven by Claude's investigation process rather than
predefined steps. Override this if your tool needs specific step guidance.
"""
return []
# Default implementations for common workflow patterns
def get_standard_required_actions(self, step_number: int, confidence: str, base_actions: list[str]) -> list[str]:
"""
Helper method to generate standard required actions based on confidence and step.
This provides common patterns that most workflow tools can use:
- Early steps: broad exploration
- Low confidence: deeper investigation
- Medium/high confidence: verification and confirmation
Args:
step_number: Current step number
confidence: Current confidence level
base_actions: Tool-specific base actions
Returns:
List of required actions appropriate for the current state
"""
if step_number == 1:
# Initial investigation
return [
"Search for code related to the reported issue or symptoms",
"Examine relevant files and understand the current implementation",
"Understand the project structure and locate relevant modules",
"Identify how the affected functionality is supposed to work",
]
elif confidence in ["exploring", "low"]:
# Need deeper investigation
return base_actions + [
"Trace method calls and data flow through the system",
"Check for edge cases, boundary conditions, and assumptions in the code",
"Look for related configuration, dependencies, or external factors",
]
elif confidence in ["medium", "high"]:
# Close to solution - need confirmation
return base_actions + [
"Examine the exact code sections where you believe the issue occurs",
"Trace the execution path that leads to the failure",
"Verify your hypothesis with concrete code evidence",
"Check for any similar patterns elsewhere in the codebase",
]
else:
# General continued investigation
return base_actions + [
"Continue examining the code paths identified in your hypothesis",
"Gather more evidence using appropriate investigation tools",
"Test edge cases and boundary conditions",
"Look for patterns that confirm or refute your theory",
]
def should_call_expert_analysis_default(self, consolidated_findings) -> bool:
"""
Default implementation for expert analysis decision.
This provides a reasonable default that most workflow tools can use:
- Call expert analysis if we have relevant files or significant findings
- Skip if confidence is "certain" (handled by the workflow mixin)
Override this for tool-specific logic.
Args:
consolidated_findings: The consolidated findings from all work steps
Returns:
True if expert analysis should be called
"""
# Call expert analysis if we have relevant files or substantial findings
return (
len(consolidated_findings.relevant_files) > 0
or len(consolidated_findings.findings) >= 2
or len(consolidated_findings.issues_found) > 0
)
def prepare_standard_expert_context(
self, consolidated_findings, initial_description: str, context_sections: dict[str, str] = None
) -> str:
"""
Helper method to prepare standard expert analysis context.
This provides a common structure that most workflow tools can use,
with the ability to add tool-specific sections.
Args:
consolidated_findings: The consolidated findings from all work steps
initial_description: Description of the initial request/issue
context_sections: Optional additional sections to include
Returns:
Formatted context string for expert analysis
"""
context_parts = [f"=== ISSUE DESCRIPTION ===\n{initial_description}\n=== END DESCRIPTION ==="]
# Add work progression
if consolidated_findings.findings:
findings_text = "\n".join(consolidated_findings.findings)
context_parts.append(f"\n=== INVESTIGATION FINDINGS ===\n{findings_text}\n=== END FINDINGS ===")
# Add relevant methods if available
if consolidated_findings.relevant_context:
methods_text = "\n".join(f"- {method}" for method in consolidated_findings.relevant_context)
context_parts.append(f"\n=== RELEVANT METHODS/FUNCTIONS ===\n{methods_text}\n=== END METHODS ===")
# Add hypothesis evolution if available
if consolidated_findings.hypotheses:
hypotheses_text = "\n".join(
f"Step {h['step']} ({h['confidence']} confidence): {h['hypothesis']}"
for h in consolidated_findings.hypotheses
)
context_parts.append(f"\n=== HYPOTHESIS EVOLUTION ===\n{hypotheses_text}\n=== END HYPOTHESES ===")
# Add issues found if available
if consolidated_findings.issues_found:
issues_text = "\n".join(
f"[{issue.get('severity', 'unknown').upper()}] {issue.get('description', 'No description')}"
for issue in consolidated_findings.issues_found
)
context_parts.append(f"\n=== ISSUES IDENTIFIED ===\n{issues_text}\n=== END ISSUES ===")
# Add tool-specific sections
if context_sections:
for section_title, section_content in context_sections.items():
context_parts.append(
f"\n=== {section_title.upper()} ===\n{section_content}\n=== END {section_title.upper()} ==="
)
return "\n".join(context_parts)
def handle_completion_without_expert_analysis(
self, request, consolidated_findings, initial_description: str = None
) -> dict[str, Any]:
"""
Generic handler for completion when expert analysis is not needed.
This provides a standard response format for when the tool determines
that external expert analysis is not required. All workflow tools
can use this generic implementation or override for custom behavior.
Args:
request: The workflow request object
consolidated_findings: The consolidated findings from all work steps
initial_description: Optional initial description (defaults to request.step)
Returns:
Dictionary with completion response data
"""
# Prepare work summary using inheritance hook
work_summary = self.prepare_work_summary()
return {
"status": self.get_completion_status(),
self.get_completion_data_key(): {
"initial_request": initial_description or request.step,
"steps_taken": len(consolidated_findings.findings),
"files_examined": list(consolidated_findings.files_checked),
"relevant_files": list(consolidated_findings.relevant_files),
"relevant_context": list(consolidated_findings.relevant_context),
"work_summary": work_summary,
"final_analysis": self.get_final_analysis_from_request(request),
"confidence_level": self.get_confidence_level(request),
},
"next_steps": self.get_completion_message(),
"skip_expert_analysis": True,
"expert_analysis": {
"status": self.get_skip_expert_analysis_status(),
"reason": self.get_skip_reason(),
},
}
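# For orientation (annotation, not part of the commit): for a hypothetical tool named "audit"
# that keeps the default hooks below, this method returns roughly:
#   {
#     "status": "high_confidence_completion",
#     "complete_audit": {"initial_request": ..., "steps_taken": ..., "work_summary": ..., ...},
#     "next_steps": "Audit complete with high confidence. ...",
#     "skip_expert_analysis": True,
#     "expert_analysis": {"status": "skipped_by_tool_design",
#                         "reason": "audit completed with sufficient confidence"},
#   }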
# Inheritance hooks for customization
def prepare_work_summary(self) -> str:
"""
Prepare a summary of the work performed. Override for custom summaries.
Default implementation provides a basic summary.
"""
try:
return self._prepare_work_summary()
except AttributeError:
try:
return f"Completed {len(self.work_history)} work steps"
except AttributeError:
return "Completed 0 work steps"
def get_completion_status(self) -> str:
"""Get the status to use when completing without expert analysis."""
return "high_confidence_completion"
def get_completion_data_key(self) -> str:
"""Get the key name for completion data in the response."""
return f"complete_{self.get_name()}"
def get_final_analysis_from_request(self, request) -> Optional[str]:
"""Extract final analysis from request. Override for tool-specific extraction."""
try:
return request.hypothesis
except AttributeError:
return None
def get_confidence_level(self, request) -> str:
"""Get confidence level from request. Override for tool-specific logic."""
try:
return request.confidence or "high"
except AttributeError:
return "high"
def get_completion_message(self) -> str:
"""Get completion message. Override for tool-specific messaging."""
return (
f"{self.get_name().capitalize()} complete with high confidence. You have identified the exact "
"analysis and solution. MANDATORY: Present the user with the results "
"and proceed with implementing the solution without requiring further "
"consultation. Focus on the precise, actionable steps needed."
)
def get_skip_reason(self) -> str:
"""Get reason for skipping expert analysis. Override for tool-specific reasons."""
return f"{self.get_name()} completed with sufficient confidence"
def get_skip_expert_analysis_status(self) -> str:
"""Get status for skipped expert analysis. Override for tool-specific status."""
return "skipped_by_tool_design"
# Abstract methods that must be implemented by specific workflow tools
# (These are inherited from BaseWorkflowMixin and must be implemented)
@abstractmethod
def get_required_actions(self, step_number: int, confidence: str, findings: str, total_steps: int) -> list[str]:
"""Define required actions for each work phase."""
pass
@abstractmethod
def should_call_expert_analysis(self, consolidated_findings) -> bool:
"""Decide when to call external model based on tool-specific criteria"""
pass
@abstractmethod
def prepare_expert_analysis_context(self, consolidated_findings) -> str:
"""Prepare context for external model call"""
pass
# Default execute method - delegates to workflow
async def execute(self, arguments: dict[str, Any]) -> list:
"""Execute the workflow tool - delegates to BaseWorkflowMixin."""
return await self.execute_workflow(arguments)
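For orientation, a minimal sketch of a concrete tool built on the hooks and abstract methods above. The base class name WorkflowTool, the subclass name, and all strings below are assumptions for illustration; a real tool would also implement the remaining BaseTool/BaseWorkflowMixin surface that is not shown in this excerpt.

class ExampleAuditTool(WorkflowTool):  # hypothetical subclass
    def get_name(self) -> str:
        return "exampleaudit"

    def get_required_actions(self, step_number: int, confidence: str, findings: str, total_steps: int) -> list[str]:
        # Force a broad survey first, then targeted verification in later steps
        if step_number == 1:
            return ["Enumerate the files in scope", "Record initial observations"]
        return ["Re-check each relevant file against the current findings"]

    def should_call_expert_analysis(self, consolidated_findings) -> bool:
        # Only consult the external model when the investigation surfaced relevant files
        return len(consolidated_findings.relevant_files) > 0

    def prepare_expert_analysis_context(self, consolidated_findings) -> str:
        return "\n".join(str(f) for f in consolidated_findings.findings)

    def prepare_work_summary(self) -> str:
        # Optional hook override; the default implementation counts work_history entries
        return f"Audited the codebase in {len(self.work_history)} steps"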

View File

@@ -0,0 +1,173 @@
"""
Schema builders for workflow MCP tools.
This module provides workflow-specific schema generation functionality,
keeping workflow concerns separated from simple tool concerns.
"""
from typing import Any
from ..shared.base_models import WORKFLOW_FIELD_DESCRIPTIONS
from ..shared.schema_builders import SchemaBuilder
class WorkflowSchemaBuilder:
"""
Schema builder for workflow MCP tools.
This class extends the base SchemaBuilder with workflow-specific fields
and schema generation logic, maintaining separation of concerns.
"""
# Workflow-specific field schemas
WORKFLOW_FIELD_SCHEMAS = {
"step": {
"type": "string",
"description": WORKFLOW_FIELD_DESCRIPTIONS["step"],
},
"step_number": {
"type": "integer",
"minimum": 1,
"description": WORKFLOW_FIELD_DESCRIPTIONS["step_number"],
},
"total_steps": {
"type": "integer",
"minimum": 1,
"description": WORKFLOW_FIELD_DESCRIPTIONS["total_steps"],
},
"next_step_required": {
"type": "boolean",
"description": WORKFLOW_FIELD_DESCRIPTIONS["next_step_required"],
},
"findings": {
"type": "string",
"description": WORKFLOW_FIELD_DESCRIPTIONS["findings"],
},
"files_checked": {
"type": "array",
"items": {"type": "string"},
"description": WORKFLOW_FIELD_DESCRIPTIONS["files_checked"],
},
"relevant_files": {
"type": "array",
"items": {"type": "string"},
"description": WORKFLOW_FIELD_DESCRIPTIONS["relevant_files"],
},
"relevant_context": {
"type": "array",
"items": {"type": "string"},
"description": WORKFLOW_FIELD_DESCRIPTIONS["relevant_context"],
},
"issues_found": {
"type": "array",
"items": {"type": "object"},
"description": WORKFLOW_FIELD_DESCRIPTIONS["issues_found"],
},
"confidence": {
"type": "string",
"enum": ["exploring", "low", "medium", "high", "certain"],
"description": WORKFLOW_FIELD_DESCRIPTIONS["confidence"],
},
"hypothesis": {
"type": "string",
"description": WORKFLOW_FIELD_DESCRIPTIONS["hypothesis"],
},
"backtrack_from_step": {
"type": "integer",
"minimum": 1,
"description": WORKFLOW_FIELD_DESCRIPTIONS["backtrack_from_step"],
},
"use_assistant_model": {
"type": "boolean",
"default": True,
"description": WORKFLOW_FIELD_DESCRIPTIONS["use_assistant_model"],
},
}
@staticmethod
def build_schema(
tool_specific_fields: dict[str, dict[str, Any]] = None,
required_fields: list[str] = None,
model_field_schema: dict[str, Any] = None,
auto_mode: bool = False,
tool_name: str = None,
excluded_workflow_fields: list[str] = None,
excluded_common_fields: list[str] = None,
) -> dict[str, Any]:
"""
Build complete schema for workflow tools.
Args:
tool_specific_fields: Additional fields specific to the tool
required_fields: List of required field names (beyond workflow defaults)
model_field_schema: Schema for the model field
auto_mode: Whether the tool is in auto mode (affects model requirement)
tool_name: Name of the tool (for schema title)
excluded_workflow_fields: Workflow fields to exclude from schema (e.g., for planning tools)
excluded_common_fields: Common fields to exclude from schema
Returns:
Complete JSON schema for the workflow tool
"""
properties = {}
# Add workflow fields first, excluding any specified fields
workflow_fields = WorkflowSchemaBuilder.WORKFLOW_FIELD_SCHEMAS.copy()
if excluded_workflow_fields:
for field in excluded_workflow_fields:
workflow_fields.pop(field, None)
properties.update(workflow_fields)
# Add common fields (temperature, thinking_mode, etc.) from base builder, excluding any specified fields
common_fields = SchemaBuilder.COMMON_FIELD_SCHEMAS.copy()
if excluded_common_fields:
for field in excluded_common_fields:
common_fields.pop(field, None)
properties.update(common_fields)
# Add model field if provided
if model_field_schema:
properties["model"] = model_field_schema
# Add tool-specific fields if provided
if tool_specific_fields:
properties.update(tool_specific_fields)
# Build required fields list - workflow tools have standard required fields
standard_required = ["step", "step_number", "total_steps", "next_step_required", "findings"]
# Filter out excluded fields from required fields
if excluded_workflow_fields:
standard_required = [field for field in standard_required if field not in excluded_workflow_fields]
required = standard_required + (required_fields or [])
if auto_mode and "model" not in required:
required.append("model")
# Build the complete schema
schema = {
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": properties,
"required": required,
"additionalProperties": False,
}
if tool_name:
schema["title"] = f"{tool_name.capitalize()}Request"
return schema
@staticmethod
def get_workflow_fields() -> dict[str, dict[str, Any]]:
"""Get the standard field schemas for workflow tools."""
combined = {}
combined.update(WorkflowSchemaBuilder.WORKFLOW_FIELD_SCHEMAS)
combined.update(SchemaBuilder.COMMON_FIELD_SCHEMAS)
return combined
@staticmethod
def get_workflow_only_fields() -> dict[str, dict[str, Any]]:
"""Get only the workflow-specific field schemas."""
return WorkflowSchemaBuilder.WORKFLOW_FIELD_SCHEMAS.copy()
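A sketch of how a workflow tool might request its input schema from the builder; the tool-specific field, the model field schema, and the tool name are hypothetical and only meant to show the call shape.

example_schema = WorkflowSchemaBuilder.build_schema(
    tool_specific_fields={
        "focus_area": {"type": "string", "description": "Optional area to concentrate the analysis on"},
    },
    required_fields=[],  # nothing required beyond the standard workflow fields
    model_field_schema={"type": "string", "description": "Model to use for expert analysis"},
    auto_mode=True,  # forces "model" into the required list
    tool_name="exampleaudit",
)
# example_schema["required"] now holds the five standard workflow fields plus "model".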

File diff suppressed because it is too large