my-pal-mcp-server/tools/precommit.py
Beehive Innovations · 4151c3c3a5 · Migration from Docker to Standalone Python Server (#73)
* Migration from Docker to standalone server

- Added migration handling
- Fixed tests
- Switched to simpler in-memory storage
- Added support for concurrent logging to disk
- Simplified direct connections to localhost

* Migration from Docker/Redis to standalone script

- Updated tests
- Updated run script
- Fixed requirements
- Switched to dotenv for configuration
- Ask once whether the user would like to install the MCP server in Claude Desktop
- Updated docs

* More cleanup; removed remaining references to Docker

* Cleanup

* Comments

* Fixed tests

* Fix GitHub Actions workflow for standalone Python architecture

- Install requirements-dev.txt for pytest and testing dependencies
- Remove Docker setup from simulation tests (now standalone)
- Simplify linting job to use requirements-dev.txt
- Update simulation tests to run directly without Docker

Fixes unit test failures in CI due to missing pytest dependency.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

* Remove simulation tests from GitHub Actions

- Remove simulation-tests job that makes real API calls
- Keep only unit tests (mocked, no API costs) and linting
- Simulation tests should be run manually with real API keys
- Reduces CI costs and complexity

GitHub Actions now only runs:
- Unit tests (569 tests, all mocked)
- Code quality checks (ruff, black)

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

* Fixed tests

* Fixed tests

---------

Co-authored-by: Claude <noreply@anthropic.com>
2025-06-18 23:41:22 +04:00


"""
Tool for pre-commit validation of git changes across multiple repositories.
Design Note - File Content in Multiple Sections:
Files may legitimately appear in both "Git Diffs" and "Additional Context Files" sections:
- Git Diffs: Shows changed lines + limited context (marked with "BEGIN DIFF" / "END DIFF")
- Additional Context: Shows complete file content (marked with "BEGIN FILE" / "END FILE")
This provides comprehensive context for AI analysis - not a duplication bug.
"""
import os
from typing import TYPE_CHECKING, Any, Literal, Optional
from pydantic import Field
if TYPE_CHECKING:
from tools.models import ToolModelCategory
from systemprompts import PRECOMMIT_PROMPT
from utils.git_utils import find_git_repositories, get_git_status, run_git_command
from utils.token_utils import estimate_tokens
from .base import BaseTool, ToolRequest
# Conservative fallback for token limits
DEFAULT_CONTEXT_WINDOW = 200_000
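# 200K tokens is assumed to roughly match the smallest context window among
# commonly used frontier models; actual per-model limits may be larger.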
# Field descriptions to avoid duplication between Pydantic and JSON schema
PRECOMMIT_FIELD_DESCRIPTIONS = {
"path": "Starting absolute path to the directory to search for git repositories (must be FULL absolute paths - DO NOT SHORTEN).",
"prompt": (
"The original user request description for the changes. Provides critical context for the review. "
"MANDATORY: if original request is limited or not available, you MUST study the changes carefully, think deeply "
"about the implementation intent, analyze patterns across all modifications, infer the logic and "
"requirements from the code changes and provide a thorough starting point."
),
"compare_to": (
"Optional: A git ref (branch, tag, commit hash) to compare against. If not provided, reviews local "
"staged and unstaged changes."
),
"include_staged": "Include staged changes in the review. Only applies if 'compare_to' is not set.",
"include_unstaged": "Include uncommitted (unstaged) changes in the review. Only applies if 'compare_to' is not set.",
"focus_on": "Specific aspects to focus on (e.g., 'logic for user authentication', 'database query efficiency').",
"review_type": "Type of review to perform on the changes.",
"severity_filter": "Minimum severity level to report on the changes.",
"max_depth": "Maximum depth to search for nested git repositories to prevent excessive recursion.",
"temperature": "Temperature for the response (0.0 to 1.0). Lower values are more focused and deterministic.",
"thinking_mode": "Thinking depth mode for the assistant.",
"files": (
"Optional files or directories to provide as context (must be FULL absolute paths - DO NOT SHORTEN). "
"These additional files are not part of the changes but provide helpful context like configs, docs, or related code."
),
"images": (
"Optional images showing expected UI changes, design requirements, or visual references for the changes "
"being validated (must be FULL absolute paths - DO NOT SHORTEN). "
),
}
class PrecommitRequest(ToolRequest):
"""Request model for precommit tool"""
path: str = Field(..., description=PRECOMMIT_FIELD_DESCRIPTIONS["path"])
prompt: Optional[str] = Field(None, description=PRECOMMIT_FIELD_DESCRIPTIONS["prompt"])
compare_to: Optional[str] = Field(None, description=PRECOMMIT_FIELD_DESCRIPTIONS["compare_to"])
include_staged: bool = Field(True, description=PRECOMMIT_FIELD_DESCRIPTIONS["include_staged"])
include_unstaged: bool = Field(True, description=PRECOMMIT_FIELD_DESCRIPTIONS["include_unstaged"])
focus_on: Optional[str] = Field(None, description=PRECOMMIT_FIELD_DESCRIPTIONS["focus_on"])
review_type: Literal["full", "security", "performance", "quick"] = Field(
"full", description=PRECOMMIT_FIELD_DESCRIPTIONS["review_type"]
)
severity_filter: Literal["critical", "high", "medium", "low", "all"] = Field(
"all", description=PRECOMMIT_FIELD_DESCRIPTIONS["severity_filter"]
)
max_depth: int = Field(5, description=PRECOMMIT_FIELD_DESCRIPTIONS["max_depth"])
temperature: Optional[float] = Field(
None,
description=PRECOMMIT_FIELD_DESCRIPTIONS["temperature"],
ge=0.0,
le=1.0,
)
thinking_mode: Optional[Literal["minimal", "low", "medium", "high", "max"]] = Field(
None, description=PRECOMMIT_FIELD_DESCRIPTIONS["thinking_mode"]
)
files: Optional[list[str]] = Field(None, description=PRECOMMIT_FIELD_DESCRIPTIONS["files"])
images: Optional[list[str]] = Field(None, description=PRECOMMIT_FIELD_DESCRIPTIONS["images"])
class Precommit(BaseTool):
"""Tool for pre-commit validation of git changes across multiple repositories."""
def get_name(self) -> str:
return "precommit"
def get_description(self) -> str:
return (
"PRECOMMIT VALIDATION FOR GIT CHANGES - ALWAYS use this tool before creating any git commit! "
"Comprehensive pre-commit validation that catches bugs, security issues, incomplete implementations, "
"and ensures changes match the original requirements. Searches all git repositories recursively and "
"provides deep analysis of staged/unstaged changes. Essential for code quality and preventing bugs. "
"Use this before committing, when reviewing changes, checking your changes, validating changes, "
"or when you're about to commit or ready to commit. Claude should proactively suggest using this tool "
"whenever the user mentions committing or when changes are complete. "
"When original request context is unavailable, Claude MUST think deeply about implementation intent, "
"analyze patterns across modifications, infer business logic and requirements from code changes, "
"and provide comprehensive insights about what was accomplished and completion status. "
"Choose thinking_mode based on changeset size: 'low' for small focused changes, "
"'medium' for standard commits (default), 'high' for large feature branches or complex refactoring, "
"'max' for critical releases or when reviewing extensive changes across multiple systems. "
"Note: If you're not currently using a top-tier model such as Opus 4 or above, these tools can provide enhanced capabilities."
)
def get_input_schema(self) -> dict[str, Any]:
schema = {
"type": "object",
"title": "PrecommitRequest",
"description": "Request model for precommit tool",
"properties": {
"path": {
"type": "string",
"description": PRECOMMIT_FIELD_DESCRIPTIONS["path"],
},
"model": self.get_model_field_schema(),
"prompt": {
"type": "string",
"description": PRECOMMIT_FIELD_DESCRIPTIONS["prompt"],
},
"compare_to": {
"type": "string",
"description": PRECOMMIT_FIELD_DESCRIPTIONS["compare_to"],
},
"include_staged": {
"type": "boolean",
"default": True,
"description": PRECOMMIT_FIELD_DESCRIPTIONS["include_staged"],
},
"include_unstaged": {
"type": "boolean",
"default": True,
"description": PRECOMMIT_FIELD_DESCRIPTIONS["include_unstaged"],
},
"focus_on": {
"type": "string",
"description": PRECOMMIT_FIELD_DESCRIPTIONS["focus_on"],
},
"review_type": {
"type": "string",
"enum": ["full", "security", "performance", "quick"],
"default": "full",
"description": PRECOMMIT_FIELD_DESCRIPTIONS["review_type"],
},
"severity_filter": {
"type": "string",
"enum": ["critical", "high", "medium", "low", "all"],
"default": "all",
"description": PRECOMMIT_FIELD_DESCRIPTIONS["severity_filter"],
},
"max_depth": {
"type": "integer",
"default": 5,
"description": PRECOMMIT_FIELD_DESCRIPTIONS["max_depth"],
},
"temperature": {
"type": "number",
"description": PRECOMMIT_FIELD_DESCRIPTIONS["temperature"],
"minimum": 0,
"maximum": 1,
},
"thinking_mode": {
"type": "string",
"enum": ["minimal", "low", "medium", "high", "max"],
"description": PRECOMMIT_FIELD_DESCRIPTIONS["thinking_mode"],
},
"files": {
"type": "array",
"items": {"type": "string"},
"description": PRECOMMIT_FIELD_DESCRIPTIONS["files"],
},
"images": {
"type": "array",
"items": {"type": "string"},
"description": PRECOMMIT_FIELD_DESCRIPTIONS["images"],
},
"use_websearch": {
"type": "boolean",
"description": "Enable web search for documentation, best practices, and current information. Particularly useful for: brainstorming sessions, architectural design discussions, exploring industry best practices, working with specific frameworks/technologies, researching solutions to complex problems, or when current documentation and community insights would enhance the analysis.",
"default": True,
},
"continuation_id": {
"type": "string",
"description": "Thread continuation ID for multi-turn conversations. Can be used to continue conversations across different tools. Only provide this if continuing a previous conversation thread.",
},
},
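# "model" is required only in effective auto mode (presumably when no
# server-wide default model is configured, so the caller must pick one per request).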
"required": ["path"] + (["model"] if self.is_effective_auto_mode() else []),
}
return schema
def get_system_prompt(self) -> str:
return PRECOMMIT_PROMPT
def get_request_model(self):
return PrecommitRequest
def get_default_temperature(self) -> float:
"""Use analytical temperature for code review."""
from config import TEMPERATURE_ANALYTICAL
return TEMPERATURE_ANALYTICAL
def get_model_category(self) -> "ToolModelCategory":
"""Precommit requires thorough analysis and reasoning"""
from tools.models import ToolModelCategory
return ToolModelCategory.EXTENDED_REASONING
async def prepare_prompt(self, request: PrecommitRequest) -> str:
"""Prepare the prompt with git diff information."""
# Check for prompt.txt in files
prompt_content, updated_files = self.handle_prompt_file(request.files)
# If prompt.txt was found, use it as prompt
if prompt_content:
request.prompt = prompt_content
# Update request files list
if updated_files is not None:
request.files = updated_files
# Check user input size at MCP transport boundary (before adding internal content)
user_content = request.prompt if request.prompt else ""
size_check = self.check_prompt_size(user_content)
if size_check:
from tools.models import ToolOutput
raise ValueError(f"MCP_SIZE_CHECK:{ToolOutput(**size_check).model_dump_json()}")
# File size validation happens at MCP boundary in server.py
# Find all git repositories
repositories = find_git_repositories(request.path, request.max_depth)
if not repositories:
return "No git repositories found in the specified path."
# Collect all diffs directly
all_diffs = []
repo_summaries = []
total_tokens = 0
max_tokens = DEFAULT_CONTEXT_WINDOW - 50000 # Reserve tokens for prompt and response
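# NOTE: this budget uses the conservative fallback window rather than the
# active model's real limit; diffs that would overflow it are skipped below
# rather than truncated.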
for repo_path in repositories:
repo_name = os.path.basename(repo_path) or "root"
# Get status information
status = get_git_status(repo_path)
changed_files = []
# Process based on mode
if request.compare_to:
# Validate the ref
is_valid_ref, err_msg = run_git_command(
repo_path,
["rev-parse", "--verify", "--quiet", request.compare_to],
)
if not is_valid_ref:
repo_summaries.append(
{
"path": repo_path,
"error": f"Invalid or unknown git ref '{request.compare_to}': {err_msg}",
"changed_files": 0,
}
)
continue
# Get list of changed files
success, files_output = run_git_command(
repo_path,
["diff", "--name-only", f"{request.compare_to}...HEAD"],
)
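# The triple-dot range "ref...HEAD" diffs from the merge-base of ref and
# HEAD, so only changes made on this branch are reviewed, not unrelated
# commits that landed on ref after the branch point.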
if success and files_output.strip():
changed_files = [f for f in files_output.strip().split("\n") if f]
# Generate per-file diffs
for file_path in changed_files:
success, diff = run_git_command(
repo_path,
[
"diff",
f"{request.compare_to}...HEAD",
"--",
file_path,
],
)
if success and diff.strip():
# Format diff with file header
diff_header = (
f"\n--- BEGIN DIFF: {repo_name} / {file_path} (compare to {request.compare_to}) ---\n"
)
diff_footer = f"\n--- END DIFF: {repo_name} / {file_path} ---\n"
formatted_diff = diff_header + diff + diff_footer
# Check token limit
diff_tokens = estimate_tokens(formatted_diff)
if total_tokens + diff_tokens <= max_tokens:
all_diffs.append(formatted_diff)
total_tokens += diff_tokens
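# Diffs that would exceed the token budget are silently dropped; the file
# still appears in the repository summary, just without its diff body.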
else:
# Handle staged/unstaged/untracked changes
staged_files = []
unstaged_files = []
untracked_files = []
if request.include_staged:
success, files_output = run_git_command(repo_path, ["diff", "--name-only", "--cached"])
if success and files_output.strip():
staged_files = [f for f in files_output.strip().split("\n") if f]
# Generate per-file diffs for staged changes
# Each diff is wrapped with clear markers to distinguish from full file content
for file_path in staged_files:
success, diff = run_git_command(repo_path, ["diff", "--cached", "--", file_path])
if success and diff.strip():
# Use "BEGIN DIFF" markers (distinct from "BEGIN FILE" markers in utils/file_utils.py)
# This allows AI to distinguish between diff context vs complete file content
diff_header = f"\n--- BEGIN DIFF: {repo_name} / {file_path} (staged) ---\n"
diff_footer = f"\n--- END DIFF: {repo_name} / {file_path} ---\n"
formatted_diff = diff_header + diff + diff_footer
# Check token limit
diff_tokens = estimate_tokens(formatted_diff)
if total_tokens + diff_tokens <= max_tokens:
all_diffs.append(formatted_diff)
total_tokens += diff_tokens
if request.include_unstaged:
success, files_output = run_git_command(repo_path, ["diff", "--name-only"])
if success and files_output.strip():
unstaged_files = [f for f in files_output.strip().split("\n") if f]
# Generate per-file diffs for unstaged changes
# Same clear marker pattern as staged changes above
for file_path in unstaged_files:
success, diff = run_git_command(repo_path, ["diff", "--", file_path])
if success and diff.strip():
diff_header = f"\n--- BEGIN DIFF: {repo_name} / {file_path} (unstaged) ---\n"
diff_footer = f"\n--- END DIFF: {repo_name} / {file_path} ---\n"
formatted_diff = diff_header + diff + diff_footer
# Check token limit
diff_tokens = estimate_tokens(formatted_diff)
if total_tokens + diff_tokens <= max_tokens:
all_diffs.append(formatted_diff)
total_tokens += diff_tokens
# Also include untracked files when include_unstaged is True
# Untracked files are new files that haven't been added to git yet
if status["untracked_files"]:
untracked_files = status["untracked_files"]
# For untracked files, show the entire file content as a "new file" diff
for file_path in untracked_files:
file_full_path = os.path.join(repo_path, file_path)
if os.path.exists(file_full_path) and os.path.isfile(file_full_path):
try:
with open(file_full_path, encoding="utf-8", errors="ignore") as f:
file_content = f.read()
# Format as a new file diff
diff_header = (
f"\n--- BEGIN DIFF: {repo_name} / {file_path} (untracked - new file) ---\n"
)
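# Synthesize a minimal pseudo-diff: every line prefixed with "+". This
# intentionally omits the "--- /dev/null" header and "@@" hunk lines that a
# real "git diff" of a new file would include.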
diff_content = f"+++ b/{file_path}\n"
for line in file_content.splitlines():
diff_content += f"+{line}\n"
diff_footer = f"\n--- END DIFF: {repo_name} / {file_path} ---\n"
formatted_diff = diff_header + diff_content + diff_footer
# Check token limit
diff_tokens = estimate_tokens(formatted_diff)
if total_tokens + diff_tokens <= max_tokens:
all_diffs.append(formatted_diff)
total_tokens += diff_tokens
except Exception:
# Skip files that can't be read (binary, permission issues, etc.)
pass
# Combine unique files
changed_files = list(set(staged_files + unstaged_files + untracked_files))
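# set() deduplicates files that are both staged and unstaged, at the cost of
# losing file ordering; only the count and a sample are reported below.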
# Add repository summary
if changed_files:
repo_summaries.append(
{
"path": repo_path,
"branch": status["branch"],
"ahead": status["ahead"],
"behind": status["behind"],
"changed_files": len(changed_files),
"files": changed_files[:20], # First 20 for summary
}
)
if not all_diffs:
return "No pending changes found in any of the git repositories."
# Process context files if provided using standardized file reading
context_files_content = []
context_files_summary = []
context_tokens = 0
if request.files:
remaining_tokens = max_tokens - total_tokens
# Use centralized file handling with filtering for duplicate prevention
file_content, processed_files = self._prepare_file_content_for_prompt(
request.files,
request.continuation_id,
"Context files",
max_tokens=remaining_tokens + 1000, # Add back the reserve that was calculated
reserve_tokens=1000, # Small reserve for formatting
)
self._actually_processed_files = processed_files
if file_content:
context_tokens = estimate_tokens(file_content)
context_files_content = [file_content]
context_files_summary.append(f"✅ Included: {len(request.files)} context files")
else:
context_files_summary.append("WARNING: No context files could be read or files too large")
total_tokens += context_tokens
# Build the final prompt
prompt_parts = []
# Add original request context if provided
if request.prompt:
prompt_parts.append(f"## Original Request\n\n{request.prompt}\n")
# Add review parameters
prompt_parts.append("## Review Parameters\n")
prompt_parts.append(f"- Review Type: {request.review_type}")
prompt_parts.append(f"- Severity Filter: {request.severity_filter}")
if request.focus_on:
prompt_parts.append(f"- Focus Areas: {request.focus_on}")
if request.compare_to:
prompt_parts.append(f"- Comparing Against: {request.compare_to}")
else:
review_scope = []
if request.include_staged:
review_scope.append("staged")
if request.include_unstaged:
review_scope.append("unstaged")
prompt_parts.append(f"- Reviewing: {' and '.join(review_scope)} changes")
# Add repository summary
prompt_parts.append("\n## Repository Changes Summary\n")
prompt_parts.append(f"Found {len(repo_summaries)} repositories with changes:\n")
for idx, summary in enumerate(repo_summaries, 1):
prompt_parts.append(f"\n### Repository {idx}: {summary['path']}")
if "error" in summary:
prompt_parts.append(f"ERROR: {summary['error']}")
else:
prompt_parts.append(f"- Branch: {summary['branch']}")
if summary["ahead"] or summary["behind"]:
prompt_parts.append(f"- Ahead: {summary['ahead']}, Behind: {summary['behind']}")
prompt_parts.append(f"- Changed Files: {summary['changed_files']}")
if summary["files"]:
prompt_parts.append("\nChanged files:")
for file in summary["files"]:
prompt_parts.append(f" - {file}")
if summary["changed_files"] > len(summary["files"]):
prompt_parts.append(f" ... and {summary['changed_files'] - len(summary['files'])} more files")
# Add context files summary if provided
if context_files_summary:
prompt_parts.append("\n## Context Files Summary\n")
for summary_item in context_files_summary:
prompt_parts.append(f"- {summary_item}")
# Add token usage summary
if total_tokens > 0:
prompt_parts.append(f"\nTotal context tokens used: ~{total_tokens:,}")
# Add the diff contents with clear section markers
# Each diff is wrapped with "--- BEGIN DIFF: ... ---" and "--- END DIFF: ... ---"
prompt_parts.append("\n## Git Diffs\n")
if all_diffs:
prompt_parts.extend(all_diffs)
else:
prompt_parts.append("--- NO DIFFS FOUND ---")
# Add context files content if provided
# IMPORTANT: Files may legitimately appear in BOTH sections:
# - Git Diffs: Show only changed lines + limited context (what changed)
# - Additional Context: Show complete file content (full understanding)
# This is intentional design for comprehensive AI analysis, not duplication bug.
# Each file in this section is wrapped with "--- BEGIN FILE: ... ---" and "--- END FILE: ... ---"
if context_files_content:
prompt_parts.append("\n## Additional Context Files")
prompt_parts.append(
"The following files are provided for additional context. They have NOT been modified.\n"
)
prompt_parts.extend(context_files_content)
# Add web search instruction if enabled
websearch_instruction = self.get_websearch_instruction(
request.use_websearch,
"""When validating changes, consider if searches for these would help:
- Best practices for new features or patterns introduced
- Security implications of the changes
- Known issues with libraries or APIs being used
- Migration guides if updating dependencies
- Performance considerations for the implemented approach""",
)
# Add review instructions
prompt_parts.append("\n## Review Instructions\n")
prompt_parts.append(
"Please review these changes according to the system prompt guidelines. "
"Pay special attention to alignment with the original request, completeness of implementation, "
"potential bugs, security issues, and any edge cases not covered."
)
# Add instruction for requesting files if needed
if not request.files:
prompt_parts.append(
"\nIf you need additional context files to properly review these changes "
"(such as configuration files, documentation, or related code), "
"you may request them using the standardized JSON response format."
)
# Combine with system prompt and websearch instruction
full_prompt = f"{self.get_system_prompt()}{websearch_instruction}\n\n" + "\n".join(prompt_parts)
return full_prompt
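# Illustrative usage (hypothetical values; field names are from PrecommitRequest above):
#
#   request = PrecommitRequest(
#       path="/abs/path/to/workspace",
#       prompt="Add retry logic to the HTTP client",
#       include_unstaged=False,   # review staged changes only
#       review_type="security",
#   )
#   prompt = await tool.prepare_prompt(request)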
def format_response(self, response: str, request: PrecommitRequest, model_info: Optional[dict] = None) -> str:
"""Format the response with commit guidance"""
# Base response
formatted_response = response
# Add footer separator
formatted_response += "\n\n---\n\n"
# Add commit status instruction
formatted_response += (
"COMMIT STATUS: You MUST provide a clear summary of ALL issues found to the user. "
"If no critical or high severity issues found, changes are ready for commit. "
"If critical issues are found, you MUST fix them first and then run the precommit tool again "
"to validate the fixes before proceeding. "
"Medium to low severity issues should be addressed but may not block commit. "
"You MUST always CONFIRM with user and show them a CLEAR summary of ALL issues before proceeding with any commit."
)
return formatted_response