Major new addition: refactor tool

Supports decomposing large components and files, finding codesmells, finding modernizing opportunities as well as code organization opportunities. Fix this mega-classes today! Line numbers added to embedded code for better references from model -> claude
2025-06-15 06:00:01 +04:00
parent 70f1356e3e
commit b5004b91fc
28 changed files with 2633 additions and 310 deletions
--- a/utils/file_utils.py
+++ b/utils/file_utils.py
@@ -23,148 +23,12 @@ import os
 from pathlib import Path
 from typing import Optional

+from .file_types import BINARY_EXTENSIONS, CODE_EXTENSIONS, IMAGE_EXTENSIONS, TEXT_EXTENSIONS
+from .security_config import CONTAINER_WORKSPACE, EXCLUDED_DIRS, MCP_SIGNATURE_FILES, SECURITY_ROOT, WORKSPACE_ROOT
 from .token_utils import DEFAULT_CONTEXT_WINDOW, estimate_tokens

 logger = logging.getLogger(__name__)

-# Get workspace root for Docker path translation
-# IMPORTANT: WORKSPACE_ROOT should contain the HOST path (e.g., /Users/john/project)
-# that gets mounted to /workspace in the Docker container. This enables proper
-# path translation between host absolute paths and container workspace paths.
-WORKSPACE_ROOT = os.environ.get("WORKSPACE_ROOT")
-CONTAINER_WORKSPACE = Path("/workspace")
-
-# Dangerous paths that should never be used as WORKSPACE_ROOT
-# These would give overly broad access and pose security risks
-DANGEROUS_WORKSPACE_PATHS = {
-    "/",
-    "/etc",
-    "/usr",
-    "/bin",
-    "/var",
-    "/root",
-    "/home",
-    "/workspace",  # Container path - WORKSPACE_ROOT should be host path
-    "C:\\",
-    "C:\\Windows",
-    "C:\\Program Files",
-    "C:\\Users",
-}
-
-# Validate WORKSPACE_ROOT for security if it's set
-if WORKSPACE_ROOT:
-    # Resolve to canonical path for comparison
-    resolved_workspace = Path(WORKSPACE_ROOT).resolve()
-
-    # Special check for /workspace - common configuration mistake
-    if str(resolved_workspace) == "/workspace":
-        raise RuntimeError(
-            f"Configuration Error: WORKSPACE_ROOT should be set to the HOST path, not the container path. "
-            f"Found: WORKSPACE_ROOT={WORKSPACE_ROOT} "
-            f"Expected: WORKSPACE_ROOT should be set to your host directory path (e.g., $HOME) "
-            f"that contains all files Claude might reference. "
-            f"This path gets mounted to /workspace inside the Docker container."
-        )
-
-    # Check against other dangerous paths
-    if str(resolved_workspace) in DANGEROUS_WORKSPACE_PATHS:
-        raise RuntimeError(
-            f"Security Error: WORKSPACE_ROOT '{WORKSPACE_ROOT}' is set to a dangerous system directory. "
-            f"This would give access to critical system files. "
-            f"Please set WORKSPACE_ROOT to a specific project directory."
-        )
-
-    # Additional check: prevent filesystem root
-    if resolved_workspace.parent == resolved_workspace:
-        raise RuntimeError(
-            f"Security Error: WORKSPACE_ROOT '{WORKSPACE_ROOT}' cannot be the filesystem root. "
-            f"This would give access to the entire filesystem. "
-            f"Please set WORKSPACE_ROOT to a specific project directory."
-        )
-
-# Security boundary
-# In Docker: use /workspace (container directory)
-# In tests/direct mode: use WORKSPACE_ROOT (host directory)
-if CONTAINER_WORKSPACE.exists():
-    # Running in Docker container
-    SECURITY_ROOT = CONTAINER_WORKSPACE
-elif WORKSPACE_ROOT:
-    # Running in tests or direct mode with WORKSPACE_ROOT set
-    SECURITY_ROOT = Path(WORKSPACE_ROOT).resolve()
-else:
-    # Fallback for backward compatibility (should not happen in normal usage)
-    SECURITY_ROOT = Path.home()
-
-
-# Directories to exclude from recursive file search
-# These typically contain generated code, dependencies, or build artifacts
-EXCLUDED_DIRS = {
-    "__pycache__",
-    "node_modules",
-    ".venv",
-    "venv",
-    "env",
-    ".env",
-    ".git",
-    ".svn",
-    ".hg",
-    "build",
-    "dist",
-    "target",
-    ".idea",
-    ".vscode",
-    "__pypackages__",
-    ".mypy_cache",
-    ".pytest_cache",
-    ".tox",
-    "htmlcov",
-    ".coverage",
-    # Additional build and temp directories
-    "out",
-    ".next",
-    ".nuxt",
-    ".cache",
-    ".temp",
-    ".tmp",
-    "bower_components",
-    "vendor",
-    ".sass-cache",
-    ".gradle",
-    ".m2",
-    "coverage",
-    # OS-specific directories
-    ".DS_Store",
-    "Thumbs.db",
-    # Python specific
-    "*.egg-info",
-    ".eggs",
-    "wheels",
-    ".Python",
-    # IDE and editor directories
-    ".sublime",
-    ".atom",
-    ".brackets",
-    "*.swp",
-    "*.swo",
-    "*~",
-    # Documentation build
-    "_build",
-    "site",
-    # Mobile development
-    ".expo",
-    ".flutter",
-}
-
-# MCP signature files - presence of these indicates the MCP's own directory
-# Used to prevent the MCP from scanning its own codebase
-MCP_SIGNATURE_FILES = {
-    "zen_server.py",
-    "server.py",
-    "tools/precommit.py",
-    "utils/file_utils.py",
-    "prompts/tool_prompts.py",
-}
-

 def is_mcp_directory(path: Path) -> bool:
    """
@@ -242,7 +106,7 @@ def is_home_directory_root(path: Path) -> bool:
        # Check if this is exactly the home directory
        if resolved_path == resolved_home:
            logger.warning(
-                f"Attempted to scan user home directory root: {path}. " f"Please specify a subdirectory instead."
+                f"Attempted to scan user home directory root: {path}. Please specify a subdirectory instead."
            )
            return True

@@ -277,56 +141,105 @@ def is_home_directory_root(path: Path) -> bool:
    return False


-# Common code file extensions that are automatically included when processing directories
-# This set can be extended to support additional file types
-CODE_EXTENSIONS = {
-    ".py",
-    ".js",
-    ".ts",
-    ".jsx",
-    ".tsx",
-    ".java",
-    ".cpp",
-    ".c",
-    ".h",
-    ".hpp",
-    ".cs",
-    ".go",
-    ".rs",
-    ".rb",
-    ".php",
-    ".swift",
-    ".kt",
-    ".scala",
-    ".r",
-    ".m",
-    ".mm",
-    ".sql",
-    ".sh",
-    ".bash",
-    ".zsh",
-    ".fish",
-    ".ps1",
-    ".bat",
-    ".cmd",
-    ".yml",
-    ".yaml",
-    ".json",
-    ".xml",
-    ".toml",
-    ".ini",
-    ".cfg",
-    ".conf",
-    ".txt",
-    ".md",
-    ".rst",
-    ".tex",
-    ".html",
-    ".css",
-    ".scss",
-    ".sass",
-    ".less",
-}
+def detect_file_type(file_path: str) -> str:
+    """
+    Detect file type for appropriate processing strategy.
+
+    NOTE: This function is currently not used for line number auto-detection
+    due to backward compatibility requirements. It is intended for future
+    features requiring specific file type handling (e.g., image processing,
+    binary file analysis, or enhanced file filtering).
+
+    Args:
+        file_path: Path to the file to analyze
+
+    Returns:
+        str: "text", "binary", or "image"
+    """
+    path = Path(file_path)
+
+    # Check extension first (fast)
+    extension = path.suffix.lower()
+    if extension in TEXT_EXTENSIONS:
+        return "text"
+    elif extension in IMAGE_EXTENSIONS:
+        return "image"
+    elif extension in BINARY_EXTENSIONS:
+        return "binary"
+
+    # Fallback: check magic bytes for text vs binary
+    # This is helpful for files without extensions or unknown extensions
+    try:
+        with open(path, "rb") as f:
+            chunk = f.read(1024)
+            # Simple heuristic: if we can decode as UTF-8, likely text
+            chunk.decode("utf-8")
+            return "text"
+    except UnicodeDecodeError:
+        return "binary"
+    except (FileNotFoundError, PermissionError) as e:
+        logger.warning(f"Could not access file {file_path} for type detection: {e}")
+        return "unknown"
+
+
+def should_add_line_numbers(file_path: str, include_line_numbers: Optional[bool] = None) -> bool:
+    """
+    Determine if line numbers should be added to a file.
+
+    Args:
+        file_path: Path to the file
+        include_line_numbers: Explicit preference, or None for auto-detection
+
+    Returns:
+        bool: True if line numbers should be added
+    """
+    if include_line_numbers is not None:
+        return include_line_numbers
+
+    # Default: DO NOT add line numbers (backwards compatibility)
+    # Tools that want line numbers must explicitly request them
+    return False
+
+
+def _normalize_line_endings(content: str) -> str:
+    """
+    Normalize line endings for consistent line numbering.
+
+    Args:
+        content: File content with potentially mixed line endings
+
+    Returns:
+        str: Content with normalized LF line endings
+    """
+    # Normalize all line endings to LF for consistent counting
+    return content.replace("\r\n", "\n").replace("\r", "\n")
+
+
+def _add_line_numbers(content: str) -> str:
+    """
+    Add line numbers to text content for precise referencing.
+
+    Args:
+        content: Text content to number
+
+    Returns:
+        str: Content with line numbers in format "  45│ actual code line"
+        Supports files up to 99,999 lines with dynamic width allocation
+    """
+    # Normalize line endings first
+    normalized_content = _normalize_line_endings(content)
+    lines = normalized_content.split("\n")
+
+    # Dynamic width allocation based on total line count
+    # This supports files of any size by computing required width
+    total_lines = len(lines)
+    width = len(str(total_lines))
+    width = max(width, 4)  # Minimum padding for readability
+
+    # Format with dynamic width and clear separator
+    numbered_lines = [f"{i + 1:{width}d}│ {line}" for i, line in enumerate(lines)]
+
+    return "\n".join(numbered_lines)


 def translate_path_for_environment(path_str: str) -> str:
@@ -515,15 +428,13 @@ def expand_paths(paths: list[str], extensions: Optional[set[str]] = None) -> lis

            # Check 2: Prevent scanning user's home directory root
            if is_home_directory_root(path_obj):
-                logger.warning(
-                    f"Skipping home directory root: {path}. " f"Please specify a project subdirectory instead."
-                )
+                logger.warning(f"Skipping home directory root: {path}. Please specify a project subdirectory instead.")
                continue

            # Check 3: Skip if this is the MCP's own directory
            if is_mcp_directory(path_obj):
                logger.info(
-                    f"Skipping MCP server directory: {path}. " f"The MCP server code is excluded from project scans."
+                    f"Skipping MCP server directory: {path}. The MCP server code is excluded from project scans."
                )
                continue

@@ -575,7 +486,9 @@ def expand_paths(paths: list[str], extensions: Optional[set[str]] = None) -> lis
    return expanded_files


-def read_file_content(file_path: str, max_size: int = 1_000_000) -> tuple[str, int]:
+def read_file_content(
+    file_path: str, max_size: int = 1_000_000, *, include_line_numbers: Optional[bool] = None
+) -> tuple[str, int]:
    """
    Read a single file and format it for inclusion in AI prompts.

@@ -586,6 +499,7 @@ def read_file_content(file_path: str, max_size: int = 1_000_000) -> tuple[str, i
    Args:
        file_path: Path to file (must be absolute)
        max_size: Maximum file size to read (default 1MB to prevent memory issues)
+        include_line_numbers: Whether to add line numbers. If None, auto-detects based on file type

    Returns:
        Tuple of (formatted_content, estimated_tokens)
@@ -634,6 +548,10 @@ def read_file_content(file_path: str, max_size: int = 1_000_000) -> tuple[str, i
            content = f"\n--- FILE TOO LARGE: {file_path} ---\nFile size: {file_size:,} bytes (max: {max_size:,})\n--- END FILE ---\n"
            return content, estimate_tokens(content)

+        # Determine if we should add line numbers
+        add_line_numbers = should_add_line_numbers(file_path, include_line_numbers)
+        logger.debug(f"[FILES] Line numbers for {file_path}: {'enabled' if add_line_numbers else 'disabled'}")
+
        # Read the file with UTF-8 encoding, replacing invalid characters
        # This ensures we can handle files with mixed encodings
        logger.debug(f"[FILES] Reading file content for {file_path}")
@@ -642,6 +560,14 @@ def read_file_content(file_path: str, max_size: int = 1_000_000) -> tuple[str, i

        logger.debug(f"[FILES] Successfully read {len(file_content)} characters from {file_path}")

+        # Add line numbers if requested or auto-detected
+        if add_line_numbers:
+            file_content = _add_line_numbers(file_content)
+            logger.debug(f"[FILES] Added line numbers to {file_path}")
+        else:
+            # Still normalize line endings for consistency
+            file_content = _normalize_line_endings(file_content)
+
        # Format with clear delimiters that help the AI understand file boundaries
        # Using consistent markers makes it easier for the model to parse
        # NOTE: These markers ("--- BEGIN FILE: ... ---") are distinct from git diff markers
@@ -665,6 +591,8 @@ def read_files(
    code: Optional[str] = None,
    max_tokens: Optional[int] = None,
    reserve_tokens: int = 50_000,
+    *,
+    include_line_numbers: bool = False,
 ) -> str:
    """
    Read multiple files and optional direct code with smart token management.
@@ -679,6 +607,7 @@ def read_files(
        code: Optional direct code to include (prioritized over files)
        max_tokens: Maximum tokens to use (defaults to DEFAULT_CONTEXT_WINDOW)
        reserve_tokens: Tokens to reserve for prompt and response (default 50K)
+        include_line_numbers: Whether to add line numbers to file content

    Returns:
        str: All file contents formatted for AI consumption
@@ -728,7 +657,7 @@ def read_files(
                    files_skipped.extend(all_files[i:])
                    break

-                file_content, file_tokens = read_file_content(file_path)
+                file_content, file_tokens = read_file_content(file_path, include_line_numbers=include_line_numbers)
                logger.debug(f"[FILES] File {file_path}: {file_tokens:,} tokens")

                # Check if adding this file would exceed limit