refactor: cleanup and comprehensive documentation

Major changes:
- Add comprehensive documentation to all modules with detailed docstrings
- Remove unused THINKING_MODEL config (use single GEMINI_MODEL with thinking_mode param)
- Remove list_models functionality (simplified to single model configuration)
- Rename DEFAULT_MODEL to GEMINI_MODEL for clarity
- Remove unused python-dotenv dependency
- Fix missing pydantic in setup.py dependencies

Documentation improvements:
- Document security measures in file_utils.py (path validation, sandboxing)
- Add detailed comments to critical logic sections
- Document tool creation process in BaseTool
- Explain configuration values and their impact
- Add comprehensive function-level documentation

Code quality:
- Apply black formatting to all files
- Fix all ruff linting issues
- Update tests to match refactored code
- All 63 tests passing

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-06-09 19:04:24 +04:00
parent fd6e2f9b64
commit 783ba73181
12 changed files with 639 additions and 260 deletions
--- a/utils/file_utils.py
+++ b/utils/file_utils.py
@@ -1,5 +1,21 @@
 """
 File reading utilities with directory support and token management
+
+This module provides secure file access functionality for the MCP server.
+It implements critical security measures to prevent unauthorized file access
+and manages token limits to ensure efficient API usage.
+
+Key Features:
+- Path validation and sandboxing to prevent directory traversal attacks
+- Support for both individual files and recursive directory reading
+- Token counting and management to stay within API limits
+- Automatic file type detection and filtering
+- Comprehensive error handling with informative messages
+
+Security Model:
+- All file access is restricted to PROJECT_ROOT and its subdirectories
+- Absolute paths are required to prevent ambiguity
+- Symbolic links are resolved to ensure they stay within bounds
 """

 import os
@@ -10,9 +26,12 @@ from .token_utils import estimate_tokens, MAX_CONTEXT_TOKENS

 # Get project root from environment or use current directory
 # This defines the sandbox directory where file access is allowed
+# Security: All file operations are restricted to this directory and its children
 PROJECT_ROOT = Path(os.environ.get("MCP_PROJECT_ROOT", os.getcwd())).resolve()

-# Security: Prevent running with overly permissive root
+# Critical Security Check: Prevent running with overly permissive root
+# Setting PROJECT_ROOT to "/" would allow access to the entire filesystem,
+# which is a severe security vulnerability
 if str(PROJECT_ROOT) == "/":
    raise RuntimeError(
        "Security Error: MCP_PROJECT_ROOT cannot be set to '/'. "
@@ -20,7 +39,8 @@ if str(PROJECT_ROOT) == "/":
    )


-# Common code file extensions
+# Common code file extensions that are automatically included when processing directories
+# This set can be extended to support additional file types
 CODE_EXTENSIONS = {
    ".py",
    ".js",
@@ -75,11 +95,16 @@ def resolve_and_validate_path(path_str: str) -> Path:
    """
    Validates that a path is absolute and resolves it.

+    This is the primary security function that ensures all file access
+    is properly sandboxed. It enforces two critical security policies:
+    1. All paths must be absolute (no ambiguity)
+    2. All paths must resolve to within PROJECT_ROOT (sandboxing)
+
    Args:
        path_str: Path string (must be absolute)

    Returns:
-        Resolved Path object
+        Resolved Path object that is guaranteed to be within PROJECT_ROOT

    Raises:
        ValueError: If path is not absolute
@@ -88,17 +113,19 @@ def resolve_and_validate_path(path_str: str) -> Path:
    # Create a Path object from the user-provided path
    user_path = Path(path_str)

-    # Require absolute paths
+    # Security Policy 1: Require absolute paths to prevent ambiguity
+    # Relative paths could be interpreted differently depending on working directory
    if not user_path.is_absolute():
        raise ValueError(
            f"Relative paths are not supported. Please provide an absolute path.\n"
            f"Received: {path_str}"
        )

-    # Resolve the absolute path
+    # Resolve the absolute path (follows symlinks, removes .. and .)
    resolved_path = user_path.resolve()

-    # Security check: ensure the resolved path is within PROJECT_ROOT
+    # Security Policy 2: Ensure the resolved path is within PROJECT_ROOT
+    # This prevents directory traversal attacks (e.g., /project/../../../etc/passwd)
    try:
        resolved_path.relative_to(PROJECT_ROOT)
    except ValueError:
@@ -115,12 +142,16 @@ def expand_paths(paths: List[str], extensions: Optional[Set[str]] = None) -> Lis
    """
    Expand paths to individual files, handling both files and directories.

+    This function recursively walks directories to find all matching files.
+    While walking, it skips hidden files, hidden directories (names starting
+    with "."), and __pycache__ to avoid including generated or system files.
+
    Args:
-        paths: List of file or directory paths
-        extensions: Optional set of file extensions to include
+        paths: List of file or directory paths (must be absolute)
+        extensions: Optional set of file extensions to include (defaults to CODE_EXTENSIONS)

    Returns:
-        List of individual file paths
+        List of individual file paths, sorted for consistent ordering
    """
    if extensions is None:
        extensions = CODE_EXTENSIONS
@@ -130,9 +161,10 @@ def expand_paths(paths: List[str], extensions: Optional[Set[str]] = None) -> Lis

    for path in paths:
        try:
+            # Validate each path for security before processing
            path_obj = resolve_and_validate_path(path)
        except (ValueError, PermissionError):
-            # Skip invalid paths
+            # Skip invalid paths silently to allow partial success
            continue

        if not path_obj.exists():
@@ -145,51 +177,61 @@ def expand_paths(paths: List[str], extensions: Optional[Set[str]] = None) -> Lis
                seen.add(str(path_obj))

        elif path_obj.is_dir():
-            # Walk directory recursively
+            # Walk directory recursively to find all files
            for root, dirs, files in os.walk(path_obj):
-                # Skip hidden directories and __pycache__
+                # Filter directories in-place to skip hidden and cache directories
+                # This prevents descending into .git, .venv, __pycache__, etc.
                dirs[:] = [
                    d for d in dirs if not d.startswith(".") and d != "__pycache__"
                ]

                for file in files:
-                    # Skip hidden files
+                    # Skip hidden files (e.g., .DS_Store, .gitignore)
                    if file.startswith("."):
                        continue

                    file_path = Path(root) / file

-                    # Check extension
+                    # Filter by extension (an empty set matches all files)
                    if not extensions or file_path.suffix.lower() in extensions:
                        full_path = str(file_path)
+                        # Use set to prevent duplicates
                        if full_path not in seen:
                            expanded_files.append(full_path)
                            seen.add(full_path)

-    # Sort for consistent ordering
+    # Sort for consistent ordering across different runs
+    # This makes output predictable and easier to debug
    expanded_files.sort()
    return expanded_files


 def read_file_content(file_path: str, max_size: int = 1_000_000) -> Tuple[str, int]:
    """
-    Read a single file and format it for Gemini.
+    Read a single file and format it for inclusion in AI prompts.
+
+    This function handles various error conditions gracefully and always
+    returns formatted content, even for errors. This ensures the AI model
+    gets context about what files were attempted but couldn't be read.

    Args:
        file_path: Path to file (must be absolute)
-        max_size: Maximum file size to read
+        max_size: Maximum file size to read (default 1MB to prevent memory issues)

    Returns:
-        (formatted_content, estimated_tokens)
+        Tuple of (formatted_content, estimated_tokens)
+        Content is wrapped with clear delimiters for AI parsing
    """
    try:
+        # Validate path security before any file operations
        path = resolve_and_validate_path(file_path)
    except (ValueError, PermissionError) as e:
+        # Return error in a format that provides context to the AI
        content = f"\n--- ERROR ACCESSING FILE: {file_path} ---\nError: {str(e)}\n--- END FILE ---\n"
        return content, estimate_tokens(content)

    try:
-        # Check if path exists and is a file
+        # Validate file existence and type
        if not path.exists():
            content = f"\n--- FILE NOT FOUND: {file_path} ---\nError: File does not exist\n--- END FILE ---\n"
            return content, estimate_tokens(content)
@@ -198,17 +240,19 @@ def read_file_content(file_path: str, max_size: int = 1_000_000) -> Tuple[str, i
            content = f"\n--- NOT A FILE: {file_path} ---\nError: Path is not a file\n--- END FILE ---\n"
            return content, estimate_tokens(content)

-        # Check file size
+        # Check file size to prevent memory exhaustion
        file_size = path.stat().st_size
        if file_size > max_size:
            content = f"\n--- FILE TOO LARGE: {file_path} ---\nFile size: {file_size:,} bytes (max: {max_size:,})\n--- END FILE ---\n"
            return content, estimate_tokens(content)

-        # Read the file
+        # Read the file as UTF-8; errors="replace" substitutes U+FFFD for
+        # undecodable bytes, so files that are not valid UTF-8 can still be read
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            file_content = f.read()

-        # Format with clear delimiters for Gemini
+        # Format with clear delimiters that help the AI understand file boundaries
+        # Using consistent markers makes it easier for the model to parse
        formatted = f"\n--- BEGIN FILE: {file_path} ---\n{file_content}\n--- END FILE: {file_path} ---\n"
        return formatted, estimate_tokens(formatted)

@@ -226,14 +270,21 @@ def read_files(
    """
    Read multiple files and optional direct code with smart token management.

+    This function implements intelligent token budgeting to maximize the amount
+    of relevant content that can be included in an AI prompt while staying
+    within token limits. It prioritizes direct code and reads files until
+    the token budget is exhausted.
+
    Args:
-        file_paths: List of file or directory paths
-        code: Optional direct code to include
+        file_paths: List of file or directory paths (absolute paths required)
+        code: Optional direct code to include (prioritized over files)
        max_tokens: Maximum tokens to use (defaults to MAX_CONTEXT_TOKENS)
-        reserve_tokens: Tokens to reserve for prompt and response
+        reserve_tokens: Tokens to reserve for prompt and response (default 50K)

    Returns:
-        (full_content, brief_summary)
+        Tuple of (full_content, brief_summary)
+        - full_content: All file contents formatted for AI consumption
+        - brief_summary: Human-readable summary of what was processed
    """
    if max_tokens is None:
        max_tokens = MAX_CONTEXT_TOKENS
@@ -247,7 +298,8 @@ def read_files(
    files_skipped = []
    dirs_processed = []

-    # First, handle direct code if provided
+    # Priority 1: Handle direct code if provided
+    # Direct code is prioritized because it's explicitly provided by the user
    if code:
        formatted_code = (
            f"\n--- BEGIN DIRECT CODE ---\n{code}\n--- END DIRECT CODE ---\n"
@@ -258,19 +310,23 @@ def read_files(
            content_parts.append(formatted_code)
            total_tokens += code_tokens
            available_tokens -= code_tokens
+            # Create a preview for the summary
            code_preview = code[:50] + "..." if len(code) > 50 else code
            summary_parts.append(f"Direct code: {code_preview}")
        else:
            summary_parts.append("Direct code skipped (too large)")

-    # Expand all paths to get individual files
+    # Priority 2: Process file paths
    if file_paths:
-        # Track which paths are directories
+        # Track which paths are directories for summary
        for path in file_paths:
-            if Path(path).is_dir():
-                dirs_processed.append(path)
+            try:
+                if Path(path).is_dir():
+                    dirs_processed.append(path)
+            except Exception:
+                pass  # Ignore invalid paths

-        # Expand to get all files
+        # Expand directories to get all individual files
        all_files = expand_paths(file_paths)

        if not all_files and file_paths:
@@ -279,7 +335,7 @@ def read_files(
                f"\n--- NO FILES FOUND ---\nProvided paths: {', '.join(file_paths)}\n--- END ---\n"
            )
        else:
-            # Read files up to token limit
+            # Read files sequentially until token limit is reached
            for file_path in all_files:
                if total_tokens >= available_tokens:
                    files_skipped.append(file_path)
@@ -293,9 +349,10 @@ def read_files(
                    total_tokens += file_tokens
                    files_read.append(file_path)
                else:
+                    # File too large for remaining budget
                    files_skipped.append(file_path)

-    # Build summary
+    # Build human-readable summary of what was processed
    if dirs_processed:
        summary_parts.append(f"Processed {len(dirs_processed)} dir(s)")
    if files_read:
@@ -305,11 +362,12 @@ def read_files(
    if total_tokens > 0:
        summary_parts.append(f"~{total_tokens:,} tokens used")

-    # Add skipped files note if any were skipped
+    # Add informative note about skipped files to help users understand
+    # what was omitted and why
    if files_skipped:
        skip_note = "\n\n--- SKIPPED FILES (TOKEN LIMIT) ---\n"
        skip_note += f"Total skipped: {len(files_skipped)}\n"
-        # Show first 10 skipped files
+        # Show first 10 skipped files as examples
        for i, file_path in enumerate(files_skipped[:10]):
            skip_note += f"  - {file_path}\n"
        if len(files_skipped) > 10: