refactor: cleanup and comprehensive documentation
Major changes:
- Add comprehensive documentation to all modules with detailed docstrings
- Remove unused THINKING_MODEL config (use single GEMINI_MODEL with thinking_mode param)
- Remove list_models functionality (simplified to single model configuration)
- Rename DEFAULT_MODEL to GEMINI_MODEL for clarity
- Remove unused python-dotenv dependency
- Fix missing pydantic in setup.py dependencies

Documentation improvements:
- Document security measures in file_utils.py (path validation, sandboxing)
- Add detailed comments to critical logic sections
- Document tool creation process in BaseTool
- Explain configuration values and their impact
- Add comprehensive function-level documentation

Code quality:
- Apply black formatting to all files
- Fix all ruff linting issues
- Update tests to match refactored code
- All 63 tests passing

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,21 @@
|
||||
"""
|
||||
File reading utilities with directory support and token management
|
||||
|
||||
This module provides secure file access functionality for the MCP server.
|
||||
It implements critical security measures to prevent unauthorized file access
|
||||
and manages token limits to ensure efficient API usage.
|
||||
|
||||
Key Features:
|
||||
- Path validation and sandboxing to prevent directory traversal attacks
|
||||
- Support for both individual files and recursive directory reading
|
||||
- Token counting and management to stay within API limits
|
||||
- Automatic file type detection and filtering
|
||||
- Comprehensive error handling with informative messages
|
||||
|
||||
Security Model:
|
||||
- All file access is restricted to PROJECT_ROOT and its subdirectories
|
||||
- Absolute paths are required to prevent ambiguity
|
||||
- Symbolic links are resolved to ensure they stay within bounds
|
||||
"""
|
||||
|
||||
import os
|
||||
@@ -10,9 +26,12 @@ from .token_utils import estimate_tokens, MAX_CONTEXT_TOKENS
|
||||
|
||||
# Get project root from environment or use current directory
|
||||
# This defines the sandbox directory where file access is allowed
|
||||
# Security: All file operations are restricted to this directory and its children
|
||||
PROJECT_ROOT = Path(os.environ.get("MCP_PROJECT_ROOT", os.getcwd())).resolve()
|
||||
|
||||
# Security: Prevent running with overly permissive root
|
||||
# Critical Security Check: Prevent running with overly permissive root
|
||||
# Setting PROJECT_ROOT to "/" would allow access to the entire filesystem,
|
||||
# which is a severe security vulnerability
|
||||
if str(PROJECT_ROOT) == "/":
|
||||
raise RuntimeError(
|
||||
"Security Error: MCP_PROJECT_ROOT cannot be set to '/'. "
|
||||
@@ -20,7 +39,8 @@ if str(PROJECT_ROOT) == "/":
|
||||
)
|
||||
|
||||
|
||||
# Common code file extensions
|
||||
# Common code file extensions that are automatically included when processing directories
|
||||
# This set can be extended to support additional file types
|
||||
CODE_EXTENSIONS = {
|
||||
".py",
|
||||
".js",
|
||||
@@ -75,11 +95,16 @@ def resolve_and_validate_path(path_str: str) -> Path:
|
||||
"""
|
||||
Validates that a path is absolute and resolves it.
|
||||
|
||||
This is the primary security function that ensures all file access
|
||||
is properly sandboxed. It enforces two critical security policies:
|
||||
1. All paths must be absolute (no ambiguity)
|
||||
2. All paths must resolve to within PROJECT_ROOT (sandboxing)
|
||||
|
||||
Args:
|
||||
path_str: Path string (must be absolute)
|
||||
|
||||
Returns:
|
||||
Resolved Path object
|
||||
Resolved Path object that is guaranteed to be within PROJECT_ROOT
|
||||
|
||||
Raises:
|
||||
ValueError: If path is not absolute
|
||||
@@ -88,17 +113,19 @@ def resolve_and_validate_path(path_str: str) -> Path:
|
||||
# Create a Path object from the user-provided path
|
||||
user_path = Path(path_str)
|
||||
|
||||
# Require absolute paths
|
||||
# Security Policy 1: Require absolute paths to prevent ambiguity
|
||||
# Relative paths could be interpreted differently depending on working directory
|
||||
if not user_path.is_absolute():
|
||||
raise ValueError(
|
||||
f"Relative paths are not supported. Please provide an absolute path.\n"
|
||||
f"Received: {path_str}"
|
||||
)
|
||||
|
||||
# Resolve the absolute path
|
||||
# Resolve the absolute path (follows symlinks, removes .. and .)
|
||||
resolved_path = user_path.resolve()
|
||||
|
||||
# Security check: ensure the resolved path is within PROJECT_ROOT
|
||||
# Security Policy 2: Ensure the resolved path is within PROJECT_ROOT
|
||||
# This prevents directory traversal attacks (e.g., /project/../../../etc/passwd)
|
||||
try:
|
||||
resolved_path.relative_to(PROJECT_ROOT)
|
||||
except ValueError:
|
||||
@@ -115,12 +142,16 @@ def expand_paths(paths: List[str], extensions: Optional[Set[str]] = None) -> Lis
|
||||
"""
|
||||
Expand paths to individual files, handling both files and directories.
|
||||
|
||||
This function recursively walks directories to find all matching files.
|
||||
It automatically filters out hidden files and common non-code directories
|
||||
like __pycache__ to avoid including generated or system files.
|
||||
|
||||
Args:
|
||||
paths: List of file or directory paths
|
||||
extensions: Optional set of file extensions to include
|
||||
paths: List of file or directory paths (must be absolute)
|
||||
extensions: Optional set of file extensions to include (defaults to CODE_EXTENSIONS)
|
||||
|
||||
Returns:
|
||||
List of individual file paths
|
||||
List of individual file paths, sorted for consistent ordering
|
||||
"""
|
||||
if extensions is None:
|
||||
extensions = CODE_EXTENSIONS
|
||||
@@ -130,9 +161,10 @@ def expand_paths(paths: List[str], extensions: Optional[Set[str]] = None) -> Lis
|
||||
|
||||
for path in paths:
|
||||
try:
|
||||
# Validate each path for security before processing
|
||||
path_obj = resolve_and_validate_path(path)
|
||||
except (ValueError, PermissionError):
|
||||
# Skip invalid paths
|
||||
# Skip invalid paths silently to allow partial success
|
||||
continue
|
||||
|
||||
if not path_obj.exists():
|
||||
@@ -145,51 +177,61 @@ def expand_paths(paths: List[str], extensions: Optional[Set[str]] = None) -> Lis
|
||||
seen.add(str(path_obj))
|
||||
|
||||
elif path_obj.is_dir():
|
||||
# Walk directory recursively
|
||||
# Walk directory recursively to find all files
|
||||
for root, dirs, files in os.walk(path_obj):
|
||||
# Skip hidden directories and __pycache__
|
||||
# Filter directories in-place to skip hidden and cache directories
|
||||
# This prevents descending into .git, .venv, __pycache__, etc.
|
||||
dirs[:] = [
|
||||
d for d in dirs if not d.startswith(".") and d != "__pycache__"
|
||||
]
|
||||
|
||||
for file in files:
|
||||
# Skip hidden files
|
||||
# Skip hidden files (e.g., .DS_Store, .gitignore)
|
||||
if file.startswith("."):
|
||||
continue
|
||||
|
||||
file_path = Path(root) / file
|
||||
|
||||
# Check extension
|
||||
# Filter by extension if specified
|
||||
if not extensions or file_path.suffix.lower() in extensions:
|
||||
full_path = str(file_path)
|
||||
# Use set to prevent duplicates
|
||||
if full_path not in seen:
|
||||
expanded_files.append(full_path)
|
||||
seen.add(full_path)
|
||||
|
||||
# Sort for consistent ordering
|
||||
# Sort for consistent ordering across different runs
|
||||
# This makes output predictable and easier to debug
|
||||
expanded_files.sort()
|
||||
return expanded_files
|
||||
|
||||
|
||||
def read_file_content(file_path: str, max_size: int = 1_000_000) -> Tuple[str, int]:
|
||||
"""
|
||||
Read a single file and format it for Gemini.
|
||||
Read a single file and format it for inclusion in AI prompts.
|
||||
|
||||
This function handles various error conditions gracefully and always
|
||||
returns formatted content, even for errors. This ensures the AI model
|
||||
gets context about what files were attempted but couldn't be read.
|
||||
|
||||
Args:
|
||||
file_path: Path to file (must be absolute)
|
||||
max_size: Maximum file size to read
|
||||
max_size: Maximum file size to read (default 1MB to prevent memory issues)
|
||||
|
||||
Returns:
|
||||
(formatted_content, estimated_tokens)
|
||||
Tuple of (formatted_content, estimated_tokens)
|
||||
Content is wrapped with clear delimiters for AI parsing
|
||||
"""
|
||||
try:
|
||||
# Validate path security before any file operations
|
||||
path = resolve_and_validate_path(file_path)
|
||||
except (ValueError, PermissionError) as e:
|
||||
# Return error in a format that provides context to the AI
|
||||
content = f"\n--- ERROR ACCESSING FILE: {file_path} ---\nError: {str(e)}\n--- END FILE ---\n"
|
||||
return content, estimate_tokens(content)
|
||||
|
||||
try:
|
||||
# Check if path exists and is a file
|
||||
# Validate file existence and type
|
||||
if not path.exists():
|
||||
content = f"\n--- FILE NOT FOUND: {file_path} ---\nError: File does not exist\n--- END FILE ---\n"
|
||||
return content, estimate_tokens(content)
|
||||
@@ -198,17 +240,19 @@ def read_file_content(file_path: str, max_size: int = 1_000_000) -> Tuple[str, i
|
||||
content = f"\n--- NOT A FILE: {file_path} ---\nError: Path is not a file\n--- END FILE ---\n"
|
||||
return content, estimate_tokens(content)
|
||||
|
||||
# Check file size
|
||||
# Check file size to prevent memory exhaustion
|
||||
file_size = path.stat().st_size
|
||||
if file_size > max_size:
|
||||
content = f"\n--- FILE TOO LARGE: {file_path} ---\nFile size: {file_size:,} bytes (max: {max_size:,})\n--- END FILE ---\n"
|
||||
return content, estimate_tokens(content)
|
||||
|
||||
# Read the file
|
||||
# Read the file with UTF-8 encoding, replacing invalid characters
|
||||
# This ensures we can handle files with mixed encodings
|
||||
with open(path, "r", encoding="utf-8", errors="replace") as f:
|
||||
file_content = f.read()
|
||||
|
||||
# Format with clear delimiters for Gemini
|
||||
# Format with clear delimiters that help the AI understand file boundaries
|
||||
# Using consistent markers makes it easier for the model to parse
|
||||
formatted = f"\n--- BEGIN FILE: {file_path} ---\n{file_content}\n--- END FILE: {file_path} ---\n"
|
||||
return formatted, estimate_tokens(formatted)
|
||||
|
||||
@@ -226,14 +270,21 @@ def read_files(
|
||||
"""
|
||||
Read multiple files and optional direct code with smart token management.
|
||||
|
||||
This function implements intelligent token budgeting to maximize the amount
|
||||
of relevant content that can be included in an AI prompt while staying
|
||||
within token limits. It prioritizes direct code and reads files until
|
||||
the token budget is exhausted.
|
||||
|
||||
Args:
|
||||
file_paths: List of file or directory paths
|
||||
code: Optional direct code to include
|
||||
file_paths: List of file or directory paths (absolute paths required)
|
||||
code: Optional direct code to include (prioritized over files)
|
||||
max_tokens: Maximum tokens to use (defaults to MAX_CONTEXT_TOKENS)
|
||||
reserve_tokens: Tokens to reserve for prompt and response
|
||||
reserve_tokens: Tokens to reserve for prompt and response (default 50K)
|
||||
|
||||
Returns:
|
||||
(full_content, brief_summary)
|
||||
Tuple of (full_content, brief_summary)
|
||||
- full_content: All file contents formatted for AI consumption
|
||||
- brief_summary: Human-readable summary of what was processed
|
||||
"""
|
||||
if max_tokens is None:
|
||||
max_tokens = MAX_CONTEXT_TOKENS
|
||||
@@ -247,7 +298,8 @@ def read_files(
|
||||
files_skipped = []
|
||||
dirs_processed = []
|
||||
|
||||
# First, handle direct code if provided
|
||||
# Priority 1: Handle direct code if provided
|
||||
# Direct code is prioritized because it's explicitly provided by the user
|
||||
if code:
|
||||
formatted_code = (
|
||||
f"\n--- BEGIN DIRECT CODE ---\n{code}\n--- END DIRECT CODE ---\n"
|
||||
@@ -258,19 +310,23 @@ def read_files(
|
||||
content_parts.append(formatted_code)
|
||||
total_tokens += code_tokens
|
||||
available_tokens -= code_tokens
|
||||
# Create a preview for the summary
|
||||
code_preview = code[:50] + "..." if len(code) > 50 else code
|
||||
summary_parts.append(f"Direct code: {code_preview}")
|
||||
else:
|
||||
summary_parts.append("Direct code skipped (too large)")
|
||||
|
||||
# Expand all paths to get individual files
|
||||
# Priority 2: Process file paths
|
||||
if file_paths:
|
||||
# Track which paths are directories
|
||||
# Track which paths are directories for summary
|
||||
for path in file_paths:
|
||||
if Path(path).is_dir():
|
||||
dirs_processed.append(path)
|
||||
try:
|
||||
if Path(path).is_dir():
|
||||
dirs_processed.append(path)
|
||||
except Exception:
|
||||
pass # Ignore invalid paths
|
||||
|
||||
# Expand to get all files
|
||||
# Expand directories to get all individual files
|
||||
all_files = expand_paths(file_paths)
|
||||
|
||||
if not all_files and file_paths:
|
||||
@@ -279,7 +335,7 @@ def read_files(
|
||||
f"\n--- NO FILES FOUND ---\nProvided paths: {', '.join(file_paths)}\n--- END ---\n"
|
||||
)
|
||||
else:
|
||||
# Read files up to token limit
|
||||
# Read files sequentially until token limit is reached
|
||||
for file_path in all_files:
|
||||
if total_tokens >= available_tokens:
|
||||
files_skipped.append(file_path)
|
||||
@@ -293,9 +349,10 @@ def read_files(
|
||||
total_tokens += file_tokens
|
||||
files_read.append(file_path)
|
||||
else:
|
||||
# File too large for remaining budget
|
||||
files_skipped.append(file_path)
|
||||
|
||||
# Build summary
|
||||
# Build human-readable summary of what was processed
|
||||
if dirs_processed:
|
||||
summary_parts.append(f"Processed {len(dirs_processed)} dir(s)")
|
||||
if files_read:
|
||||
@@ -305,11 +362,12 @@ def read_files(
|
||||
if total_tokens > 0:
|
||||
summary_parts.append(f"~{total_tokens:,} tokens used")
|
||||
|
||||
# Add skipped files note if any were skipped
|
||||
# Add informative note about skipped files to help users understand
|
||||
# what was omitted and why
|
||||
if files_skipped:
|
||||
skip_note = "\n\n--- SKIPPED FILES (TOKEN LIMIT) ---\n"
|
||||
skip_note += f"Total skipped: {len(files_skipped)}\n"
|
||||
# Show first 10 skipped files
|
||||
# Show first 10 skipped files as examples
|
||||
for i, file_path in enumerate(files_skipped[:10]):
|
||||
skip_note += f" - {file_path}\n"
|
||||
if len(files_skipped) > 10:
|
||||
|
||||
Reference in New Issue
Block a user