Performance improvements when embedding files:

- Exit early at MCP boundary if files won't fit within given context of chosen model
- Encourage claude to re-run with better context
- Check file sizes before embedding
- Drop files from older conversations when building continuations and give priority to newer files
- List and mention excluded files to Claude on return
- Improved tests
- Improved precommit prompt
- Added a new Low severity to precommit
- Improved documentation of file embedding strategy
- Refactor
This commit is contained in:
Fahad
2025-06-16 05:51:52 +04:00
parent 56333cbd86
commit 91077e3810
16 changed files with 1557 additions and 308 deletions

View File

@@ -18,10 +18,12 @@ Security Model:
- Symbolic links are resolved to ensure they stay within bounds
"""
import json
import logging
import os
import time
from pathlib import Path
from typing import Optional
from typing import Callable, Optional
from .file_types import BINARY_EXTENSIONS, CODE_EXTENSIONS, IMAGE_EXTENSIONS, TEXT_EXTENSIONS
from .security_config import CONTAINER_WORKSPACE, EXCLUDED_DIRS, MCP_SIGNATURE_FILES, SECURITY_ROOT, WORKSPACE_ROOT
@@ -689,3 +691,349 @@ def read_files(
result = "\n\n".join(content_parts) if content_parts else ""
logger.debug(f"[FILES] read_files complete: {len(result)} chars, {total_tokens:,} tokens used")
return result
def estimate_file_tokens(file_path: str) -> int:
"""
Estimate tokens for a file using file-type aware ratios.
Args:
file_path: Path to the file
Returns:
Estimated token count for the file
"""
try:
translated_path = translate_path_for_environment(file_path)
if not os.path.exists(translated_path) or not os.path.isfile(translated_path):
return 0
file_size = os.path.getsize(translated_path)
# Get the appropriate ratio for this file type
from .file_types import get_token_estimation_ratio
ratio = get_token_estimation_ratio(file_path)
return int(file_size / ratio)
except Exception:
return 0
def check_files_size_limit(files: list[str], max_tokens: int, threshold_percent: float = 1.0) -> tuple[bool, int, int]:
"""
Check if a list of files would exceed token limits.
Args:
files: List of file paths to check
max_tokens: Maximum allowed tokens
threshold_percent: Percentage of max_tokens to use as threshold (0.0-1.0)
Returns:
Tuple of (within_limit, total_estimated_tokens, file_count)
"""
if not files:
return True, 0, 0
total_estimated_tokens = 0
file_count = 0
threshold = int(max_tokens * threshold_percent)
for file_path in files:
try:
estimated_tokens = estimate_file_tokens(file_path)
total_estimated_tokens += estimated_tokens
if estimated_tokens > 0: # Only count accessible files
file_count += 1
except Exception:
# Skip files that can't be accessed for size check
continue
within_limit = total_estimated_tokens <= threshold
return within_limit, total_estimated_tokens, file_count
class LogTailer:
"""
General-purpose log file tailer with rotation detection.
This class provides a reusable way to monitor log files for new content,
automatically handling log rotation and maintaining position tracking.
"""
def __init__(self, file_path: str, initial_seek_end: bool = True):
"""
Initialize log tailer for a specific file.
Args:
file_path: Path to the log file to monitor
initial_seek_end: If True, start monitoring from end of file
"""
self.file_path = file_path
self.position = 0
self.last_size = 0
self.initial_seek_end = initial_seek_end
# Ensure file exists and initialize position
Path(self.file_path).touch()
if self.initial_seek_end and os.path.exists(self.file_path):
self.last_size = os.path.getsize(self.file_path)
self.position = self.last_size
def read_new_lines(self) -> list[str]:
"""
Read new lines since last call, handling rotation.
Returns:
List of new lines from the file
"""
if not os.path.exists(self.file_path):
return []
try:
current_size = os.path.getsize(self.file_path)
# Check for log rotation (file size decreased)
if current_size < self.last_size:
self.position = 0
self.last_size = current_size
with open(self.file_path, encoding="utf-8", errors="ignore") as f:
f.seek(self.position)
new_lines = f.readlines()
self.position = f.tell()
self.last_size = current_size
# Strip whitespace from each line
return [line.strip() for line in new_lines if line.strip()]
except OSError:
return []
def monitor_continuously(
self,
line_handler: Callable[[str], None],
check_interval: float = 0.5,
stop_condition: Optional[Callable[[], bool]] = None,
):
"""
Monitor file continuously and call handler for each new line.
Args:
line_handler: Function to call for each new line
check_interval: Seconds between file checks
stop_condition: Optional function that returns True to stop monitoring
"""
while True:
try:
if stop_condition and stop_condition():
break
new_lines = self.read_new_lines()
for line in new_lines:
line_handler(line)
time.sleep(check_interval)
except KeyboardInterrupt:
break
except Exception as e:
logger.warning(f"Error monitoring log file {self.file_path}: {e}")
time.sleep(1)
def read_json_file(file_path: str) -> Optional[dict]:
"""
Read and parse a JSON file with proper error handling.
Args:
file_path: Path to the JSON file
Returns:
Parsed JSON data as dict, or None if file doesn't exist or invalid
"""
try:
translated_path = translate_path_for_environment(file_path)
if not os.path.exists(translated_path):
return None
with open(translated_path, encoding="utf-8") as f:
return json.load(f)
except (json.JSONDecodeError, OSError):
return None
def write_json_file(file_path: str, data: dict, indent: int = 2) -> bool:
"""
Write data to a JSON file with proper formatting.
Args:
file_path: Path to write the JSON file
data: Dictionary data to serialize
indent: JSON indentation level
Returns:
True if successful, False otherwise
"""
try:
translated_path = translate_path_for_environment(file_path)
os.makedirs(os.path.dirname(translated_path), exist_ok=True)
with open(translated_path, "w", encoding="utf-8") as f:
json.dump(data, f, indent=indent, ensure_ascii=False)
return True
except (OSError, TypeError):
return False
def get_file_size(file_path: str) -> int:
"""
Get file size in bytes with proper error handling.
Args:
file_path: Path to the file
Returns:
File size in bytes, or 0 if file doesn't exist or error
"""
try:
translated_path = translate_path_for_environment(file_path)
if os.path.exists(translated_path) and os.path.isfile(translated_path):
return os.path.getsize(translated_path)
return 0
except OSError:
return 0
def ensure_directory_exists(file_path: str) -> bool:
"""
Ensure the parent directory of a file path exists.
Args:
file_path: Path to file (directory will be created for parent)
Returns:
True if directory exists or was created, False on error
"""
try:
translated_path = translate_path_for_environment(file_path)
directory = os.path.dirname(translated_path)
if directory:
os.makedirs(directory, exist_ok=True)
return True
except OSError:
return False
def is_text_file(file_path: str) -> bool:
"""
Check if a file is likely a text file based on extension and content.
Args:
file_path: Path to the file
Returns:
True if file appears to be text, False otherwise
"""
from .file_types import is_text_file as check_text_type
return check_text_type(file_path)
def read_file_safely(file_path: str, max_size: int = 10 * 1024 * 1024) -> Optional[str]:
"""
Read a file with size limits and encoding handling.
Args:
file_path: Path to the file
max_size: Maximum file size in bytes (default 10MB)
Returns:
File content as string, or None if file too large or unreadable
"""
try:
translated_path = translate_path_for_environment(file_path)
if not os.path.exists(translated_path) or not os.path.isfile(translated_path):
return None
file_size = os.path.getsize(translated_path)
if file_size > max_size:
return None
with open(translated_path, encoding="utf-8", errors="ignore") as f:
return f.read()
except OSError:
return None
def check_total_file_size(files: list[str], model_name: Optional[str] = None) -> Optional[dict]:
"""
Check if total file sizes would exceed token threshold before embedding.
IMPORTANT: This performs STRICT REJECTION at MCP boundary.
No partial inclusion - either all files fit or request is rejected.
This forces Claude to make better file selection decisions.
Args:
files: List of file paths to check
model_name: Model name for context-aware thresholds, or None for default
Returns:
Dict with MCP_CODE_TOO_LARGE response if too large, None if acceptable
"""
if not files:
return None
# Get model-specific token allocation (dynamic thresholds)
if not model_name:
from config import DEFAULT_MODEL
model_name = DEFAULT_MODEL
# Handle auto mode gracefully
if model_name.lower() == "auto":
from providers.registry import ModelProviderRegistry
model_name = ModelProviderRegistry.get_preferred_fallback_model()
from utils.model_context import ModelContext
model_context = ModelContext(model_name)
token_allocation = model_context.calculate_token_allocation()
# Dynamic threshold based on model capacity
context_window = token_allocation.total_tokens
if context_window >= 1_000_000: # Gemini-class models
threshold_percent = 0.8 # Can be more generous
elif context_window >= 500_000: # Mid-range models
threshold_percent = 0.7 # Moderate
else: # OpenAI-class models (200K)
threshold_percent = 0.6 # Conservative
max_file_tokens = int(token_allocation.file_tokens * threshold_percent)
# Use centralized file size checking (threshold already applied to max_file_tokens)
within_limit, total_estimated_tokens, file_count = check_files_size_limit(files, max_file_tokens)
if not within_limit:
return {
"status": "code_too_large",
"content": (
f"The selected files are too large for analysis "
f"(estimated {total_estimated_tokens:,} tokens, limit {max_file_tokens:,}). "
f"Please select fewer, more specific files that are most relevant "
f"to your question, then invoke the tool again."
),
"content_type": "text",
"metadata": {
"total_estimated_tokens": total_estimated_tokens,
"limit": max_file_tokens,
"file_count": file_count,
"threshold_percent": threshold_percent,
"model_context_window": context_window,
"instructions": "Reduce file selection and try again - all files must fit within budget",
},
}
return None # Proceed with ALL files