diff --git a/tools/analyze.py b/tools/analyze.py
index b4818ff..9493344 100644
--- a/tools/analyze.py
+++ b/tools/analyze.py
@@ -137,7 +137,8 @@ class AnalyzeTool(BaseTool):
 
         # Use centralized file processing logic
         continuation_id = getattr(request, "continuation_id", None)
-        file_content = self._prepare_file_content_for_prompt(request.files, continuation_id, "Files")
+        file_content, processed_files = self._prepare_file_content_for_prompt(request.files, continuation_id, "Files")
+        self._actually_processed_files = processed_files
 
         # Build analysis instructions
         analysis_focus = []
diff --git a/tools/base.py b/tools/base.py
index c4dc731..6cb612f 100644
--- a/tools/base.py
+++ b/tools/base.py
@@ -544,7 +544,7 @@ class BaseTool(ABC):
         reserve_tokens: int = 1_000,
         remaining_budget: Optional[int] = None,
         arguments: Optional[dict] = None,
-    ) -> str:
+    ) -> tuple[str, list[str]]:
         """
         Centralized file processing for tool prompts.
 
@@ -563,10 +563,13 @@
             arguments: Original tool arguments (used to extract _remaining_tokens if available)
 
         Returns:
-            str: Formatted file content string ready for prompt inclusion
+            tuple[str, list[str]]: (formatted_file_content, actually_processed_files)
+                - formatted_file_content: Formatted file content string ready for prompt inclusion
+                - actually_processed_files: List of individual file paths that were actually read and embedded
+                  (directories are expanded to individual files)
         """
         if not request_files:
-            return ""
+            return "", []
 
         # Note: Even if conversation history is already embedded, we still need to process
         # any NEW files that aren't in the conversation history yet. The filter_new_files
@@ -705,6 +708,7 @@
         )
 
         content_parts = []
+        actually_processed_files = []
 
         # Read content of new files only
         if files_to_embed:
@@ -713,6 +717,11 @@
                 f"[FILES] {self.name}: Starting file embedding with token budget {effective_max_tokens + reserve_tokens:,}"
             )
             try:
+                # Before calling read_files, expand directories to get individual file paths
+                from utils.file_utils import expand_paths
+                expanded_files = expand_paths(files_to_embed)
+                logger.debug(f"[FILES] {self.name}: Expanded {len(files_to_embed)} paths to {len(expanded_files)} individual files")
+
                 file_content = read_files(
                     files_to_embed,
                     max_tokens=effective_max_tokens + reserve_tokens,
@@ -721,6 +730,9 @@
                 )
                 self._validate_token_limit(file_content, context_description)
                 content_parts.append(file_content)
+
+                # Track the expanded files as actually processed
+                actually_processed_files.extend(expanded_files)
 
                 # Estimate tokens for debug logging
                 from utils.token_utils import estimate_tokens
@@ -730,6 +742,7 @@
                     f"{self.name} tool successfully embedded {len(files_to_embed)} files ({content_tokens:,} tokens)"
                 )
                 logger.debug(f"[FILES] {self.name}: Successfully embedded files - {content_tokens:,} tokens used")
+                logger.debug(f"[FILES] {self.name}: Actually processed {len(actually_processed_files)} individual files")
             except Exception as e:
                 logger.error(f"{self.name} tool failed to embed files {files_to_embed}: {type(e).__name__}: {e}")
                 logger.debug(f"[FILES] {self.name}: File embedding failed - {type(e).__name__}: {e}")
@@ -759,8 +772,8 @@
             logger.debug(f"[FILES] {self.name}: No skipped files to note")
 
         result = "".join(content_parts) if content_parts else ""
-        logger.debug(f"[FILES] {self.name}: _prepare_file_content_for_prompt returning {len(result)} chars")
-        return result
+        logger.debug(f"[FILES] {self.name}: _prepare_file_content_for_prompt returning {len(result)} chars, {len(actually_processed_files)} processed files")
+        return result, actually_processed_files
 
     def get_websearch_instruction(self, use_websearch: bool, tool_specific: Optional[str] = None) -> str:
         """
@@ -1408,7 +1421,9 @@ When recommending searches, be specific about what information you need and why
             )
 
             # Add this response as the first turn (assistant turn)
-            request_files = getattr(request, "files", []) or []
+            # Use actually processed files from file preparation instead of original request files
+            # This ensures directories are tracked as their individual expanded files
+            request_files = getattr(self, "_actually_processed_files", []) or getattr(request, "files", []) or []
             # Extract model metadata
             model_provider = None
             model_name = None
diff --git a/tools/chat.py b/tools/chat.py
index 0256edc..08c5486 100644
--- a/tools/chat.py
+++ b/tools/chat.py
@@ -124,9 +124,10 @@ class ChatTool(BaseTool):
 
         # Add context files if provided (using centralized file handling with filtering)
         if request.files:
-            file_content = self._prepare_file_content_for_prompt(
+            file_content, processed_files = self._prepare_file_content_for_prompt(
                 request.files, request.continuation_id, "Context files"
             )
+            self._actually_processed_files = processed_files
             if file_content:
                 user_content = f"{user_content}\n\n=== CONTEXT FILES ===\n{file_content}\n=== END CONTEXT ===="
 
diff --git a/tools/codereview.py b/tools/codereview.py
index 26b86cc..b6eabb7 100644
--- a/tools/codereview.py
+++ b/tools/codereview.py
@@ -196,7 +196,8 @@ class CodeReviewTool(BaseTool):
 
         # Use centralized file processing logic
         continuation_id = getattr(request, "continuation_id", None)
-        file_content = self._prepare_file_content_for_prompt(request.files, continuation_id, "Code")
+        file_content, processed_files = self._prepare_file_content_for_prompt(request.files, continuation_id, "Code")
+        self._actually_processed_files = processed_files
 
         # Build customized review instructions based on review type
         review_focus = []
diff --git a/tools/debug.py b/tools/debug.py
index ca569fb..83274ee 100644
--- a/tools/debug.py
+++ b/tools/debug.py
@@ -166,7 +166,8 @@ class DebugIssueTool(BaseTool):
         if request.files:
             # Use centralized file processing logic
             continuation_id = getattr(request, "continuation_id", None)
-            file_content = self._prepare_file_content_for_prompt(request.files, continuation_id, "Code")
+            file_content, processed_files = self._prepare_file_content_for_prompt(request.files, continuation_id, "Code")
+            self._actually_processed_files = processed_files
 
             if file_content:
                 context_parts.append(f"\n=== RELEVANT CODE ===\n{file_content}\n=== END CODE ===")
diff --git a/tools/precommit.py b/tools/precommit.py
index e5a5b0f..787ca16 100644
--- a/tools/precommit.py
+++ b/tools/precommit.py
@@ -408,13 +408,14 @@ class Precommit(BaseTool):
             remaining_tokens = max_tokens - total_tokens
 
             # Use centralized file handling with filtering for duplicate prevention
-            file_content = self._prepare_file_content_for_prompt(
+            file_content, processed_files = self._prepare_file_content_for_prompt(
                 translated_files,
                 request.continuation_id,
                 "Context files",
                 max_tokens=remaining_tokens + 1000,  # Add back the reserve that was calculated
                 reserve_tokens=1000,  # Small reserve for formatting
             )
+            self._actually_processed_files = processed_files
 
             if file_content:
                 context_tokens = estimate_tokens(file_content)
diff --git a/tools/refactor.py b/tools/refactor.py
index 49dd88e..5f36e66 100644
--- a/tools/refactor.py
+++ b/tools/refactor.py
@@ -330,13 +330,14 @@ class RefactorTool(BaseTool):
         # Use standard file content preparation with dynamic token budget and line numbers
         try:
             logger.debug(f"[REFACTOR] Preparing file content for {len(examples_to_process)} style examples")
-            content = self._prepare_file_content_for_prompt(
+            content, processed_files = self._prepare_file_content_for_prompt(
                 examples_to_process,
                 continuation_id,
                 "Style guide examples",
                 max_tokens=style_examples_budget,
                 reserve_tokens=1000,
             )
+            # Store processed files for tracking - style examples are tracked separately from main code files
 
             # Determine how many files were actually included
             if content:
@@ -478,9 +479,10 @@
 
         # Use centralized file processing logic for main code files (with line numbers enabled)
         logger.debug(f"[REFACTOR] Preparing {len(code_files_to_process)} code files for analysis")
-        code_content = self._prepare_file_content_for_prompt(
+        code_content, processed_files = self._prepare_file_content_for_prompt(
             code_files_to_process, continuation_id, "Code to analyze", max_tokens=remaining_tokens, reserve_tokens=2000
         )
+        self._actually_processed_files = processed_files
 
         if code_content:
             from utils.token_utils import estimate_tokens
diff --git a/tools/testgen.py b/tools/testgen.py
index 1f5661d..c1eacfb 100644
--- a/tools/testgen.py
+++ b/tools/testgen.py
@@ -214,13 +214,14 @@ class TestGenTool(BaseTool):
         # Use standard file content preparation with dynamic token budget
         try:
             logger.debug(f"[TESTGEN] Preparing file content for {len(examples_to_process)} test examples")
-            content = self._prepare_file_content_for_prompt(
+            content, processed_files = self._prepare_file_content_for_prompt(
                 examples_to_process,
                 continuation_id,
                 "Test examples",
                 max_tokens=test_examples_budget,
                 reserve_tokens=1000,
             )
+            # Store processed files for tracking - test examples are tracked separately from main code files
 
             # Determine how many files were actually included
             if content:
@@ -358,9 +359,10 @@
 
         # Use centralized file processing logic for main code files (after deduplication)
         logger.debug(f"[TESTGEN] Preparing {len(code_files_to_process)} code files for analysis")
-        code_content = self._prepare_file_content_for_prompt(
+        code_content, processed_files = self._prepare_file_content_for_prompt(
             code_files_to_process, continuation_id, "Code to test", max_tokens=remaining_tokens, reserve_tokens=2000
         )
+        self._actually_processed_files = processed_files
 
         if code_content:
             from utils.token_utils import estimate_tokens
diff --git a/tools/thinkdeep.py b/tools/thinkdeep.py
index 1603dc8..20f3533 100644
--- a/tools/thinkdeep.py
+++ b/tools/thinkdeep.py
@@ -148,7 +148,8 @@ class ThinkDeepTool(BaseTool):
         if request.files:
             # Use centralized file processing logic
             continuation_id = getattr(request, "continuation_id", None)
-            file_content = self._prepare_file_content_for_prompt(request.files, continuation_id, "Reference files")
+            file_content, processed_files = self._prepare_file_content_for_prompt(request.files, continuation_id, "Reference files")
+            self._actually_processed_files = processed_files
 
             if file_content:
                 context_parts.append(f"\n=== REFERENCE FILES ===\n{file_content}\n=== END FILES ===")
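For context, a minimal sketch of the calling convention these changes introduce, as a tool subclass would use it. The ExampleTool class, its prepare_prompt-style hook, and the prompt framing below are illustrative assumptions, not part of the patch; only the tuple return and the _actually_processed_files attribute come from the diff above.

from tools.base import BaseTool


class ExampleTool(BaseTool):
    # Hypothetical tool showing how callers consume the tuple-returning helper.

    def prepare_prompt(self, request) -> str:
        continuation_id = getattr(request, "continuation_id", None)

        # The helper now returns (formatted_content, processed_files); the second
        # element lists the individual files actually read, with any directories
        # in request.files expanded to concrete file paths.
        file_content, processed_files = self._prepare_file_content_for_prompt(
            request.files, continuation_id, "Files"
        )

        # Stash the expanded list so the conversation-memory path in BaseTool
        # records individual files rather than directory paths.
        self._actually_processed_files = processed_files

        return f"=== FILES ===\n{file_content}\n=== END FILES ==="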