From 693f5786acba7b03766dc6ad7e0997d259305692 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Tue, 17 Mar 2026 02:31:56 -0700 Subject: [PATCH] perf: use ripgrep for file search (200x faster than find) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit search_files(target='files') now uses rg --files -g instead of find. Ripgrep respects .gitignore, excludes hidden dirs by default, and has parallel directory traversal — ~200x faster on wide trees (0.14s vs 34s benchmarked on 164-repo tree). Falls back to find when rg is unavailable, preserving hidden-dir exclusion and BSD find compatibility. Salvaged from PR #1464 by @light-merlin-dark (Merlin) — adapted to preserve hidden-dir exclusion added since the original PR. --- tools/file_operations.py | 72 +++++++++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 20 deletions(-) diff --git a/tools/file_operations.py b/tools/file_operations.py index 4e35b9ba..7f39a027 100644 --- a/tools/file_operations.py +++ b/tools/file_operations.py @@ -867,53 +867,85 @@ class ShellFileOperations(FileOperations): def _search_files(self, pattern: str, path: str, limit: int, offset: int) -> SearchResult: """Search for files by name pattern (glob-like).""" - # Check if find is available (not on Windows without Git Bash/WSL) - if not self._has_command('find'): - return SearchResult( - error="File search requires 'find' command. " - "On Windows, use Git Bash, WSL, or install Unix tools." - ) - # Auto-prepend **/ for recursive search if not already present if not pattern.startswith('**/') and '/' not in pattern: search_pattern = pattern else: search_pattern = pattern.split('/')[-1] - + + # Prefer ripgrep: respects .gitignore, excludes hidden dirs by + # default, and has parallel directory traversal (~200x faster than + # find on wide trees). Mirrors _search_content which already uses rg. + if self._has_command('rg'): + return self._search_files_rg(search_pattern, path, limit, offset) + + # Fallback: find (slower, no .gitignore awareness) + if not self._has_command('find'): + return SearchResult( + error="File search requires 'rg' (ripgrep) or 'find'. " + "Install ripgrep for best results: " + "https://github.com/BurntSushi/ripgrep#installation" + ) + # Exclude hidden directories (matching ripgrep's default behavior). - # This prevents the agent from discovering internal cache files - # (e.g. .hub/index-cache/) that may contain unvetted content. hidden_exclude = "-not -path '*/.*'" - - # Use find with modification time sorting - # -printf '%T@ %p\n' outputs: timestamp path - # sort -rn sorts by timestamp descending (newest first) + cmd = f"find {self._escape_shell_arg(path)} {hidden_exclude} -type f -name {self._escape_shell_arg(search_pattern)} " \ - f"-printf '%T@ %p\\n' 2>/dev/null | sort -rn | tail -n +{offset + 1} | head -n {limit}" - + f"-printf '%T@ %p\\\\n' 2>/dev/null | sort -rn | tail -n +{offset + 1} | head -n {limit}" + result = self._exec(cmd, timeout=60) - + if not result.stdout.strip(): # Try without -printf (BSD find compatibility -- macOS) cmd_simple = f"find {self._escape_shell_arg(path)} {hidden_exclude} -type f -name {self._escape_shell_arg(search_pattern)} " \ f"2>/dev/null | head -n {limit + offset} | tail -n +{offset + 1}" result = self._exec(cmd_simple, timeout=60) - + files = [] for line in result.stdout.strip().split('\n'): if not line: continue - # Parse "timestamp path" format parts = line.split(' ', 1) if len(parts) == 2 and parts[0].replace('.', '').isdigit(): files.append(parts[1]) else: files.append(line) - + return SearchResult( files=files, total_count=len(files) ) + + def _search_files_rg(self, pattern: str, path: str, limit: int, offset: int) -> SearchResult: + """Search for files by name using ripgrep's --files mode. + + rg --files respects .gitignore and excludes hidden directories by + default, and uses parallel directory traversal for ~200x speedup + over find on wide trees. + """ + # rg --files -g uses glob patterns; wrap bare names so they match + # at any depth (equivalent to find -name). + if '/' not in pattern and not pattern.startswith('*'): + glob_pattern = f"*{pattern}" + else: + glob_pattern = pattern + + fetch_limit = limit + offset + cmd = ( + f"rg --files -g {self._escape_shell_arg(glob_pattern)} " + f"{self._escape_shell_arg(path)} 2>/dev/null " + f"| head -n {fetch_limit}" + ) + result = self._exec(cmd, timeout=60) + + all_files = [f for f in result.stdout.strip().split('\n') if f] + page = all_files[offset:offset + limit] + + return SearchResult( + files=page, + total_count=len(all_files), + truncated=len(all_files) >= fetch_limit, + ) def _search_content(self, pattern: str, path: str, file_glob: Optional[str], limit: int, offset: int, output_mode: str, context: int) -> SearchResult: