From 693f5786acba7b03766dc6ad7e0997d259305692 Mon Sep 17 00:00:00 2001
From: teknium1 <teknium1@gmail.com>
Date: Tue, 17 Mar 2026 02:31:56 -0700
Subject: [PATCH] perf: use ripgrep for file search (200x faster than find)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

search_files(target='files') now uses rg --files -g instead of find.
Ripgrep respects .gitignore, excludes hidden dirs by default, and has
parallel directory traversal — ~200x faster on wide trees (0.14s vs 34s
benchmarked on 164-repo tree).

Falls back to find when rg is unavailable, preserving hidden-dir
exclusion and BSD find compatibility.

Salvaged from PR #1464 by @light-merlin-dark (Merlin) — adapted to
preserve hidden-dir exclusion added since the original PR.
---
 tools/file_operations.py | 72 +++++++++++++++++++++++++++++-----------
 1 file changed, 52 insertions(+), 20 deletions(-)

diff --git a/tools/file_operations.py b/tools/file_operations.py
index 4e35b9ba..7f39a027 100644
--- a/tools/file_operations.py
+++ b/tools/file_operations.py
@@ -867,53 +867,85 @@ class ShellFileOperations(FileOperations):
     
     def _search_files(self, pattern: str, path: str, limit: int, offset: int) -> SearchResult:
         """Search for files by name pattern (glob-like)."""
-        # Check if find is available (not on Windows without Git Bash/WSL)
-        if not self._has_command('find'):
-            return SearchResult(
-                error="File search requires 'find' command. "
-                      "On Windows, use Git Bash, WSL, or install Unix tools."
-            )
-        
         # Auto-prepend **/ for recursive search if not already present
         if not pattern.startswith('**/') and '/' not in pattern:
             search_pattern = pattern
         else:
             search_pattern = pattern.split('/')[-1]
-        
+
+        # Prefer ripgrep: respects .gitignore, excludes hidden dirs by
+        # default, and has parallel directory traversal (~200x faster than
+        # find on wide trees).  Mirrors _search_content which already uses rg.
+        if self._has_command('rg'):
+            return self._search_files_rg(search_pattern, path, limit, offset)
+
+        # Fallback: find (slower, no .gitignore awareness)
+        if not self._has_command('find'):
+            return SearchResult(
+                error="File search requires 'rg' (ripgrep) or 'find'. "
+                      "Install ripgrep for best results: "
+                      "https://github.com/BurntSushi/ripgrep#installation"
+            )
+
         # Exclude hidden directories (matching ripgrep's default behavior).
-        # This prevents the agent from discovering internal cache files
-        # (e.g. .hub/index-cache/) that may contain unvetted content.
         hidden_exclude = "-not -path '*/.*'"
-        
-        # Use find with modification time sorting
-        # -printf '%T@ %p\n' outputs: timestamp path
-        # sort -rn sorts by timestamp descending (newest first)
+
         cmd = f"find {self._escape_shell_arg(path)} {hidden_exclude} -type f -name {self._escape_shell_arg(search_pattern)} " \
-              f"-printf '%T@ %p\\n' 2>/dev/null | sort -rn | tail -n +{offset + 1} | head -n {limit}"
-        
+              f"-printf '%T@ %p\\\\n' 2>/dev/null | sort -rn | tail -n +{offset + 1} | head -n {limit}"
+
         result = self._exec(cmd, timeout=60)
-        
+
         if not result.stdout.strip():
             # Try without -printf (BSD find compatibility -- macOS)
             cmd_simple = f"find {self._escape_shell_arg(path)} {hidden_exclude} -type f -name {self._escape_shell_arg(search_pattern)} " \
                         f"2>/dev/null | head -n {limit + offset} | tail -n +{offset + 1}"
             result = self._exec(cmd_simple, timeout=60)
-        
+
         files = []
         for line in result.stdout.strip().split('\n'):
             if not line:
                 continue
-            # Parse "timestamp path" format
             parts = line.split(' ', 1)
             if len(parts) == 2 and parts[0].replace('.', '').isdigit():
                 files.append(parts[1])
             else:
                 files.append(line)
-        
+
         return SearchResult(
             files=files,
             total_count=len(files)
         )
+
+    def _search_files_rg(self, pattern: str, path: str, limit: int, offset: int) -> SearchResult:
+        """Search for files by name using ripgrep's --files mode.
+
+        rg --files respects .gitignore and excludes hidden directories by
+        default, and uses parallel directory traversal for ~200x speedup
+        over find on wide trees.
+        """
+        # rg --files -g uses glob patterns; wrap bare names so they match
+        # at any depth (equivalent to find -name).
+        if '/' not in pattern and not pattern.startswith('*'):
+            glob_pattern = f"*{pattern}"
+        else:
+            glob_pattern = pattern
+
+        fetch_limit = limit + offset
+        cmd = (
+            f"rg --files -g {self._escape_shell_arg(glob_pattern)} "
+            f"{self._escape_shell_arg(path)} 2>/dev/null "
+            f"| head -n {fetch_limit}"
+        )
+        result = self._exec(cmd, timeout=60)
+
+        all_files = [f for f in result.stdout.strip().split('\n') if f]
+        page = all_files[offset:offset + limit]
+
+        return SearchResult(
+            files=page,
+            total_count=len(all_files),
+            truncated=len(all_files) >= fetch_limit,
+        )
     
     def _search_content(self, pattern: str, path: str, file_glob: Optional[str],
                         limit: int, offset: int, output_mode: str, context: int) -> SearchResult: