Mountain/Environment/SearchProvider.rs
1//! # SearchProvider (Environment)
2//!
3//! Implements the `SearchProvider` trait using the `grep-searcher` crate
4//! (the ripgrep library) for `MountainEnvironment`.
5//!
6//! ## Search architecture
7//!
8//! The search implementation uses a multi-threaded approach:
9//!
10//! 1. **Pattern compilation** - regex is compiled with case/word/multiline
11//! modifiers; plain-text queries are `regex::escape`d first.
12//! 2. **Parallel walking** - workspace files are walked via
13//! `WalkBuilder::build_parallel()`, respecting `.gitignore` and `.ignore`
14//! files automatically.
15//! 3. **Per-file search** - each file is searched individually using a `Sink`
16//! pattern (`PerFileSink`).
17//! 4. **Result aggregation** - matches are collected in a shared
18//! `Arc<Mutex<Vec<FileMatch>>>`.
19//!
20//! ## Search features
21//!
22//! - **Case sensitivity** - controlled by `isCaseSensitive` option
23//! - **Word matching** - controlled by `isWordMatch` option
24//! - **Regex support** - full regex via `grep-regex`
25//! - **Ignore files** - respects `.gitignore`, `.ignore`, and siblings
26//! - **Memory efficient** - streams results; never loads entire files
27//!
28//! ## Search result format
29//!
30//! Each match includes:
31//! - `resource` - file URI
32//! - `lineNumber` - 1-based line number
33//! - `preview` - matched text line (capped at 512 bytes)
//! - `columns` - per-match `{start, end}` ranges into `preview`, 0-based and
//!   counted in Unicode scalar values (Rust `char`s), not bytes (VS Code's
//!   `ISearchRange` counts UTF-16 code units; the two agree inside the BMP)
36//!
37//! ## VS Code reference
38//!
39//! - `vs/workbench/contrib/search/browser/searchWidget.ts`
40//! - `vs/platform/search/common/search.ts`
41//! - `vs/platform/search/common/fileSearch.ts`
42
43use std::{
44 io,
45 path::PathBuf,
46 sync::{Arc, Mutex},
47};
48
49use CommonLibrary::{Error::CommonError::CommonError, Search::SearchProvider::SearchProvider};
50use async_trait::async_trait;
51use grep_matcher::Matcher;
52use grep_regex::{RegexMatcher, RegexMatcherBuilder};
53use grep_searcher::{Searcher, SearcherBuilder, Sink, SinkMatch};
54use ignore::WalkBuilder;
55use serde::{Deserialize, Serialize};
56use serde_json::{Value, json};
57
58use super::{MountainEnvironment::MountainEnvironment, Utility};
59use crate::dev_log;
60
61// TODO: result pagination, cancellation via CancellationToken, include/exclude
62// patterns, context lines (before/after), file-type filtering, replacement
63// highlighting, progress reporting, multi-folder independent search, caching,
64// regex capture groups, search history, result export, performance metrics,
65// deduplication, glob file matching, result ranking, binary file handling,
66// symlink following, max file size limit, search timeout, hidden files,
67// multi-line regex.
68
/// Mirrors VS Code's `ITextSearchQuery` shape (`vs/workbench/services/
/// search/common/search.ts`). The workbench's Search view serialises
/// the user's input into this struct and the ProxyChannel sends it as
/// slot 0 of the `search:textSearch` call.
///
/// Field names are read from their camelCase spelling. Every flag is an
/// `Option<bool>`, so a flag that is absent from the payload deserialises to
/// `None`; `TextSearch` treats `None` as `false`.
///
/// - `pattern`: the user's typed query (the only required field).
/// - `isRegExp` (default `false`): when `false`, the pattern is
///   `regex::escape`'d before compilation so a literal search for `obj.method(`
///   doesn't blow up the regex parser.
/// - `isCaseSensitive` (default `false`): controls the regex's case-insensitive
///   flag.
/// - `isWordMatch` (default `false`): restricts matches to word boundaries via
///   `RegexMatcherBuilder::word(true)`.
/// - `isMultiline` (default `false`): the workbench's multi-line hint.
///   NOTE(review): the per-file searcher in `TextSearch` is built without
///   multi-line mode, so matches do not span lines regardless of this flag -
///   confirm the intended semantics before relying on it.
#[derive(Deserialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct TextSearchQuery {
	pattern:String,

	#[serde(default)]
	is_case_sensitive:Option<bool>,

	#[serde(default)]
	is_word_match:Option<bool>,

	#[serde(default)]
	is_reg_exp:Option<bool>,

	#[serde(default)]
	is_multiline:Option<bool>,
}
100
/// Per-match column range within the preview line.
///
/// `start` and `end` are 0-based offsets into the preview string counted in
/// Unicode scalar values (Rust `char`s), NOT byte offsets. `PerFileSink`
/// converts the matcher's byte offsets before emitting them so the workbench
/// does not mis-highlight lines that contain multi-byte UTF-8 sequences (the
/// search panel underlines the wrong substring otherwise).
///
/// NOTE(review): VS Code measures columns in UTF-16 code units. `char`
/// offsets and UTF-16 offsets agree only while the line contains no
/// characters outside the Basic Multilingual Plane - confirm what the
/// consumer expects for such lines.
///
/// VS Code's `ISearchRange` is 1-based for line numbers but 0-based
/// for columns; the SkyBridge consumer adds the +1 line offset there.
#[derive(Serialize, Clone, Debug)]
#[serde(rename_all = "camelCase")]
struct ColumnRange {
	/// Offset of the first matched character.
	start:u64,

	/// Offset derived from the matcher's end-of-match byte position.
	end:u64,
}
118
/// One matched line inside a file, serialised with camelCase keys
/// (`preview`, `lineNumber`, `columns`).
#[derive(Serialize, Clone, Debug)]
#[serde(rename_all = "camelCase")]
struct TextMatch {
	/// Text of the matched line with its trailing line terminator removed,
	/// truncated by `PerFileSink` to a bounded prefix of the line.
	preview:String,

	/// 1-based line number (grep-searcher emits 1-based when
	/// `line_number(true)` is configured on the SearcherBuilder).
	line_number:u64,

	/// Per-line ranges where the matcher actually matched. A single
	/// line can contain multiple matches (e.g. `test test test`); each
	/// gets its own range. Empty when match-position lookup failed -
	/// in that case the renderer falls back to highlighting the whole
	/// line.
	columns:Vec<ColumnRange>,
}
135
/// All matches found in a single file; one element of the JSON array that
/// `TextSearch` returns.
#[derive(Serialize, Clone, Debug)]
#[serde(rename_all = "camelCase")]
struct FileMatch {
	/// `file://` URI of the searched file, produced from its path with
	/// `url::Url::from_file_path`.
	resource:String,

	/// Matched lines, appended in the order the sink reports them.
	matches:Vec<TextMatch>,
}
144
// This Sink is designed to be created for each file. It holds a reference to
// the central results vector and the path of the file it's searching.
struct PerFileSink {
	/// Path of the file being searched; converted to a file URI for every
	/// reported match.
	path:PathBuf,

	/// Result vector shared by all sinks across the walker's worker threads.
	results:Arc<Mutex<Vec<FileMatch>>>,

	/// The sink's own clone of the search matcher (`TextSearch` clones it
	/// for each file it constructs a sink for), so the sink can re-run the
	/// matcher against the line bytes to recover column ranges.
	/// `SinkMatch::bytes()` gives us the matched line but not where in the
	/// line the matcher hit; calling `Matcher::find_at(...)` ourselves is the
	/// documented pattern for recovering that information.
	matcher:RegexMatcher,
}
159
160impl Sink for PerFileSink {
161 type Error = io::Error;
162
163 fn matched(&mut self, _Searcher:&Searcher, Mat:&SinkMatch<'_>) -> Result<bool, Self::Error> {
164 let RawLine = Mat.bytes();
165 // Trim trailing newline so the preview text the renderer shows
166 // doesn't carry a stray empty line break.
167 let TrimmedLen = if RawLine.ends_with(b"\r\n") {
168 RawLine.len().saturating_sub(2)
169 } else if RawLine.last() == Some(&b'\n') {
170 RawLine.len().saturating_sub(1)
171 } else {
172 RawLine.len()
173 };
174 let LineBytes = &RawLine[..TrimmedLen];
175 // Cap preview length at 512 chars - super-long minified lines
176 // would otherwise force the renderer to layout massive rows
177 // AND make the byte→char map below grow proportionally.
178 const PREVIEW_BYTE_CAP:usize = 512;
179 let CapBytes = LineBytes.len().min(PREVIEW_BYTE_CAP);
180 // Round down to the nearest UTF-8 boundary so `from_utf8_lossy`
181 // doesn't replace half a multibyte char with U+FFFD.
182 let SafeCap = (0..=CapBytes)
183 .rev()
184 .find(|&I| I == 0 || I == LineBytes.len() || (LineBytes[I] & 0xC0) != 0x80)
185 .unwrap_or(0);
186 let Preview = String::from_utf8_lossy(&LineBytes[..SafeCap]).to_string();
187
188 // `line_number(true)` was set on the SearcherBuilder so this
189 // returns Some(n) (1-based). Default to 1 if we somehow lose
190 // it - rendering "line 0" looked wrong even when the rest of
191 // the data was correct.
192 let LineNumber = Mat.line_number().unwrap_or(1);
193
194 // Build a byte→char map ONCE per line so every column lookup
195 // is O(log n) (binary search) instead of O(n) (the previous
196 // `char_indices().position()` per call). On lines with many
197 // matches this collapses the per-line work from quadratic to
198 // linear, which is the difference between a 6 s search and a
199 // minutes-long hang on workspaces that contain match-dense
200 // minified bundles.
201 let mut CharBoundaries:Vec<usize> = Vec::with_capacity(Preview.len() / 2 + 1);
202 for (B, _) in Preview.char_indices() {
203 CharBoundaries.push(B);
204 }
205 CharBoundaries.push(Preview.len()); // Sentinel for end-of-string.
206 let ByteToChar = |Byte:usize| -> u64 {
207 match CharBoundaries.binary_search(&Byte) {
208 Ok(Index) => Index as u64,
209 Err(Index) => Index as u64,
210 }
211 };
212
213 // Walk the line bytes and collect every sub-line range the
214 // matcher hits. Multiple matches per line are common
215 // (e.g. searching for `test` in `test test`); each becomes its
216 // own ColumnRange so the renderer underlines them all. Cap at
217 // `MAX_COLUMNS_PER_LINE` to bound work on pathological lines
218 // where a regex matches every character (e.g. `.` or `\w`
219 // against a long minified line).
220 const MAX_COLUMNS_PER_LINE:usize = 100;
221 let mut Columns:Vec<ColumnRange> = Vec::new();
222 let mut StartByte = 0usize;
223 // Search within the truncated preview so columns line up with
224 // the preview text the renderer will display.
225 let SearchBytes = &LineBytes[..SafeCap];
226 while StartByte <= SearchBytes.len() && Columns.len() < MAX_COLUMNS_PER_LINE {
227 match self.matcher.find_at(SearchBytes, StartByte) {
228 Ok(Some(M)) => {
229 if M.start() >= SearchBytes.len() {
230 break;
231 }
232 Columns.push(ColumnRange { start:ByteToChar(M.start()), end:ByteToChar(M.end()) });
233 // `M.end() == M.start()` happens for zero-width
234 // matches (e.g. `\b`); advance by one byte to
235 // avoid an infinite loop.
236 StartByte = if M.end() == M.start() { M.end() + 1 } else { M.end() };
237 },
238 _ => break,
239 }
240 }
241
242 // Since this sink is per-file, we know `self.path` is correct.
243 let FileURI = url::Url::from_file_path(&self.path)
244 .map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "Could not convert path to URL"))?
245 .to_string();
246
247 let NewMatch = TextMatch { preview:Preview, line_number:LineNumber, columns:Columns };
248
249 // Mutex acquired AFTER the column-range scan so contention
250 // doesn't serialise the per-line regex work across the
251 // `WalkBuilder::build_parallel()` workers.
252 let mut ResultsGuard = self
253 .results
254 .lock()
255 .map_err(|Error| io::Error::new(io::ErrorKind::Other, Error.to_string()))?;
256
257 // Find the entry for our file, or create it if it's the first match.
258 if let Some(FileMatch) = ResultsGuard.iter_mut().find(|fm| fm.resource == FileURI) {
259 FileMatch.matches.push(NewMatch);
260 } else {
261 ResultsGuard.push(FileMatch { resource:FileURI, matches:vec![NewMatch] });
262 }
263
264 // Continue searching
265 Ok(true)
266 }
267}
268
269#[async_trait]
270impl SearchProvider for MountainEnvironment {
271 async fn TextSearch(&self, QueryValue:Value, _OptionsValue:Value) -> Result<Value, CommonError> {
272 let Query:TextSearchQuery = serde_json::from_value(QueryValue)?;
273
274 dev_log!("search", "[SearchProvider] Performing text search for: {:?}", Query);
275
276 let mut Builder = RegexMatcherBuilder::new();
277
278 Builder
279 .case_insensitive(!Query.is_case_sensitive.unwrap_or(false))
280 .word(Query.is_word_match.unwrap_or(false))
281 .multi_line(Query.is_multiline.unwrap_or(false));
282
283 // When `isRegExp` is false/missing (the default for the Search
284 // view's plain-text mode), escape the pattern so literal
285 // searches for strings containing regex metacharacters
286 // (`.`, `(`, `[`, `*`, `?`, etc.) don't crash the compiler
287 // or silently match the wrong thing.
288 let CompiledPattern = if Query.is_reg_exp.unwrap_or(false) {
289 Query.pattern.clone()
290 } else {
291 regex::escape(&Query.pattern)
292 };
293
294 let Matcher = Builder.build(&CompiledPattern).map_err(|Error| {
295 CommonError::InvalidArgument { ArgumentName:"pattern".into(), Reason:Error.to_string() }
296 })?;
297
298 let AllMatches = Arc::new(Mutex::new(Vec::<FileMatch>::new()));
299
300 let Folders = self
301 .ApplicationState
302 .Workspace
303 .WorkspaceFolders
304 .lock()
305 .map_err(Utility::ErrorMapping::MapApplicationStateLockErrorToCommonError)?
306 .clone();
307
308 if Folders.is_empty() {
309 dev_log!("search", "warn: [SearchProvider] No workspace folders to search in.");
310
311 return Ok(json!([]));
312 }
313
314 for Folder in Folders {
315 if let Ok(FolderPath) = Folder.URI.to_file_path() {
316 // Use a parallel walker for better performance.
317 let Walker = WalkBuilder::new(FolderPath).build_parallel();
318
319 // The `search_parallel` method is not available on `Searcher`. We must process
320 // entries from the walker and call `search_path` individually.
321 Walker.run(|| {
322 // `line_number(true)` is mandatory - without it,
323 // `SinkMatch::line_number()` returns None and every
324 // match lands at line 0, which the renderer treats
325 // as "no line info" and collapses into an
326 // uncategorised count-of-zero. The default
327 // `Searcher::new()` constructor disables line
328 // numbers for performance.
329 let mut Searcher = SearcherBuilder::new().line_number(true).build();
330
331 let Matcher = Matcher.clone();
332
333 let AllMatches = AllMatches.clone();
334
335 Box::new(move |EntryResult| {
336 if let Ok(Entry) = EntryResult {
337 if Entry.file_type().map_or(false, |ft| ft.is_file()) {
338 // For each file, create a new sink that knows its path.
339 let Sink = PerFileSink {
340 path:Entry.path().to_path_buf(),
341 results:AllMatches.clone(),
342 matcher:Matcher.clone(),
343 };
344
345 if let Err(Error) = Searcher.search_path(&Matcher, Entry.path(), Sink) {
346 dev_log!(
347 "search",
348 "warn: [SearchProvider] Error searching path {}: {}",
349 Entry.path().display(),
350 Error
351 );
352 }
353 }
354 }
355
356 ignore::WalkState::Continue
357 })
358 });
359 }
360 }
361
362 let FinalMatches = AllMatches
363 .lock()
364 .map_err(|Error| CommonError::StateLockPoisoned { Context:Error.to_string() })?
365 .clone();
366
367 let TotalLineMatches:usize = FinalMatches.iter().map(|F| F.matches.len()).sum();
368 dev_log!(
369 "search",
370 "[SearchProvider] returned {} files / {} line-matches for pattern={:?}",
371 FinalMatches.len(),
372 TotalLineMatches,
373 Query.pattern
374 );
375
376 Ok(json!(FinalMatches))
377 }
378}