fuzzy_nucleo: Optimize path matching with CharBag prefilter and add benchmarks (#54112)

This PR was originally a part of
https://github.com/zed-industries/zed/pull/53551, so there's more info
about its motivation there.

- Add a CharBag prefilter on path candidates to skip irrelevant entries
before invoking nucleo's matcher.
- Use binary_search on sorted matched char indices when reconstructing
byte positions (perf improvement).
- Add a criterion benchmark comparing `fuzzy_nucleo` path matching
against the existing fuzzy crate.

Performance Chart:

| Benchmark | Size | Nucleo (before) | Nucleo (after) | Fuzzy | Before/Fuzzy | After/Fuzzy |
|-----------|-----:|----------------:|---------------:|------:|-------------:|------------:|
  | 1-word | 100 | 14.14 µs | 9.12 µs | 9.06 µs | 1.56x | 1.01x |
  | 1-word | 1,000 | 164.37 µs | 114.11 µs | 110.43 µs | 1.49x | 1.03x |
  | 1-word | 10,000 | 1.83 ms | 1.39 ms | 1.41 ms | 1.30x | 0.99x |
  | 2-word | 100 | 12.83 µs | 3.51 µs | 979 ns | 13.10x | 3.59x |
  | 2-word | 1,000 | 131.65 µs | 33.46 µs | 6.37 µs | 20.67x | 5.25x |
  | 2-word | 10,000 | 1.24 ms | 338.84 µs | 52.46 µs | 23.64x | 6.46x |

Exact Current State:
| query | size | nucleo | fuzzy | nucleo/fuzzy |
  |---|---:|---:|---:|---:|
  | 1-word | 100 | 8.62 µs | 9.22 µs | 0.93× |
  | 1-word | 1000 | 102 µs | 111 µs | 0.92× |
  | 1-word | 10000 | 1.13 ms | 1.28 ms | 0.88× |
  | 2-word | 100 | 3.48 µs | 0.98 µs | 3.55× |
  | 2-word | 1000 | 29.9 µs | 6.39 µs | 4.68× |
  | 2-word | 10000 | 271 µs | 53.4 µs | 5.08× |
  | 4-word | 100 | 0.85 µs | 0.53 µs | 1.60× |
  | 4-word | 1000 | 2.99 µs | 1.66 µs | 1.80× |
  | 4-word | 10000 | 20.1 µs | 9.14 µs | 2.20× |

Self-Review Checklist:

- [x] I've reviewed my own diff for quality, security, and reliability
- [x] Unsafe blocks (if any) have justifying comments
- [x] The content is consistent with the [UI/UX
checklist](https://github.com/zed-industries/zed/blob/main/CONTRIBUTING.md#uiux-checklist)
- [x] Tests cover the new/changed behavior
- [x] Performance impact has been considered and is acceptable

Release Notes:

- fuzzy_nucleo: improved the performance of path matching
This commit is contained in:
Finn Eitreim 2026-04-17 10:20:45 -04:00 committed by GitHub
parent 80a053ed2a
commit 722f3089ed
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 451 additions and 45 deletions

2
Cargo.lock generated
View file

@ -6772,6 +6772,8 @@ dependencies = [
name = "fuzzy_nucleo"
version = "0.1.0"
dependencies = [
"criterion",
"fuzzy",
"gpui",
"nucleo",
"util",

View file

@ -698,13 +698,18 @@ fn matching_history_items<'a>(
.into_iter()
.chain(currently_opened)
.map(|found_path| {
let candidate = PathMatchCandidate {
is_dir: false, // You can't open directories as project items
path: &found_path.project.path,
// Only match history items names, otherwise their paths may match too many queries, producing false positives.
// E.g. `foo` would match both `something/foo/bar.rs` and `something/foo/foo.rs` and if the former is a history item,
// it would be shown first always, despite the latter being a better match.
};
// Only match history items names, otherwise their paths may match too many queries,
// producing false positives. E.g. `foo` would match both `something/foo/bar.rs` and
// `something/foo/foo.rs` and if the former is a history item, it would be shown first
// always, despite the latter being a better match.
let candidate = PathMatchCandidate::new(
&found_path.project.path,
false,
worktree_name_by_id
.as_ref()
.and_then(|m| m.get(&found_path.project.worktree_id))
.map(|prefix| prefix.as_ref()),
);
candidates_paths.insert(&found_path.project, found_path);
(found_path.project.worktree_id, candidate)
})
@ -731,7 +736,7 @@ fn matching_history_items<'a>(
worktree.to_usize(),
worktree_root_name,
query.path_query(),
false,
fuzzy_nucleo::Case::Ignore,
max_results,
path_style,
)
@ -914,7 +919,7 @@ impl FileFinderDelegate {
candidate_sets.as_slice(),
query.path_query(),
&relative_to,
false,
fuzzy_nucleo::Case::Ignore,
100,
&cancel_flag,
cx.background_executor().clone(),

View file

@ -13,9 +13,15 @@ path = "src/fuzzy_nucleo.rs"
doctest = false
[dependencies]
fuzzy.workspace = true
nucleo.workspace = true
gpui.workspace = true
util.workspace = true
[dev-dependencies]
util = {workspace = true, features = ["test-support"]}
criterion.workspace = true
util = { workspace = true, features = ["test-support"] }
[[bench]]
name = "match_benchmark"
harness = false

View file

@ -0,0 +1,253 @@
use criterion::{BatchSize, BenchmarkId, Criterion, criterion_group, criterion_main};
use fuzzy::CharBag;
use util::{paths::PathStyle, rel_path::RelPath};
/// Directory prefixes used by `generate_path_strings` to build synthetic
/// benchmark paths; each is joined with an entry of `FILENAMES` as `dir/file`.
const DIRS: &[&str] = &[
"src",
"crates/gpui/src",
"crates/editor/src",
"crates/fuzzy_nucleo/src",
"crates/workspace/src",
"crates/project/src",
"crates/language/src",
"crates/terminal/src",
"crates/assistant/src",
"crates/theme/src",
"tests/integration",
"tests/unit",
"docs/architecture",
"scripts",
"assets/icons",
"assets/fonts",
"crates/git/src",
"crates/rpc/src",
"crates/settings/src",
"crates/diagnostics/src",
"crates/search/src",
"crates/collab/src",
"crates/db/src",
"crates/lsp/src",
];
/// File names used by `generate_path_strings` as the final component of each
/// synthetic benchmark path.
const FILENAMES: &[&str] = &[
"parser.rs",
"main.rs",
"executor.rs",
"editor.rs",
"strings.rs",
"workspace.rs",
"project.rs",
"buffer.rs",
"colors.rs",
"panel.rs",
"renderer.rs",
"dispatcher.rs",
"matcher.rs",
"paths.rs",
"context.rs",
"toolbar.rs",
"statusbar.rs",
"keymap.rs",
"config.rs",
"settings.rs",
"diagnostics.rs",
"completion.rs",
"hover.rs",
"references.rs",
"inlay_hints.rs",
"git_blame.rs",
"terminal.rs",
"search.rs",
"replace.rs",
"outline.rs",
"breadcrumbs.rs",
"tab_bar.rs",
"Cargo.toml",
"README.md",
"build.sh",
"LICENSE",
"overview.md",
"string_helpers.rs",
"test_helpers.rs",
"fixtures.json",
"schema.sql",
];
/// Vocabulary that `generate_queries` samples from when assembling benchmark
/// queries: a mix of three-letter fragments and whole words.
const QUERY_WORDS: &[&str] = &[
"par",
"edi",
"buf",
"set",
"mat",
"con",
"ren",
"dis",
"sea",
"ter",
"col",
"hov",
"out",
"rep",
"key",
"too",
"pan",
"str",
"dia",
"com",
"executor",
"workspace",
"settings",
"terminal",
"breadcrumbs",
"git_blame",
"fixtures",
"schema",
"config",
"toolbar",
];
/// Builds three reproducible query sets from `QUERY_WORDS`, driven by a small LCG
/// with a fixed seed so every run sees the same queries.
///
/// Each returned vector holds `count` queries; the vectors contain queries made of
/// 1, 2, and 4 space-separated words respectively (in that order).
fn generate_queries(count: usize) -> (Vec<String>, Vec<String>, Vec<String>) {
    let mut seed: u64 = 0xDEAD_BEEF;
    // Advance the linear congruential generator and expose its high bits.
    let mut draw = || -> usize {
        seed = seed.wrapping_mul(6364136223846793005).wrapping_add(1);
        (seed >> 33) as usize
    };
    let mut queries_of = |words_per_query: usize| -> Vec<String> {
        let mut queries = Vec::with_capacity(count);
        for _ in 0..count {
            let mut words = Vec::with_capacity(words_per_query);
            for _ in 0..words_per_query {
                words.push(QUERY_WORDS[draw() % QUERY_WORDS.len()]);
            }
            queries.push(words.join(" "));
        }
        queries
    };
    // The generator is shared, so the sets must be produced in this exact order.
    let one_word = queries_of(1);
    let two_word = queries_of(2);
    let four_word = queries_of(4);
    (one_word, two_word, four_word)
}
/// Produces `count` synthetic `dir/file` strings and leaks them so the benchmark
/// can hold `'static` borrows of the paths.
///
/// The directory cycles fastest (`id % DIRS.len()`), the file name advances once per
/// full pass over `DIRS`; combinations repeat once `count` exceeds
/// `DIRS.len() * FILENAMES.len()`.
fn generate_path_strings(count: usize) -> &'static [String] {
    let mut paths = Vec::with_capacity(count);
    for id in 0..count {
        let dir = DIRS[id % DIRS.len()];
        let file = FILENAMES[(id / DIRS.len()) % FILENAMES.len()];
        paths.push(format!("{dir}/{file}"));
    }
    Box::leak(paths.into_boxed_slice())
}
/// Wraps every path string in a `fuzzy_nucleo::PathMatchCandidate` (non-directory,
/// no worktree prefix).
///
/// Panics if a string is rejected by `RelPath::unix`.
fn generate_nucleo_path_candidates(
    paths: &'static [String],
) -> Vec<fuzzy_nucleo::PathMatchCandidate<'static>> {
    let mut candidates = Vec::with_capacity(paths.len());
    for path in paths {
        let rel_path = RelPath::unix(path).unwrap();
        candidates.push(fuzzy_nucleo::PathMatchCandidate::new(rel_path, false, None));
    }
    candidates
}
/// Wraps every path string in a `fuzzy::PathMatchCandidate` (non-directory), with
/// the char bag computed from the full path string.
///
/// Panics if a string is rejected by `RelPath::unix`.
fn generate_fuzzy_path_candidates(
    paths: &'static [String],
) -> Vec<fuzzy::PathMatchCandidate<'static>> {
    let mut candidates = Vec::with_capacity(paths.len());
    for path in paths {
        let rel_path = RelPath::unix(path).unwrap();
        let char_bag = CharBag::from(path.as_str());
        candidates.push(fuzzy::PathMatchCandidate {
            is_dir: false,
            path: rel_path,
            char_bag,
        });
    }
    candidates
}
/// Upper-cases the first character (ASCII only) of every whitespace-separated word
/// and rejoins the words with single spaces.
///
/// Leading, trailing, and repeated whitespace is therefore collapsed.
fn capitalize_each_word(query: &str) -> String {
    let mut capitalized: Vec<String> = Vec::new();
    for word in query.split_whitespace() {
        let mut rest = word.chars();
        let rebuilt = match rest.next() {
            Some(first) => {
                let mut rebuilt = String::with_capacity(word.len());
                rebuilt.push(first.to_ascii_uppercase());
                rebuilt.push_str(rest.as_str());
                rebuilt
            }
            None => String::new(),
        };
        capitalized.push(rebuilt);
    }
    capitalized.join(" ")
}
/// Criterion entry point comparing `fuzzy_nucleo` and `fuzzy` fixed-set path matching.
///
/// For each scenario (1/2/4-word queries, lower-case with `Case::Ignore` and
/// capitalized with `Case::Smart`) and each candidate-set size, one `nucleo` and one
/// `fuzzy` benchmark is registered. Candidates are rebuilt inside the `iter_batched`
/// setup closure because the matchers take them by value; queries are cycled so
/// successive iterations use different inputs.
fn bench_path_matching(criterion: &mut Criterion) {
    let sizes = [100, 1000, 10_000];
    let all_path_strings = sizes.map(generate_path_strings);

    let query_count = 200;
    let (q1, q2, q4) = generate_queries(query_count);
    let q1_upper: Vec<String> = q1
        .iter()
        .map(|query| capitalize_each_word(query.as_str()))
        .collect();
    let q2_upper: Vec<String> = q2
        .iter()
        .map(|query| capitalize_each_word(query.as_str()))
        .collect();
    let q4_upper: Vec<String> = q4
        .iter()
        .map(|query| capitalize_each_word(query.as_str()))
        .collect();

    let scenarios = [
        ("path/1-word", &q1, fuzzy_nucleo::Case::Ignore),
        ("path/2-word", &q2, fuzzy_nucleo::Case::Ignore),
        ("path/4-word", &q4, fuzzy_nucleo::Case::Ignore),
        ("path_smart/1-word", &q1_upper, fuzzy_nucleo::Case::Smart),
        ("path_smart/2-word", &q2_upper, fuzzy_nucleo::Case::Smart),
        ("path_smart/4-word", &q4_upper, fuzzy_nucleo::Case::Smart),
    ];

    for (label, queries, case) in scenarios {
        let mut group = criterion.benchmark_group(label);
        for (&size, &path_strings) in sizes.iter().zip(all_path_strings.iter()) {
            // Each benchmark walks the query list from the start, wrapping around.
            let mut nucleo_queries = queries.iter().cycle();
            group.bench_function(BenchmarkId::new("nucleo", size), |bencher| {
                bencher.iter_batched(
                    || {
                        let query = nucleo_queries
                            .next()
                            .expect("query list is non-empty")
                            .as_str();
                        (generate_nucleo_path_candidates(path_strings), query)
                    },
                    |(candidates, query)| {
                        fuzzy_nucleo::match_fixed_path_set(
                            candidates,
                            0,
                            None,
                            query,
                            case,
                            size,
                            PathStyle::Posix,
                        )
                    },
                    BatchSize::SmallInput,
                )
            });

            let mut fuzzy_queries = queries.iter().cycle();
            group.bench_function(BenchmarkId::new("fuzzy", size), |bencher| {
                bencher.iter_batched(
                    || {
                        let query = fuzzy_queries
                            .next()
                            .expect("query list is non-empty")
                            .as_str();
                        (generate_fuzzy_path_candidates(path_strings), query)
                    },
                    |(candidates, query)| {
                        fuzzy::match_fixed_path_set(
                            candidates,
                            0,
                            None,
                            query,
                            false,
                            size,
                            PathStyle::Posix,
                        )
                    },
                    BatchSize::SmallInput,
                )
            });
        }
        group.finish();
    }
}
criterion_group!(benches, bench_path_matching);
criterion_main!(benches);

View file

@ -3,3 +3,53 @@ mod paths;
pub use paths::{
PathMatch, PathMatchCandidate, PathMatchCandidateSet, match_fixed_path_set, match_path_sets,
};
/// Marker error value; matching helpers return `Err(Cancelled)` when they stop
/// before finishing a pass over their candidates.
pub(crate) struct Cancelled;
/// Case-handling mode requested by callers of the matching APIs.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Case {
    /// Smart-case handling was requested.
    Smart,
    /// Case differences are ignored.
    Ignore,
}

impl Case {
    /// Converts a boolean flag into a mode: `true` yields [`Case::Smart`],
    /// `false` yields [`Case::Ignore`].
    pub fn from_smart(smart: bool) -> Self {
        match smart {
            true => Self::Smart,
            false => Self::Ignore,
        }
    }

    /// Returns `true` only for [`Case::Smart`].
    pub fn is_smart(self) -> bool {
        self == Self::Smart
    }
}
/// Two-state switch for length-penalty scoring.
// NOTE(review): the consumers of this flag are outside this chunk — confirm exact
// scoring semantics against the call sites.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum LengthPenalty {
    /// The penalty is enabled.
    On,
    /// The penalty is disabled.
    Off,
}

impl LengthPenalty {
    /// Converts a boolean flag into a switch: `true` yields [`LengthPenalty::On`],
    /// `false` yields [`LengthPenalty::Off`].
    pub fn from_bool(on: bool) -> Self {
        match on {
            true => Self::On,
            false => Self::Off,
        }
    }

    /// Returns `true` only for [`LengthPenalty::On`].
    pub fn is_on(self) -> bool {
        self == Self::On
    }
}
/// Reconstruct byte-offset match positions from a list of matched char offsets.
///
/// `sorted_char_indices` is expected to be sorted ascending. Duplicate entries are
/// tolerated: each distinct char offset contributes at most one byte position, and a
/// repeated index no longer causes the positions after it to be dropped. Indices at
/// or beyond the number of chars in `s` are ignored.
///
/// Returns the byte offset (into `s`) of every matched char, in ascending order.
pub(crate) fn positions_from_sorted(s: &str, sorted_char_indices: &[u32]) -> Vec<usize> {
    // Compare in `usize` (a lossless widening of `u32` on supported targets) rather
    // than truncating the running char offset down to `u32`.
    let mut pending = sorted_char_indices
        .iter()
        .map(|&index| index as usize)
        .peekable();
    let mut out = Vec::with_capacity(sorted_char_indices.len());
    for (char_offset, (byte_offset, _)) in s.char_indices().enumerate() {
        // Discard entries we have already walked past (e.g. duplicates of an index
        // that was just emitted); otherwise the cursor would stall on them forever.
        while pending.next_if(|&index| index < char_offset).is_some() {}
        match pending.peek() {
            // Every requested index has been resolved; no need to scan the rest of `s`.
            None => break,
            Some(&index) if index == char_offset => {
                pending.next();
                out.push(byte_offset);
            }
            Some(_) => {}
        }
    }
    out
}

View file

@ -4,8 +4,15 @@ static MATCHERS: Mutex<Vec<nucleo::Matcher>> = Mutex::new(Vec::new());
pub const LENGTH_PENALTY: f64 = 0.01;
/// Upper bound on how many matchers the shared pool keeps around.
///
/// Uses the parallelism reported by the standard library, falling back to 8 when it
/// cannot be determined. Both sources are at least 1, so the result is never zero.
fn pool_cap() -> usize {
    match std::thread::available_parallelism() {
        Ok(parallelism) => parallelism.get(),
        Err(_) => 8,
    }
}
pub fn get_matcher(config: nucleo::Config) -> nucleo::Matcher {
let mut matchers = MATCHERS.lock().unwrap();
let mut matchers = MATCHERS.lock().unwrap_or_else(|e| e.into_inner());
match matchers.pop() {
Some(mut matcher) => {
matcher.config = config;
@ -16,12 +23,15 @@ pub fn get_matcher(config: nucleo::Config) -> nucleo::Matcher {
}
pub fn return_matcher(matcher: nucleo::Matcher) {
MATCHERS.lock().unwrap().push(matcher);
let mut pool = MATCHERS.lock().unwrap_or_else(|e| e.into_inner());
if pool.len() < pool_cap() {
pool.push(matcher);
}
}
pub fn get_matchers(n: usize, config: nucleo::Config) -> Vec<nucleo::Matcher> {
let mut matchers: Vec<_> = {
let mut pool = MATCHERS.lock().unwrap();
let mut pool = MATCHERS.lock().unwrap_or_else(|e| e.into_inner());
let available = pool.len().min(n);
pool.drain(..available)
.map(|mut matcher| {
@ -34,6 +44,9 @@ pub fn get_matchers(n: usize, config: nucleo::Config) -> Vec<nucleo::Matcher> {
matchers
}
pub fn return_matchers(mut matchers: Vec<nucleo::Matcher>) {
MATCHERS.lock().unwrap().append(&mut matchers);
pub fn return_matchers(matchers: Vec<nucleo::Matcher>) {
let cap = pool_cap();
let mut pool = MATCHERS.lock().unwrap_or_else(|e| e.into_inner());
let space = cap.saturating_sub(pool.len());
pool.extend(matchers.into_iter().take(space));
}

View file

@ -11,12 +11,35 @@ use util::{paths::PathStyle, rel_path::RelPath};
use nucleo::Utf32Str;
use nucleo::pattern::{Atom, AtomKind, CaseMatching, Normalization};
use fuzzy::CharBag;
use crate::matcher::{self, LENGTH_PENALTY};
use crate::{Cancelled, Case, positions_from_sorted};
/// A single path offered to the matcher, together with the character bag used to
/// cheaply rule it out before running the full match.
#[derive(Clone, Debug)]
pub struct PathMatchCandidate<'a> {
    /// Whether the path refers to a directory.
    pub is_dir: bool,
    /// The candidate path itself.
    pub path: &'a RelPath,
    /// Prefilter bag; candidates whose bag is not a superset of the query's bag are skipped.
    pub char_bag: CharBag,
}

impl<'a> PathMatchCandidate<'a> {
    /// Creates a candidate whose prefilter bag is fed the ASCII-lowercased characters
    /// of the worktree prefix (when present and non-empty) followed by those of `path`.
    ///
    /// Pass `None` for `path_prefix` when the paths carry no worktree prefix.
    pub fn new(path: &'a RelPath, is_dir: bool, path_prefix: Option<&RelPath>) -> Self {
        let lowercase = |c: char| c.to_ascii_lowercase();
        let mut char_bag = CharBag::default();
        match path_prefix {
            Some(prefix) if !prefix.is_empty() => {
                char_bag.extend(prefix.as_unix_str().chars().map(lowercase));
            }
            _ => {}
        }
        char_bag.extend(path.as_unix_str().chars().map(lowercase));
        Self {
            is_dir,
            path,
            char_bag,
        }
    }
}
#[derive(Clone, Debug)]
@ -62,8 +85,7 @@ impl PartialOrd for PathMatch {
impl Ord for PathMatch {
fn cmp(&self, other: &Self) -> Ordering {
self.score
.partial_cmp(&other.score)
.unwrap_or(Ordering::Equal)
.total_cmp(&other.score)
.then_with(|| self.worktree_id.cmp(&other.worktree_id))
.then_with(|| {
other
@ -74,18 +96,47 @@ impl Ord for PathMatch {
}
}
fn make_atoms(query: &str, smart_case: bool) -> Vec<Atom> {
let case = if smart_case {
CaseMatching::Smart
} else {
CaseMatching::Ignore
};
// Path matching is always case-insensitive at the nucleo level. `Case::Smart`
// is honored as a *scoring hint*: when the query contains uppercase, candidates
// whose matched characters disagree in case are downranked by a factor per
// mismatch rather than dropped. This keeps `"Editor: Backspace"` matching
// `"editor: backspace"` while still preferring exact-case hits.
const SMART_CASE_PENALTY_PER_MISMATCH: f64 = 0.9;
pub(crate) fn make_atoms(query: &str) -> Vec<Atom> {
query
.split_whitespace()
.map(|word| Atom::new(word, case, Normalization::Smart, AtomKind::Fuzzy, false))
.map(|word| {
Atom::new(
word,
CaseMatching::Ignore,
Normalization::Smart,
AtomKind::Fuzzy,
false,
)
})
.collect()
}
// Splits the query into per-word character lists, but only when a smart-case
// penalty can actually apply (smart mode and at least one uppercase character in
// the query). Returning `None` otherwise lets the hot path work with a plain
// `&[Atom]` and skip this data entirely.
fn make_source_words(query: &str, case: Case) -> Option<Vec<Vec<char>>> {
    if !case.is_smart() {
        return None;
    }
    if !query.chars().any(char::is_uppercase) {
        return None;
    }
    let words: Vec<Vec<char>> = query
        .split_whitespace()
        .map(|word| word.chars().collect())
        .collect();
    Some(words)
}
/// Multiplicative score factor for a candidate with `mismatches` case mismatches:
/// `1.0` when there are none, otherwise `SMART_CASE_PENALTY_PER_MISMATCH` raised to
/// the mismatch count.
fn case_penalty(mismatches: u32) -> f64 {
    match mismatches {
        0 => 1.0,
        count => SMART_CASE_PENALTY_PER_MISMATCH.powi(count as i32),
    }
}
pub(crate) fn distance_between_paths(path: &RelPath, relative_to: &RelPath) -> usize {
let mut path_components = path.components();
let mut relative_components = relative_to.components();
@ -121,11 +172,12 @@ fn get_filename_match_bonus(
}
total_score as f64 / filename.len().max(1) as f64
}
struct Cancelled;
fn path_match_helper<'a>(
matcher: &mut nucleo::Matcher,
atoms: &[Atom],
source_words: Option<&[Vec<char>]>,
query_bag: CharBag,
candidates: impl Iterator<Item = PathMatchCandidate<'a>>,
results: &mut Vec<PathMatch>,
worktree_id: usize,
@ -146,6 +198,7 @@ fn path_match_helper<'a>(
let mut buf = Vec::new();
let mut matched_chars: Vec<u32> = Vec::new();
let mut atom_matched_chars = Vec::new();
let mut candidate_chars: Vec<char> = Vec::new();
for candidate in candidates {
buf.clear();
matched_chars.clear();
@ -153,6 +206,10 @@ fn path_match_helper<'a>(
return Err(Cancelled);
}
if !candidate.char_bag.is_superset(query_bag) {
continue;
}
candidate_buf.truncate(path_prefix_len);
if root_is_file {
candidate_buf.push_str(path_prefix.as_unix_str());
@ -162,18 +219,36 @@ fn path_match_helper<'a>(
let haystack = Utf32Str::new(&candidate_buf, &mut buf);
if source_words.is_some() {
candidate_chars.clear();
candidate_chars.extend(candidate_buf.chars());
}
let mut total_score: u32 = 0;
let mut case_mismatches: u32 = 0;
let mut all_matched = true;
for atom in atoms {
for (atom_idx, atom) in atoms.iter().enumerate() {
atom_matched_chars.clear();
if let Some(score) = atom.indices(haystack, matcher, &mut atom_matched_chars) {
total_score = total_score.saturating_add(score as u32);
matched_chars.extend_from_slice(&atom_matched_chars);
} else {
let Some(score) = atom.indices(haystack, matcher, &mut atom_matched_chars) else {
all_matched = false;
break;
};
total_score = total_score.saturating_add(score as u32);
if let Some(source_words) = source_words {
let query_chars = &source_words[atom_idx];
if query_chars.len() == atom_matched_chars.len() {
for (&query_char, &pos) in query_chars.iter().zip(&atom_matched_chars) {
if let Some(&candidate_char) = candidate_chars.get(pos as usize)
&& candidate_char != query_char
&& candidate_char.eq_ignore_ascii_case(&query_char)
{
case_mismatches += 1;
}
}
}
}
matched_chars.extend_from_slice(&atom_matched_chars);
}
if all_matched && !atoms.is_empty() {
@ -182,17 +257,9 @@ fn path_match_helper<'a>(
let length_penalty = candidate_buf.len() as f64 * LENGTH_PENALTY;
let filename_bonus = get_filename_match_bonus(&candidate_buf, atoms, matcher);
let adjusted_score = total_score as f64 + filename_bonus - length_penalty;
let mut positions: Vec<usize> = candidate_buf
.char_indices()
.enumerate()
.filter_map(|(char_offset, (byte_offset, _))| {
matched_chars
.contains(&(char_offset as u32))
.then_some(byte_offset)
})
.collect();
positions.sort_unstable();
let positive = (total_score as f64 + filename_bonus) * case_penalty(case_mismatches);
let adjusted_score = positive - length_penalty;
let positions = positions_from_sorted(&candidate_buf, &matched_chars);
results.push(PathMatch {
score: adjusted_score,
@ -225,7 +292,7 @@ pub fn match_fixed_path_set(
worktree_id: usize,
worktree_root_name: Option<Arc<RelPath>>,
query: &str,
smart_case: bool,
case: Case,
max_results: usize,
path_style: PathStyle,
) -> Vec<PathMatch> {
@ -233,7 +300,9 @@ pub fn match_fixed_path_set(
config.set_match_paths();
let mut matcher = matcher::get_matcher(config);
let atoms = make_atoms(query, smart_case);
let atoms = make_atoms(query);
let source_words = make_source_words(query, case);
let query_bag = CharBag::from(query);
let root_is_file = worktree_root_name.is_some() && candidates.iter().all(|c| c.path.is_empty());
@ -244,6 +313,8 @@ pub fn match_fixed_path_set(
path_match_helper(
&mut matcher,
&atoms,
source_words.as_deref(),
query_bag,
candidates.into_iter(),
&mut results,
worktree_id,
@ -263,7 +334,7 @@ pub async fn match_path_sets<'a, Set: PathMatchCandidateSet<'a>>(
candidate_sets: &'a [Set],
query: &str,
relative_to: &Option<Arc<RelPath>>,
smart_case: bool,
case: Case,
max_results: usize,
cancel_flag: &AtomicBool,
executor: BackgroundExecutor,
@ -281,7 +352,9 @@ pub async fn match_path_sets<'a, Set: PathMatchCandidateSet<'a>>(
query.to_owned()
};
let atoms = make_atoms(&query, smart_case);
let atoms = make_atoms(&query);
let source_words = make_source_words(&query, case);
let query_bag = CharBag::from(query.as_str());
let num_cpus = executor.num_cpus().min(path_count);
let segment_size = path_count.div_ceil(num_cpus);
@ -299,6 +372,7 @@ pub async fn match_path_sets<'a, Set: PathMatchCandidateSet<'a>>(
.enumerate()
{
let atoms = atoms.clone();
let source_words = source_words.clone();
let relative_to = relative_to.clone();
scope.spawn(async move {
let segment_start = segment_idx * segment_size;
@ -316,6 +390,8 @@ pub async fn match_path_sets<'a, Set: PathMatchCandidateSet<'a>>(
if path_match_helper(
matcher,
&atoms,
source_words.as_deref(),
query_bag,
candidates,
results,
candidate_set.id(),

View file

@ -6439,6 +6439,7 @@ impl<'a> Iterator for PathMatchCandidateSetNucleoIter<'a> {
.map(|entry| fuzzy_nucleo::PathMatchCandidate {
is_dir: entry.kind.is_dir(),
path: &entry.path,
char_bag: entry.char_bag,
})
}
}