Skip to content

Commit

Permalink
Correctly handle normalization in pattern API
Browse files Browse the repository at this point in the history
  • Loading branch information
pascalkuthe committed Dec 17, 2023
1 parent b41c989 commit eac298d
Show file tree
Hide file tree
Showing 7 changed files with 148 additions and 59 deletions.
9 changes: 8 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
# Changelog

# Unreleased
# [0.3.0] - 2023-12-20

## **Breaking Changes**

* Pattern API method now require a Unicode `Normalization` strategy in addition to a `CaseMatching` strategy.

## Bugfixes

* correctly handle Unicode normalization when there are normalizable characters in the pattern

* when the needle is composed of a single char, return the score and index
of the best position instead of always returning the first matched character
in the haystack
Expand All @@ -19,5 +25,6 @@
*initial public release*


[0.3.0]: https://github.com/helix-editor/nucleo/releases/tag/nucleo-v0.3.0
[0.2.1]: https://github.com/helix-editor/nucleo/releases/tag/nucleo-v0.2.1
[0.2.0]: https://github.com/helix-editor/nucleo/releases/tag/nucleo-v0.2.0
16 changes: 8 additions & 8 deletions matcher/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@ can contain special characters to control what kind of match is performed (see
```
# use nucleo_matcher::{Matcher, Config};
# use nucleo_matcher::pattern::{Pattern, CaseMatching};
# use nucleo_matcher::pattern::{Pattern, Normalization, CaseMatching};
let paths = ["foo/bar", "bar/foo", "foobar"];
let mut matcher = Matcher::new(Config::DEFAULT.match_paths());
let matches = Pattern::parse("foo bar", CaseMatching::Ignore).match_list(paths, &mut matcher);
let matches = Pattern::parse("foo bar", CaseMatching::Ignore, Normalization::Smart).match_list(paths, &mut matcher);
assert_eq!(matches, vec![("foo/bar", 168), ("bar/foo", 168), ("foobar", 140)]);
let matches = Pattern::parse("^foo bar", CaseMatching::Ignore).match_list(paths, &mut matcher);
let matches = Pattern::parse("^foo bar", CaseMatching::Ignore, Normalization::Smart).match_list(paths, &mut matcher);
assert_eq!(matches, vec![("foo/bar", 168), ("foobar", 140)]);
```
Expand All @@ -30,24 +30,24 @@ If the pattern should be matched literally (without this special parsing)
```
# use nucleo_matcher::{Matcher, Config};
# use nucleo_matcher::pattern::{Pattern, CaseMatching, AtomKind};
# use nucleo_matcher::pattern::{Pattern, CaseMatching, AtomKind, Normalization};
let paths = ["foo/bar", "bar/foo", "foobar"];
let mut matcher = Matcher::new(Config::DEFAULT.match_paths());
let matches = Pattern::new("foo bar", CaseMatching::Ignore, AtomKind::Fuzzy).match_list(paths, &mut matcher);
let matches = Pattern::new("foo bar", CaseMatching::Ignore, Normalization::Smart, AtomKind::Fuzzy).match_list(paths, &mut matcher);
assert_eq!(matches, vec![("foo/bar", 168), ("bar/foo", 168), ("foobar", 140)]);
let paths = ["^foo/bar", "bar/^foo", "foobar"];
let matches = Pattern::new("^foo bar", CaseMatching::Ignore, AtomKind::Fuzzy).match_list(paths, &mut matcher);
let matches = Pattern::new("^foo bar", CaseMatching::Ignore, Normalization::Smart, AtomKind::Fuzzy).match_list(paths, &mut matcher);
assert_eq!(matches, vec![("^foo/bar", 188), ("bar/^foo", 188)]);
```
If word segmentation is also not desired, a single `Atom` can be constructed directly.
```
# use nucleo_matcher::{Matcher, Config};
# use nucleo_matcher::pattern::{Pattern, Atom, CaseMatching, AtomKind};
# use nucleo_matcher::pattern::{Pattern, Atom, CaseMatching, Normalization, AtomKind};
let paths = ["foobar", "foo bar"];
let mut matcher = Matcher::new(Config::DEFAULT);
let matches = Atom::new("foo bar", CaseMatching::Ignore, AtomKind::Fuzzy, false).match_list(paths, &mut matcher);
let matches = Atom::new("foo bar", CaseMatching::Ignore, Normalization::Smart, AtomKind::Fuzzy, false).match_list(paths, &mut matcher);
assert_eq!(matches, vec![("foo bar", 192)]);
```
Expand Down
82 changes: 72 additions & 10 deletions matcher/src/pattern.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,19 @@ pub enum CaseMatching {
Smart,
}

#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
#[non_exhaustive]
/// How to handle unicode normalization,
pub enum Normalization {
/// Characters never match their normalized version (`a != ä`).
Never,
/// Acts like [`Never`](Normalization::Never) if any character in a pattern atom
/// would need to be normalized. Otherwise normalization occurs (`a == ä` but `ä != a`).
#[default]
#[cfg(feature = "unicode-normalization")]
Smart,
}

#[derive(Debug, PartialEq, Eq, Clone, Copy)]
#[non_exhaustive]
/// The kind of matching algorithm to run for an atom.
Expand Down Expand Up @@ -73,24 +86,41 @@ pub struct Atom {
pub kind: AtomKind,
needle: Utf32String,
ignore_case: bool,
normalize: bool,
}

impl Atom {
/// Creates a single [`Atom`] from a string by performing unicode
/// normalization and case folding (if necessary). Optionally `\ ` can
/// be escaped to ` `.
pub fn new(needle: &str, case: CaseMatching, kind: AtomKind, escape_whitespace: bool) -> Atom {
Atom::new_inner(needle, case, kind, escape_whitespace, false)
pub fn new(
needle: &str,
case: CaseMatching,
normalize: Normalization,
kind: AtomKind,
escape_whitespace: bool,
) -> Atom {
Atom::new_inner(needle, case, normalize, kind, escape_whitespace, false)
}

fn new_inner(
needle: &str,
case: CaseMatching,
normalization: Normalization,
kind: AtomKind,
escape_whitespace: bool,
append_dollar: bool,
) -> Atom {
let mut ignore_case;
let mut normalize;
#[cfg(feature = "unicode-normalization")]
{
normalize = matches!(normalization, Normalization::Smart);
}
#[cfg(not(feature = "unicode-normalization"))]
{
normalize = false;
}
let needle = if needle.is_ascii() {
let mut needle = if escape_whitespace {
if let Some((start, rem)) = needle.split_once("\\ ") {
Expand Down Expand Up @@ -133,6 +163,10 @@ impl Atom {
{
ignore_case = false;
}
#[cfg(feature = "unicode-normalization")]
{
normalize = matches!(normalization, Normalization::Smart);
}
if escape_whitespace {
let mut saw_backslash = false;
for mut c in chars::graphemes(needle) {
Expand All @@ -155,6 +189,13 @@ impl Atom {
}
CaseMatching::Respect => (),
}
match normalization {
#[cfg(feature = "unicode-normalization")]
Normalization::Smart => {
normalize = normalize && chars::normalize(c) == c;
}
Normalization::Never => (),
}
needle_.push(c);
}
} else {
Expand All @@ -168,6 +209,13 @@ impl Atom {
}
CaseMatching::Respect => (),
}
match normalization {
#[cfg(feature = "unicode-normalization")]
Normalization::Smart => {
normalize = normalize && chars::normalize(c) == c;
}
Normalization::Never => (),
}
c
});
needle_.extend(chars);
Expand All @@ -177,18 +225,20 @@ impl Atom {
}
Utf32String::Unicode(needle_.into_boxed_slice())
};
println!("hello {needle:?} {normalize}");
Atom {
kind,
needle,
negative: false,
ignore_case,
normalize,
}
}

/// Parse a pattern atom from a string. Some special trailing and leading
/// characters can be used to control the atom kind. See [`AtomKind`] for
/// details.
pub fn parse(raw: &str, case: CaseMatching) -> Atom {
pub fn parse(raw: &str, case: CaseMatching, normalize: Normalization) -> Atom {
let mut atom = raw;
let invert = match atom.as_bytes() {
[b'!', ..] => {
Expand Down Expand Up @@ -239,7 +289,7 @@ impl Atom {
kind = AtomKind::Substring
}

let mut pattern = Atom::new_inner(atom, case, kind, true, append_dollar);
let mut pattern = Atom::new_inner(atom, case, normalize, kind, true, append_dollar);
pattern.negative = invert;
pattern
}
Expand All @@ -252,6 +302,7 @@ impl Atom {
/// each pattern atom.
pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u16> {
matcher.config.ignore_case = self.ignore_case;
matcher.config.normalize = self.normalize;
let pattern_score = match self.kind {
AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)),
AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)),
Expand Down Expand Up @@ -285,6 +336,7 @@ impl Atom {
indices: &mut Vec<u32>,
) -> Option<u16> {
matcher.config.ignore_case = self.ignore_case;
matcher.config.normalize = self.normalize;
if self.negative {
let pattern_score = match self.kind {
AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)),
Expand Down Expand Up @@ -371,10 +423,15 @@ impl Pattern {
/// can be escaped with `\`). Otherwise no parsing is performed (so $, !, '
/// and ^ don't receive special treatment). If you want to match the entire
/// pattern as a single needle use a single [`Atom`] instead.
pub fn new(pattern: &str, case_matching: CaseMatching, kind: AtomKind) -> Pattern {
pub fn new(
pattern: &str,
case_matching: CaseMatching,
normalize: Normalization,
kind: AtomKind,
) -> Pattern {
let atoms = pattern_atoms(pattern)
.filter_map(|pat| {
let pat = Atom::new(pat, case_matching, kind, true);
let pat = Atom::new(pat, case_matching, normalize, kind, true);
(!pat.needle.is_empty()).then_some(pat)
})
.collect();
Expand All @@ -384,10 +441,10 @@ impl Pattern {
/// can be escaped with `\`). And $, !, ' and ^ at word boundaries will
/// cause different matching behaviour (see [`AtomKind`]). These can be
/// escaped with backslash.
pub fn parse(pattern: &str, case_matching: CaseMatching) -> Pattern {
pub fn parse(pattern: &str, case_matching: CaseMatching, normalize: Normalization) -> Pattern {
let atoms = pattern_atoms(pattern)
.filter_map(|pat| {
let pat = Atom::parse(pat, case_matching);
let pat = Atom::parse(pat, case_matching, normalize);
(!pat.needle.is_empty()).then_some(pat)
})
.collect();
Expand Down Expand Up @@ -477,10 +534,15 @@ impl Pattern {
/// Refreshes this pattern by reparsing it from a string. This is mostly
/// equivalent to just constructing a new pattern using [`Pattern::parse`]
/// but is slightly more efficient by reusing some allocations
pub fn reparse(&mut self, pattern: &str, case_matching: CaseMatching) {
pub fn reparse(
&mut self,
pattern: &str,
case_matching: CaseMatching,
normalize: Normalization,
) {
self.atoms.clear();
let atoms = pattern_atoms(pattern).filter_map(|atom| {
let atom = Atom::parse(atom, case_matching);
let atom = Atom::parse(atom, case_matching, normalize);
if atom.needle.is_empty() {
return None;
}
Expand Down
Loading

0 comments on commit eac298d

Please sign in to comment.