helix-editor · pascalkuthe · Dec 22, 2023 · Dec 17, 2023 · Dec 22, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,9 +1,15 @@
 # Changelog
 
-# Unreleased
+# [0.3.0] - 2023-12-20
+
+## **Breaking Changes**
+
+* Pattern API method now requires a Unicode `Normalization` strategy in addition to a `CaseMatching` strategy.
 
 ## Bugfixes
 
+* avoid incorrect matches when searching for ASCII needles in a Unicode haystack
+* correctly handle Unicode normalization when there are normalizable characters in the pattern, for example characters with umlauts
 * when the needle is composed of a single char, return the score and index
   of the best position instead of always returning the first matched character
   in the haystack
@@ -19,5 +25,6 @@
 *initial public release*
 
 
+[0.3.0]: https://github.com/helix-editor/nucleo/releases/tag/nucleo-v0.3.0
 [0.2.1]: https://github.com/helix-editor/nucleo/releases/tag/nucleo-v0.2.1
 [0.2.0]: https://github.com/helix-editor/nucleo/releases/tag/nucleo-v0.2.0
diff --git a/matcher/src/exact.rs b/matcher/src/exact.rs
@@ -258,6 +258,9 @@ impl Matcher {
                 }
             }
         }
+        if max_score == 0 {
+            return None;
+        }
 
         let score = self.calculate_score::<INDICES, _, _>(
             haystack,

diff --git a/matcher/src/lib.rs b/matcher/src/lib.rs
@@ -16,12 +16,12 @@ can contain special characters to control what kind of match is performed (see
 
 ```
 # use nucleo_matcher::{Matcher, Config};
-# use nucleo_matcher::pattern::{Pattern, CaseMatching};
+# use nucleo_matcher::pattern::{Pattern, Normalization, CaseMatching};
 let paths = ["foo/bar", "bar/foo", "foobar"];
 let mut matcher = Matcher::new(Config::DEFAULT.match_paths());
-let matches = Pattern::parse("foo bar", CaseMatching::Ignore).match_list(paths, &mut matcher);
+let matches = Pattern::parse("foo bar", CaseMatching::Ignore, Normalization::Smart).match_list(paths, &mut matcher);
 assert_eq!(matches, vec![("foo/bar", 168), ("bar/foo", 168), ("foobar", 140)]);
-let matches = Pattern::parse("^foo bar", CaseMatching::Ignore).match_list(paths, &mut matcher);
+let matches = Pattern::parse("^foo bar", CaseMatching::Ignore, Normalization::Smart).match_list(paths, &mut matcher);
 assert_eq!(matches, vec![("foo/bar", 168), ("foobar", 140)]);
 ```
 
@@ -30,24 +30,24 @@ If the pattern should be matched literally (without this special parsing)
 
 ```
 # use nucleo_matcher::{Matcher, Config};
-# use nucleo_matcher::pattern::{Pattern, CaseMatching, AtomKind};
+# use nucleo_matcher::pattern::{Pattern, CaseMatching, AtomKind, Normalization};
 let paths = ["foo/bar", "bar/foo", "foobar"];
 let mut matcher = Matcher::new(Config::DEFAULT.match_paths());
-let matches = Pattern::new("foo bar", CaseMatching::Ignore, AtomKind::Fuzzy).match_list(paths, &mut matcher);
+let matches = Pattern::new("foo bar", CaseMatching::Ignore, Normalization::Smart, AtomKind::Fuzzy).match_list(paths, &mut matcher);
 assert_eq!(matches, vec![("foo/bar", 168), ("bar/foo", 168), ("foobar", 140)]);
 let paths = ["^foo/bar", "bar/^foo", "foobar"];
-let matches = Pattern::new("^foo bar", CaseMatching::Ignore, AtomKind::Fuzzy).match_list(paths, &mut matcher);
+let matches = Pattern::new("^foo bar", CaseMatching::Ignore, Normalization::Smart, AtomKind::Fuzzy).match_list(paths, &mut matcher);
 assert_eq!(matches, vec![("^foo/bar", 188), ("bar/^foo", 188)]);
 ```
 
 If word segmentation is also not desired, a single `Atom` can be constructed directly.
 
 ```
 # use nucleo_matcher::{Matcher, Config};
-# use nucleo_matcher::pattern::{Pattern, Atom, CaseMatching, AtomKind};
+# use nucleo_matcher::pattern::{Pattern, Atom, CaseMatching, Normalization, AtomKind};
 let paths = ["foobar", "foo bar"];
 let mut matcher = Matcher::new(Config::DEFAULT);
-let matches = Atom::new("foo bar", CaseMatching::Ignore, AtomKind::Fuzzy, false).match_list(paths, &mut matcher);
+let matches = Atom::new("foo bar", CaseMatching::Ignore, Normalization::Smart, AtomKind::Fuzzy, false).match_list(paths, &mut matcher);
 assert_eq!(matches, vec![("foo bar", 192)]);
 ```
 
@@ -496,15 +496,8 @@ impl Matcher {
                         .substring_match_1_non_ascii::<INDICES>(haystack, needle, start, indices);
                     return Some(res);
                 }
-                let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?;
-                self.fuzzy_match_optimal::<INDICES, char, char>(
-                    haystack,
-                    needle,
-                    start,
-                    start + 1,
-                    end,
-                    indices,
-                )
+                let (start, _) = self.prefilter_non_ascii(haystack, needle_, false)?;
+                self.substring_match_non_ascii::<INDICES, _>(haystack, needle, start, indices)
             }
         }
     }

diff --git a/matcher/src/pattern.rs b/matcher/src/pattern.rs
@@ -25,6 +25,19 @@ pub enum CaseMatching {
     Smart,
 }
 
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
+#[non_exhaustive]
+/// How to handle unicode normalization,
+pub enum Normalization {
+    /// Characters never match their normalized version (`a != ä`).
+    Never,
+    /// Acts like [`Never`](Normalization::Never) if any character in a pattern atom
+    /// would need to be normalized. Otherwise normalization occurs (`a == ä` but `ä != a`).
+    #[default]
+    #[cfg(feature = "unicode-normalization")]
+    Smart,
+}
+
 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
 #[non_exhaustive]
 /// The kind of matching algorithm to run for an atom.
@@ -73,24 +86,41 @@ pub struct Atom {
     pub kind: AtomKind,
     needle: Utf32String,
     ignore_case: bool,
+    normalize: bool,
 }
 
 impl Atom {
     /// Creates a single [`Atom`] from a string by performing unicode
     /// normalization and case folding (if necessary). Optionally `\ ` can
     /// be escaped to ` `.
-    pub fn new(needle: &str, case: CaseMatching, kind: AtomKind, escape_whitespace: bool) -> Atom {
-        Atom::new_inner(needle, case, kind, escape_whitespace, false)
+    pub fn new(
+        needle: &str,
+        case: CaseMatching,
+        normalize: Normalization,
+        kind: AtomKind,
+        escape_whitespace: bool,
+    ) -> Atom {
+        Atom::new_inner(needle, case, normalize, kind, escape_whitespace, false)
     }
 
     fn new_inner(
         needle: &str,
         case: CaseMatching,
+        normalization: Normalization,
         kind: AtomKind,
         escape_whitespace: bool,
         append_dollar: bool,
     ) -> Atom {
         let mut ignore_case;
+        let mut normalize;
+        #[cfg(feature = "unicode-normalization")]
+        {
+            normalize = matches!(normalization, Normalization::Smart);
+        }
+        #[cfg(not(feature = "unicode-normalization"))]
+        {
+            normalize = false;
+        }
         let needle = if needle.is_ascii() {
             let mut needle = if escape_whitespace {
                 if let Some((start, rem)) = needle.split_once("\\ ") {
@@ -133,6 +163,10 @@ impl Atom {
             {
                 ignore_case = false;
             }
+            #[cfg(feature = "unicode-normalization")]
+            {
+                normalize = matches!(normalization, Normalization::Smart);
+            }
             if escape_whitespace {
                 let mut saw_backslash = false;
                 for mut c in chars::graphemes(needle) {
@@ -155,6 +189,13 @@ impl Atom {
                         }
                         CaseMatching::Respect => (),
                     }
+                    match normalization {
+                        #[cfg(feature = "unicode-normalization")]
+                        Normalization::Smart => {
+                            normalize = normalize && chars::normalize(c) == c;
+                        }
+                        Normalization::Never => (),
+                    }
                     needle_.push(c);
                 }
             } else {
@@ -168,6 +209,13 @@ impl Atom {
                         }
                         CaseMatching::Respect => (),
                     }
+                    match normalization {
+                        #[cfg(feature = "unicode-normalization")]
+                        Normalization::Smart => {
+                            normalize = normalize && chars::normalize(c) == c;
+                        }
+                        Normalization::Never => (),
+                    }
                     c
                 });
                 needle_.extend(chars);
@@ -182,13 +230,14 @@ impl Atom {
             needle,
             negative: false,
             ignore_case,
+            normalize,
         }
     }
 
     /// Parse a pattern atom from a string. Some special trailing and leading
     /// characters can be used to control the atom kind. See [`AtomKind`] for
     /// details.
-    pub fn parse(raw: &str, case: CaseMatching) -> Atom {
+    pub fn parse(raw: &str, case: CaseMatching, normalize: Normalization) -> Atom {
         let mut atom = raw;
         let invert = match atom.as_bytes() {
             [b'!', ..] => {
@@ -239,7 +288,7 @@ impl Atom {
             kind = AtomKind::Substring
         }
 
-        let mut pattern = Atom::new_inner(atom, case, kind, true, append_dollar);
+        let mut pattern = Atom::new_inner(atom, case, normalize, kind, true, append_dollar);
         pattern.negative = invert;
         pattern
     }
@@ -252,6 +301,7 @@ impl Atom {
     /// each pattern atom.
     pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u16> {
         matcher.config.ignore_case = self.ignore_case;
+        matcher.config.normalize = self.normalize;
         let pattern_score = match self.kind {
             AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)),
             AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)),
@@ -285,6 +335,7 @@ impl Atom {
         indices: &mut Vec<u32>,
     ) -> Option<u16> {
         matcher.config.ignore_case = self.ignore_case;
+        matcher.config.normalize = self.normalize;
         if self.negative {
             let pattern_score = match self.kind {
                 AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)),
@@ -371,10 +422,15 @@ impl Pattern {
     /// can be escaped with `\`). Otherwise no parsing is performed (so $, !, '
     /// and ^ don't receive special treatment). If you want to match the entire
     /// pattern as a single needle use a single [`Atom`] instead.
-    pub fn new(pattern: &str, case_matching: CaseMatching, kind: AtomKind) -> Pattern {
+    pub fn new(
+        pattern: &str,
+        case_matching: CaseMatching,
+        normalize: Normalization,
+        kind: AtomKind,
+    ) -> Pattern {
         let atoms = pattern_atoms(pattern)
             .filter_map(|pat| {
-                let pat = Atom::new(pat, case_matching, kind, true);
+                let pat = Atom::new(pat, case_matching, normalize, kind, true);
                 (!pat.needle.is_empty()).then_some(pat)
             })
             .collect();
@@ -384,10 +440,10 @@ impl Pattern {
     /// can be escaped with `\`). And $, !, ' and ^ at word boundaries will
     /// cause different matching behaviour (see [`AtomKind`]). These can be
     /// escaped with backslash.
-    pub fn parse(pattern: &str, case_matching: CaseMatching) -> Pattern {
+    pub fn parse(pattern: &str, case_matching: CaseMatching, normalize: Normalization) -> Pattern {
         let atoms = pattern_atoms(pattern)
             .filter_map(|pat| {
-                let pat = Atom::parse(pat, case_matching);
+                let pat = Atom::parse(pat, case_matching, normalize);
                 (!pat.needle.is_empty()).then_some(pat)
             })
             .collect();
@@ -477,10 +533,15 @@ impl Pattern {
     /// Refreshes this pattern by reparsing it from a string. This is mostly
     /// equivalent to just constructing a new pattern using [`Pattern::parse`]
     /// but is slightly more efficient by reusing some allocations
-    pub fn reparse(&mut self, pattern: &str, case_matching: CaseMatching) {
+    pub fn reparse(
+        &mut self,
+        pattern: &str,
+        case_matching: CaseMatching,
+        normalize: Normalization,
+    ) {
         self.atoms.clear();
         let atoms = pattern_atoms(pattern).filter_map(|atom| {
-            let atom = Atom::parse(atom, case_matching);
+            let atom = Atom::parse(atom, case_matching, normalize);
             if atom.needle.is_empty() {
                 return None;
             }