Skip to content

Commit

Permalink
Document the regex accents peculiarity
Browse files Browse the repository at this point in the history
  • Loading branch information
radeusgd committed Feb 21, 2022
1 parent bb6cf24 commit fcdee07
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -777,6 +777,19 @@ Text.ends_with suffix = Text_Utils.ends_with this suffix
ensures that different ways of expressing the same character in the
underlying binary representation are considered equal.

This, however, is not always handled well by the regex engine. The
behaviour is as follows:

'ś' . contains 's' == False
's\u{301}' . contains 's' == False
's\u{301}' . contains 'ś' == True
'ś' . contains 's\u{301}' == True

'ś' . contains 's' (Regex_Matcher.new) == True
's\u{301}' . contains 's' (Regex_Matcher.new) == True
's\u{301}' . contains 'ś' (Regex_Matcher.new) == True
'ś' . contains 's\u{301}' (Regex_Matcher.new) == True

> Example
See if the text "Hello" contains the text "ell".

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ polyglot java import java.util.regex.PatternSyntaxException

polyglot java import com.ibm.icu.impl.UnicodeRegex
polyglot java import org.enso.base.Regex_Utils
polyglot java import org.enso.base.Text_Utils

## Construct an instance of the default engine.

Expand Down Expand Up @@ -173,9 +174,17 @@ type Pattern
- input: The text on which it will be matching.
- start: The start of the matcher's region.
- end: The end of the matcher's region.

! Unicode Normalization
The regex engine used here handles combining characters, such as
accents, in a peculiar way. The string "s\u{301}" will be treated as
containing "s" within it, but "ś" (which is canonically equivalent to
the former) will not contain "s". To get behaviour that is consistent
regardless of the normalization form of the input, we normalize all input.
build_matcher : Text -> Integer -> Integer -> Java_Matcher
build_matcher input start end =
internal_matcher = this.internal_pattern.matcher input . region start end
normalized_input = Text_Utils.normalize input
internal_matcher = this.internal_pattern.matcher normalized_input . region start end

if this.options.contains No_Anchoring_Bounds then
internal_matcher.useAnchoringBounds False
Expand Down
14 changes: 13 additions & 1 deletion std-bits/base/src/main/java/org/enso/base/Text_Utils.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.enso.base;

import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.StringSearch;
import java.nio.charset.StandardCharsets;
import java.util.regex.Pattern;
Expand Down Expand Up @@ -199,7 +200,8 @@ public static int compare_normalized(String a, String b) {
* @return whether {@code substring} is a substring of {@code string}.
*/
public static boolean contains(String string, String substring) {
// {@code StringSearch} does not handle empty strings as we would want, so we need these special cases.
// {@code StringSearch} does not handle empty strings as we would want, so we need these special
// cases.
if (substring.length() == 0) return true;
if (string.length() == 0) return false;
StringSearch searcher = new StringSearch(substring, string);
Expand All @@ -218,4 +220,14 @@ public static boolean contains(String string, String substring) {
/**
 * Replaces every occurrence of {@code oldSequence} in {@code str} with {@code newSequence}.
 *
 * <p>Thin wrapper around {@link String#replace(CharSequence, CharSequence)}: matching is a plain
 * literal (non-regex) substring match, and a fresh string is returned (the input is unmodified).
 *
 * @param str the string to perform replacements in
 * @param oldSequence the literal substring to search for
 * @param newSequence the replacement text
 * @return a copy of {@code str} with all occurrences of {@code oldSequence} replaced
 */
public static String replace(String str, String oldSequence, String newSequence) {
  String result = str.replace(oldSequence, newSequence);
  return result;
}

/**
 * Returns the canonical decomposition (Unicode NFD form) of the given string.
 *
 * <p>This gives every input a common representation of accented characters: e.g. `ś` is
 * decomposed into the base letter `s` followed by a separate combining-accent codepoint.
 */
public static String normalize(String str) {
  // getNFDInstance returns a shared, thread-safe singleton, so there is no per-call setup cost.
  Normalizer2 nfd = Normalizer2.getNFDInstance();
  return nfd.normalize(str);
}
}
23 changes: 21 additions & 2 deletions test/Tests/src/Data/Text_Spec.enso
Original file line number Diff line number Diff line change
Expand Up @@ -251,12 +251,31 @@ spec =
"123 meters and 4 centimeters" . contains "[0-9]+" Regex_Matcher.new . should_be_true
"foo" . contains "[0-9]+" Regex_Matcher.new . should_be_false

'ś' . contains 's' . should_be_false
's\u{301}' . contains 's' . should_be_false
's\u{301}' . contains 'ś' . should_be_true
'ś' . contains 's\u{301}' . should_be_true

## The first two cases below are not really desirable, but we document
   the current behaviour here.
## This shows the regex engine's default handling, which we cannot
   easily fix.
's\u{301}' . contains 's' (Regex_Matcher.new) . should_be_true
## This would normally be false, but we perform input normalization
to get results that are consistent regardless of if the input was
normalized or not.
'ś' . contains 's' (Regex_Matcher.new) . should_be_true
's\u{301}' . contains 'ś' (Regex_Matcher.new) . should_be_true
'ś' . contains 's\u{301}' (Regex_Matcher.new) . should_be_true

"Cześć" . contains "ś" Regex_Matcher.new . should_be_true
"Cześć" . contains "s" Regex_Matcher.new . should_be_false
"Cześć" . contains 's\u{301}' Regex_Matcher.new . should_be_true
'Czes\u{301}c\u{301}' . contains 's\u{301}' Regex_Matcher.new . should_be_true
'Czes\u{301}c\u{301}' . contains 'ś' Regex_Matcher.new . should_be_true
'Czes\u{301}c\u{301}' . contains 's' Regex_Matcher.new . should_be_false
## The two tests below are disabled due to how the regex engine
   handles letters with accents. See the tests above for an explanation.
#"Cześć" . contains "s" Regex_Matcher.new . should_be_false
#'Czes\u{301}c\u{301}' . contains 's' Regex_Matcher.new . should_be_false

"fooBar" . contains "b.." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_true
"foar" . contains "b.." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_false
Expand Down

0 comments on commit fcdee07

Please sign in to comment.