Skip to content

Commit

Permalink
Document the regex accents peculiarity
Browse files Browse the repository at this point in the history
  • Loading branch information
radeusgd committed Feb 21, 2022
1 parent bb6cf24 commit fcdee07
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -777,6 +777,19 @@ Text.ends_with suffix = Text_Utils.ends_with this suffix
ensures that different ways of expressing the same character in the
underlying binary representation are considered equal.

This, however, is not always handled well by the regex engine. The
behaviour is as follows:

'ś' . contains 's' == False
's\u{301}' . contains 's' == False
's\u{301}' . contains 'ś' == True
'ś' . contains 's\u{301}' == True

'ś' . contains 's' (Regex_Matcher.new) == True
's\u{301}' . contains 's' (Regex_Matcher.new) == True
's\u{301}' . contains 'ś' (Regex_Matcher.new) == True
'ś' . contains 's\u{301}' (Regex_Matcher.new) == True

> Example
See if the text "Hello" contains the text "ell".

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ polyglot java import java.util.regex.PatternSyntaxException

polyglot java import com.ibm.icu.impl.UnicodeRegex
polyglot java import org.enso.base.Regex_Utils
polyglot java import org.enso.base.Text_Utils

## Construct an instance of the default engine.

Expand Down Expand Up @@ -173,9 +174,17 @@ type Pattern
- input: The text on which it will be matching.
- start: The start of the matcher's region.
- end: The end of the matcher's region.

! Unicode Normalization
The regex engine used here handles combining characters, such as
accents, in a peculiar way. The string "s\u{301}" will be treated as
containing "s" within it, but "ś" (which is canonically equivalent to
the former) will not contain "s". To get behaviour that is consistent
regardless of the normalization form of the input, we normalize all input.
build_matcher : Text -> Integer -> Integer -> Java_Matcher
build_matcher input start end =
internal_matcher = this.internal_pattern.matcher input . region start end
normalized_input = Text_Utils.normalize input
internal_matcher = this.internal_pattern.matcher normalized_input . region start end

if this.options.contains No_Anchoring_Bounds then
internal_matcher.useAnchoringBounds False
Expand Down
14 changes: 13 additions & 1 deletion std-bits/base/src/main/java/org/enso/base/Text_Utils.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.enso.base;

import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.StringSearch;
import java.nio.charset.StandardCharsets;
import java.util.regex.Pattern;
Expand Down Expand Up @@ -199,7 +200,8 @@ public static int compare_normalized(String a, String b) {
* @return whether {@code substring} is a substring of {@code string}.
*/
public static boolean contains(String string, String substring) {
// {@code StringSearch} does not handle empty strings as we would want, so we need these special cases.
// {@code StringSearch} does not handle empty strings as we would want, so we need these special
// cases.
if (substring.length() == 0) return true;
if (string.length() == 0) return false;
StringSearch searcher = new StringSearch(substring, string);
Expand All @@ -218,4 +220,14 @@ public static boolean contains(String string, String substring) {
/**
 * Replaces every occurrence of {@code oldSequence} in {@code str} with {@code newSequence}.
 *
 * <p>Thin wrapper around {@link String#replace(CharSequence, CharSequence)}: matching is a plain
 * literal (non-regex) substring match, and a fresh string is returned (the input is unmodified).
 *
 * @param str the string to perform replacements in
 * @param oldSequence the literal substring to search for
 * @param newSequence the replacement text
 * @return a copy of {@code str} with all occurrences of {@code oldSequence} replaced
 */
public static String replace(String str, String oldSequence, String newSequence) {
  String result = str.replace(oldSequence, newSequence);
  return result;
}

/**
 * Returns the canonical decomposition (Unicode NFD form) of the given string.
 *
 * <p>This gives every input a common representation of accented characters: e.g. `ś` is
 * decomposed into the base letter `s` followed by a separate combining-accent codepoint.
 */
public static String normalize(String str) {
  // getNFDInstance returns a shared, thread-safe singleton, so there is no per-call setup cost.
  Normalizer2 nfd = Normalizer2.getNFDInstance();
  return nfd.normalize(str);
}
}
23 changes: 21 additions & 2 deletions test/Tests/src/Data/Text_Spec.enso
Original file line number Diff line number Diff line change
Expand Up @@ -251,12 +251,31 @@ spec =
"123 meters and 4 centimeters" . contains "[0-9]+" Regex_Matcher.new . should_be_true
"foo" . contains "[0-9]+" Regex_Matcher.new . should_be_false

'ś' . contains 's' . should_be_false
's\u{301}' . contains 's' . should_be_false
's\u{301}' . contains 'ś' . should_be_true
'ś' . contains 's\u{301}' . should_be_true

## The first two cases below are not really desirable, but we document
   the current behaviour here.
## This shows the regex engine's default handling, which we cannot
   easily fix.
's\u{301}' . contains 's' (Regex_Matcher.new) . should_be_true
## This would normally be false, but we perform input normalization
to get results that are consistent regardless of if the input was
normalized or not.
'ś' . contains 's' (Regex_Matcher.new) . should_be_true
's\u{301}' . contains 'ś' (Regex_Matcher.new) . should_be_true
'ś' . contains 's\u{301}' (Regex_Matcher.new) . should_be_true

"Cześć" . contains "ś" Regex_Matcher.new . should_be_true
"Cześć" . contains "s" Regex_Matcher.new . should_be_false
"Cześć" . contains 's\u{301}' Regex_Matcher.new . should_be_true
'Czes\u{301}c\u{301}' . contains 's\u{301}' Regex_Matcher.new . should_be_true
'Czes\u{301}c\u{301}' . contains 'ś' Regex_Matcher.new . should_be_true
'Czes\u{301}c\u{301}' . contains 's' Regex_Matcher.new . should_be_false
## The two tests below are disabled due to how the regex engine
   handles letters with accents. See the tests above for an explanation.
#"Cześć" . contains "s" Regex_Matcher.new . should_be_false
#'Czes\u{301}c\u{301}' . contains 's' Regex_Matcher.new . should_be_false

"fooBar" . contains "b.." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_true
"foar" . contains "b.." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_false
Expand Down

0 comments on commit fcdee07

Please sign in to comment.