enso-org · mergify · Feb 23, 2022 · Feb 18, 2022 · Feb 18, 2022 · Feb 18, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -50,6 +50,8 @@
   search.][3285]
 - [Implemented new `Text.take` and `Text.drop` functions, replacing existing
   functions][3287]
+- [Implemented new `Text.starts_with` and `Text.ends_with` functions, replacing
+  existing functions][3292]
 
 [debug-shortcuts]:
   https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -78,6 +80,7 @@
 [3282]: https://github.com/enso-org/enso/pull/3282
 [3285]: https://github.com/enso-org/enso/pull/3285
 [3287]: https://github.com/enso-org/enso/pull/3287
+[3292]: https://github.com/enso-org/enso/pull/3292
 
 #### Enso Compiler
 

@@ -711,26 +711,61 @@ Text.from_codepoints codepoints = Text_Utils.from_codepoints codepoints.to_array
 
    Arguments:
    - prefix: The prefix to see if `this` starts with.
+   - matcher: If a `Text_Matcher`, the text is compared using case-sensitivity
+     rules specified in the matcher.
+     If a `Regex_Matcher`, the `prefix` is used as a regular expression and
+     matched at the start of `this` using the associated options.
 
    ! Unicode Equality
      The definition of equality includes Unicode canonicalization. I.e. two
      texts are equal if they are identical after canonical decomposition. This
      ensures that different ways of expressing the same character in the
      underlying binary representation are considered equal.
 
-   > Example
-     See if the text "Hello" starts with the prefix "hi".
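+     This, however, is not always handled well by the regex engine: with a
+     `Regex_Matcher`, a bare base letter also matches the start of a text
+     beginning with its accented form. The behaviour is as follows: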
+
+         'ś' . starts_with 's' == False
+         's\u{301}' . starts_with 's' == False
+         's\u{301}' . starts_with 'ś' == True
+         'ś' . starts_with 's\u{301}' == True
 
-         "Hello".starts_with "hi"
-Text.starts_with : Text -> Boolean
-Text.starts_with prefix = Text_Utils.starts_with this prefix
+         'ś' . starts_with 's' (Regex_Matcher.new) == True
+         's\u{301}' . starts_with 's' (Regex_Matcher.new) == True
+         's\u{301}' . starts_with 'ś' (Regex_Matcher.new) == True
+         'ś' . starts_with 's\u{301}' (Regex_Matcher.new) == True
+
+   > Example
+     See if the text "Hello!" starts with the specified prefix.
+
+         "Hello!".starts_with "Hello" == True
+         "Hello!".starts_with "hello" == False
+         "Hello!".starts_with "hello" (Text_Matcher Case_Insensitive.new) == True
+         "Hello!".starts_with "[a-z]" Regex_Matcher.new == False
+         "Hello!".starts_with "[A-Z]" Regex_Matcher.new == True
+Text.starts_with : Text -> (Text_Matcher | Regex_Matcher) -> Boolean
+Text.starts_with prefix matcher=Text_Matcher.new = case matcher of
+    Text_Matcher case_sensitivity ->
+        # Only the leading part of `this`, as long as `prefix`, can be equal
+        # to `prefix`, so both comparison modes work on that slice.
+        head = this.take (Text_Sub_Range.First prefix.length)
+        case case_sensitivity of
+            True -> head == prefix
+            Case_Insensitive locale -> head.equals_ignore_case prefix locale=locale
+    Regex_Matcher _ _ _ _ _ ->
+        # `\A` pins the match to the very beginning of `this`; the
+        # non-capturing group keeps alternations written in `prefix` inside
+        # the anchored part of the pattern.
+        anchored_pattern = "\A(?:" + prefix + ")"
+        regex = here.prepare_regex anchored_pattern matcher
+        found = regex.match this Mode.First
+        found.is_nothing.not
 
 ## ALIAS Check Suffix
 
    Checks whether `this` ends with `suffix`.
 
    Arguments:
    - suffix: The suffix to see if `this` ends with.
+   - matcher: If a `Text_Matcher`, the text is compared using case-sensitivity
+     rules specified in the matcher.
+     If a `Regex_Matcher`, the `suffix` is used as a regular expression and
+     matched at the end of `this` using the associated options.
 
    ! Unicode Equality
      The definition of equality includes Unicode canonicalization. I.e. two
@@ -739,10 +774,24 @@ Text.starts_with prefix = Text_Utils.starts_with this prefix
      underlying binary representation are considered equal.
 
    > Example
-     See if the text "Hello" ends with the suffix "low".
-         "Hello".ends_with "low"
-Text.ends_with : Text -> Boolean
-Text.ends_with suffix = Text_Utils.ends_with this suffix
+     See if the text "Hello World" ends with the specified suffix.
+
+         "Hello World".ends_with "World" == True
+         "Hello World".ends_with "world" == False
+         "Hello World".ends_with "world" (Text_Matcher Case_Insensitive.new) == True
+         "Hello World".ends_with "[A-Z][a-z]{4}" Regex_Matcher.new == True
+Text.ends_with : Text -> (Text_Matcher | Regex_Matcher) -> Boolean
+Text.ends_with suffix matcher=Text_Matcher.new = case matcher of
+    Text_Matcher case_sensitivity ->
+        # Only the trailing part of `this`, as long as `suffix`, can be equal
+        # to `suffix`, so both comparison modes work on that slice.
+        tail = this.take (Text_Sub_Range.Last suffix.length)
+        case case_sensitivity of
+            True -> tail == suffix
+            Case_Insensitive locale -> tail.equals_ignore_case suffix locale=locale
+    Regex_Matcher _ _ _ _ _ ->
+        # `\z` pins the match to the very end of `this`; the non-capturing
+        # group keeps alternations written in `suffix` inside the anchored
+        # part of the pattern.
+        anchored_pattern = "(?:" + suffix + ")\z"
+        regex = here.prepare_regex anchored_pattern matcher
+        found = regex.match this Mode.First
+        found.is_nothing.not
 
 ## ALIAS Contains
 
@@ -801,14 +850,8 @@ Text.contains term="" matcher=Text_Matcher.new = case matcher of
         True -> Text_Utils.contains this term
         Case_Insensitive locale ->
             Text_Utils.contains (this.to_case_insensitive_key locale) (term.to_case_insensitive_key locale)
-    Regex_Matcher case_sensitive multiline match_ascii dot_matches_newline comments ->
-        case_insensitive = case case_sensitive of
-            True -> False
-            ## TODO [RW] Currently locale is not supported in case-insensitive
-               Regex matching. There are plans to revisit it:
-               https://www.pivotaltracker.com/story/show/181313576
-            Case_Insensitive _ -> True
-        compiled_pattern = Regex.compile term case_insensitive=case_insensitive match_ascii=match_ascii dot_matches_newline=dot_matches_newline multiline=multiline comments=comments
+    Regex_Matcher _ _ _ _ _ ->
+        compiled_pattern = here.prepare_regex term matcher
         match = compiled_pattern.match this Mode.First
         match.is_nothing.not
 
@@ -997,3 +1040,16 @@ Text.to_lower_case locale=Locale.default =
 Text.to_upper_case : Locale.Locale -> Text
 Text.to_upper_case locale=Locale.default =
     UCharacter.toUpperCase locale.java_locale this
+
+## PRIVATE
+
+   Compiles `pattern` into a regex using the options stored in
+   `regex_matcher`.
+
+   Arguments:
+   - pattern: The regular expression text to compile.
+   - regex_matcher: The `Regex_Matcher` whose fields are forwarded to
+     `Regex.compile`.
+prepare_regex : Text -> Regex_Matcher -> Pattern
+prepare_regex pattern regex_matcher = case regex_matcher of
+    Regex_Matcher case_sensitive multiline match_ascii dot_matches_newline comments ->
+        ignore_case = case case_sensitive of
+            True -> False
+            ## TODO [RW] The locale carried by `Case_Insensitive` is ignored
+               for now, as locale-aware case-insensitive Regex matching is
+               not supported. There are plans to revisit it:
+               https://www.pivotaltracker.com/story/show/181313576
+            Case_Insensitive _ -> True
+        Regex.compile pattern case_insensitive=ignore_case match_ascii=match_ascii dot_matches_newline=dot_matches_newline multiline=multiline comments=comments
@@ -157,28 +157,6 @@ public static String from_chars(char[] chars) {
     return String.valueOf(chars);
   }
 
-  /**
-   * Checks whether {@code prefix} is a prefix of {@code str}.
-   *
-   * @param str the string to check
-   * @param prefix the potential prefix
-   * @return whether {@code prefix} is a prefix of {@code str}
-   */
-  public static boolean starts_with(String str, String prefix) {
-    return str.startsWith(prefix);
-  }
-
-  /**
-   * Checks whether {@code suffix} is a suffix of {@code str}.
-   *
-   * @param str the string to check
-   * @param suffix the potential suffix
-   * @return whether {@code suffix} is a suffix of {@code str}
-   */
-  public static boolean ends_with(String str, String suffix) {
-    return str.endsWith(suffix);
-  }
-
   /**
    * Compares {@code a} to {@code b} according to the lexicographical order, handling Unicode
    * normalization.

@@ -90,6 +90,7 @@ spec =
             "I" . equals_ignore_case "ı" . should_be_true
             "İ" . equals_ignore_case "i" . should_be_false
             "İ" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_true
+            "I" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_false
 
             "Kongressstraße"=="Kongressstrasse" . should_be_false
             "Kongressstraße" . equals_ignore_case "Kongressstrasse" . should_be_true
@@ -425,6 +426,185 @@ spec =
             long_text . contains "EOL.SOL" (Regex_Matcher.new dot_matches_newline=True) . should_be_true
             long_text . contains "EOL.SOL" (Regex_Matcher.new dot_matches_newline=False) . should_be_false
 
+        Test.specify "should check for starts_with using Unicode normalization" <|
+            "Hello".starts_with "He" . should_be_true
+
+            # Precomposed and decomposed (base letter + combining accent)
+            # spellings are interchangeable; the bare base letter is not a
+            # prefix of either spelling.
+            "Ściana".starts_with 'S\u{301}' . should_be_true
+            "Ściana".starts_with 'Ś' . should_be_true
+            "Ściana".starts_with 'S' . should_be_false
+            'S\u{301}ciana'.starts_with 'Ś' . should_be_true
+            'S\u{301}ciana'.starts_with 'S\u{301}' . should_be_true
+            'S\u{301}ciana'.starts_with 'S' . should_be_false
+
+            # Case sensitivity by default, plus empty text / empty prefix.
+            "ABC" . starts_with "A" . should_be_true
+            "ABC" . starts_with "a" . should_be_false
+            "" . starts_with "foo" . should_be_false
+            "abc" . starts_with "" . should_be_true
+            "" . starts_with "" . should_be_true
+            "foo foo foo" . starts_with "foo" . should_be_true
+
+            "Hello!".starts_with "he" . should_be_false
+
+        Test.specify "starts_with should work as shown in the examples" <|
+            # Mirrors the examples listed in the `Text.starts_with` docs.
+            "Hello!".starts_with "Hello" . should_be_true
+            "Hello!".starts_with "hello" . should_be_false
+            "Hello!".starts_with "hello" (Text_Matcher Case_Insensitive.new) . should_be_true
+            "Hello!".starts_with "[a-z]" Regex_Matcher.new . should_be_false
+            "Hello!".starts_with "[A-Z]" Regex_Matcher.new . should_be_true
+
+        Test.specify "should allow for case-insensitive starts_with checks" <|
+            "Hello".starts_with "he" (Text_Matcher Case_Insensitive.new) . should_be_true
+
+            # Case is ignored, but the accent still has to be present: the
+            # bare base letter is not a prefix of the accented text.
+            "Ściana".starts_with 's\u{301}' (Text_Matcher Case_Insensitive.new) . should_be_true
+            "Ściana".starts_with 's' (Text_Matcher Case_Insensitive.new) . should_be_false
+            'S\u{301}ciana'.starts_with 'ś' (Text_Matcher Case_Insensitive.new) . should_be_true
+            'S\u{301}ciana'.starts_with 's\u{301}' (Text_Matcher Case_Insensitive.new) . should_be_true
+            'S\u{301}ciana'.starts_with 's' (Text_Matcher Case_Insensitive.new) . should_be_false
+
+            # Plain ASCII cases, plus empty text / empty prefix.
+            "ABC" . starts_with "A" (Text_Matcher Case_Insensitive.new) . should_be_true
+            "ABC" . starts_with "a" (Text_Matcher Case_Insensitive.new) . should_be_true
+            "ABC" . starts_with "C" (Text_Matcher Case_Insensitive.new) . should_be_false
+            "" . starts_with "foo" (Text_Matcher Case_Insensitive.new) . should_be_false
+            "abc" . starts_with "" (Text_Matcher Case_Insensitive.new) . should_be_true
+            "" . starts_with "" (Text_Matcher Case_Insensitive.new) . should_be_true
+            "fOo FOO foo" . starts_with "FoO" (Text_Matcher Case_Insensitive.new) . should_be_true
+
+            "Hello!".starts_with "he" (Text_Matcher Case_Insensitive.new) . should_be_true
+
+        Test.specify "should allow for Regex starts_with checks" <|
+            "Hello!".starts_with "[A-Z]" Regex_Matcher.new . should_be_true
+            "foobar" . starts_with ".o." Regex_Matcher.new . should_be_true
+            "foob" . starts_with ".f." Regex_Matcher.new . should_be_false
+
+            "123 meters and 4 centimeters" . starts_with "[0-9]+" Regex_Matcher.new . should_be_true
+            "foo 123" . starts_with "[0-9]+" Regex_Matcher.new . should_be_false
+
+            # The checks below are real assertions: a bare `... == True`
+            # expression would only compute a Boolean and discard it, so a
+            # regression would go unnoticed.
+
+            # Correct non-regex behaviour for reference.
+            'ś' . starts_with 's' . should_be_false
+            's\u{301}' . starts_with 's' . should_be_false
+            's\u{301}' . starts_with 'ś' . should_be_true
+            'ś' . starts_with 's\u{301}' . should_be_true
+
+            # These two behave as expected.
+            's\u{301}' . starts_with 'ś' (Regex_Matcher.new) . should_be_true
+            'ś' . starts_with 's\u{301}' (Regex_Matcher.new) . should_be_true
+
+            ## These two are included to document the current behaviour
+               (even though ideally, we would want them to return False).
+            'ś' . starts_with 's' (Regex_Matcher.new) . should_be_true
+            's\u{301}' . starts_with 's' (Regex_Matcher.new) . should_be_true
+
+            "ściana" . starts_with "ś" Regex_Matcher.new . should_be_true
+            "ściana" . starts_with 's\u{301}' Regex_Matcher.new . should_be_true
+            's\u{301}ciana' . starts_with 's\u{301}' Regex_Matcher.new . should_be_true
+            's\u{301}ciana' . starts_with 'ś' Regex_Matcher.new . should_be_true
+
+            ## These two tests below are disabled due to how regex is handling
+               letters with accents. See the tests above for explanation.
+            #"ściana" . starts_with "s" Regex_Matcher.new . should_be_false
+            # 's\u{301}ciana' . starts_with 's' Regex_Matcher.new . should_be_false
+
+            "fOOBar" . starts_with ".o." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_true
+            "faaaar" . starts_with ".o." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_false
+
+            long_text = """
+                EOL
+                SOL Hmm...
+            long_text . starts_with "EOL.SOL" (Regex_Matcher.new dot_matches_newline=True) . should_be_true
+            long_text . starts_with "EOL.SOL" (Regex_Matcher.new dot_matches_newline=False) . should_be_false
+
+            # Alternations in the pattern must stay anchored to the start, and
+            # anchors supplied by the caller must keep working.
+            "aaazzz" . starts_with "a|b" Regex_Matcher.new . should_be_true
+            "bbbzzz" . starts_with "a|b" Regex_Matcher.new . should_be_true
+            "zzzaaa" . starts_with "a|b" Regex_Matcher.new . should_be_false
+            "zzzbbb" . starts_with "a|b" Regex_Matcher.new . should_be_false
+            "aaazzz" . starts_with "(a|b){2}" Regex_Matcher.new . should_be_true
+            "bbbzzz" . starts_with "(a|b){2}" Regex_Matcher.new . should_be_true
+            "zzzaaa" . starts_with "(a|b){2}" Regex_Matcher.new . should_be_false
+            "ABC" . starts_with "\AA" Regex_Matcher.new . should_be_true
+            "ABC" . starts_with "\AA\z" Regex_Matcher.new . should_be_false
+            "foobar" . starts_with "" Regex_Matcher.new . should_be_true
+            "" . starts_with "" Regex_Matcher.new . should_be_true
+
+        Test.specify "should check for ends_with using Unicode normalization" <|
+            "Hello".ends_with "lo" . should_be_true
+            "Hello".ends_with "LO" . should_be_false
+
+            # Precomposed and decomposed (base letter + combining accent)
+            # spellings are interchangeable; the bare base letter is not a
+            # suffix of either spelling.
+            "rzeczywistość".ends_with 'c\u{301}' . should_be_true
+            "rzeczywistość".ends_with 'ć' . should_be_true
+            "rzeczywistość".ends_with 'c' . should_be_false
+            'rzeczywistos\u{301}c\u{301}'.ends_with 'ć' . should_be_true
+            'rzeczywistos\u{301}c\u{301}'.ends_with 'c\u{301}' . should_be_true
+            'rzeczywistos\u{301}c\u{301}'.ends_with 'c' . should_be_false
+
+            # Case sensitivity by default, plus empty text / empty suffix.
+            "ABC" . ends_with "C" . should_be_true
+            "ABC" . ends_with "c" . should_be_false
+            "" . ends_with "foo" . should_be_false
+            "abc" . ends_with "" . should_be_true
+            "" . ends_with "" . should_be_true
+            "foo foo foo" . ends_with "foo" . should_be_true
+
+        Test.specify "ends_with should work as shown in the examples" <|
+            # Mirrors the examples listed in the `Text.ends_with` docs.
+            "Hello World".ends_with "World" . should_be_true
+            "Hello World".ends_with "world" . should_be_false
+            "Hello World".ends_with "world" (Text_Matcher Case_Insensitive.new) . should_be_true
+            "Hello World".ends_with "[A-Z][a-z]{4}" Regex_Matcher.new . should_be_true
+
+        Test.specify "should allow for case-insensitive ends_with checks" <|
+            "Hello".ends_with "LO" (Text_Matcher Case_Insensitive.new) . should_be_true
+
+            # Case is ignored, but the accent still has to be present: the
+            # bare base letter is not a suffix of the accented text.
+            "rzeczywistość".ends_with 'C\u{301}' (Text_Matcher Case_Insensitive.new) . should_be_true
+            "rzeczywistość".ends_with 'C' (Text_Matcher Case_Insensitive.new) . should_be_false
+            'rzeczywistos\u{301}c\u{301}'.ends_with 'Ć' (Text_Matcher Case_Insensitive.new) . should_be_true
+            'rzeczywistos\u{301}c\u{301}'.ends_with 'C\u{301}' (Text_Matcher Case_Insensitive.new) . should_be_true
+            'rzeczywistos\u{301}c\u{301}'.ends_with 'C' (Text_Matcher Case_Insensitive.new) . should_be_false
+
+            # Plain ASCII cases, plus empty text / empty suffix.
+            "ABC" . ends_with "C" (Text_Matcher Case_Insensitive.new) . should_be_true
+            "ABC" . ends_with "c" (Text_Matcher Case_Insensitive.new) . should_be_true
+            "ABC" . ends_with "A" (Text_Matcher Case_Insensitive.new) . should_be_false
+            "" . ends_with "foo" (Text_Matcher Case_Insensitive.new) . should_be_false
+            "abc" . ends_with "" (Text_Matcher Case_Insensitive.new) . should_be_true
+            "" . ends_with "" (Text_Matcher Case_Insensitive.new) . should_be_true
+            "fOo FOO fOo" . ends_with "FoO" (Text_Matcher Case_Insensitive.new) . should_be_true
+
+        Test.specify "should allow for Regex ends_with checks" <|
+            "Hello".ends_with "[a-z]" Regex_Matcher.new . should_be_true
+            "Hello!".ends_with "[a-z]" Regex_Matcher.new . should_be_false
+
+            "foobar" . ends_with ".o." Regex_Matcher.new . should_be_false
+            "foobar" . ends_with ".a." Regex_Matcher.new . should_be_true
+
+            "123 meters and 4 centimeters" . ends_with "[0-9]+" Regex_Matcher.new . should_be_false
+            "foo 123" . ends_with "[0-9]+" Regex_Matcher.new . should_be_true
+
+            # Here the bare base letter is expected NOT to match as a regex
+            # suffix of the accented text (the last two checks of this group).
+            "rzeczywistość" . ends_with "ć" Regex_Matcher.new . should_be_true
+            "rzeczywistość" . ends_with 'c\u{301}' Regex_Matcher.new . should_be_true
+            'rzeczywistos\u{301}c\u{301}' . ends_with 'c\u{301}' Regex_Matcher.new . should_be_true
+            'rzeczywistos\u{301}c\u{301}' . ends_with 'ć' Regex_Matcher.new . should_be_true
+            "rzeczywistość" . ends_with "c" Regex_Matcher.new . should_be_false
+            'rzeczywistos\u{301}c\u{301}' . ends_with 'c' Regex_Matcher.new . should_be_false
+
+            'rzeczywistos\u{301}c\u{301}' . ends_with 'Ć' (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_true
+            "fOOBar" . ends_with ".A." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_true
+            "faaaar" . ends_with ".o." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_false
+
+            long_text = """
+                Hnnnn EOL
+                SOL
+            long_text . ends_with "EOL.SOL" (Regex_Matcher.new dot_matches_newline=True) . should_be_true
+            long_text . ends_with "EOL.SOL" (Regex_Matcher.new dot_matches_newline=False) . should_be_false
+
+            # Alternations in the pattern must stay anchored to the end, and
+            # anchors supplied by the caller must keep working.
+            "zzzaaa" . ends_with "a|b" Regex_Matcher.new . should_be_true
+            "zzzbbb" . ends_with "a|b" Regex_Matcher.new . should_be_true
+            "aaazzz" . ends_with "a|b" Regex_Matcher.new . should_be_false
+            "bbbzzz" . ends_with "a|b" Regex_Matcher.new . should_be_false
+            "zzzaaa" . ends_with "(a|b){2}" Regex_Matcher.new . should_be_true
+            "zzzbbb" . ends_with "(a|b){2}" Regex_Matcher.new . should_be_true
+            "aaazzz" . ends_with "(a|b){2}" Regex_Matcher.new . should_be_false
+            "ABC" . ends_with "C\z" Regex_Matcher.new . should_be_true
+            "ABC" . ends_with "\AC\z" Regex_Matcher.new . should_be_false
+            "foobar" . ends_with "" Regex_Matcher.new . should_be_true
+            "" . ends_with "" Regex_Matcher.new . should_be_true
+
     Test.group "Regex matching" <|
         Test.specify "should be possible on text" <|
             match = "My Text: Goes Here".match "^My Text: (.+)$" mode=Regex_Mode.First