enso-org · mergify · Mar 12, 2022 · Mar 4, 2022 · Mar 4, 2022 · Mar 7, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -63,6 +63,7 @@
 - [Implemented `Bool.compare_to` method][3317]
 - [Implemented `Map.first`, `Map.last` functions. Expanded `Table.group_by` to
   also compute mode, percentile, minimum, maximum.][3318]
+- [Implemented `Text.location_of` and `Text.location_of_all` methods.][3324]
 
 [debug-shortcuts]:
   https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -100,7 +101,8 @@
 [3236]: https://github.com/enso-org/enso/pull/3236
 [3311]: https://github.com/enso-org/enso/pull/3311
 [3317]: https://github.com/enso-org/enso/pull/3317
-[3317]: https://github.com/enso-org/enso/pull/3318
+[3318]: https://github.com/enso-org/enso/pull/3318
+[3324]: https://github.com/enso-org/enso/pull/3324
 
 #### Enso Compiler
 

@@ -5,16 +5,19 @@ from Standard.Builtins import Text, Prim_Text_Helpers
 
 import Standard.Base.Data.Text.Regex
 import Standard.Base.Data.Text.Regex.Mode
+import Standard.Base.Data.Text.Matching_Mode
 import Standard.Base.Data.Text.Case
 import Standard.Base.Data.Text.Location
 import Standard.Base.Data.Text.Line_Ending_Style
+from Standard.Base.Data.Text.Span as Span_Module import Span
 import Standard.Base.Data.Text.Split_Kind
 import Standard.Base.Data.Text.Text_Sub_Range
 import Standard.Base.Data.Locale
 import Standard.Base.Meta
 
 from Standard.Builtins export Text
 
+export Standard.Base.Data.Text.Matching_Mode
 export Standard.Base.Data.Text.Case
 export Standard.Base.Data.Text.Location
 export Standard.Base.Data.Text.Split_Kind
@@ -546,7 +549,7 @@ Text.== that = if Meta.is_same_object this Text then Meta.is_same_object that Te
          (('É' . equals_ignore_case 'é') && ('é' . equals_ignore_case 'e\u0301')) == True
 Text.equals_ignore_case : Text -> Locale -> Boolean
 Text.equals_ignore_case that locale=Locale.default =
-    (this.to_case_insensitive_key locale) == (that.to_case_insensitive_key locale)
+    Text_Utils.equals_ignore_case this that locale.java_locale
 
 ## ADVANCED
    PRIVATE
@@ -555,7 +558,7 @@ Text.equals_ignore_case that locale=Locale.default =
    used to perform case-insensitive comparisons.
 Text.to_case_insensitive_key : Locale -> Text
 Text.to_case_insensitive_key locale=Locale.default =
-    this.to_case Case.Lower locale . to_case Case.Upper locale
+    Text_Utils.case_insensitive_key this locale.java_locale
 
 ## Compare two texts to discover their ordering.
 
@@ -895,7 +898,7 @@ Text.contains term="" matcher=Text_Matcher.new = case matcher of
     Text_Matcher case_sensitivity -> case case_sensitivity of
         True -> Text_Utils.contains this term
         Case_Insensitive locale ->
-            Text_Utils.contains (this.to_case_insensitive_key locale) (term.to_case_insensitive_key locale)
+            Text_Utils.contains_case_insensitive this term locale.java_locale
     Regex_Matcher _ _ _ _ _ ->
         compiled_pattern = matcher.compile term
         match = compiled_pattern.match this Mode.First
@@ -952,27 +955,6 @@ Text.repeat count=1 =
        https://www.pivotaltracker.com/story/show/181435598
     0.up_to (count.max 0) . fold "" acc-> _-> acc + this
 
-## PRIVATE
-   Utility function taking a range pointing at grapheme clusters and converting to a range on the underlying code points
-range_to_char_indices : Text -> Range -> Range ! Index_Out_Of_Bounds_Error
-range_to_char_indices text range =
-    len = text.length
-    start = if range.start < 0 then range.start + len else range.start
-    end = if range.end == Nothing then len else (if range.end < 0 then range.end + len else range.end)
-    is_valid = (Range 0 len+1).contains
-
-    case (Pair (is_valid start) (is_valid end)) of
-        Pair False _ -> Error.throw (Index_Out_Of_Bounds_Error range.start len)
-        Pair True False -> Error.throw (Index_Out_Of_Bounds_Error range.end len)
-        Pair True True ->
-            if start>=end then (Range 0 0) else
-                iterator = BreakIterator.getCharacterInstance
-                iterator.setText text
-
-                start_index = iterator.next start
-                end_index = iterator.next (end - start)
-                Range start_index end_index
-
 ## ALIAS first, last, left, right, mid, substring
    Creates a new Text by selecting the specified range of the input.
 
@@ -1009,7 +991,7 @@ range_to_char_indices text range =
 Text.take : (Text_Sub_Range | Range) -> Text ! Index_Out_Of_Bounds_Error
 Text.take range =
     char_range = case range of
-        Range _ _ -> here.range_to_char_indices this range
+        Range _ _ -> Span_Module.range_to_char_indices this range
         _ -> range.to_char_range this
     Text_Utils.substring this char_range.start char_range.end
 
@@ -1049,7 +1031,7 @@ Text.take range =
 Text.drop : (Text_Sub_Range | Range) -> Text ! Index_Out_Of_Bounds_Error
 Text.drop range =
     char_range = case range of
-        Range _ _ -> here.range_to_char_indices this range
+        Range _ _ -> Span_Module.range_to_char_indices this range
         _ -> range.to_char_range this
     if char_range.start == 0 then Text_Utils.drop_first this char_range.end else
         prefix = Text_Utils.substring this 0 char_range.start
@@ -1184,3 +1166,204 @@ Text.trim where=Location.Both what=_.is_whitespace =
             loop current break_iterator.previous
     if start_index >= end_index then "" else
         Text_Utils.substring this start_index end_index
+
+## ALIAS find, index_of, position_of, span_of
+   Find the location of the `term` in the input.
+   Returns a Span representing the location at which the term was found, or
+   `Nothing` if the term was not found in the input.
+
+   Arguments:
+   - term: The term to find.
+   - mode: Specifies if the first or last occurrence of the term should be
+     returned if there are multiple occurrences within the input. The first
+     occurrence is returned by default.
+   - matcher: Specifies how the term is matched against the input:
+     - If a `Text_Matcher`, the text is compared using the case-sensitivity
+       rules specified in the matcher.
+     - If a `Regex_Matcher`, the `term` is used as a regular expression and
+       matched using the associated options.
+
+   ! What is a Character?
+     A character is defined as an Extended Grapheme Cluster, see Unicode
+     Standard Annex 29. This is the smallest unit that still has semantic
+     meaning in most text-processing applications.
+
+   > Example
+     Finding location of a substring.
+
+         "Hello World!".location_of "J" == Nothing
+         "Hello World!".location_of "o" == Span (Range 4 5) "Hello World!"
+         "Hello World!".location_of "o" mode=Matching_Mode.Last == Span (Range 7 8) "Hello World!"
+
+   ! Match Length
+     The function returns not only the index of the match but a `Span` instance
+     which contains both the start and end indices, letting you determine the
+     length of the match. This is useful not only with regex matches (where a
+     regular expression can have matches of various lengths) but also for case
+     insensitive matching. In case insensitive mode, a single character can
+     match multiple characters, for example `ß` will match `ss` and `SS`, and
+     the ligature `ﬃ` will match `ffi` or `f` etc. Thus in case insensitive
+     mode, the length of the match can be shorter or longer than the term that
+     was being matched, so it is extremely important to not rely on the length
+     of the matched term when analysing the matches as they may have different
+     lengths.
+
+   > Example
+     Match length differences in case insensitive matching.
+
+         term = "straße"
+         text = "MONUMENTENSTRASSE 42"
+         match = text . location_of term matcher=(Text_Matcher Case_Insensitive.new)
+         term.length == 6
+         match.length == 7
+
+   ! Matching Grapheme Clusters
+     In case insensitive mode, a single character can match multiple characters,
+     for example `ß` will match `ss` and `SS`, and the ligature `ﬃ` will match
+     `ffi` or `f` etc. Thus in this mode, it is sometimes possible for a term to
+     match only a part of some single grapheme cluster, for example in the text
+     `ﬃa` the term `ia` will match just one-third of the first grapheme `ﬃ`.
+     Since we do not have the resolution to distinguish such partial matches
+     (as that would require non-integer indices), a match which matched just
+     a part of some grapheme cluster is extended and treated as if it matched
+     the whole grapheme cluster.
+
+   > Example
+     Extending matches to full grapheme clusters.
+
+         ligatures = "ﬃﬄ"
+         ligatures.length == 2
+         term_1 = "IFF"
+         match_1 = ligatures . location_of term_1 matcher=(Text_Matcher Case_Insensitive.new)
+         term_1.length == 3
+         match_1.length == 2
+         term_2 = "ffiffl"
+         match_2 = ligatures . location_of term_2 matcher=(Text_Matcher Case_Insensitive.new)
+         term_2.length == 6
+         match_2.length == 2
+         # After being extended to full grapheme clusters, both terms "IFF" and "ffiffl" match the same span of grapheme clusters.
+         match_1 == match_2
+Text.location_of : Text -> (Matching_Mode.First | Matching_Mode.Last) -> Matcher -> Span | Nothing
+Text.location_of term="" mode=Matching_Mode.First matcher=Text_Matcher.new = case matcher of
+    Text_Matcher case_sensitive -> case case_sensitive of
+        True ->
+            # Pick the search direction requested by `mode`.
+            found = case mode of
+                Matching_Mode.First -> Text_Utils.span_of this term
+                Matching_Mode.Last -> Text_Utils.last_span_of this term
+            if found.is_nothing then Nothing else
+                first_grapheme = Text_Utils.utf16_index_to_grapheme_index this found.start
+                # The match may differ from the term in code units, but both
+                # cover the same number of grapheme clusters, so the end of
+                # the span follows directly from `term.length`.
+                Span (Range first_grapheme (first_grapheme + term.length)) this
+        Case_Insensitive locale -> case term.is_empty of
+            False ->
+                # Translate the mode into the boolean flag taken by the helper.
+                from_end = case mode of
+                    Matching_Mode.First -> False
+                    Matching_Mode.Last -> True
+                hit = Text_Utils.span_of_case_insensitive this term locale.java_locale from_end
+                case hit of
+                    Nothing -> Nothing
+                    _ -> Span (Range hit.start hit.end) this
+            # An empty term is answered here, without calling the helper: an
+            # empty span at the very beginning or the very end of the text.
+            True -> case mode of
+                Matching_Mode.First -> Span (Range 0 0) this
+                Matching_Mode.Last ->
+                    text_end = this.length
+                    Span (Range text_end text_end) this
+    Regex_Matcher _ _ _ _ _ -> case mode of
+        Matching_Mode.First ->
+            pattern = matcher.compile term
+            case pattern.match this Mode.First of
+                Nothing -> Nothing
+                single -> (single.span 0).to_grapheme_span
+        Matching_Mode.Last ->
+            # Collect every match and keep only the final one.
+            pattern = matcher.compile term
+            case pattern.match this Mode.All of
+                Nothing -> Nothing
+                all_matches -> (all_matches.last.span 0).to_grapheme_span
+
+## ALIAS find_all, index_of_all, position_of_all, span_of_all
+   Finds all the locations of the `term` in the input.
+   If not found, the function returns an empty Vector.
+
+   Arguments:
+   - term: The term to find.
+   - matcher: Specifies how the term is matched against the input:
+     - If a `Text_Matcher`, the text is compared using the case-sensitivity
+       rules specified in the matcher.
+     - If a `Regex_Matcher`, the `term` is used as a regular expression and
+       matched using the associated options.
+
+   ! What is a Character?
+     A character is defined as an Extended Grapheme Cluster, see Unicode
+     Standard Annex 29. This is the smallest unit that still has semantic
+     meaning in most text-processing applications.
+
+   > Example
+     Finding locations of all occurrences of a substring.
+
+         "Hello World!".location_of_all "J" == []
+         "Hello World!".location_of_all "o" . map .start == [4, 7]
+
+   ! Match Length
+     The function returns not only the index of the match but a `Span` instance
+     which contains both the start and end indices, letting you determine the
+     length of the match. This is useful not only with regex matches (where a
+     regular expression can have matches of various lengths) but also for case
+     insensitive matching. In case insensitive mode, a single character can
+     match multiple characters, for example `ß` will match `ss` and `SS`, and
+     the ligature `ﬃ` will match `ffi` or `f` etc. Thus in case insensitive
+     mode, the length of the match can be shorter or longer than the term that
+     was being matched, so it is extremely important to not rely on the length
+     of the matched term when analysing the matches as they may have different
+     lengths.
+
+   > Example
+     Match length differences in case insensitive matching.
+
+         term = "strasse"
+         text = "MONUMENTENSTRASSE ist eine große Straße."
+         match = text . location_of_all term matcher=(Text_Matcher Case_Insensitive.new)
+         term.length == 7
+         match . map .length == [7, 6]
+
+   ! Matching Grapheme Clusters
+     In case insensitive mode, a single character can match multiple characters,
+     for example `ß` will match `ss` and `SS`, and the ligature `ﬃ` will match
+     `ffi` or `f` etc. Thus in this mode, it is sometimes possible for a term to
+     match only a part of some single grapheme cluster, for example in the text
+     `ﬃa` the term `ia` will match just one-third of the first grapheme `ﬃ`.
+     Since we do not have the resolution to distinguish such partial matches
+     (as that would require non-integer indices), a match which matched just
+     a part of some grapheme cluster is extended and treated as if it matched
+     the whole grapheme cluster.
+
+   > Example
+     Extending matches to full grapheme clusters.
+
+         ligatures = "ﬃﬄFFIFF"
+         ligatures.length == 7
+         match_1 = ligatures . location_of_all "IFF" matcher=(Text_Matcher Case_Insensitive.new)
+         match_1 . map .length == [2, 3]
+         match_2 = ligatures . location_of_all "ffiff" matcher=(Text_Matcher Case_Insensitive.new)
+         match_2 . map .length == [2, 5]
+Text.location_of_all : Text -> Matcher -> [Span]
+Text.location_of_all term="" matcher=Text_Matcher.new = case matcher of
+    Text_Matcher case_sensitive ->
+        # An empty term yields one empty span per index from 0 up to and
+        # including `this.length`.
+        if term.is_empty then Vector.new (this.length + 1) (boundary -> Span (Range boundary boundary) this) else
+            case case_sensitive of
+                True ->
+                    utf16_spans = Vector.from_array (Text_Utils.span_of_all this term)
+                    utf16_starts = utf16_spans.map .start
+                    grapheme_starts = Vector.from_array (Text_Utils.utf16_indices_to_grapheme_indices this utf16_starts.to_array)
+                    # Matches may differ from the term in code units, but each
+                    # covers the same number of grapheme clusters as the term,
+                    # so every span end is its start plus `term.length`.
+                    term_length = term.length
+                    grapheme_starts.map (first -> Span (Range first (first + term_length)) this)
+                Case_Insensitive locale ->
+                    hits = Vector.from_array (Text_Utils.span_of_all_case_insensitive this term locale.java_locale)
+                    hits.map (hit -> Span (Range hit.start hit.end) this)
+    Regex_Matcher _ _ _ _ _ ->
+        pattern = matcher.compile term
+        case pattern.match this Mode.All of
+            Nothing -> []
+            all_matches -> all_matches.map (m -> (m.span 0).to_grapheme_span)
@@ -0,0 +1,5 @@
+## Selects the first occurrence when the term is found more than once.
+type First
+
+## Selects the last occurrence when the term is found more than once.
+type Last