Data analysts should be able to use Text.replace to substitute parts of the text (#3393)

Implements https://www.pivotaltracker.com/story/show/181266274
enso-org · Apr 19, 2022 · 0c44b42 · 0c44b42
1 parent b986754
commit 0c44b42
Show file tree

Hide file tree

Showing 15 changed files with 450 additions and 143 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -105,6 +105,7 @@
 - [Implemented `Text.reverse`][3377]
 - [Implemented support for most Table aggregations in the Database
   backend.][3383]
+- [Update `Text.replace` to new API.][3393]
 
 [debug-shortcuts]:
   https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -160,6 +161,7 @@
 [3383]: https://github.com/enso-org/enso/pull/3383
 [3385]: https://github.com/enso-org/enso/pull/3385
 [3392]: https://github.com/enso-org/enso/pull/3392
+[3393]: https://github.com/enso-org/enso/pull/3393
 
 #### Enso Compiler
 

diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso
diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Engine/Default.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Engine/Default.enso
@@ -39,6 +39,7 @@ import Standard.Base.Data.Text.Regex
 import Standard.Base.Data.Text.Regex.Engine
 import Standard.Base.Data.Text.Regex.Option as Global_Option
 import Standard.Base.Data.Text.Regex.Mode
+import Standard.Base.Data.Text.Matching_Mode
 import Standard.Base.Polyglot.Java as Java_Ext
 from Standard.Base.Data.Text.Span as Span_Module import Utf_16_Span
 
@@ -533,7 +534,7 @@ type Pattern
                  pattern = engine.compile "aa" []
                  input = "aabbaabbbbbaab"
                  pattern.replace input "REPLACED"
-    replace : Text -> Text -> (Mode.First | Integer | Mode.All | Mode.Full) -> Text
+    replace : Text -> Text -> (Mode.First | Integer | Mode.All | Mode.Full | Matching_Mode.Last) -> Text
     replace input replacement mode=Mode.All =
         do_replace_mode mode start end = case mode of
             Mode.First ->
@@ -559,8 +560,26 @@ type Pattern
                 internal_matcher.replaceAll replacement
             Mode.Full ->
                 case this.match input mode=Mode.Full of
-                    Match _ _ _ _ -> replacement
+                    Match _ _ _ _ -> this.replace input replacement Mode.First
                     Nothing -> input
+            Matching_Mode.Last ->
+                all_matches = this.match input
+                all_matches_count = if all_matches.is_nothing then 0 else all_matches.length
+
+                if all_matches_count == 0 then input else
+                    internal_matcher = this.build_matcher input start end
+                    buffer = StringBuffer.new
+                    last_match_index = all_matches_count - 1
+
+                    go match_index =
+                        internal_matcher.find
+                        case match_index == last_match_index of
+                            True -> internal_matcher.appendReplacement buffer replacement
+                            False -> @Tail_Call go (match_index + 1)
+
+                    go 0
+                    internal_matcher.appendTail buffer
+                    buffer.to_text
             Mode.Bounded _ _ _ -> Panic.throw <|
                 Mode_Error "Modes cannot be recursive."
 

diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Text_Sub_Range.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Text_Sub_Range.enso
@@ -81,22 +81,22 @@ type Text_Sub_Range
                 if delimiter.is_empty then (Range 0 0) else
                     span = Text_Utils.span_of text delimiter
                     if span.is_nothing then (Range 0 (Text_Utils.char_length text)) else
-                        (Range 0 span.start)
+                        (Range 0 span.codeunit_start)
             Before_Last delimiter ->
                 if delimiter.is_empty then (Range 0 (Text_Utils.char_length text)) else
                     span = Text_Utils.last_span_of text delimiter
                     if span.is_nothing then (Range 0 (Text_Utils.char_length text)) else
-                        (Range 0 span.start)
+                        (Range 0 span.codeunit_start)
             After delimiter ->
                 if delimiter.is_empty then (Range 0 (Text_Utils.char_length text)) else
                     span = Text_Utils.span_of text delimiter
                     if span.is_nothing then (Range 0 0) else
-                        (Range span.end (Text_Utils.char_length text))
+                        (Range span.codeunit_end (Text_Utils.char_length text))
             After_Last delimiter ->
                 if delimiter.is_empty then (Range 0 0) else
                     span = Text_Utils.last_span_of text delimiter
                     if span.is_nothing then (Range 0 0) else
-                        (Range span.end (Text_Utils.char_length text))
+                        (Range span.codeunit_end (Text_Utils.char_length text))
             While predicate ->
                 indices = find_sub_range_end text _-> start-> end->
                     predicate (Text_Utils.substring text start end) . not

diff --git a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java
@@ -12,6 +12,7 @@
 import java.util.Locale;
 import java.util.regex.Pattern;
 import org.enso.base.text.CaseFoldedString;
+import org.enso.base.text.CaseFoldedString.Grapheme;
 import org.enso.base.text.GraphemeSpan;
 import org.enso.base.text.Utf16Span;
 
@@ -231,19 +232,6 @@ public static String case_insensitive_key(String string, Locale locale) {
     return CaseFoldedString.simpleFold(string, locale);
   }
 
-  /**
-   * Replaces all occurrences of {@code oldSequence} within {@code str} with {@code newSequence}.
-   *
-   * @param str the string to process
-   * @param oldSequence the substring that is searched for and will be replaced
-   * @param newSequence the string that will replace occurrences of {@code oldSequence}
-   * @return {@code str} with all occurrences of {@code oldSequence} replaced with {@code
-   *     newSequence}
-   */
-  public static String replace(String str, String oldSequence, String newSequence) {
-    return str.replace(oldSequence, newSequence);
-  }
-
   /**
    * Gets the length of char array of a string
    *
@@ -306,7 +294,7 @@ public static List<Utf16Span> span_of_all(String haystack, String needle) {
 
     StringSearch search = new StringSearch(needle, haystack);
     ArrayList<Utf16Span> occurrences = new ArrayList<>();
-    long ix;
+    int ix;
     while ((ix = search.next()) != StringSearch.DONE) {
       occurrences.add(new Utf16Span(ix, ix + search.getMatchLength()));
     }
@@ -456,13 +444,21 @@ public static List<GraphemeSpan> span_of_all_case_insensitive(
    * @return a minimal {@code GraphemeSpan} which contains all code units from the match
    */
   private static GraphemeSpan findExtendedSpan(CaseFoldedString string, int position, int length) {
-    int firstGrapheme = string.codeUnitToGraphemeIndex(position);
+    Grapheme firstGrapheme = string.findGrapheme(position);
     if (length == 0) {
-      return new GraphemeSpan(firstGrapheme, firstGrapheme);
+      return new GraphemeSpan(
+          firstGrapheme.index,
+          firstGrapheme.index,
+          firstGrapheme.codeunit_start,
+          firstGrapheme.codeunit_start);
     } else {
-      int lastGrapheme = string.codeUnitToGraphemeIndex(position + length - 1);
-      int endGrapheme = lastGrapheme + 1;
-      return new GraphemeSpan(firstGrapheme, endGrapheme);
+      Grapheme lastGrapheme = string.findGrapheme(position + length - 1);
+      int endGraphemeIndex = lastGrapheme.index + 1;
+      return new GraphemeSpan(
+          firstGrapheme.index,
+          endGraphemeIndex,
+          firstGrapheme.codeunit_start,
+          lastGrapheme.codeunit_end);
     }
   }
 
@@ -485,4 +481,30 @@ public static String normalize(String str) {
   public static boolean is_all_whitespace(String text) {
     return text.codePoints().allMatch(UCharacter::isUWhiteSpace);
   }
+
+  /**
+   * Replaces all provided spans within the text with {@code newSequence}.
+   *
+   * @param str the string to process
+   * @param spans the spans to replace; the spans should be sorted by their starting point in
+   *     non-decreasing order and should not overlap; the behaviour is undefined otherwise.
+   * @param newSequence the string that will replace the spans
+   * @return {@code str} with all provided spans replaced with {@code newSequence}
+   */
+  public static String replace_spans(String str, List<Utf16Span> spans, String newSequence) {
+    StringBuilder sb = new StringBuilder();
+    int current_ix = 0;
+    for (Utf16Span span : spans) {
+      if (span.codeunit_start > current_ix) {
+        sb.append(str, current_ix, span.codeunit_start);
+      }
+
+      sb.append(newSequence);
+      current_ix = span.codeunit_end;
+    }
+
+    // Add the remaining part of the string (if any).
+    sb.append(str, current_ix, str.length());
+    return sb.toString();
+  }
 }
diff --git a/std-bits/base/src/main/java/org/enso/base/text/CaseFoldedString.java b/std-bits/base/src/main/java/org/enso/base/text/CaseFoldedString.java
@@ -13,6 +13,20 @@
  * indices back in the original string.
  */
 public class CaseFoldedString {
+  public static class Grapheme {
+    /** The grapheme index of the given grapheme in the string. */
+    public final int index;
+
+    /** The codeunit indices of start and end of the given grapheme in the original string. */
+    public final int codeunit_start, codeunit_end;
+
+    public Grapheme(int index, int codeunit_start, int codeunit_end) {
+      this.index = index;
+      this.codeunit_start = codeunit_start;
+      this.codeunit_end = codeunit_end;
+    }
+  }
+
   private final String foldedString;
 
   /**
@@ -24,33 +38,67 @@ public class CaseFoldedString {
    */
   private final int[] graphemeIndexMapping;
 
+  /**
+   * A mapping from code units in the transformed string to the first code-unit of the corresponding
+   * grapheme in the original string.
+   *
+   * <p>The mapping must be valid for indices from 0 to {@code foldedString.length()+1}
+   * (inclusive).
+   */
+  private final int[] codeunitStartIndexMapping;
+
+  /**
+   * A mapping from code units in the transformed string to the end code-unit of the corresponding
+   * grapheme in the original string.
+   *
+   * <p>The mapping must be valid for indices from 0 to {@code foldedString.length()+1}
+   * (inclusive).
+   */
+  private final int[] codeunitEndIndexMapping;
+
   /**
    * Constructs a new instance of the folded string.
    *
    * @param foldeString the string after applying the case folding transformation
    * @param graphemeIndexMapping a mapping created during the transformation which maps code units
    *     in the transformed string to their corresponding graphemes in the original string
+   * @param codeunitStartIndexMapping a mapping created during the transformation which maps code
+   *     units in the transformed string to first codeunits of corresponding graphemes in the
+   *     original string
+   * @param codeunitEndIndexMapping a mapping created during the transformation which maps code
+   *     units in the transformed string to end codeunits of corresponding graphemes in the original
+   *     string
    */
-  private CaseFoldedString(String foldeString, int[] graphemeIndexMapping) {
+  private CaseFoldedString(
+      String foldeString,
+      int[] graphemeIndexMapping,
+      int[] codeunitStartIndexMapping,
+      int[] codeunitEndIndexMapping) {
     this.foldedString = foldeString;
     this.graphemeIndexMapping = graphemeIndexMapping;
+    this.codeunitStartIndexMapping = codeunitStartIndexMapping;
+    this.codeunitEndIndexMapping = codeunitEndIndexMapping;
   }
 
   /**
-   * Maps a code unit in the folded string to the corresponding grapheme in the original string.
+   * Finds the grapheme corresponding to a code unit in the folded string.
    *
    * @param codeunitIndex the index of the code unit in the folded string, valid indices range from
    *     0 to {@code getFoldedString().length()+1} (inclusive), allowing to also ask for the
    *     position of the end code unit which is located right after the end of the string - which
    *     should always map to the analogous end grapheme.
-   * @return the index of the grapheme from the original string that after applying the
-   *     transformation contains the requested code unit
+   * @return the grapheme (its index and code unit bounds) from the original string that after
+   *     applying the transformation contains the requested code unit
    */
-  public int codeUnitToGraphemeIndex(int codeunitIndex) {
+  public Grapheme findGrapheme(int codeunitIndex) {
     if (codeunitIndex < 0 || codeunitIndex > this.foldedString.length()) {
       throw new IndexOutOfBoundsException(codeunitIndex);
     }
-    return graphemeIndexMapping[codeunitIndex];
+
+    return new Grapheme(
+        graphemeIndexMapping[codeunitIndex],
+        codeunitStartIndexMapping[codeunitIndex],
+        codeunitEndIndexMapping[codeunitIndex]);
   }
 
   /** Returns the transformed string. */
@@ -74,7 +122,9 @@ public static CaseFoldedString fold(CharSequence charSequence, Locale locale) {
     breakIterator.setText(charSequence);
     StringBuilder stringBuilder = new StringBuilder(charSequence.length());
     Fold foldAlgorithm = caseFoldAlgorithmForLocale(locale);
-    IntArrayBuilder index_mapping = new IntArrayBuilder(charSequence.length() + 1);
+    IntArrayBuilder grapheme_mapping = new IntArrayBuilder(charSequence.length() + 1);
+    IntArrayBuilder codeunit_start_mapping = new IntArrayBuilder(charSequence.length() + 1);
+    IntArrayBuilder codeunit_end_mapping = new IntArrayBuilder(charSequence.length() + 1);
 
     // We rely on the fact that ICU Case Folding is _not_ context-sensitive, i.e. the mapping of
     // each grapheme cluster is independent of surrounding ones. Regular casing is
@@ -87,7 +137,9 @@ public static CaseFoldedString fold(CharSequence charSequence, Locale locale) {
       String foldedGrapheme = foldAlgorithm.apply(grapheme);
       stringBuilder.append(foldedGrapheme);
       for (int i = 0; i < foldedGrapheme.length(); ++i) {
-        index_mapping.add(grapheme_index);
+        grapheme_mapping.add(grapheme_index);
+        codeunit_start_mapping.add(current);
+        codeunit_end_mapping.add(next);
       }
 
       grapheme_index++;
@@ -96,10 +148,13 @@ public static CaseFoldedString fold(CharSequence charSequence, Locale locale) {
 
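     // The grapheme mapping should also be able to handle a {@code str.length()} query, so we add
     // one more element to it, pointing to a non-existent grapheme after the end of the text.
     // NOTE(review): the code unit start/end mappings receive no such extra element here, yet
     // findGrapheme reads all three arrays at that index - confirm the end-of-text query is safe.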
-    index_mapping.add(grapheme_index);
+    grapheme_mapping.add(grapheme_index);
 
     return new CaseFoldedString(
-        stringBuilder.toString(), index_mapping.unsafeGetStorageAndInvalidateTheBuilder());
+        stringBuilder.toString(),
+        grapheme_mapping.unsafeGetStorageAndInvalidateTheBuilder(),
+        codeunit_start_mapping.unsafeGetStorageAndInvalidateTheBuilder(),
+        codeunit_end_mapping.unsafeGetStorageAndInvalidateTheBuilder());
   }
 
   /**

diff --git a/std-bits/base/src/main/java/org/enso/base/text/GraphemeSpan.java b/std-bits/base/src/main/java/org/enso/base/text/GraphemeSpan.java
@@ -9,20 +9,21 @@
  * <p>Represents an empty span if start and end indices are equal. Such an empty span refers to the
  * space just before the grapheme corresponding to index start.
  */
-public class GraphemeSpan {
+public class GraphemeSpan extends Utf16Span {
 
-  public final long start, end;
+  public final int grapheme_start, grapheme_end;
 
   /**
    * Constructs a span of characters (understood as extended grapheme clusters).
-   *
-   * @param start index of the first extended grapheme cluster contained within the span (or
+   * @param grapheme_start index of the first extended grapheme cluster contained within the span (or
    *     location of the span if it is empty)
-   * @param end index of the first extended grapheme cluster after start that is not contained
-   *     within the span
+   * @param grapheme_end index of the first extended grapheme cluster after start that is not
+   *     contained within the span
+   * @param codeunit_start code unit index of {@code grapheme_start}
+   * @param codeunit_end code unit index of {@code grapheme_end}
    */
-  public GraphemeSpan(long start, long end) {
-    this.start = start;
-    this.end = end;
+  public GraphemeSpan(int grapheme_start, int grapheme_end, int codeunit_start, int codeunit_end) {
+    super(codeunit_start, codeunit_end);
+    this.grapheme_start = grapheme_start;
+    this.grapheme_end = grapheme_end;
   }
 }
diff --git a/std-bits/base/src/main/java/org/enso/base/text/Utf16Span.java b/std-bits/base/src/main/java/org/enso/base/text/Utf16Span.java
@@ -8,11 +8,11 @@
  */
 public class Utf16Span {
 
-  public final long start, end;
+  public final int codeunit_start, codeunit_end;
 
   /** Constructs a span of UTF-16 code units. */
-  public Utf16Span(long start, long end) {
-    this.start = start;
-    this.end = end;
+  public Utf16Span(int codeunit_start, int codeunit_end) {
+    this.codeunit_start = codeunit_start;
+    this.codeunit_end = codeunit_end;
   }
 }
diff --git a/test/Table_Tests/src/Common_Table_Spec.enso b/test/Table_Tests/src/Common_Table_Spec.enso
@@ -376,7 +376,7 @@ spec prefix table_builder supports_case_sensitive_columns pending=Nothing =
             expect_column_names ["bar", "foo_001", "foo_1", "Foo_2", "foo_3", "foo_21", "foo_100"] <| table.sort_columns (Sort_Method natural_order=True case_sensitive=Case_Insensitive.new)
             expect_column_names ["foo_3", "foo_21", "foo_100", "foo_1", "foo_001", "bar", "Foo_2"] <| table.sort_columns (Sort_Method order=Sort_Order.Descending)
 
-        Test.specify "should correctly handle case insensitive sorting" <|
+        Test.specify "should correctly handle case-insensitive sorting" <|
             expect_column_names ["bar", "foo_001", "foo_1", "foo_100", "Foo_2", "foo_21", "foo_3"] <| table.sort_columns (Sort_Method case_sensitive=Case_Insensitive.new)
 
         Test.specify "should correctly handle natural order sorting" <|
@@ -412,7 +412,7 @@ spec prefix table_builder supports_case_sensitive_columns pending=Nothing =
             expect_column_names ["FirstColumn", "beta", "gamma", "Another"] <|
                 table.rename_columns (Column_Mapping.By_Name map (Text_Matcher True))
 
-        Test.specify "should work by name case insensitively" <|
+        Test.specify "should work by name case-insensitively" <|
             map = Map.from_vector [["ALPHA", "FirstColumn"], ["DELTA", "Another"]]
             expect_column_names ["FirstColumn", "beta", "gamma", "Another"] <|
                 table.rename_columns (Column_Mapping.By_Name map (Text_Matcher Case_Insensitive.new))