Skip to content

Commit

Permalink
Data analysts should be able to use Text.replace to substitute part…
Browse files Browse the repository at this point in the history
  • Loading branch information
radeusgd authored and hubertp committed Apr 19, 2022
1 parent b986754 commit 0c44b42
Show file tree
Hide file tree
Showing 15 changed files with 450 additions and 143 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@
- [Implemented `Text.reverse`][3377]
- [Implemented support for most Table aggregations in the Database
backend.][3383]
- [Update `Text.replace` to new API.][3393]

[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
Expand Down Expand Up @@ -160,6 +161,7 @@
[3383]: https://github.com/enso-org/enso/pull/3383
[3385]: https://github.com/enso-org/enso/pull/3385
[3392]: https://github.com/enso-org/enso/pull/3392
[3393]: https://github.com/enso-org/enso/pull/3393

#### Enso Compiler

Expand Down
185 changes: 121 additions & 64 deletions distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import Standard.Base.Data.Text.Regex
import Standard.Base.Data.Text.Regex.Engine
import Standard.Base.Data.Text.Regex.Option as Global_Option
import Standard.Base.Data.Text.Regex.Mode
import Standard.Base.Data.Text.Matching_Mode
import Standard.Base.Polyglot.Java as Java_Ext
from Standard.Base.Data.Text.Span as Span_Module import Utf_16_Span

Expand Down Expand Up @@ -533,7 +534,7 @@ type Pattern
pattern = engine.compile "aa []
input = "aabbaabbbbbaab"
pattern.replace input "REPLACED"
replace : Text -> Text -> (Mode.First | Integer | Mode.All | Mode.Full) -> Text
replace : Text -> Text -> (Mode.First | Integer | Mode.All | Mode.Full | Matching_Mode.Last) -> Text
replace input replacement mode=Mode.All =
do_replace_mode mode start end = case mode of
Mode.First ->
Expand All @@ -559,8 +560,26 @@ type Pattern
internal_matcher.replaceAll replacement
Mode.Full ->
case this.match input mode=Mode.Full of
Match _ _ _ _ -> replacement
Match _ _ _ _ -> this.replace input replacement Mode.First
Nothing -> input
Matching_Mode.Last ->
all_matches = this.match input
all_matches_count = if all_matches.is_nothing then 0 else all_matches.length

if all_matches_count == 0 then input else
internal_matcher = this.build_matcher input start end
buffer = StringBuffer.new
last_match_index = all_matches_count - 1

go match_index =
internal_matcher.find
case match_index == last_match_index of
True -> internal_matcher.appendReplacement buffer replacement
False -> @Tail_Call go (match_index + 1)

go 0
internal_matcher.appendTail buffer
buffer.to_text
Mode.Bounded _ _ _ -> Panic.throw <|
Mode_Error "Modes cannot be recursive."

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,22 +81,22 @@ type Text_Sub_Range
if delimiter.is_empty then (Range 0 0) else
span = Text_Utils.span_of text delimiter
if span.is_nothing then (Range 0 (Text_Utils.char_length text)) else
(Range 0 span.start)
(Range 0 span.codeunit_start)
Before_Last delimiter ->
if delimiter.is_empty then (Range 0 (Text_Utils.char_length text)) else
span = Text_Utils.last_span_of text delimiter
if span.is_nothing then (Range 0 (Text_Utils.char_length text)) else
(Range 0 span.start)
(Range 0 span.codeunit_start)
After delimiter ->
if delimiter.is_empty then (Range 0 (Text_Utils.char_length text)) else
span = Text_Utils.span_of text delimiter
if span.is_nothing then (Range 0 0) else
(Range span.end (Text_Utils.char_length text))
(Range span.codeunit_end (Text_Utils.char_length text))
After_Last delimiter ->
if delimiter.is_empty then (Range 0 0) else
span = Text_Utils.last_span_of text delimiter
if span.is_nothing then (Range 0 0) else
(Range span.end (Text_Utils.char_length text))
(Range span.codeunit_end (Text_Utils.char_length text))
While predicate ->
indices = find_sub_range_end text _-> start-> end->
predicate (Text_Utils.substring text start end) . not
Expand Down
60 changes: 41 additions & 19 deletions std-bits/base/src/main/java/org/enso/base/Text_Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import java.util.Locale;
import java.util.regex.Pattern;
import org.enso.base.text.CaseFoldedString;
import org.enso.base.text.CaseFoldedString.Grapheme;
import org.enso.base.text.GraphemeSpan;
import org.enso.base.text.Utf16Span;

Expand Down Expand Up @@ -231,19 +232,6 @@ public static String case_insensitive_key(String string, Locale locale) {
return CaseFoldedString.simpleFold(string, locale);
}

/**
* Replaces all occurrences of {@code oldSequence} within {@code str} with {@code newSequence}.
*
* @param str the string to process
* @param oldSequence the substring that is searched for and will be replaced
* @param newSequence the string that will replace occurrences of {@code oldSequence}
* @return {@code str} with all occurrences of {@code oldSequence} replaced with {@code
* newSequence}
*/
public static String replace(String str, String oldSequence, String newSequence) {
return str.replace(oldSequence, newSequence);
}

/**
* Gets the length of char array of a string
*
Expand Down Expand Up @@ -306,7 +294,7 @@ public static List<Utf16Span> span_of_all(String haystack, String needle) {

StringSearch search = new StringSearch(needle, haystack);
ArrayList<Utf16Span> occurrences = new ArrayList<>();
long ix;
int ix;
while ((ix = search.next()) != StringSearch.DONE) {
occurrences.add(new Utf16Span(ix, ix + search.getMatchLength()));
}
Expand Down Expand Up @@ -456,13 +444,21 @@ public static List<GraphemeSpan> span_of_all_case_insensitive(
* @return a minimal {@code GraphemeSpan} which contains all code units from the match
*/
private static GraphemeSpan findExtendedSpan(CaseFoldedString string, int position, int length) {
int firstGrapheme = string.codeUnitToGraphemeIndex(position);
Grapheme firstGrapheme = string.findGrapheme(position);
if (length == 0) {
return new GraphemeSpan(firstGrapheme, firstGrapheme);
return new GraphemeSpan(
firstGrapheme.index,
firstGrapheme.index,
firstGrapheme.codeunit_start,
firstGrapheme.codeunit_start);
} else {
int lastGrapheme = string.codeUnitToGraphemeIndex(position + length - 1);
int endGrapheme = lastGrapheme + 1;
return new GraphemeSpan(firstGrapheme, endGrapheme);
Grapheme lastGrapheme = string.findGrapheme(position + length - 1);
int endGraphemeIndex = lastGrapheme.index + 1;
return new GraphemeSpan(
firstGrapheme.index,
endGraphemeIndex,
firstGrapheme.codeunit_start,
lastGrapheme.codeunit_end);
}
}

Expand All @@ -485,4 +481,30 @@ public static String normalize(String str) {
public static boolean is_all_whitespace(String text) {
  // Walk the text one code point at a time and stop at the first code point that
  // UCharacter.isUWhiteSpace does not accept; an empty text trivially passes.
  int offset = 0;
  while (offset < text.length()) {
    int codePoint = text.codePointAt(offset);
    if (!UCharacter.isUWhiteSpace(codePoint)) {
      return false;
    }
    offset += Character.charCount(codePoint);
  }
  return true;
}

/**
 * Substitutes every provided span of the text with {@code newSequence}.
 *
 * @param str the text to process
 * @param spans the spans to substitute; they are expected to be ordered by their starting point
 *     in non-decreasing order, otherwise the behaviour is undefined
 * @param newSequence the text inserted in place of each span
 * @return {@code str} in which each of the provided spans has been replaced with {@code
 *     newSequence}
 */
public static String replace_spans(String str, List<Utf16Span> spans, String newSequence) {
  StringBuilder result = new StringBuilder();
  // Code unit index in str up to which the input has already been consumed.
  int consumedUpTo = 0;
  for (Utf16Span span : spans) {
    int gapEnd = span.codeunit_start;
    // Copy the untouched text between the previous span and this one, if there is any.
    if (consumedUpTo < gapEnd) {
      result.append(str, consumedUpTo, gapEnd);
    }

    result.append(newSequence);
    consumedUpTo = span.codeunit_end;
  }

  // Whatever follows the last span is carried over unchanged.
  return result.append(str, consumedUpTo, str.length()).toString();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,20 @@
* indices back in the original string.
*/
public class CaseFoldedString {
/** Holds the grapheme index and the code unit bounds of a single grapheme. */
public static class Grapheme {
  /** The grapheme index of the given grapheme in the string. */
  public final int index;

  /** The codeunit index of the start of the given grapheme in the original string. */
  public final int codeunit_start;

  /** The codeunit index of the end of the given grapheme in the original string. */
  public final int codeunit_end;

  /**
   * Creates a grapheme descriptor.
   *
   * @param index the grapheme index of the grapheme in the string
   * @param codeunit_start the codeunit index of the start of the grapheme in the original string
   * @param codeunit_end the codeunit index of the end of the grapheme in the original string
   */
  public Grapheme(int index, int codeunit_start, int codeunit_end) {
    this.codeunit_start = codeunit_start;
    this.codeunit_end = codeunit_end;
    this.index = index;
  }
}

private final String foldedString;

/**
Expand All @@ -24,33 +38,67 @@ public class CaseFoldedString {
*/
private final int[] graphemeIndexMapping;

/**
* A mapping from code units in the transformed string to the first code-unit of the corresponding
* grapheme in the original string.
*
* <p>The mapping must be valid for indices from 0 to {@code foldedString.length()+1}
* (inclusive).
*/
private final int[] codeunitStartIndexMapping;

/**
* A mapping from code units in the transformed string to the end code-unit of the corresponding
* grapheme in the original string.
*
* <p>The mapping must be valid for indices from 0 to {@code foldedString.length()+1}
* (inclusive).
*/
private final int[] codeunitEndIndexMapping;

/**
* Constructs a new instance of the folded string.
*
* @param foldeString the string after applying the case folding transformation
* @param graphemeIndexMapping a mapping created during the transformation which maps code units
* in the transformed string to their corresponding graphemes in the original string
* @param codeunitStartIndexMapping a mapping created during the transformation which maps code
* units in the transformed string to first codeunits of corresponding graphemes in the
* original string
* @param codeunitEndIndexMapping a mapping created during the transformation which maps code
* units in the transformed string to end codeunits of corresponding graphemes in the original
* string
*/
private CaseFoldedString(String foldeString, int[] graphemeIndexMapping) {
private CaseFoldedString(
String foldeString,
int[] graphemeIndexMapping,
int[] codeunitStartIndexMapping,
int[] codeunitEndIndexMapping) {
this.foldedString = foldeString;
this.graphemeIndexMapping = graphemeIndexMapping;
this.codeunitStartIndexMapping = codeunitStartIndexMapping;
this.codeunitEndIndexMapping = codeunitEndIndexMapping;
}

/**
* Maps a code unit in the folded string to the corresponding grapheme in the original string.
* Finds the grapheme corresponding to a code unit in the folded string.
*
* @param codeunitIndex the index of the code unit in the folded string, valid indices range from
* 0 to {@code getFoldedString().length()+1} (inclusive), allowing to also ask for the
* position of the end code unit which is located right after the end of the string - which
* should always map to the analogous end grapheme.
* @return the index of the grapheme from the original string that after applying the
* transformation contains the requested code unit
* @return the grapheme from the original string (its grapheme index together with its start and
* end code unit indices) that after applying the transformation contains the requested code unit
*/
public int codeUnitToGraphemeIndex(int codeunitIndex) {
public Grapheme findGrapheme(int codeunitIndex) {
if (codeunitIndex < 0 || codeunitIndex > this.foldedString.length()) {
throw new IndexOutOfBoundsException(codeunitIndex);
}
return graphemeIndexMapping[codeunitIndex];

return new Grapheme(
graphemeIndexMapping[codeunitIndex],
codeunitStartIndexMapping[codeunitIndex],
codeunitEndIndexMapping[codeunitIndex]);
}

/** Returns the transformed string. */
Expand All @@ -74,7 +122,9 @@ public static CaseFoldedString fold(CharSequence charSequence, Locale locale) {
breakIterator.setText(charSequence);
StringBuilder stringBuilder = new StringBuilder(charSequence.length());
Fold foldAlgorithm = caseFoldAlgorithmForLocale(locale);
IntArrayBuilder index_mapping = new IntArrayBuilder(charSequence.length() + 1);
IntArrayBuilder grapheme_mapping = new IntArrayBuilder(charSequence.length() + 1);
IntArrayBuilder codeunit_start_mapping = new IntArrayBuilder(charSequence.length() + 1);
IntArrayBuilder codeunit_end_mapping = new IntArrayBuilder(charSequence.length() + 1);

// We rely on the fact that ICU Case Folding is _not_ context-sensitive, i.e. the mapping of
// each grapheme cluster is independent of surrounding ones. Regular casing is
Expand All @@ -87,7 +137,9 @@ public static CaseFoldedString fold(CharSequence charSequence, Locale locale) {
String foldedGrapheme = foldAlgorithm.apply(grapheme);
stringBuilder.append(foldedGrapheme);
for (int i = 0; i < foldedGrapheme.length(); ++i) {
index_mapping.add(grapheme_index);
grapheme_mapping.add(grapheme_index);
codeunit_start_mapping.add(current);
codeunit_end_mapping.add(next);
}

grapheme_index++;
Expand All @@ -96,10 +148,13 @@ public static CaseFoldedString fold(CharSequence charSequence, Locale locale) {

// The mapping should also be able to handle a {@code str.length()} query, so we add one more
// element to the mapping pointing to a non-existent grapheme after the end of the text.
index_mapping.add(grapheme_index);
grapheme_mapping.add(grapheme_index);

return new CaseFoldedString(
stringBuilder.toString(), index_mapping.unsafeGetStorageAndInvalidateTheBuilder());
stringBuilder.toString(),
grapheme_mapping.unsafeGetStorageAndInvalidateTheBuilder(),
codeunit_start_mapping.unsafeGetStorageAndInvalidateTheBuilder(),
codeunit_end_mapping.unsafeGetStorageAndInvalidateTheBuilder());
}

/**
Expand Down
19 changes: 10 additions & 9 deletions std-bits/base/src/main/java/org/enso/base/text/GraphemeSpan.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,21 @@
* <p>Represents an empty span if start and end indices are equal. Such an empty span refers to the
* space just before the grapheme corresponding to index start.
*/
public class GraphemeSpan {
public class GraphemeSpan extends Utf16Span {

public final long start, end;
public final int grapheme_start, grapheme_end;

/**
* Constructs a span of characters (understood as extended grapheme clusters).
*
* @param start index of the first extended grapheme cluster contained within the span (or
* @param grapheme_start index of the first extended grapheme cluster contained within the span (or
* location of the span if it is empty)
* @param end index of the first extended grapheme cluster after start that is not contained
* within the span
* @param grapheme_end index of the first extended grapheme cluster after start that is not
* contained within the span
* @param codeunit_start code unit index of {@code grapheme_start}
* @param codeunit_end code unit index of {@code grapheme_end}
*/
public GraphemeSpan(long start, long end) {
this.start = start;
this.end = end;
public GraphemeSpan(int grapheme_start, int grapheme_end, int codeunit_start, int codeunit_end) {
super(codeunit_start, codeunit_end);
this.grapheme_start = grapheme_start;
this.grapheme_end = grapheme_end;
}
}
8 changes: 4 additions & 4 deletions std-bits/base/src/main/java/org/enso/base/text/Utf16Span.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
*/
public class Utf16Span {

public final long start, end;
public final int codeunit_start, codeunit_end;

/** Constructs a span of UTF-16 code units. */
public Utf16Span(long start, long end) {
this.start = start;
this.end = end;
public Utf16Span(int codeunit_start, int codeunit_end) {
this.codeunit_start = codeunit_start;
this.codeunit_end = codeunit_end;
}
}
4 changes: 2 additions & 2 deletions test/Table_Tests/src/Common_Table_Spec.enso
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ spec prefix table_builder supports_case_sensitive_columns pending=Nothing =
expect_column_names ["bar", "foo_001", "foo_1", "Foo_2", "foo_3", "foo_21", "foo_100"] <| table.sort_columns (Sort_Method natural_order=True case_sensitive=Case_Insensitive.new)
expect_column_names ["foo_3", "foo_21", "foo_100", "foo_1", "foo_001", "bar", "Foo_2"] <| table.sort_columns (Sort_Method order=Sort_Order.Descending)

Test.specify "should correctly handle case insensitive sorting" <|
Test.specify "should correctly handle case-insensitive sorting" <|
expect_column_names ["bar", "foo_001", "foo_1", "foo_100", "Foo_2", "foo_21", "foo_3"] <| table.sort_columns (Sort_Method case_sensitive=Case_Insensitive.new)

Test.specify "should correctly handle natural order sorting" <|
Expand Down Expand Up @@ -412,7 +412,7 @@ spec prefix table_builder supports_case_sensitive_columns pending=Nothing =
expect_column_names ["FirstColumn", "beta", "gamma", "Another"] <|
table.rename_columns (Column_Mapping.By_Name map (Text_Matcher True))

Test.specify "should work by name case insensitively" <|
Test.specify "should work by name case-insensitively" <|
map = Map.from_vector [["ALPHA", "FirstColumn"], ["DELTA", "Another"]]
expect_column_names ["FirstColumn", "beta", "gamma", "Another"] <|
table.rename_columns (Column_Mapping.By_Name map (Text_Matcher Case_Insensitive.new))
Expand Down
Loading

0 comments on commit 0c44b42

Please sign in to comment.