From b940cd93b7b75bdbfac45bafe56c64b165e0f0dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Sat, 3 Sep 2022 23:58:18 +0200 Subject: [PATCH] Add Text folding to UnorderedMultiValueKey --- .../Table/0.0.0-dev/src/Data/Table.enso | 9 +++++-- .../table/aggregations/CountDistinct.java | 1 - .../table/data/index/MultiValueKeyBase.java | 26 +++++++++---------- .../data/index/UnorderedMultiValueKey.java | 21 ++++++++++++++- .../java/org/enso/table/data/table/Table.java | 7 ++--- .../org/enso/table/operations/Distinct.java | 5 ++-- .../enso/table/text/CaseInsensitiveFold.java | 19 ++++++++++++++ .../enso/table/text/TextFoldingStrategy.java | 5 ++++ .../table/text/UnicodeNormalizedFold.java | 13 ++++++++++ 9 files changed, 84 insertions(+), 22 deletions(-) create mode 100644 std-bits/table/src/main/java/org/enso/table/text/CaseInsensitiveFold.java create mode 100644 std-bits/table/src/main/java/org/enso/table/text/TextFoldingStrategy.java create mode 100644 std-bits/table/src/main/java/org/enso/table/text/UnicodeNormalizedFold.java diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso index 77d0d89a94855..3d0fe6f5fd55d 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso @@ -32,6 +32,8 @@ polyglot java import org.enso.table.data.table.Table as Java_Table polyglot java import org.enso.table.data.table.Column as Java_Column polyglot java import org.enso.table.operations.OrderBuilder polyglot java import org.enso.table.data.mask.OrderMask +polyglot java import org.enso.table.text.UnicodeNormalizedFold +polyglot java import org.enso.table.text.CaseInsensitiveFold ## Creates a new table from a vector of `[name, items]` pairs. @@ -682,14 +684,17 @@ type Table `Floating_Point_Grouping` warning. distinct : Column_Selector -> (True|Case_Insensitive) -> Problem_Behavior -> Table distinct self (columns = By_Name (self.columns.map .name)) case_sensitive=True on_problems=Report_Warning = - # TODO case sensitive! warning_mapper error = case error of No_Output_Columns -> Maybe.Some No_Input_Columns_Selected _ -> Nothing key_columns = Warning.map_warnings_and_errors warning_mapper <| Table_Helpers.select_columns internal_columns=self.columns selector=columns reorder=True on_problems=on_problems java_columns = key_columns.map .java_column - java_table = self.java_table.distinct java_columns.to_array Comparator.new + text_folding_strategy = case case_sensitive of + True -> UnicodeNormalizedFold.INSTANCE + Case_Insensitive_Data locale -> + CaseInsensitiveFold.new locale.java_locale + java_table = self.java_table.distinct java_columns.to_array text_folding_strategy on_problems.attach_problems_after (Table_Data java_table) <| problems = java_table.getProblems Aggregate_Column_Helper.parse_aggregated_problems problems diff --git a/std-bits/table/src/main/java/org/enso/table/aggregations/CountDistinct.java b/std-bits/table/src/main/java/org/enso/table/aggregations/CountDistinct.java index fa8cb427e12b1..80b031ef89cbe 100644 --- a/std-bits/table/src/main/java/org/enso/table/aggregations/CountDistinct.java +++ b/std-bits/table/src/main/java/org/enso/table/aggregations/CountDistinct.java @@ -1,7 +1,6 @@ package org.enso.table.aggregations; import org.enso.table.data.column.storage.Storage; -import org.enso.table.data.index.MultiValueKeyBase; import org.enso.table.data.index.UnorderedMultiValueKey; import org.enso.table.data.table.Column; import org.enso.table.data.table.problems.FloatingPointGrouping; diff --git a/std-bits/table/src/main/java/org/enso/table/data/index/MultiValueKeyBase.java b/std-bits/table/src/main/java/org/enso/table/data/index/MultiValueKeyBase.java index 1c6aabc339e22..8506542e75821 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/index/MultiValueKeyBase.java +++ b/std-bits/table/src/main/java/org/enso/table/data/index/MultiValueKeyBase.java @@ -51,21 +51,21 @@ private boolean findFloats() { return false; } - protected static Object foldObject(Object value) { + protected Object foldObject(Object value) { if (value instanceof Long) { return value; - } else if (value instanceof Integer) { - return ((Integer) value).longValue(); - } else if (value instanceof Byte) { - return ((Byte) value).longValue(); - } else if (value instanceof Float && ((Float) value) % 1 == 0) { - return ((Float) value).longValue(); - } else if (value instanceof Double && ((Double) value) % 1 == 0) { - return ((Double) value).longValue(); - } else if (value instanceof Float) { - return ((Float) value).doubleValue(); - } else if (value instanceof Double) { - return value; + } else if (value instanceof Integer i) { + return i.longValue(); + } else if (value instanceof Byte b) { + return b.longValue(); + } else if (value instanceof Float f && f % 1 == 0) { + return f.longValue(); + } else if (value instanceof Double d && d % 1 == 0) { + return d.longValue(); + } else if (value instanceof Float f) { + return f.doubleValue(); + } else if (value instanceof Double d) { + return d; } return value; diff --git a/std-bits/table/src/main/java/org/enso/table/data/index/UnorderedMultiValueKey.java b/std-bits/table/src/main/java/org/enso/table/data/index/UnorderedMultiValueKey.java index 7ea178e9041ff..e6f6cdc1e1f76 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/index/UnorderedMultiValueKey.java +++ b/std-bits/table/src/main/java/org/enso/table/data/index/UnorderedMultiValueKey.java @@ -1,14 +1,22 @@ package org.enso.table.data.index; import org.enso.table.data.column.storage.Storage; +import org.enso.table.text.TextFoldingStrategy; +import org.enso.table.text.UnicodeNormalizedFold; import java.util.Objects; public class UnorderedMultiValueKey extends MultiValueKeyBase { private final int hashCodeValue; + private final TextFoldingStrategy textFoldingStrategy; public UnorderedMultiValueKey(Storage[] storages, int rowIndex) { + this(storages, rowIndex, UnicodeNormalizedFold.INSTANCE); + } + + public UnorderedMultiValueKey(Storage[] storages, int rowIndex, TextFoldingStrategy textFoldingStrategy) { super(storages, rowIndex); + this.textFoldingStrategy = textFoldingStrategy; // Precompute HashCode - using Apache.Commons.Collections.Map.MultiKeyMap.hash algorithm int h = 1; @@ -27,13 +35,24 @@ public UnorderedMultiValueKey(Storage[] storages, int rowIndex) { floatsComputed = true; } + @Override + protected Object foldObject(Object value) { + if (value instanceof String s) { + return textFoldingStrategy.fold(s); + } else { + return super.foldObject(value); + } + } + @Override public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof MultiValueKeyBase that)) return false; if (storages.length != that.storages.length) return false; for (int i = 0; i < storages.length; i++) { - if (!Objects.equals(get(i), that.get(i))) { + Object thisFolded = foldObject(this.get(i)); + Object thatFolded = foldObject(that.get(i)); + if (!Objects.equals(thisFolded, thatFolded)) { return false; } } diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/Table.java b/std-bits/table/src/main/java/org/enso/table/data/table/Table.java index 6530c9f6bb8f1..7040b66cff0d7 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/table/Table.java +++ b/std-bits/table/src/main/java/org/enso/table/data/table/Table.java @@ -13,6 +13,7 @@ import org.enso.table.error.NoSuchColumnException; import org.enso.table.error.UnexpectedColumnTypeException; import org.enso.table.operations.Distinct; +import org.enso.table.text.TextFoldingStrategy; import java.util.*; import java.util.stream.Collectors; @@ -236,12 +237,12 @@ public Table orderBy(Column[] columns, Long[] directions, Comparator obj * Creates a new table keeping only rows with distinct key columns. * * @param keyColumns set of columns to use as an Index - * @param objectComparator Object comparator allowing calling back to `compare_to` when needed. + * @param textFoldingStrategy a strategy for folding text columns * @return a table where duplicate rows with the same key are removed */ - public Table distinct(Column[] keyColumns, Comparator objectComparator) { + public Table distinct(Column[] keyColumns, TextFoldingStrategy textFoldingStrategy) { var problems = new AggregatedProblems(); - var rowsToKeep = Distinct.buildDistinctRowsMask(rowCount(), keyColumns, objectComparator, problems); + var rowsToKeep = Distinct.buildDistinctRowsMask(rowCount(), keyColumns, textFoldingStrategy, problems); int cardinality = rowsToKeep.cardinality(); Column[] newColumns = new Column[this.columns.length]; Index newIx = index.mask(rowsToKeep, cardinality); diff --git a/std-bits/table/src/main/java/org/enso/table/operations/Distinct.java b/std-bits/table/src/main/java/org/enso/table/operations/Distinct.java index 31c601f7e2ab3..fe5558724db3a 100644 --- a/std-bits/table/src/main/java/org/enso/table/operations/Distinct.java +++ b/std-bits/table/src/main/java/org/enso/table/operations/Distinct.java @@ -6,18 +6,19 @@ import org.enso.table.data.table.Column; import org.enso.table.data.table.problems.AggregatedProblems; import org.enso.table.data.table.problems.FloatingPointGrouping; +import org.enso.table.text.TextFoldingStrategy; import java.util.*; public class Distinct { - public static BitSet buildDistinctRowsMask(int tableSize, Column[] keyColumns, Comparator objectComparator, AggregatedProblems problems) { + public static BitSet buildDistinctRowsMask(int tableSize, Column[] keyColumns, TextFoldingStrategy textFoldingStrategy, AggregatedProblems problems) { var mask = new BitSet(); if (keyColumns.length != 0) { HashSet visitedRows = new HashSet<>(); int size = keyColumns[0].getSize(); Storage[] storage = Arrays.stream(keyColumns).map(Column::getStorage).toArray(Storage[]::new); for (int i = 0; i < size; i++) { - UnorderedMultiValueKey key = new UnorderedMultiValueKey(storage, i); + UnorderedMultiValueKey key = new UnorderedMultiValueKey(storage, i, textFoldingStrategy); if (key.hasFloatValues()) { problems.add(new FloatingPointGrouping("Distinct", i)); diff --git a/std-bits/table/src/main/java/org/enso/table/text/CaseInsensitiveFold.java b/std-bits/table/src/main/java/org/enso/table/text/CaseInsensitiveFold.java new file mode 100644 index 0000000000000..f8007f7d47633 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/text/CaseInsensitiveFold.java @@ -0,0 +1,19 @@ +package org.enso.table.text; + +import org.enso.base.Text_Utils; + +import java.util.Locale; + +public class CaseInsensitiveFold implements TextFoldingStrategy { + + private final Locale locale; + + public CaseInsensitiveFold(Locale locale) { + this.locale = locale; + } + + @Override + public String fold(String value) { + return Text_Utils.case_insensitive_key(value, locale); + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/text/TextFoldingStrategy.java b/std-bits/table/src/main/java/org/enso/table/text/TextFoldingStrategy.java new file mode 100644 index 0000000000000..40c4e82925852 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/text/TextFoldingStrategy.java @@ -0,0 +1,5 @@ +package org.enso.table.text; + +public interface TextFoldingStrategy { + String fold(String value); +} diff --git a/std-bits/table/src/main/java/org/enso/table/text/UnicodeNormalizedFold.java b/std-bits/table/src/main/java/org/enso/table/text/UnicodeNormalizedFold.java new file mode 100644 index 0000000000000..58ae099e87b5f --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/text/UnicodeNormalizedFold.java @@ -0,0 +1,13 @@ +package org.enso.table.text; + +import org.enso.base.Text_Utils; + +public class UnicodeNormalizedFold implements TextFoldingStrategy { + @Override + public String fold(String value) { + return Text_Utils.normalize(value); + } + + public static final UnicodeNormalizedFold INSTANCE = new UnicodeNormalizedFold(); +} +