enso-org · mergify · Sep 7, 2022 · Aug 24, 2022 · Aug 31, 2022 · Sep 1, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -190,6 +190,7 @@
 - [Added various date part functions to `Date` and `Date_Time`.][3669]
 - [Implemented `Table.take` and `Table.drop` for the in-memory backend.][3647]
 - [Implemented specialized storage for the in-memory Table.][3673]
+- [Implemented `Table.distinct` for the in-memory backend.][3684]
 
 [debug-shortcuts]:
   https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -303,6 +304,7 @@
 [3669]: https://github.com/enso-org/enso/pull/3669
 [3647]: https://github.com/enso-org/enso/pull/3647
 [3673]: https://github.com/enso-org/enso/pull/3673
+[3684]: https://github.com/enso-org/enso/pull/3684
 
 #### Enso Compiler
 

@@ -1767,7 +1767,7 @@ lazy val `std-table` = project
     Compile / packageBin / artifactPath :=
       `table-polyglot-root` / "std-table.jar",
     libraryDependencies ++= Seq(
-      "com.ibm.icu"         % "icu4j"             % icuVersion,
+      "com.ibm.icu"         % "icu4j"             % icuVersion % "provided",
       "com.univocity"       % "univocity-parsers" % "2.9.1",
       "org.apache.poi"      % "poi-ooxml"         % "5.2.2",
       "org.apache.xmlbeans" % "xmlbeans"          % "5.1.0",
@@ -1786,6 +1786,7 @@ lazy val `std-table` = project
       result
     }.value
   )
+  .dependsOn(`std-base` % "provided")
 
 lazy val `std-image` = project
   .in(file("std-bits") / "image")

@@ -1,3 +1,7 @@
+from Standard.Base import all
+
+polyglot java import org.enso.base.text.TextFoldingStrategy
+
 ## Specifies the casing options for text conversion.
 type Case
     ## All letters in lower case.
@@ -8,3 +12,18 @@ type Case
 
     ## First letter of each word in upper case, rest in lower case.
     Title
+
+## Represents case-insensitive comparison mode.
+
+   Arguments:
+   - locale: The locale used for the comparison.
+type Case_Insensitive
+    Case_Insensitive_Data locale=Locale.default
+
+
+folding_strategy : (True|Case_Insensitive) -> TextFoldingStrategy
+folding_strategy case_sensitive = case case_sensitive of
+    True -> TextFoldingStrategy.unicodeNormalizedFold
+    Case_Insensitive_Data locale ->
+        TextFoldingStrategy.caseInsensitiveFold locale.java_locale
+
@@ -1,5 +1,6 @@
 from Standard.Base import all
 
+from Standard.Base.Data.Text.Case import Case_Insensitive, Case_Insensitive_Data
 from Standard.Base.Error.Problem_Behavior import Report_Warning
 from Standard.Base.Error.Common import Wrapped_Dataflow_Error_Data
 
@@ -13,13 +14,6 @@ No_Matches_Found.to_display_text self =
     "The criteria "+self.criteria.to_text+" did not match any names in the input."
 
 
-## Represents case-insensitive comparison mode.
-
-   Arguments:
-   - locale: The locale used for the comparison.
-type Case_Insensitive
-    Case_Insensitive_Data locale=Locale.default
-
 ## Represents exact text matching mode.
 
    Arguments:

@@ -19,6 +19,7 @@ import project.Data.Regression
 import project.Data.Statistics
 import project.Data.Statistics.Rank_Method
 import project.Data.Text
+import project.Data.Text.Case
 import project.Data.Text.Encoding
 import project.Data.Text.Extensions
 import project.Data.Text.Matching
@@ -97,7 +98,8 @@ from project.Data.Range export all
    https://www.pivotaltracker.com/story/show/181403340
    https://www.pivotaltracker.com/story/show/181309938
 from project.Data.Text.Extensions export Text, Line_Ending_Style, Case, Location, Matching_Mode
-from project.Data.Text.Matching export Case_Insensitive_Data, Text_Matcher_Data, Regex_Matcher_Data, No_Matches_Found_Data
+from project.Data.Text.Matching export Text_Matcher_Data, Regex_Matcher_Data, No_Matches_Found_Data
+from project.Data.Text.Case export Case_Insensitive_Data
 from project.Data.Text export all hiding Encoding, Span, Text_Ordering
 from project.Data.Text.Encoding export Encoding, Encoding_Error, Encoding_Error_Data
 from project.Data.Text.Text_Ordering export all

@@ -19,7 +19,7 @@ import Standard.Table.Internal.Aggregate_Column_Helper
 from Standard.Database.Data.Column import Column, Aggregate_Column_Builder, Column_Data
 from Standard.Database.Data.Internal.IR import Internal_Column, Internal_Column_Data
 from Standard.Table.Errors import No_Such_Column_Error, No_Such_Column_Error_Data
-from Standard.Table.Data.Column_Selector import Column_Selector, By_Index
+from Standard.Table.Data.Column_Selector import Column_Selector, By_Index, By_Name
 from Standard.Table.Data.Data_Formatter import Data_Formatter
 from Standard.Database.Error import Unsupported_Database_Operation_Error_Data
 import Standard.Table.Data.Column_Name_Mapping
@@ -547,6 +547,38 @@ type Table
             new_ctx = self.context.add_orders new_order_descriptors
             self.updated_context new_ctx
 
+    ## Returns the distinct set of rows within the specified columns from the
+       input table.
+
+       When multiple rows have the same values within the specified columns, the
+       first row of each such set is returned.
+
+       For the in-memory table, the unique rows will be in the order they
+       occurred in the input (this is not guaranteed for database operations).
+
+       Arguments:
+       - columns: The columns of the table to use for distinguishing the rows.
+       - case_sensitive: Specifies if the text values should be compared case
+         sensitively.
+       - on_problems: Specifies how to handle problems if they occur, reporting
+         them as warnings by default.
+
+         The following problems can occur:
+         - If a column in columns is not in the input table, a
+           `Missing_Input_Columns`.
+         - If duplicate columns, names or indices are provided, a
+           `Duplicate_Column_Selectors`.
+         - If a column index is out of range, a `Column_Indexes_Out_Of_Range`.
+         - If two distinct indices refer to the same column, an
+           `Input_Indices_Already_Matched`.
+         - If no valid columns are selected, a `No_Input_Columns_Selected`.
+         - If floating point values are present in the distinct columns, a
+           `Floating_Point_Grouping` warning.
+    distinct : Column_Selector -> (True|Case_Insensitive) -> Problem_Behavior -> Table
+    distinct self (columns = By_Name (self.columns.map .name)) case_sensitive=True on_problems=Report_Warning =
+        _ = [columns, case_sensitive, on_problems]
+        Error.throw (Unsupported_Database_Operation_Error_Data "`Table.distinct` is not yet implemented for the database backend.")
+
     ## UNSTABLE
 
        Efficiently joins two tables based on either the index or a key column.

@@ -3,6 +3,7 @@ import Standard.Base.Error.Common as Errors
 from Standard.Base.Error.Problem_Behavior import Report_Warning
 import Standard.Base.Data.Index_Sub_Range
 import Standard.Base.Data.Ordering.Comparator
+import Standard.Base.Data.Text.Case
 import Standard.Base.System.Platform
 
 import Standard.Table.Data.Column
@@ -14,10 +15,10 @@ import Standard.Table.Internal.Delimited_Reader
 import Standard.Table.Internal.Delimited_Writer
 import Standard.Table.Internal.Problem_Builder
 
-from Standard.Table.Data.Column_Selector import Column_Selector, By_Index
+from Standard.Table.Data.Column_Selector import Column_Selector, By_Index, By_Name
 from Standard.Table.Data.Column_Type_Selection import Column_Type_Selection, Auto
 from Standard.Table.Data.Data_Formatter import Data_Formatter, Data_Formatter_Data
-from Standard.Table.Errors import Missing_Input_Columns, Column_Indexes_Out_Of_Range, Duplicate_Type_Selector, No_Index_Set_Error, No_Such_Column_Error, No_Such_Column_Error_Data
+from Standard.Table.Errors import Missing_Input_Columns, Column_Indexes_Out_Of_Range, Duplicate_Type_Selector, No_Index_Set_Error, No_Such_Column_Error, No_Such_Column_Error_Data, No_Input_Columns_Selected, No_Output_Columns
 import Standard.Table.Data.Match_Columns
 
 import Standard.Table.Data.Column_Name_Mapping
@@ -541,7 +542,7 @@ type Table
     aggregate self columns (on_problems=Report_Warning) =
         validated = Aggregate_Column_Helper.prepare_aggregate_columns columns self
 
-        on_problems.attach_problems_before validated.problems <|
+        on_problems.attach_problems_before validated.problems <| Illegal_Argument_Error.handle_java_exception <|
             java_key_columns = validated.key_columns.map .java_column
             index = self.java_table.indexFromColumns java_key_columns.to_array Comparator.new
 
@@ -650,9 +651,53 @@ type Table
             selected_columns = columns_for_ordering.map c->c.column.java_column
             ordering = columns_for_ordering.map c->c.associated_selector.direction.to_sign
             comparator = Comparator.for_text_ordering text_ordering
-            java_table = self.java_table.orderBy selected_columns.to_array ordering.to_array comparator
+            java_table = Illegal_Argument_Error.handle_java_exception <|
+                self.java_table.orderBy selected_columns.to_array ordering.to_array comparator
             Table_Data java_table
 
+    ## Returns the distinct set of rows within the specified columns from the
+       input table.
+
+       When multiple rows have the same values within the specified columns, the
+       first row of each such set is returned.
+
+       For the in-memory table, the unique rows will be in the order they
+       occurred in the input (this is not guaranteed for database operations).
+
+       Arguments:
+       - columns: The columns of the table to use for distinguishing the rows.
+       - case_sensitive: Specifies if the text values should be compared case
+         sensitively.
+       - on_problems: Specifies how to handle problems if they occur, reporting
+         them as warnings by default.
+
+         The following problems can occur:
+         - If a column in columns is not in the input table, a
+           `Missing_Input_Columns`.
+         - If duplicate columns, names or indices are provided, a
+           `Duplicate_Column_Selectors`.
+         - If a column index is out of range, a `Column_Indexes_Out_Of_Range`.
+         - If two distinct indices refer to the same column, an
+           `Input_Indices_Already_Matched`.
+         - If no valid columns are selected, a `No_Input_Columns_Selected`.
+         - If floating point values are present in the distinct columns, a
+           `Floating_Point_Grouping` warning.
+    distinct : Column_Selector -> (True|Case_Insensitive) -> Problem_Behavior -> Table
+    distinct self (columns = By_Name (self.columns.map .name)) case_sensitive=True on_problems=Report_Warning =
+        warning_mapper error = case error of
+            No_Output_Columns -> Maybe.Some No_Input_Columns_Selected
+            _ -> Nothing
+        key_columns = Warning.map_warnings_and_errors warning_mapper <|
+            Table_Helpers.select_columns internal_columns=self.columns selector=columns reorder=True on_problems=on_problems
+        java_columns = key_columns.map .java_column
+        text_folding_strategy = Case.folding_strategy case_sensitive
+        java_table = Illegal_Argument_Error.handle_java_exception <|
+            self.java_table.distinct java_columns.to_array text_folding_strategy
+        on_problems.attach_problems_after (Table_Data java_table) <|
+            problems = java_table.getProblems
+            Aggregate_Column_Helper.parse_aggregated_problems problems
+
+
     ## Parses columns within a Table to a specific value type.
        By default, it looks at all `Text` columns and attempts to deduce the
        type (columns with other types are not affected). If `column_types` are

@@ -0,0 +1,37 @@
+package org.enso.base.text;
+
+import org.enso.base.Text_Utils;
+
+import java.util.Locale;
+
+/** A strategy for folding text values for comparison and hashing. */
+@FunctionalInterface
+public interface TextFoldingStrategy {
+  /**
+   * Folds the given text into the form this strategy uses for comparison and hashing.
+   *
+   * @param value the text to fold
+   * @return the folded text
+   */
+  String fold(String value);
+
+  /**
+   * A folding strategy that ensures the strings are normalized, so various equivalent Unicode forms
+   * are equated.
+   */
+  TextFoldingStrategy unicodeNormalizedFold = Text_Utils::normalize;
+
+  /**
+   * A folding strategy that not only normalizes the Unicode strings but also ensures
+   * case-insensitive comparison. It needs a locale for locale-specific case handling.
+   *
+   * @param locale the locale used for locale-specific case handling
+   * @return a strategy that normalizes the text and then computes its case-insensitive key
+   */
+  static TextFoldingStrategy caseInsensitiveFold(Locale locale) {
+    return (String value) -> {
+      String normalized = Text_Utils.normalize(value);
+      return Text_Utils.case_insensitive_key(normalized, locale);
+    };
+  }
+}
@@ -1,11 +1,12 @@
 package org.enso.table.aggregations;
 
-import java.util.List;
 import org.enso.table.data.column.storage.Storage;
 import org.enso.table.data.table.Column;
 import org.enso.table.data.table.problems.InvalidAggregation;
 import org.enso.table.data.table.problems.UnquotedDelimiter;
 
+import java.util.List;
+
 public class Concatenate extends Aggregator {
   private final Storage storage;
   private final String separator;

@@ -1,11 +1,14 @@
 package org.enso.table.aggregations;
 
 import org.enso.table.data.column.storage.Storage;
-import org.enso.table.data.index.MultiValueKey;
+import org.enso.table.data.index.UnorderedMultiValueKey;
 import org.enso.table.data.table.Column;
 import org.enso.table.data.table.problems.FloatingPointGrouping;
 
-import java.util.*;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.List;
 
 /**
  * Aggregate Column counting the number of distinct items in a group. If `ignoreAllNull` is true,
@@ -33,9 +36,9 @@ public CountDistinct(
 
   @Override
   public Object aggregate(List<Integer> indexes) {
-    Set<MultiValueKey> set = new HashSet<>();
+    HashSet<UnorderedMultiValueKey> set = new HashSet<>();
     for (int row : indexes) {
-      MultiValueKey key = new MultiValueKey(storage, row, objectComparator);
+      UnorderedMultiValueKey key = new UnorderedMultiValueKey(storage, row);
       if (key.hasFloatValues()) {
         this.addProblem(new FloatingPointGrouping(this.getName(), row));
       }

@@ -1,7 +1,7 @@
 package org.enso.table.aggregations;
 
 import org.enso.table.data.column.storage.Storage;
-import org.enso.table.data.index.MultiValueKey;
+import org.enso.table.data.index.OrderedMultiValueKey;
 import org.enso.table.data.table.Column;
 
 import java.util.Arrays;
@@ -51,7 +51,7 @@ public Object aggregate(List<Integer> indexes) {
   }
 
   private Object firstBySpecifiedOrder(List<Integer> indexes) {
-    MultiValueKey key = null;
+    OrderedMultiValueKey key = null;
     Object current = null;
 
     for (int row : indexes) {
@@ -60,8 +60,9 @@ private Object firstBySpecifiedOrder(List<Integer> indexes) {
         continue;
       }
 
-      MultiValueKey newKey =
-          new MultiValueKey(this.orderByColumns, row, this.orderByDirections, objectComparator);
+      OrderedMultiValueKey newKey =
+          new OrderedMultiValueKey(
+              this.orderByColumns, row, this.orderByDirections, objectComparator);
       if (key == null || key.compareTo(newKey) > 0) {
         key = newKey;
         current = storage.getItemBoxed(row);

@@ -1,7 +1,7 @@
 package org.enso.table.aggregations;
 
 import org.enso.table.data.column.storage.Storage;
-import org.enso.table.data.index.MultiValueKey;
+import org.enso.table.data.index.OrderedMultiValueKey;
 import org.enso.table.data.table.Column;
 
 import java.util.Arrays;
@@ -50,7 +50,7 @@ public Object aggregate(List<Integer> indexes) {
   }
 
   private Object lastBySpecifiedOrder(List<Integer> indexes) {
-    MultiValueKey key = null;
+    OrderedMultiValueKey key = null;
     Object current = null;
 
     for (int i = indexes.size() - 1; i >= 0; i--) {
@@ -60,8 +60,9 @@ private Object lastBySpecifiedOrder(List<Integer> indexes) {
         continue;
       }
 
-      MultiValueKey newKey =
-          new MultiValueKey(this.orderByColumns, row, this.orderByDirections, objectComparator);
+      OrderedMultiValueKey newKey =
+          new OrderedMultiValueKey(
+              this.orderByColumns, row, this.orderByDirections, objectComparator);
       if (key == null || key.compareTo(newKey) < 0) {
         key = newKey;
         current = storage.getItemBoxed(row);

@@ -1,12 +1,13 @@
 package org.enso.table.aggregations;
 
+import org.enso.table.data.column.storage.Storage;
+import org.enso.table.data.table.Column;
+import org.enso.table.data.table.problems.InvalidAggregation;
+
 import java.util.List;
 import java.util.Map;
 import java.util.SortedMap;
 import java.util.TreeMap;
-import org.enso.table.data.column.storage.Storage;
-import org.enso.table.data.table.Column;
-import org.enso.table.data.table.problems.InvalidAggregation;
 
 /** Aggregate Column computing a percentile value in a group. */
 public class Percentile extends Aggregator {

@@ -1,10 +1,11 @@
 package org.enso.table.aggregations;
 
-import java.util.List;
 import org.enso.table.data.column.storage.Storage;
 import org.enso.table.data.table.Column;
 import org.enso.table.data.table.problems.InvalidAggregation;
 
+import java.util.List;
+
 /** Aggregate Column computing the standard deviation of a group. */
 public class StandardDeviation extends Aggregator {
   private static class Calculation {