enso-org · mergify · Jan 17, 2023 · Jan 10, 2023 · Jan 10, 2023 · Jan 10, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -273,6 +273,7 @@
 - [Aligning core APIs for Vector, List and Range. Adding some missing functions
   to the types.][4026]
 - [Implemented `Table.distinct` for Database backends.][4027]
+- [Implemented `Table.union` for the in-memory backend.][4052]
 
 [debug-shortcuts]:
   https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -428,6 +429,7 @@
 [4013]: https://github.com/enso-org/enso/pull/4013
 [4026]: https://github.com/enso-org/enso/pull/4026
 [4027]: https://github.com/enso-org/enso/pull/4027
+[4052]: https://github.com/enso-org/enso/pull/4052
 
 #### Enso Compiler
 

@@ -241,7 +241,7 @@ type Map
              import Standard.Examples
 
              example_contains = Examples.map.contains_key 2
-    contains_key : Any -> Any
+    contains_key : Any -> Boolean
     contains_key self key =
         go map = case map of
             Map.Tip -> False

@@ -1,4 +1,5 @@
 from Standard.Base import all
+import Standard.Base.Data.Array_Proxy.Array_Proxy
 import Standard.Base.Error.Common.Index_Out_Of_Bounds
 import Standard.Base.Error.Common.Type_Error
 import Standard.Base.Error.File_Error.File_Error
@@ -13,6 +14,7 @@ import Standard.Table.Data.Expression.Expression
 import Standard.Table.Data.Expression.Expression_Error
 import Standard.Table.Data.Join_Condition.Join_Condition
 import Standard.Table.Data.Join_Kind.Join_Kind
+import Standard.Table.Data.Report_Unmatched.Report_Unmatched
 import Standard.Table.Data.Row.Row
 import Standard.Table.Data.Table.Table as Materialized_Table
 import Standard.Table.Internal.Java_Exports
@@ -534,13 +536,17 @@ type Table
 
        Returns the vector of columns contained in this table.
     columns : Vector Column
-    columns self = self.internal_columns . map self.make_column
+    columns self = Vector.from_polyglot_array <|
+        Array_Proxy.new self.internal_columns.length i->
+            self.make_column (self.internal_columns.at i)
 
     ## UNSTABLE
 
        Returns the vector of column names contained in this table.
     column_names : Vector Text
-    column_names self = self.internal_columns . map _.name
+    column_names self = Vector.from_polyglot_array <|
+        Array_Proxy.new self.internal_columns.length i->
+            self.internal_columns.at i . name
 
     ## Returns a vector of rows contained in this table.
 
@@ -685,6 +691,9 @@ type Table
            and an empty result is reported.
          - If a column index is invalid, an `Index_Out_Of_Bounds` is
            reported and an empty result is reported.
+         - If there are column names that are clashing between the two tables, a
+           `Duplicate_Output_Column_Names` is reported and the columns from the
+           table are renamed as described below.
          - If a join condition correlates columns whose types are not compatible
            (for example comparing numeric types with text), an
            `Invalid_Value_Type` is reported.
@@ -767,6 +776,99 @@ type Table
             problem_builder.attach_problems_before on_problems <|
                 self.connection.dialect.prepare_join self.connection sql_join_kind new_table_name left_setup.subquery right_setup.subquery on_expressions where_expressions columns_to_select=result_columns
 
+    ## ALIAS append, concat
+       Appends records from other table(s) to this table.
+
+       Arguments:
+       - tables: A single table or a vector of tables to append to this one. The
+         tables are concatenated in the order they are specified, with `self`
+         being the first one.
+       - match_columns: Specifies how to match the columns.
+         - If `Match_Columns.By_Name` - the columns are matched by name across
+           all provided tables.
+           If unmatched columns are to be dropped, the resulting table will keep
+           only the set of columns that appear in all provided tables, in the
+           relative order that they appeared in the `self` table.
+           If unmatched columns are kept, they are added in the order of
+           appearance - i.e. first all columns from `self` will be added in the
+           original order, then any columns from the second table that were not
+           matched will be added at the end (preserving their relative order),
+           and so on for all the remaining tables.
+         - If `Match_Columns.By_Position` - the columns are mapped by position.
+           If unmatched columns are to be dropped, the resulting table will have
+           as many columns as the table with the fewest columns, and the
+           column names of the first table (`self`) will be used.
+           If unmatched columns are kept, the resulting table will have as many
+           columns as the table with the most columns. Since the first table may
+           not have all the necessary columns to provide column names for the
+           result, the result will have column names taken from the first table
+           that has the biggest number of columns.
+       - keep_unmatched_columns: If set to `True`, unmatched columns are kept
+         and are padded with `Nothing` for tables that did not have them.
+         If set to `False`, only the common subset of columns is kept - any
+         column that is not present in all tables is dropped. Defaults to
+         `Report_Unmatched`, which behaves like `True` - unmatched columns are
+         kept and padded with `Nothing`, but a problem is reported.
+       - allow_type_widening: Specifies if the resulting column type should be
+         adjusted to fit columns from all arguments. If `True`, a common type
+         will be chosen for each column (see "Unifying Column Types" below).
+         If `False`, the resulting column type will be the same as in the first
+         table containing the column. In this case, all columns that are
+         concatenated must have the same type as the first one (unless this
+         had a `Mixed` type - in which case it will accept any other types).
+       - on_problems: Specifies how to handle problems if they occur, reporting
+         them as warnings by default.
+
+         - If `keep_unmatched_columns` is set to `Report_Unmatched` (the
+           default):
+           - If matching by name and there are columns that are not present in
+             all tables, `Unmatched_Columns` is reported.
+           - If matching by position and column counts of the merged tables
+             differ, then a `Column_Count_Mismatch` is reported. The error will
+             contain the greatest column count as its `expected` value and the
+             smallest one as its `actual` value.
+         - If `keep_unmatched_columns` is set to `False` and matching by name,
+           it is possible that there are no columns that are common to all
+           provided tables. In that case `No_Output_Columns` is thrown as a
+           dataflow error regardless of the `on_problems` setting, because there
+           are no columns to include in the resulting table.
+         - If type widening is disabled and one of corresponding columns has a
+           type that is incompatible with the type coming from the first table,
+           a `Column_Type_Mismatch` is reported. The problematic column will be
+           dropped from the resulting table. With type widening disabled, the
+           subsequent tables must have the same types as the first one, unless
+           the type of the first one was `Mixed` which will accept any other
+           type.
+         - If a common type coercion for a set of matched columns from
+           concatenated tables cannot be found, a `No_Common_Type` is reported.
+           In warning or ignore mode, the problematic column will be dropped
+           from the resulting table.
+
+       ? Unifying Column Types
+
+         If `allow_type_widening` is set to `True`, then the following rules are
+         used to find a common type that will fit values from all merged tables.
+
+         Numeric columns are unified by finding the most general type that can
+         fit all of the columns. The biggest integer type will be chosen and if
+         integers and decimals are mixed, the decimal type will be chosen.
+         If boolean columns are mixed with numeric columns, they will be coerced
+         to the numeric type (and converted to 0 and 1).
+
+         Text types will also be coerced according to the common rules - if
+         constant-length texts of different lengths are mixed, they will be
+         coerced to a varying-length type.
+
+         If one of the matched columns has `Mixed` type, that type will be used
+         regardless of types of other columns. Mixing any other types will
+         result in a `No_Common_Type` problem. If columns of incompatible types
+         are meant to be mixed, at least one of them should be explicitly
+         retyped to the `Mixed` type to indicate that intention.
+    union : (Table | Vector Table) -> Match_Columns -> Boolean | Report_Unmatched -> Boolean -> Problem_Behavior -> Table
+    union self tables match_columns=Match_Columns.By_Name keep_unmatched_columns=Report_Unmatched allow_type_widening=True on_problems=Report_Warning =
+        # Reference every argument so that none of them is left unused
+        # (presumably to avoid unused-argument warnings) - this backend has no
+        # implementation yet.
+        _ = [tables, match_columns, keep_unmatched_columns, allow_type_widening, on_problems]
+        message = "Table.union is not implemented yet for the Database backends."
+        unsupported = Unsupported_Database_Operation.Error message
+        Error.throw unsupported
+
     ## ALIAS group, summarize
 
        Aggregates the rows in a table using any `Group_By` entries in columns.

@@ -14,6 +14,7 @@ from project.Data.Table import print_table
 
 from project.Errors import No_Index_Set_Error
 
+polyglot java import org.enso.table.data.column.storage.Storage as Java_Storage
 polyglot java import org.enso.table.data.table.Column as Java_Column
 polyglot java import org.enso.table.operations.OrderBuilder
 
@@ -34,6 +35,11 @@ type Column
     from_vector : Text -> Vector -> Column
     from_vector name items = Column.Value (Java_Column.fromItems name items.to_array)
 
+    ## PRIVATE
+       Wraps an internal Java storage in a new column with the given name.
+
+       Arguments:
+       - name: The name of the column to create.
+       - storage: The Java storage backing the column.
+    from_storage : Text -> Java_Storage -> Column
+    from_storage name storage =
+        java_column = Java_Column.new name storage
+        Column.Value java_column
+
     ## Creates a new column given a name and a vector of elements repeated over and over.
 
        Arguments:
@@ -1013,20 +1019,11 @@ type Column
     storage_type : Storage
     storage_type self =
         tp = self.java_column.getStorage.getType
-        Storage.types.at tp . catch Index_Out_Of_Bounds.Error _->
-            Panic.throw (Illegal_State.Error "Unknown storage type: "+tp.to_text)
+        Storage.from_java tp
 
     ## UNSTABLE TODO this is a prototype that will be revisited later on
     value_type : Value_Type
-    value_type self = case self.storage_type of
-        Storage.Text -> Value_Type.Char
-        Storage.Integer -> Value_Type.Integer
-        Storage.Decimal -> Value_Type.Float
-        Storage.Boolean -> Value_Type.Boolean
-        Storage.Date -> Value_Type.Date
-        Storage.Time_Of_Day -> Value_Type.Time
-        Storage.Date_Time -> Value_Type.Date_Time
-        Storage.Any -> Value_Type.Mixed
+    value_type self = self.storage_type.to_approximate_value_type
 
     ## UNSTABLE
 
@@ -1323,3 +1320,44 @@ get_item_string column ix =
 slice_ranges column ranges =
     normalized = Index_Sub_Range_Module.normalize_ranges ranges
     Column.Value (column.java_column.slice normalized.to_array)
+
+## PRIVATE
+   Creates a storage builder suitable for building a column for the provided
+   column type.
+
+   This relies on a rudimentary mapping between `Value_Type` and `Storage`. It
+   does not ensure validity checks for the particular type, like checking string
+   length or number size.
+
+   It may be tempting to return an `InferredBuilder` for the `Mixed` type - as
+   this will use a more compact storage if a mixed type column contains only
+   numbers. However, since currently `Column.value_type` is derived directly
+   from its storage type, that would result in a changed `value_type` in the
+   result. Whereas we want to ensure that if the requested type is `Mixed`, the
+   resulting column should also report `Mixed` value type. Once the types work
+   decouples `value_type` from `storage_type`, this logic could be adjusted.
+
+   Due to the coupling of value types and storage, `value_type` of the created
+   column may not be exactly the same as the one requested here, it will be the
+   closest one currently supported by our storage (i.e. any constraints like
+   integer size or constant text width will be dropped). This will need to be
+   revisited as part of the types work:
+   https://www.pivotaltracker.com/story/show/183854180
+make_storage_builder_for_type value_type initial_size=128 =
+    # Pick the in-memory storage that most closely approximates the requested
+    # value type; anything without a specialized storage uses `Storage.Any`.
+    storage_type = case value_type of
+        Value_Type.Boolean -> Storage.Boolean
+        Value_Type.Byte -> Storage.Integer
+        Value_Type.Integer _ -> Storage.Integer
+        Value_Type.Float _ -> Storage.Decimal
+        Value_Type.Char _ _ -> Storage.Text
+        Value_Type.Date -> Storage.Date
+        Value_Type.Time -> Storage.Time_Of_Day
+        Value_Type.Date_Time with_timezone ->
+            ## Our specialized storage is only capable of storing date time
+               with timezone. Any other kind of date-time falls back to object
+               storage.
+            if with_timezone then Storage.Date_Time else Storage.Any
+        ## Arbitrary precision numbers are not currently representable by our
+           specialized in-memory storage, so falling back to object storage.
+        Value_Type.Decimal _ _ -> Storage.Any
+        Value_Type.Mixed -> Storage.Any
+        _ -> Storage.Any
+    Storage.make_builder storage_type initial_size
@@ -1,15 +1,107 @@
 from Standard.Base import all
+import Standard.Base.Runtime.State
 
-## Specifies how to join columns in the table to existing data.
+import project.Data.Report_Unmatched.Report_Unmatched
+from project.Errors import Column_Count_Mismatch, Unmatched_Columns, No_Output_Columns
+
+## Specifies a column matching strategy.
 type Match_Columns
-    ## Columns are matched by Name against an existing file.
-       A `Column_Name_Mismatch` error occurs if any column name in the existing
-       data could not be matched to the new data, or any column name in the new
-       data was not found in the existing data.
+    ## Columns are matched by Name.
     By_Name
 
-    ## Columns are matched by Position against the existing data.
+    ## Columns are matched by Position.
+
        Note: column names are not compared.
-       A `Column_Count_Mismatch` error occurs if the existing data has a
-       different number of columns than the table.
     By_Position
+
+## PRIVATE
+   A helper that encapsulates the common backend-agnostic logic of matching
+   columns in `Table.union`.
+
+   It matches columns according to the provided matching settings and returns a
+   list of column sets to be merged.
+
+   Each column set consists of the name of the resulting column and a list of
+   indices of the columns in the corresponding tables that will be merged to
+   form this result column. The first index corresponds to the first table in
+   the input and so on. If no column was matched for a given column set in a
+   particular table, its entry will contain `Nothing` instead.
+
+   The column sets are returned in the order in which the corresponding result
+   columns should appear in the resulting table.
+
+   Arguments:
+   - tables: The tables whose columns are matched. At least one table must be
+     provided.
+   - matching_mode: A `Match_Columns` value selecting matching by name or by
+     position.
+   - keep_unmatched_columns: `True`, `False` or `Report_Unmatched`.
+   - problem_builder: Receives the `Unmatched_Columns` or
+     `Column_Count_Mismatch` warning in the `Report_Unmatched` mode.
+match_columns tables matching_mode keep_unmatched_columns problem_builder = case matching_mode of
+    Match_Columns.By_Name -> case keep_unmatched_columns of
+        False ->
+            occurrences = find_column_counts tables
+            table_count = tables.length
+            # Only names present in every table survive, in the order in which
+            # they appear in the first table.
+            shared_names = tables.first.column_names.filter name->
+                occurrences.at name == table_count
+            if shared_names.is_empty then Error.throw No_Output_Columns else
+                shared_names.map name->
+                    indices = tables.map table->
+                        table.column_names.index_of name
+                    Column_Set.Value name indices
+        _ ->
+            all_names = distinct_columns_in_appearance_order tables
+            if keep_unmatched_columns == Report_Unmatched then
+                occurrences = find_column_counts tables
+                table_count = tables.length
+                # Filtering the output names keeps the reported unmatched
+                # columns in a deterministic order.
+                missing_somewhere = all_names.filter name->
+                    occurrences.get name 0 < table_count
+                if missing_somewhere.not_empty then
+                    problem_builder.report_other_warning (Unmatched_Columns.Error missing_somewhere)
+            all_names.map name->
+                indices = tables.map table->
+                    table.column_names.index_of name
+                Column_Set.Value name indices
+    Match_Columns.By_Position ->
+        widths = tables.map table-> table.columns.length
+        extremes = widths.compute_bulk [Statistic.Minimum, Statistic.Maximum]
+        narrowest = extremes.first
+        widest = extremes.second
+        drop_unmatched = keep_unmatched_columns == False
+        result_width = if drop_unmatched then narrowest else widest
+        if (narrowest != widest) && (keep_unmatched_columns == Report_Unmatched) then
+            problem_builder.report_other_warning (Column_Count_Mismatch.Error widest narrowest)
+        # Names come from the first table when dropping unmatched columns,
+        # otherwise from the first table that has `result_width` columns.
+        naming_table = if drop_unmatched then tables.first else
+            tables.find table-> table.columns.length == result_width
+        Vector.new result_width position->
+            column_name = naming_table.at position . name
+            indices = tables.map table->
+                if position >= table.columns.length then Nothing else position
+            Column_Set.Value column_name indices
+
+# Describes one column of the `Table.union` result: its name and, for each
+# input table, the index of the column that contributes to it.
+type Column_Set
+    ## PRIVATE
+
+       Arguments:
+       - name: The name of the resulting column.
+       - column_indices: One entry per input table - the index of the matched
+         column in that table, or `Nothing` if the table has no matching column.
+    Value (name : Text) (column_indices : Vector (Integer | Nothing))
+
+    ## PRIVATE
+       Resolves the stored indices against the tables they refer to.
+
+       Arguments:
+       - all_tables: The tables, in the same order that was used to compute
+         `column_indices`.
+
+       Returns a vector with one entry per table: the result of looking up the
+       stored index with `parent_table.at`, or `Nothing` for tables that do not
+       contribute to this column set.
+    resolve_columns self all_tables = self.column_indices.zip all_tables i-> parent_table->
+        case i of
+            Nothing -> Nothing
+            _ : Integer -> parent_table.at i
+
+## PRIVATE
+   Returns a map from column name to the number of tables in which a column
+   with that name appears.
+find_column_counts tables =
+    # Adds one occurrence of `name`, starting from 0 for unseen names.
+    bump_count counts name =
+        previous = counts.get name 0
+        counts.insert name (previous + 1)
+    tables.fold Map.empty totals-> table->
+        table.columns.fold totals acc-> column->
+            bump_count acc column.name
+
+## PRIVATE
+   Returns the distinct column names of all tables, ordered by their first
+   appearance when scanning the tables from the first one onwards.
+distinct_columns_in_appearance_order tables =
+    ordered_names = Vector.new_builder
+    # Records `name` on first sight; the map is only used as a set of names.
+    remember seen name = if seen.contains_key name then seen else
+        ordered_names.append name
+        seen.insert name True
+    tables.fold Map.empty seen_before-> table->
+        table.columns.fold seen_before seen-> column->
+            remember seen column.name
+    ordered_names.to_vector
@@ -0,0 +1,3 @@
+## A marker type selecting the mode in which column mismatches are allowed but
+   are reported as a problem.
+
+   It is accepted by the `keep_unmatched_columns` argument of `Table.union`.
+type Report_Unmatched