diff --git a/CHANGELOG.md b/CHANGELOG.md index 28a4fe134672..2ea9454bc1b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -273,6 +273,7 @@ - [Aligning core APIs for Vector, List and Range. Adding some missing functions to the types.][4026] - [Implemented `Table.distinct` for Database backends.][4027] +- [Implemented `Table.union` for the in-memory backend.][4052] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -428,6 +429,7 @@ [4013]: https://github.com/enso-org/enso/pull/4013 [4026]: https://github.com/enso-org/enso/pull/4026 [4027]: https://github.com/enso-org/enso/pull/4027 +[4052]: https://github.com/enso-org/enso/pull/4052 #### Enso Compiler diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Map.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Map.enso index cceca084485a..6e6921eaeebc 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Map.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Map.enso @@ -241,7 +241,7 @@ type Map import Standard.Examples example_contains = Examples.map.contains_key 2 - contains_key : Any -> Any + contains_key : Any -> Boolean contains_key self key = go map = case map of Map.Tip -> False diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso index cb143ae18dcf..44b9b387031f 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso @@ -1,4 +1,5 @@ from Standard.Base import all +import Standard.Base.Data.Array_Proxy.Array_Proxy import Standard.Base.Error.Common.Index_Out_Of_Bounds import Standard.Base.Error.Common.Type_Error import Standard.Base.Error.File_Error.File_Error @@ -13,6 +14,7 @@ import Standard.Table.Data.Expression.Expression import Standard.Table.Data.Expression.Expression_Error import Standard.Table.Data.Join_Condition.Join_Condition import 
Standard.Table.Data.Join_Kind.Join_Kind +import Standard.Table.Data.Report_Unmatched.Report_Unmatched import Standard.Table.Data.Row.Row import Standard.Table.Data.Table.Table as Materialized_Table import Standard.Table.Internal.Java_Exports @@ -534,13 +536,17 @@ type Table Returns the vector of columns contained in this table. columns : Vector Column - columns self = self.internal_columns . map self.make_column + columns self = Vector.from_polyglot_array <| + Array_Proxy.new self.internal_columns.length i-> + self.make_column (self.internal_columns.at i) ## UNSTABLE Returns the vector of column names contained in this table. column_names : Vector Text - column_names self = self.internal_columns . map _.name + column_names self = Vector.from_polyglot_array <| + Array_Proxy.new self.internal_columns.length i-> + self.internal_columns.at i . name ## Returns a vector of rows contained in this table. @@ -685,6 +691,9 @@ type Table and an empty result is reported. - If a column index is invalid, an `Index_Out_Of_Bounds` is reported and an empty result is reported. + - If there are column names that are clashing between the two tables, a + `Duplicate_Output_Column_Names` is reported and the columns from the + table are renamed as described below. - If a join condition correlates columns whose types are not compatible (for example comparing numeric types with text), an `Invalid_Value_Type` is reported. @@ -767,6 +776,99 @@ type Table problem_builder.attach_problems_before on_problems <| self.connection.dialect.prepare_join self.connection sql_join_kind new_table_name left_setup.subquery right_setup.subquery on_expressions where_expressions columns_to_select=result_columns + ## ALIAS append, concat + Appends records from other table(s) to this table. + + Arguments: + - tables: A single table or a vector of tables to append to this one. The + tables are concatenated in the order they are specified, with `self` + being the first one. 
+ - match_columns: Specifies how to match the columns. + - If `Match_Columns.By_Name` - the columns are matched by name across + all provided tables. + If unmatched columns are to be dropped, the resulting table will keep + only the set of columns that appear in all provided tables, in the + relative order that they appeared in the `self` table. + If unmatched columns are kept, they are added in the order of + appearance - i.e. first all columns from `self` will be added in the + original order, then any columns from the second table that were not + matched will be added at the end (preserving their relative order), + and so on for all the remaining tables. + - If `Match_Columns.By_Position` - the columns are mapped by position. + If unmatched columns are to be dropped, the resulting table will have + as many columns as the table that had the fewest columns and the + column names of the first table (`self`) will be used. + If unmatched columns are kept, the resulting table will have as many + columns as the table with the most columns. Since the first table may + not have all the necessary columns to provide column names for the + result, the result will have column names taken from the first table + that has the biggest number of columns. + - keep_unmatched_columns: If set to `True`, unmatched columns are kept + and are padded with `Nothing` for tables that did not have them. + If set to `False`, only the common subset of columns is kept - any + column that is not present in all tables is dropped. Defaults to + `Report_Unmatched`, which behaves like `True` - unmatched columns are + kept and padded with `Nothing`, but a problem is reported. + - allow_type_widening: Specifies if the resulting column type should be + adjusted to fit columns from all arguments. If `True`, a common type + will be chosen for each column (see "Unifying Column Types" below). + If `False`, the resulting column type will be the same as in the first + table containing the column.
In this case, all columns that are + concatenated must have the same type as the first one (unless this + had a `Mixed` type - in which case it will accept any other types). + - on_problems: Specifies how to handle problems if they occur, reporting + them as warnings by default. + + - If `keep_unmatched_columns` is set to `Report_Unmatched` (the + default): + - If matching by name and there are columns that are not present in + all tables, `Unmatched_Columns` is reported. + - If matching by position and column counts of the merged tables + differ, then a `Column_Count_Mismatch` is reported. The error will + contain the greatest column count as its `expected` value and the + smallest one as its `actual` value. + - If `keep_unmatched_columns` is set to `False` and matching by name, + it is possible that there are no columns that are common to all + provided tables, in that case `No_Output_Columns` is thrown as a + dataflow error regardless of the `on_problems` setting, because there + are no columns to include in the resulting table. + - If type widening is disabled and one of corresponding columns has a + type that is incompatible with the type coming from the first table, + a `Column_Type_Mismatch` is reported. The problematic column will be + dropped from the resulting table. With type widening disabled, the + subsequent tables must have the same types as the first one, unless + the type of the first one was `Mixed` which will accept any other + type. + - If a common type coercion for a set of matched columns from + concatenated tables cannot be found, a `No_Common_Type` is reported. + In warning or ignore mode, the problematic column will be dropped + from the resulting table. + + ? Unifying Column Types + + If `allow_type_widening` is set to `True`, then the following rules are + used to find a common type that will fit values from all merged tables. + + Numeric columns are unified by finding the most general type that can + fit all of the columns. 
The biggest integer type will be chosen and if + integers and decimals are mixed, the decimal type will be chosen. + If boolean columns are mixed with numeric columns, they will be coerced + to the numeric type (and converted to 0 and 1). + + Text types will also be coerced according to the common rules - if + constant-length texts of different lengths are mixed, they will be + coerced to a varying-length type. + + If one of the matched columns has `Mixed` type, that type will be used + regardless of types of other columns. Mixing any other types will + result in a `No_Common_Type` problem. If columns of incompatible types + are meant to be mixed, at least one of them should be explicitly + retyped to the `Mixed` type to indicate that intention. + union : (Table | Vector Table) -> Match_Columns -> Boolean | Report_Unmatched -> Boolean -> Problem_Behavior -> Table + union self tables match_columns=Match_Columns.By_Name keep_unmatched_columns=Report_Unmatched allow_type_widening=True on_problems=Report_Warning = + _ = [tables, match_columns, keep_unmatched_columns, allow_type_widening, on_problems] + Error.throw (Unsupported_Database_Operation.Error "Table.union is not implemented yet for the Database backends.") + ## ALIAS group, summarize Aggregates the rows in a table using any `Group_By` entries in columns. 
diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso index a0fcb68880e6..db682ee3969a 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso @@ -14,6 +14,7 @@ from project.Data.Table import print_table from project.Errors import No_Index_Set_Error +polyglot java import org.enso.table.data.column.storage.Storage as Java_Storage polyglot java import org.enso.table.data.table.Column as Java_Column polyglot java import org.enso.table.operations.OrderBuilder @@ -34,6 +35,11 @@ type Column from_vector : Text -> Vector -> Column from_vector name items = Column.Value (Java_Column.fromItems name items.to_array) + ## PRIVATE + Creates a new column given a name and an internal Java storage. + from_storage : Text -> Java_Storage -> Column + from_storage name storage = Column.Value (Java_Column.new name storage) + ## Creates a new column given a name and a vector of elements repeated over and over. Arguments: @@ -1013,20 +1019,11 @@ type Column storage_type : Storage storage_type self = tp = self.java_column.getStorage.getType - Storage.types.at tp . 
catch Index_Out_Of_Bounds.Error _-> - Panic.throw (Illegal_State.Error "Unknown storage type: "+tp.to_text) + Storage.from_java tp ## UNSTABLE TODO this is a prototype that will be revisited later on value_type : Value_Type - value_type self = case self.storage_type of - Storage.Text -> Value_Type.Char - Storage.Integer -> Value_Type.Integer - Storage.Decimal -> Value_Type.Float - Storage.Boolean -> Value_Type.Boolean - Storage.Date -> Value_Type.Date - Storage.Time_Of_Day -> Value_Type.Time - Storage.Date_Time -> Value_Type.Date_Time - Storage.Any -> Value_Type.Mixed + value_type self = self.storage_type.to_approximate_value_type ## UNSTABLE @@ -1323,3 +1320,44 @@ get_item_string column ix = slice_ranges column ranges = normalized = Index_Sub_Range_Module.normalize_ranges ranges Column.Value (column.java_column.slice normalized.to_array) + +## PRIVATE + Creates a storage builder suitable for building a column for the provided + column type. + + This relies on a rudimentary mapping between `Value_Type` and `Storage`. It + does not ensure validity checks for the particular type, like checking string + length or number size. + + It may be tempting to return an `InferredBuilder` for the `Mixed` type - as + this will use a more compact storage if a mixed type column contains only + numbers. However, since currently `Column.value_type` is derived directly + from its storage type, that would result in a changed `value_type` in the + result. Whereas we want to ensure that if the requested type is `Mixed`, the + resulting column should also report `Mixed` value type. Once the types work + decouples `value_type` from `storage_type`, this logic could be adjusted. + + Due to the coupling of value types and storage, `value_type` of the created + column may not be exactly the same as the one requested here, it will be the + closest one currently supported by our storage (i.e. any constraints like + integer size or constant text width will be dropped). 
This will need to be + revisited as part of the types work: + https://www.pivotaltracker.com/story/show/183854180 +make_storage_builder_for_type value_type initial_size=128 = + closest_storage_type = case value_type of + Value_Type.Boolean -> Storage.Boolean + Value_Type.Byte -> Storage.Integer + Value_Type.Integer _ -> Storage.Integer + Value_Type.Float _ -> Storage.Decimal + ## Arbitrary precision numbers are not currently representable by our + specialized in-memory storage, so falling back to object storage. + Value_Type.Decimal _ _ -> Storage.Any + Value_Type.Char _ _ -> Storage.Text + Value_Type.Date -> Storage.Date + Value_Type.Date_Time with_timezone -> + ## Our specialized storage is only capable of storing date time with timezone. If we want to store a different kind of date-time, we will + if with_timezone then Storage.Date_Time else Storage.Any + Value_Type.Time -> Storage.Time_Of_Day + Value_Type.Mixed -> Storage.Any + _ -> Storage.Any + Storage.make_builder closest_storage_type initial_size diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Match_Columns.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Match_Columns.enso index fb4e10998f8c..9cdaec0aaffe 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Match_Columns.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Match_Columns.enso @@ -1,15 +1,107 @@ from Standard.Base import all +import Standard.Base.Runtime.State -## Specifies how to join columns in the table to existing data. +import project.Data.Report_Unmatched.Report_Unmatched +from project.Errors import Column_Count_Mismatch, Unmatched_Columns, No_Output_Columns + +## Specifies a column matching strategy. type Match_Columns - ## Columns are matched by Name against an existing file. - A `Column_Name_Mismatch` error occurs if any column name in the existing - data could not be matched to the new data, or any column name in the new - data was not found in the existing data. 
+ ## Columns are matched by Name. By_Name - ## Columns are matched by Position against the existing data. + ## Columns are matched by Position. + Note: column names are not compared. - A `Column_Count_Mismatch` error occurs if the existing data has a - different number of columns than the table. By_Position + +## PRIVATE + A helper that encapsulates the common backend-agnostic logic of matching + columns in `Table.union`. + + It matches columns according to the provided matching settings and returns a + list of column sets to be merged. + + Each column set consists of a name of the resulting column and a list of + indices for columns in corresponding tables that will be merged to form this + result column. The first column index corresponds to the first table in the + input and so on. If no column corresponding to a given column set was matched + in a particular table, its entry will contain `Nothing` instead. + + The column sets are returned in the order in which the corresponding result + columns should appear in the resulting table. + + The method assumes at least one table is provided in its input. +match_columns tables matching_mode keep_unmatched_columns problem_builder = case matching_mode of + Match_Columns.By_Name -> case keep_unmatched_columns of + False -> + column_counts = find_column_counts tables + # This will only include columns that were present in all tables.
+ common_column_names = tables.first.column_names.filter name-> + column_counts.at name == tables.length + if common_column_names.is_empty then Error.throw No_Output_Columns else + common_column_names.map name-> + column_indices = tables.map table-> + table.column_names.index_of name + Column_Set.Value name column_indices + _ -> + output_column_names = distinct_columns_in_appearance_order tables + if keep_unmatched_columns == Report_Unmatched then + column_counts = find_column_counts tables + all_tables_count = tables.length + ## We iterate over output column names to get deterministic + order of unmatched columns. + unmatched_column_names = output_column_names.filter name-> + column_counts.get name 0 < all_tables_count + if unmatched_column_names.not_empty then + problem_builder.report_other_warning (Unmatched_Columns.Error unmatched_column_names) + output_column_names.map name-> + column_indices = tables.map table-> + table.columns.index_of col-> col.name==name + Column_Set.Value name column_indices + Match_Columns.By_Position -> + column_counts = tables.map table-> table.columns.length + minmax = column_counts.compute_bulk [Statistic.Minimum, Statistic.Maximum] + columns_to_take = if keep_unmatched_columns == False then minmax.first else minmax.second + if (minmax.first != minmax.second) && (keep_unmatched_columns == Report_Unmatched) then + problem_builder.report_other_warning (Column_Count_Mismatch.Error minmax.second minmax.first) + name_source = if keep_unmatched_columns == False then tables.first else + tables.find table-> table.columns.length == columns_to_take + column_sets = Vector.new columns_to_take i-> + name = name_source.at i . 
name + column_ids = tables.map table-> + column_count = table.columns.length + if i >= column_count then Nothing else i + Column_Set.Value name column_ids + column_sets + +type Column_Set + ## PRIVATE + Value (name : Text) (column_indices : Vector Integer) + + ## PRIVATE + resolve_columns self all_tables = self.column_indices.zip all_tables i-> parent_table-> + case i of + Nothing -> Nothing + _ : Integer -> parent_table.at i + +## PRIVATE + Returns a map indicating in how many tables a column with a given name appears. +find_column_counts tables = + tables.fold Map.empty current->table-> + table.columns.fold current counts-> column-> + name=column.name + new_count = counts.get name 0 + 1 + counts.insert name new_count + +## PRIVATE + Returns a list of distinct column names, in the order of first appearance, + starting from the first table. +distinct_columns_in_appearance_order tables = + names_builder = Vector.new_builder + tables.fold Map.empty current-> table-> + table.columns.fold current seen_names-> column-> + name = column.name + if seen_names.contains_key name then seen_names else + names_builder.append name + seen_names.insert name True + names_builder.to_vector diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Report_Unmatched.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Report_Unmatched.enso new file mode 100644 index 000000000000..588f69284335 --- /dev/null +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Report_Unmatched.enso @@ -0,0 +1,3 @@ +## A value that marks the mode that allows column mismatches but reports them as + a problem.
+type Report_Unmatched diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Storage.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Storage.enso index f3cf956735d3..578821e85c40 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Storage.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Storage.enso @@ -1,4 +1,11 @@ -import Standard.Base.Data.Vector.Vector +from Standard.Base import all +import Standard.Base.Error.Common.Index_Out_Of_Bounds +import Standard.Base.Error.Illegal_State.Illegal_State + +import Standard.Table.Data.Value_Type.Value_Type + +polyglot java import org.enso.table.data.column.builder.object.Builder +polyglot java import org.enso.table.data.column.storage.Storage as Java_Storage ## Represents different types of underlying storage for Columns. type Storage @@ -29,7 +36,46 @@ type Storage ## PRIVATE Enumerates storage types in a way that is consistent with `org.enso.table.data.Storage.Storage`, i.e. - `storage_type.at org.enso.table.data.Storage.Storage.LONG` will yield the + `storage_type.at org.enso.table.data.Storage.Type.LONG` will yield the corresponding `Storage.Integer`. types : Vector Storage types = [Storage.Any, Storage.Integer, Storage.Decimal, Storage.Text, Storage.Boolean, Storage.Date, Storage.Time_Of_Day, Storage.Date_Time] + + ## PRIVATE + Converts a `Storage` to a Java storage id. + to_java : Integer + to_java self = case self of + Storage.Any -> Java_Storage.Type.OBJECT + Storage.Integer -> Java_Storage.Type.LONG + Storage.Decimal -> Java_Storage.Type.DOUBLE + Storage.Text -> Java_Storage.Type.STRING + Storage.Boolean -> Java_Storage.Type.BOOL + Storage.Date -> Java_Storage.Type.DATE + Storage.Time_Of_Day -> Java_Storage.Type.TIME_OF_DAY + Storage.Date_Time -> Java_Storage.Type.DATE_TIME + + ## PRIVATE + Converts a Java storage id to a `Storage`. + from_java : Integer -> Storage + from_java id = + Storage.types.at id . 
catch Index_Out_Of_Bounds.Error _-> + Panic.throw (Illegal_State.Error "Unknown storage type: "+id.to_text) + + ## PRIVATE + Converts this storage type to a value type closest representing it. + to_approximate_value_type : Value_Type + to_approximate_value_type self = case self of + Storage.Text -> Value_Type.Char + Storage.Integer -> Value_Type.Integer + Storage.Decimal -> Value_Type.Float + Storage.Boolean -> Value_Type.Boolean + Storage.Date -> Value_Type.Date + Storage.Time_Of_Day -> Value_Type.Time + Storage.Date_Time -> Value_Type.Date_Time + Storage.Any -> Value_Type.Mixed + + ## PRIVATE + Creates a column storage builder for the given storage type. + make_builder : Storage -> Integer -> Builder + make_builder storage initial_size=64 = + Builder.getForType storage.to_java initial_size diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso index 073e6e5d30cf..ac2c7490c3b1 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso @@ -1,6 +1,6 @@ from Standard.Base import all -import Standard.Base.Data.Index_Sub_Range as Index_Sub_Range_Module import Standard.Base.Data.Array_Proxy.Array_Proxy +import Standard.Base.Data.Index_Sub_Range as Index_Sub_Range_Module import Standard.Base.Data.Ordering.Comparator import Standard.Base.Error.Common.Index_Out_Of_Bounds import Standard.Base.Error.Common.No_Such_Method @@ -11,13 +11,16 @@ import Standard.Base.Error.Incomparable_Values.Incomparable_Values import Standard.Base.Error.Unimplemented.Unimplemented import project.Data.Column.Column +import project.Data.Column as Column_Module import project.Data.Column_Name_Mapping.Column_Name_Mapping import project.Data.Column_Selector.Column_Selector import project.Data.Data_Formatter.Data_Formatter import project.Data.Join_Condition.Join_Condition import project.Data.Join_Kind.Join_Kind import 
project.Data.Match_Columns.Match_Columns +import project.Data.Match_Columns as Match_Columns_Helpers import project.Data.Position.Position +import project.Data.Report_Unmatched.Report_Unmatched import project.Data.Row.Row import project.Data.Storage.Storage import project.Data.Value_Type.Value_Type @@ -28,10 +31,10 @@ import project.Data.Storage.Storage import project.Internal.Aggregate_Column_Helper import project.Internal.Java_Problems import project.Internal.Join_Helpers -import project.Internal.Table_Helpers -import project.Internal.Table_Helpers.Table_Column_Helper import project.Internal.Parse_Values_Helper import project.Internal.Problem_Builder.Problem_Builder +import project.Internal.Table_Helpers +import project.Internal.Table_Helpers.Table_Column_Helper import project.Internal.Unique_Name_Strategy.Unique_Name_Strategy import project.Data.Expression.Expression import project.Data.Expression.Expression_Error @@ -44,6 +47,7 @@ from project.Errors import Column_Count_Mismatch, Missing_Input_Columns, Column_ from project.Data.Column import get_item_string from project.Internal.Filter_Condition_Helpers import make_filter_column +polyglot java import org.enso.table.data.column.builder.object.StorageTypeMismatch polyglot java import org.enso.table.data.table.Table as Java_Table polyglot java import org.enso.table.data.table.Column as Java_Column polyglot java import org.enso.table.data.table.join.Equals as Java_Join_Equals @@ -661,7 +665,6 @@ type Table problems = java_table.getProblems Java_Problems.parse_aggregated_problems problems - ## Parses columns within a Table to a specific value type. By default, it looks at all `Text` columns and attempts to deduce the type (columns with other types are not affected). If `column_types` are @@ -972,13 +975,17 @@ type Table example_columns = Examples.inventory_table.columns columns : Vector - columns self = Vector.from_polyglot_array self.java_table.getColumns . 
map Column.Value + columns self = Vector.from_polyglot_array <| + Array_Proxy.new self.java_table.getColumns.length i-> + Column.Value (self.java_table.getColumns.at i) ## UNSTABLE Returns the vector of column names contained in this table. column_names : Vector Text - column_names self = Vector.from_polyglot_array self.java_table.getColumns . map _.getName + column_names self = Vector.from_polyglot_array <| + Array_Proxy.new self.java_table.getColumns.length i-> + self.java_table.getColumns.at i . getName ## Returns a vector of rows contained in this table. @@ -1015,6 +1022,9 @@ type Table and an empty result is reported. - If a column index is invalid, an `Index_Out_Of_Bounds` is reported and an empty result is reported. + - If there are column names that are clashing between the two tables, a + `Duplicate_Output_Column_Names` is reported and the columns from the + table are renamed as described below. - If a join condition correlates columns whose types are not compatible (for example comparing numeric types with text), an `Invalid_Value_Type` is reported. @@ -1051,11 +1061,7 @@ type Table table.join other on=[Join_Condition.Equals "A" "A", Join_Condition.Equals "B" "B"] join : Table -> Join_Kind -> Join_Condition | Text | Vector (Join_Condition | Text) -> Text -> Problem_Behavior -> Table join self right join_kind=Join_Kind.Inner on=[Join_Condition.Equals 0 0] right_prefix="Right_" on_problems=Report_Warning = - if Table_Helpers.is_table right . not then Error.throw (Type_Error.Error Table right "right") else - same_backend = case right of - _ : Table -> True - _ -> False - if same_backend . not then Error.throw (Illegal_Argument.Error "Currently cross-backend joins are not supported. 
Materialize the table using `.read` before joining it with an in-memory Table.") else + if check_table "right" right then # [left_unmatched, matched, right_unmatched] rows_to_keep = case join_kind of Join_Kind.Inner -> [False, True, False] @@ -1083,6 +1089,116 @@ type Table problems = new_java_table.getProblems Java_Problems.parse_aggregated_problems problems + ## ALIAS append, concat + Appends records from other table(s) to this table. + + Arguments: + - tables: A single table or a vector of tables to append to this one. The + tables are concatenated in the order they are specified, with `self` + being the first one. + - match_columns: Specifies how to match the columns. + - If `Match_Columns.By_Name` - the columns are matched by name across + all provided tables. + If unmatched columns are to be dropped, the resulting table will keep + only the set of columns that appear in all provided tables, in the + relative order that they appeared in the `self` table. + If unmatched columns are kept, they are added in the order of + appearance - i.e. first all columns from `self` will be added in the + original order, then any columns from the second table that were not + matched will be added at the end (preserving their relative order), + and so on for all the remaining tables. + - If `Match_Columns.By_Position` - the columns are mapped by position. + If unmatched columns are to be dropped, the resulting table will have + as many columns as the table that had the fewest columns and the + column names of the first table (`self`) will be used. + If unmatched columns are kept, the resulting table will have as many + columns as the table with the most columns. Since the first table may + not have all the necessary columns to provide column names for the + result, the result will have column names taken from the first table + that has the biggest number of columns.
+ - keep_unmatched_columns: If set to `True`, unmatched columns are kept + and are padded with `Nothing` for tables that did not have them. + If set to `False`, only the common subset of columns is kept - any + column that is not present in all tables is dropped. Defaults to + `Report_Unmatched`, which behaves like `True` - unmatched columns are + kept and padded with `Nothing`, but a problem is reported. + - allow_type_widening: Specifies if the resulting column type should be + adjusted to fit columns from all arguments. If `True`, a common type + will be chosen for each column (see "Unifying Column Types" below). + If `False`, the resulting column type will be the same as in the first + table containing the column. In this case, all columns that are + concatenated must have the same type as the first one (unless this + had a `Mixed` type - in which case it will accept any other types). + - on_problems: Specifies how to handle problems if they occur, reporting + them as warnings by default. + + - If `keep_unmatched_columns` is set to `Report_Unmatched` (the + default): + - If matching by name and there are columns that are not present in + all tables, `Unmatched_Columns` is reported. + - If matching by position and column counts of the merged tables + differ, then a `Column_Count_Mismatch` is reported. The error will + contain the greatest column count as its `expected` value and the + smallest one as its `actual` value. + - If `keep_unmatched_columns` is set to `False` and matching by name, + it is possible that there are no columns that are common to all + provided tables, in that case `No_Output_Columns` is thrown as a + dataflow error regardless of the `on_problems` setting, because there + are no columns to include in the resulting table. + - If type widening is disabled and one of corresponding columns has a + type that is incompatible with the type coming from the first table, + a `Column_Type_Mismatch` is reported. 
The problematic column will be + dropped from the resulting table. With type widening disabled, the + subsequent tables must have the same types as the first one, unless + the type of the first one was `Mixed` which will accept any other + type. + - If a common type coercion for a set of matched columns from + concatenated tables cannot be found, a `No_Common_Type` is reported. + In warning or ignore mode, the problematic column will be dropped + from the resulting table. + + ? Unifying Column Types + + If `allow_type_widening` is set to `True`, then the following rules are + used to find a common type that will fit values from all merged tables. + + Numeric columns are unified by finding the most general type that can + fit all of the columns. The biggest integer type will be chosen and if + integers and decimals are mixed, the decimal type will be chosen. + If boolean columns are mixed with numeric columns, they will be coerced + to the numeric type (and converted to 0 and 1). + + Text types will also be coerced according to the common rules - if + constant-length texts of different lengths are mixed, they will be + coerced to a varying-length type. + + If one of the matched columns has `Mixed` type, that type will be used + regardless of types of other columns. Mixing any other types will + result in a `No_Common_Type` problem. If columns of incompatible types + are meant to be mixed, at least one of them should be explicitly + retyped to the `Mixed` type to indicate that intention. 
+    union : (Table | Vector Table) -> Match_Columns -> Boolean | Report_Unmatched -> Boolean -> Problem_Behavior -> Table
+    union self tables match_columns=Match_Columns.By_Name keep_unmatched_columns=Report_Unmatched allow_type_widening=True on_problems=Report_Warning =
+        all_tables = case tables of
+            more_tables : Vector -> [self] + more_tables
+            other_table -> [self, other_table]
+        # `inputs_valid` is never actually False: every check either returns
+        # True or a dataflow error, which then propagates through the `if`.
+        inputs_valid = all_tables.all (check_table "tables")
+        if inputs_valid then
+            problems = Problem_Builder.new
+            column_sets = Match_Columns_Helpers.match_columns all_tables match_columns keep_unmatched_columns problems
+            total_rows = all_tables.fold 0 acc-> table-> acc + table.row_count
+            merge_set column_set =
+                unified = Table_Helpers.unify_result_type_for_union column_set all_tables allow_type_widening problems
+                case unified of
+                    Nothing -> Nothing
+                    result_type : Value_Type -> concat_columns column_set all_tables result_type total_rows
+            kept_columns = column_sets.map merge_set . filter Filter_Condition.Not_Nothing
+            if kept_columns.is_empty then Error.throw No_Output_Columns else
+                problems.attach_problems_before on_problems <|
+                    Table.new kept_columns
+
     ## ALIAS dropna
        ALIAS drop_missing_rows
        Remove rows which are all blank or containing blank values.
@@ -1126,33 +1242,6 @@ type Table
         cols = self.columns
         Table.new [["Column", cols.map .name], ["Items Count", cols.map .count], ["Storage Type", cols.map .storage_type]]
 
-    ## UNSTABLE
-
-       Concatenates `other` to `self`, resulting in a table with the number of rows
-       being the sum of numbers of rows of `tables`. Any column that is present in
-       some tables, but missing in others, will be `Nothing`-padded in the positions
-       corresponding to the missing values.
-
-       Arguments:
-       - other: The table or vector of tables to concatenate to `self`.
-
-       Any column that is present in one table, but missing in another, will be
-       `Nothing`-padded in the positions corresponding to the missing column.
-
-       > Example
-         Concatenate two tables together.
-
-             import Standard.Examples
-
-             example_concat =
-                 Examples.inventory_table.concat Examples.popularity_table
-    concat : Table | Vector Table -> Table
-    concat self other = case other of
-        _ : Vector ->
-            java_tables = Vector.new (other.length + 1) i->(if i==0 then self else other.at i).java_table
-            Table.Value (Java_Table.concat java_tables.to_array)
-        Table.Value other_java_table -> Table.Value (Java_Table.concat [self.java_table, other_java_table].to_array)
-
     ## Returns a new table with a chosen subset of columns left unchanged
        and the other columns pivoted to rows with a single name field and a
        single value field.
@@ -1468,3 +1557,28 @@ make_join_helpers left_table right_table =
     make_between _ left right_lower right_upper =
         Java_Join_Between.new left.java_column right_lower.java_column right_upper.java_column
     Join_Helpers.Join_Condition_Resolver.Value (left_table.at _) (right_table.at _) make_equals make_equals_ignore_case make_between
+
+## PRIVATE
+   Validates that `table` is a table at all and that it is an in-memory one.
+   Yields True on success; otherwise a dataflow error describing the problem.
+check_table arg_name table =
+    is_any_table = Table_Helpers.is_table table
+    case is_any_table of
+        False -> Error.throw (Type_Error.Error Table table arg_name)
+        True ->
+            cross_backend_error = Illegal_Argument.Error "Currently cross-backend operations are not supported. Materialize the table using `.read` before mixing it with an in-memory Table."
+            if table.is_a Table then True else Error.throw cross_backend_error
+
+## PRIVATE
+   A helper that efficiently concatenates storages of in-memory columns.
+concat_columns column_set all_tables result_type result_row_count =
+    storage_builder = Column_Module.make_storage_builder_for_type result_type initial_size=result_row_count  # sized up front for all rows of the result
+    column_set.column_indices.zip all_tables i-> parent_table->  # pairs each table with its (possibly missing) column index
+        case i of
+            Nothing ->  # no column from this table in the set: pad with one null per row of that table
+                null_row_count = parent_table.row_count
+                storage_builder.appendNulls null_row_count
+            _ : Integer ->  # matched column: append its whole storage in a single bulk call
+                storage = parent_table.at i . java_column . getStorage
+                storage_builder.appendBulkStorage storage
+    Column.from_storage column_set.name storage_builder.seal
diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Value_Type.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Value_Type.enso
index 9fa1421dbb76..d9b52667598f 100644
--- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Value_Type.enso
+++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Value_Type.enso
@@ -15,6 +15,16 @@ type Bits
     ## 64-bit (8 byte) value
     Bits_64
 
+    ## PRIVATE
+    to_bits : Integer  # the width of this size variant, in bits
+    to_bits self = case self of
+        Bits.Bits_16 -> 16
+        Bits.Bits_32 -> 32
+        Bits.Bits_64 -> 64
+
+    ## PRIVATE
+    compare_to self other = self.to_bits . compare_to other.to_bits  # orders sizes by bit width; presumably what `Math.max` in `reconcile_types` relies on - confirm
+
 ## Represents the different possible types of values within RDBMS columns.
 type Value_Type
     ## Boolean or Bit value: 0 or 1.
@@ -97,3 +107,71 @@ type Value_Type
     expect_boolean value_type ~action = case value_type of
         Value_Type.Boolean -> action
         _ -> Error.throw (Invalid_Value_Type.Error Value_Type.Boolean value_type)
+
+    ## PRIVATE
+       Finds a type that can fit both a current type and a new type; yields `Mixed` whenever the pair cannot be reconciled.
+    reconcile_types current new = case current of
+        Value_Type.Mixed -> Value_Type.Mixed  # Mixed absorbs every other type
+        Value_Type.Integer size -> case new of
+            Value_Type.Integer new_size ->
+                Value_Type.Integer (Math.max size new_size)
+            Value_Type.Byte -> Value_Type.Integer size  # the current integer size is kept
+            Value_Type.Boolean -> Value_Type.Integer size
+            # If we unify integers with floats, we select the default Float 64 regardless of the input sizes.
+            Value_Type.Float _ -> Value_Type.Float
+            _ -> Value_Type.Mixed
+        Value_Type.Float size -> case new of
+            Value_Type.Float new_size ->
+                Value_Type.Float (Math.max size new_size)
+            # If we unify integers with floats, we select the default Float 64 regardless of the input sizes.
+            Value_Type.Integer _ -> Value_Type.Float
+            Value_Type.Byte -> Value_Type.Float  # note: falls back to the default Float, not `Float size`
+            Value_Type.Boolean -> Value_Type.Float
+            _ -> Value_Type.Mixed
+        Value_Type.Byte -> case new of
+            Value_Type.Byte -> Value_Type.Byte
+            Value_Type.Integer size ->
+                Value_Type.Integer size
+            Value_Type.Boolean -> Value_Type.Byte
+            Value_Type.Float _ -> Value_Type.Float
+            _ -> Value_Type.Mixed
+        Value_Type.Boolean -> case new of  # Boolean always gives way to the numeric type it meets
+            Value_Type.Boolean -> Value_Type.Boolean
+            Value_Type.Integer size ->
+                Value_Type.Integer size
+            Value_Type.Byte -> Value_Type.Byte
+            Value_Type.Float _ -> Value_Type.Float
+            _ -> Value_Type.Mixed
+        Value_Type.Char current_size current_variable -> case new of
+            Value_Type.Char new_size new_variable ->
+                result_variable = current_variable || new_variable || current_size != new_size  # differing sizes also force a variable-length result
+                case result_variable of
+                    True -> Value_Type.Char Nothing True
+                    False -> Value_Type.Char current_size False
+            _ -> Value_Type.Mixed
+        Value_Type.Binary current_size current_variable -> case new of  # same rule as for Char above
+            Value_Type.Binary new_size new_variable ->
+                result_variable = current_variable || new_variable || current_size != new_size
+                case result_variable of
+                    True -> Value_Type.Binary Nothing True
+                    False -> Value_Type.Binary current_size False
+            _ -> Value_Type.Mixed
+        _ ->  # every remaining type only unifies with an equal type
+            if current == new then current else Value_Type.Mixed
+
+    ## PRIVATE
+       Finds the most specific value type that will fit all the provided types.
+
+       If `strict` is `True`, it is implemented as specified in the note
+       "Unifying Column Types" in `Table.union`. In that case, if no common type
+       is found, `Nothing` is returned.
+
+       It assumes that the `types` vector is not empty.
+    find_common_type : Vector Value_Type -> Boolean -> Value_Type | Nothing
+    find_common_type types strict =
+        most_generic_type = (types.drop 1).fold types.first Value_Type.reconcile_types
+        if strict.not || most_generic_type != Value_Type.Mixed then most_generic_type else
+            # `Mixed` is a legitimate strict result only if one of the inputs already was `Mixed`.
+            if types.contains Value_Type.Mixed then Value_Type.Mixed else
+                Nothing
+
diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Errors.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Errors.enso
index 70fb729f8be8..7028c7a9c771 100644
--- a/distribution/lib/Standard/Table/0.0.0-dev/src/Errors.enso
+++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Errors.enso
@@ -2,6 +2,7 @@ from Standard.Base import all
 polyglot java import org.enso.table.error.ColumnCountMismatchException
 polyglot java import org.enso.table.error.ColumnNameMismatchException
 
+
 type Missing_Input_Columns
     ## PRIVATE
     One or more columns not found in the input table.
@@ -319,3 +320,41 @@ type Invalid_JSON_Format
     to_display_text : Text
     to_display_text self =
         "The input " + self.input.to_text + " had an invalid format due to: " + self.message.to_text + "."
+
+type Column_Type_Mismatch
+    ## UNSTABLE
+
+       An error indicating a mismatch of column types of merged columns.
+    Error (column_name : Text) (expected_type : Text) (got_type : Text)
+
+    ## PRIVATE
+
+       Create a human-readable version of the error. The types are converted with `to_text`, because callers such as `unify_result_type_for_union` store `Value_Type` values in these fields rather than Text.
+    to_display_text : Text
+    to_display_text self =
+        "The column ["+self.column_name+"] expects type "+self.expected_type.to_text+" but one of the provided tables had type "+self.got_type.to_text+" which is not compatible with it."
+
+type No_Common_Type
+    ## UNSTABLE
+
+       An error indicating that no common type could be found for the merged
+       columns.
+    Error (column_name : Text)
+
+    ## PRIVATE
+
+       Create a human-readable version of the error.
+    to_display_text : Text
+    to_display_text self =
+        "No common type could have been found for the columns corresponding to ["+self.column_name+"]. If you want to allow mixed types, please retype the columns to the `Mixed` before the concatenation (note however that most Database backends do not support `Mixed` types, so it may work only for the in-memory backend)."
+
+type Unmatched_Columns
+    ## UNSTABLE
+
+       An error indicating that some columns were not present in all of the
+       merged tables.
+    Error (column_names : Vector Text)
+
+    to_display_text : Text
+    to_display_text self =
+        "The following columns were not present in some of the provided tables: " + (self.column_names.map (n -> "["+n+"]") . join ", ") + ". The missing values have been filled with `Nothing`."
diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Table_Helpers.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Table_Helpers.enso
index 2702dcf75941..990cd8b30764 100644
--- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Table_Helpers.enso
+++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Table_Helpers.enso
@@ -11,11 +11,12 @@ import project.Data.Position.Position
 import project.Data.Sort_Column_Selector.Sort_Column_Selector
 import project.Data.Sort_Column.Sort_Column
 import project.Data.Table.Table
+import project.Data.Value_Type.Value_Type
 import project.Internal.Problem_Builder.Problem_Builder
 import project.Internal.Unique_Name_Strategy.Unique_Name_Strategy
 
 from project.Data.Aggregate_Column.Aggregate_Column import Minimum, Maximum
-from project.Errors import Missing_Input_Columns, No_Output_Columns, Too_Many_Column_Names_Provided, No_Input_Columns_Selected
+from project.Errors import Missing_Input_Columns, No_Output_Columns, Too_Many_Column_Names_Provided, No_Input_Columns_Selected, No_Common_Type, Column_Type_Mismatch
 
 polyglot java import java.util.HashSet
 
@@ -631,3 +632,26 @@ filter_blank_rows table when_any treat_nans_as_blank =
 is_table obj =
     known_types = ["Standard.Table.Data.Table.Table", "Standard.Database.Data.Table.Table"]
     known_types.contains (Meta.get_qualified_type_name obj)
+
+## PRIVATE
+   Decides the result type of one matched column set of a union, or reports a
+   problem and yields `Nothing` when the set has to be dropped.
+unify_result_type_for_union column_set all_tables allow_type_widening problem_builder =
+    resolved = column_set.resolve_columns all_tables
+    is_present column = column.is_nothing.not
+    case allow_type_widening of
+        False ->
+            expected_type = resolved.find is_present . value_type
+            if expected_type == Value_Type.Mixed then Value_Type.Mixed else
+                mismatched = resolved.find if_missing=Nothing column->
+                    is_present column && column.value_type != expected_type
+                if mismatched.is_nothing then expected_type else
+                    problem_builder.report_other_warning (Column_Type_Mismatch.Error column_set.name expected_type mismatched.value_type)
+                    Nothing
+        True ->
+            # Only columns that actually exist in some table take part in the unification.
+            present_types = resolved.filter Filter_Condition.Not_Nothing . map .value_type
+            widened_type = Value_Type.find_common_type present_types strict=True
+            if widened_type.is_nothing then
+                problem_builder.report_other_warning (No_Common_Type.Error column_set.name)
+            widened_type
diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Main.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Main.enso
index 4090f8f4d4ac..25fb713b3ed3 100644
--- a/distribution/lib/Standard/Table/0.0.0-dev/src/Main.enso
+++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Main.enso
@@ -9,6 +9,7 @@ import project.Data.Join_Condition.Join_Condition
 import project.Data.Join_Kind.Join_Kind
 import project.Data.Match_Columns.Match_Columns
 import project.Data.Position.Position
+import project.Data.Report_Unmatched.Report_Unmatched
 import project.Data.Sort_Column.Sort_Column
 import project.Data.Sort_Column_Selector.Sort_Column_Selector
 import project.Data.Table.Table
@@ -28,6 +29,7 @@ export project.Data.Join_Condition.Join_Condition
 export project.Data.Join_Kind.Join_Kind
export project.Data.Match_Columns.Match_Columns export project.Data.Position.Position +export project.Data.Report_Unmatched.Report_Unmatched export project.Data.Sort_Column.Sort_Column export project.Data.Sort_Column_Selector.Sort_Column_Selector export project.Data.Table.Table diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/BoolBuilder.java b/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/BoolBuilder.java index bf4fd8c37f75..e4b50914ca94 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/BoolBuilder.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/BoolBuilder.java @@ -2,6 +2,7 @@ import org.enso.table.data.column.storage.BoolStorage; import org.enso.table.data.column.storage.Storage; +import org.enso.table.util.BitSets; import java.util.BitSet; @@ -61,6 +62,21 @@ public void appendNulls(int count) { size += count; } + @Override + public void appendBulkStorage(Storage storage) { + if (storage.getType() == getType()) { + if (storage instanceof BoolStorage boolStorage) { + BitSets.copy(boolStorage.getValues(), vals, size, boolStorage.size()); + BitSets.copy(boolStorage.getIsMissing(), isNa, size, boolStorage.size()); + size += boolStorage.size(); + } else { + throw new IllegalStateException("Unexpected storage implementation for type BOOLEAN: " + storage + ". 
This is a bug in the Table library."); + } + } else { + throw new StorageTypeMismatch(getType(), storage.getType()); + } + } + @Override public Storage seal() { return new BoolStorage(vals, isNa, size, false); diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/Builder.java b/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/Builder.java index 90552a8164e5..e521877ddbaa 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/Builder.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/Builder.java @@ -46,9 +46,24 @@ public static Builder getForType(int type, int size) { */ public abstract void appendNulls(int count); - /** @return the number of appended elements */ + /** + * Appends the whole contents of some other storage. + * + *

This may be used to efficiently copy a whole storage into the builder. Used for example when + * concatenating columns. + * + *

If the provided storage type is not compatible with the type of this builder, a {@code + * StorageTypeMismatch} exception may be thrown. + */ + public abstract void appendBulkStorage(Storage storage); + + /** + * @return the number of appended elements + */ public abstract int getCurrentSize(); - /** @return a storage containing all the items appended so far */ + /** + * @return a storage containing all the items appended so far + */ public abstract Storage seal(); } diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/InferredBuilder.java b/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/InferredBuilder.java index 1d4f1c337602..fc74ba574801 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/InferredBuilder.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/InferredBuilder.java @@ -78,6 +78,13 @@ public void appendNulls(int count) { currentSize += count; } + @Override + public void appendBulkStorage(Storage storage) { + for (int i = 0; i < storage.size(); i++) { + append(storage.getItemBoxed(i)); + } + } + private void initBuilderFor(Object o) { int initialCapacity = Math.max(initialSize, currentSize); if (o instanceof Boolean) { diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/NumericBuilder.java b/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/NumericBuilder.java index 065a067a6507..e727e01c24f9 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/NumericBuilder.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/NumericBuilder.java @@ -1,9 +1,11 @@ package org.enso.table.data.column.builder.object; import org.enso.base.polyglot.NumericConverter; +import org.enso.table.data.column.storage.BoolStorage; import org.enso.table.data.column.storage.DoubleStorage; import org.enso.table.data.column.storage.LongStorage; import 
org.enso.table.data.column.storage.Storage;
+import org.enso.table.util.BitSets;
 
 import java.util.Arrays;
 import java.util.BitSet;
@@ -99,6 +101,115 @@ public void appendNulls(int count) {
     currentSize += count;
   }
 
+  @Override
+  public void appendBulkStorage(Storage storage) {
+    if (isDouble) {
+      appendBulkDouble(storage);
+    } else {
+      appendBulkLong(storage);
+    }
+  }
+
+  /** Grows {@code data} if needed, so that {@code additionalSize} more items fit at the end. */
+  private void ensureFreeSpaceFor(int additionalSize) {
+    if (currentSize + additionalSize > data.length) {
+      grow(currentSize + additionalSize);
+    }
+  }
+
+  /**
+   * Bulk-appends a storage to a builder holding doubles (kept as raw long bits in {@code data}).
+   *
+   * <p>DOUBLE storages are copied directly; LONG and BOOL storages are converted item by item.
+   * Any other storage type results in a {@code StorageTypeMismatch}.
+   */
+  private void appendBulkDouble(Storage storage) {
+    if (storage.getType() == Storage.Type.DOUBLE) {
+      if (storage instanceof DoubleStorage doubleStorage) {
+        int n = doubleStorage.size();
+        ensureFreeSpaceFor(n);
+        System.arraycopy(doubleStorage.getRawData(), 0, data, currentSize, n);
+        BitSets.copy(doubleStorage.getIsMissing(), isMissing, currentSize, n);
+        currentSize += n;
+      } else {
+        throw new IllegalStateException("Unexpected storage implementation for type DOUBLE: " + storage + ". This is a bug in the Table library.");
+      }
+    } else if (storage.getType() == Storage.Type.LONG) {
+      if (storage instanceof LongStorage longStorage) {
+        int n = longStorage.size();
+        // The loop below writes straight into `data`, so the capacity must be reserved up front.
+        ensureFreeSpaceFor(n);
+        BitSets.copy(longStorage.getIsMissing(), isMissing, currentSize, n);
+        for (int i = 0; i < n; i++) {
+          data[currentSize++] = Double.doubleToRawLongBits(longStorage.getItem(i));
+        }
+      } else {
+        throw new IllegalStateException("Unexpected storage implementation for type LONG: " + storage + ". This is a bug in the Table library.");
+      }
+    } else if (storage.getType() == Storage.Type.BOOL) {
+      if (storage instanceof BoolStorage boolStorage) {
+        int n = boolStorage.size();
+        ensureFreeSpaceFor(n);
+        for (int i = 0; i < n; i++) {
+          if (boolStorage.isNa(i)) {
+            isMissing.set(currentSize++);
+          } else {
+            double x = booleanAsDouble(boolStorage.getItem(i));
+            data[currentSize++] = Double.doubleToRawLongBits(x);
+          }
+        }
+      } else {
+        throw new IllegalStateException("Unexpected storage implementation for type BOOLEAN: " + storage + ". This is a bug in the Table library.");
+      }
+    } else {
+      throw new StorageTypeMismatch(getType(), storage.getType());
+    }
+  }
+
+  /**
+   * Bulk-appends a storage to a builder holding longs.
+   *
+   * <p>LONG storages are copied directly; BOOL storages are converted to 0/1 item by item. Any
+   * other storage type results in a {@code StorageTypeMismatch}.
+   */
+  private void appendBulkLong(Storage storage) {
+    if (storage.getType() == Storage.Type.LONG) {
+      if (storage instanceof LongStorage longStorage) {
+        int n = longStorage.size();
+        ensureFreeSpaceFor(n);
+        System.arraycopy(longStorage.getRawData(), 0, data, currentSize, n);
+        BitSets.copy(longStorage.getIsMissing(), isMissing, currentSize, n);
+        currentSize += n;
+      } else {
+        throw new IllegalStateException("Unexpected storage implementation for type LONG: " + storage + ". This is a bug in the Table library.");
+      }
+    } else if (storage.getType() == Storage.Type.BOOL) {
+      if (storage instanceof BoolStorage boolStorage) {
+        int n = boolStorage.size();
+        ensureFreeSpaceFor(n);
+        for (int i = 0; i < n; i++) {
+          if (boolStorage.isNa(i)) {
+            isMissing.set(currentSize++);
+          } else {
+            data[currentSize++] = booleanAsLong(boolStorage.getItem(i));
+          }
+        }
+      } else {
+        throw new IllegalStateException("Unexpected storage implementation for type BOOLEAN: " + storage + ". This is a bug in the Table library.");
+      }
+    } else {
+      throw new StorageTypeMismatch(getType(), storage.getType());
+    }
+  }
+
+  private long booleanAsLong(boolean value) {
+    return value ? 1 : 0;
+  }
+
+  private double booleanAsDouble(boolean value) {
+    return value ? 1.0 : 0.0;
+  }
+
   /**
    * Append a new item in raw form to this builder, assuming that it has enough allocated space.
    *
@@ -155,6 +266,10 @@ private void grow() {
     if (data.length > 1) {
       desiredCapacity = (data.length * 3 / 2);
     }
+    grow(desiredCapacity);
+  }
+
+  private void grow(int desiredCapacity) {
     this.data = Arrays.copyOf(data, desiredCapacity);
   }
 }
diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/ObjectBuilder.java b/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/ObjectBuilder.java
index 32eaa4b4389e..5927330cc6d6 100644
--- a/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/ObjectBuilder.java
+++ b/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/ObjectBuilder.java
@@ -1,6 +1,7 @@
 package org.enso.table.data.column.builder.object;
 
 import org.enso.table.data.column.storage.ObjectStorage;
+import org.enso.table.data.column.storage.SpecializedStorage;
 import org.enso.table.data.column.storage.Storage;
 
 import java.util.Arrays;
@@ -61,6 +62,23 @@ public void appendNulls(int count) {
     currentSize += count;
   }
 
+  @Override
+  public void appendBulkStorage(Storage storage) {
+    if (currentSize + storage.size() > data.length) {
+      grow(currentSize + storage.size());
+    }
+
+    if (storage instanceof SpecializedStorage specializedStorage) {
+      System.arraycopy(specializedStorage.getData(), 0, data, currentSize, storage.size());
+      currentSize += storage.size();
+    } else {
+      int n = storage.size();
+      for (int i = 0; i < n; i++) {
+        data[currentSize++] = storage.getItemBoxed(i);
+      }
+    }
+  }
+
   @Override
   public int getCurrentSize() {
     return currentSize;
diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/StorageTypeMismatch.java b/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/StorageTypeMismatch.java
new file mode 100644
index 000000000000..a5761baafb8d
--- /dev/null
+++ b/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/StorageTypeMismatch.java
@@ -0,0 +1,24 @@
+package org.enso.table.data.column.builder.object;
+
+public class StorageTypeMismatch extends RuntimeException {
+  private final int expectedType;
+  private final int gotType;
+
+  public StorageTypeMismatch(int expectedType, int gotType) {
+    this.expectedType = expectedType;
+    this.gotType = gotType;
+  }
+
+  @Override
+  public String getMessage() {
+    return "Expected storage of type "
+        + expectedType
+        + ", got "
+        + gotType
+        + ". This is a bug in the Table library.";
+  }
+
+  public int gotType() {
+    return gotType;
+  }
+}
diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/TypedBuilderImpl.java b/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/TypedBuilderImpl.java
index f467a02844da..8941a74f5415 100644
--- a/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/TypedBuilderImpl.java
+++ b/std-bits/table/src/main/java/org/enso/table/data/column/builder/object/TypedBuilderImpl.java
@@ -1,5 +1,6 @@
 package org.enso.table.data.column.builder.object;
 
+import org.enso.table.data.column.storage.SpecializedStorage;
 import org.enso.table.data.column.storage.Storage;
 
 import java.util.Arrays;
@@ -50,6 +51,30 @@ public void appendNulls(int count) {
     currentSize += count;
   }
 
+  @Override
+  public void appendBulkStorage(Storage storage) {
+    if (storage.getType() == getType()) {
+      if (storage instanceof SpecializedStorage) {
+        // This cast is safe, because storage.getType() == this.getType() iff storage.T == this.T
+        @SuppressWarnings("unchecked")
+        SpecializedStorage<T> specializedStorage = (SpecializedStorage<T>) storage;
+        // NOTE(review): unlike ObjectBuilder.appendBulkStorage, nothing grows `data` before this
+        // copy - confirm that callers always pre-size the builder, or add a capacity check here.
+        System.arraycopy(specializedStorage.getData(), 0, data, currentSize, storage.size());
+        currentSize += storage.size();
+      } else {
+        throw new IllegalStateException(
+            "Unexpected storage implementation for type "
+                + storage.getType()
+                + ": "
+                + storage
+                + ". This is a bug in the Table library.");
+      }
+    } else {
+      throw new StorageTypeMismatch(getType(), storage.getType());
+    }
+  }
+
   @Override
   public int getCurrentSize() {
     return currentSize;
diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/DoubleStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/DoubleStorage.java
index 56f61b963094..a9b08e6e7e62 100644
--- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/DoubleStorage.java
+++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/DoubleStorage.java
@@ -169,6 +169,10 @@ public BitSet getIsMissing() {
     return isMissing;
   }
 
+  public long[] getRawData() {
+    return data; // the backing array itself, not a copy
+  }
+
   private static MapOpStorage buildOps() {
     MapOpStorage ops = new MapOpStorage<>();
     ops.add(
diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/LongStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/LongStorage.java
index 3d056d9f9d67..c5558d8b4a02 100644
--- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/LongStorage.java
+++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/LongStorage.java
@@ -186,6 +186,10 @@ public BitSet getIsMissing() {
     return isMissing;
   }
 
+  public long[] getRawData() {
+    return data; // the backing array itself, not a copy
+  }
+
   private static MapOpStorage buildOps() {
     MapOpStorage ops = new MapOpStorage<>();
     ops.add(
diff --git a/std-bits/table/src/main/java/org/enso/table/util/BitSets.java b/std-bits/table/src/main/java/org/enso/table/util/BitSets.java
new file mode 100644
index 000000000000..3bdc20d5a7fb
--- /dev/null
+++ b/std-bits/table/src/main/java/org/enso/table/util/BitSets.java
@@ -0,0 +1,22 @@
+package org.enso.table.util;
+
+import java.util.BitSet;
+
+public class BitSets {
+  /**
+   * A utility to copy a part of one bitset onto another, with a possible destination offset.
+   *
+   * <p>Only set bits are transferred: for every {@code i} in {@code [0, length)} for which {@code
+   * source.get(i)} is true, bit {@code destinationOffset + i} of the destination is set. Bits that
+   * are already set in the destination are never cleared.
+   *
+   * <p>Unfortunately BitSet does not provide a fast way to do this. We could try to implement
+   * something on our own that would operate on whole longs instead of bit by bit.
+   */
+  public static void copy(BitSet source, BitSet destination, int destinationOffset, int length) {
+    // Jump from one set bit to the next instead of probing every index.
+    for (int i = source.nextSetBit(0); i >= 0 && i < length; i = source.nextSetBit(i + 1)) {
+      destination.set(destinationOffset + i);
+    }
+  }
+}
diff --git a/test/Table_Tests/src/Common_Table_Operations/Join_Spec.enso b/test/Table_Tests/src/Common_Table_Operations/Join_Spec.enso
index f67d5b27862f..bdb744bb6dd8 100644
--- a/test/Table_Tests/src/Common_Table_Operations/Join_Spec.enso
+++ b/test/Table_Tests/src/Common_Table_Operations/Join_Spec.enso
@@ -10,7 +10,7 @@ from Standard.Database.Errors import Unsupported_Database_Operation
 from Standard.Test import Test, Problems
 import Standard.Test.Extensions
 
-from project.Common_Table_Operations.Util import expect_column_names, run_default_backend
+from project.Common_Table_Operations.Util import expect_column_names, run_default_backend, within_table
 
 type My_Type
     Value x y
@@ -24,8 +24,6 @@ type My_Type
 
 main = run_default_backend spec
 
-within_table table = Test.with_clue 'Resulting table:\n'+table.display+'\n\n'
-
 spec setup =
     prefix = setup.prefix
     table_builder = setup.table_builder
diff --git a/test/Table_Tests/src/Common_Table_Operations/Main.enso b/test/Table_Tests/src/Common_Table_Operations/Main.enso
index e35fc7cf4a22..ac11df692ef7 100644
--- a/test/Table_Tests/src/Common_Table_Operations/Main.enso
+++ b/test/Table_Tests/src/Common_Table_Operations/Main.enso
@@ -1,5 +1,6 @@
 from Standard.Base import all
 
+import project.Common_Table_Operations.Aggregate_Spec
 import project.Common_Table_Operations.Column_Operations_Spec
 import project.Common_Table_Operations.Core_Spec
 import project.Common_Table_Operations.Distinct_Spec
@@ -11,8 +12,7 @@ import project.Common_Table_Operations.Missing_Values_Spec
 import project.Common_Table_Operations.Order_By_Spec
 import
project.Common_Table_Operations.Select_Columns_Spec import project.Common_Table_Operations.Take_Drop_Spec -import project.Common_Table_Operations.Aggregate_Spec -import project.Common_Table_Operations.Aggregate_Spec.Test_Selection as Aggregate_Test_Selection +import project.Common_Table_Operations.Union_Spec from project.Common_Table_Operations.Util import run_default_backend @@ -95,6 +95,7 @@ spec setup = Take_Drop_Spec.spec setup Expression_Spec.spec detailed=False setup Join_Spec.spec setup + Union_Spec.spec setup Distinct_Spec.spec setup Integration_Tests.spec setup diff --git a/test/Table_Tests/src/Common_Table_Operations/Union_Spec.enso b/test/Table_Tests/src/Common_Table_Operations/Union_Spec.enso new file mode 100644 index 000000000000..656c4e8e138b --- /dev/null +++ b/test/Table_Tests/src/Common_Table_Operations/Union_Spec.enso @@ -0,0 +1,295 @@ +from Standard.Base import all + +from Standard.Table import all +from Standard.Table.Errors import all +import Standard.Table.Data.Value_Type.Value_Type + +from Standard.Database.Errors import Unsupported_Database_Operation + +from Standard.Test import Test, Problems +import Standard.Test.Extensions + +from project.Common_Table_Operations.Util import expect_column_names, run_default_backend, within_table +import project.Util + +type My_Type + Value x y + +main = run_default_backend spec + +spec setup = + prefix = setup.prefix + table_builder = setup.table_builder + db_todo = if prefix.contains "In-Memory" then Nothing else "Union API is not yet implemented for the DB backend." + Test.group prefix+"Table.union" pending=db_todo <| + Test.specify "should merge columns from multiple tables" <| + t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]] + t2 = table_builder [["A", [4, 5, 6]], ["B", ["d", "e", "f"]]] + t3 = table_builder [["A", [7, 8, 9]], ["B", ["g", "h", "i"]]] + + t4 = t1.union t2 + expect_column_names ["A", "B"] t4 + t4.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6] + t4.at "B" . 
to_vector . should_equal ["a", "b", "c", "d", "e", "f"] + + t5 = t3.union [t1, t2] + expect_column_names ["A", "B"] t5 + t5.at "A" . to_vector . should_equal [7, 8, 9, 1, 2, 3, 4, 5, 6] + t5.at "B" . to_vector . should_equal ["g", "h", "i", "a", "b", "c", "d", "e", "f"] + + Test.specify "should fill unmatched columns (by name matching) with nulls and report a warning by default" <| + t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]] + t2 = table_builder [["C", ["d", "e", "f"]], ["A", [4, 5, 6]]] + t3 = table_builder [["D", [Nothing, Nothing, 0]], ["C", ["g", "h", "i"]]] + + action = t1.union [t2, t3] on_problems=_ + tester table = + expect_column_names ["A", "B", "C", "D"] table + table.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6, Nothing, Nothing, Nothing] + table.at "B" . to_vector . should_equal ["a", "b", "c", Nothing, Nothing, Nothing, Nothing, Nothing, Nothing] + table.at "C" . to_vector . should_equal [Nothing, Nothing, Nothing, "d", "e", "f", "g", "h", "i"] + table.at "D" . to_vector . should_equal [Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, 0] + problems = [Unmatched_Columns.Error ["A", "B", "C", "D"]] + Problems.test_problem_handling action problems tester + + action2 = t2.union t3 on_problems=_ + tester2 table = + expect_column_names ["C", "A", "D"] table + table.at "C" . to_vector . should_equal ["d", "e", "f", "g", "h", "i"] + table.at "A" . to_vector . should_equal [4, 5, 6, Nothing, Nothing, Nothing] + table.at "D" . to_vector . 
should_equal [Nothing, Nothing, Nothing, Nothing, Nothing, 0] + problems2 = [Unmatched_Columns.Error ["A", "D"]] + Problems.test_problem_handling action2 problems2 tester2 + + Test.specify "should drop unmatched columns if asked to" <| + t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]] + t2 = table_builder [["C", ["d", "e", "f"]], ["A", [4, 5, 6]]] + t3 = table_builder [["A", [Nothing, Nothing, 0]], ["C", ["g", "h", "i"]]] + + t4 = t1.union [t2, t3] keep_unmatched_columns=False on_problems=Problem_Behavior.Report_Error + Problems.assume_no_problems t4 + expect_column_names ["A"] t4 + t4.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6, Nothing, Nothing, 0] + + Test.specify "should keep unmatched columns without errors if asked to" <| + t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]] + t2 = table_builder [["C", ["d", "e", "f"]], ["A", [4, 5, 6]]] + t3 = table_builder [["A", [Nothing, Nothing, 0]], ["C", ["g", "h", "i"]]] + + t4 = t1.union [t2, t3] keep_unmatched_columns=True on_problems=Problem_Behavior.Report_Error + Problems.assume_no_problems t4 + expect_column_names ["A", "B", "C"] t4 + t4.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6, Nothing, Nothing, 0] + t4.at "B" . to_vector . should_equal ["a", "b", "c", Nothing, Nothing, Nothing, Nothing, Nothing, Nothing] + t4.at "C" . to_vector . 
should_equal [Nothing, Nothing, Nothing, "d", "e", "f", "g", "h", "i"] + + Test.specify "should fail if asked to drop unmatched columns but the set of common columns is empty" <| + t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]] + t2 = table_builder [["C", ["d", "e", "f"]], ["A", [4, 5, 6]]] + t3 = table_builder [["D", [Nothing, Nothing, 0]], ["C", ["g", "h", "i"]]] + + t4 = t1.union [t2, t3] keep_unmatched_columns=False on_problems=Problem_Behavior.Ignore + t4.should_fail_with No_Output_Columns + + Test.specify "should ignore column names when matching by position" <| + t1 = table_builder [["A", [1, 2, 3]], ["Y", ["a", "b", "c"]]] + t2 = table_builder [["X", [4, 5, 6]], ["A", ["d", "e", "f"]]] + + t3 = t1.union t2 match_columns=Match_Columns.By_Position + expect_column_names ["A", "Y"] t3 + t3.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6] + t3.at "Y" . to_vector . should_equal ["a", "b", "c", "d", "e", "f"] + + Test.specify "should fill extra columns (positional matching) with nulls and report a warning by default" <| + t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]] + t2 = table_builder [["A1", [4, 5, 6]], ["B1", ["d", "e", "f"]], ["C", [7, 8, 9]]] + t3 = table_builder [["A2", [10, 20, 30]]] + + action = t1.union [t2, t3] match_columns=Match_Columns.By_Position on_problems=_ + tester table = + expect_column_names ["A1", "B1", "C"] table + table.at "A1" . to_vector . should_equal [1, 2, 3, 4, 5, 6, 10, 20, 30] + table.at "B1" . to_vector . should_equal ["a", "b", "c", "d", "e", "f", Nothing, Nothing, Nothing] + table.at "C" . to_vector .
should_equal [Nothing, Nothing, Nothing, 7, 8, 9, Nothing, Nothing, Nothing] + problems = [Column_Count_Mismatch.Error 3 1] + Problems.test_problem_handling action problems tester + + Test.specify "should keep the least number of columns with positional matching if asked to drop unmatched ones" <| + t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]] + t2 = table_builder [["A1", [4, 5, 6]], ["B1", ["d", "e", "f"]], ["C", [7, 8, 9]]] + t3 = table_builder [["A2", [10, 20, 30]]] + + t4 = t1.union [t2, t3] keep_unmatched_columns=False match_columns=Match_Columns.By_Position on_problems=Problem_Behavior.Report_Error + Problems.assume_no_problems t4 + expect_column_names ["A"] t4 + t4.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6, 10, 20, 30] + + Test.specify "should keep the greatest number of columns with positional matching if asked to keep unmatched ones, filling missing values with null and reporting no problems" <| + t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]] + t2 = table_builder [["A1", [4, 5, 6]], ["B1", ["d", "e", "f"]], ["C", [7, 8, 9]]] + t3 = table_builder [["A2", [10, 20, 30]]] + + t4 = t1.union [t2, t3] match_columns=Match_Columns.By_Position keep_unmatched_columns=True on_problems=Problem_Behavior.Ignore + Problems.assume_no_problems t4 + expect_column_names ["A1", "B1", "C"] t4 + t4.at "A1" . to_vector . should_equal [1, 2, 3, 4, 5, 6, 10, 20, 30] + t4.at "B1" . to_vector . should_equal ["a", "b", "c", "d", "e", "f", Nothing, Nothing, Nothing] + t4.at "C" . to_vector . should_equal [Nothing, Nothing, Nothing, 7, 8, 9, Nothing, Nothing, Nothing] + + Test.specify "should use column names from the first table that has enough columns in positional matching mode" <| + t1 = table_builder [["A", [1, 2, 3]]] + t2 = table_builder [["X", [4, 5, 6]], ["A", ["a", "b", "c"]]] + + check table = + expect_column_names ["X", "A"] table + table.at "X" . to_vector . should_equal [1, 2, 3, 4, 5, 6] + table.at "A" . to_vector . 
should_equal [Nothing, Nothing, Nothing, "a", "b", "c"] + + t3 = t1.union t2 match_columns=Match_Columns.By_Position + within_table t3 <| + check t3 + Warning.get_all t3 . map .value . should_equal [Column_Count_Mismatch.Error 2 1] + + t4 = t1.union t2 match_columns=Match_Columns.By_Position keep_unmatched_columns=True + within_table t4 <| + check t4 + + t5 = table_builder [["Y", [7, 8, 9]], ["A", ["d", "e", "f"]], ["Z", [10, 11, 12]]] + t6 = table_builder [["W", [0]]] + t7 = table_builder [["X", [7, 8, 9]], ["Y", ["d", "e", "f"]], ["Z", [10, 11, 12]]] + t8 = t1.union [t2, t5, t6, t7] match_columns=Match_Columns.By_Position + expect_column_names ["Y", "A", "Z"] t8 + + Test.specify "should gracefully handle the case where no tables to union were provided" <| + t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]] + + t1.union [] . should_equal t1 + t1.union [] match_columns=Match_Columns.By_Position . should_equal t1 + + t1.union [] keep_unmatched_columns=False . should_equal t1 + t1.union [] match_columns=Match_Columns.By_Position keep_unmatched_columns=False . should_equal t1 + + t1.union [] keep_unmatched_columns=True . should_equal t1 + t1.union [] match_columns=Match_Columns.By_Position keep_unmatched_columns=True . should_equal t1 + + Test.specify "should find a common type that will fit the merged columns" <| + t1 = table_builder [["int+bool", [1, 2, 3]], ["int+float", [0, 1, 2]]] + t2 = table_builder [["int+bool", [True, False, Nothing]], ["int+float", [1.0, 2.0, 2.5]]] + + t1.at "int+bool" . value_type . should_equal Value_Type.Integer + t1.at "int+float" . value_type . should_equal Value_Type.Integer + t2.at "int+bool" . value_type . should_equal Value_Type.Boolean + t2.at "int+float" . value_type . should_equal Value_Type.Float + + t3 = t1.union t2 + expect_column_names ["int+bool", "int+float"] t3 + t3.at "int+bool" . value_type . should_equal Value_Type.Integer + t3.at "int+float" . value_type . should_equal Value_Type.Float + t3.at "int+bool" . 
to_vector . should_equal [1, 2, 3, 1, 0, Nothing] + t3.at "int+float" . to_vector . should_equal [0, 1, 2, 1.0, 2.0, 2.5] + + t4 = table_builder [["float", [1.0, 2.0, 3.3]]] + t5 = t1.union [t2, t4] match_columns=Match_Columns.By_Position keep_unmatched_columns=False + expect_column_names ["int+bool"] t5 + t5.at "int+bool" . value_type . should_equal Value_Type.Float + t5.at "int+bool" . to_vector . should_equal [1, 2, 3, 1, 0, Nothing, 1.0, 2.0, 3.3] + + Test.specify "should resort to Mixed value type only if at least one column is already Mixed" <| + ## TODO currently no way to retype a column to Mixed, so we are + using a custom object; this test won't work in DB so it will need + to be adapted once proper type support is implemented + t1 = table_builder [["A", [1, 2, 3]], ["mixed", ["a", My_Type.Value 1 2, Nothing]]] + t2 = table_builder [["A", [4, 5, 6]], ["mixed", [1, 2, 3]]] + t1.at "mixed" . value_type . should_equal Value_Type.Mixed + t2.at "mixed" . value_type . should_equal Value_Type.Integer + + t3 = t1.union t2 + Problems.assume_no_problems t3 + expect_column_names ["A", "mixed"] t3 + t3.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6] + t3.at "mixed" . to_vector . should_equal ["a", My_Type.Value 1 2, Nothing, 1, 2, 3] + + t4 = table_builder [["A", [1, 3]], ["mixed", [True, False]]] + t5 = table_builder [["A", [4, 5]], ["mixed", ["X", "y"]]] + t4.at "mixed" . value_type . should_equal Value_Type.Boolean + t5.at "mixed" . value_type . should_equal Value_Type.Char + + t6 = t5.union [t1, t2, t4] + Problems.assume_no_problems t6 + expect_column_names ["A", "mixed"] t6 + t6.at "A" . to_vector . should_equal [4, 5, 1, 2, 3, 4, 5, 6, 1, 3] + t6.at "mixed" . to_vector . should_equal ["X", "y", "a", My_Type.Value 1 2, Nothing, 1, 2, 3, True, False] + t6.at "mixed" . value_type . 
should_equal Value_Type.Mixed + + Test.specify "if no common type can be found, should report error and drop the problematic column" <| + t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]], ["C", [True, False, Nothing]]] + t2 = table_builder [["C", ["x", "Y", "Z"]], ["A", [4, 5, 6]], ["B", [1, 2, 3]]] + + action = t1.union t2 on_problems=_ + tester table = + expect_column_names ["A"] table + table.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6] + problems = [No_Common_Type.Error "B", No_Common_Type.Error "C"] + Problems.test_problem_handling action problems tester + + Test.specify "if type widening is not allowed, should use the type from first table that contained the given column" <| + t1 = table_builder [["A", [1, 2, 3]]] + t2 = table_builder [["A", [4, 5, 6]], ["B", [1.2, 2.2, 3.1]]] + + t2.at "B" . value_type . should_equal Value_Type.Float + + t3 = t1.union t2 allow_type_widening=False keep_unmatched_columns=True + within_table t3 <| + Problems.assume_no_problems t3 + expect_column_names ["A", "B"] t3 + t3.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6] + t3.at "A" . value_type . should_equal Value_Type.Integer + t3.at "B" . to_vector . should_equal [Nothing, Nothing, Nothing, 1.2, 2.2, 3.1] + t3.at "B" . value_type . should_equal Value_Type.Float + + Test.specify "if type widening is not allowed and types do not match, should report error and drop the problematic column" <| + t1 = table_builder [["A", [1, 2, 3]], ["B", [1, 2, 3]], ["C", [True, False, Nothing]], ["D", [10, 20, 30]], ["E", [1.1, 2.5, 3.2]]] + t2 = table_builder [["A", [4, 5, 6]], ["B", [1.5, 2.5, 3.5]], ["C", [1, 2, 3]], ["D", [True, True, True]], ["E", [1, 2, 3]]] + + t1.at "B" . value_type . should_equal Value_Type.Integer + t1.at "C" . value_type . should_equal Value_Type.Boolean + t1.at "D" . value_type . should_equal Value_Type.Integer + t1.at "E" . value_type . should_equal Value_Type.Float + + t2.at "B" . value_type . should_equal Value_Type.Float + t2.at "C" . 
value_type . should_equal Value_Type.Integer + t2.at "D" . value_type . should_equal Value_Type.Boolean + t2.at "E" . value_type . should_equal Value_Type.Integer + + action = t1.union t2 allow_type_widening=False on_problems=_ + tester table = + expect_column_names ["A"] table + table.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6] + problems = [Column_Type_Mismatch.Error "B" Value_Type.Integer Value_Type.Float, Column_Type_Mismatch.Error "C" Value_Type.Boolean Value_Type.Integer, Column_Type_Mismatch.Error "D" Value_Type.Integer Value_Type.Boolean, Column_Type_Mismatch.Error "E" Value_Type.Float Value_Type.Integer] + Problems.test_problem_handling action problems tester + + Test.specify "even if type widening is not allowed, if the first column is mixed, it should accept any column to be concatenated to it" <| + t1 = table_builder [["X", ["a", 1, Nothing]]] + t2 = table_builder [["X", [1]]] + t3 = table_builder [["X", [1.2, 2.3, 3.4]]] + t4 = table_builder [["X", ["a", "b"]]] + t5 = table_builder [["X", [True, False]]] + + t1.at "X" . value_type . should_equal Value_Type.Mixed + t2.at "X" . value_type . should_equal Value_Type.Integer + + t6 = t1.union [t2, t3, t4, t5] allow_type_widening=False + Problems.assume_no_problems t6 + t6.at "X" . value_type . should_equal Value_Type.Mixed + t6.at "X" . to_vector . 
should_equal ["a", 1, Nothing, 1, 1.2, 2.3, 3.4, "a", "b", True, False] + + Test.specify "if type mismatches cause all columns to be dropped, fail with No_Output_Columns" <| + t1 = table_builder [["A", [1, 2, 3]]] + t2 = table_builder [["A", ['x']]] + + e3 = t1.union t2 allow_type_widening=True on_problems=Problem_Behavior.Ignore + e3.should_fail_with No_Output_Columns + + t4 = table_builder [["A", [1.5]]] + e5 = t1.union t4 allow_type_widening=False on_problems=Problem_Behavior.Ignore + e5.should_fail_with No_Output_Columns diff --git a/test/Table_Tests/src/Common_Table_Operations/Util.enso b/test/Table_Tests/src/Common_Table_Operations/Util.enso index ba807444de33..3ea6969fff2e 100644 --- a/test/Table_Tests/src/Common_Table_Operations/Util.enso +++ b/test/Table_Tests/src/Common_Table_Operations/Util.enso @@ -1,7 +1,7 @@ from Standard.Base import all -import Standard.Test -from Standard.Test import Test_Suite +from Standard.Test import Test, Test_Suite +import Standard.Test.Extensions import project.In_Memory.Common_Spec as In_Memory_Table_Spec @@ -13,3 +13,14 @@ expect_column_names names table = a shortcut that allows to run these tests with the in-memory backend. run_default_backend spec = Test_Suite.run_main (In_Memory_Table_Spec.run_common_spec spec) + +## Adds a clue which will display the provided table next to the failed test + description. + + > Example + Test a property of the table, displaying its contents if the test fails. + + t = Table.new [["A", [1, 2, 3]]] + within_table t <| + t.at "A" . to_vector . 
should_equal [1, 2, 3] +within_table table = Test.with_clue 'Resulting table:\n'+table.display+'\n\n' diff --git a/test/Table_Tests/src/In_Memory/Table_Spec.enso b/test/Table_Tests/src/In_Memory/Table_Spec.enso index bafcd0967f7e..f27d61d7654b 100644 --- a/test/Table_Tests/src/In_Memory/Table_Spec.enso +++ b/test/Table_Tests/src/In_Memory/Table_Spec.enso @@ -458,38 +458,6 @@ spec = c = Column.from_vector 'c' ['z', 'a', 'd', 'f', 's', 'e\u0301', 'ś', 'ą', 's\u0301', 'w', 'b'] c.sort.to_vector . should_equal ['a', 'ą', 'b', 'd', 'e\u0301', 'f', 's', 's\u0301', 'ś', 'w', 'z'] - Test.group "Concatenating Tables" <| - Test.specify 'should concat tables with the same schema' <| - c_1_1 = ['foo', [1, 2, 3, 4]] - c_1_2 = ['bar', ['baz', 'quux', 'spam', 'eggs']] - t_1 = Table.new [c_1_1, c_1_2] - - c_2_1 = ['foo', [5, 6, 7]] - c_2_2 = ['bar', [False, True, False]] - t_2 = Table.new [c_2_1, c_2_2] - - r = t_1.concat t_2 - - r.column_count.should_equal 2 - r.at 'foo' . to_vector . should_equal [1, 2, 3, 4, 5, 6, 7] - r.at 'bar' . to_vector . should_equal ['baz', 'quux', 'spam', 'eggs', False, True, False] - - Test.specify 'should missing-pad unmatched columns' <| - c_1_1 = ['foo', [1, 2, 3, 4]] - c_1_2 = ['bar', ['baz', 'quux', 'spam', 'eggs']] - t_1 = Table.new [c_1_1, c_1_2] - - c_2_1 = ['foo', [5, 6, 7]] - c_2_2 = ['baz', [False, True, False]] - t_2 = Table.new [c_2_1, c_2_2] - - r = t_1.concat t_2 - - r.column_count.should_equal 3 - r.at 'foo' . to_vector . should_equal [1, 2, 3, 4, 5, 6, 7] - r.at 'bar' . to_vector . should_equal ['baz', 'quux', 'spam', 'eggs', Nothing, Nothing, Nothing] - r.at 'baz' . to_vector . should_equal [Nothing, Nothing, Nothing, Nothing, False, True, False] - Test.group "Slicing Tables" <| Test.specify 'should allow taking first n rows' <| i_1 = ['ix', [1, 2, 3]]