diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8a3973d00e13..d2155fd52adf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -381,6 +381,7 @@
   methods.][6176]
 - [Implemented `Table.union` for the Database backend.][6204]
 - [Array & Vector have the same methods & behavior][6218]
+- [Implemented `Table.split` and `Table.tokenize` for in-memory tables.][6233]
 
 [debug-shortcuts]:
   https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -578,6 +579,7 @@
 [6204]: https://github.com/enso-org/enso/pull/6204
 [6077]: https://github.com/enso-org/enso/pull/6077
 [6218]: https://github.com/enso-org/enso/pull/6218
+[6233]: https://github.com/enso-org/enso/pull/6233
 
 #### Enso Compiler
 
diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso
index 0d68dcfbd83d..f603843fc79a 100644
--- a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso
+++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso
@@ -1392,6 +1392,78 @@ type Table
         msg = "Parsing values is not supported in database tables, the table has to be materialized first with `read`."
         Error.throw (Unsupported_Database_Operation.Error msg)
 
+    ## Splits a column of text into a set of new columns.
+       The original column will be removed from the table.
+       The new columns will be named with the name of the input column with an
+       incrementing number after.
+
+       Arguments:
+       - column: The name or index of the column to split the text of.
+       - delimiter: The term or terms used to split the text.
+       - column_count: The number of columns to split to.
+         If `Nothing` then columns will be added to fit all data.
+       - on_problems: Specifies the behavior when a problem occurs.
+
+       ! Error Conditions
+         If the data exceeds the `column_count`, a `Column_Count_Exceeded` will
+         be reported according to the `on_problems` behavior.
+    split_to_columns : Text | Integer -> Text -> Integer | Nothing -> Problem_Behavior -> Table
+    split_to_columns self column delimiter="," column_count=Nothing on_problems=Report_Error =
+        _ = [column, delimiter, column_count, on_problems]
+        Error.throw (Unsupported_Database_Operation.Error "Table.split_to_columns is not implemented yet for the Database backends.")
+
+    ## Splits a column of text into a set of new rows.
+       The values of other columns are repeated for the new rows.
+
+       Arguments:
+       - column: The name or index of the column to split the text of.
+       - delimiter: The term or terms used to split the text.
+    split_to_rows : Text | Integer -> Text -> Table
+    split_to_rows self column delimiter="," =
+        _ = [column, delimiter]
+        Error.throw (Unsupported_Database_Operation.Error "Table.split_to_rows is not implemented yet for the Database backends.")
+
+    ## Tokenizes a column of text into a set of new columns using a regular
+       expression.
+       If the pattern contains marked groups, the values are concatenated
+       together; otherwise the whole match is returned.
+       The original column will be removed from the table.
+       The new columns will be named with the name of the input column with an
+       incrementing number after.
+
+       Arguments:
+       - column: The name or index of the column to tokenize the text of.
+       - pattern: The pattern used to find within the text.
+       - case_sensitivity: Specifies if the text values should be compared case
+         sensitively.
+       - column_count: The number of columns to split to.
+         If `Nothing` then columns will be added to fit all data.
+       - on_problems: Specifies the behavior when a problem occurs.
+
+       ! Error Conditions
+         If the data exceeds the `column_count`, a `Column_Count_Exceeded` will
+         be reported according to the `on_problems` behavior.
+    tokenize_to_columns : Text | Integer -> Text -> Case_Sensitivity -> Integer | Nothing -> Problem_Behavior -> Table
+    tokenize_to_columns self column pattern="." case_sensitivity=Case_Sensitivity.Sensitive column_count=Nothing on_problems=Report_Error =
+        _ = [column, pattern, case_sensitivity, column_count, on_problems]
+        Error.throw (Unsupported_Database_Operation.Error "Table.tokenize_to_columns is not implemented yet for the Database backends.")
+
+    ## Tokenizes a column of text into a set of new rows using a regular
+       expression.
+       If the pattern contains marked groups, the values are concatenated
+       together; otherwise the whole match is returned.
+       The values of other columns are repeated for the new rows.
+
+       Arguments:
+       - column: The name or index of the column to tokenize the text of.
+       - pattern: The pattern used to find within the text.
+       - case_sensitivity: Specifies if the text values should be compared case
+         sensitively.
+    tokenize_to_rows : Text | Integer -> Text -> Case_Sensitivity -> Table
+    tokenize_to_rows self column pattern="." case_sensitivity=Case_Sensitivity.Sensitive =
+        _ = [column, pattern, case_sensitivity]
+        Error.throw (Unsupported_Database_Operation.Error "Table.tokenize_to_rows is not implemented yet for the Database backends.")
+
     ## PRIVATE
        UNSTABLE
        Cast the selected columns to a specific type.
diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso
index 8ffba43dc7c1..c36b1b818bf8 100644
--- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso
+++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso
@@ -29,6 +29,7 @@ import project.Internal.Join_Helpers
 import project.Internal.Naming_Helpers.Naming_Helpers
 import project.Internal.Parse_Values_Helper
 import project.Internal.Problem_Builder.Problem_Builder
+import project.Internal.Split_Tokenize
 import project.Internal.Table_Helpers
 import project.Internal.Table_Helpers.Table_Column_Helper
 import project.Internal.Unique_Name_Strategy.Unique_Name_Strategy
@@ -918,6 +919,74 @@ type Table
         result = Table.new new_columns
         problem_builder.attach_problems_after on_problems result
 
+    ## Splits a column of text into a set of new columns.
+       The original column will be removed from the table.
+       The new columns will be named with the name of the input column with an
+       incrementing number after.
+
+       Arguments:
+       - column: The name or index of the column to split the text of.
+       - delimiter: The term or terms used to split the text.
+       - column_count: The number of columns to split to.
+         If `Nothing` then columns will be added to fit all data.
+       - on_problems: Specifies the behavior when a problem occurs.
+
+       ! Error Conditions
+         If the data exceeds the `column_count`, a `Column_Count_Exceeded` will
+         be reported according to the `on_problems` behavior.
+    split_to_columns : Text | Integer -> Text -> Integer | Nothing -> Problem_Behavior -> Table
+    split_to_columns self column delimiter="," column_count=Nothing on_problems=Report_Error =
+        Split_Tokenize.split_to_columns self column delimiter column_count on_problems
+
+    ## Splits a column of text into a set of new rows.
+       The values of other columns are repeated for the new rows.
+
+       Arguments:
+       - column: The name or index of the column to split the text of.
+       - delimiter: The term or terms used to split the text.
+    split_to_rows : Text | Integer -> Text -> Table
+    split_to_rows self column delimiter="," =
+        Split_Tokenize.split_to_rows self column delimiter
+
+    ## Tokenizes a column of text into a set of new columns using a regular
+       expression.
+       If the pattern contains marked groups, the values are concatenated
+       together; otherwise the whole match is returned.
+       The original column will be removed from the table.
+       The new columns will be named with the name of the input column with an
+       incrementing number after.
+
+       Arguments:
+       - column: The name or index of the column to tokenize the text of.
+       - pattern: The pattern used to find within the text.
+       - case_sensitivity: Specifies if the text values should be compared case
+         sensitively.
+       - column_count: The number of columns to split to.
+         If `Nothing` then columns will be added to fit all data.
+       - on_problems: Specifies the behavior when a problem occurs.
+
+       ! Error Conditions
+         If the data exceeds the `column_count`, a `Column_Count_Exceeded` will
+         be reported according to the `on_problems` behavior.
+    tokenize_to_columns : Text | Integer -> Text -> Case_Sensitivity -> Integer | Nothing -> Problem_Behavior -> Table
+    tokenize_to_columns self column pattern="." case_sensitivity=Case_Sensitivity.Sensitive column_count=Nothing on_problems=Report_Error =
+        Split_Tokenize.tokenize_to_columns self column pattern case_sensitivity column_count on_problems
+
+    ## Tokenizes a column of text into a set of new rows using a regular
+       expression.
+       If the pattern contains marked groups, the values are concatenated
+       together; otherwise the whole match is returned.
+       The values of other columns are repeated for the new rows.
+
+       Arguments:
+       - column: The name or index of the column to tokenize the text of.
+       - pattern: The pattern used to find within the text.
+       - case_sensitivity: Specifies if the text values should be compared case
+         sensitively.
+    tokenize_to_rows : Text | Integer -> Text -> Case_Sensitivity -> Table
+    tokenize_to_rows self column pattern="." case_sensitivity=Case_Sensitivity.Sensitive =
+        Split_Tokenize.tokenize_to_rows self column pattern case_sensitivity
+
     ## ALIAS Filter Rows
 
        Selects only the rows of this table that correspond to `True` values of
diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Errors.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Errors.enso
index fb32620a742a..e478139156df 100644
--- a/distribution/lib/Standard/Table/0.0.0-dev/src/Errors.enso
+++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Errors.enso
@@ -552,3 +552,16 @@ type Invalid_Value_For_Type
     to_display_text : Text
     to_display_text self =
         "The value ["+self.value.to_text+"] is not valid for the column type ["+self.value_type.to_text+"]."
+
+type Column_Count_Exceeded
+    ## PRIVATE
+       Indicates that an operation generating new columns produced more columns
+       than allowed by the limit.
+    Error (limit : Integer) (column_count : Integer)
+
+    ## PRIVATE
+
+       Create a human-readable version of the error.
+    to_display_text : Text
+    to_display_text self =
+        "The operation produced more columns than the specified limit. The limit is "+self.limit.to_text+" and the number of new columns was "+self.column_count.to_text+". The limit may be turned off by setting the `column_count` option to `Nothing`."
diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Split_Tokenize.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Split_Tokenize.enso
new file mode 100644
index 000000000000..364b4c70ab2f
--- /dev/null
+++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Split_Tokenize.enso
@@ -0,0 +1,237 @@
+from Standard.Base import all
+
+import project.Data.Column.Column
+import project.Data.Table.Table
+import project.Data.Type.Value_Type.Value_Type
+import project.Internal.Java_Exports
+import project.Internal.Problem_Builder.Problem_Builder
+import project.Internal.Unique_Name_Strategy.Unique_Name_Strategy
+
+from project import Value_Type
+from project.Errors import Column_Count_Exceeded, Duplicate_Output_Column_Names, Invalid_Value_Type, Missing_Input_Columns
+from project.Internal.Java_Exports import make_string_builder
+
+polyglot java import org.enso.table.data.mask.OrderMask
+
+## PRIVATE
+   Splits a column of text into a set of new columns.
+   See `Table.split_to_columns`.
+split_to_columns : Table -> Text | Integer -> Text -> Integer | Nothing -> Problem_Behavior -> Table
+split_to_columns table input_column_id delimiter="," column_count=Nothing on_problems=Report_Error =
+    column = table.at input_column_id
+    Value_Type.expect_text (column.value_type) related_column=column <|
+        fan_out_to_columns table input_column_id (handle_nothing (_.split delimiter)) column_count on_problems
+
+## PRIVATE
+   Splits a column of text into a set of new rows.
+   See `Table.split_to_rows`.
+split_to_rows : Table -> Text | Integer -> Text -> Table
+split_to_rows table input_column_id delimiter="," =
+    column = table.at input_column_id
+    Value_Type.expect_text (column.value_type) related_column=column <|
+        fan_out_to_rows table input_column_id (handle_nothing (_.split delimiter))
+
+## PRIVATE
+   Tokenizes a column of text into a set of new columns using a regular
+   expression.
+   See `Table.tokenize_to_columns`.
+tokenize_to_columns : Table -> Text | Integer -> Text -> Case_Sensitivity -> Integer | Nothing -> Problem_Behavior -> Table
+tokenize_to_columns table input_column_id pattern case_sensitivity column_count on_problems =
+    column = table.at input_column_id
+    Value_Type.expect_text (column.value_type) related_column=column <|
+        fan_out_to_columns table input_column_id (handle_nothing (_.tokenize pattern case_sensitivity)) column_count on_problems
+
+## PRIVATE
+   Tokenizes a column of text into a set of new rows using a regular
+   expression.
+   See `Table.tokenize_to_rows`.
+tokenize_to_rows : Table -> Text | Integer -> Text -> Case_Sensitivity -> Table
+tokenize_to_rows table input_column_id pattern="." case_sensitivity=Case_Sensitivity.Sensitive =
+    column = table.at input_column_id
+    Value_Type.expect_text (column.value_type) related_column=column <|
+        fan_out_to_rows table input_column_id (handle_nothing (_.tokenize pattern case_sensitivity))
+
+## PRIVATE
+   Transform a table by transforming a column into a set of columns. Takes a
+   function that maps a single element of the input column to a vector of output
+   values. The original column is replaced by the new columns.
+
+   Arguments:
+   - table: The table to transform.
+   - input_column_id: The name or index of the column to transform.
+   - function: A function that transforms a single element of the input column
+     to multiple values.
+fan_out_to_columns : Table -> Text | Integer -> (Any -> Vector Any) -> Integer | Nothing -> Problem_Behavior -> Table | Nothing
+fan_out_to_columns table input_column_id function column_count=Nothing on_problems=Report_Error =
+    input_column = table.at input_column_id
+    problem_builder = Problem_Builder.new
+    new_columns_unrenamed = map_columns_to_multiple input_column function column_count problem_builder
+    new_columns = rename_new_columns table new_columns_unrenamed problem_builder
+    new_table = replace_column_with_columns table input_column new_columns
+    problem_builder.attach_problems_after on_problems new_table
+
+## PRIVATE
+   Transform a column by applying the given function to the values in the
+   column. The function produces multiple outputs, so each row is duplicated,
+   with each row getting a distinct output value in place of the original
+   input value. The other column values are just duplicated.
+
+   Arguments:
+   - table: The table to transform.
+   - input_column_id: The name or index of the column to transform.
+   - function: A function that transforms a single element of the input column
+     to multiple values.
+fan_out_to_rows : Table -> Text | Integer -> (Any -> Vector Any) -> Table
+fan_out_to_rows table input_column_id function =
+    input_column = table.at input_column_id
+    input_storage = input_column.java_column.getStorage
+    num_input_rows = input_storage.size
+
+    # Guess that most of the time, we'll get at least one value for each input.
+    initial_size = input_column.length
+    # Accumulates the output of the output column values.
+    output_column_builder = make_string_builder initial_size
+    # Accumulates repeated position indices for the order mask.
+    order_mask_positions = Vector.new_builder initial_size
+
+    0.up_to num_input_rows . each i->
+        input_value = input_storage.getItemBoxed i
+        output_values = function input_value
+        # Append each value.
+        output_values.each v-> output_column_builder.append v
+        # Append n copies of the input row position, n = # of output values.
+        repeat_each output_values.length <| order_mask_positions.append i
+
+    # Build the output column, named after the resolved input column so that
+    output_storage = output_column_builder.seal
+    output_column = Column.from_storage input_column.name output_storage
+
+    # Build the order mask.
+    order_mask = OrderMask.new (order_mask_positions.to_vector)
+
+    # Build the other columns, and include the output_column while doing it.
+    new_columns = table.columns.map column->
+        case column.name == input_column.name of
+            True ->
+                # Replace the input column (also when selected by index).
+                output_column
+            False ->
+                # Build a new column from the old one with the mask
+                old_storage = column.java_column.getStorage
+                new_storage = old_storage.applyMask order_mask
+                Column.from_storage column.name new_storage
+
+    Table.new new_columns
+
+## PRIVATE
+
+   Map a multi-valued function over a column and return the results as set of
+   output columns.
+
+   Returns a Vector of Columns; problems are reported to `problem_builder`.
+
+   Arguments:
+   - input_column: The column to transform.
+   - function: A function that transforms a single element of `input_column`
+     to multiple values.
+   - column_count: The number of columns to split to.
+     If `Nothing` then columns will be added to fit all data.
+     If the data exceeds the `column_count`, a `Column_Count_Exceeded` warning
+     is reported to `problem_builder`.
+   - problem_builder: Collects the problems encountered during the operation.
+map_columns_to_multiple : Column -> (Any -> Vector Any) -> Integer | Nothing -> Problem_Builder -> Vector Column
+map_columns_to_multiple input_column function column_count problem_builder =
+    num_rows = input_column.length
+    input_storage = input_column.java_column.getStorage
+
+    builders = case column_count of
+        Nothing ->
+            builders = Vector.new_builder
+
+            0.up_to num_rows . each i->
+                input_value = input_storage.getItemBoxed i
+                output_values = function input_value
+
+                # Add more builders if necessary to accommodate `output_values`.
+                if output_values.length > builders.length then
+                    num_builders_needed = output_values.length - builders.length
+                    repeat_each num_builders_needed <|
+                        builder = make_string_builder num_rows
+
+                        # Pad the new builder with nulls
+                        num_nulls_needed = i
+                        builder.appendNulls num_nulls_needed
+
+                        builders.append builder
+
+                ## Add `output_values` to builders; if there are more builders
+                   than `output_values`, pad with null.
+                0.up_to builders.length . each j->
+                    builders.at j . appendNoGrow (output_values.get j Nothing)
+
+            builders.to_vector
+
+        _ : Integer ->
+            builders = Vector.new column_count (_-> make_string_builder num_rows)
+
+            output_lengths = 0.up_to num_rows . map i->
+                input_value = input_storage.getItemBoxed i
+                output_values = function input_value
+
+                ## Add `output_values` to builders; if there are more builders
+                   than `output_values`, pad with null.
+                0.up_to builders.length . each j->
+                    builders.at j . appendNoGrow (output_values.get j Nothing)
+
+                output_values.length
+
+            # An empty column yields no lengths; treat that as zero outputs.
+            max_output_length = (maximum output_lengths).if_nothing 0
+
+            if max_output_length > column_count then
+                problem = Column_Count_Exceeded.Error column_count max_output_length
+                problem_builder.report_other_warning problem
+
+            builders
+
+    # Build Columns.
+    builders.map .seal . map_with_index i-> storage->
+        name = input_column.name + "_" + i.to_text
+        Column.from_storage name storage
+
+## PRIVATE
+   Rename a vector of columns to be unique when added to a table.
+rename_new_columns : Table -> Vector Column -> Problem_Builder -> Vector Column
+rename_new_columns table columns problem_builder =
+    unique = Unique_Name_Strategy.new
+    unique.mark_used <| table.columns.map .name
+    new_columns = columns.map column->
+        new_name = unique.make_unique column.name
+        column.rename new_name
+    problem_builder.report_unique_name_strategy unique
+    new_columns
+
+## PRIVATE
+   Replace a single column in a table with new columns.
+   Does not ensure names are unique; that must be done before calling this.
+replace_column_with_columns : Table -> Column -> Vector Column -> Table
+replace_column_with_columns table old_column new_columns =
+    Table.new ((table.columns.map (c-> if c.name == old_column.name then new_columns else [c])).flatten)
+
+## PRIVATE
+   Return the maximum value of the vector.
+   Returns `Nothing` if the vector is empty.
+maximum : Vector Any -> Any
+maximum vec = if vec.is_empty then Nothing else
+    vec.reduce (a-> b-> a.max b)
+
+## PRIVATE
+   Wrap a function so that it returns [] if passed Nothing
+handle_nothing : (Any -> Any) -> (Any -> Any)
+handle_nothing function = x-> case x of
+    _ : Nothing -> []
+    _ -> function x
+
+## PRIVATE
+   Repeat a computation n times.
+repeat_each n ~action = 0.up_to n . each _-> action
diff --git a/test/Table_Tests/src/In_Memory/Main.enso b/test/Table_Tests/src/In_Memory/Main.enso
index af32f71ecfeb..c8e11a8829b5 100644
--- a/test/Table_Tests/src/In_Memory/Main.enso
+++ b/test/Table_Tests/src/In_Memory/Main.enso
@@ -7,6 +7,7 @@ import project.In_Memory.Builders_Spec
 import project.In_Memory.Column_Spec
 import project.In_Memory.Common_Spec
 import project.In_Memory.Join_Performance_Spec
+import project.In_Memory.Split_Tokenize_Spec
 import project.In_Memory.Table_Spec
 import project.In_Memory.Table_Date_Spec
 import project.In_Memory.Table_Date_Time_Spec
@@ -22,5 +23,6 @@ spec =
     Aggregate_Column_Spec.spec
     Builders_Spec.spec
     Join_Performance_Spec.spec
+    Split_Tokenize_Spec.spec
 
 main = Test_Suite.run_main spec
diff --git a/test/Table_Tests/src/In_Memory/Split_Tokenize_Spec.enso b/test/Table_Tests/src/In_Memory/Split_Tokenize_Spec.enso
new file mode 100644
index 000000000000..ca63fb4010d6
--- /dev/null
+++ b/test/Table_Tests/src/In_Memory/Split_Tokenize_Spec.enso
@@ -0,0 +1,214 @@
+from Standard.Base import all
+
+import Standard.Base.Data.Text.Case_Sensitivity.Case_Sensitivity
+import Standard.Test.Extensions
+
+from Standard.Table import Table
+from Standard.Table.Errors import Invalid_Value_Type,
Column_Count_Exceeded, Duplicate_Output_Column_Names, No_Such_Column
+from Standard.Test import Test, Test_Suite, Problems
+from project.Util import all
+
+spec =
+    Test.group "Table.split" <|
+        Test.specify "can do split_to_columns" <|
+            cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]]]
+            t = Table.new cols
+            expected_rows = [[0, "a", "c", Nothing], [1, "c", "d", "ef"], [2, "gh", "ij", "u"]]
+            expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
+            t2 = t.split_to_columns "bar" "b"
+            t2.should_equal_verbose expected
+
+        Test.specify "can do split_to_rows" <|
+            cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]]]
+            t = Table.new cols
+            expected_rows = [[0, "a"], [0, "c"], [1, "c"], [1, "d"], [1, "ef"], [2, "gh"], [2, "ij"], [2, "u"]]
+            expected = Table.from_rows ["foo", "bar"] expected_rows
+            t2 = t.split_to_rows "bar" "b"
+            t2.should_equal_verbose expected
+
+        Test.specify "can do split_to_columns with some Nothings" <|
+            cols = [["foo", [0, 1, 2, 3]], ["bar", ["abc", "cbdbef", Nothing, "ghbijbu"]]]
+            t = Table.new cols
+            expected_rows = [[0, "a", "c", Nothing], [1, "c", "d", "ef"], [2, Nothing, Nothing, Nothing], [3, "gh", "ij", "u"]]
+            expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
+            t2 = t.split_to_columns "bar" "b"
+            t2.should_equal_verbose expected
+
+        Test.specify "can do split_to_rows with some Nothings" <|
+            cols = [["foo", [0, 1, 2, 3]], ["bar", ["abc", "cbdbef", Nothing, "ghbijbu"]]]
+            t = Table.new cols
+            expected_rows = [[0, "a"], [0, "c"], [1, "c"], [1, "d"], [1, "ef"], [3, "gh"], [3, "ij"], [3, "u"]]
+            expected = Table.from_rows ["foo", "bar"] expected_rows
+            t2 = t.split_to_rows "bar" "b"
+            t2.should_equal_verbose expected
+
+    Test.group "Table.tokenize" <|
+        Test.specify "can do tokenize_to_columns" <|
+            cols = [["foo", [0, 1, 2]], ["bar", ["a12b34r5", "23", "2r4r55"]]]
+            t = Table.new cols
+            expected_rows = [[0, "12", "34", "5"], [1, "23", Nothing, Nothing], [2, "2", "4", "55"]]
+            expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
+            t2 = t.tokenize_to_columns "bar" "\d+"
+            t2.should_equal_verbose expected
+
+        Test.specify "can do tokenize_to_rows" <|
+            cols = [["foo", [0, 1, 2]], ["bar", ["a12b34r5", "23", "2r4r55"]]]
+            t = Table.new cols
+            expected_rows = [[0, "12"], [0, "34"], [0, "5"], [1, "23"], [2, "2"], [2, "4"], [2, "55"]]
+            expected = Table.from_rows ["foo", "bar"] expected_rows
+            t2 = t.tokenize_to_rows "bar" "\d+"
+            t2.should_equal_verbose expected
+
+        Test.specify "can do tokenize_to_columns with some nothings" <|
+            cols = [["foo", [0, 1, 2, 3]], ["bar", ["a12b34r5", Nothing, "23", "2r4r55"]]]
+            t = Table.new cols
+            expected_rows = [[0, "12", "34", "5"], [1, Nothing, Nothing, Nothing], [2, "23", Nothing, Nothing], [3, "2", "4", "55"]]
+            expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
+            t2 = t.tokenize_to_columns "bar" "\d+"
+            t2.should_equal_verbose expected
+
+        Test.specify "can do tokenize_to_rows with some Nothings" <|
+            cols = [["foo", [0, 1, 2, 3]], ["bar", ["a12b34r5", Nothing, "23", "2r4r55"]]]
+            t = Table.new cols
+            expected_rows = [[0, "12"], [0, "34"], [0, "5"], [2, "23"], [3, "2"], [3, "4"], [3, "55"]]
+            expected = Table.from_rows ["foo", "bar"] expected_rows
+            t2 = t.tokenize_to_rows "bar" "\d+"
+            t2.should_equal_verbose expected
+
+        Test.specify "can do tokenize_to_rows with some rows that have no matches" <|
+            cols = [["foo", [0, 1, 2, 3]], ["bar", ["a12b34r5", "23", "q", "2r4r55"]]]
+            t = Table.new cols
+            expected_rows = [[0, "12"], [0, "34"], [0, "5"], [1, "23"], [3, "2"], [3, "4"], [3, "55"]]
+            expected = Table.from_rows ["foo", "bar"] expected_rows
+            t2 = t.tokenize_to_rows "bar" "\d+"
+            t2.should_equal_verbose expected
+
+        Test.specify "can do tokenize_to_columns with groups" <|
+            cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
+            t = Table.new cols
+            expected_rows = [[0, "a1", "b12", "d50"], [1, "b10", "c20", Nothing]]
+            expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
+            t2 = t.tokenize_to_columns "bar" "([a-z]).(\d+)"
+            t2.should_equal_verbose expected
+
+        Test.specify "can do tokenize_to_rows with groups" <|
+            cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
+            t = Table.new cols
+            expected_rows = [[0, "a1"], [0, "b12"], [0, "d50"], [1, "b10"], [1, "c20"]]
+            expected = Table.from_rows ["foo", "bar"] expected_rows
+            t2 = t.tokenize_to_rows "bar" "([a-z]).(\d+)"
+            t2.should_equal_verbose expected
+
+        Test.specify "can do tokenize_to_columns case-insensitively" <|
+            cols = [["foo", [0, 1, 2]], ["bar", ["aBqcE", "qcBr", "cCb"]]]
+            t = Table.new cols
+            expected_rows = [[0, "B", "c", Nothing], [1, "c", "B", Nothing], [2, "c", "C", "b"]]
+            expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
+            t2 = t.tokenize_to_columns "bar" "[bc]" case_sensitivity=Case_Sensitivity.Insensitive
+            t2.should_equal_verbose expected
+
+        Test.specify "can do tokenize_to_rows case-insensitively" <|
+            cols = [["foo", [0, 1, 2]], ["bar", ["aBqcE", "qcBr", "cCb"]]]
+            t = Table.new cols
+            expected_rows = [[0, "B"], [0, "c"], [1, "c"], [1, "B"], [2, "c"], [2, "C"], [2, "b"]]
+            expected = Table.from_rows ["foo", "bar"] expected_rows
+            t2 = t.tokenize_to_rows "bar" "[bc]" case_sensitivity=Case_Sensitivity.Insensitive
+            t2.should_equal_verbose expected
+
+    Test.group "Table.split/tokenize column count" <|
+        Test.specify "should generate extra empty columns if column_count is set" <|
+            cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]]]
+            t = Table.new cols
+            expected_rows = [[0, "a", "c", Nothing, Nothing], [1, "c", "d", "ef", Nothing], [2, "gh", "ij", "u", Nothing]]
+            expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2", "bar_3"] expected_rows
+            t2 = t.split_to_columns "bar" "b" column_count=4
+            t2.should_equal_verbose expected
+            t2.at "bar_3" . value_type . is_text . should_be_true
+
+        Test.specify "split should limit columns and return problems when exceeding the column limit" <|
+            cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]]]
+            t = Table.new cols
+            expected_rows = [[0, "a", "c"], [1, "c", "d"], [2, "gh", "ij"]]
+            expected = Table.from_rows ["foo", "bar_0", "bar_1"] expected_rows
+            action = t.split_to_columns "bar" "b" column_count=2 on_problems=_
+            tester = t-> t.should_equal_verbose expected
+            problems = [Column_Count_Exceeded.Error 2 3]
+            Problems.test_problem_handling action problems tester
+
+        Test.specify "tokenize should limit columns and return problems when exceeding the column limit" <|
+            cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
+            t = Table.new cols
+            expected_rows = [[0, "a1", "b12"], [1, "b10", "c20"]]
+            expected = Table.from_rows ["foo", "bar_0", "bar_1"] expected_rows
+            action = t.tokenize_to_columns "bar" "([a-z]).(\d+)" column_count=2 on_problems=_
+            tester = t-> t.should_equal_verbose expected
+            problems = [Column_Count_Exceeded.Error 2 3]
+            Problems.test_problem_handling action problems tester
+
+        Test.specify "should generate extra empty columns if column_count is set (with rows in a different order)" <|
+            cols = [["foo", [0, 1, 2]], ["bar", ["ghbijbu", "cbdbef", "abc"]]]
+            t = Table.new cols
+            expected_rows = [[0, "gh", "ij", "u", Nothing], [1, "c", "d", "ef", Nothing], [2, "a", "c", Nothing, Nothing]]
+            expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2", "bar_3"] expected_rows
+            t2 = t.split_to_columns "bar" "b" column_count=4
+            t2.should_equal_verbose expected
+            t2.at "bar_3" . value_type . is_text . should_be_true
+
+    Test.group "Table.split/tokenize errors" <|
+        Test.specify "won't work on a non-text column" <|
+            cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
+            t = Table.new cols
+            t.split_to_columns "foo" "x" . should_fail_with Invalid_Value_Type
+            t.split_to_rows "foo" "x" . should_fail_with Invalid_Value_Type
+            t.tokenize_to_columns "foo" "x" . should_fail_with Invalid_Value_Type
+            t.tokenize_to_rows "foo" "x" . should_fail_with Invalid_Value_Type
+
+        Test.specify "won't work on a mixed column" <|
+            cols = [["foo", [0, 1]], ["bar", [500, "ab-10:bc-20c"]]]
+            t = Table.new cols
+            t.split_to_columns "bar" "x" . should_fail_with Invalid_Value_Type
+            t.split_to_rows "bar" "x" . should_fail_with Invalid_Value_Type
+            t.tokenize_to_columns "bar" "x" . should_fail_with Invalid_Value_Type
+            t.tokenize_to_rows "bar" "x" . should_fail_with Invalid_Value_Type
+
+        Test.specify "*_to_columns handles missing input column" <|
+            cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
+            t = Table.new cols
+            t.tokenize_to_columns "invalid_name" "([a-z]).(\d+)" . should_fail_with No_Such_Column
+
+        Test.specify "*_to_rows handles missing input column" <|
+            cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
+            t = Table.new cols
+            t.tokenize_to_rows "invalid_name" "([a-z]).(\d+)" . should_fail_with No_Such_Column
+
+    Test.group "Table.split/tokenize name conflicts" <|
+        Test.specify "split will make column names unique" <|
+            cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]], ["bar_1", ["a", "b", "c"]]]
+            t = Table.new cols
+            expected_rows = [[0, "a", "c", Nothing, "a"], [1, "c", "d", "ef", "b"], [2, "gh", "ij", "u", "c"]]
+            expected = Table.from_rows ["foo", "bar_0", "bar_1_1", "bar_2", "bar_1"] expected_rows
+            action = t.split_to_columns "bar" "b" on_problems=_
+            tester = t-> t.should_equal_verbose expected
+            problems = [Duplicate_Output_Column_Names.Error ["bar_1"]]
+            Problems.test_problem_handling action problems tester
+
+        Test.specify "tokenize will make column names unique" <|
+            cols = [["foo", [0, 1, 2]], ["bar", ["a12b34r5", "23", "2r4r55"]], ["bar_1", ["a", "b", "c"]]]
+            t = Table.new cols
+            expected_rows = [[0, "12", "34", "5", "a"], [1, "23", Nothing, Nothing, "b"], [2, "2", "4", "55", "c"]]
+            expected = Table.from_rows ["foo", "bar_0", "bar_1_1", "bar_2", "bar_1"] expected_rows
+            action = t.tokenize_to_columns "bar" "\d+" on_problems=_
+            tester = t-> t.should_equal_verbose expected
+            problems = [Duplicate_Output_Column_Names.Error ["bar_1"]]
+            Problems.test_problem_handling action problems tester
+
+    Test.group "Table.split/tokenize column order" <|
+        Test.specify "preserves column order" <|
+            cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]], ["baz", [1, 2, 3]]]
+            t = Table.new cols
+            expected_rows = [[0, "a", "c", Nothing, 1], [1, "c", "d", "ef", 2], [2, "gh", "ij", "u", 3]]
+            expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2", "baz"] expected_rows
+            t2 = t.split_to_columns "bar" "b"
+            t2.should_equal_verbose expected
+
+main = Test_Suite.run_main spec
diff --git a/test/Table_Tests/src/Util.enso b/test/Table_Tests/src/Util.enso
index 01a9b01458aa..048d59261506 100644
--- a/test/Table_Tests/src/Util.enso
+++ b/test/Table_Tests/src/Util.enso
@@ -13,6 +13,16 @@ Table.should_equal self expected =
     self_cols.map .name . should_equal (that_cols.map .name) frames_to_skip=1
     self_cols.map .to_vector . should_equal (that_cols.map .to_vector) frames_to_skip=1
 
+Table.should_equal_verbose self expected =
+    tables_equal t0 t1 =
+        same_headers = (t0.columns.map .name) == (t1.columns.map .name)
+        same_columns = (t0.columns.map .to_vector) == (t1.columns.map .to_vector)
+        same_headers && same_columns
+    equal = tables_equal self expected
+    if equal.not then
+        msg = 'Tables differ.\nActual:\n' + self.display + '\nExpected:\n' + expected.display
+        Test.fail msg
+
 Column.should_equal self expected =
     if self.name != expected.name then
         Test.fail "Expected column name "+expected.name+", but got "+self.name+"."