enso-org · mergify · Apr 14, 2023 · Apr 5, 2023 · Apr 5, 2023 · Apr 6, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -381,6 +381,7 @@
   methods.][6176]
 - [Implemented `Table.union` for the Database backend.][6204]
 - [Array & Vector have the same methods & behavior][6218]
+- [Implemented `Table.split` and `Table.tokenize` for in-memory tables.][6233]
 
 [debug-shortcuts]:
   https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -578,6 +579,7 @@
 [6204]: https://github.com/enso-org/enso/pull/6204
 [6077]: https://github.com/enso-org/enso/pull/6077
 [6218]: https://github.com/enso-org/enso/pull/6218
+[6233]: https://github.com/enso-org/enso/pull/6233
 
 #### Enso Compiler
 

@@ -1392,6 +1392,80 @@ type Table
         msg = "Parsing values is not supported in database tables, the table has to be materialized first with `read`."
         Error.throw (Unsupported_Database_Operation.Error msg)
 
+    ## Splits a column of text into a set of new columns.
+       The original column will be removed from the table.
+       The new columns will be named with the name of the input column with an
+       incrementing number after. Not yet implemented for Database tables.
+
+       Arguments:
+       - column: The name or index of the column to split the text of.
+       - delimiter: The term or terms used to split the text.
+       - column_count: The number of columns to split to.
+         If `Nothing` then columns will be added to fit all data.
+       - on_problems: Specifies the behavior when a problem occurs.
+
+       ! Error Conditions
+         If the data exceeds the `column_count`, a `Column_Count_Exceeded` will
+         be reported according to the `on_problems` behavior.
+    split_to_columns : Text | Integer -> Text -> Integer | Nothing -> Problem_Behavior -> Table
+    split_to_columns self column delimiter="," column_count=Nothing on_problems=Report_Error =
+        _ = [column delimiter column_count on_problems]
+        Error.throw (Unsupported_Database_Operation.Error "Table.split_to_columns is not implemented yet for the Database backends.")
+
+    ## Splits a column of text into a set of new rows.
+       The values of other columns are repeated for the new rows.
+       Not yet implemented for Database tables.
+
+       Arguments:
+       - column: The name or index of the column to split the text of.
+       - delimiter: The term or terms used to split the text.
+    split_to_rows : Text | Integer -> Text -> Table
+    split_to_rows self column delimiter="," =
+        _ = [column delimiter]
+        Error.throw (Unsupported_Database_Operation.Error "Table.split_to_rows is not implemented yet for the Database backends.")
+
+    ## Tokenizes a column of text into a set of new columns using a regular
+       expression.
+       If the pattern contains marked groups, the values are concatenated
+       together; otherwise the whole match is returned.
+       The original column will be removed from the table.
+       The new columns will be named with the name of the input column with an
+       incrementing number after. Not yet implemented for Database tables.
+
+       Arguments:
+       - column: The name or index of the column to tokenize the text of.
+       - pattern: The regular expression to search for within the text.
+       - case_sensitivity: Specifies if the text values should be compared case
+         sensitively.
+       - column_count: The number of columns to split to.
+         If `Nothing` then columns will be added to fit all data.
+       - on_problems: Specifies the behavior when a problem occurs.
+
+       ! Error Conditions
+         If the data exceeds the `column_count`, a `Column_Count_Exceeded` will
+         be reported according to the `on_problems` behavior.
+    tokenize_to_columns : Text | Integer -> Text -> Case_Sensitivity -> Integer | Nothing -> Problem_Behavior -> Table
+    tokenize_to_columns self column pattern="." case_sensitivity=Case_Sensitivity.Sensitive column_count=Nothing on_problems=Report_Error =
+        _ = [column pattern case_sensitivity column_count on_problems]
+        Error.throw (Unsupported_Database_Operation.Error "Table.tokenize_to_columns is not implemented yet for the Database backends.")
+
+    ## Tokenizes a column of text into a set of new rows using a regular
+       expression.
+       If the pattern contains marked groups, the values are concatenated
+       together; otherwise the whole match is returned.
+       The values of other columns are repeated for the new rows.
+       Not yet implemented for Database tables.
+
+       Arguments:
+       - column: The name or index of the column to tokenize the text of.
+       - pattern: The regular expression to search for within the text.
+       - case_sensitivity: Specifies if the text values should be compared case
+         sensitively.
+    tokenize_to_rows : Text | Integer -> Text -> Case_Sensitivity -> Table
+    tokenize_to_rows self column pattern="." case_sensitivity=Case_Sensitivity.Sensitive =
+        _ = [column pattern case_sensitivity]
+        Error.throw (Unsupported_Database_Operation.Error "Table.tokenize_to_rows is not implemented yet for the Database backends.")
+
     ## PRIVATE
        UNSTABLE
        Cast the selected columns to a specific type.

@@ -29,6 +29,7 @@ import project.Internal.Join_Helpers
 import project.Internal.Naming_Helpers.Naming_Helpers
 import project.Internal.Parse_Values_Helper
 import project.Internal.Problem_Builder.Problem_Builder
+import project.Internal.Split_Tokenize
 import project.Internal.Table_Helpers
 import project.Internal.Table_Helpers.Table_Column_Helper
 import project.Internal.Unique_Name_Strategy.Unique_Name_Strategy
@@ -918,6 +919,76 @@ type Table
         result = Table.new new_columns
         problem_builder.attach_problems_after on_problems result
 
+    ## Splits a column of text into a set of new columns.
+       The original column will be removed from the table.
+       The new columns will be named with the name of the input column with an
+       incrementing number after.
+
+       Arguments:
+       - column: The name or index of the column to split the text of.
+       - delimiter: The term or terms used to split the text.
+       - column_count: The number of columns to split to.
+         If `Nothing` then columns will be added to fit all data.
+       - on_problems: Specifies the behavior when a problem occurs.
+
+       ! Error Conditions
+         If the data exceeds the `column_count`, a `Column_Count_Exceeded` will
+         be reported according to the `on_problems` behavior.
+    split_to_columns : Text | Integer -> Text -> Integer | Nothing -> Problem_Behavior -> Table
+    split_to_columns self column delimiter="," column_count=Nothing on_problems=Report_Error =
+        Split_Tokenize.split_to_columns self column delimiter column_count on_problems
+
+    ## Splits a column of text into a set of new rows.
+       The values of other columns are repeated for the new rows.
+
+       Arguments:
+       - column: The name or index of the column to split the text of.
+       - delimiter: The term or terms used to split the text.
+         Defaults to a comma (`,`).
+    split_to_rows : Text | Integer -> Text -> Table
+    split_to_rows self column delimiter="," =
+        Split_Tokenize.split_to_rows self column delimiter
+
+    ## Tokenizes a column of text into a set of new columns using a regular
+       expression.
+       If the pattern contains marked groups, the values are concatenated
+       together; otherwise the whole match is returned.
+       The original column will be removed from the table.
+       The new columns will be named with the name of the input column with an
+       incrementing number after.
+
+       Arguments:
+       - column: The name or index of the column to tokenize the text of.
+       - pattern: The regular expression to search for within the text.
+       - case_sensitivity: Specifies if the text values should be compared case
+         sensitively.
+       - column_count: The number of columns to split to.
+         If `Nothing` then columns will be added to fit all data.
+       - on_problems: Specifies the behavior when a problem occurs.
+
+       ! Error Conditions
+         If the data exceeds the `column_count`, a `Column_Count_Exceeded` will
+         be reported according to the `on_problems` behavior.
+    tokenize_to_columns : Text | Integer -> Text -> Case_Sensitivity -> Integer | Nothing -> Problem_Behavior -> Table
+    tokenize_to_columns self column pattern="." case_sensitivity=Case_Sensitivity.Sensitive column_count=Nothing on_problems=Report_Error =
+        Split_Tokenize.tokenize_to_columns self column pattern case_sensitivity column_count on_problems
+
+    ## Tokenizes a column of text into a set of new rows using a regular
+       expression.
+       If the pattern contains marked groups, the values are concatenated
+       together; otherwise the whole match is returned.
+       The values of other columns are repeated for the new rows.
+
+       Arguments:
+       - column: The name or index of the column to tokenize the text of.
+       - pattern: The regular expression to search for within the text.
+         Defaults to `.`.
+       - case_sensitivity: Specifies if the text values should be compared case
+         sensitively.
+    tokenize_to_rows : Text | Integer -> Text -> Case_Sensitivity -> Table
+    tokenize_to_rows self column pattern="." case_sensitivity=Case_Sensitivity.Sensitive =
+        Split_Tokenize.tokenize_to_rows self column pattern case_sensitivity
+
     ## ALIAS Filter Rows
 
        Selects only the rows of this table that correspond to `True` values of

@@ -552,3 +552,16 @@ type Invalid_Value_For_Type
     to_display_text : Text
     to_display_text self =
         "The value ["+self.value.to_text+"] is not valid for the column type ["+self.value_type.to_text+"]."
+
+type Column_Count_Exceeded
+    ## PRIVATE
+       Indicates that an operation generating new columns produced more columns
+       than allowed by the limit.
+    Error (limit : Integer) (column_count : Integer)
+
+    ## PRIVATE
+       Create a human-readable version of the error, stating the limit and the
+       number of columns produced, and naming the argument that lifts the limit.
+    to_display_text : Text
+    to_display_text self =
+        "The operation produced more columns than the specified limit. The limit is "+self.limit.to_text+" and the number of new columns was "+self.column_count.to_text+". The limit may be turned off by setting the `column_count` argument to `Nothing`."