Update Table API with new filter design (#3750)

Implements https://www.pivotaltracker.com/story/show/183389855

# Important Notes

Implements basic filter operations both in-memory and for the Database backend, ensuring that existing tests can be adapted and keep working. Not all `Filter_Condition`s are implemented yet. Also implements a significant part of https://www.pivotaltracker.com/story/show/183390314
enso-org · Oct 5, 2022 · 503d3eb · 503d3eb
1 parent 44a031f
commit 503d3eb
Show file tree

Hide file tree

Showing 19 changed files with 635 additions and 195 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -205,6 +205,7 @@
   `add_work_days` which allows to shift a date by a number of work days.][3726]
 - [Added `query` and `read` functions to Database connections.][3727]
 - [Added `Date_Period.Week` to `start_of` and `end_of` methods.][3733]
+- [Replaced `Table.where` with a new API relying on `Table.filter`.][3750]
 
 [debug-shortcuts]:
   https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -328,6 +329,7 @@
 [3727]: https://github.com/enso-org/enso/pull/3727
 [3733]: https://github.com/enso-org/enso/pull/3733
 [3749]: https://github.com/enso-org/enso/pull/3749
+[3750]: https://github.com/enso-org/enso/pull/3750
 
 #### Enso Compiler
 

diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Error/Problem_Behavior.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Error/Problem_Behavior.enso
@@ -122,3 +122,21 @@ type Problem_Behavior
         Report_Error -> case decorated_value of
             _ -> if problems.is_empty then decorated_value else
                 Error.throw problems.first
+
+    ## ADVANCED
+       UNSTABLE
+       Applies this problem behavior to a `result` that may be a dataflow
+       error.
+
+       A `result` that does not contain an error of the selected type is
+       returned unchanged. Otherwise the outcome depends on the behavior:
+       - `Report_Error`: the erroneous `result` is passed on as-is.
+       - `Report_Warning`: the `fallback` value is returned with the error
+         attached to it as a warning.
+       - `Ignore`: the `fallback` value is returned and the error is discarded.
+
+       Arguments:
+       - result: The value to inspect for errors.
+       - fallback: The replacement value. It is a suspended argument, so it is
+         only computed by the branches that use it.
+       - error_type: The type of errors to catch. By default `Any` error is
+         caught.
+    handle_errors : Any -> Any -> Any -> Any
+    handle_errors self result ~fallback error_type=Any =
+        on_error caught = case self of
+            Report_Error -> result
+            Report_Warning -> Warning.attach caught fallback
+            Ignore -> fallback
+        result.catch error_type on_error
diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso
@@ -3,9 +3,11 @@ from Standard.Base import all
 import Standard.Database.Internal.Helpers
 import Standard.Database.Internal.IR
 import Standard.Database.Data.Table
+from Standard.Table import Filter_Condition
 import Standard.Table.Data.Column as Materialized_Column
 import Standard.Table.Data.Sort_Column_Selector
 import Standard.Table.Data.Sort_Column
+from Standard.Table.Data.Value_Type import Value_Type
 
 from Standard.Database.Data.SQL import SQL_Type, Statement
 from Standard.Database.Data.Table import Integrity_Error
@@ -92,6 +94,21 @@ type Column
         without_ix = self.to_table.set_index []
         without_ix . read  . at self.name . to_vector
 
+    ## UNSTABLE TODO this is a very early prototype that will be revisited later
+       Gives a rough `Value_Type` for this column, derived only from its SQL
+       type. It exists mainly so that the types can be used in `filter`; in
+       many cases it is not even a decent approximation of the true type. It
+       will be improved when the types work is implemented.
+    value_type : Value_Type
+    value_type self =
+        sql_type = self.sql_type
+        if sql_type.is_definitely_boolean then Value_Type.Boolean else
+            if sql_type.is_definitely_text then Value_Type.Char else
+                ## TODO integers could be reported here as well, but that
+                   requires checking how many bits they have. This is out of
+                   scope for the prototype, especially as the way of checking
+                   the type is likely to change. So this just falls back to
+                   unsupported (abusing it slightly).
+                Value_Type.Unsupported_Data_Type sql_type.name
     ## UNSTABLE
 
        Returns an SQL statement that will be used for materializing this column.
@@ -211,13 +228,15 @@ type Column
 
        Returns the number of missing items in this column.
     count_missing : Integer
-    count_missing self = self.where self.is_missing . length
+    count_missing self =
+        # Converts this column to a table, keeps only the rows where the column
+        # at index 0 is `Nothing`, and counts them.
+        # NOTE(review): index 0 is presumably this column itself - confirm that
+        # `to_table` does not place any other column first.
+        self.to_table.filter 0 Filter_Condition.Is_Nothing . row_count
 
     ## UNSTABLE
 
        Returns the number of non-null items in this column.
     count : Integer
-    count self = self.where self.is_missing.not . length
+    count self =
+        # Converts this column to a table, keeps only the rows where the column
+        # at index 0 is not `Nothing`, and counts them.
+        # NOTE(review): index 0 is presumably this column itself - confirm that
+        # `to_table` does not place any other column first.
+        self.to_table.filter 0 Filter_Condition.Not_Nothing . row_count
 
     ## UNSTABLE
 
@@ -397,36 +416,6 @@ type Column
     fill_missing : Any -> Column
     fill_missing self default = self.make_binary_op "FILLNULL" default
 
-    ## UNSTABLE
-
-       Selects only the rows of this column that correspond to `True` values in
-       `filter`.
-
-       Arguments:
-       - filter: A column of booleans to mask `self` by.
-
-       This is useful for filtering the rows by given predicate.
-
-       > Example
-         Select only the rows of `my_column` where the `status_column` column
-         has the value `"Valid"`
-             my_column.where (status_column == "Valid")
-    where : Column -> Column
-    where self filter =
-        case Helpers.check_integrity self filter of
-            False ->
-                Error.throw (Integrity_Error "Column "+filter.name)
-            True ->
-                new_filters = self.context.where_filters + [filter.expression]
-                new_ctx = self.context.set_where_filters new_filters
-                Column_Data self.name self.connection self.sql_type self.expression new_ctx
-
-    ## UNSTABLE
-
-       Returns a new column without rows that had missing values.
-    drop_missing : Any -> Column
-    drop_missing self = self.where self.is_missing.not
-
     ## UNSTABLE
 
        Returns the same column with changed name.

diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso
@@ -7,6 +7,7 @@ import Standard.Database.Internal.IR
 from Standard.Database.Data.SQL_Query import Raw_SQL
 from Standard.Database.Data.SQL import Statement, SQL_Type
 
+from Standard.Table.Data.Filter_Condition import make_filter_column, Filter_Condition
 import Standard.Table.Data.Column as Materialized_Column
 import Standard.Table.Data.Table as Materialized_Table
 from Standard.Table import Auto_Detect, Aggregate_Column, Data_Formatter, Column_Name_Mapping, Sort_Column_Selector, Sort_Column, Match_Columns
@@ -341,30 +342,74 @@ type Table
             if Helpers.check_integrity self column then column else
                 Panic.throw (Integrity_Error_Data "Column "+column.name)
 
-    ## UNSTABLE
+    ## ALIAS Filter Rows
 
-       Selects only the rows of this table that correspond to `True` values in
+       Selects only the rows of this table that correspond to `True` values of
        `filter`.
 
        Arguments:
-       - filter: A column of boolean values that will be used to mask the table
-         rows.
+       - column: The column to use for filtering. Can be a column name, index or
+         the `Column` object itself.
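+       - filter: The filter to apply to the column. It can either be an instance
+         of `Filter_Condition` or a predicate taking a cell value and returning
+         a boolean value indicating whether the corresponding row should be kept
+         or not. Predicates are not supported by the Database backend: passing
+         one results in an `Unsupported_Database_Operation_Error`.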
+       - on_problems: is the behavior to take when the filter cannot be applied.
+         The following happens in warning mode:
+         If a column name cannot be found, a `No_Such_Column` warning is raised
+         and the original table is returned.
+         If a column index is invalid, an `Index_Out_Of_Bounds_Error` warning is
+         raised and the original table is returned.
+         If the column is an invalid type for the filter, an
+         `Invalid_Value_Type` warning is raised and an empty table is returned.
+         In error mode, the first warning is returned as a dataflow error. In
+         ignore mode the same values are returned but without the warnings
+         attached.
+
+       > Example
+         Get people older than 30.
 
-       This is useful for filtering the rows by given predicate.
+             people.filter "Age" (Greater 30)
 
        > Example
-         Select only the rows of `my_table` where the `"Status"` column has the
-         value `"Valid"`
-             my_table.where (my_table.at "Status" == "Valid")
-    where : Column -> Table
-    where self filter =
-        case Helpers.check_integrity self filter of
-            False ->
-                Error.throw (Integrity_Error_Data "Column "+filter.name)
-            True ->
-                new_filters = self.context.where_filters + [filter.expression]
-                new_ctx = self.context.set_where_filters new_filters
-                self.updated_context new_ctx
+         Filter people between 30 and 40.
+
+             people.filter "Age" (Between 30 40)
+
+       > Example
+         Select rows where more than 50% of the stock is sold.
+
+             table.filter "sold_stock" (Greater (table.at "total_stock" / 2))
+
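+       > Example
+         Select people celebrating a jubilee. Custom predicates like this one
+         are not supported by the Database backend; there, this call results in
+         an `Unsupported_Database_Operation_Error`.
+
+             people.filter "age" (age -> (age%10 == 0))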
+    filter : (Column | Text | Integer) -> (Filter_Condition|(Any->Boolean)) -> Problem_Behavior -> Table
+    filter self column filter=Filter_Condition.Is_True on_problems=Report_Warning = case column of
+       # A `Column` object was passed directly.
+       _ : Column ->
+           # Appends the expression of `filter_column` to the WHERE filters of
+           # this table's context. A column rejected by
+           # `Helpers.check_integrity` yields an `Integrity_Error_Data` error.
+           mask filter_column = case Helpers.check_integrity self filter_column of
+               False ->
+                   Error.throw (Integrity_Error_Data "Column "+filter_column.name)
+               True ->
+                   new_filters = self.context.where_filters + [filter_column.expression]
+                   new_ctx = self.context.set_where_filters new_filters
+                   self.updated_context new_ctx
+           # Dispatch on the kind of filter that was supplied.
+           # NOTE(review): there is no fallback branch, so a `filter` that is
+           # neither a `Filter_Condition` nor a `Function` matches nothing here.
+           case Meta.type_of filter of
+               Filter_Condition ->
+                    # Errors raised while building or applying the filter column
+                    # are handled according to `on_problems`; the fallback value
+                    # is this table with no rows.
+                    on_problems.handle_errors fallback=self.with_no_rows <|
+                        mask (make_filter_column column filter)
+               # Custom predicates are always rejected here, independently of
+               # `on_problems`.
+               Function -> Error.throw (Unsupported_Database_Operation_Error_Data "Filtering with a custom predicate is not supported in the database.")
+       # Anything else is resolved to a column with `self.at`. A failed lookup
+       # is handled according to `on_problems`; the `Nothing` fallback means
+       # that the table is returned without applying the filter.
+       _ -> case on_problems.handle_errors (self.at column) fallback=Nothing of
+            Nothing -> self
+            resolved_column -> self.filter resolved_column filter on_problems
+
+    ## PRIVATE
+
+       Returns this table with a constant-false condition (`1 = 2`) appended
+       to the WHERE filters of its context.
+    with_no_rows self =
+        one = IR.Constant SQL_Type.integer 1
+        two = IR.Constant SQL_Type.integer 2
+        always_false = IR.Operation "=" [one, two]
+        updated_filters = self.context.where_filters + [always_false]
+        self.updated_context (self.context.set_where_filters updated_filters)
 
     ## UNSTABLE
        Creates a new Table with the specified range of rows from the input
@@ -419,7 +464,7 @@ type Table
          numbers 1, 2, ..., 10, will return rows starting from 6 and not an empty
          result as one could expect if the limit was applied before the filters.
              t1 = table.order_by (Sort_Column_Selector.By_Name [Sort_Column.Name "A"]) . limit 5
-             t2 = t1.where (t1.at 'A' > 5)
+             t2 = t1.filter 'A' (Greater than=5)
              t2.read
     limit : Integer -> Table
     limit self max_rows =
@@ -743,20 +788,14 @@ type Table
         msg = "Parsing values is not supported in database tables, the table has to be materialized first with `read`."
         Error.throw (Unsupported_Database_Operation_Error_Data msg)
 
-    ## UNSTABLE
-
-       Returns a new Table without rows that contained missing values in any of
-       the columns.
+    ## DEPRECATED Will be replaced with `filter_incomplete_rows`.
+
+       Returns a new Table without rows that contained missing values in any of
+       the columns.
     drop_missing_rows : Table
     drop_missing_rows self =
         filters = self.columns.map (c -> c.is_missing.not.expression)
         new_ctx = self.context.set_where_filters (self.context.where_filters + filters)
         self.updated_context new_ctx
 
-    ## Returns a new Table without columns that contained any missing values.
-
-       This operation needs to actually materialize the underlying query in
-       order to know which columns to drop.
+    ## DEPRECATED Will be replaced with `Incomplete_Columns` selector (to be used with `remove_columns`).
     drop_missing_columns : Table
     drop_missing_columns self =
         rows_expr = IR.Operation "COUNT_ROWS" []

diff --git a/distribution/lib/Standard/Searcher/0.0.0-dev/src/Data_Science.enso b/distribution/lib/Standard/Searcher/0.0.0-dev/src/Data_Science.enso
@@ -42,7 +42,7 @@
          example_where =
              table = Examples.inventory_table
              mask = (table.at "sold_stock" > (table.at "total_stock" / 2))
-             table.where mask
+             table.filter mask
 
    > Example
      Sort the shop inventory based on the total stock, using the number sold to

diff --git a/distribution/lib/Standard/Searcher/0.0.0-dev/src/Data_Science/Transform.enso b/distribution/lib/Standard/Searcher/0.0.0-dev/src/Data_Science/Transform.enso
@@ -14,7 +14,7 @@
          example_where =
              table = Examples.inventory_table
              mask = (table.at "sold_stock" > (table.at "total_stock" / 2))
-             table.where mask
+             table.filter mask
 
    > Example
      Multiply each element of the column by itself.

diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso
@@ -4,6 +4,11 @@ import Standard.Base.Data.Index_Sub_Range
 
 import Standard.Table.Data.Table
 import Standard.Table.Data.Storage
+from Standard.Table.Data.Value_Type import Value_Type
+
+# TODO Dubious constructor export
+from Standard.Table.Data.Column.Column import all
+from Standard.Table.Data.Column.Column export all
 
 polyglot java import org.enso.table.data.table.Column as Java_Column
 polyglot java import org.enso.table.operations.OrderBuilder
@@ -24,10 +29,6 @@ polyglot java import org.enso.table.operations.OrderBuilder
 from_vector : Text -> Vector -> Column
 from_vector name items = Column_Data (Java_Column.fromItems name items.to_array)
 
-# TODO Dubious constructor export
-from project.Data.Column.Column import all
-from project.Data.Column.Column export all
-
 type Column
 
     ## PRIVATE
@@ -485,20 +486,6 @@ type Column
         col = Java_Column.new name index new_st
         Column_Data col
 
-    ## ALIAS Drop Missing
-
-       Returns a new column without rows that had missing values.
-
-       > Example
-         Drop missing values from a column.
-
-             import Standard.Examples
-
-             example_drop_missing = Examples.decimal_column.drop_missing
-    drop_missing : Any -> Column
-    drop_missing self =
-        self.where self.is_missing.not
-
     ## Checks for each element of the column if it starts with `other`.
 
        Arguments:
@@ -742,27 +729,6 @@ type Column
             if storage.isNa index then Nothing else
                 storage.getItem index
 
-    ## Selects only the rows of this column that correspond to `True` values in
-       `indexes`.
-
-       Arguments:
-       - indexes: A column containing boolean values that is used to mask
-         `self`.
-
-       This is useful for filtering the rows by given predicate.
-
-       > Example
-         Select only the rows of the column where the length of the text exceeds
-         2.
-
-             import Standard.Examples
-
-             example_where =
-                 Examples.text_column_1.where (Examples.text_column_1.map .length > 2)
-    where : Column -> Column
-    where self indexes =
-        Column_Data (self.java_column.mask indexes.java_column)
-
     ## Returns a vector containing all the elements in this column.
 
        > Example
@@ -788,6 +754,18 @@ type Column
         storage_types.at tp . catch Index_Out_Of_Bounds_Error _->
             Panic.throw (Illegal_State_Error "Unknown storage type: "+tp.to_text)
 
+    ## UNSTABLE TODO this is a prototype that will be revisited later on
+       Maps the storage type of this column to the corresponding `Value_Type`.
+    value_type : Value_Type
+    value_type self =
+        storage = self.storage_type
+        case storage of
+            Storage.Text -> Value_Type.Char
+            Storage.Integer -> Value_Type.Integer
+            Storage.Decimal -> Value_Type.Float
+            Storage.Boolean -> Value_Type.Boolean
+            Storage.Date -> Value_Type.Date
+            Storage.Time_Of_Day -> Value_Type.Time
+            Storage.Date_Time -> Value_Type.Date_Time
+            Storage.Any -> Value_Type.Mixed
+
     ## UNSTABLE
 
        Converts this column to JSON.
@@ -1008,8 +986,7 @@ type Column
         new_col = self.java_column.applyMask mask
         Column_Data new_col
 
-    ## UNSTABLE
-       Creates a new Column with the specified range of rows from the input
+    ## Creates a new Column with the specified range of rows from the input
        Column.
 
        Arguments:
@@ -1018,8 +995,7 @@ type Column
     take self range=(First 1) =
         Index_Sub_Range.take_helper self.length self.at self.slice (slice_ranges self) range
 
-    ## UNSTABLE
-       Creates a new Column from the input with the specified range of rows
+    ## Creates a new Column from the input with the specified range of rows
        removed.
 
        Arguments: