Skip to content

Commit

Permalink
Update drop_missing_rows to filter_blank_rows API. (#3805)
Browse files Browse the repository at this point in the history
  • Loading branch information
radeusgd authored Oct 18, 2022
1 parent a53fbc7 commit 17f7398
Show file tree
Hide file tree
Showing 17 changed files with 220 additions and 60 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,8 @@
- [Reimplemented `Duration` as a built-in type.][3759]
- [Implemented `Table.replace_text` for in-memory table.][3793]
- [Extended `Filter_Condition` with `Is_In` and `Not_In`.][3790]
- [Replaced `Table.drop_missing_rows` with `filter_blank_rows`, which has an
  updated API.][3805]

[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
Expand Down Expand Up @@ -341,6 +343,7 @@
[3759]: https://github.com/enso-org/enso/pull/3759
[3793]: https://github.com/enso-org/enso/pull/3793
[3790]: https://github.com/enso-org/enso/pull/3790
[3805]: https://github.com/enso-org/enso/pull/3805

#### Enso Compiler

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,12 @@ type Column
is_missing : Column
is_missing self = self.make_unary_op "IS_NULL" new_type=SQL_Type.boolean

## UNSTABLE
   Returns a column of booleans, with `True` items at the positions where
   this column contains a NaN. This is only applicable to double columns.

   The check is emitted as the `IS_NAN` SQL operation. That operation is not
   part of the base dialect's operation map; it is registered separately by a
   dialect-specific generator, so backends whose dialect does not add it
   presumably cannot execute this method - TODO confirm per dialect.
is_nan : Column
is_nan self = self.make_unary_op "IS_NAN" new_type=SQL_Type.boolean

## PRIVATE
Returns a column of booleans, with `True` items at the positions where
this column contains an empty string or `Nothing`.
Expand Down
22 changes: 16 additions & 6 deletions distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso
Original file line number Diff line number Diff line change
Expand Up @@ -784,12 +784,22 @@ type Table
msg = "Parsing values is not supported in database tables, the table has to be materialized first with `read`."
Error.throw (Unsupported_Database_Operation_Error_Data msg)

## DEPRECATED Will be replaced with `filter_incomplete_rows`.
drop_missing_rows : Table
drop_missing_rows self =
filters = self.columns.map (c -> c.is_missing.not.expression)
new_ctx = self.context.set_where_filters (self.context.where_filters + filters)
self.updated_context new_ctx
## ALIAS dropna
   ALIAS drop_missing_rows
   Remove rows that are entirely blank or, optionally, rows that contain any
   blank value.

   Arguments:
   - when_any: If `True`, a row is removed as soon as any of its values is
     blank. If `False`, a row is removed only when all of its values are
     blank.
   - treat_nans_as_blank: If `True`, `Number.nan` also counts as blank.

   ? Blank values
     Blank values are `Nothing`, `""` and, when `treat_nans_as_blank` is
     set, `Number.nan`.
filter_blank_rows : Boolean -> Boolean -> Table
filter_blank_rows self when_any=False treat_nans_as_blank=False =
    # The shared helper only applies the empty-text check to columns accepted
    # by the first predicate and the NaN check to columns accepted by the
    # second one; here both are decided from the column's SQL type.
    holds_text column = column.sql_type.is_definitely_text
    holds_double column = column.sql_type.is_definitely_double
    Table_Helpers.filter_blank_rows self holds_text holds_double when_any treat_nans_as_blank

## DEPRECATED Will be replaced with `Incomplete_Columns` selector (to be used with `remove_columns`).
drop_missing_columns : Table
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -174,22 +174,17 @@ base_dialect =
compare = [bin "=", bin "!=", bin "<", bin ">", bin "<=", bin ">=", ["BETWEEN", make_between]]
agg = [fun "MAX", fun "MIN", fun "AVG", fun "SUM"]
counts = [fun "COUNT", ["COUNT_ROWS", make_constant "COUNT(*)"]]
text = [["IS_EMPTY", make_is_empty], bin "LIKE"]
text = [is_empty, bin "LIKE"]
nulls = [["IS_NULL", make_right_unary_op "IS NULL"], ["FILL_NULL", make_function "COALESCE"]]
contains = [["IS_IN", make_is_in]]
base_map = Map.from_vector (arith + logic + compare + agg + counts + text + nulls + contains)
Internal_Dialect.Value base_map wrap_in_quotes

## PRIVATE
make_is_empty : Vector Builder -> Builder
make_is_empty arguments = case arguments.length of
1 ->
arg = arguments.at 0
is_null = (arg ++ " IS NULL").paren
is_empty = (arg ++ " = ''").paren
(is_null ++ " OR " ++ is_empty).paren
_ ->
Error.throw <| Illegal_State_Error_Data ("Invalid amount of arguments for operation IS_EMPTY")
is_empty = lift_unary_op "IS_EMPTY" arg->
    # A value counts as empty when it is NULL or the empty string; each
    # sub-condition is parenthesised before being joined with OR.
    null_check = (arg ++ " IS NULL").paren
    blank_check = (arg ++ " = ''").paren
    (null_check ++ " OR " ++ blank_check).paren

## PRIVATE
make_between : Vector Builder -> Builder
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,12 @@ make_internal_generator_dialect =
cases = [["LOWER", Base_Generator.make_function "LOWER"], ["UPPER", Base_Generator.make_function "UPPER"]]
text = [starts_with, contains, ends_with, agg_shortest, agg_longest]+concat_ops+cases
counts = [agg_count_is_null, agg_count_empty, agg_count_not_empty, ["COUNT_DISTINCT", agg_count_distinct], ["COUNT_DISTINCT_INCLUDE_NULL", agg_count_distinct_include_null]]
arith_extensions = [is_nan]

stddev_pop = ["STDDEV_POP", Base_Generator.make_function "stddev_pop"]
stddev_samp = ["STDDEV_SAMP", Base_Generator.make_function "stddev_samp"]
stats = [agg_median, agg_mode, agg_percentile, stddev_pop, stddev_samp]
my_mappings = text + counts + stats + first_last_aggregators
my_mappings = text + counts + stats + first_last_aggregators + arith_extensions
Base_Generator.base_dialect . extend_with my_mappings

## PRIVATE
Expand Down Expand Up @@ -247,3 +248,7 @@ make_order_descriptor internal_column sort_direction text_ordering =
Order_Descriptor.Value folded_expression sort_direction nulls_order=nulls collation=Nothing
False ->
Order_Descriptor.Value internal_column.expression sort_direction nulls_order=nulls collation=Nothing

## PRIVATE
   SQL generator for the `IS_NAN` operation: compares the argument with the
   `double precision 'NaN'` literal. This relies on the backend treating NaN
   as equal to NaN, as PostgreSQL documents for its floating-point types.
is_nan = Base_Generator.lift_unary_op "IS_NAN" arg->
    nan_comparison = arg ++ " = double precision 'NaN'"
    nan_comparison.paren
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import Standard.Examples

example_drop_missing_rows =
Examples.inventory_table.drop_missing_rows
Examples.inventory_table.filter_blank_rows when_any=True

> Example
Remove any columns that contain missing values from the table.
Expand Down
31 changes: 24 additions & 7 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso
Original file line number Diff line number Diff line change
Expand Up @@ -483,13 +483,23 @@ type Column

example_is_missing = Examples.decimal_column.is_missing
is_missing : Column
is_missing self = run_vectorized_unary_op self "is_missing" (== Nothing)
is_missing self = run_vectorized_unary_op self "is_missing" (== Nothing) on_missing=True

## UNSTABLE
   Produces a column of booleans that is `True` exactly at the positions
   where this column holds a NaN. This is only applicable to double columns.
is_nan : Column
is_nan self =
    # Fallback used when no vectorized `is_nan` operation is available: only
    # `Decimal` values can be NaN, any other value maps to `False`.
    nan_check value = case value of
        _ : Decimal -> value.is_nan
        _ -> False
    # Missing cells are reported as `False` rather than as missing.
    run_vectorized_unary_op self "is_nan" nan_check on_missing=False

## PRIVATE
Returns a column of booleans, with `True` items at the positions where
this column contains an empty string or `Nothing`.
is_empty : Column
is_empty self = run_vectorized_unary_op self "is_empty" Filter_Condition.Is_Empty.to_predicate
is_empty self = run_vectorized_unary_op self "is_empty" Filter_Condition.Is_Empty.to_predicate on_missing=True

## Returns a column of booleans, with `True` items at the positions where
this column does not contain a `Nothing`.
Expand Down Expand Up @@ -654,6 +664,10 @@ type Column

Arguments:
- function: The function to apply to each element of `self` column.
- on_missing: The value to return for missing cells. Ideally it should be
replaced with a `skip_nulls` parameter like elsewhere, but currently
that is not possible due to a bug:
https://github.com/oracle/graal/issues/4741

> Example
Multiply each element of the column by itself.
Expand All @@ -662,10 +676,10 @@ type Column

example_map = Examples.integer_column.map (x -> x * x)
map : (Any -> Any) -> Any -> Column
map self function on_missing=Nothing =
    storage = self.java_column.getStorage
    index = self.java_column.getIndex
    # Passing `Nothing` as the operation name means no vectorized variant is
    # ever used, so `function` is applied to every non-missing cell and
    # `on_missing` is placed wherever the input cell is missing.
    new_st = storage.map Nothing function on_missing
    col = Java_Column.new "Result" index new_st
    Column_Data col

Expand Down Expand Up @@ -1219,11 +1233,14 @@ run_vectorized_binary_op column name fallback_fn operand skip_nulls=True = case
- column: The column to execute the operation over.
- name: The name of the vectorized operation.
- fallback_fn: A function used if the vectorized operation isn't available.
run_vectorized_unary_op : Column -> Text -> (Any -> Any) -> Column
run_vectorized_unary_op column name fallback_fn =
- on_missing: The value to return for missing cells. Ideally it should be
replaced with a `skip_nulls` parameter like elsewhere, but currently that
is not possible due to a bug: https://github.com/oracle/graal/issues/4741
run_vectorized_unary_op : Column -> Text -> (Any -> Any) -> Any -> Column
run_vectorized_unary_op column name fallback_fn on_missing=Nothing =
s = column.java_column.getStorage
ix = column.java_column.getIndex
rs = s.map name fallback_fn
rs = s.map name fallback_fn on_missing
Column.Column_Data (Java_Column.new "Result" ix rs)

## PRIVATE
Expand Down
43 changes: 28 additions & 15 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import project.Data.Value_Type.Value_Type
import project.Data.Sort_Column_Selector.Sort_Column_Selector
import project.Data.Sort_Column.Sort_Column
import project.Data.Aggregate_Column.Aggregate_Column
import project.Data.Storage.Storage
import project.Internal.Table_Helpers
import project.Internal.Aggregate_Column_Helper
import project.Internal.Parse_Values_Helper
Expand Down Expand Up @@ -768,7 +769,7 @@ type Table
> Example
Replace dashes with underscores in a column named "variable_names".

table.replace_text "variable_names" "-" "_"
table.replace_text "variable_names" "-" "_"

> Example
Remove leading and trailing spaces from cells in multiple columns.
Expand All @@ -782,7 +783,7 @@ type Table
replace_text : (Text | Integer | Column_Selector) -> Text -> Text -> Matching_Mode | Regex_Mode -> (Text_Matcher | Regex_Matcher) -> Problem_Behavior -> Table
replace_text self columns=(Column_Selector.By_Index [0]) term="" new_text="" mode=Regex_Mode.All matcher=Text_Matcher.Case_Sensitive on_problems=Problem_Behavior.Report_Warning = if term.is_empty then self else
problem_builder = Problem_Builder.new

selector = case columns of
_ : Column_Selector -> columns
name : Text -> Column_Selector.By_Name [name]
Expand All @@ -806,11 +807,11 @@ type Table

new_columns = self.columns.map column->
is_selected = selected_names.get_or_else column.name False
if is_selected then transform column else column
if is_selected then transform column else column

result = Table.new new_columns
problem_builder.attach_problems_after on_problems result

## ALIAS Filter Rows

Selects only the rows of this table that correspond to `True` values of
Expand Down Expand Up @@ -1002,16 +1003,28 @@ type Table
Table.Table_Data t ->
Table.Table_Data (self.java_table.join t drop_unmatched on left_suffix right_suffix)

## DEPRECATED Will be replaced with `filter_incomplete_rows`.
drop_missing_rows : Table
drop_missing_rows self =
cols = self.columns
case cols.not_empty of
True ->
any_missing_mask = cols.map .is_missing . reduce (||)
non_missing_mask = any_missing_mask.not
self.filter non_missing_mask
False -> self
## ALIAS dropna
   ALIAS drop_missing_rows
   Remove rows that are entirely blank or, optionally, rows that contain any
   blank value.

   Arguments:
   - when_any: If `True`, a row is removed as soon as any of its values is
     blank. If `False`, a row is removed only when all of its values are
     blank.
   - treat_nans_as_blank: If `True`, `Number.nan` also counts as blank.

   ? Blank values
     Blank values are `Nothing`, `""` and, when `treat_nans_as_blank` is
     set, `Number.nan`.
filter_blank_rows : Boolean -> Boolean -> Table
filter_blank_rows self when_any=False treat_nans_as_blank=False =
    # Text may live in `Text` storage or in mixed `Any` storage.
    holds_text column = case column.storage_type of
        Storage.Text -> True
        Storage.Any -> True
        _ -> False
    # Doubles (and therefore NaN) may live in `Decimal` storage or in mixed
    # `Any` storage.
    holds_double column = case column.storage_type of
        Storage.Decimal -> True
        Storage.Any -> True
        _ -> False
    Table_Helpers.filter_blank_rows self holds_text holds_double when_any treat_nans_as_blank

## DEPRECATED Will be replaced with `Incomplete_Columns` selector (to be used with `remove_columns`).
drop_missing_columns : Table
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -554,3 +554,24 @@ select_columns_by_column_reference internal_columns column_selectors problem_bui
column = column_extractor selector
column.name
select_columns_by_name internal_columns column_selectors Text_Matcher.Case_Sensitive problem_builder name_extractor

## PRIVATE
   Shared logic for building the row filter that removes blank rows.

   Arguments:
   - table: The table to filter; it is returned unchanged when it has no
     columns.
   - can_contain_text: Predicate telling whether a column may hold text. Such
     columns are checked with `is_empty`, all others with `is_missing`.
   - can_contain_double: Predicate telling whether a column may hold doubles.
     It is only consulted when `treat_nans_as_blank` is `True`.
   - when_any: If `True`, per-column blank masks are combined with `||` (a
     row is dropped if any value is blank), otherwise with `&&` (a row is
     dropped only if all values are blank).
   - treat_nans_as_blank: If `True`, NaN values in double-capable columns
     count as blank too.
filter_blank_rows : Table -> (Column -> Boolean) -> (Column -> Boolean) -> Boolean -> Boolean -> Table
filter_blank_rows table can_contain_text can_contain_double when_any treat_nans_as_blank =
    columns = table.columns
    if columns.is_empty then table else
        # Boolean mask of the blank cells of a single column.
        blank_mask_for column =
            base_mask = if can_contain_text column then column.is_empty else column.is_missing
            include_nans = treat_nans_as_blank && (can_contain_double column)
            if include_nans then base_mask || column.is_nan else base_mask
        combine = if when_any then (||) else (&&)
        blank_rows = columns.map blank_mask_for . reduce combine
        # Keep exactly the rows that were not classified as blank.
        rows_to_keep = blank_rows.not
        table.filter rows_to_keep
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ public int getType() {
public Boolean getItemBoxed(int idx) {
  // Missing cells are reported as null instead of a boolean value.
  if (isMissing.get(idx)) {
    return null;
  }
  return getItem(idx);
}

public boolean getItem(long idx) {
  // The stored bit is flipped when this storage is negated.
  boolean storedBit = values.get((int) idx);
  return storedBit != negated;
}
Expand Down Expand Up @@ -291,6 +292,13 @@ public BoolStorage runZip(BoolStorage storage, Storage<?> arg) {
}
}
})
.add(
new UnaryMapOperation<>(Maps.IS_MISSING) {
@Override
public BoolStorage run(BoolStorage storage) {
  // The input's missing-value bit set is passed as the first constructor argument and a fresh,
  // empty BitSet as the second. NOTE(review): presumably the constructor order is
  // (values, isMissing, size, negated), so the result is true exactly where the input is
  // missing and has no missing cells itself - confirm against the BoolStorage constructor.
  return new BoolStorage(storage.isMissing, new BitSet(), storage.size, false);
}
})
.add(new BooleanIsInOp());
return ops;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,19 @@ public BoolStorage run(DoubleStorage storage) {
return new BoolStorage(storage.isMissing, new BitSet(), storage.size, false);
}
})
.add(
new UnaryMapOperation<>(Maps.IS_NAN) {
@Override
public BoolStorage run(DoubleStorage storage) {
  // Collect the positions of present (non-missing) cells whose value is NaN; missing cells are
  // skipped and therefore never flagged.
  BitSet nanMask = new BitSet();
  for (int row = 0; row < storage.size; row++) {
    if (storage.isNa(row)) {
      continue;
    }
    if (Double.isNaN(storage.getItem(row))) {
      nanMask.set(row);
    }
  }
  return new BoolStorage(nanMask, new BitSet(), storage.size, false);
}
})
.add(
SpecializedIsInOp.make(
list -> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ public static final class Maps {
public static final String AND = "&&";
public static final String OR = "||";
public static final String IS_MISSING = "is_missing";
public static final String IS_NAN = "is_nan";
public static final String IS_EMPTY = "is_empty";
public static final String STARTS_WITH = "starts_with";
public static final String ENDS_WITH = "ends_with";
Expand Down Expand Up @@ -169,17 +170,19 @@ public final Aggregator getAggregator(
* @param name a name of potential vectorized variant of the function that should be used if
* supported. If this argument is null, the vectorized operation will never be used.
* @param function the function to run.
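* @param onMissing the value to place for missing cells, usually just null; it is only used by
*     the non-vectorized fallback, a vectorized operation handles missing cells on its own
* @return the result of running the function on all non-missing elements, with {@code onMissing}
*     placed at the missing positions when the fallback is used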
*/
public final Storage<?> map(String name, Function<Object, Value> function) {
public final Storage<?> map(String name, Function<Object, Value> function, Value onMissing) {
if (name != null && isOpVectorized(name)) {
return runVectorizedMap(name, null);
}
Object missingValue = onMissing == null ? null : Polyglot_Utils.convertPolyglotValue(onMissing);
Builder builder = new InferredBuilder(size());
for (int i = 0; i < size(); i++) {
Object it = getItemBoxed(i);
if (it == null) {
builder.appendNoGrow(null);
builder.appendNoGrow(missingValue);
} else {
Value result = function.apply(it);
Object converted = Polyglot_Utils.convertPolyglotValue(result);
Expand Down
Loading

0 comments on commit 17f7398

Please sign in to comment.