diff --git a/CHANGELOG.md b/CHANGELOG.md index a8629ad56b3f..dc50cadbf0e0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -57,6 +57,7 @@ existing functions][3292] - [Implemented `Text.to_case`, replacing `Text.to_lower_case` and `Text.to_upper_case`][3302] +- [Implemented initial `Table.group_by` function on Standard.Table][3305] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -88,6 +89,7 @@ [3287]: https://github.com/enso-org/enso/pull/3287 [3292]: https://github.com/enso-org/enso/pull/3292 [3302]: https://github.com/enso-org/enso/pull/3302 +[3305]: https://github.com/enso-org/enso/pull/3305 #### Enso Compiler diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Aggregate_Column.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Aggregate_Column.enso new file mode 100644 index 000000000000..dbc590c0e75f --- /dev/null +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Aggregate_Column.enso @@ -0,0 +1,215 @@ +from Standard.Base import all + +from Standard.Table.Data.Column as Column_Module import Column +import Standard.Table.Data.Group_By + +## Defines an Aggregate Column +type Aggregate_Column + ## Creates a new column with the row count of each group + type Count (name:Text|Nothing=Nothing) + + ## Creates a new column with the count of unique items in the selected + column(s) within each group. + type Count_Distinct (columns:Column|Text|Integer|[(Column|Text|Integer)]) (name:Text|Nothing=Nothing) + + ## ALIAS Count_Not_Null + + Creates a new column with the count of not `Nothing` (null) values of the + specified column within each group. + type Count_Not_Nothing (column:Column|Text|Integer) (name:Text|Nothing=Nothing) + + ## ALIAS Count_Null, Count_Missing + + Creates a new column with the count of `Nothing` (null) values of the + specified column within each group. 
+ type Count_Nothing (column:Column|Text|Integer) (name:Text|Nothing=Nothing) + + ## Creates a new column with the count of not `Nothing` (null) and non-empty + ("") values of the column within each group. + type Count_Not_Empty (column:Column|Text|Integer) (name:Text|Nothing=Nothing) + + ## Creates a new column with the count of `Nothing` (null) or empty ("") + text values of the column within each group. + type Count_Empty (column:Column|Text|Integer) (name:Text|Nothing=Nothing) + + ## Creates a new column with the sum of values (ignoring missing values) of + the specified column within each group. + type Sum (column:Column|Text|Integer) (name:Text|Nothing=Nothing) + + ## Creates a new column with the mean of values (ignoring missing values) of + the specified column within each group. + type Average (column:Column|Text|Integer) (name:Text|Nothing=Nothing) + + ## Creates a new column with the median of values (ignoring missing values) + of the specified column within each group. + type Median (column:Column|Text|Integer) (name:Text|Nothing=Nothing) + + ## Creates a new column with the standard deviation of values (ignoring + missing values) of the column within each group. + + Arguments: + - population: if `True`, the group is treated as the whole population; if `False`, it is treated as a sample. + type Standard_Deviation (column:Column|Text|Integer) (name:Text|Nothing=Nothing) (population:Boolean=False) + + ## Creates a new column with the values concatenated together. `Nothing` values will become an empty string. + + Arguments: + - separator: added between each value. + - prefix: added at the start of the result. + - suffix: added at the end of the result. + - quote_char: character used to quote the values if the value is empty + or contains the separator. + type Concatenate (column:Column|Text|Integer) (name:Text|Nothing=Nothing) (separator:Text="") (prefix:Text="") (suffix:Text="") (quote_char:Text="") + + ## Creates a new column with the first value in each group. 
+ + Arguments: + - ignore_nothing: if `True`, then missing values are ignored and first + not missing value returned. + - order_by: required for database tables. Specifies how to order the + results within the group. + type First (column:Column|Text|Integer) (name:Text|Nothing=Nothing) (ignore_nothing:Boolean=True) (order_by:Column_Selector|Nothing=Nothing) + + ## Creates a new column with the last value in each group. + + Arguments: + - ignore_nothing: if `True`, then missing values are ignored and last + not missing value returned. + - order_by: required for database tables. Specifies how to order the + results within the group. + type Last (column:Column|Text|Integer) (name:Text|Nothing=Nothing) (ignore_nothing:Boolean=True) (order_by:Column_Selector|Nothing=Nothing) + + ## Gets a column name to use for the aggregate column + column_name : Table->Text + column_name table = + if this.name.is_nothing.not then this.name else + get_name c = (this.resolve_column table c).name + case this of + Count _ -> "Count" + Count_Distinct c _ -> + case c of + Vector.Vector _ -> "Count Distinct " + ((c.map get_name).join " ") + _ -> "Count Distinct " + (get_name c) + Count_Not_Nothing c _ -> "Count Not Nothing " + (get_name c) + Count_Nothing c _ -> "Count Nothing " + (get_name c) + Count_Not_Empty c _ -> "Count Not Empty " + (get_name c) + Count_Empty c _ -> "Count Empty " + (get_name c) + Sum c _ -> "Sum " + (get_name c) + Average c _ -> "Average " + (get_name c) + Median c _ -> "Median " + (get_name c) + Standard_Deviation c _ _ -> "Standard Deviation " + (get_name c) + Concatenate c _ _ _ _ _ -> "Concatenate " + (get_name c) + First c _ _ _ -> "First " + (get_name c) + Last c _ _ _ -> "Last " + (get_name c) + + ## PRIVATE + Given a column reference resolve to the underlying column + resolve_column : Table->(Column|Text|Integer)->Column + resolve_column table column = + case column of + Column _ -> table.at (column.name) + Text -> table.at column + Integer -> table.columns.at 
column + + ## Gets the accumulator value a group starts from, before any row has been + folded in by the function returned from `make_aggregator`. + initial_value : Any + initial_value = case this of + Count_Distinct _ _ -> Map.empty + Median _ _ -> Map.empty + Average _ _ -> [0, 0] + Standard_Deviation _ _ _ -> [0, 0, 0] + Concatenate _ _ _ _ _ _ -> Nothing + First _ _ _ _ -> Nothing + Last _ _ _ _ -> Nothing + Sum _ _ -> Nothing + _ -> 0 + + ## Builds the folding function for this aggregate. + + Arguments: + - table: the table whose columns the returned function reads. + + The returned function takes the current accumulator and a row index of + `table`, and returns the updated accumulator. + make_aggregator : Table->(Any->Integer->Any) + make_aggregator table = + create_closure c function = + col = this.resolve_column table c + function col + + is_empty s = if s.is_nothing then True else case s of + Text -> s.is_empty + _ -> Error.throw (Invalid_Aggregation_Method (this.column_name table) "Empty is only valid for Text") + + case this of + Count _ -> count->_->(count+1) + Count_Not_Nothing c _ -> create_closure c col->count->i->(count + if (col.at i).is_nothing then 0 else 1) + Count_Nothing c _ -> create_closure c col->count->i->(count + if (col.at i).is_nothing then 1 else 0) + Count_Not_Empty c _ -> create_closure c col->count->i->(count + if is_empty (col.at i) then 0 else 1) + Count_Empty c _ -> create_closure c col->count->i->(count + if is_empty (col.at i) then 1 else 0) + First c _ i _ -> + case i of + # NOTE(review): the inner `i` is the table row index, so a group that + # does not contain row 0 keeps the initial `Nothing` - confirm intended. + False -> create_closure c col->current->i->(if i==0 then (col.at i) else current) + True -> create_closure c col->current->i->if current.is_nothing then (col.at i) else current + Last c _ i _ -> + case i of + False -> create_closure c col->_->i->(col.at i) + True -> create_closure c col->current->i-> + v = (col.at i) + if v.is_nothing then current else v + Sum c _ -> create_closure c col->total->i-> + v = col.at i + if v.is_nothing then total else + if total.is_nothing then v else total + v + Average c _ -> create_closure c col->a->i-> + v = col.at i + if v.is_nothing then a else [a.first + 1, a.second + v] + Standard_Deviation c _ _ -> create_closure c col->a->i-> + v = col.at i + if v.is_nothing then a else [a.first + 1, a.second + v, (a.at 2) + v*v] + Concatenate c _ j _ _ q -> create_closure c col->text->i-> + v = col.at i + val=if v.is_nothing then "" 
else + text = case v of + Text -> v + _ -> v.to_text + if text == "" then (q+q) else + if text.contains j then (q+text+q) else text + # The accumulator is still `Nothing` for the first row of a group, + # whatever that row's index in the table is. + if text.is_nothing then val else (text + j + val) + Median c _ -> create_closure c col->map->i-> + val = col.at i + if val.is_nothing then map else (map.insert val (1 + (map.get_or_else val 0))) + Count_Distinct columns _ -> + resolved = case columns of + Vector.Vector _ -> columns.map c->(this.resolve_column table c) + _ -> [this.resolve_column table columns] + key_maker i = Group_By.key (resolved.map c->(c.at i)) + map->i->(map.insert (key_maker i) 1) + + ## Converts the accumulator produced by folding the function from + `make_aggregator` over a group into the final value of the aggregate. + evaluate : Any->Any + evaluate value = case this of + Count_Distinct _ _ -> value.size + Median _ _ -> + count = value.fold 0 (+) + if count == 0 then Nothing else + case count%2 == 1 of + True -> + mid = (count-1) / 2 + output = value.fold_with_key [0, Nothing] c->k->v-> + new_v = c.first + v + [new_v, if c.first.up_to new_v . contains mid then k else c.second] + output.second + False -> + mid = count / 2 + output = value.fold_with_key [0, Nothing, Nothing] c->k->v-> + new_v = c.first + v + new_s = if c.first.up_to new_v . contains mid then k else c.second + new_t = if c.first.up_to new_v . 
contains (mid-1) then k else (c.at 2) + [new_v, new_s, new_t] + (output.second + (output.at 2)) / 2 + Average _ _ -> if value.first == 0 then Nothing else (value.second / value.first) + Standard_Deviation _ _ p -> if value.first == 0 then Nothing else + # The sample correction divides by (count - 1), so a sample of a + # single value has no defined standard deviation. + if (p.not && (value.first == 1)) then Nothing else + f = if p then 1 else (value.first / (value.first - 1)).sqrt + ((value.at 2)/value.first - (value.second/value.first)^2).sqrt * f + Concatenate _ _ _ s p _ -> if value.is_nothing then value else (s + value + p) + _ -> value + + +## Occurs when a column cannot be aggregated +type Invalid_Aggregation_Method (column : Text) (message : Text) + +Invalid_Aggregation_Method.to_display_text : Text +Invalid_Aggregation_Method.to_display_text = + "The aggregate column "+this.column+" resulted in an error: "+this.message diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Group_By.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Group_By.enso new file mode 100644 index 000000000000..218d8e872e98 --- /dev/null +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Group_By.enso @@ -0,0 +1,54 @@ +from Standard.Base import all +import Standard.Base.Data.Ordering.Vector_Lexicographic_Order + +## Create a key structure for grouping operations +key : Vector -> Group_By_Key +key values = + mapper c = case c of + Boolean -> Comparable_Boolean c + Nothing -> Comparable_Nothing + _ -> c + + Group_By_Key <| values.map mapper + +## PRIVATE + Represents a comparable vector of elements which is used as a key for grouping. 
+type Group_By_Key + type Group_By_Key values + + ## See if two keys are equal + == : Group_By_Key->Boolean + == that = this.values == that.values + + ## Compares two keys + compare_to : Group_By_Key->Ordering + compare_to that = + Vector_Lexicographic_Order.compare this.values that.values + +## PRIVATE + Temporary workaround until Boolean compare_to completed +type Comparable_Boolean + type Comparable_Boolean value + + == : Comparable_Boolean->Boolean + == that = (this.compare_to that) == Ordering.Equal + + ## Orders `False` before `True`, and any Boolean after a missing value. + compare_to : Any->Ordering + compare_to that = + case that of + # A key column can mix Booleans with `Nothing`; `Comparable_Nothing` + # has no `value` field and already reports itself as `Less`. + Comparable_Nothing -> Ordering.Greater + _ -> + if this.value == that.value then Ordering.Equal else + if this.value then Ordering.Greater else Ordering.Less + +## PRIVATE + Temporary workaround allowing Nothing to be in a Group_By +type Comparable_Nothing + type Comparable_Nothing + + == : Comparable_Nothing->Boolean + == that = (this.compare_to that) == Ordering.Equal + + compare_to : Any->Ordering + compare_to that = + case that of + Comparable_Nothing -> Ordering.Equal + Nothing -> Ordering.Equal + _ -> Ordering.Less diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso index c3e67e5041fd..3c8b51cc2d5b 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso @@ -17,6 +17,9 @@ import Standard.Table.Data.Column_Mapping import Standard.Table.Data.Position import Standard.Base.Error.Warnings +import Standard.Table.Data.Group_By +import Standard.Table.Data.Aggregate_Column + polyglot java import org.enso.table.data.table.Table as Java_Table polyglot java import org.enso.table.operations.OrderBuilder polyglot java import org.enso.table.format.csv.Writer as Csv_Writer @@ -507,6 +510,53 @@ type Table new_names = this.columns.map mapper this.take_end (this.length - 1) . 
rename_columns (Column_Mapping.By_Position new_names) on_problems=on_problems warnings=warnings + + ## Prototype Group By function + + Arguments: + - selector: selects the columns to group by. If `Nothing`, all rows are + aggregated as a single group. + - columns: the aggregate columns to compute for each group. + - on_problems: passed on to the selection of the key columns. + - warnings: passed on to the selection of the key columns. + + The result has the key columns first, followed by one column for each + aggregate, with one row for each distinct key. + group_by : Column_Selector -> [Aggregate_Column] -> Problem_Behavior -> Warnings.Warning_System -> Table + group_by selector columns (on_problems=Report_Warning) (warnings=Warnings.default) = + # Grouping Key + key_columns = if selector.is_nothing then [] else + Table_Helpers.select_columns internal_columns=this.columns selector=selector reorder=True on_problems=on_problems warnings=warnings + key_length = key_columns.length + make_key = if (key_length == 0) then _->(Group_By.key [1]) else i->(Group_By.key (key_columns.map v->(v.at i))) + + # New Table Accumulator + new_table = (key_columns.map c->c.name)+(columns.map c->(c.column_name this)) . map n->[n, Vector.new_builder] + # Appends a new output row for the group first seen at table row `row` + # and returns the index of that output row. + add_row row = + idx = new_table.at 0 . at 1 . length + # Store the original cell values: the grouping key holds Booleans and + # Nothings wrapped by `Group_By.key`, which must not reach the result. + 0.up_to key_length . each i-> + new_table.at i . at 1 . append (key_columns.at i . at row) + 0.up_to (columns.length) . each i-> + column = columns.at i + new_table.at (i + key_length) . at 1 . append (column.initial_value) + idx + + # Fold + aggregators = columns.map c->(c.make_aggregator this) + aggregate map i = + key = make_key i + row_index = map.get_or_else key (add_row i) + + # Accumulate + 0.up_to (columns.length) . each j-> + aggregator = aggregators.at j + array = new_table.at (j + key_length) . at 1 . to_array + current = array . at row_index + new = aggregator current i + array . set_at row_index new + + map.insert key row_index + # With no key columns there are no key cells to read, so the row index + # passed for the single row of an empty table is never used. + if ((key_length == 0) && (this.row_count == 0)) then (add_row 0) else + 0.up_to this.row_count . 
fold Map.empty aggregate + + # Now Finalise and make a table + finalise builder index = + if index < key_length then builder.to_vector else + column = columns.at (index - key_length) + Vector.new builder.length i->(column.evaluate (builder.to_array.at i)) + here.new (new_table.map_with_index i->c->[c.at 0,finalise (c.at 1) i]) + + + ## ALIAS Filter Rows ALIAS Mask Columns diff --git a/test/Table_Tests/src/Aggregate_Column_Spec.enso b/test/Table_Tests/src/Aggregate_Column_Spec.enso new file mode 100644 index 000000000000..920fe6f5c2fc --- /dev/null +++ b/test/Table_Tests/src/Aggregate_Column_Spec.enso @@ -0,0 +1,141 @@ +from Standard.Base import all + +import Standard.Table.Data.Table +from Standard.Table.Data.Aggregate_Column import all + +import Standard.Test + +spec = Test.group "Aggregate Columns" <| + simple_table = Table.new [["count", [1, 2, Nothing, 3, Nothing]], ["is_valid", [Nothing, False, True, False, Nothing]], ["float", [1, 2.1, 3.4, 5.6, Nothing]], ["text", ["A", "", Nothing, "B,C", Nothing]]] + text_col = simple_table.at "text" + bool_col = simple_table.at "is_valid" + float_col = simple_table.at "float" + int_col = simple_table.at "count" + empty_table = Table.new [["count", []], ["is_valid", []], ["text", []]] + + test_name = "Test Column" + + test_aggregator table col expected_name expected_result epsilon=False = + col.column_name table . should_equal expected_name + + acc = col.make_aggregator table + folded_value = 0.up_to table.row_count . 
fold col.initial_value acc + result = col.evaluate folded_value + + if epsilon != False then ((result - expected_result).abs < epsilon).should_be_true else + result.should_equal expected_result + + Test.specify "should be able to count a set" <| + test_aggregator simple_table (Count Nothing) "Count" simple_table.row_count + test_aggregator simple_table (Count test_name) test_name simple_table.row_count + test_aggregator empty_table (Count test_name) test_name empty_table.row_count + + Test.specify "should be able to count missing values in a set" <| + test_aggregator simple_table (Count_Nothing 0) "Count Nothing count" 2 + test_aggregator simple_table (Count_Nothing 0 test_name) test_name 2 + test_aggregator simple_table (Count_Nothing "text" test_name) test_name 2 + test_aggregator simple_table (Count_Nothing text_col test_name) test_name 2 + test_aggregator empty_table (Count_Nothing 0 test_name) test_name empty_table.row_count + + Test.specify "should be able to count non missing values in a set" <| + test_aggregator simple_table (Count_Not_Nothing 0) "Count Not Nothing count" 3 + test_aggregator simple_table (Count_Not_Nothing 0 test_name) test_name 3 + test_aggregator simple_table (Count_Not_Nothing "text" test_name) test_name 3 + test_aggregator simple_table (Count_Not_Nothing text_col test_name) test_name 3 + test_aggregator empty_table (Count_Not_Nothing 0 test_name) test_name empty_table.row_count + + Test.specify "should be able to count empties in a set of Texts" <| + test_aggregator simple_table (Count_Empty -1) "Count Empty text" 3 + test_aggregator simple_table (Count_Empty -1 test_name) test_name 3 + test_aggregator simple_table (Count_Empty "text" test_name) test_name 3 + test_aggregator simple_table (Count_Empty text_col test_name) test_name 3 + test_aggregator empty_table (Count_Empty 0 test_name) test_name empty_table.row_count + + Test.specify "should be able to count non empties in a set of Texts" <| + test_aggregator simple_table 
(Count_Not_Empty -1) "Count Not Empty text" 2 + test_aggregator simple_table (Count_Not_Empty -1 test_name) test_name 2 + test_aggregator simple_table (Count_Not_Empty "text" test_name) test_name 2 + test_aggregator simple_table (Count_Not_Empty text_col test_name) test_name 2 + test_aggregator empty_table (Count_Not_Empty 0 test_name) test_name empty_table.row_count + + Test.specify "should be able to total a set of values" <| + test_aggregator simple_table (Sum -2) "Sum float" 12.1 + test_aggregator simple_table (Sum -2 test_name) test_name 12.1 + test_aggregator simple_table (Sum "float" test_name) test_name 12.1 + test_aggregator simple_table (Sum float_col test_name) test_name 12.1 + test_aggregator empty_table (Sum 0 test_name) test_name Nothing + + Test.specify "should be able to average a set of values" <| + test_aggregator simple_table (Average -2) "Average float" 3.025 0.000001 + test_aggregator simple_table (Average -2 test_name) test_name 3.025 0.000001 + test_aggregator simple_table (Average "float" test_name) test_name 3.025 0.000001 + test_aggregator simple_table (Average float_col test_name) test_name 3.025 0.000001 + test_aggregator empty_table (Average 0 test_name) test_name Nothing + + Test.specify "should be able to compute standard deviation a set of values" <| + test_aggregator simple_table (Standard_Deviation -2) "Standard Deviation float" 1.977161 0.000001 + test_aggregator simple_table (Standard_Deviation -2 test_name) test_name 1.977161 0.000001 + test_aggregator simple_table (Standard_Deviation "float" test_name) test_name 1.977161 0.000001 + test_aggregator simple_table (Standard_Deviation float_col test_name) test_name 1.977161 0.000001 + test_aggregator empty_table (Standard_Deviation 0 test_name) test_name Nothing + + Test.specify "should be able to compute standard deviation of a population a set of values" <| + test_aggregator simple_table (Standard_Deviation -2 population=True) "Standard Deviation float" 1.712271 0.000001 + 
test_aggregator simple_table (Standard_Deviation -2 test_name population=True) test_name 1.712271 0.000001 + test_aggregator simple_table (Standard_Deviation "float" test_name population=True) test_name 1.712271 0.000001 + test_aggregator simple_table (Standard_Deviation float_col test_name population=True) test_name 1.712271 0.000001 + test_aggregator empty_table (Standard_Deviation 0 test_name population=True) test_name Nothing + + Test.specify "should be able to compute median a set of values" <| + test_aggregator simple_table (Median -2) "Median float" 2.75 0.000001 + test_aggregator simple_table (Median -2 test_name) test_name 2.75 0.000001 + test_aggregator simple_table (Median "float" test_name) test_name 2.75 0.000001 + test_aggregator simple_table (Median float_col test_name) test_name 2.75 0.000001 + test_aggregator empty_table (Median 0 test_name) test_name Nothing + + Test.specify "should be able to compute first of a set of values including missing" <| + test_aggregator simple_table (First 1 ignore_nothing=False) "First is_valid" Nothing + test_aggregator simple_table (First 1 test_name ignore_nothing=False) test_name Nothing + test_aggregator simple_table (First "is_valid" test_name ignore_nothing=False) test_name Nothing + test_aggregator simple_table (First bool_col test_name ignore_nothing=False) test_name Nothing + test_aggregator empty_table (First 0 test_name ignore_nothing=False) test_name Nothing + + Test.specify "should be able to compute first of a set of values excluding missing" <| + test_aggregator simple_table (First 1) "First is_valid" False + test_aggregator simple_table (First 1 test_name) test_name False + test_aggregator simple_table (First "is_valid" test_name) test_name False + test_aggregator simple_table (First bool_col test_name) test_name False + test_aggregator empty_table (First 0 test_name) test_name Nothing + + Test.specify "should be able to compute last of a set of values including missing" <| + test_aggregator 
simple_table (Last 1 ignore_nothing=False) "Last is_valid" Nothing + test_aggregator simple_table (Last 1 test_name ignore_nothing=False) test_name Nothing + test_aggregator simple_table (Last "is_valid" test_name ignore_nothing=False) test_name Nothing + test_aggregator simple_table (Last bool_col test_name ignore_nothing=False) test_name Nothing + test_aggregator empty_table (Last 0 test_name ignore_nothing=False) test_name Nothing + + Test.specify "should be able to compute last of a set of values excluding missing" <| + test_aggregator simple_table (Last 1) "Last is_valid" False + test_aggregator simple_table (Last 1 test_name) test_name False + test_aggregator simple_table (Last "is_valid" test_name) test_name False + test_aggregator simple_table (Last bool_col test_name) test_name False + test_aggregator empty_table (Last 0 test_name) test_name Nothing + + Test.specify "should be able to concatenate a set of values excluding missing" <| + test_aggregator simple_table (Concatenate -1 Nothing ',' '[' ']' '"') "Concatenate text" '[A,"",,"B,C",]' + test_aggregator simple_table (Concatenate -1 test_name) test_name 'AB,C' + test_aggregator simple_table (Concatenate "text" test_name ',') test_name 'A,,,B,C,' + test_aggregator simple_table (Concatenate text_col test_name) test_name 'AB,C' + test_aggregator empty_table (Concatenate 0 test_name) test_name Nothing + + Test.specify "should be able to count distinct items on a single set of values" <| + test_aggregator simple_table (Count_Distinct 0) "Count Distinct count" 4 + test_aggregator simple_table (Count_Distinct 0 test_name) test_name 4 + test_aggregator simple_table (Count_Distinct "count" test_name) test_name 4 + test_aggregator simple_table (Count_Distinct int_col test_name) test_name 4 + test_aggregator empty_table (Count_Distinct 0 test_name) test_name 0 + + Test.specify "should be able to count distinct items on a multiple sets of values" <| + test_aggregator simple_table (Count_Distinct [0, 1]) "Count 
Distinct count is_valid" 5 + +main = Test.Suite.run_main here.spec