Skip to content

Commit

Permalink
Refactor Aggregate Column (#3349)
Browse files Browse the repository at this point in the history
- Make it easier to understand the computations.
- Fix issue with First.
- Improve quote handling in Concatenate
- Added validation and warnings to input
  • Loading branch information
jdunkerley authored Mar 22, 2022
1 parent ccaf248 commit 02bcfbb
Show file tree
Hide file tree
Showing 10 changed files with 329 additions and 182 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
- [Replaced `Table.group_by` with `Table.aggregate`][3339]
- [Implemented `Panic.catch` and helper functions for handling errors. Added a
type parameter to `Panic.recover` to recover specific types of errors.][3344]
- [Added warning handling to `Table.aggregate`][3349]

[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
Expand Down Expand Up @@ -119,6 +120,7 @@
[3339]: https://github.com/enso-org/enso/pull/3339
[3344]: https://github.com/enso-org/enso/pull/3344
[3346]: https://github.com/enso-org/enso/pull/3346
[3349]: https://github.com/enso-org/enso/pull/3349

#### Enso Compiler

Expand Down
11 changes: 10 additions & 1 deletion distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso
Original file line number Diff line number Diff line change
Expand Up @@ -637,7 +637,7 @@ type Vector
["foo", "bar", "baz"].join ", "
join : Text -> Text
join separator="" prefix="" suffix="" =
if this.length == 0 then prefix+suffix else
if this.is_empty then prefix+suffix else
if this.length == 1 then prefix + this.unsafe_at 0 + suffix else
prefix + this.unsafe_at 0 + (1.up_to this.length . fold "" acc-> i-> acc + separator + this.unsafe_at i) + suffix

Expand Down Expand Up @@ -1013,6 +1013,15 @@ type Builder
capacity : Integer
capacity = this.to_array.length

## Checks if this builder is empty.

> Example
Checking for emptiness.

Vector.new_builder.is_empty
is_empty : Boolean
is_empty = this.length == 0

## Appends a new element into this builder and returns it, propagating any
errors that the provided element could have contained.

Expand Down
158 changes: 10 additions & 148 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Data/Aggregate_Column.enso
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from Standard.Base import all

from Standard.Table.Data.Column as Column_Module import Column
import Standard.Table.Data.Group_By_Key

## Defines an Aggregate Column
type Aggregate_Column
Expand Down Expand Up @@ -177,29 +176,17 @@ type Aggregate_Column
if this.new_name.is_nothing.not then this.new_name else
get_name c = (this.resolve_column table c).name
case this of
Group_By c _ -> (get_name c)
Group_By c _ -> get_name c
Count _ -> "Count"
Count_Distinct c _ _ ->
case c of
Vector.Vector _ -> "Count Distinct " + ((c.map get_name).join " ")
_ -> "Count Distinct " + (get_name c)
Count_Not_Nothing c _ -> "Count Not Nothing " + (get_name c)
Count_Nothing c _ -> "Count Nothing " + (get_name c)
Count_Not_Empty c _ -> "Count Not Empty " + (get_name c)
Count_Empty c _ -> "Count Empty " + (get_name c)
Sum c _ -> "Sum " + (get_name c)
Average c _ -> "Average " + (get_name c)
Median c _ -> "Median " + (get_name c)
Percentile p c _ -> (p*100).floor.to_text + "%-ile " + (get_name c)
Mode c _ -> "Mode " + (get_name c)
Standard_Deviation c _ _ -> "Standard Deviation " + (get_name c)
Concatenate c _ _ _ _ _ -> "Concatenate " + (get_name c)
First c _ _ _ -> "First " + (get_name c)
Last c _ _ _ -> "Last " + (get_name c)
Maximum c _ -> "Maximum " + (get_name c)
Minimum c _ -> "Minimum " + (get_name c)
Shortest c _ -> "Shortest " + (get_name c)
Longest c _ -> "Longest " + (get_name c)
Count_Distinct columns _ _ ->
as_vector = case columns of
Vector.Vector _ -> columns
_ -> [columns]
"Count Distinct " + (as_vector.map get_name . join " ")
Percentile p c _ -> ((p*100).floor.to_text + "%-ile ") + get_name c
_ ->
prefix = Meta.get_simple_type_name this . replace "_" " "
prefix + " " + get_name this.column

## PRIVATE
Given a column reference resolve to the underlying column
Expand All @@ -210,131 +197,6 @@ type Aggregate_Column
Text -> table.at column
Integer -> table.columns.at column

# Returns the starting accumulator value for this aggregate, selected by
# constructor:
# - Nothing for Group_By, Concatenate, First, Last, Sum, Maximum, Minimum,
#   Shortest and Longest;
# - an empty Map for Count_Distinct, Median, Percentile and Mode;
# - `[0, 0]` for Average and `[0, 0, 0]` for Standard_Deviation;
# - 0 for every remaining constructor (the fall-through case).
initial_value : Any
initial_value = case this of
Group_By _ _ -> Nothing
Count_Distinct _ _ _ -> Map.empty
Median _ _ -> Map.empty
Percentile _ _ _ -> Map.empty
Mode _ _ -> Map.empty
Average _ _ -> [0, 0]
Standard_Deviation _ _ _ -> [0, 0, 0]
Concatenate _ _ _ _ _ _ -> Nothing
First _ _ _ _ -> Nothing
Last _ _ _ _ -> Nothing
Sum _ _ -> Nothing
Maximum _ _ -> Nothing
Minimum _ _ -> Nothing
Shortest _ _ -> Nothing
Longest _ _ -> Nothing
# Fall-through: all other constructors start from the integer 0.
_ -> 0

# Builds the per-row accumulation function for this aggregate.
#
# Arguments:
# - table: the table whose columns the aggregate's column references are
#   resolved against (via `resolve_column`).
#
# Returns a function taking the current accumulator value and a row index and
# returning the updated accumulator value.
make_aggregator : Table->(Any->Integer->Any)
make_aggregator table =
# Resolves column reference `c` against `table` and hands the resolved column
# to `function`.
create_closure c function =
col = this.resolve_column table c
function col

# Nothing counts as empty; Text defers to its own `is_empty`; any other value
# raises Invalid_Aggregation_Method.
# NOTE(review): this reads `this.col`, whereas `column_name` in this file reads
# `this.column` - confirm `col` is a valid field name here.
is_empty s = if s.is_nothing then True else case s of
Text -> s.is_empty
_ -> Error.throw (Invalid_Aggregation_Method this.col "Empty is only valid for Text")

case this of
# Group_By ignores the accumulator and returns the value at the current row.
Group_By c _ -> create_closure c col->_->i->(col.at i)
# Count increments for every row, without looking at any column.
Count _ -> count->_->(count+1)
Count_Not_Nothing c _ -> create_closure c col->count->i->(count + if (col.at i).is_nothing then 0 else 1)
Count_Nothing c _ -> create_closure c col->count->i->(count + if (col.at i).is_nothing then 1 else 0)
Count_Not_Empty c _ -> create_closure c col->count->i->(count + if is_empty (col.at i) then 0 else 1)
Count_Empty c _ -> create_closure c col->count->i->(count + if is_empty (col.at i) then 1 else 0)
First c _ ignore_nothing _ ->
case ignore_nothing of
# NOTE(review): the value is taken only when the row index `i` is 0, so an
# accumulator first updated at a later row keeps its previous value -
# confirm this is intended when rows are split into groups.
False -> create_closure c col->current->i->(if i==0 then (col.at i) else current)
# Keeps the current value once it is no longer Nothing.
True -> create_closure c col->current->i->if current.is_nothing then (col.at i) else current
Last c _ ignore_nothing _ ->
case ignore_nothing of
# Every row overwrites the accumulator with its value.
False -> create_closure c col->_->i->(col.at i)
# Rows holding Nothing leave the accumulator unchanged.
True -> create_closure c col->current->i->
v = (col.at i)
if v.is_nothing then current else v
# Maximum, Minimum, Shortest and Longest skip Nothing values and adopt the
# first non-Nothing value as the starting point.
Maximum c _ -> create_closure c col->m->i->
v = col.at i
if v.is_nothing then m else if m.is_nothing then v else m.max v
Minimum c _ -> create_closure c col->m->i->
v = col.at i
if v.is_nothing then m else if m.is_nothing then v else m.min v
# On equal lengths the value already held is kept (`<=` / `>=`).
Shortest c _ -> create_closure c col->m->i->
v = col.at i
if v.is_nothing then m else if m.is_nothing then v else if m.length <= v.length then m else v
Longest c _ -> create_closure c col->m->i->
v = col.at i
if v.is_nothing then m else if m.is_nothing then v else if m.length >= v.length then m else v
# Sum skips Nothing values; the total stays Nothing until a value is seen.
Sum c _ -> create_closure c col->total->i->
v = col.at i
if v.is_nothing then total else
if total.is_nothing then v else total + v
# Average accumulates the pair [count, sum] over non-Nothing values.
Average c _ -> create_closure c col->a->i->
v = col.at i
if v.is_nothing then a else [a.first + 1, a.second + v]
# Standard_Deviation accumulates [count, sum, sum of squares] over
# non-Nothing values.
Standard_Deviation c _ _ -> create_closure c col->a->i->
v = col.at i
if v.is_nothing then a else [a.first + 1, a.second + v, (a.at 2) + v*v]
# Concatenate renders each value as text: Nothing becomes "", an empty text
# becomes `quote+quote`, and a text containing `join` is wrapped in `quote`.
# The rendered value is appended to the accumulator, separated by `join`.
Concatenate c _ join _ _ quote -> create_closure c col->text->i->
v = col.at i
val=if v.is_nothing then "" else
text = case v of
Text -> v
_ -> v.to_text
if text == "" then (quote+quote) else
if text.contains join then (quote+text+quote) else text
if text.is_nothing then val else (text + join + val)
# Median, Percentile and Mode all build a map from value to occurrence
# count, skipping Nothing values.
Median c _ -> create_closure c col->map->i->
val = col.at i
if val.is_nothing then map else (map.insert val (1 + (map.get_or_else val 0)))
Percentile _ c _ -> create_closure c col->map->i->
val = col.at i
if val.is_nothing then map else (map.insert val (1 + (map.get_or_else val 0)))
Mode c _ -> create_closure c col->map->i->
val = col.at i
if val.is_nothing then map else (map.insert val (1 + (map.get_or_else val 0)))
# Count_Distinct accepts a single column reference or a Vector of them and
# records one map entry per distinct key built from the row's values.
Count_Distinct columns _ ignore_nothing ->
resolved = case columns of
Vector.Vector _ -> columns.map c->(this.resolve_column table c)
_ -> [this.resolve_column table columns]
key_maker i = Group_By_Key.key (resolved.map c->(c.at i))
case ignore_nothing of
False-> map->i->(map.insert (key_maker i) 1)
# Keys whose values are all Nothing are not recorded.
True-> map->i->
key = key_maker i
if key.values.all .is_nothing then map else (map.insert key 1)

# Converts an accumulated value into the final result for this aggregate.
#
# Arguments:
# - value: the accumulator value to convert; its shape depends on the
#   constructor (a Map, a Vector of running totals, or a plain value).
#
# Constructors not listed in the `case` below return `value` unchanged.
evaluate : Any->Any
evaluate value =
## Given a map of values and counts, find the value at a specified percentile
percentile p:Decimal value:Map =
# Total number of recorded values (the sum of all counts).
count = value.fold 0 (+)
if count == 0 then Nothing else
# Fractional position in the range 1..count for percentile `p`.
mid_value = (count - 1)*p + 1
if mid_value <= 1 then value.first.first else
if mid_value >= count then value.last.first else
mid = mid_value.floor
# Walks the map keeping [running count, key covering position mid-1, key
# covering position mid], where positions are counted from 0.
output = value.fold_with_key [0, Nothing, Nothing] c->k->v->
new_v = c.first + v
new_s = if c.first.up_to new_v . contains (mid-1) then k else c.second
new_t = if c.first.up_to new_v . contains mid then k else (c.at 2)
[new_v, new_s, new_t]
# Linear interpolation between the two keys by the fractional part.
(output.second + (output.at 2 - output.second) * (mid_value - mid))

case this of
# The number of map entries is the number of distinct keys.
Count_Distinct _ _ _ -> value.size
Median _ _ -> percentile 0.5 value
Percentile p _ _ -> percentile p value
# Picks the key with the highest count; the comparison is strict, so on a
# tie the key reached first in the fold is kept.
Mode _ _ -> (value.fold_with_key (Pair 0 Nothing) p->k->v-> if v>(p.first) then (Pair v k) else p) . second
# `value` is [count, sum]; Nothing when no values were counted.
Average _ _ -> if value.first == 0 then Nothing else (value.second / value.first)
# `value` is [count, sum, sum of squares]. The result is
# sqrt(sum_sq/n - (sum/n)^2) * f, where f is 1 when `p` is true and
# sqrt(n/(n-1)) otherwise.
# NOTE(review): `p` presumably selects the population form - confirm
# against the constructor definition.
Standard_Deviation _ _ p -> if value.first == 0 then Nothing else
f = if p then 1 else (value.first / (value.first - 1)).sqrt
((value.at 2)/value.first - (value.second/value.first)^2).sqrt * f
# Wraps a non-Nothing result with the constructor's 4th and 5th fields
# (presumably prefix and suffix - confirm against the constructor).
Concatenate _ _ _ s p _ -> if value.is_nothing then value else (s + value + p)
_ -> value

## Occurs when a column cannot be aggregated.
type Invalid_Aggregation_Method (column : Text) (message : Text)
Expand Down
51 changes: 26 additions & 25 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ import Standard.Base.Data.Time.Date
import Standard.Table.Io.Spreadsheet_Write_Mode
import Standard.Table.Io.Format
import Standard.Table.Internal.Table_Helpers
import Standard.Table.Internal.Unique_Name_Strategy
import Standard.Table.Internal.Aggregate_Column_Helper
import Standard.Table.Internal.Aggregate_Column_Aggregator

from Standard.Table.Data.Order_Rule as Order_Rule_Module import Order_Rule
from Standard.Table.Data.Column_Selector as Column_Selector_Module import Column_Selector, By_Index
Expand Down Expand Up @@ -489,45 +490,45 @@ type Table
## Prototype Group By function
aggregate : [Aggregate_Column] -> Problem_Behavior -> Table
aggregate columns (on_problems=Report_Warning) =
# Grouping Key
is_a_key c = case c of
Aggregate_Column.Group_By _ _ -> True
_ -> False
key_columns = columns.filter is_a_key . map c->(c.resolve_column this c.column)
make_key = if (key_columns.length == 0) then _->(Group_By_Key.key [1]) else i->(Group_By_Key.key (key_columns.map v->(v.at i)))

# New Table Accumulator
name_strategy = Unique_Name_Strategy.new
new_table = columns.map c->(c.column_name this) . map n->[name_strategy.make_unique n, Vector.new_builder]
validated = Aggregate_Column_Helper.validate columns this

make_key = if (validated.key_columns.length == 0) then _->(Group_By_Key.key [1]) else
i->(Group_By_Key.key (validated.key_columns.map v->(v.at i)))

new_table = validated.valid_columns.map c->[c.first, Vector.new_builder]
aggregators = validated.valid_columns.map c->(Aggregate_Column_Aggregator.new this c.second)
add_row _ =
idx = new_table.at 0 . at 1 . length
0.up_to (columns.length) . each i->
new_table.at i . at 1 . append ((columns.at i).initial_value)
0.up_to (aggregators.length) . each i->
new_table.at i . at 1 . append ((aggregators.at i).initial)
idx

# Fold
aggregators = columns.map c->(c.make_aggregator this)
aggregate map i =
key = make_key i
row_index = map.get_or_else key (add_row Nothing)

# Accumulate
0.up_to (columns.length) . each j->
aggregator = aggregators.at j
0.up_to (validated.valid_columns.length) . each j->
accumulator = aggregators.at j . accumulator
array = new_table.at j . at 1 . to_array
current = array . at row_index
new = aggregator current i
new = accumulator current i
array . set_at row_index new

map.insert key row_index
if ((key_columns.length == 0) && (this.row_count == 0)) then (add_row Nothing) else
0.up_to this.row_count . fold Map.empty aggregate

# Now Finalise and make a table
finalise builder index =
column = columns.at index
Vector.new builder.length i->(column.evaluate (builder.to_array.at i))
here.new (new_table.map_with_index i->c->[c.at 0,finalise (c.at 1) i])
on_problems.attach_problems_before validated.problems <|
# Build Table
if ((validated.key_columns.length == 0) && (this.row_count == 0)) then (add_row Nothing) else
0.up_to this.row_count . fold Map.empty aggregate

# Now Finalise and make a table
finalise builder index =
aggregator = aggregators.at index
Vector.new builder.length i->(aggregator.finalizer (builder.to_array.at i))

here.new (new_table.map_with_index i->c->[c.first, finalise c.second i])


## ALIAS Filter Rows
Expand Down Expand Up @@ -1449,7 +1450,7 @@ print_table header rows indices_count format_term =

Arguments:
- cmp: The Enso comparator function.
- x: The left operand to the compartor.
- x: The left operand to the comparator.
- y: The right operand to the comparator.
comparator_to_java : (Any -> Any -> Ordering) -> Any -> Any -> Integer
comparator_to_java cmp x y = cmp x y . to_sign
Loading

0 comments on commit 02bcfbb

Please sign in to comment.