Skip to content

Commit

Permalink
Sorting Tables (#1471)
Browse files Browse the repository at this point in the history
  • Loading branch information
kustosz authored Feb 11, 2021
1 parent 5ba3b5d commit 93b6680
Show file tree
Hide file tree
Showing 18 changed files with 446 additions and 29 deletions.
4 changes: 4 additions & 0 deletions distribution/std-lib/Base/src/Error/Extensions.enso
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ unimplemented message="" = Panic.throw (Unimplemented_Error message)
Error.catch : (Error -> Any) -> Any
Error.catch (handler = x->x) = this.catch_primitive handler

## Converts a dataflow error into a Panic.

   If the given value is a dataflow error, its payload is thrown as a Panic.
   Any other value is returned as-is.
Panic.rethrow : (Any ! Any) -> Any
Panic.rethrow value = value.catch (payload -> Panic.throw payload)
29 changes: 29 additions & 0 deletions distribution/std-lib/Table/src/Data/Order_Rule.enso
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from Base import all

type Order_Rule
## UNSTABLE

A rule used for sorting table-like structures.

Arguments:
- column: a value representing the data dimension by which this rule is
sorting. This type does not specify the underlying representation of
a column, assuming that the sorting engine defines its own column
representation.
- comparator: a function taking two elements of the data being sorted
on and returning an `Ordering`. The function may be `Nothing`, in
which case a natural ordering will be used. Note that certain table
backends (such as database connectors) may not support this field
being set to a non-`Nothing` value.
- order: specifies whether the table should be sorted in an ascending
or descending order. The default value of `Nothing` delegates the
decision to the sorting function. Can be set to
`Sort_Order.Ascending` or `Sort_Order.Descending` from the `Base`
library, to specify the ordering.
- missing_last: whether the missing values should be placed at the
beginning or end of the sorted table. Note that this argument is
independent from `order`, i.e. missing values will always be sorted
according to this rule, ignoring the ascending / descending setting.
The default value of `Nothing` delegates the decision to the sorting
function.
type Order_Rule column comparator=Nothing order=Nothing missing_last=Nothing
112 changes: 110 additions & 2 deletions distribution/std-lib/Table/src/Data/Table.enso
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,13 @@ from Base import all
import Table.Io.Csv
import Table.Data.Column
import Base.System.Platform
from Table.Data.Order_Rule as Order_Rule_Module import Order_Rule

polyglot java import org.enso.table.data.table.Table as Java_Table
polyglot java import org.enso.table.operations.OrderBuilder

## An error returned when a non-existent column is being looked up.
type No_Such_Column_Error column_name

## Represents a column-oriented table data structure.
type Table
Expand Down Expand Up @@ -48,9 +53,9 @@ type Table
Json.Object fields

## Returns the column with the given name.
at : Text -> Column | Nothing
at : Text -> Column ! No_Such_Column_Error
at name = case this.java_table.getColumnByName name of
Nothing -> Nothing
Nothing -> Error.throw (No_Such_Column_Error name)
c -> Column.Column c

## Selects only the rows of this table that correspond to `True` values in
Expand Down Expand Up @@ -165,6 +170,109 @@ type Table
group by=Nothing =
Aggregate_Table (this.java_table.group by)

## UNSTABLE

Sorts the table according to the specified rules.

Arguments:
- by: specifies the columns used for reordering the table. This
argument may be one of:
- a text: the text is treated as a column name.
- a column: any column, that may or may not belong to this table.
Sorting by a column will result in reordering the rows of this
table in a way that would result in sorting the given column.
- an order rule: specifies both the sorting column and additional
settings, that will take precedence over the global parameters of
this sort operation. The `column` field of the rule may be a text
or a column, with the semantics described above.
- a vector of any of the above: this will result in a hierarchical
sorting, such that the first rule is applied first, the second is
used for breaking ties, etc.
- order: specifies the default sort order for this operation. All the
rules specified in the `by` argument will default to this setting,
unless specified in the rule.
- missing_last: specifies the default placement of missing values when
compared to non-missing ones. This setting may be overridden by the
particular rules of the `by` argument. Note that this argument is
independent from `order`, i.e. missing values will always be sorted
according to this rule, ignoring the ascending / descending setting.

> Example
Sorting `table` in ascending order by the value in column `'Quantity'`
table.sort by='Quantity'

> Example
Sorting `table` in descending order by the value in column `'Quantity'`,
placing missing values at the top of the table.
table.sort by='Quantity' order=Sort_Order.Descending missing_last=False

> Example
Sorting `table` in ascending order by the value in column `'Quantity'`,
using the value in column `'Rating'` for breaking ties.
table.sort by=['Quantity', 'Rating']

> Example
Sorting `table` in ascending order by the value in column `'Quantity'`,
using the value in column `'Rating'` in descending order for breaking
ties.
table.sort by=['Quantity', Order_Rule 'Rating' (order=Sort_Order.Descending)]

> Example
Sorting `table` in ascending order by the value in an externally
computed column, using the value in column `'Rating'` for breaking
ties.
quality_ratio = table.at 'Rating' / table.at 'Price'
table.sort by=[quality_ratio, 'Rating']

> Example
Sorting `table` in ascending order, by the value in column
`'position'`, using a custom comparator function.
manhattan_comparator a b = (a.x.abs + a.y.abs) . compare_to (b.x.abs + b.y.abs)
table.sort by=(Order_Rule 'position' comparator=manhattan_comparator)
sort : Text | Column.Column | Order_Rule | Vector.Vector (Text | Column.Column | Order_Rule) -> Sort_Order -> Boolean -> Table
sort by order=Sort_Order.Ascending missing_last=True = Panic.recover <|
    java_rules = this.build_java_order_rules by order missing_last
    natural_cmp = here.comparator_to_java .compare_to
    order_mask = OrderBuilder.buildOrderMask java_rules.to_array natural_cmp
    Table (this.java_table.applyMask order_mask)

## PRIVATE

   Converts the `by` argument of `sort` into a vector of Java order rules.
   A single text, column or order rule yields a one-element vector; a vector
   of rules is converted element by element.
build_java_order_rules rules order missing_last =
    to_java = rule -> this.build_java_order_rule rule order missing_last
    case rules of
        Text -> [to_java rules]
        Column.Column _ -> [to_java rules]
        Order_Rule _ _ _ _ -> [to_java rules]
        Vector.Vector _ -> rules.map to_java

## PRIVATE

   Converts a single sorting rule into a Java `OrderBuilder.OrderRule`.

   Arguments:
   - rule: a column name, a column, or an `Order_Rule`.
   - order: the default sort order, used unless the rule carries its own.
   - missing_last: the default placement of missing values, used unless the
     rule carries its own.

   A column name that cannot be found in this table is rethrown as a Panic,
   both for a plain name and for a name wrapped in an `Order_Rule`.
build_java_order_rule rule order missing_last =
    order_bool = case order of
        Sort_Order.Ascending -> True
        Sort_Order.Descending -> False
    case rule of
        Text ->
            column = Panic.rethrow (this.at rule)
            OrderBuilder.OrderRule.new column.java_column Nothing order_bool missing_last
        Column.Column c ->
            OrderBuilder.OrderRule.new c Nothing order_bool missing_last
        Order_Rule col_ref cmp rule_order rule_nulls_last ->
            c = case col_ref of
                Text ->
                    # Rethrow a failed lookup, matching the plain `Text`
                    # branch above, so that a missing column is never
                    # passed on to the Java rule constructor.
                    column = Panic.rethrow (this.at col_ref)
                    column.java_column
                Column.Column c -> c
            o = case rule_order of
                Nothing -> order_bool
                Sort_Order.Ascending -> True
                Sort_Order.Descending -> False
            nulls = case rule_nulls_last of
                Nothing -> missing_last
                _ -> rule_nulls_last
            java_cmp = case cmp of
                Nothing -> Nothing
                c -> here.comparator_to_java c
            OrderBuilder.OrderRule.new c java_cmp o nulls

## PRIVATE

   Applies the comparator `cmp` to `x` and `y` and converts the result to
   its sign via `to_sign`.
comparator_to_java cmp x y =
    ordering = cmp x y
    ordering.to_sign

## Represents a table with grouped rows.
type Aggregate_Table
type Aggregate_Table java_table
Expand Down
4 changes: 3 additions & 1 deletion distribution/std-lib/Table/src/Main.enso
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@ from Base import all
import Table.Io.Csv
import Table.Data.Table
import Table.Data.Column
import Table.Data.Order_Rule

from Table.Io.Csv export all hiding Parser
export Table.Data.Column
from Table.Data.Table export new, join
from Table.Data.Table export new, join, No_Such_Column_Error
from Table.Data.Order_Rule export Order_Rule

## Converts a JSON array into a dataframe, by looking up the requested keys
from each item.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
package org.enso.table.data.column.storage;

import java.util.BitSet;
import java.util.Comparator;

import org.enso.table.data.column.operation.map.MapOpStorage;
import org.enso.table.data.column.operation.map.MapOperation;
import org.enso.table.data.column.operation.map.UnaryMapOperation;
import org.enso.table.data.index.Index;
import org.enso.table.data.mask.OrderMask;
import org.enso.table.error.UnexpectedColumnTypeException;
import org.enso.table.error.UnexpectedTypeException;

Expand Down Expand Up @@ -120,7 +123,8 @@ public Storage mask(BitSet mask, int cardinality) {
}

@Override
public Storage orderMask(int[] positions) {
public Storage applyMask(OrderMask mask) {
int[] positions = mask.getPositions();
BitSet newNa = new BitSet();
BitSet newVals = new BitSet();
for (int i = 0; i < positions.length; i++) {
Expand Down Expand Up @@ -297,4 +301,10 @@ public static BitSet toMask(BoolStorage storage) {
mask.andNot(storage.getIsMissing());
return mask;
}

/**
 * Returns the natural-order comparator as the default comparator of this storage.
 *
 * <p>NOTE(review): the raw return type together with the unchecked suppression presumably
 * exists to satisfy the {@code Comparator<Object>} declaration in {@code Storage} - confirm.
 *
 * @return {@link Comparator#naturalOrder()}
 */
@SuppressWarnings("unchecked")
@Override
public Comparator getDefaultComparator() {
return Comparator.naturalOrder();
}
}
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
package org.enso.table.data.column.storage;

import java.util.BitSet;
import java.util.Comparator;

import org.enso.table.data.column.builder.object.NumericBuilder;
import org.enso.table.data.column.operation.map.MapOpStorage;
import org.enso.table.data.column.operation.map.UnaryMapOperation;
import org.enso.table.data.column.operation.map.numeric.DoubleBooleanOp;
import org.enso.table.data.column.operation.map.numeric.DoubleNumericOp;
import org.enso.table.data.index.Index;
import org.enso.table.data.mask.OrderMask;

/** A column containing floating point numbers. */
public class DoubleStorage extends NumericStorage {
Expand Down Expand Up @@ -126,7 +129,8 @@ public DoubleStorage mask(BitSet mask, int cardinality) {
}

@Override
public Storage orderMask(int[] positions) {
public Storage applyMask(OrderMask mask) {
int[] positions = mask.getPositions();
long[] newData = new long[positions.length];
BitSet newMissing = new BitSet();
for (int i = 0; i < positions.length; i++) {
Expand Down Expand Up @@ -157,6 +161,11 @@ public Storage countMask(int[] counts, int total) {
return new DoubleStorage(newData, total, newMissing);
}

/**
 * Returns the natural ordering of {@link Double} values as the default comparator of this
 * storage.
 *
 * <p>The return type is raw, so overriding the {@code Comparator<Object>} declaration relies on
 * an unchecked conversion; the warning is suppressed here in the same way as in the other
 * storages that return a raw comparator.
 *
 * @return the natural-order comparator for {@link Double}
 */
@SuppressWarnings("unchecked")
@Override
public Comparator getDefaultComparator() {
  return Comparator.<Double>naturalOrder();
}

public BitSet getIsMissing() {
return isMissing;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,21 +1,17 @@
package org.enso.table.data.column.storage;

import java.util.Arrays;
import java.util.BitSet;
import java.util.OptionalDouble;
import java.util.OptionalLong;
import java.util.stream.DoubleStream;
import java.util.*;
import java.util.stream.LongStream;

import org.enso.table.data.column.builder.object.NumericBuilder;
import org.enso.table.data.column.operation.aggregate.Aggregator;
import org.enso.table.data.column.operation.aggregate.numeric.LongToLongAggregator;
import org.enso.table.data.column.operation.aggregate.numeric.NumericAggregator;
import org.enso.table.data.column.operation.map.MapOpStorage;
import org.enso.table.data.column.operation.map.UnaryMapOperation;
import org.enso.table.data.column.operation.map.numeric.LongBooleanOp;
import org.enso.table.data.column.operation.map.numeric.LongNumericOp;
import org.enso.table.data.index.Index;
import org.enso.table.data.mask.OrderMask;

/** A column storing 64-bit integers. */
public class LongStorage extends NumericStorage {
Expand Down Expand Up @@ -196,7 +192,8 @@ public LongStorage mask(BitSet mask, int cardinality) {
}

@Override
public Storage orderMask(int[] positions) {
public Storage applyMask(OrderMask mask) {
int[] positions = mask.getPositions();
long[] newData = new long[positions.length];
BitSet newMissing = new BitSet();
for (int i = 0; i < positions.length; i++) {
Expand Down Expand Up @@ -227,6 +224,12 @@ public Storage countMask(int[] counts, int total) {
return new LongStorage(newData, total, newMissing);
}

/**
 * Returns the natural ordering of {@link Long} values as the default comparator of this storage.
 *
 * <p>NOTE(review): the raw return type together with the unchecked suppression presumably
 * exists to satisfy the {@code Comparator<Object>} declaration in {@code Storage} - confirm.
 *
 * @return the natural-order comparator for {@link Long}
 */
@SuppressWarnings("unchecked")
@Override
public Comparator getDefaultComparator() {
return Comparator.<Long>naturalOrder();
}

public BitSet getIsMissing() {
return isMissing;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
package org.enso.table.data.column.storage;

import java.util.BitSet;
import java.util.Comparator;

import org.enso.table.data.column.operation.map.MapOpStorage;
import org.enso.table.data.column.operation.map.UnaryMapOperation;
import org.enso.table.data.index.Index;
import org.enso.table.data.mask.OrderMask;

/** A column storing arbitrary objects. */
public class ObjectStorage extends Storage {
Expand Down Expand Up @@ -92,7 +94,8 @@ public ObjectStorage mask(BitSet mask, int cardinality) {
}

@Override
public ObjectStorage orderMask(int[] positions) {
public ObjectStorage applyMask(OrderMask mask) {
int[] positions = mask.getPositions();
Object[] newData = new Object[positions.length];
for (int i = 0; i < positions.length; i++) {
if (positions[i] == Index.NOT_FOUND) {
Expand Down Expand Up @@ -120,6 +123,11 @@ public Object[] getData() {
return data;
}

/**
 * Reports that this storage has no default comparator.
 *
 * @return always {@code null}
 */
@Override
public Comparator<Object> getDefaultComparator() {
return null;
}

private static MapOpStorage<ObjectStorage> buildOps() {
MapOpStorage<ObjectStorage> ops = new MapOpStorage<>();
ops.add(
Expand Down
22 changes: 12 additions & 10 deletions table/src/main/java/org/enso/table/data/column/storage/Storage.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@
import org.enso.table.data.column.operation.aggregate.FunctionAggregator;

import java.util.BitSet;
import java.util.Comparator;
import java.util.List;
import java.util.function.BiFunction;
import java.util.function.Function;
import org.enso.table.data.column.builder.object.Builder;
import org.enso.table.data.column.builder.object.InferredBuilder;

import org.enso.table.data.column.builder.object.ObjectBuilder;
import org.enso.table.data.mask.OrderMask;

/** An abstract representation of a data column. */
public abstract class Storage {
Expand Down Expand Up @@ -228,16 +229,11 @@ protected final Storage fillMissingHelper(Object arg, Builder builder) {
public abstract Storage mask(BitSet mask, int cardinality);

/**
* Returns a new storage, ordered according to the rules specified in a mask. The resulting
* storage should contain the {@code positions[i]}-th element of the original storage at the i-th
* position. {@code positions[i]} may be equal to {@link
* org.enso.table.data.index.Index.NOT_FOUND}, in which case a missing value should be inserted at
* this position.
* Returns a new storage, ordered according to the rules specified in a mask.
*
* @param positions an array specifying the ordering as described
* @return a storage resulting from applying the reordering rules
* @param mask the mask describing the new ordering of the elements
* @return a storage resulting from applying the reordering rules
*/
public abstract Storage orderMask(int[] positions);
public abstract Storage applyMask(OrderMask mask);

/**
* Returns a new storage, resulting from applying the rules specified in a mask. The resulting
Expand All @@ -251,4 +247,10 @@ protected final Storage fillMissingHelper(Object arg, Builder builder) {
* @return the storage masked according to the specified rules
*/
public abstract Storage countMask(int[] counts, int total);

/**
 * Provides the default comparator for the objects held in this storage.
 *
 * @return a comparator comparing objects in this storage in a natural order. May be {@code null}
 * to specify no natural ordering.
 */
public abstract Comparator<Object> getDefaultComparator();
}
Loading

0 comments on commit 93b6680

Please sign in to comment.