Skip to content

Commit

Permalink
Restructuring the Faker type and creating tests for Group_By (#3318)
Browse files Browse the repository at this point in the history
- Added Minimum, Maximum, Longest, Shortest, Mode, Percentile
- Added first and last to Map
- Restructured Faker type to be more in line with FakerJS
- Created 2,500 row data set
- Tests for group_by
- Performance tests for group_by
  • Loading branch information
jdunkerley authored Mar 9, 2022
1 parent f921081 commit 65465fb
Show file tree
Hide file tree
Showing 16 changed files with 3,333 additions and 132 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@
`Vector.fold_with_index` and `Vector.take` methods.][3236]
- [Implemented new `Text.insert` method][3311]
- [Implemented `Bool.compare_to` method][3317]
- [Implemented `Map.first`, `Map.last` functions. Expanded `Table.group_by` to
also compute mode, percentile, minimum, maximum.][3318]

[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
Expand Down Expand Up @@ -101,6 +103,7 @@
[3236]: https://github.com/enso-org/enso/pull/3236
[3311]: https://github.com/enso-org/enso/pull/3311
[3317]: https://github.com/enso-org/enso/pull/3317
[3318]: https://github.com/enso-org/enso/pull/3318

#### Enso Compiler

Expand Down
18 changes: 18 additions & 0 deletions distribution/lib/Standard/Base/0.0.0-dev/src/Data/Map.enso
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,24 @@ type Map
to_vector_with_builder this
builder.to_vector

## Returns the entry with the lowest key in the map as a `Pair` of key and
   value.
   If the map is empty, returns Nothing.
first : Pair
first =
    # Keep descending into the left child, remembering the entry of the last
    # `Bin` visited; that remembered entry is returned once a `Tip` is reached.
    leftmost found node = case node of
        Tip -> found
        Bin _ k v l _ -> @Tail_Call leftmost (Pair k v) l
    leftmost Nothing this

## Returns the entry with the highest key in the map as a `Pair` of key and
   value.
   If the map is empty, returns Nothing.
last : Pair
last =
    # Keep descending into the right child, remembering the entry of the last
    # `Bin` visited; that remembered entry is returned once a `Tip` is reached.
    rightmost found node = case node of
        Tip -> found
        Bin _ k v _ r -> @Tail_Call rightmost (Pair k v) r
    rightmost Nothing this

## UNSTABLE

An error for getting a missing value from a map.
Expand Down
211 changes: 167 additions & 44 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Data/Aggregate_Column.enso

Large diffs are not rendered by default.

37 changes: 13 additions & 24 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Data/Group_By.enso
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,7 @@ import Standard.Base.Data.Ordering.Vector_Lexicographic_Order
## Create a key structure for grouping operations
key : Vector -> Group_By_Key
key values =
mapper c = case c of
Boolean -> Comparable_Boolean c
Nothing -> Comparable_Nothing
_ -> c

mapper c = Comparable_Value c
Group_By_Key <| values.map mapper

## PRIVATE
Expand All @@ -25,30 +21,23 @@ type Group_By_Key
compare_to that =
Vector_Lexicographic_Order.compare this.values that.values

## PRIVATE
Temporary workaround until Boolean compare_to completed
type Comparable_Boolean
type Comparable_Boolean value

== : Comparable_Boolean->Boolean
== that = (this.compare_to that) == Ordering.Equal

compare_to : Any->Ordering
compare_to that =
if this.value == that.value then Ordering.Equal else
if this.value then Ordering.Greater else Ordering.Less

## PRIVATE
Temporary workaround allowing Nothing to be in a Group_By
type Comparable_Nothing
type Comparable_Nothing
type Comparable_Value
type Comparable_Value value

== : Comparable_Nothing->Boolean
== that = (this.compare_to that) == Ordering.Equal

compare_to : Any->Ordering
compare_to that =
case that of
Comparable_Nothing -> Ordering.Equal
Nothing -> Ordering.Equal
_ -> Ordering.Less
value = case that of
Comparable_Value v -> v
_ -> that

case this.value of
Nothing -> if value.is_nothing then Ordering.Equal else Ordering.Less
_ -> if value.is_nothing then Ordering.Greater else this.value.compare_to value

is_nothing : Boolean
is_nothing = this.value.is_nothing
8 changes: 5 additions & 3 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import Standard.Base.Data.Time.Date
import Standard.Table.Io.Spreadsheet_Write_Mode
import Standard.Table.Io.Format
import Standard.Table.Internal.Table_Helpers
import Standard.Table.Internal.Unique_Name_Strategy

from Standard.Table.Data.Order_Rule as Order_Rule_Module import Order_Rule
from Standard.Table.Data.Column_Selector as Column_Selector_Module import Column_Selector, By_Index
Expand Down Expand Up @@ -516,16 +517,17 @@ type Table
group_by selector columns (on_problems=Report_Warning) (warnings=Warnings.default) =
# Grouping Key
key_columns = if selector.is_nothing then [] else
Table_Helpers.select_columns internal_columns=this.columns selector=selector reorder=True on_problems=on_problems warnings=warnings
Table_Helpers.select_columns_helper internal_columns=this.columns selector=selector reorder=True on_problems=on_problems warnings=warnings
key_length = key_columns.length
make_key = if (key_length == 0) then _->(Group_By.key [1]) else i->(Group_By.key (key_columns.map v->(v.at i)))

# New Table Accumulator
new_table = (key_columns.map c->c.name)+(columns.map c->(c.column_name this)) . map n->[n, Vector.new_builder]
name_strategy = Unique_Name_Strategy.new
new_table = (key_columns.map c->c.name)+(columns.map c->(c.column_name this)) . map n->[name_strategy.make_unique n, Vector.new_builder]
add_row key =
idx = new_table.at 0 . at 1 . length
0.up_to key_length . each i->
new_table.at i . at 1 . append (key.values.at i)
new_table.at i . at 1 . append (key.values.at i).value
0.up_to (columns.length) . each i->
column = columns.at i
new_table.at (i + key_length) . at 1 . append (column.initial_value)
Expand Down
109 changes: 85 additions & 24 deletions distribution/lib/Standard/Test/0.0.0-dev/src/Faker.enso
Original file line number Diff line number Diff line change
Expand Up @@ -7,33 +7,94 @@ upper_case_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ".utf_16
lower_case_letters = "abcdefghijklmnopqrstuvwxyz".utf_16
numbers = "0123456789".utf_16

## Creates a random number generator which can be used for creating test values.
## Creates a new Faker which can be used for creating test values.

Arguments:
- seed: Optional seed value to make the sequence deterministic
make_generator : Integer -> Random
make_generator (seed = 0) =
if seed == 0 then Random.new else Random.new seed
new : Integer -> Faker
new (seed = 0) =
generator = if seed == 0 then Random.new else Random.new seed
Faker generator

## Object to generate (deterministic) random value for testing
type Faker
type Faker generator

## Creates a random string based on a template and random number generator.
## Creates a random Text based on a template of character sets.

Arguments:
- template: Vector of vectors that represent the possible characters for each
letter, as UTF-16 code units.
- generator: Random number generator

> Examples:
Creates a fake UK National Insurance number:

l = "ABCEGHJKLMNOPRSTWXYZ".utf_16
n = "0123456789".utf_16
s = "ABCDFMP ".utf_16
template = [l, l, n, n, n, n, n, s]
ni_number = make_string template make_generator
make_string : Vector -> Any -> Text
make_string template generator =
characters = template.map possible_chars->
selected_char_ix = generator.nextInt possible_chars.length
possible_chars.at selected_char_ix
Text.from_utf_16 characters
Arguments:
- template: Vector of vectors that represent the possible characters for each
letter, as UTF-16 code units.

> Examples:
Creates a fake UK National Insurance number:

l = "ABCEGHJKLMNOPRSTWXYZ".utf_16
n = "0123456789".utf_16
s = "ABCDFMP ".utf_16
template = [l, l, n, n, n, n, n, s]
ni_number = Faker.new . string_value template
string_value : Vector -> Text
string_value template =
characters = template.map possible_chars->
selected_char_ix = this.generator.nextInt possible_chars.length
possible_chars.at selected_char_ix
Text.from_utf_16 characters

## Generates a Text made only of letters, all of the same case.

   Arguments:
   - length: number of characters to generate
   - upper_case: when True upper case letters are used, otherwise lower case
alpha : Integer->Boolean->Text
alpha length=1 upper_case=False =
    letters = if upper_case then here.upper_case_letters else here.lower_case_letters
    # One copy of the letter set per output character.
    template = 0.up_to length . map _->letters
    this.string_value template

## Generates a Text made of letters (all of the same case) and digits.

   Arguments:
   - length: number of characters to generate
   - upper_case: when True upper case letters are used, otherwise lower case
alpha_numeric : Integer->Boolean->Text
alpha_numeric length=1 upper_case=False =
    letters = if upper_case then here.upper_case_letters else here.lower_case_letters
    alphabet = letters + here.numbers
    # One copy of the combined character set per output character.
    template = 0.up_to length . map _->alphabet
    this.string_value template

## Generates a Text of hexadecimal digits (0-9 and upper case A-F).

   Arguments:
   - length: number of characters to generate
hexadecimal : Integer->Text
hexadecimal length=1 =
    hex_digits = "0123456789ABCDEF".utf_16
    # One copy of the digit set per output character.
    template = 0.up_to length . map _->hex_digits
    this.string_value template

## Create a random Boolean value.
   True is returned when the generator's `nextDouble` draw is below 0.5.
boolean : Boolean
boolean =
    this.generator.nextDouble < 0.5

## Create a random Integer value

   Arguments:
   - minimum: value added to the generator's draw (0 by default)
   - maximum: `maximum - minimum` is the bound handed to the generator's
     `nextInt` (100 by default)
integer : Integer->Integer->Integer
integer minimum=0 maximum=100 =
    span = maximum - minimum
    offset = this.generator.nextInt span
    minimum + offset

## Create a random Decimal value

   Arguments:
   - minimum: value added to the scaled draw (0.0 by default)
   - maximum: the generator's `nextDouble` draw is scaled by
     `maximum - minimum` (1.0 by default)
decimal : Decimal->Decimal->Decimal
decimal minimum=0.0 maximum=1.0 =
    width = maximum - minimum
    minimum + (this.generator.nextDouble * width)

## Picks an item at random from a Vector, using this Faker's generator.

   Arguments:
   - items: Vector of items to pick from
vector_item : Vector->Any
vector_item items =
    index = this.generator.nextInt items.length
    items.at index

## Randomly replaces a value with Nothing

   Arguments:
   - value: the value returned when it is not replaced
   - chance: Nothing is returned when the generator's `nextDouble` draw is
     less than or equal to this (0.1 by default)
make_some_nothing : Any->Decimal->Any
make_some_nothing value (chance=0.1) =
    draw = this.generator.nextDouble
    if draw <= chance then Nothing else value
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
package org.enso.interpreter.node.expression.builtin.mutable;

import com.oracle.truffle.api.dsl.Cached;
import com.oracle.truffle.api.dsl.CachedContext;
import com.oracle.truffle.api.dsl.Fallback;
import com.oracle.truffle.api.dsl.Specialization;
import com.oracle.truffle.api.interop.InteropLibrary;
import com.oracle.truffle.api.interop.InvalidArrayIndexException;
import com.oracle.truffle.api.interop.UnsupportedMessageException;
import com.oracle.truffle.api.library.CachedLibrary;
import com.oracle.truffle.api.nodes.Node;
import org.enso.interpreter.Language;
import org.enso.interpreter.dsl.BuiltinMethod;
import org.enso.interpreter.node.expression.builtin.interop.syntax.HostValueToEnsoNode;
import org.enso.interpreter.runtime.Context;
Expand All @@ -34,11 +32,10 @@ Object doArray(
long source_index,
Array dest,
long dest_index,
long count,
@CachedContext(Language.class) Context ctx) {
long count) {
System.arraycopy(
src.getItems(), (int) source_index, dest.getItems(), (int) dest_index, (int) count);
return ctx.getBuiltins().nothing().newInstance();
return Context.get(this).getBuiltins().nothing().newInstance();
}

@Specialization(guards = "arrays.hasArrayElements(src)")
Expand Down
6 changes: 4 additions & 2 deletions test/Benchmarks/src/Natural_Order_Sort.enso
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@ main =

## No specific significance to this constant, just fixed to make generated set deterministic
fixed_random_seed = 1644575867
random_generator = Faker.make_generator fixed_random_seed
faker = Faker.new fixed_random_seed

unsorted = 0.up_to here.vector_size . map _->(Faker.make_string template random_generator)
IO.println <| "Creating unsorted vector"
unsorted = 0.up_to here.vector_size . map _->(faker.string_value template)

IO.println <| "Benchmarking..."
Bench.measure (unsorted.sort by=Natural_Order.compare) "Natural Order" here.iter_size here.num_iterations
30 changes: 8 additions & 22 deletions test/Benchmarks/src/Number_Parse.enso
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,6 @@ import Standard.Test.Faker

## Bench Utilities ============================================================

make_double_strings : Integer -> Any -> Integer -> Integer -> Vector
make_double_strings count generator (min = -1000000000) (max = 1000000000) =
range = max - min
output = Array.new count
0.up_to count . each i->
v = generator.nextDouble * range - min
output.set_at i v.to_text
Vector.Vector output

make_integer_strings : Integer -> Any -> Integer -> Integer -> Vector
make_integer_strings count generator (min = -1000000000) (max = 1000000000) =
range = max - min
output = Array.new count
0.up_to count . each i->
v = (generator.nextInt range - min)
output.set_at i v.to_text
Vector.Vector output

vector_size = 1000000
iter_size = 100
num_iterations = 10
Expand All @@ -32,10 +14,14 @@ num_iterations = 10
main =
## No specific significance to this constant, just fixed to make generated set deterministic
fixed_random_seed = 1644575867
random_generator = Faker.make_generator fixed_random_seed
faker = Faker.new fixed_random_seed

double_string = here.make_double_strings here.vector_size random_generator
Bench.measure (double_string.map Decimal.parse) "Decimal.parse" here.iter_size here.num_iterations
IO.println <| "Creating decimal strings"
decimal_strings = Vector.new here.vector_size _->(faker.decimal -1000000000 1000000000).to_text
IO.println <| "Benchmarking Decimal.parse"
Bench.measure (decimal_strings.map Decimal.parse) "Decimal.parse" here.iter_size here.num_iterations

int_strings = here.make_integer_strings here.vector_size random_generator
IO.println <| "Creating integer strings"
int_strings = Vector.new here.vector_size _->(faker.integer -1000000000 1000000000).to_text
IO.println <| "Benchmarking Integer.parse"
Bench.measure (int_strings.map Integer.parse) "Integer.parse" here.iter_size here.num_iterations
49 changes: 49 additions & 0 deletions test/Benchmarks/src/Table/Group_By.enso
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from Standard.Base import all

import Standard.Test.Bench
import Standard.Test.Faker

import Standard.Table.Data.Table
import Standard.Table.Data.Column_Selector
from Standard.Table.Data.Aggregate_Column import all

## Bench Utilities ============================================================

vector_size = 2500
iter_size = 100
num_iterations = 10

## Builds the benchmark table from values produced by a seeded Faker.

   Arguments:
   - rows: number of rows to generate for every column
   - seed: seed handed to `Faker.new`
create_table : Integer->Integer->Table
create_table rows (seed=1646322139) =
    faker = Faker.new seed
    # Builds a [name, values] pair by calling `make` once per row. Columns are
    # built one after another in the order below, so the order in which values
    # are drawn from the faker is unchanged.
    column name make = [name, 0.up_to rows . map make]
    key1 = column "Code" (_-> faker.alpha 3)
    key2 = column "Index" (_-> faker.integer 0 10)
    key3 = column "Flag" (_-> faker.boolean)
    value1 = column "Value" (_-> ((faker.decimal -100 100)*100000).floor/100000)
    value2 = column "ValueWithNothing" (_-> faker.make_some_nothing ((faker.decimal -100 100)*100).floor/100)
    text1 = column "TextWithNothing" (_-> faker.make_some_nothing (faker.alpha_numeric 10))
    text2 = column "Hexadecimal" (_-> faker.make_some_nothing (faker.hexadecimal 8))
    Table.new [key1, key2, key3, value1, value2, text1, text2]

# The Benchmarks ==============================================================
# Entry point: builds the test table once, then measures `group_by` for five
# aggregates (Count, Count_Distinct, Standard_Deviation, Median, Mode) under
# three different key selectors.
main =
IO.println <| "Making table data..."
table = here.create_table here.vector_size

# Empty `By_Index` selector (labelled "... table").
Bench.measure (table.group_by (Column_Selector.By_Index []) [Count Nothing]) "Count table" here.iter_size here.num_iterations
Bench.measure (table.group_by (Column_Selector.By_Index []) [Count_Distinct "Index"]) "Count Distinct table" here.iter_size here.num_iterations
Bench.measure (table.group_by (Column_Selector.By_Index []) [Standard_Deviation "Value"]) "StDev table" here.iter_size here.num_iterations
Bench.measure (table.group_by (Column_Selector.By_Index []) [Median "Value"]) "Median table" here.iter_size here.num_iterations
Bench.measure (table.group_by (Column_Selector.By_Index []) [Mode "Index"]) "Mode table" here.iter_size here.num_iterations

# Single key column, "Index" (labelled "... grouped").
Bench.measure (table.group_by (Column_Selector.By_Name.new ["Index"]) [Count Nothing]) "Count grouped" here.iter_size here.num_iterations
Bench.measure (table.group_by (Column_Selector.By_Name.new ["Index"]) [Count_Distinct "Code"]) "Count Distinct grouped" here.iter_size here.num_iterations
Bench.measure (table.group_by (Column_Selector.By_Name.new ["Index"]) [Standard_Deviation "Value"]) "StDev grouped" here.iter_size here.num_iterations
Bench.measure (table.group_by (Column_Selector.By_Name.new ["Index"]) [Median "Value"]) "Median grouped" here.iter_size here.num_iterations
Bench.measure (table.group_by (Column_Selector.By_Name.new ["Index"]) [Mode "Index"]) "Mode grouped" here.iter_size here.num_iterations

# Two key columns, "Index" and "Flag" (labelled "... 2 level groups").
Bench.measure (table.group_by (Column_Selector.By_Name.new ["Index", "Flag"]) [Count Nothing]) "Count 2 level groups" here.iter_size here.num_iterations
Bench.measure (table.group_by (Column_Selector.By_Name.new ["Index", "Flag"]) [Count_Distinct "Code"]) "Count Distinct 2 level groups" here.iter_size here.num_iterations
Bench.measure (table.group_by (Column_Selector.By_Name.new ["Index", "Flag"]) [Standard_Deviation "Value"]) "StDev 2 level groups" here.iter_size here.num_iterations
Bench.measure (table.group_by (Column_Selector.By_Name.new ["Index", "Flag"]) [Median "Value"]) "Median 2 level groups" here.iter_size here.num_iterations
Bench.measure (table.group_by (Column_Selector.By_Name.new ["Index", "Flag"]) [Mode "Index"]) "Mode 2 level groups" here.iter_size here.num_iterations
Loading

0 comments on commit 65465fb

Please sign in to comment.