Skip to content

Commit

Permalink
Improvement to the Natural Order Sort (#3276)
Browse files Browse the repository at this point in the history
* Improved Natural Order
Data generator for benchmarking

* Missing Import
Benchmark script

* Update Natural_Order.enso

Restore missing ToDo

* Changelog

* PR Comments

* PR Comments

* Additional comments.

* Correction
  • Loading branch information
jdunkerley authored Feb 16, 2022
1 parent 9f051ad commit 68b85de
Show file tree
Hide file tree
Showing 5 changed files with 154 additions and 58 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@
- [Implemented `Runtime.get_stack_trace` together with some utilities to process
stack traces and code locations][3271]
- [Implemented `Vector.flatten`][3259]
- [Significant performance improvement in `Natural_Order` and new `Faker`
methods added to `Standard.Test`][3276]

[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
Expand All @@ -63,6 +65,7 @@
[3269]: https://github.com/enso-org/enso/pull/3269
[3271]: https://github.com/enso-org/enso/pull/3271
[3259]: https://github.com/enso-org/enso/pull/3259
[3276]: https://github.com/enso-org/enso/pull/3276

#### Enso Compiler

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,10 @@ type Generator
range has an equal chance of occurring.
type Deterministic_Random

## A determinstic random noise generator that performs a peterbation of the
## A deterministic random noise generator that performs a perturbation of the
input

It produices what is commonly termed as "white" noise, where any value in
It produces what is commonly termed as "white" noise, where any value in
the range has an equal chance of occurring.
type Deterministic_Random

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ import Standard.Base.Data.Text.Regex
import Standard.Base.Data.Text.Regex.Mode
import Standard.Base.Data.Ordering.Vector_Lexicographic_Order

polyglot java import org.enso.base.Text_Utils
polyglot java import com.ibm.icu.text.BreakIterator

## Compares two text values according to the natural dictionary ordering.

> Example
Expand All @@ -17,62 +20,85 @@ import Standard.Base.Data.Ordering.Vector_Lexicographic_Order
["a2", "a1", "a100", "a001", "a0001"].sort by=Natural_Order.compare . should_equal ["a0001", "a001", "a1", "a2", "a100"]
compare : Text -> Text -> Ordering
compare text1 text2 =
## TODO [RW] A more efficient algorithm which works better for longer texts
should be implemented, as described in
https://www.pivotaltracker.com/story/show/181176589
nat1 = here.to_natural_key text1
nat2 = here.to_natural_key text2
## TODO [RW] The additional assignment to a temporary variable is a
workaround for the following bug:
https://www.pivotaltracker.com/story/show/181162108
res = Vector_Lexicographic_Order.compare nat1 nat2
res

## PRIVATE
to_natural_key : Text -> Vector
to_natural_key text = if text.is_empty then [] else
splitter_regex = Regex.compile "([^0-9]+|[0-9]+)"
parts = splitter_regex.find text mode=Mode.All

## TODO [RW] Currently there is no `is_digit` method. Once
https://www.pivotaltracker.com/story/show/181176532 is implemented, this
regex can be replaced with a simpler and faster `is_digit` check.
is_integer_regex = Regex.compile "[0-9]+"
parts.map part-> case is_integer_regex.matches part of
True ->
## TODO [RW] Currently there is no `Integer.parse` method, so we
iter1 = BreakIterator.getCharacterInstance
iter1.setText text1

iter2 = BreakIterator.getCharacterInstance
iter2.setText text2

## check if a single character is between '0' and '9'
ascii_code_zero = 48
ascii_code_nine = 57
is_digit=character -> character>=ascii_code_zero && character<=ascii_code_nine

## Find the end of a number and then return the substring, value and new
indices of the bounds of the next character. If the end of the text has
been reached then the second index will be -1.
get_number text prev next iter =
## Find end of number and return pair of index and flag if reached end
loop text next iter =
new_next = iter.next
if (new_next == -1) then (Pair next True) else
substring = Text_Utils.substring text next new_next
character = Text_Utils.get_chars substring . at 0
if (is_digit character).not then (Pair next False) else
@Tail_Call loop text new_next iter

pair = loop text next iter
substring = Text_Utils.substring text prev pair.first

## TODO [RW] Currently there is no `Integer.parse` method, so we
parse a decimal and convert it to an integer. Once
https://www.pivotaltracker.com/story/show/181176522 is
implemented, this should be changed to use `Integer.parse`.
value = Decimal.parse part . floor
Natural_Ordering_Numeric_Part value part
False ->
Natural_Ordering_Alphabetical_Part part

## PRIVATE
type Natural_Ordering_Part
## PRIVATE
type Natural_Ordering_Alphabetical_Part (text : Text)

## PRIVATE
type Natural_Ordering_Numeric_Part (value : Integer) (original_text : Text)

## PRIVATE
compare_to : Natural_Ordering_Part -> Ordering
compare_to that = case Pair this that of
Pair (Natural_Ordering_Alphabetical_Part text1) (Natural_Ordering_Alphabetical_Part text2) ->
text1 . compare_to text2
Pair (Natural_Ordering_Numeric_Part value1 original1) (Natural_Ordering_Numeric_Part value2 original2) ->
value_ordering = value1.compare_to value2
case value_ordering of
Ordering.Equal -> original1 . compare_to original2
_ -> value_ordering
Pair (Natural_Ordering_Numeric_Part _ _) (Natural_Ordering_Alphabetical_Part _) ->
Ordering.Less
Pair (Natural_Ordering_Alphabetical_Part _) (Natural_Ordering_Numeric_Part _ _) ->
Ordering.Greater

to_text : Text
to_text = case this of
Natural_Ordering_Alphabetical_Part text -> text.to_text
Natural_Ordering_Numeric_Part v o -> [v, o].to_text
decimal = Decimal.parse substring . floor

next_index = if pair.second then -1 else iter.current
[substring, decimal, pair.first, next_index]


## Loop to compute the ordering of text1 and text2.
Ordering: Nothing < Number < Text
prev1 - index to start of current character in text1.
next1 - index to start of next character (or -1 if finished) in text1.
prev2 - index to start of current character in text2.
next2 - index to start of next character (or -1 if finished) in text2.
order prev1 next1 prev2 next2 =
case (Pair (next1 == -1) (next2 == -1)) of
Pair True True -> Ordering.Equal
Pair True False -> Ordering.Less
Pair False True -> Ordering.Greater
Pair False False ->
substring1 = Text_Utils.substring text1 prev1 next1
first_char_1 = Text_Utils.get_chars substring1 . at 0

substring2 = Text_Utils.substring text2 prev2 next2
first_char_2 = Text_Utils.get_chars substring2 . at 0

tmp = Pair (is_digit first_char_1) (is_digit first_char_2)
## ToDo: Move to case on second block
Appears to be an issue using a nested case statement on a pair
https://www.pivotaltracker.com/story/show/181280737
if (tmp.first && tmp.second.not) then Ordering.Less else
if (tmp.first.not && tmp.second) then Ordering.Greater else
case tmp.first.not of
True ->
text_comparison = substring1.compare_to substring2
if text_comparison != Ordering.Equal then text_comparison else
@Tail_Call order next1 iter1.next next2 iter2.next
False ->
parsed1 = get_number text1 prev1 next1 iter1
num_text1 = parsed1.at 0
value1 = parsed1.at 1

parsed2 = get_number text2 prev2 next2 iter2
num_text2 = parsed2.at 0
value2 = parsed2.at 1

value_comparison = value1.compare_to value2
if value_comparison != Ordering.Equal then value_comparison else
text_comparison = num_text1.compare_to num_text2
if text_comparison != Ordering.Equal then text_comparison else
@Tail_Call order (parsed1.at 2) (parsed1.at 3) (parsed2.at 2) (parsed2.at 3)

order 0 iter1.next 0 iter2.next
39 changes: 39 additions & 0 deletions distribution/lib/Standard/Test/0.0.0-dev/src/Faker.enso
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from Standard.Base import all

polyglot java import java.util.Random
polyglot java import org.enso.base.Text_Utils

# UTF-8 encoding of the upper-case letters A-Z; intended to be used as the set
# of candidate characters for one position of a `make_string` template.
upper_case_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ".utf_8
# UTF-8 encoding of the decimal digits 0-9; intended to be used as the set of
# candidate characters for one position of a `make_string` template.
numbers = "0123456789".utf_8

## Builds a `java.util.Random` that can be used to produce test values.

   Arguments:
   - seed: Seed for the generator. The default value of 0 is treated as "no
     seed requested": the generator is created with `Random.new` and no
     argument. Any other value is passed on to `Random.new`, which makes the
     produced sequence deterministic. As a consequence, 0 itself cannot be used
     as a deterministic seed.
make_generator : Integer -> Random
make_generator (seed = 0) =
    if seed == 0 then Random.new else
        Random.new seed


## Generates a random text by picking one character for each position of a
   template.

   Arguments:
   - template: Vector with one entry per character of the output. Each entry
     holds the candidate characters for that position (for example the result
     of `Text.utf_8`).
   - generator: Random number generator, such as the one returned by
     `make_generator`. Its `nextInt` method is called once per template entry,
     in order, with the number of candidates for that position.

   > Example
     Creates a fake UK National Insurance number:

         l = "ABCEGHJKLMNOPRSTWXYZ".utf_8
         n = "0123456789".utf_8
         s = "ABCDFMP ".utf_8
         template = [l, l, n, n, n, n, n, s]
         ni_number = make_string template make_generator
make_string : Vector -> Any -> Text
make_string template generator =
    length = template.length
    bytes = Array.new length
    # Draws one of the candidates uniformly by index.
    pick choices = choices.at (generator.nextInt choices.length)
    0.up_to length . each index->
        bytes.set_at index (pick (template.at index))
    # The chosen values are decoded together as a single UTF-8 sequence.
    Text_Utils.from_utf_8 bytes
28 changes: 28 additions & 0 deletions test/Benchmarks/src/Natural_Order_Sort.enso
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from Standard.Base import all

import Standard.Test.Bench

import Standard.Test.Faker
import Standard.Base.Data.Ordering.Natural_Order

## Bench Utilities ============================================================

# Number of random strings that `main` generates and sorts.
vector_size = 10000
# Passed to `Bench.measure` as its third argument.
# NOTE(review): presumably the number of runs per measured iteration - confirm
# against `Standard.Test.Bench`.
iter_size = 100
# Passed to `Bench.measure` as its fourth argument.
# NOTE(review): presumably the number of measured iterations - confirm against
# `Standard.Test.Bench`.
num_iterations = 10


# The Benchmarks ==============================================================

main =
    letter = Faker.upper_case_letters
    digit = Faker.numbers
    # Each generated item is three letters, five digits and a trailing letter.
    template = [letter, letter, letter, digit, digit, digit, digit, digit, letter]

    # The seed value has no special meaning; it is fixed only so that every
    # run generates the same data set.
    seed = 1644575867
    generator = Faker.make_generator seed

    unsorted = 0.up_to here.vector_size . map (_ -> Faker.make_string template generator)

    # The sort expression is passed inline rather than bound to a local first.
    # NOTE(review): presumably `Bench.measure` takes it as a suspended
    # computation and re-runs it for every measurement - confirm.
    Bench.measure (unsorted.sort by=Natural_Order.compare) "Natural Order" here.iter_size here.num_iterations

0 comments on commit 68b85de

Please sign in to comment.