From 68b85dea82ab173ae028005920b8dc47d71bf9ef Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Wed, 16 Feb 2022 17:40:33 +0000 Subject: [PATCH] Improvement to the Natural Order Sort (#3276) * Improved Natural Order Data generator for benchmarking * Missing Import Benchmark script * Update Natural_Order.enso Restore missing ToDo * Changelog * PR Comments * PR Comments * Additional comments. * Correction --- CHANGELOG.md | 3 + .../0.0.0-dev/src/Data/Noise/Generator.enso | 4 +- .../src/Data/Ordering/Natural_Order.enso | 138 +++++++++++------- .../Standard/Test/0.0.0-dev/src/Faker.enso | 39 +++++ test/Benchmarks/src/Natural_Order_Sort.enso | 28 ++++ 5 files changed, 154 insertions(+), 58 deletions(-) create mode 100644 distribution/lib/Standard/Test/0.0.0-dev/src/Faker.enso create mode 100644 test/Benchmarks/src/Natural_Order_Sort.enso diff --git a/CHANGELOG.md b/CHANGELOG.md index 2bd22a6cf83c..8cce6efb9db5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,8 @@ - [Implemented `Runtime.get_stack_trace` together with some utilities to process stack traces and code locations][3271] - [Implemented `Vector.flatten`][3259] +- [Significant performance improvement in `Natural_Order` and new `Faker` + methods added to `Standard.Test`][3276] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -63,6 +65,7 @@ [3269]: https://github.com/enso-org/enso/pull/3269 [3271]: https://github.com/enso-org/enso/pull/3271 [3259]: https://github.com/enso-org/enso/pull/3259 +[3276]: https://github.com/enso-org/enso/pull/3276 #### Enso Compiler diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Noise/Generator.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Noise/Generator.enso index b8fd7c1bc24e..e5baffd38afe 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Noise/Generator.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Noise/Generator.enso @@ -40,10 +40,10 @@ type Generator range 
has an equal chance of occurring. type Deterministic_Random - ## A determinstic random noise generator that performs a peterbation of the + ## A deterministic random noise generator that performs a perturbation of the input - It produices what is commonly termed as "white" noise, where any value in + It produces what is commonly termed as "white" noise, where any value in the range has an equal chance of occurring. type Deterministic_Random diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Ordering/Natural_Order.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Ordering/Natural_Order.enso index 7070f10994d5..ef855555371d 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Ordering/Natural_Order.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Ordering/Natural_Order.enso @@ -4,6 +4,9 @@ import Standard.Base.Data.Text.Regex import Standard.Base.Data.Text.Regex.Mode import Standard.Base.Data.Ordering.Vector_Lexicographic_Order +polyglot java import org.enso.base.Text_Utils +polyglot java import com.ibm.icu.text.BreakIterator + ## Compares two text values according to the natural dictionary ordering. > Example @@ -17,62 +20,85 @@ import Standard.Base.Data.Ordering.Vector_Lexicographic_Order ["a2", "a1", "a100", "a001", "a0001"].sort by=Natural_Order.compare . 
should_equal ["a0001", "a001", "a1", "a2", "a100"] compare : Text -> Text -> Ordering compare text1 text2 = - ## TODO [RW] A more efficient algorithm which works better for longer texts - should be implemented, as described in - https://www.pivotaltracker.com/story/show/181176589 - nat1 = here.to_natural_key text1 - nat2 = here.to_natural_key text2 - ## TODO [RW] The additional assignment to a temporary variable is a - workaround for the following bug: - https://www.pivotaltracker.com/story/show/181162108 - res = Vector_Lexicographic_Order.compare nat1 nat2 - res - -## PRIVATE -to_natural_key : Text -> Vector -to_natural_key text = if text.is_empty then [] else - splitter_regex = Regex.compile "([^0-9]+|[0-9]+)" - parts = splitter_regex.find text mode=Mode.All - - ## TODO [RW] Currently there is no `is_digit` method. Once - https://www.pivotaltracker.com/story/show/181176532 is implemented, this - regex can be replaced with a simpler and faster `is_digit` check. - is_integer_regex = Regex.compile "[0-9]+" - parts.map part-> case is_integer_regex.matches part of - True -> - ## TODO [RW] Currently there is no `Integer.parse` method, so we + iter1 = BreakIterator.getCharacterInstance + iter1.setText text1 + + iter2 = BreakIterator.getCharacterInstance + iter2.setText text2 + + ## check if a single character is between '0' and '9' + ascii_code_zero = 48 + ascii_code_nine = 57 + is_digit=character -> character>=ascii_code_zero && character<=ascii_code_nine + + ## Find the end of a number and then return the substring, value and new + indices of the bounds of the next character. If the end of the text has + been reached then the second index will be -1. + get_number text prev next iter = + ## Find end of number and return pair of index and flag if reached end + loop text next iter = + new_next = iter.next + if (new_next == -1) then (Pair next True) else + substring = Text_Utils.substring text next new_next + character = Text_Utils.get_chars substring . 
at 0 + if (is_digit character).not then (Pair next False) else + @Tail_Call loop text new_next iter + + pair = loop text next iter + substring = Text_Utils.substring text prev pair.first + + ## TODO [RW] Currently there is no `Integer.parse` method, so we parse a decimal and convert it to an integer. Once https://www.pivotaltracker.com/story/show/181176522 is implemented, this should be changed to use `Integer.parse`. - value = Decimal.parse part . floor - Natural_Ordering_Numeric_Part value part - False -> - Natural_Ordering_Alphabetical_Part part - -## PRIVATE -type Natural_Ordering_Part - ## PRIVATE - type Natural_Ordering_Alphabetical_Part (text : Text) - - ## PRIVATE - type Natural_Ordering_Numeric_Part (value : Integer) (original_text : Text) - - ## PRIVATE - compare_to : Natural_Ordering_Part -> Ordering - compare_to that = case Pair this that of - Pair (Natural_Ordering_Alphabetical_Part text1) (Natural_Ordering_Alphabetical_Part text2) -> - text1 . compare_to text2 - Pair (Natural_Ordering_Numeric_Part value1 original1) (Natural_Ordering_Numeric_Part value2 original2) -> - value_ordering = value1.compare_to value2 - case value_ordering of - Ordering.Equal -> original1 . compare_to original2 - _ -> value_ordering - Pair (Natural_Ordering_Numeric_Part _ _) (Natural_Ordering_Alphabetical_Part _) -> - Ordering.Less - Pair (Natural_Ordering_Alphabetical_Part _) (Natural_Ordering_Numeric_Part _ _) -> - Ordering.Greater - - to_text : Text - to_text = case this of - Natural_Ordering_Alphabetical_Part text -> text.to_text - Natural_Ordering_Numeric_Part v o -> [v, o].to_text + decimal = Decimal.parse substring . floor + + next_index = if pair.second then -1 else iter.current + [substring, decimal, pair.first, next_index] + + + ## Loop to compute the ordering of text1 and text2. + Ordering: Nothing < Number < Text + prev1 - index to start of current character in text1. + next1 - index to start of next character (or -1 if finished) in text1. 
+ prev2 - index to start of current character in text2. + next2 - index to start of next character (or -1 if finished) in text2. + order prev1 next1 prev2 next2 = + case (Pair (next1 == -1) (next2 == -1)) of + Pair True True -> Ordering.Equal + Pair True False -> Ordering.Less + Pair False True -> Ordering.Greater + Pair False False -> + substring1 = Text_Utils.substring text1 prev1 next1 + first_char_1 = Text_Utils.get_chars substring1 . at 0 + + substring2 = Text_Utils.substring text2 prev2 next2 + first_char_2 = Text_Utils.get_chars substring2 . at 0 + + tmp = Pair (is_digit first_char_1) (is_digit first_char_2) + ## ToDo: Move to case on second block + Appears to be an issue using a nested case statement on a pair + https://www.pivotaltracker.com/story/show/181280737 + if (tmp.first && tmp.second.not) then Ordering.Less else + if (tmp.first.not && tmp.second) then Ordering.Greater else + case tmp.first.not of + True -> + text_comparison = substring1.compare_to substring2 + if text_comparison != Ordering.Equal then text_comparison else + @Tail_Call order next1 iter1.next next2 iter2.next + False -> + parsed1 = get_number text1 prev1 next1 iter1 + num_text1 = parsed1.at 0 + value1 = parsed1.at 1 + + parsed2 = get_number text2 prev2 next2 iter2 + num_text2 = parsed2.at 0 + value2 = parsed2.at 1 + + value_comparison = value1.compare_to value2 + if value_comparison != Ordering.Equal then value_comparison else + text_comparison = num_text1.compare_to num_text2 + if text_comparison != Ordering.Equal then text_comparison else + @Tail_Call order (parsed1.at 2) (parsed1.at 3) (parsed2.at 2) (parsed2.at 3) + + order 0 iter1.next 0 iter2.next diff --git a/distribution/lib/Standard/Test/0.0.0-dev/src/Faker.enso b/distribution/lib/Standard/Test/0.0.0-dev/src/Faker.enso new file mode 100644 index 000000000000..6b3fe8ec54bc --- /dev/null +++ b/distribution/lib/Standard/Test/0.0.0-dev/src/Faker.enso @@ -0,0 +1,39 @@ +from Standard.Base import all + +polyglot java import 
java.util.Random +polyglot java import org.enso.base.Text_Utils + +upper_case_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ".utf_8 +numbers = "0123456789".utf_8 + +## Creates a random number generator which can be used for creating test values. + + Arguments: + - seed: Optional seed value to make the sequence deterministic. The default of 0 creates an unseeded generator (`Random.new`), so 0 cannot be used as a fixed seed. +make_generator : Integer -> Random +make_generator (seed = 0) = + if seed == 0 then Random.new else Random.new seed + + +## Creates a random string based on a template and random number generator. + + Arguments: + - template: Vector of character arrays that represent the possible + characters for each letter. + - generator: Random number generator + + > Examples: + Creates a fake UK National Insurance number: + + l = "ABCEGHJKLMNOPRSTWXYZ".utf_8 + n = "0123456789".utf_8 + s = "ABCDFMP ".utf_8 + template = [l, l, n, n, n, n, n, s] + ni_number = make_string template make_generator +make_string : Vector -> Any -> Text +make_string template generator = + output = Array.new template.length + 0.up_to template.length . 
each i-> + a = template.at i + output.set_at i (a.at (generator.nextInt a.length)) + Text_Utils.from_utf_8 output diff --git a/test/Benchmarks/src/Natural_Order_Sort.enso b/test/Benchmarks/src/Natural_Order_Sort.enso new file mode 100644 index 000000000000..133334048af1 --- /dev/null +++ b/test/Benchmarks/src/Natural_Order_Sort.enso @@ -0,0 +1,28 @@ +from Standard.Base import all + +import Standard.Test.Bench + +import Standard.Test.Faker +import Standard.Base.Data.Ordering.Natural_Order + +## Bench Utilities ============================================================ + +vector_size = 10000 +iter_size = 100 +num_iterations = 10 + + +# The Benchmarks ============================================================== + +main = + l = Faker.upper_case_letters + n = Faker.numbers + template = [l, l, l, n, n, n, n, n, l] + + ## No specific significance to this constant, just fixed to make generated set deterministic + fixed_random_seed = 1644575867 + random_generator = Faker.make_generator fixed_random_seed + + unsorted = 0.up_to here.vector_size . map _->(Faker.make_string template random_generator) + + Bench.measure (unsorted.sort by=Natural_Order.compare) "Natural Order" here.iter_size here.num_iterations