Improvement to the Natural Order Sort (#3276)

* Improved Natural Order Data generator for benchmarking
* Missing Import Benchmark script
* Update Natural_Order.enso Restore missing ToDo
* Changelog
* PR Comments
* PR Comments
* Additional comments.
* Correction
enso-org · Feb 16, 2022 · 68b85de · 68b85de
1 parent 9f051ad
commit 68b85de
Show file tree

Hide file tree

Showing 5 changed files with 154 additions and 58 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -41,6 +41,8 @@
 - [Implemented `Runtime.get_stack_trace` together with some utilities to process
   stack traces and code locations][3271]
 - [Implemented `Vector.flatten`][3259]
+- [Significant performance improvement in `Natural_Order` and new `Faker`
+  methods added to `Standard.Test`][3276]
 
 [debug-shortcuts]:
   https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -63,6 +65,7 @@
 [3269]: https://github.com/enso-org/enso/pull/3269
 [3271]: https://github.com/enso-org/enso/pull/3271
 [3259]: https://github.com/enso-org/enso/pull/3259
+[3276]: https://github.com/enso-org/enso/pull/3276
 
 #### Enso Compiler
 

diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Noise/Generator.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Noise/Generator.enso
@@ -40,10 +40,10 @@ type Generator
    range has an equal chance of occurring.
 type Deterministic_Random
 
-    ## A determinstic random noise generator that performs a peterbation of the
+    ## A deterministic random noise generator that performs a perturbation of the
        input
 
-       It produices what is commonly termed as "white" noise, where any value in
+       It produces what is commonly termed as "white" noise, where any value in
        the range has an equal chance of occurring.
     type Deterministic_Random
 

diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Ordering/Natural_Order.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Ordering/Natural_Order.enso
@@ -4,6 +4,9 @@ import Standard.Base.Data.Text.Regex
 import Standard.Base.Data.Text.Regex.Mode
 import Standard.Base.Data.Ordering.Vector_Lexicographic_Order
 
+polyglot java import org.enso.base.Text_Utils
+polyglot java import com.ibm.icu.text.BreakIterator
+
 ## Compares two text values according to the natural dictionary ordering.
 
    > Example
@@ -17,62 +20,85 @@ import Standard.Base.Data.Ordering.Vector_Lexicographic_Order
          ["a2", "a1", "a100", "a001", "a0001"].sort by=Natural_Order.compare . should_equal ["a0001", "a001", "a1", "a2", "a100"]
 compare : Text -> Text -> Ordering
 compare text1 text2 =
-    ## TODO [RW] A more efficient algorithm which works better for longer texts
-       should be implemented, as described in
-       https://www.pivotaltracker.com/story/show/181176589
-    nat1 = here.to_natural_key text1
-    nat2 = here.to_natural_key text2
-    ## TODO [RW] The additional assignment to a temporary variable is a
-       workaround for the following bug:
-       https://www.pivotaltracker.com/story/show/181162108
-    res = Vector_Lexicographic_Order.compare nat1 nat2
-    res
-
-## PRIVATE
-to_natural_key : Text -> Vector
-to_natural_key text = if text.is_empty then [] else
-    splitter_regex = Regex.compile "([^0-9]+|[0-9]+)"
-    parts = splitter_regex.find text mode=Mode.All
-
-    ## TODO [RW] Currently there is no `is_digit` method. Once
-       https://www.pivotaltracker.com/story/show/181176532 is implemented, this
-       regex can be replaced with a simpler and faster `is_digit` check.
-    is_integer_regex = Regex.compile "[0-9]+"
-    parts.map part-> case is_integer_regex.matches part of
-        True ->
-            ## TODO [RW] Currently there is no `Integer.parse` method, so we
+    iter1 = BreakIterator.getCharacterInstance
+    iter1.setText text1
+
+    iter2 = BreakIterator.getCharacterInstance
+    iter2.setText text2
+
+    ## check if a single character is between '0' and '9'
+    ascii_code_zero = 48
+    ascii_code_nine = 57
+    is_digit=character -> character>=ascii_code_zero && character<=ascii_code_nine
+
+    ## Find the end of a number and then return the substring, value and new
+       indices of the bounds of the next character. If the end of the text has
+       been reached then the second index will be -1.
+    get_number text prev next iter =
+        ## Find end of number and return pair of index and flag if reached end
+        loop text next iter =
+            new_next = iter.next
+            if (new_next == -1) then (Pair next True) else
+                substring = Text_Utils.substring text next new_next
+                character = Text_Utils.get_chars substring . at 0
+                if (is_digit character).not then (Pair next False) else
+                    @Tail_Call loop text new_next iter
+
+        pair = loop text next iter
+        substring = Text_Utils.substring text prev pair.first
+
+        ## TODO [RW] Currently there is no `Integer.parse` method, so we
                parse a decimal and convert it to an integer. Once
                https://www.pivotaltracker.com/story/show/181176522 is
                implemented, this should be changed to use `Integer.parse`.
-            value = Decimal.parse part . floor
-            Natural_Ordering_Numeric_Part value part
-        False ->
-            Natural_Ordering_Alphabetical_Part part
-
-## PRIVATE
-type Natural_Ordering_Part
-    ## PRIVATE
-    type Natural_Ordering_Alphabetical_Part (text : Text)
-
-    ## PRIVATE
-    type Natural_Ordering_Numeric_Part (value : Integer) (original_text : Text)
-
-    ## PRIVATE
-    compare_to : Natural_Ordering_Part -> Ordering
-    compare_to that = case Pair this that of
-        Pair (Natural_Ordering_Alphabetical_Part text1) (Natural_Ordering_Alphabetical_Part text2) ->
-            text1 . compare_to text2
-        Pair (Natural_Ordering_Numeric_Part value1 original1) (Natural_Ordering_Numeric_Part value2 original2) ->
-            value_ordering = value1.compare_to value2
-            case value_ordering of
-                Ordering.Equal -> original1 . compare_to original2
-                _ -> value_ordering
-        Pair (Natural_Ordering_Numeric_Part _ _) (Natural_Ordering_Alphabetical_Part _) ->
-            Ordering.Less
-        Pair (Natural_Ordering_Alphabetical_Part _) (Natural_Ordering_Numeric_Part _ _) ->
-            Ordering.Greater
-
-    to_text : Text
-    to_text = case this of
-        Natural_Ordering_Alphabetical_Part text -> text.to_text
-        Natural_Ordering_Numeric_Part v o -> [v, o].to_text
+        decimal = Decimal.parse substring . floor
+
+        next_index = if pair.second then -1 else iter.current
+        [substring, decimal, pair.first, next_index]
+
+
+    ## Loop to compute the ordering of text1 and text2.
+       Ordering: Nothing < Number < Text
+       prev1 - index to start of current character in text1.
+       next1 - index to start of next character (or -1 if finished) in text1.
+       prev2 - index to start of current character in text2.
+       next2 - index to start of next character (or -1 if finished) in text2.
+    order prev1 next1 prev2 next2 =
+        case (Pair (next1 == -1) (next2 == -1)) of
+            Pair True True -> Ordering.Equal
+            Pair True False -> Ordering.Less
+            Pair False True -> Ordering.Greater
+            Pair False False ->
+                substring1 = Text_Utils.substring text1 prev1 next1
+                first_char_1 = Text_Utils.get_chars substring1 . at 0
+
+                substring2 = Text_Utils.substring text2 prev2 next2
+                first_char_2 = Text_Utils.get_chars substring2 . at 0
+
+                tmp = Pair (is_digit first_char_1) (is_digit first_char_2)
+                ## ToDo: Move to case on second block
+                   Appears to be an issue using a nested case statement on a pair
+                   https://www.pivotaltracker.com/story/show/181280737
+                if (tmp.first && tmp.second.not) then Ordering.Less else
+                    if (tmp.first.not && tmp.second) then Ordering.Greater else
+                        case tmp.first.not of
+                            True ->
+                                text_comparison = substring1.compare_to substring2
+                                if text_comparison != Ordering.Equal then text_comparison else
+                                    @Tail_Call order next1 iter1.next next2 iter2.next
+                            False ->
+                                parsed1 = get_number text1 prev1 next1 iter1
+                                num_text1 = parsed1.at 0
+                                value1 = parsed1.at 1
+
+                                parsed2 = get_number text2 prev2 next2 iter2
+                                num_text2 = parsed2.at 0
+                                value2 = parsed2.at 1
+
+                                value_comparison = value1.compare_to value2
+                                if value_comparison != Ordering.Equal then value_comparison else
+                                    text_comparison = num_text1.compare_to num_text2
+                                    if text_comparison != Ordering.Equal then text_comparison else
+                                        @Tail_Call order (parsed1.at 2) (parsed1.at 3) (parsed2.at 2) (parsed2.at 3)
+
+    order 0 iter1.next 0 iter2.next
diff --git a/distribution/lib/Standard/Test/0.0.0-dev/src/Faker.enso b/distribution/lib/Standard/Test/0.0.0-dev/src/Faker.enso
@@ -0,0 +1,39 @@
+from Standard.Base import all
+
+polyglot java import java.util.Random
+polyglot java import org.enso.base.Text_Utils
+
+upper_case_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ".utf_8
+numbers = "0123456789".utf_8
+
+## Creates a random number generator which can be used for creating test values.
+
+   Arguments:
+   - seed: Optional seed for a deterministic sequence; 0 (default) is unseeded
+make_generator : Integer -> Random
+make_generator (seed = 0) =
+    if seed == 0 then Random.new else Random.new seed
+
+
+## Creates a random string based on a template and random number generator.
+
+   Arguments:
+   - template: Vector of UTF-8 encoded texts (e.g. `"ABC".utf_8`), one per
+     output position, each listing the possible characters for that position.
+   - generator: Random number generator
+
+   > Examples:
+     Creates a fake UK National Insurance number:
+
+            l = "ABCEGHJKLMNOPRSTWXYZ".utf_8
+            n = "0123456789".utf_8
+            s = "ABCDFMP ".utf_8
+            template = [l, l, n, n, n, n, n, s]
+            ni_number = make_string template make_generator
+make_string : Vector -> Any -> Text
+make_string template generator =
+    output = Array.new template.length
+    0.up_to template.length . each i->
+        a = template.at i
+        output.set_at i (a.at (generator.nextInt a.length))
+    Text_Utils.from_utf_8 output
diff --git a/test/Benchmarks/src/Natural_Order_Sort.enso b/test/Benchmarks/src/Natural_Order_Sort.enso
@@ -0,0 +1,28 @@
+from Standard.Base import all
+
+import Standard.Test.Bench
+
+import Standard.Test.Faker
+import Standard.Base.Data.Ordering.Natural_Order
+
+## Bench Utilities ============================================================
+
+vector_size = 10000
+iter_size = 100
+num_iterations = 10
+
+
+# The Benchmarks ==============================================================
+
+main =
+    l = Faker.upper_case_letters
+    n = Faker.numbers
+    template = [l, l, l, n, n, n, n, n, l]
+
+    ## No specific significance to this constant, just fixed to make generated set deterministic
+    fixed_random_seed = 1644575867
+    random_generator = Faker.make_generator fixed_random_seed
+
+    unsorted = 0.up_to here.vector_size . map _->(Faker.make_string template random_generator)
+
+    Bench.measure (unsorted.sort by=Natural_Order.compare) "Natural Order" here.iter_size here.num_iterations