Improved Natural Order

Data generator for benchmarking
enso-org · Feb 14, 2022 · 61b18fd · 61b18fd
1 parent 16a7ec7
commit 61b18fd
Show file tree

Hide file tree

Showing 3 changed files with 95 additions and 61 deletions.
diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Noise/Generator.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Noise/Generator.enso
@@ -40,10 +40,10 @@ type Generator
    range has an equal chance of occurring.
 type Deterministic_Random
 
-    ## A determinstic random noise generator that performs a peterbation of the
+    ## A deterministic random noise generator that performs a perturbation of the
        input
 
-       It produices what is commonly termed as "white" noise, where any value in
+       It produces what is commonly termed as "white" noise, where any value in
        the range has an equal chance of occurring.
     type Deterministic_Random
 

diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Ordering/Natural_Order.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Ordering/Natural_Order.enso
@@ -4,6 +4,9 @@ import Standard.Base.Data.Text.Regex
 import Standard.Base.Data.Text.Regex.Mode
 import Standard.Base.Data.Ordering.Vector_Lexicographic_Order
 
+polyglot java import org.enso.base.Text_Utils
+polyglot java import com.ibm.icu.text.BreakIterator
+
 ## Compares two text values according to the natural dictionary ordering.
 
    > Example
@@ -17,62 +20,55 @@ import Standard.Base.Data.Ordering.Vector_Lexicographic_Order
          ["a2", "a1", "a100", "a001", "a0001"].sort by=Natural_Order.compare . should_equal ["a0001", "a001", "a1", "a2", "a100"]
 compare : Text -> Text -> Ordering
 compare text1 text2 =
-    ## TODO [RW] A more efficient algorithm which works better for longer texts
-       should be implemented, as described in
-       https://www.pivotaltracker.com/story/show/181176589
-    nat1 = here.to_natural_key text1
-    nat2 = here.to_natural_key text2
-    ## TODO [RW] The additional assignment to a temporary variable is a
-       workaround for the following bug:
-       https://www.pivotaltracker.com/story/show/181162108
-    res = Vector_Lexicographic_Order.compare nat1 nat2
-    res
-
-## PRIVATE
-to_natural_key : Text -> Vector
-to_natural_key text = if text.is_empty then [] else
-    splitter_regex = Regex.compile "([^0-9]+|[0-9]+)"
-    parts = splitter_regex.find text mode=Mode.All
-
-    ## TODO [RW] Currently there is no `is_digit` method. Once
-       https://www.pivotaltracker.com/story/show/181176532 is implemented, this
-       regex can be replaced with a simpler and faster `is_digit` check.
-    is_integer_regex = Regex.compile "[0-9]+"
-    parts.map part-> case is_integer_regex.matches part of
-        True ->
-            ## TODO [RW] Currently there is no `Integer.parse` method, so we
-               parse a decimal and convert it to an integer. Once
-               https://www.pivotaltracker.com/story/show/181176522 is
-               implemented, this should be changed to use `Integer.parse`.
-            value = Decimal.parse part . floor
-            Natural_Ordering_Numeric_Part value part
-        False ->
-            Natural_Ordering_Alphabetical_Part part
-
-## PRIVATE
-type Natural_Ordering_Part
-    ## PRIVATE
-    type Natural_Ordering_Alphabetical_Part (text : Text)
-
-    ## PRIVATE
-    type Natural_Ordering_Numeric_Part (value : Integer) (original_text : Text)
-
-    ## PRIVATE
-    compare_to : Natural_Ordering_Part -> Ordering
-    compare_to that = case Pair this that of
-        Pair (Natural_Ordering_Alphabetical_Part text1) (Natural_Ordering_Alphabetical_Part text2) ->
-            text1 . compare_to text2
-        Pair (Natural_Ordering_Numeric_Part value1 original1) (Natural_Ordering_Numeric_Part value2 original2) ->
-            value_ordering = value1.compare_to value2
-            case value_ordering of
-                Ordering.Equal -> original1 . compare_to original2
-                _ -> value_ordering
-        Pair (Natural_Ordering_Numeric_Part _ _) (Natural_Ordering_Alphabetical_Part _) ->
-            Ordering.Less
-        Pair (Natural_Ordering_Alphabetical_Part _) (Natural_Ordering_Numeric_Part _ _) ->
-            Ordering.Greater
-
-    to_text : Text
-    to_text = case this of
-        Natural_Ordering_Alphabetical_Part text -> text.to_text
-        Natural_Ordering_Numeric_Part v o -> [v, o].to_text
+    ## Each text gets its own ICU character-boundary iterator. The offsets the
+       iterators yield are used below to cut the texts into pieces that are
+       compared one at a time.
+    iter1 = BreakIterator.getCharacterInstance
+    iter1.setText text1
+
+    iter2 = BreakIterator.getCharacterInstance
+    iter2.setText text2
+
+    ## Checks whether the character code `c` lies in 48..57, the codes of the
+       ASCII digits '0' to '9'.
+    is_digit=c -> c>=48 && c<=57
+
+    ## Reads the whole run of digits whose first piece spans `prev`..`next`.
+       Arguments:
+       - text: the text being scanned.
+       - prev: start offset of the first piece of the run.
+       - next: end offset of the first piece of the run.
+       - iter: the iterator over `text`. It is advanced as a side effect and
+         the callers below leave it positioned at `next` beforehand.
+       Returns the vector `[s, d, n, i]`: `s` is the digit run as text, `d`
+       its numeric value, `n` the offset at which the run ends (negated when
+       the end of the text was reached) and `i` the iterator position after
+       the scan (-1 when the end of the text was reached).
+    get_number text prev next iter =
+        ## Advances `iter` piece by piece until a piece that does not start
+           with a digit is found and returns the offset at which that piece
+           starts. When the iterator returns -1 (presumably
+           `BreakIterator.DONE`, i.e. no boundary is left) the negated current
+           offset is returned instead to mark the end of the text.
+        find_number text next iter =
+            new_next = iter.next
+            if (new_next == -1) then -next else
+                substring = Text_Utils.substring text next new_next
+                c = Text_Utils.get_chars substring . at 0
+                if (is_digit c).not then next else
+                    @Tail_Call find_number text new_next iter
+
+        n = find_number text next iter
+        s = Text_Utils.substring text prev n.abs
+        ## The digit run is parsed as a Decimal and floored to get its value.
+        d = Decimal.parse s . floor
+        ## After a scan that stopped at a non-digit piece the iterator already
+           sits at the end of that piece, which becomes the next `next`.
+        i = if n < 0 then -1 else iter.current
+        [s, d, n, i]
+
+
+    ## Compares the piece of `text1` between offsets `prev1` and `next1` with
+       the piece of `text2` between `prev2` and `next2`, and moves on to the
+       following pieces for as long as the current ones are equal. A `next`
+       of -1 means that the text has no pieces left.
+       Digit runs are ordered by numeric value first and by their text second;
+       non-digit pieces are ordered with `compare_to`.
+       Ordering: Nothing < Number < Text
+    order prev1 next1 prev2 next2 =
+        case (next1 == -1) of
+            True ->
+                if (next2 == -1) then Ordering.Equal else Ordering.Less
+            False ->
+                if (next2 == -1) then Ordering.Greater else
+                    ## Current piece of each text and the first character
+                       code of that piece.
+                    s1 = Text_Utils.substring text1 prev1 next1
+                    c1 = Text_Utils.get_chars s1 . at 0
+
+                    s2 = Text_Utils.substring text2 prev2 next2
+                    c2 = Text_Utils.get_chars s2 . at 0
+
+                    case (is_digit c1) of
+                        True ->
+                            if (is_digit c2).not then Ordering.Less else
+                                ## Both pieces start a digit run: read both
+                                   runs in full before comparing them.
+                                a1 = get_number text1 prev1 next1 iter1
+                                a2 = get_number text2 prev2 next2 iter2
+
+                                if (a1.at 1) != (a2.at 1) then (a1.at 1).compare_to (a2.at 1) else
+                                    if (a1.at 0) != (a2.at 0) then (a1.at 0).compare_to (a2.at 0) else
+                                        @Tail_Call order (a1.at 2) (a1.at 3) (a2.at 2) (a2.at 3)
+                        False ->
+                            if (is_digit c2) then Ordering.Greater else
+                                if s2 != s1 then s1.compare_to s2 else
+                                    @Tail_Call order next1 iter1.next next2 iter2.next
+
+    order 0 iter1.next 0 iter2.next
diff --git a/distribution/lib/Standard/Test/0.0.0-dev/src/Faker.enso b/distribution/lib/Standard/Test/0.0.0-dev/src/Faker.enso
@@ -0,0 +1,38 @@
+from Standard.Base import all
+
+polyglot java import java.util.Random
+
+## UTF-8 bytes of the upper-case ASCII letters, usable as a template entry
+   for `make_string`.
+upper_case_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ".utf_8
+## UTF-8 bytes of the ASCII digits, usable as a template entry for
+   `make_string`.
+numbers = "0123456789".utf_8
+
+## Builds a `java.util.Random` that can be used for producing test values.
+
+   Arguments:
+   - seed: Seed that makes the generated sequence reproducible. With the
+     default value of 0 the generator is created without an explicit seed.
+make_generator : Integer -> Random
+make_generator (seed = 0) =
+    if seed != 0 then Random.new seed else Random.new
+
+
+## Creates a random string based on a template and random number generator.
+
+   Arguments:
+   - template: Vector with one entry per character of the result. Each entry
+     holds the UTF-8 bytes of the characters allowed at that position. One
+     byte is picked per entry, so the entries should be built from
+     single-byte (ASCII) characters.
+   - generator: Random number generator. Its `nextInt` is called once per
+     template entry, in template order.
+
+   > Example
+     Creates a fake UK National Insurance number:
+
+         l = "ABCEGHJKLMNOPRSTWXYZ".utf_8
+         n = "0123456789".utf_8
+         s = "ABCDFMP ".utf_8
+         template = [l, l, n, n, n, n, n, s]
+         ni_number = make_string template make_generator
+make_string : Vector -> Any -> Text
+make_string template generator =
+    ## One randomly chosen byte per template position. The bytes are decoded
+       with `Text.from_utf_8` because `Text_Utils` is not imported in this
+       file.
+    bytes = template.map choices-> choices.at (generator.nextInt choices.length)
+    Text.from_utf_8 bytes