diff --git a/CHANGELOG.md b/CHANGELOG.md index 6cf57f6534e5c..af14004faf330 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -131,6 +131,7 @@ and made it the default.][3472] - [Implemented a `Table.from Text` conversion allowing to parse strings representing `Delimited` files without storing them on the filesystem.][3478] +- [Added rank data, correlation and covariance statistics for `Vector`][3484] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -204,6 +205,7 @@ [3472]: https://github.com/enso-org/enso/pull/3472 [3486]: https://github.com/enso-org/enso/pull/3486 [3478]: https://github.com/enso-org/enso/pull/3478 +[3484]: https://github.com/enso-org/enso/pull/3484 #### Enso Compiler diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso index fa06bb821a6c4..0c28a624a19db 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso @@ -1,16 +1,25 @@ from Standard.Base import Boolean, True, False, Nothing, Vector, Number, Any, Error, Array, Panic, Illegal_Argument_Error, Unsupported_Argument_Types + from Standard.Base.Data.Vector import Empty_Error import Standard.Base.Data.Ordering.Comparator +import Standard.Base.Data.Statistics.Rank_Method + polyglot java import org.enso.base.statistics.Moments polyglot java import org.enso.base.statistics.CountMinMax +polyglot java import org.enso.base.statistics.CorrelationStatistics +polyglot java import org.enso.base.statistics.Rank + +polyglot java import java.lang.IllegalArgumentException +polyglot java import java.lang.ClassCastException +polyglot java import java.lang.NullPointerException type Statistic ## PRIVATE Convert the Enso Statistic into Java equivalent. 
- to_java : SingleValue - to_java = case this of + to_moment_statistic : SingleValue + to_moment_statistic = case this of Sum -> Moments.SUM Mean -> Moments.MEAN Variance p -> if p then Moments.VARIANCE_POPULATION else Moments.VARIANCE @@ -52,6 +61,32 @@ type Statistic ## The sample kurtosis of the values. type Kurtosis + ## Calculate the Covariance between data and series. + + Arguments: + - series: the series to compute the covariance with. + type Covariance (series:Vector) + + ## Calculate the Pearson Correlation between data and series. + + Arguments: + - series: the series to compute the correlation with. + type Pearson (series:Vector) + + ## Calculate the Spearman Rank Correlation between data and series. + + Arguments: + - series: the series to compute the correlation with. + type Spearman (series:Vector) + + ## Calculate the coefficient of determination between data and predicted + series. + + Arguments: + - predicted: the series to compute the r_squared with. + type R_Squared (predicted:Vector) + + ## Compute a single statistic on a vector like object. Arguments: @@ -69,11 +104,11 @@ compute data statistic=Count = - statistics: Set of statistics to calculate. compute_bulk : Vector -> [Statistic] -> [Any] compute_bulk data statistics=[Count, Sum] = - count_min_max = statistics.any s->((s.is_a Count) || (s.is_a Minimum) || (s.is_a Maximum)) - java_stats = statistics.map .to_java + java_stats = statistics.map .to_moment_statistic skip_java_stats = java_stats.all s->s.is_nothing + report_invalid _ = statistics.map_with_index i->v-> if java_stats.at i . is_nothing then Nothing else @@ -97,8 +132,88 @@ compute_bulk data statistics=[Count, Sum] = Maximum -> if count_min_max_values.comparatorError then (Error.throw Vector.Incomparable_Values_Error) else count_min_max_values.maximum + Covariance s -> here.calculate_correlation_statistics data s . covariance + Pearson s -> here.calculate_correlation_statistics data s . 
pearsonCorrelation + Spearman s -> here.calculate_spearman_rank data s + R_Squared s -> here.calculate_correlation_statistics data s . rSquared _ -> stats_array.at i + +## Calculate a variance-covariance matrix between the input series. + + Arguments: + - data: The input data sets +covariance_matrix : [Vector] -> [Vector] +covariance_matrix data = + stats_vectors = here.calculate_correlation_statistics_matrix data + stats_vectors.map v->(v.map .covariance) + + +## Calculate a Pearson correlation matrix between the input series. + + Arguments: + - data: The input data sets +pearson_correlation : [Vector] -> [Vector] +pearson_correlation data = + stats_vectors = here.calculate_correlation_statistics_matrix data + stats_vectors.map v->(v.map .pearsonCorrelation) + + +## Calculate a Spearman Rank correlation matrix between the input series. + + Arguments: + - data: The input data sets +spearman_correlation : [Vector] -> [Vector] +spearman_correlation data = + Panic.handle_wrapped_dataflow_error <| + output = Vector.new_builder data.length + + 0.up_to data.length . each i-> + output.append <| + Vector.new data.length j-> + if j == i then 1 else + if j < i then (output.at j . 
at i) else + Panic.throw_wrapped_if_error <| + here.calculate_spearman_rank (data.at i) (data.at j) + + output.to_vector + + +## PRIVATE +wrap_java_call : Any -> Any +wrap_java_call ~function = + report_unsupported _ = Error.throw (Illegal_Argument_Error ("Can only compute correlations on numerical data sets.")) + handle_unsupported = Panic.catch Unsupported_Argument_Types handler=report_unsupported + + report_illegal caught_panic = Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage) + handle_illegal = Panic.catch IllegalArgumentException handler=report_illegal + + handle_unsupported <| handle_illegal <| function + + +## PRIVATE + Given two series, get a computed CorrelationStatistics object +calculate_correlation_statistics : Vector -> Vector -> CorrelationStatistics +calculate_correlation_statistics x_data y_data = + here.wrap_java_call <| CorrelationStatistics.compute x_data.to_array y_data.to_array + + +## PRIVATE + Given two series, compute the Spearman Rank correlation +calculate_spearman_rank : Vector -> Vector -> Decimal +calculate_spearman_rank x_data y_data = + here.wrap_java_call <| CorrelationStatistics.spearmanRankCorrelation x_data.to_array y_data.to_array + + +## PRIVATE + Given a set of series get CorrelationStatistics objects +calculate_correlation_statistics_matrix : [Vector] -> [CorrelationStatistics] +calculate_correlation_statistics_matrix data = + data_array = Vector.new data.length i->(data.at i).to_array . to_array + stats_array = here.wrap_java_call <| CorrelationStatistics.computeMatrix data_array + Vector.new stats_array.length i->(Vector.Vector (stats_array.at i)) + + ## Compute a single statistic on the vector.
Arguments: @@ -115,3 +230,26 @@ Vector.Vector.compute statistic=Count = Vector.Vector.compute_bulk : [Statistic] -> [Any] Vector.Vector.compute_bulk statistics=[Count, Sum] = here.compute_bulk this statistics + + +## Assigns a rank to each value of data, dealing with equal values according to the method. + + Arguments: + - input: Input data to rank. + - method: Method used to deal with equal values. +rank_data : Vector -> Rank_Method -> Vector +rank_data input method=Rank_Method.Average = + java_method = case method of + Rank_Method.Minimum -> Rank.Method.MINIMUM + Rank_Method.Maximum -> Rank.Method.MAXIMUM + Rank_Method.Average -> Rank.Method.AVERAGE + Rank_Method.Ordinal -> Rank.Method.ORDINAL + Rank_Method.Dense -> Rank.Method.DENSE + + report_nullpointer caught_panic = Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage) + handle_nullpointer = Panic.catch NullPointerException handler=report_nullpointer + handle_classcast = Panic.catch ClassCastException handler=(Error.throw Vector.Incomparable_Values_Error) + + handle_classcast <| handle_nullpointer <| + java_ranks = Rank.rank input.to_array Comparator.new java_method + Vector.Vector java_ranks diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics/Rank_Method.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics/Rank_Method.enso new file mode 100644 index 0000000000000..1718667bdf790 --- /dev/null +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics/Rank_Method.enso @@ -0,0 +1,18 @@ + +## Specifies how to handle ranking of equal values. +type Rank_Method + ## Use the mean of all ranks for equal values. + type Average + + ## Use the lowest of all ranks for equal values. + type Minimum + + ## Use the highest of all ranks for equal values. + type Maximum + + ## Use same rank value for equal values and next group is the immediate + following ranking number. + type Dense + + ## Equal values are assigned the next rank in order that they occur.
+ type Ordinal diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso index b4808b58e152a..9821106db9914 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso @@ -55,19 +55,22 @@ fill length ~item = A vector allows to store an arbitrary number of elements in linear memory. It is the recommended data structure for most applications. + Arguments: + - capacity: Initial capacity of the Vector.Builder + > Example Construct a vector using a builder that contains the items 1 to 10. example_new_builder = - builder = Vector.new_builder + builder = Vector.new_builder 10 do_build start stop = builder.append start if start >= stop then Nothing else @Tail_Call do_build start+1 stop do_build 1 10 builder.to_vector -new_builder : Builder -new_builder = Builder.new +new_builder : Integer -> Builder +new_builder (capacity=1) = Builder.new capacity ## ADVANCED @@ -141,13 +144,7 @@ type Vector at : Integer -> Any ! Index_Out_Of_Bounds_Error at index = actual_index = if index < 0 then this.length + index else index - ## TODO [RW] Ideally we do not want an additional check here, but we - should catch a Invalid_Array_Index_Error panic. However, such a catch - should still properly forward any other panics or dataflow errors - which is not fully possible until the approach to handling Panics is - improved, as described in the following Pivotal ticket: - https://www.pivotaltracker.com/n/projects/2539304/stories/181029230 - if actual_index>=0 && actual_index<this.length then this.to_array.at actual_index else + Panic.catch Invalid_Array_Index_Error (this.to_array.at actual_index) _-> Error.throw (Index_Out_Of_Bounds_Error index this.length) ## ADVANCED @@ -1015,12 +1012,15 @@ type Builder ## Creates a new builder.
+ Arguments: + - capacity: Initial capacity of the Vector.Builder + > Example Make a new builder Vector.new_builder - new : Builder - new = Builder (Array.new 1) 0 + new : Integer->Builder + new (capacity=1) = Builder (Array.new capacity) 0 ## Returns the current capacity (i.e. the size of the underlying storage) of this builder. @@ -1088,6 +1088,18 @@ type Builder this.append item Nothing + ## Gets an element from the vector at a specified index (0-based). + + Arguments: + - index: The location in the vector to get the element from. The index is + also allowed be negative, then the elements are indexed from the back + of the vector, i.e. -1 will correspond to the last element. + at : Integer -> Any ! Index_Out_Of_Bounds_Error + at index = + actual_index = if index < 0 then this.length + index else index + Panic.catch Invalid_Array_Index_Error (this.to_array.at actual_index) _-> + Error.throw (Index_Out_Of_Bounds_Error index this.length) + ## Checks whether a predicate holds for at least one element of this builder. Arguments: diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Error/Common.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Error/Common.enso index b15632960019e..1ba22dd37c887 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Error/Common.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Error/Common.enso @@ -386,6 +386,23 @@ type Panic True -> caught_panic.convert_to_dataflow_error False -> Panic.throw caught_panic + ## If a dataflow error had occurred, wrap it in a `Wrapped_Dataflow_Error` and promote to a Panic. + + Arguments: + - value: value to return if not an error, or rethrow as a Panic. + throw_wrapped_if_error : Any -> Any + throw_wrapped_if_error ~value = + if value.is_error then Panic.throw (Wrapped_Dataflow_Error value.catch) else value + + ## Catch any `Wrapped_Dataflow_Error` Panic and rethrow it as a dataflow error. + + Arguments: + - action: The code to execute that potentially raised a Wrapped_Dataflow_Error. 
+ handle_wrapped_dataflow_error : Any -> Any + handle_wrapped_dataflow_error ~action = + Panic.catch Wrapped_Dataflow_Error action caught_panic-> + Error.throw caught_panic.payload.payload + ## The runtime representation of a syntax error. Arguments: diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Vector_Builder.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Vector_Builder.enso index 96a19ed355cc2..d520d4ea974c6 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Vector_Builder.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Vector_Builder.enso @@ -50,8 +50,7 @@ type Vector_Builder array = Array.new this.length go ix elem = case elem of Leaf vec -> - vec.map_with_index vi-> elem-> - array.set_at ix+vi elem + Array.copy vec.to_array 0 array ix vec.length ix + vec.length Append l r _ -> ix2 = go ix l diff --git a/std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStatistics.java b/std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStatistics.java new file mode 100644 index 0000000000000..04fa6a731146f --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStatistics.java @@ -0,0 +1,96 @@ +package org.enso.base.statistics; + +/** Class to compute covariance and correlations between series. 
*/ +public class CorrelationStatistics { + private long count = 0; + private double totalX = 0.0; + private double totalXX = 0.0; + private double totalY = 0.0; + private double totalYY = 0.0; + private double totalXY = 0.0; + + private void append(Double x, Double y) { + if (x == null || x.isNaN() || y == null || y.isNaN()) { + return; + } + + count++; + totalX += x; + totalXX += x * x; + totalY += y; + totalYY += y * y; + totalXY += x * y; + } + + public double covariance() { + if (count < 2) { + return Double.NaN; + } + + return (totalXY - totalX * totalY / count) / count; + } + + public double pearsonCorrelation() { + if (count < 2) { + return Double.NaN; + } + + double n_stdev_x = Math.sqrt(count * totalXX - totalX * totalX); + double n_stdev_y = Math.sqrt(count * totalYY - totalY * totalY); + return (count * totalXY - totalX * totalY) / (n_stdev_x * n_stdev_y); + } + + public double rSquared() { + double correl = this.pearsonCorrelation(); + return correl * correl; + } + + /** + * Create the CorrelationStats between two series + * + * @param x Array of X values + * @param y Array of Y values + * @return CorrelationStats object for the 2 series. 
+ */ + public static CorrelationStatistics compute(Double[] x, Double[] y) { + if (x.length != y.length) { + throw new IllegalArgumentException("Left and right lengths are not the same."); + } + + CorrelationStatistics output = new CorrelationStatistics(); + for (int i = 0; i < x.length; i++) { + output.append(x[i], y[i]); + } + return output; + } + + public static CorrelationStatistics[][] computeMatrix(Double[][] data) { + int len = data[0].length; + + CorrelationStatistics[][] output = new CorrelationStatistics[data.length][]; + for (int i = 0; i < data.length; i++) { + if (data[i].length != len) { + throw new IllegalArgumentException("Data lengths are not consistent."); + } + output[i] = new CorrelationStatistics[data.length]; + for (int j = 0; j < data.length; j++) { + if (j < i) { + output[i][j] = output[j][i]; + } else { + output[i][j] = compute(data[i], data[j]); + } + } + } + return output; + } + + public static double spearmanRankCorrelation(Double[] x, Double[] y) { + double[][] pairedRanks = Rank.pairedRanks(x, y, Rank.Method.AVERAGE); + + CorrelationStatistics computation = new CorrelationStatistics(); + for (int i = 0; i < pairedRanks[0].length; i++) { + computation.append(pairedRanks[0][i], pairedRanks[1][i]); + } + return computation.pearsonCorrelation(); + } +} diff --git a/std-bits/base/src/main/java/org/enso/base/statistics/Rank.java b/std-bits/base/src/main/java/org/enso/base/statistics/Rank.java new file mode 100644 index 0000000000000..bf0f5ef5de3d3 --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/statistics/Rank.java @@ -0,0 +1,95 @@ +package org.enso.base.statistics; + +import java.util.*; + +public class Rank { + private static final Comparator<Object> DOUBLE_COMPARATOR = (a, b) -> Double.compare((Double)a, (Double)b); + + public enum Method { + AVERAGE, + MINIMUM, + MAXIMUM, + DENSE, + ORDINAL + } + + private record ValueWithIndex(Object value, int index) { + } + + public static double[] rank(Object[] input, Comparator<Object> comparator,
Method method) + throws NullPointerException, ClassCastException + { + List<ValueWithIndex> tuples = new ArrayList<>(input.length); + for(int i = 0; i < input.length; i++) { + if (input[i] == null) { + throw new NullPointerException("Value is Nothing at index " + i); + } + tuples.add(new ValueWithIndex(input[i], i)); + } + + return computeRankFromTuples(tuples, comparator, method); + } + + public static double[][] pairedRanks(Double[] x, Double[] y, Method method) + throws IllegalArgumentException, NullPointerException, ClassCastException + { + if (x.length != y.length) { + throw new IllegalArgumentException("Left and right lengths are not the same."); + } + + List<ValueWithIndex> x_tuples = new ArrayList<>(x.length); + List<ValueWithIndex> y_tuples = new ArrayList<>(y.length); + for (int i = 0; i < x.length; i++) { + if (x[i] == null || Double.isNaN(x[i]) || y[i] == null || Double.isNaN(y[i])) { + continue; + } + + x_tuples.add(new ValueWithIndex(x[i], x_tuples.size())); + y_tuples.add(new ValueWithIndex(y[i], y_tuples.size())); + } + + return new double[][] { + computeRankFromTuples(x_tuples, DOUBLE_COMPARATOR, method), + computeRankFromTuples(y_tuples, DOUBLE_COMPARATOR, method) + }; + } + + private static double[] computeRankFromTuples(List<ValueWithIndex> tuples, Comparator<Object> comparator, Method method) + throws NullPointerException, ClassCastException + { + Comparator<ValueWithIndex> tupleComparator = (a, b) -> { + int c = comparator.compare(a.value, b.value); + return c == 0 ?
Integer.compare(a.index, b.index) : -c; + }; + tuples.sort(tupleComparator); + + double[] output = new double[tuples.size()]; + + int index = 0; + int dense = 0; + while (index < tuples.size()) { + dense++; + int start = index; + + // Find End of Equal Values + while (index < tuples.size() && comparator.compare(tuples.get(start).value, tuples.get(index).value) == 0) { + index++; + } + + // Build Rank + for (int i = start; i < index; i++) { + double rank = switch (method) { + case MINIMUM -> start + 1; + case MAXIMUM -> index; + case DENSE -> dense; + case AVERAGE -> (start + 1 + index) / 2.0; + case ORDINAL -> i + 1; + }; + + output[tuples.get(i).index] = rank; + } + } + + return output; + } +} diff --git a/test/Tests/src/Data/Statistics_Spec.enso b/test/Tests/src/Data/Statistics_Spec.enso index 2332934d001d3..bf08ff7c740b1 100644 --- a/test/Tests/src/Data/Statistics_Spec.enso +++ b/test/Tests/src/Data/Statistics_Spec.enso @@ -1,6 +1,7 @@ -from Standard.Base import Nothing, Vector, Number, True, Illegal_Argument_Error, False +from Standard.Base import Nothing, Vector, Number, Decimal, True, Illegal_Argument_Error, False import Standard.Base.Data.Statistics +import Standard.Base.Data.Statistics.Rank_Method from Standard.Base.Data.Statistics import all import Standard.Test @@ -17,18 +18,21 @@ type No_Ord number # Tests spec = - simple_set = [1, 2, 3, 4, 5] - number_set = [0.4, -18.56, -16.99, -16.43, -45.84, 13.44, -6.85, 9.68, -8.55, 10.87, 10.38, 33.85, -41.02, 1.87, -26.52, -13.87, -39.06, 25.92, -16.01, 42.01] - missing_set = number_set.map_with_index i->v->(if i % 5 == 4 then Nothing else v) - with_nans_set = number_set.map_with_index i->v->(if i % 5 == 4 then (if i % 10 == 9 then Number.nan else Nothing) else v) - text_set = ["A", "B", Nothing, "D"] - - ord_set = [Ord 10, Ord 2, Nothing, Ord 9] - no_ord_set = [No_Ord 10, No_Ord 2, Nothing, No_Ord 9] - double_error = 0.000001 + vector_compare values expected = + values.each_with_index i->v-> + case v of + 
Decimal -> v.should_equal (expected.at i) epsilon=double_error + _ -> v.should_equal (expected.at i) + Test.group "Statistics" <| + simple_set = [1, 2, 3, 4, 5] + number_set = [0.4, -18.56, -16.99, -16.43, -45.84, 13.44, -6.85, 9.68, -8.55, 10.87, 10.38, 33.85, -41.02, 1.87, -26.52, -13.87, -39.06, 25.92, -16.01, 42.01] + missing_set = number_set.map_with_index i->v->(if i % 5 == 4 then Nothing else v) + with_nans_set = number_set.map_with_index i->v->(if i % 5 == 4 then (if i % 10 == 9 then Number.nan else Nothing) else v) + text_set = ["A", "B", Nothing, "D"] + Test.specify "should be able to count valid values" <| simple_set.compute . should_equal 5 number_set.compute . should_equal 20 @@ -111,8 +115,9 @@ spec = stats = [Count, Minimum, Mean, Variance, Skew] expected = [20, -45.84, -5.064, 582.0137832, 0.165086552] values = number_set.compute_bulk stats - values.map_with_index i->v->((expected.at i - v).abs < double_error) . any v->(v == True) . should_equal True + vector_compare values expected + Test.group "Statistics - empty Vector " <| Test.specify "should be able to count and sum on empty Vector" <| [].compute . should_equal 0 [].compute Sum . should_equal 0 @@ -127,6 +132,11 @@ spec = [].compute Skew . is_nan . should_equal True [].compute Kurtosis . is_nan . should_equal True + Test.group "Statistics - invalid input" <| + text_set = ["A", "B", Nothing, "D"] + ord_set = [Ord 10, Ord 2, Nothing, Ord 9] + no_ord_set = [No_Ord 10, No_Ord 2, Nothing, No_Ord 9] + Test.specify "should fail with Illegal_Argument_Error on number based statistics for text Vector" <| text_set.compute Sum . should_fail_with Illegal_Argument_Error text_set.compute Mean . should_fail_with Illegal_Argument_Error @@ -147,4 +157,95 @@ spec = Test.specify "should fail with Incomparable_Values_Error on mixed Vectors" <| [1, False].compute Minimum . 
should_fail_with Vector.Incomparable_Values_Error + Test.group "Rank Data" <| + Test.specify "can rank a Decimal data series" <| + values = [409.892906, 0.839952, 796.468572, 126.931298, -405.265005, -476.675817, 441.651325, 796.468572, 78.50094, 340.163324, 234.861926, 409.892906, 226.467105, 234.861926, 126.931298, 637.870512, -71.008044, -386.399663, -126.534337, -476.675817, 78.50094, -386.399663, 409.892906, 868.54485, 669.113037, 669.113037, 0.839952, 407.162613, -476.675817, 126.931298] + Statistics.rank_data values . should_equal [9, 21.5, 2.5, 17, 27, 29, 7, 2.5, 19.5, 12, 13.5, 9, 15, 13.5, 17, 6, 23, 25.5, 24, 29, 19.5, 25.5, 9, 1, 4.5, 4.5, 21.5, 11, 29, 17] + Statistics.rank_data values Rank_Method.Minimum . should_equal [8, 21, 2, 16, 27, 28, 7, 2, 19, 12, 13, 8, 15, 13, 16, 6, 23, 25, 24, 28, 19, 25, 8, 1, 4, 4, 21, 11, 28, 16] + Statistics.rank_data values Rank_Method.Maximum . should_equal [10, 22, 3, 18, 27, 30, 7, 3, 20, 12, 14, 10, 15, 14, 18, 6, 23, 26, 24, 30, 20, 26, 10, 1, 5, 5, 22, 11, 30, 18] + Statistics.rank_data values Rank_Method.Ordinal . should_equal [8, 21, 2, 16, 27, 28, 7, 3, 19, 12, 13, 9, 15, 14, 17, 6, 23, 25, 24, 29, 20, 26, 10, 1, 4, 5, 22, 11, 30, 18] + Statistics.rank_data values Rank_Method.Dense . should_equal [6, 13, 2, 11, 17, 18, 5, 2, 12, 8, 9, 6, 10, 9, 11, 4, 14, 16, 15, 18, 12, 16, 6, 1, 3, 3, 13, 7, 18, 11] + + Test.specify "can rank an Integer data series" <| + values = [10, 1, 124, 10] + Statistics.rank_data values . should_equal [2.5, 4, 1, 2.5] + + Test.specify "can rank a Number data series" <| + values = [10.0, 1, 12.4, 10] + Statistics.rank_data values . should_equal [2.5, 4, 1, 2.5] + + Test.specify "can rank a Text data series" <| + values = ["G", "AA", "B", "G", "D"] + Statistics.rank_data values . should_equal [1.5, 5, 4, 1.5, 3] + + Test.specify "should fail with Incomparable_Values_Error on custom type without compare_to" <| + values = [No_Ord 10, No_Ord 2, No_Ord 9] + Statistics.rank_data values . 
should_fail_with Vector.Incomparable_Values_Error + + Test.specify "should fail with Incomparable_Values_Error on mixed Vectors" <| + Statistics.rank_data [1, "A"] . should_fail_with Vector.Incomparable_Values_Error + + Test.specify "should fail with Illegal_Argument_Error on Vectors with Nothing" <| + Statistics.rank_data [1, Nothing, 4] . should_fail_with Illegal_Argument_Error + + Test.group "Correlation Statistics" <| + series_a = [0.22345,0.258315,0.74663,Nothing,0.686843,0.692246,Nothing,0.401859,0.725442,Nothing,0.963527,0.520363,0.633053,0.397123,Nothing,0.458942,0.036499,0.368194,0.598939,0.296476,0.093746,0.609329] + series_b = [0.140743,Nothing,0.574639,0.251683,0.902023,0.08723,0.251813,0.1669,0.234405,Nothing,0.28774,0.471757,0.280681,0.925207,0.919041,0.626234,0.429497,0.358597,0.566118,0.333606,0.828172,0.887829] + series_c = [Nothing,0.769797,0.281678,0.462145,0.727132,0.327978,Nothing,0.648639,0.562636,Nothing,0.159836,0.367404,0.877087,0.365483,Nothing,0.931873,0.723546,0.558085,0.163396,0.940997,0.399685,0.617509] + series = [series_a, series_b, series_c] + + Test.specify "can compute Covariance, Correlation and R Squared between a pair of series" + series_a.compute (Covariance series_b) . should_equal -0.0053554 epsilon=double_error + series_a.compute (Pearson series_b) . should_equal -0.08263943 epsilon=double_error + series_a.compute (Spearman series_b) . should_equal -0.09313725 epsilon=double_error + series_a.compute (R_Squared series_b) . should_equal 0.006829275 epsilon=double_error + + Test.specify "can calculate a covariance matrix" <| + matrix = Statistics.covariance_matrix series + matrix.length . 
should_equal 3 + vector_compare (matrix.at 0) [0.0571699, -0.0053554, -0.02378204] + vector_compare (matrix.at 1) [-0.0053554, 0.07707381, -0.00098274] + vector_compare (matrix.at 2) [-0.02378204, -0.00098274, 0.05837098] + + Test.specify "can calculate a pearson correlation matrix" <| + matrix = Statistics.pearson_correlation series + matrix.length . should_equal 3 + vector_compare (matrix.at 0) [1, -0.08263943, -0.40469045] + vector_compare (matrix.at 1) [-0.08263943, 1, -0.01537537] + vector_compare (matrix.at 2) [-0.40469045, -0.01537537, 1] + + Test.specify "can calculate a spearman rank correlation matrix" <| + matrix = Statistics.spearman_correlation series + matrix.length . should_equal 3 + vector_compare (matrix.at 0) [1, -0.09313725, -0.43382353] + vector_compare (matrix.at 1) [-0.09313725, 1, 0] + vector_compare (matrix.at 2) [-0.43382353, 0, 1] + + Test.specify "should fail with Illegal_Argument_Error if different lengths" <| + data = [[1,2,3,4],[10,20,30]] + data.first.compute (Covariance data.second) . should_fail_with Illegal_Argument_Error + data.first.compute (Pearson data.second) . should_fail_with Illegal_Argument_Error + data.first.compute (Spearman data.second) . should_fail_with Illegal_Argument_Error + data.first.compute (R_Squared data.second) . should_fail_with Illegal_Argument_Error + Statistics.covariance_matrix data . should_fail_with Illegal_Argument_Error + Statistics.pearson_correlation data . should_fail_with Illegal_Argument_Error + Statistics.spearman_correlation data . should_fail_with Illegal_Argument_Error + + Test.specify "should fail with Illegal_Argument_Error if not number based" <| + text = [["A","BC","CD"], ["0", "1", "2"], ["H", "I", "J"]] + text.first.compute (Covariance text.second) . should_fail_with Illegal_Argument_Error + text.first.compute (Pearson text.second) . should_fail_with Illegal_Argument_Error + text.first.compute (Spearman text.second) . 
should_fail_with Illegal_Argument_Error + text.first.compute (R_Squared text.second) . should_fail_with Illegal_Argument_Error + Statistics.covariance_matrix text . should_fail_with Illegal_Argument_Error + Statistics.pearson_correlation text . should_fail_with Illegal_Argument_Error + Statistics.spearman_correlation text . should_fail_with Illegal_Argument_Error + + Test.group "Statistics - invalid input" <| + Test.specify "should fail with Illegal_Argument_Error on number based statistics for text Vector" <| + series = [["A", "B", Nothing, "D"], ["A", "B", Nothing, "D"]] + Statistics.covariance_matrix series . should_fail_with Illegal_Argument_Error + Statistics.pearson_correlation series . should_fail_with Illegal_Argument_Error + + main = Test.Suite.run_main here.spec diff --git a/test/Tests/src/Main.enso b/test/Tests/src/Main.enso index b87dc37874c55..d6aa7e6d5c8fe 100644 --- a/test/Tests/src/Main.enso +++ b/test/Tests/src/Main.enso @@ -34,6 +34,7 @@ import project.Data.Ref_Spec import project.Data.Text_Spec import project.Data.Time.Spec as Time_Spec import project.Data.Vector_Spec +import project.Data.Statistics_Spec import project.Data.Text.Regex_Spec import project.Data.Text.Utils_Spec import project.Data.Text.Default_Regex_Engine_Spec @@ -104,4 +105,5 @@ main = Test.Suite.run_main <| Time_Spec.spec Uri_Spec.spec Vector_Spec.spec + Statistics_Spec.spec Warnings_Spec.spec