From c9dc5b40b1204263a5061fee9c8c381ec31c2436 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Tue, 17 May 2022 14:38:58 +0100 Subject: [PATCH 01/13] Covariance and Correlation matrices --- .../Base/0.0.0-dev/src/Data/Statistics.enso | 68 +++++++++++++- .../base/statistics/CorrelationStats.java | 90 +++++++++++++++++++ test/Tests/src/Data/Statistics_Spec.enso | 58 +++++++++--- 3 files changed, 201 insertions(+), 15 deletions(-) create mode 100644 std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStats.java diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso index fa06bb821a6c..def5dadaad4b 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso @@ -5,12 +5,14 @@ import Standard.Base.Data.Ordering.Comparator polyglot java import org.enso.base.statistics.Moments polyglot java import org.enso.base.statistics.CountMinMax +polyglot java import org.enso.base.statistics.CorrelationStats +polyglot java import java.lang.IllegalArgumentException type Statistic ## PRIVATE Convert the Enso Statistic into Java equivalent. - to_java : SingleValue - to_java = case this of + to_moment_statistic : SingleValue + to_moment_statistic = case this of Sum -> Moments.SUM Mean -> Moments.MEAN Variance p -> if p then Moments.VARIANCE_POPULATION else Moments.VARIANCE @@ -52,6 +54,26 @@ type Statistic ## The sample kurtosis of the values. type Kurtosis + ## Calculate the Covariance between data and series. + + Arguments: + - series: the series to compute the covariance with. + type Covariance (series:Vector) + + ## Calculate the Pearson Correlation between data and series. + + Arguments: + - series: the series to compute the correlation with. + type Pearson (series:Vector) + + ## Calculate the coefficient of determination between data and predicted + series. 
+ + Arguments: + - predicted: the series to compute the r_squared with. + type R_Squared (predicted:Vector) + + ## Compute a single statistic on a vector like object. Arguments: @@ -69,11 +91,11 @@ compute data statistic=Count = - statistics: Set of statistics to calculate. compute_bulk : Vector -> [Statistic] -> [Any] compute_bulk data statistics=[Count, Sum] = - count_min_max = statistics.any s->((s.is_a Count) || (s.is_a Minimum) || (s.is_a Maximum)) - java_stats = statistics.map .to_java + java_stats = statistics.map .to_moment_statistic skip_java_stats = java_stats.all s->s.is_nothing + report_invalid _ = statistics.map_with_index i->v-> if java_stats.at i . is_nothing then Nothing else @@ -99,6 +121,44 @@ compute_bulk data statistics=[Count, Sum] = count_min_max_values.maximum _ -> stats_array.at i + +## Calculate a variance-covariance matrix between the input series. + + Arguments: + - data: The input data sets +covariance_matrix : [Vector] -> [Vector] +covariance_matrix data = + stats_vectors = here.correlation_stats data + stats_vectors.map v->(v.map .covariance) + + +## Calculate a Pearson correlation matrix between the input series. + + Arguments: + - data: The input data sets +pearson_correlation : [Vector] -> [Vector] +pearson_correlation data = + stats_vectors = here.correlation_stats data + stats_vectors.map v->(v.map .pearsonCorrelation) + + +## PRIVATE + Given a set of series get CorrelationStats objects +correlation_stats : [Vector] -> [CorrelationStats] +correlation_stats data = + data_array = Array.new data.length + 0.up_to data.length . 
each i->(data_array.set_at i (data.at i).to_array) + + report_unsupported _ = Error.throw (Illegal_Argument_Error ("Can only compute correlations on numerical data sets.")) + handle_unsupported = Panic.catch Unsupported_Argument_Types handler=report_unsupported + + report_illegal caught_panic = Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage) + handle_illegal = Panic.catch IllegalArgumentException handler=report_illegal + + stats_array = handle_unsupported <| handle_illegal <| CorrelationStats.computeMatrix data_array + Vector.new stats_array.length i->(Vector.Vector (stats_array.at i)) + + ## Compute a single statistic on the vector. Arguments: diff --git a/std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStats.java b/std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStats.java new file mode 100644 index 000000000000..d9c19c92b96a --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStats.java @@ -0,0 +1,90 @@ +package org.enso.base.statistics; + +/** + * Class to compute covariance and correlations between series. 
+ */ +public class CorrelationStats { + private long count = 0; + private double totalX = 0.0; + private double totalXX = 0.0; + private double totalY = 0.0; + private double totalYY = 0.0; + private double totalXY = 0.0; + + private void append(Double x, Double y) { + if (x == null || x.isNaN() || y == null || y.isNaN()) { + return; + } + + count++; + totalX += x; + totalXX += x * x; + totalY += y; + totalYY += y * y; + totalXY += x * y; + } + + public double covariance() { + if (count < 2) { + return Double.NaN; + } + + return (totalXY - totalX * totalY / count) / count; + } + + public double pearsonCorrelation() { + if (count < 2) { + return Double.NaN; + } + + double n_stdev_x = Math.sqrt(count * totalXX - totalX * totalX); + double n_stdev_y = Math.sqrt(count * totalYY - totalY * totalY); + return (count * totalXY - totalX * totalY) / (n_stdev_x * n_stdev_y); + } + + public double rSquared() { + double correl = this.pearsonCorrelation(); + return correl * correl; + } + + /*** + * Create the CorrelationStats between two series + * @param x Array of X values + * @param y Array of Y values + * @return CorrelationStats object for the 2 series. 
+ */ + public static CorrelationStats compute(Double[] x, Double[] y) { + if (x.length != y.length) { + throw new IllegalArgumentException("Left and right lengths are not the same."); + } + + CorrelationStats output = new CorrelationStats(); + for (int i = 0; i < x.length; i++) { + output.append(x[i], y[i]); + } + return output; + } + + public static CorrelationStats[][] computeMatrix(Double[][] data) { + int len = data[0].length; + + for (int i = 1; i < data.length; i++) { + if (data[i].length != len) { + throw new IllegalArgumentException("Data lengths are not consistent."); + } + } + + CorrelationStats[][] output = new CorrelationStats[data.length][]; + for (int i = 0; i < data.length; i++) { + output[i] = new CorrelationStats[data.length]; + for (int j = 0; j < data.length; j++) { + if (j < i) { + output[i][j] = output[j][i]; + } else { + output[i][j] = compute(data[i], data[j]); + } + } + } + return output; + } +} diff --git a/test/Tests/src/Data/Statistics_Spec.enso b/test/Tests/src/Data/Statistics_Spec.enso index 2332934d001d..1393f318b448 100644 --- a/test/Tests/src/Data/Statistics_Spec.enso +++ b/test/Tests/src/Data/Statistics_Spec.enso @@ -1,4 +1,4 @@ -from Standard.Base import Nothing, Vector, Number, True, Illegal_Argument_Error, False +from Standard.Base import Nothing, Vector, Number, Decimal, True, Illegal_Argument_Error, False import Standard.Base.Data.Statistics from Standard.Base.Data.Statistics import all @@ -17,18 +17,21 @@ type No_Ord number # Tests spec = - simple_set = [1, 2, 3, 4, 5] - number_set = [0.4, -18.56, -16.99, -16.43, -45.84, 13.44, -6.85, 9.68, -8.55, 10.87, 10.38, 33.85, -41.02, 1.87, -26.52, -13.87, -39.06, 25.92, -16.01, 42.01] - missing_set = number_set.map_with_index i->v->(if i % 5 == 4 then Nothing else v) - with_nans_set = number_set.map_with_index i->v->(if i % 5 == 4 then (if i % 10 == 9 then Number.nan else Nothing) else v) - text_set = ["A", "B", Nothing, "D"] - - ord_set = [Ord 10, Ord 2, Nothing, Ord 9] - no_ord_set 
= [No_Ord 10, No_Ord 2, Nothing, No_Ord 9] - double_error = 0.000001 + vector_compare values expected = + values.each_with_index i->v-> + case v of + Decimal -> v.should_equal (expected.at i) epsilon=double_error + _ -> v.should_equal (expected.at i) + Test.group "Statistics" <| + simple_set = [1, 2, 3, 4, 5] + number_set = [0.4, -18.56, -16.99, -16.43, -45.84, 13.44, -6.85, 9.68, -8.55, 10.87, 10.38, 33.85, -41.02, 1.87, -26.52, -13.87, -39.06, 25.92, -16.01, 42.01] + missing_set = number_set.map_with_index i->v->(if i % 5 == 4 then Nothing else v) + with_nans_set = number_set.map_with_index i->v->(if i % 5 == 4 then (if i % 10 == 9 then Number.nan else Nothing) else v) + text_set = ["A", "B", Nothing, "D"] + Test.specify "should be able to count valid values" <| simple_set.compute . should_equal 5 number_set.compute . should_equal 20 @@ -111,8 +114,9 @@ spec = stats = [Count, Minimum, Mean, Variance, Skew] expected = [20, -45.84, -5.064, 582.0137832, 0.165086552] values = number_set.compute_bulk stats - values.map_with_index i->v->((expected.at i - v).abs < double_error) . any v->(v == True) . should_equal True + vector_compare values expected + Test.group "Statistics - empty Vector " <| Test.specify "should be able to count and sum on empty Vector" <| [].compute . should_equal 0 [].compute Sum . should_equal 0 @@ -127,6 +131,11 @@ spec = [].compute Skew . is_nan . should_equal True [].compute Kurtosis . is_nan . should_equal True + Test.group "Statistics - invalid input" <| + text_set = ["A", "B", Nothing, "D"] + ord_set = [Ord 10, Ord 2, Nothing, Ord 9] + no_ord_set = [No_Ord 10, No_Ord 2, Nothing, No_Ord 9] + Test.specify "should fail with Illegal_Argument_Error on number based statistics for text Vector" <| text_set.compute Sum . should_fail_with Illegal_Argument_Error text_set.compute Mean . 
should_fail_with Illegal_Argument_Error @@ -147,4 +156,31 @@ spec = Test.specify "should fail with Incomparable_Values_Error on mixed Vectors" <| [1, False].compute Minimum . should_fail_with Vector.Incomparable_Values_Error + Test.group "Correlation Statistics" <| + series_a = [0.22345,0.258315,0.74663,Nothing,0.686843,0.692246,Nothing,0.401859,0.725442,Nothing,0.963527,0.520363,0.633053,0.397123,Nothing,0.458942,0.036499,0.368194,0.598939,0.296476,0.093746,0.609329] + series_b = [0.140743,Nothing,0.574639,0.251683,0.902023,0.08723,0.251813,0.1669,0.234405,Nothing,0.28774,0.471757,0.280681,0.925207,0.919041,0.626234,0.429497,0.358597,0.566118,0.333606,0.828172,0.887829] + series_c = [Nothing,0.769797,0.281678,0.462145,0.727132,0.327978,Nothing,0.648639,0.562636,Nothing,0.159836,0.367404,0.877087,0.365483,Nothing,0.931873,0.723546,0.558085,0.163396,0.940997,0.399685,0.617509] + series = [series_a, series_b, series_c] + + Test.specify "can calculate a covariance matrix" <| + matrix = Statistics.covariance_matrix series + matrix.length . should_equal 3 + vector_compare (matrix.at 0) [0.0571699, -0.0053554, -0.02378204] + vector_compare (matrix.at 1) [-0.0053554, 0.07707381, -0.00098274] + vector_compare (matrix.at 2) [-0.02378204, -0.00098274, 0.05837098] + + Test.specify "can calculate a correlation matrix" <| + matrix = Statistics.pearson_correlation series + matrix.length . should_equal 3 + vector_compare (matrix.at 0) [1, -0.08263943, -0.40469045] + vector_compare (matrix.at 1) [-0.08263943, 1, -0.01537537] + vector_compare (matrix.at 2) [-0.40469045, -0.01537537, 1] + + Test.group "Statistics - invalid input" <| + Test.specify "should fail with Illegal_Argument_Error on number based statistics for text Vector" <| + series = [["A", "B", Nothing, "D"], ["A", "B", Nothing, "D"]] + Statistics.covariance_matrix series . should_fail_with Illegal_Argument_Error + Statistics.pearson_correlation series . 
should_fail_with Illegal_Argument_Error + + main = Test.Suite.run_main here.spec From e9d863860ac5ed897f224cb2d38e889e0f0ce805 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Wed, 18 May 2022 11:43:57 +0100 Subject: [PATCH 02/13] WIP --- .../Base/0.0.0-dev/src/Data/Statistics.enso | 5 +++++ .../src/Data/Statistics/Rank_Method.enso | 18 ++++++++++++++++++ .../java/org/enso/base/statistics/Rank.java | 10 ++++++++++ 3 files changed, 33 insertions(+) create mode 100644 distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics/Rank_Method.enso create mode 100644 std-bits/base/src/main/java/org/enso/base/statistics/Rank.java diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso index def5dadaad4b..f1f55a07b0cf 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso @@ -6,6 +6,7 @@ import Standard.Base.Data.Ordering.Comparator polyglot java import org.enso.base.statistics.Moments polyglot java import org.enso.base.statistics.CountMinMax polyglot java import org.enso.base.statistics.CorrelationStats +polyglot java import org.enso.base.statistics.Rank polyglot java import java.lang.IllegalArgumentException type Statistic @@ -175,3 +176,7 @@ Vector.Vector.compute statistic=Count = Vector.Vector.compute_bulk : [Statistic] -> [Any] Vector.Vector.compute_bulk statistics=[Count, Sum] = here.compute_bulk this statistics + +test : Any +test = + Vector.from_array (Rank.rank) \ No newline at end of file diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics/Rank_Method.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics/Rank_Method.enso new file mode 100644 index 000000000000..0e667014209e --- /dev/null +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics/Rank_Method.enso @@ -0,0 +1,18 @@ +from Standard.Base import Boolean, True, False, 
Nothing, Vector, Number, Any, Error, Array, Panic, Illegal_Argument_Error, Unsupported_Argument_Types + +type Rank_Method + ## Use the mean of all ranks for equal values. + type Average + + ## Use the lowest of all ranks for equal values. + type Minimum + + ## Use the highest of all ranks for equal values. + type Maximum + + ## Use same rank value for equal values and next group is + type Dense + + ## Equal values are assigned the next rank in order that they occur. + type Ordinal + diff --git a/std-bits/base/src/main/java/org/enso/base/statistics/Rank.java b/std-bits/base/src/main/java/org/enso/base/statistics/Rank.java new file mode 100644 index 000000000000..1c73b61244ce --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/statistics/Rank.java @@ -0,0 +1,10 @@ +package org.enso.base.statistics; + +import java.time.LocalDate; +import java.time.LocalDateTime; + +public class Rank { + public static Object[] rank() { + return new Object[] { 1, 2.2, true, LocalDate.now(), LocalDateTime.now() }; + } +} From 8516c3577e04c387d2c6916045a6d672fb7181ed Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Tue, 24 May 2022 17:44:50 +0100 Subject: [PATCH 03/13] Adding Rank functionality to Java and Enso --- .../Base/0.0.0-dev/src/Data/Statistics.enso | 36 ++++++++++-- .../src/Data/Statistics/Rank_Method.enso | 18 ------ .../java/org/enso/base/statistics/Rank.java | 56 +++++++++++++++++-- 3 files changed, 84 insertions(+), 26 deletions(-) delete mode 100644 distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics/Rank_Method.enso diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso index f1f55a07b0cf..d26c95ef44f3 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso @@ -75,6 +75,23 @@ type Statistic type R_Squared (predicted:Vector) +type Rank_Method + ## Use the mean of all 
ranks for equal values. + type Average + + ## Use the lowest of all ranks for equal values. + type Minimum + + ## Use the highest of all ranks for equal values. + type Maximum + + ## Use same rank value for equal values and next group is + type Dense + + ## Equal values are assigned the next rank in order that they occur. + type Ordinal + + ## Compute a single statistic on a vector like object. Arguments: @@ -159,6 +176,21 @@ correlation_stats data = stats_array = handle_unsupported <| handle_illegal <| CorrelationStats.computeMatrix data_array Vector.new stats_array.length i->(Vector.Vector (stats_array.at i)) +## Assigns a rank to each value of data, dealing with equal values according to the method. + + Arguments: + - data: Input data to rank. + - method: Method used to deal with equal values. +rank_data : Vector -> Rank_Method -> Vector +rank_data input method=Rank_Method.Average = + java_method = case method of + Rank_Method.Minimum -> Rank.Method.MINIMUM + Rank_Method.Maximum -> Rank.Method.MAXIMUM + Rank_Method.Average -> Rank.Method.AVERAGE + Rank_Method.Ordinal -> Rank.Method.ORDINAL + Rank_Method.Dense -> Rank.Method.DENSE + java_ranks = Rank.rank input.to_array Comparator.new java_method + Vector.Vector java_ranks ## Compute a single statistic on the vector. 
@@ -176,7 +208,3 @@ Vector.Vector.compute statistic=Count = Vector.Vector.compute_bulk : [Statistic] -> [Any] Vector.Vector.compute_bulk statistics=[Count, Sum] = here.compute_bulk this statistics - -test : Any -test = - Vector.from_array (Rank.rank) \ No newline at end of file diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics/Rank_Method.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics/Rank_Method.enso deleted file mode 100644 index 0e667014209e..000000000000 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics/Rank_Method.enso +++ /dev/null @@ -1,18 +0,0 @@ -from Standard.Base import Boolean, True, False, Nothing, Vector, Number, Any, Error, Array, Panic, Illegal_Argument_Error, Unsupported_Argument_Types - -type Rank_Method - ## Use the mean of all ranks for equal values. - type Average - - ## Use the lowest of all ranks for equal values. - type Minimum - - ## Use the highest of all ranks for equal values. - type Maximum - - ## Use same rank value for equal values and next group is - type Dense - - ## Equal values are assigned the next rank in order that they occur. 
- type Ordinal - diff --git a/std-bits/base/src/main/java/org/enso/base/statistics/Rank.java b/std-bits/base/src/main/java/org/enso/base/statistics/Rank.java index 1c73b61244ce..c438326f4526 100644 --- a/std-bits/base/src/main/java/org/enso/base/statistics/Rank.java +++ b/std-bits/base/src/main/java/org/enso/base/statistics/Rank.java @@ -1,10 +1,58 @@ package org.enso.base.statistics; -import java.time.LocalDate; -import java.time.LocalDateTime; +import java.util.Arrays; +import java.util.Comparator; +import java.util.stream.IntStream; public class Rank { - public static Object[] rank() { - return new Object[] { 1, 2.2, true, LocalDate.now(), LocalDateTime.now() }; + public enum Method { + AVERAGE, + MINIMUM, + MAXIMUM, + DENSE, + ORDINAL + } + + private record ValueWithIndex(Object value, int index) { + } + + public static double[] rank(Object[] input, Comparator comparator, Method method) + { + Comparator tupleComparator = (a, b) -> { + int c = comparator.compare(a.value, b.value); + return c == 0 ? 
Integer.compare(a.index, b.index) : c; + }; + ValueWithIndex[] tuples = IntStream.range(0, input.length).mapToObj(i -> new ValueWithIndex(input[i], i)).toArray(ValueWithIndex[]::new); + Arrays.sort(tuples, tupleComparator); + + double[] output = new double[input.length]; + + int index = 0; + int dense = 0; + while (index < tuples.length) + { + dense++; + int bottom = index; + + // Find Top + while (index < tuples.length && comparator.compare(tuples[bottom].value, tuples[index].value) == 0) { + index++; + } + + // Build Rank + for (int i = bottom; i < index; i++) { + double rank = switch (method) { + case MINIMUM -> bottom + 1; + case MAXIMUM -> index; + case DENSE -> dense; + case AVERAGE -> (bottom + 1 + index) / 2.0; + case ORDINAL -> i + 1; + }; + + output[tuples[i].index] = rank; + } + } + + return output; } } From b0bf5545ecde4d685e13af93dd773b4d0030e6b4 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Wed, 25 May 2022 16:22:05 +0100 Subject: [PATCH 04/13] Add rank_data to Statistics module --- .../Base/0.0.0-dev/src/Data/Statistics.enso | 60 +++++++++---------- .../src/Data/Statistics/Rank_Method.enso | 16 +++++ .../java/org/enso/base/statistics/Rank.java | 12 +++- test/Tests/src/Data/Statistics_Spec.enso | 32 ++++++++++ 4 files changed, 86 insertions(+), 34 deletions(-) create mode 100644 distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics/Rank_Method.enso diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso index d26c95ef44f3..ae1b39bb2140 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso @@ -3,11 +3,16 @@ from Standard.Base.Data.Vector import Empty_Error import Standard.Base.Data.Ordering.Comparator +import Standard.Base.Data.Statistics.Rank_Method + polyglot java import org.enso.base.statistics.Moments polyglot java import 
org.enso.base.statistics.CountMinMax polyglot java import org.enso.base.statistics.CorrelationStats polyglot java import org.enso.base.statistics.Rank + polyglot java import java.lang.IllegalArgumentException +polyglot java import java.lang.ClassCastException +polyglot java import java.lang.NullPointerException type Statistic ## PRIVATE @@ -75,23 +80,6 @@ type Statistic type R_Squared (predicted:Vector) -type Rank_Method - ## Use the mean of all ranks for equal values. - type Average - - ## Use the lowest of all ranks for equal values. - type Minimum - - ## Use the highest of all ranks for equal values. - type Maximum - - ## Use same rank value for equal values and next group is - type Dense - - ## Equal values are assigned the next rank in order that they occur. - type Ordinal - - ## Compute a single statistic on a vector like object. Arguments: @@ -176,21 +164,6 @@ correlation_stats data = stats_array = handle_unsupported <| handle_illegal <| CorrelationStats.computeMatrix data_array Vector.new stats_array.length i->(Vector.Vector (stats_array.at i)) -## Assigns a rank to each value of data, dealing with equal values according to the method. - - Arguments: - - data: Input data to rank. - - method: Method used to deal with equal values. -rank_data : Vector -> Rank_Method -> Vector -rank_data input method=Rank_Method.Average = - java_method = case method of - Rank_Method.Minimum -> Rank.Method.MINIMUM - Rank_Method.Maximum -> Rank.Method.MAXIMUM - Rank_Method.Average -> Rank.Method.AVERAGE - Rank_Method.Ordinal -> Rank.Method.ORDINAL - Rank_Method.Dense -> Rank.Method.DENSE - java_ranks = Rank.rank input.to_array Comparator.new java_method - Vector.Vector java_ranks ## Compute a single statistic on the vector. 
@@ -208,3 +181,26 @@ Vector.Vector.compute statistic=Count = Vector.Vector.compute_bulk : [Statistic] -> [Any] Vector.Vector.compute_bulk statistics=[Count, Sum] = here.compute_bulk this statistics + + +## Assigns a rank to each value of data, dealing with equal values according to the method. + + Arguments: + - data: Input data to rank. + - method: Method used to deal with equal values. +rank_data : Vector -> Rank_Method -> Vector +rank_data input method=Rank_Method.Average = + java_method = case method of + Rank_Method.Minimum -> Rank.Method.MINIMUM + Rank_Method.Maximum -> Rank.Method.MAXIMUM + Rank_Method.Average -> Rank.Method.AVERAGE + Rank_Method.Ordinal -> Rank.Method.ORDINAL + Rank_Method.Dense -> Rank.Method.DENSE + + report_nullpointer caught_panic = Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage) + handle_nullpointer = Panic.catch NullPointerException handler=report_nullpointer + handle_classcast = Panic.catch ClassCastException handler=(Error.throw Vector.Incomparable_Values_Error) + + handle_classcast <| handle_nullpointer <| + java_ranks = Rank.rank input.to_array Comparator.new java_method + Vector.Vector java_ranks diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics/Rank_Method.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics/Rank_Method.enso new file mode 100644 index 000000000000..2fc9c2fa7e67 --- /dev/null +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics/Rank_Method.enso @@ -0,0 +1,16 @@ + +type Rank_Method + ## Use the mean of all ranks for equal values. + type Average + + ## Use the lowest of all ranks for equal values. + type Minimum + + ## Use the highest of all ranks for equal values. + type Maximum + + ## Use same rank value for equal values and next group is + type Dense + + ## Equal values are assigned the next rank in order that they occur. 
+ type Ordinal diff --git a/std-bits/base/src/main/java/org/enso/base/statistics/Rank.java b/std-bits/base/src/main/java/org/enso/base/statistics/Rank.java index c438326f4526..e291cd125982 100644 --- a/std-bits/base/src/main/java/org/enso/base/statistics/Rank.java +++ b/std-bits/base/src/main/java/org/enso/base/statistics/Rank.java @@ -17,12 +17,20 @@ private record ValueWithIndex(Object value, int index) { } public static double[] rank(Object[] input, Comparator comparator, Method method) + throws NullPointerException, ClassCastException { Comparator tupleComparator = (a, b) -> { int c = comparator.compare(a.value, b.value); - return c == 0 ? Integer.compare(a.index, b.index) : c; + return c == 0 ? Integer.compare(a.index, b.index) : -c; }; - ValueWithIndex[] tuples = IntStream.range(0, input.length).mapToObj(i -> new ValueWithIndex(input[i], i)).toArray(ValueWithIndex[]::new); + + ValueWithIndex[] tuples = new ValueWithIndex[input.length]; + for(int i = 0; i < input.length; i++) { + if (input[i] == null) { + throw new NullPointerException("Value is Nothing at index " + i); + } + tuples[i] = new ValueWithIndex(input[i], i); + } Arrays.sort(tuples, tupleComparator); double[] output = new double[input.length]; diff --git a/test/Tests/src/Data/Statistics_Spec.enso b/test/Tests/src/Data/Statistics_Spec.enso index 1393f318b448..b0ff145adea4 100644 --- a/test/Tests/src/Data/Statistics_Spec.enso +++ b/test/Tests/src/Data/Statistics_Spec.enso @@ -1,6 +1,7 @@ from Standard.Base import Nothing, Vector, Number, Decimal, True, Illegal_Argument_Error, False import Standard.Base.Data.Statistics +import Standard.Base.Data.Statistics.Rank_Method from Standard.Base.Data.Statistics import all import Standard.Test @@ -156,6 +157,37 @@ spec = Test.specify "should fail with Incomparable_Values_Error on mixed Vectors" <| [1, False].compute Minimum . 
should_fail_with Vector.Incomparable_Values_Error + Test.group "Rank Data" <| + Test.specify "can rank a Decimal data series" <| + values = [409.892906, 0.839952, 796.468572, 126.931298, -405.265005, -476.675817, 441.651325, 796.468572, 78.50094, 340.163324, 234.861926, 409.892906, 226.467105, 234.861926, 126.931298, 637.870512, -71.008044, -386.399663, -126.534337, -476.675817, 78.50094, -386.399663, 409.892906, 868.54485, 669.113037, 669.113037, 0.839952, 407.162613, -476.675817, 126.931298] + Statistics.rank_data values . should_equal [9, 21.5, 2.5, 17, 27, 29, 7, 2.5, 19.5, 12, 13.5, 9, 15, 13.5, 17, 6, 23, 25.5, 24, 29, 19.5, 25.5, 9, 1, 4.5, 4.5, 21.5, 11, 29, 17] + Statistics.rank_data values Rank_Method.Minimum . should_equal [8, 21, 2, 16, 27, 28, 7, 2, 19, 12, 13, 8, 15, 13, 16, 6, 23, 25, 24, 28, 19, 25, 8, 1, 4, 4, 21, 11, 28, 16] + Statistics.rank_data values Rank_Method.Maximum . should_equal [10, 22, 3, 18, 27, 30, 7, 3, 20, 12, 14, 10, 15, 14, 18, 6, 23, 26, 24, 30, 20, 26, 10, 1, 5, 5, 22, 11, 30, 18] + Statistics.rank_data values Rank_Method.Ordinal . should_equal [8, 21, 2, 16, 27, 28, 7, 3, 19, 12, 13, 9, 15, 14, 17, 6, 23, 25, 24, 29, 20, 26, 10, 1, 4, 5, 22, 11, 30, 18] + Statistics.rank_data values Rank_Method.Dense . should_equal [6, 13, 2, 11, 17, 18, 5, 2, 12, 8, 9, 6, 10, 9, 11, 4, 14, 16, 15, 18, 12, 16, 6, 1, 3, 3, 13, 7, 18, 11] + + Test.specify "can rank an Integer data series" <| + values = [10, 1, 124, 10] + Statistics.rank_data values . should_equal [2.5, 4, 1, 2.5] + + Test.specify "can rank a Number data series" <| + values = [10.0, 1, 12.4, 10] + Statistics.rank_data values . should_equal [2.5, 4, 1, 2.5] + + Test.specify "can rank a Text data series" <| + values = ["G", "AA", "B", "G", "D"] + Statistics.rank_data values . should_equal [1.5, 5, 4, 1.5, 3] + + Test.specify "should fail with Incomparable_Values_Error on custom type without compare_to" <| + values = [No_Ord 10, No_Ord 2, No_Ord 9] + Statistics.rank_data values . 
should_fail_with Vector.Incomparable_Values_Error + + Test.specify "should fail with Incomparable_Values_Error on mixed Vectors" <| + Statistics.rank_data [1, "A"] . should_fail_with Vector.Incomparable_Values_Error + + Test.specify "should fail with Illegal_Argument_Error on Vectors with Nothing" <| + Statistics.rank_data [1, Nothing, 4] . should_fail_with Illegal_Argument_Error + Test.group "Correlation Statistics" <| series_a = [0.22345,0.258315,0.74663,Nothing,0.686843,0.692246,Nothing,0.401859,0.725442,Nothing,0.963527,0.520363,0.633053,0.397123,Nothing,0.458942,0.036499,0.368194,0.598939,0.296476,0.093746,0.609329] series_b = [0.140743,Nothing,0.574639,0.251683,0.902023,0.08723,0.251813,0.1669,0.234405,Nothing,0.28774,0.471757,0.280681,0.925207,0.919041,0.626234,0.429497,0.358597,0.566118,0.333606,0.828172,0.887829] From 18dbeb59237e1afa1928e58f5db5b8e2b9a77725 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Wed, 25 May 2022 17:01:13 +0100 Subject: [PATCH 05/13] Single statistic version of Correlation, Covariance and RSquared. 
Added test cases --- .../Base/0.0.0-dev/src/Data/Statistics.enso | 30 ++++++++++++++----- ...nStats.java => CorrelationStatistics.java} | 12 ++++---- test/Tests/src/Data/Statistics_Spec.enso | 21 +++++++++++++ 3 files changed, 50 insertions(+), 13 deletions(-) rename std-bits/base/src/main/java/org/enso/base/statistics/{CorrelationStats.java => CorrelationStatistics.java} (83%) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso index ae1b39bb2140..e41ea34fe475 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso @@ -7,7 +7,7 @@ import Standard.Base.Data.Statistics.Rank_Method polyglot java import org.enso.base.statistics.Moments polyglot java import org.enso.base.statistics.CountMinMax -polyglot java import org.enso.base.statistics.CorrelationStats +polyglot java import org.enso.base.statistics.CorrelationStatistics polyglot java import org.enso.base.statistics.Rank polyglot java import java.lang.IllegalArgumentException @@ -125,6 +125,9 @@ compute_bulk data statistics=[Count, Sum] = Maximum -> if count_min_max_values.comparatorError then (Error.throw Vector.Incomparable_Values_Error) else count_min_max_values.maximum + Covariance s -> here.calculate_correlation_statistics data s . covariance + Pearson s -> here.calculate_correlation_statistics data s . pearsonCorrelation + R_Squared s -> here.calculate_correlation_statistics data s . 
rSquared _ -> stats_array.at i @@ -134,7 +137,7 @@ compute_bulk data statistics=[Count, Sum] = - data: The input data sets covariance_matrix : [Vector] -> [Vector] covariance_matrix data = - stats_vectors = here.correlation_stats data + stats_vectors = here.calculate_correlation_statistics_matrix data stats_vectors.map v->(v.map .covariance) @@ -144,14 +147,27 @@ covariance_matrix data = - data: The input data sets pearson_correlation : [Vector] -> [Vector] pearson_correlation data = - stats_vectors = here.correlation_stats data + stats_vectors = here.calculate_correlation_statistics_matrix data stats_vectors.map v->(v.map .pearsonCorrelation) ## PRIVATE - Given a set of series get CorrelationStats objects -correlation_stats : [Vector] -> [CorrelationStats] -correlation_stats data = + Given a set of series get CorrelationStatistics objects +calculate_correlation_statistics : Vector -> Vector -> CorrelationStatistics +calculate_correlation_statistics x_data y_data = + report_unsupported _ = Error.throw (Illegal_Argument_Error ("Can only compute correlations on numerical data sets.")) + handle_unsupported = Panic.catch Unsupported_Argument_Types handler=report_unsupported + + report_illegal caught_panic = Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage) + handle_illegal = Panic.catch IllegalArgumentException handler=report_illegal + + handle_unsupported <| handle_illegal <| CorrelationStatistics.compute x_data.to_array y_data.to_array + + +## PRIVATE + Given a set of series get CorrelationStatistics objects +calculate_correlation_statistics_matrix : [Vector] -> [CorrelationStatistics] +calculate_correlation_statistics_matrix data = data_array = Array.new data.length 0.up_to data.length . 
each i->(data_array.set_at i (data.at i).to_array) @@ -161,7 +177,7 @@ correlation_stats data = report_illegal caught_panic = Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage) handle_illegal = Panic.catch IllegalArgumentException handler=report_illegal - stats_array = handle_unsupported <| handle_illegal <| CorrelationStats.computeMatrix data_array + stats_array = handle_unsupported <| handle_illegal <| CorrelationStatistics.computeMatrix data_array Vector.new stats_array.length i->(Vector.Vector (stats_array.at i)) diff --git a/std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStats.java b/std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStatistics.java similarity index 83% rename from std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStats.java rename to std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStatistics.java index d9c19c92b96a..0f37296ae84b 100644 --- a/std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStats.java +++ b/std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStatistics.java @@ -3,7 +3,7 @@ /** * Class to compute covariance and correlations between series. */ -public class CorrelationStats { +public class CorrelationStatistics { private long count = 0; private double totalX = 0.0; private double totalXX = 0.0; @@ -53,19 +53,19 @@ public double rSquared() { * @param y Array of Y values * @return CorrelationStats object for the 2 series. 
*/ - public static CorrelationStats compute(Double[] x, Double[] y) { + public static CorrelationStatistics compute(Double[] x, Double[] y) { if (x.length != y.length) { throw new IllegalArgumentException("Left and right lengths are not the same."); } - CorrelationStats output = new CorrelationStats(); + CorrelationStatistics output = new CorrelationStatistics(); for (int i = 0; i < x.length; i++) { output.append(x[i], y[i]); } return output; } - public static CorrelationStats[][] computeMatrix(Double[][] data) { + public static CorrelationStatistics[][] computeMatrix(Double[][] data) { int len = data[0].length; for (int i = 1; i < data.length; i++) { @@ -74,9 +74,9 @@ public static CorrelationStats[][] computeMatrix(Double[][] data) { } } - CorrelationStats[][] output = new CorrelationStats[data.length][]; + CorrelationStatistics[][] output = new CorrelationStatistics[data.length][]; for (int i = 0; i < data.length; i++) { - output[i] = new CorrelationStats[data.length]; + output[i] = new CorrelationStatistics[data.length]; for (int j = 0; j < data.length; j++) { if (j < i) { output[i][j] = output[j][i]; diff --git a/test/Tests/src/Data/Statistics_Spec.enso b/test/Tests/src/Data/Statistics_Spec.enso index b0ff145adea4..105c152333cf 100644 --- a/test/Tests/src/Data/Statistics_Spec.enso +++ b/test/Tests/src/Data/Statistics_Spec.enso @@ -194,6 +194,11 @@ spec = series_c = [Nothing,0.769797,0.281678,0.462145,0.727132,0.327978,Nothing,0.648639,0.562636,Nothing,0.159836,0.367404,0.877087,0.365483,Nothing,0.931873,0.723546,0.558085,0.163396,0.940997,0.399685,0.617509] series = [series_a, series_b, series_c] + Test.specify "can compute Covariance, Correlation and R Squared between a pair of series" + series_a.compute (Covariance series_b) . should_equal -0.0053554 epsilon=double_error + series_a.compute (Pearson series_b) . should_equal -0.08263943 epsilon=double_error + series_a.compute (R_Squared series_b) . 
should_equal 0.006829275 epsilon=double_error + Test.specify "can calculate a covariance matrix" <| matrix = Statistics.covariance_matrix series matrix.length . should_equal 3 @@ -208,6 +213,22 @@ spec = vector_compare (matrix.at 1) [-0.08263943, 1, -0.01537537] vector_compare (matrix.at 2) [-0.40469045, -0.01537537, 1] + Test.specify "should fail with Illegal_Argument_Error if different lengths" <| + data = [[1,2,3,4],[10,20,30]] + data.first.compute (Covariance data.second) . should_fail_with Illegal_Argument_Error + data.first.compute (Pearson data.second) . should_fail_with Illegal_Argument_Error + data.first.compute (R_Squared data.second) . should_fail_with Illegal_Argument_Error + Statistics.covariance_matrix data . should_fail_with Illegal_Argument_Error + Statistics.pearson_correlation data . should_fail_with Illegal_Argument_Error + + Test.specify "should fail with Illegal_Argument_Error if not number based" <| + text = [["A","BC","CD"], ["0", "1", "2"], ["H", "I", "J"]] + text.first.compute (Covariance text.second) . should_fail_with Illegal_Argument_Error + text.first.compute (Pearson text.second) . should_fail_with Illegal_Argument_Error + text.first.compute (R_Squared text.second) . should_fail_with Illegal_Argument_Error + Statistics.covariance_matrix text . should_fail_with Illegal_Argument_Error + Statistics.pearson_correlation text . 
should_fail_with Illegal_Argument_Error + Test.group "Statistics - invalid input" <| Test.specify "should fail with Illegal_Argument_Error on number based statistics for text Vector" <| series = [["A", "B", Nothing, "D"], ["A", "B", Nothing, "D"]] From 80f07d72c00059dcee61a9f42fe295b1b61ee055 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Thu, 26 May 2022 11:53:51 +0100 Subject: [PATCH 06/13] Error handling in spearman matrix still to do --- .../Base/0.0.0-dev/src/Data/Statistics.enso | 61 ++++++++++++---- .../statistics/CorrelationStatistics.java | 10 +++ .../java/org/enso/base/statistics/Rank.java | 71 +++++++++++++------ test/Tests/src/Data/Statistics_Spec.enso | 14 +++- 4 files changed, 122 insertions(+), 34 deletions(-) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso index e41ea34fe475..61e383734ced 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso @@ -1,4 +1,6 @@ -from Standard.Base import Boolean, True, False, Nothing, Vector, Number, Any, Error, Array, Panic, Illegal_Argument_Error, Unsupported_Argument_Types +from Standard.Base import Boolean, True, False, Nothing, Vector, Number, Any, Error, Array, Panic, Illegal_Argument_Error, Unsupported_Argument_Types, IO +import Standard.Base.Runtime.Ref + from Standard.Base.Data.Vector import Empty_Error import Standard.Base.Data.Ordering.Comparator @@ -72,6 +74,12 @@ type Statistic - series: the series to compute the correlation with. type Pearson (series:Vector) + ## Calculate the Spearman Rank Correlation between data and series. + + Arguments: + - series: the series to compute the correlation with. + type Spearman (series:Vector) + ## Calculate the coefficient of determination between data and predicted series. 
@@ -127,6 +135,7 @@ compute_bulk data statistics=[Count, Sum] = count_min_max_values.maximum Covariance s -> here.calculate_correlation_statistics data s . covariance Pearson s -> here.calculate_correlation_statistics data s . pearsonCorrelation + Spearman s -> here.calculate_spearman_rank data s R_Squared s -> here.calculate_correlation_statistics data s . rSquared _ -> stats_array.at i @@ -151,17 +160,51 @@ pearson_correlation data = stats_vectors.map v->(v.map .pearsonCorrelation) +## Calculate a Spearman Rank correlation matrix between the input series. + + Arguments: + - data: The input data sets +spearman_correlation : [Vector] -> [Vector] +spearman_correlation data = + output_array = Array.new data.length + ref_error = Ref.new Nothing + + 0.up_to data.length . each i-> + output_array.set_at i <| + Vector.new data.length j-> + if j == i then 1 else + if j < i then (output_array.at j . at i) else + value = here.calculate_spearman_rank (data.at i) (data.at j) + if value.is_error then ref_error.put value + value + + ref_error.get.if_nothing (Vector.Vector output_array) + + ## PRIVATE - Given a set of series get CorrelationStatistics objects -calculate_correlation_statistics : Vector -> Vector -> CorrelationStatistics -calculate_correlation_statistics x_data y_data = +wrap_java_call : Any -> Any +wrap_java_call ~function = report_unsupported _ = Error.throw (Illegal_Argument_Error ("Can only compute correlations on numerical data sets.")) handle_unsupported = Panic.catch Unsupported_Argument_Types handler=report_unsupported report_illegal caught_panic = Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage) handle_illegal = Panic.catch IllegalArgumentException handler=report_illegal - handle_unsupported <| handle_illegal <| CorrelationStatistics.compute x_data.to_array y_data.to_array + handle_unsupported <| handle_illegal <| function + + +## PRIVATE + Given two series, get a computed CorrelationStatistics object 
+calculate_correlation_statistics : Vector -> Vector -> CorrelationStatistics +calculate_correlation_statistics x_data y_data = + here.wrap_java_call <| CorrelationStatistics.compute x_data.to_array y_data.to_array + + +## PRIVATE + Given two series, compute the Spearman Rank correlation +calculate_spearman_rank : Vector -> Vector -> Decimal +calculate_spearman_rank x_data y_data = + here.wrap_java_call <| CorrelationStatistics.spearmanRankCorrelation x_data.to_array y_data.to_array ## PRIVATE @@ -171,13 +214,7 @@ calculate_correlation_statistics_matrix data = data_array = Array.new data.length 0.up_to data.length . each i->(data_array.set_at i (data.at i).to_array) - report_unsupported _ = Error.throw (Illegal_Argument_Error ("Can only compute correlations on numerical data sets.")) - handle_unsupported = Panic.catch Unsupported_Argument_Types handler=report_unsupported - - report_illegal caught_panic = Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage) - handle_illegal = Panic.catch IllegalArgumentException handler=report_illegal - - stats_array = handle_unsupported <| handle_illegal <| CorrelationStatistics.computeMatrix data_array + stats_array = here.wrap_java_call <| CorrelationStatistics.computeMatrix data_array Vector.new stats_array.length i->(Vector.Vector (stats_array.at i)) diff --git a/std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStatistics.java b/std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStatistics.java index 0f37296ae84b..5fa2380a9c3d 100644 --- a/std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStatistics.java +++ b/std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStatistics.java @@ -87,4 +87,14 @@ public static CorrelationStatistics[][] computeMatrix(Double[][] data) { } return output; } + + public static double spearmanRankCorrelation(Double[] x, Double[] y) { + double[][] pairedRanks = Rank.pairedRanks(x, y, Rank.Method.AVERAGE); + + 
CorrelationStatistics computation = new CorrelationStatistics(); + for (int i = 0; i < pairedRanks[0].length; i++) { + computation.append(pairedRanks[0][i], pairedRanks[1][i]); + } + return computation.pearsonCorrelation(); + } } diff --git a/std-bits/base/src/main/java/org/enso/base/statistics/Rank.java b/std-bits/base/src/main/java/org/enso/base/statistics/Rank.java index e291cd125982..bf0f5ef5de3d 100644 --- a/std-bits/base/src/main/java/org/enso/base/statistics/Rank.java +++ b/std-bits/base/src/main/java/org/enso/base/statistics/Rank.java @@ -1,10 +1,10 @@ package org.enso.base.statistics; -import java.util.Arrays; -import java.util.Comparator; -import java.util.stream.IntStream; +import java.util.*; public class Rank { + private static final Comparator DOUBLE_COMPARATOR = (a, b) -> Double.compare((Double)a, (Double)b); + public enum Method { AVERAGE, MINIMUM, @@ -19,45 +19,74 @@ private record ValueWithIndex(Object value, int index) { public static double[] rank(Object[] input, Comparator comparator, Method method) throws NullPointerException, ClassCastException { - Comparator tupleComparator = (a, b) -> { - int c = comparator.compare(a.value, b.value); - return c == 0 ? 
Integer.compare(a.index, b.index) : -c; - }; - - ValueWithIndex[] tuples = new ValueWithIndex[input.length]; + List tuples = new ArrayList<>(input.length); for(int i = 0; i < input.length; i++) { if (input[i] == null) { throw new NullPointerException("Value is Nothing at index " + i); } - tuples[i] = new ValueWithIndex(input[i], i); + tuples.add(new ValueWithIndex(input[i], i)); } - Arrays.sort(tuples, tupleComparator); - double[] output = new double[input.length]; + return computeRankFromTuples(tuples, comparator, method); + } + + public static double[][] pairedRanks(Double[] x, Double[] y, Method method) + throws IllegalArgumentException, NullPointerException, ClassCastException + { + if (x.length != y.length) { + throw new IllegalArgumentException("Left and right lengths are not the same."); + } + + List x_tuples = new ArrayList<>(x.length); + List y_tuples = new ArrayList<>(y.length); + for (int i = 0; i < x.length; i++) { + if (x[i] == null || Double.isNaN(x[i]) || y[i] == null || Double.isNaN(y[i])) { + continue; + } + + x_tuples.add(new ValueWithIndex(x[i], x_tuples.size())); + y_tuples.add(new ValueWithIndex(y[i], y_tuples.size())); + } + + return new double[][] { + computeRankFromTuples(x_tuples, DOUBLE_COMPARATOR, method), + computeRankFromTuples(y_tuples, DOUBLE_COMPARATOR, method) + }; + } + + private static double[] computeRankFromTuples(List tuples, Comparator comparator, Method method) + throws NullPointerException, ClassCastException + { + Comparator tupleComparator = (a, b) -> { + int c = comparator.compare(a.value, b.value); + return c == 0 ? 
Integer.compare(a.index, b.index) : -c; + }; + tuples.sort(tupleComparator); + + double[] output = new double[tuples.size()]; int index = 0; int dense = 0; - while (index < tuples.length) - { + while (index < tuples.size()) { dense++; - int bottom = index; + int start = index; - // Find Top - while (index < tuples.length && comparator.compare(tuples[bottom].value, tuples[index].value) == 0) { + // Find End of Equal Values + while (index < tuples.size() && comparator.compare(tuples.get(start).value, tuples.get(index).value) == 0) { index++; } // Build Rank - for (int i = bottom; i < index; i++) { + for (int i = start; i < index; i++) { double rank = switch (method) { - case MINIMUM -> bottom + 1; + case MINIMUM -> start + 1; case MAXIMUM -> index; case DENSE -> dense; - case AVERAGE -> (bottom + 1 + index) / 2.0; + case AVERAGE -> (start + 1 + index) / 2.0; case ORDINAL -> i + 1; }; - output[tuples[i].index] = rank; + output[tuples.get(i).index] = rank; } } diff --git a/test/Tests/src/Data/Statistics_Spec.enso b/test/Tests/src/Data/Statistics_Spec.enso index 105c152333cf..bf08ff7c740b 100644 --- a/test/Tests/src/Data/Statistics_Spec.enso +++ b/test/Tests/src/Data/Statistics_Spec.enso @@ -197,6 +197,7 @@ spec = Test.specify "can compute Covariance, Correlation and R Squared between a pair of series" series_a.compute (Covariance series_b) . should_equal -0.0053554 epsilon=double_error series_a.compute (Pearson series_b) . should_equal -0.08263943 epsilon=double_error + series_a.compute (Spearman series_b) . should_equal -0.09313725 epsilon=double_error series_a.compute (R_Squared series_b) . 
should_equal 0.006829275 epsilon=double_error Test.specify "can calculate a covariance matrix" <| @@ -206,28 +207,39 @@ spec = vector_compare (matrix.at 1) [-0.0053554, 0.07707381, -0.00098274] vector_compare (matrix.at 2) [-0.02378204, -0.00098274, 0.05837098] - Test.specify "can calculate a correlation matrix" <| + Test.specify "can calculate a pearson correlation matrix" <| matrix = Statistics.pearson_correlation series matrix.length . should_equal 3 vector_compare (matrix.at 0) [1, -0.08263943, -0.40469045] vector_compare (matrix.at 1) [-0.08263943, 1, -0.01537537] vector_compare (matrix.at 2) [-0.40469045, -0.01537537, 1] + Test.specify "can calculate a spearman rank correlation matrix" <| + matrix = Statistics.spearman_correlation series + matrix.length . should_equal 3 + vector_compare (matrix.at 0) [1, -0.09313725, -0.43382353] + vector_compare (matrix.at 1) [-0.09313725, 1, 0] + vector_compare (matrix.at 2) [-0.43382353, 0, 1] + Test.specify "should fail with Illegal_Argument_Error if different lengths" <| data = [[1,2,3,4],[10,20,30]] data.first.compute (Covariance data.second) . should_fail_with Illegal_Argument_Error data.first.compute (Pearson data.second) . should_fail_with Illegal_Argument_Error + data.first.compute (Spearman data.second) . should_fail_with Illegal_Argument_Error data.first.compute (R_Squared data.second) . should_fail_with Illegal_Argument_Error Statistics.covariance_matrix data . should_fail_with Illegal_Argument_Error Statistics.pearson_correlation data . should_fail_with Illegal_Argument_Error + Statistics.spearman_correlation data . should_fail_with Illegal_Argument_Error Test.specify "should fail with Illegal_Argument_Error if not number based" <| text = [["A","BC","CD"], ["0", "1", "2"], ["H", "I", "J"]] text.first.compute (Covariance text.second) . should_fail_with Illegal_Argument_Error text.first.compute (Pearson text.second) . should_fail_with Illegal_Argument_Error + text.first.compute (Spearman text.second) . 
should_fail_with Illegal_Argument_Error text.first.compute (R_Squared text.second) . should_fail_with Illegal_Argument_Error Statistics.covariance_matrix text . should_fail_with Illegal_Argument_Error Statistics.pearson_correlation text . should_fail_with Illegal_Argument_Error + Statistics.spearman_correlation text . should_fail_with Illegal_Argument_Error Test.group "Statistics - invalid input" <| Test.specify "should fail with Illegal_Argument_Error on number based statistics for text Vector" <| From b127fb8837edaaedd31fcbd214ac24d0c569abdb Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Thu, 26 May 2022 17:05:23 +0100 Subject: [PATCH 07/13] Add Stats tests to Main.enso. Create helpers for promoting dataflow errors as panics. Remove set_at where possible. --- .../Base/0.0.0-dev/src/Data/Statistics.enso | 28 +++++++-------- .../Base/0.0.0-dev/src/Data/Vector.enso | 34 +++++++++++++------ .../Base/0.0.0-dev/src/Error/Common.enso | 17 ++++++++++ .../src/Internal/Vector_Builder.enso | 3 +- test/Tests/src/Main.enso | 2 ++ 5 files changed, 55 insertions(+), 29 deletions(-) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso index 61e383734ced..0c28a624a19d 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso @@ -1,5 +1,4 @@ -from Standard.Base import Boolean, True, False, Nothing, Vector, Number, Any, Error, Array, Panic, Illegal_Argument_Error, Unsupported_Argument_Types, IO -import Standard.Base.Runtime.Ref +from Standard.Base import Boolean, True, False, Nothing, Vector, Number, Any, Error, Array, Panic, Illegal_Argument_Error, Unsupported_Argument_Types from Standard.Base.Data.Vector import Empty_Error @@ -166,19 +165,18 @@ pearson_correlation data = - data: The input data sets spearman_correlation : [Vector] -> [Vector] spearman_correlation data = - output_array = 
Array.new data.length - ref_error = Ref.new Nothing + Panic.handle_wrapped_dataflow_error <| + output = Vector.new_builder data.length - 0.up_to data.length . each i-> - output_array.set_at i <| - Vector.new data.length j-> - if j == i then 1 else - if j < i then (output_array.at j . at i) else - value = here.calculate_spearman_rank (data.at i) (data.at j) - if value.is_error then ref_error.put value - value + 0.up_to data.length . each i-> + output.append <| + Vector.new data.length j-> + if j == i then 1 else + if j < i then (output.at j . at i) else + Panic.throw_wrapped_if_error <| + here.calculate_spearman_rank (data.at i) (data.at j) - ref_error.get.if_nothing (Vector.Vector output_array) + output.to_vector ## PRIVATE @@ -211,9 +209,7 @@ calculate_spearman_rank x_data y_data = Given a set of series get CorrelationStatistics objects calculate_correlation_statistics_matrix : [Vector] -> [CorrelationStatistics] calculate_correlation_statistics_matrix data = - data_array = Array.new data.length - 0.up_to data.length . each i->(data_array.set_at i (data.at i).to_array) - + data_array = Vector.new data.length i->(data.at i).to_array . to_array stats_array = here.wrap_java_call <| CorrelationStatistics.computeMatrix data_array Vector.new stats_array.length i->(Vector.Vector (stats_array.at i)) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso index b4808b58e152..e28baabf918c 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso @@ -55,6 +55,9 @@ fill length ~item = A vector allows to store an arbitrary number of elements in linear memory. It is the recommended data structure for most applications. + Arguments: + - capacity: Initial capacity of the Vector.Builder + > Example Construct a vector using a builder that contains the items 1 to 10. 
@@ -66,8 +69,8 @@ fill length ~item = @Tail_Call do_build start+1 stop do_build 1 10 builder.to_vector -new_builder : Builder -new_builder = Builder.new +new_builder : Integer -> Builder +new_builder (capacity=1) = Builder.new capacity ## ADVANCED @@ -141,13 +144,7 @@ type Vector at : Integer -> Any ! Index_Out_Of_Bounds_Error at index = actual_index = if index < 0 then this.length + index else index - ## TODO [RW] Ideally we do not want an additional check here, but we - should catch a Invalid_Array_Index_Error panic. However, such a catch - should still properly forward any other panics or dataflow errors - which is not fully possible until the approach to handling Panics is - improved, as described in the following Pivotal ticket: - https://www.pivotaltracker.com/n/projects/2539304/stories/181029230 - if actual_index>=0 && actual_index Error.throw (Index_Out_Of_Bounds_Error index this.length) ## ADVANCED @@ -1015,12 +1012,15 @@ type Builder ## Creates a new builder. + Arguments: + - capacity: Initial capacity of the Vector.Builder + > Example Make a new builder Vector.new_builder - new : Builder - new = Builder (Array.new 1) 0 + new : Integer->Builder + new (capacity=1) = Builder (Array.new capacity) 0 ## Returns the current capacity (i.e. the size of the underlying storage) of this builder. @@ -1088,6 +1088,18 @@ type Builder this.append item Nothing + ## Gets an element from the vector at a specified index (0-based). + + Arguments: + - index: The location in the vector to get the element from. The index is + also allowed to be negative, then the elements are indexed from the back + of the vector, i.e. -1 will correspond to the last element. + at : Integer -> Any ! 
Index_Out_Of_Bounds_Error + at index = + actual_index = if index < 0 then this.length + index else index + Panic.catch Invalid_Array_Index_Error (this.to_array.at actual_index) _-> + Error.throw (Index_Out_Of_Bounds_Error index this.length) + ## Checks whether a predicate holds for at least one element of this builder. Arguments: diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Error/Common.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Error/Common.enso index b15632960019..71c19edaec6b 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Error/Common.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Error/Common.enso @@ -386,6 +386,23 @@ type Panic True -> caught_panic.convert_to_dataflow_error False -> Panic.throw caught_panic + ## If a dataflow error had occurred, promote to a Panic and wrap in a Wrapped_Dataflow_Error + + Arguments: + - value: value to return if not an error, or rethrow as a Panic. + throw_wrapped_if_error : Any -> Any + throw_wrapped_if_error ~value = + if value.is_error then Panic.throw (Wrapped_Dataflow_Error value.catch) else value + + ## Catch any Wrapped_Dataflow_Error Panic and rethrow as a dataflow error + + Arguments: + - action: The code to execute that potentially raised a Wrapped_Dataflow_Error. + handle_wrapped_dataflow_error : Any -> Any + handle_wrapped_dataflow_error ~action = + Panic.catch Wrapped_Dataflow_Error action caught_panic-> + Error.throw caught_panic.payload.payload + ## The runtime representation of a syntax error. 
Arguments: diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Vector_Builder.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Vector_Builder.enso index 96a19ed355cc..d520d4ea974c 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Vector_Builder.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Vector_Builder.enso @@ -50,8 +50,7 @@ type Vector_Builder array = Array.new this.length go ix elem = case elem of Leaf vec -> - vec.map_with_index vi-> elem-> - array.set_at ix+vi elem + Array.copy vec.to_array 0 array ix vec.length ix + vec.length Append l r _ -> ix2 = go ix l diff --git a/test/Tests/src/Main.enso b/test/Tests/src/Main.enso index b87dc37874c5..d6aa7e6d5c8f 100644 --- a/test/Tests/src/Main.enso +++ b/test/Tests/src/Main.enso @@ -34,6 +34,7 @@ import project.Data.Ref_Spec import project.Data.Text_Spec import project.Data.Time.Spec as Time_Spec import project.Data.Vector_Spec +import project.Data.Statistics_Spec import project.Data.Text.Regex_Spec import project.Data.Text.Utils_Spec import project.Data.Text.Default_Regex_Engine_Spec @@ -104,4 +105,5 @@ main = Test.Suite.run_main <| Time_Spec.spec Uri_Spec.spec Vector_Spec.spec + Statistics_Spec.spec Warnings_Spec.spec From 5be226f2f0c0b9f7675a71b5982c1bfd99ee2497 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Thu, 26 May 2022 18:04:49 +0100 Subject: [PATCH 08/13] Changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6cf57f6534e5..af14004faf33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -131,6 +131,7 @@ and made it the default.][3472] - [Implemented a `Table.from Text` conversion allowing to parse strings representing `Delimited` files without storing them on the filesystem.][3478] +- [Added rank data, correlation and covariance statistics for `Vector`][3484] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -204,6 +205,7 @@ 
[3472]: https://github.com/enso-org/enso/pull/3472 [3486]: https://github.com/enso-org/enso/pull/3486 [3478]: https://github.com/enso-org/enso/pull/3478 +[3484]: https://github.com/enso-org/enso/pull/3484 #### Enso Compiler From 1fc867d7a87768d36b95cd0a73e5993bebc47b56 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Fri, 27 May 2022 08:43:11 +0100 Subject: [PATCH 09/13] Formatting --- .../org/enso/base/statistics/CorrelationStatistics.java | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStatistics.java b/std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStatistics.java index 5fa2380a9c3d..437378fe4ede 100644 --- a/std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStatistics.java +++ b/std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStatistics.java @@ -1,8 +1,6 @@ package org.enso.base.statistics; -/** - * Class to compute covariance and correlations between series. - */ +/** Class to compute covariance and correlations between series. */ public class CorrelationStatistics { private long count = 0; private double totalX = 0.0; @@ -44,11 +42,12 @@ public double pearsonCorrelation() { public double rSquared() { double correl = this.pearsonCorrelation(); - return correl * correl; + return correl * correl; } - /*** + /** * Create the CorrelationStats between two series + * * @param x Array of X values * @param y Array of Y values * @return CorrelationStats object for the 2 series. 
From 44e05faf0e50f9a23b9c0b782806ca0b4688a1d4 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Mon, 30 May 2022 09:13:41 +0100 Subject: [PATCH 10/13] Extra doc-strings --- .../Base/0.0.0-dev/src/Data/Statistics/Rank_Method.enso | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics/Rank_Method.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics/Rank_Method.enso index 2fc9c2fa7e67..1718667bdf79 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics/Rank_Method.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics/Rank_Method.enso @@ -1,4 +1,5 @@ +## Specifies how to handle ranking of equal values. type Rank_Method ## Use the mean of all ranks for equal values. type Average @@ -9,7 +10,8 @@ type Rank_Method ## Use the highest of all ranks for equal values. type Maximum - ## Use same rank value for equal values and next group is + ## Use same rank value for equal values and next group is the immediate + following ranking number. type Dense ## Equal values are assigned the next rank in order that they occur. 
From 5c1488e9d059527ef20e0e626abcd01f07d9373b Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Mon, 30 May 2022 10:25:14 +0100 Subject: [PATCH 11/13] Update distribution/lib/Standard/Base/0.0.0-dev/src/Error/Common.enso MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Radosław Waśko --- distribution/lib/Standard/Base/0.0.0-dev/src/Error/Common.enso | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Error/Common.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Error/Common.enso index 71c19edaec6b..01d05a660a6c 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Error/Common.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Error/Common.enso @@ -394,7 +394,7 @@ type Panic throw_wrapped_if_error ~value = if value.is_error then Panic.throw (Wrapped_Dataflow_Error value.catch) else value - ## Catch any Wrapped_Dataflow_Error Panic and rethrow as a dataflow error + ## Catch any `Wrapped_Dataflow_Error` Panic and rethrow it as a dataflow error. Arguments: - action: The code to execute that potentially raised a Wrapped_Dataflow_Error. 
From e3e75d4c66ccfe77b31c367eb98ecabf50d5fc92 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Mon, 30 May 2022 10:25:18 +0100 Subject: [PATCH 12/13] Update distribution/lib/Standard/Base/0.0.0-dev/src/Error/Common.enso MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Radosław Waśko --- distribution/lib/Standard/Base/0.0.0-dev/src/Error/Common.enso | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Error/Common.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Error/Common.enso index 01d05a660a6c..1ba22dd37c88 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Error/Common.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Error/Common.enso @@ -386,7 +386,7 @@ type Panic True -> caught_panic.convert_to_dataflow_error False -> Panic.throw caught_panic - ## If a dataflow error had occurred, promote to a Panic and wrap in a Wrapped_Dataflow_Error + ## If a dataflow error had occurred, wrap it in a `Wrapped_Dataflow_Error` and promote to a Panic. Arguments: - value: value to return if not an error, or rethrow as a Panic. From a4fa1d4aa48fb53f4c87fffc9059401347c35c66 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Mon, 30 May 2022 15:53:37 +0100 Subject: [PATCH 13/13] Hubert's PR comments. --- .../lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso | 2 +- .../org/enso/base/statistics/CorrelationStatistics.java | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso index e28baabf918c..9821106db991 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso @@ -62,7 +62,7 @@ fill length ~item = Construct a vector using a builder that contains the items 1 to 10. 
example_new_builder = - builder = Vector.new_builder + builder = Vector.new_builder 10 do_build start stop = builder.append start if start >= stop then Nothing else diff --git a/std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStatistics.java b/std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStatistics.java index 437378fe4ede..04fa6a731146 100644 --- a/std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStatistics.java +++ b/std-bits/base/src/main/java/org/enso/base/statistics/CorrelationStatistics.java @@ -67,14 +67,11 @@ public static CorrelationStatistics compute(Double[] x, Double[] y) { public static CorrelationStatistics[][] computeMatrix(Double[][] data) { int len = data[0].length; - for (int i = 1; i < data.length; i++) { + CorrelationStatistics[][] output = new CorrelationStatistics[data.length][]; + for (int i = 0; i < data.length; i++) { if (data[i].length != len) { throw new IllegalArgumentException("Data lengths are not consistent."); } - } - - CorrelationStatistics[][] output = new CorrelationStatistics[data.length][]; - for (int i = 0; i < data.length; i++) { output[i] = new CorrelationStatistics[data.length]; for (int j = 0; j < data.length; j++) { if (j < i) {