From bb2125f2f9e0c251510486f667bc88ba6db443ec Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Wed, 21 Jul 2021 14:02:11 -0400 Subject: [PATCH] [7.x] [ML] adding new p_value scoring heuristic to significant terms aggregation (#75313) (#75597) * [ML] adding new p_value scoring heuristic to significant terms aggregation (#75313) This commit adds a new p_value score heuristic to significant terms. The p_value is calculated assuming that the foreground set and the background set are independent Bernoulli trials with the null hypothesis that the probabilities are the same. * adjusting for backport --- .../heuristic/NXYSignificanceHeuristic.java | 2 +- .../xpack/ml/MachineLearning.java | 8 + .../heuristic/LongBinomialDistribution.java | 192 ++++++++++++++++++ .../heuristic/MlChiSquaredDistribution.java | 27 +++ .../xpack/ml/aggs/heuristic/PValueScore.java | 179 ++++++++++++++++ .../MlChiSquaredDistributionTests.java | 28 +++ .../ml/aggs/heuristic/PValueScoreTests.java | 185 +++++++++++++++++ .../ml/p_value_significant_term_score.yml | 108 ++++++++++ 8 files changed, 728 insertions(+), 1 deletion(-) create mode 100644 x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/heuristic/LongBinomialDistribution.java create mode 100644 x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/heuristic/MlChiSquaredDistribution.java create mode 100644 x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/heuristic/PValueScore.java create mode 100644 x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/aggs/heuristic/MlChiSquaredDistributionTests.java create mode 100644 x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/aggs/heuristic/PValueScoreTests.java create mode 100644 x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/p_value_significant_term_score.yml diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/heuristic/NXYSignificanceHeuristic.java 
b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/heuristic/NXYSignificanceHeuristic.java index 49e8653512724..7647b1f68b7f8 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/heuristic/NXYSignificanceHeuristic.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/heuristic/NXYSignificanceHeuristic.java @@ -83,7 +83,7 @@ public int hashCode() { } protected static class Frequencies { - double N00, N01, N10, N11, N0_, N1_, N_0, N_1, N; + public double N00, N01, N10, N11, N0_, N1_, N_0, N_1, N; } protected Frequencies computeNxys(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize, String scoreFunctionName) { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java index 5c96e38591835..6ac1bba7e1355 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java @@ -230,6 +230,7 @@ import org.elasticsearch.xpack.ml.action.TransportValidateJobConfigAction; import org.elasticsearch.xpack.ml.aggs.correlation.BucketCorrelationAggregationBuilder; import org.elasticsearch.xpack.ml.aggs.correlation.CorrelationNamedContentProvider; +import org.elasticsearch.xpack.ml.aggs.heuristic.PValueScore; import org.elasticsearch.xpack.ml.aggs.kstest.BucketCountKSTestAggregationBuilder; import org.elasticsearch.xpack.ml.aggs.inference.InferencePipelineAggregationBuilder; import org.elasticsearch.xpack.ml.annotations.AnnotationPersister; @@ -1131,6 +1132,13 @@ public List getPipelineAggregations() { ); } + @Override + public List> getSignificanceHeuristics() { + return Arrays.asList( + new SignificanceHeuristicSpec<>(PValueScore.NAME, PValueScore::new, PValueScore.PARSER) + ); + } + @Override public UnaryOperator> getIndexTemplateMetadataUpgrader() { return 
UnaryOperator.identity(); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/heuristic/LongBinomialDistribution.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/heuristic/LongBinomialDistribution.java new file mode 100644 index 0000000000000..8cccb1e4aa9ab --- /dev/null +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/heuristic/LongBinomialDistribution.java @@ -0,0 +1,192 @@ +/* @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.elasticsearch.xpack.ml.aggs.heuristic; + + +import org.apache.commons.math3.special.Gamma; +import org.apache.commons.math3.util.FastMath; +import org.apache.commons.math3.util.MathUtils; + +/** + * Modified version of org.apache.commons.math3.distribution.BinomialDistribution from version 3.6.1 + * + * It expands its usage to allow `long` values instead of restricting to `int` + */ +public class LongBinomialDistribution { + + /** 1/2 * log(2π). */ + private static final double HALF_LOG_2_PI = 0.5 * FastMath.log(MathUtils.TWO_PI); + + /** exact Stirling expansion error for certain values. 
*/ + private static final double[] EXACT_STIRLING_ERRORS = { 0.0, /* 0.0 */ + 0.1534264097200273452913848, /* 0.5 */ + 0.0810614667953272582196702, /* 1.0 */ + 0.0548141210519176538961390, /* 1.5 */ + 0.0413406959554092940938221, /* 2.0 */ + 0.03316287351993628748511048, /* 2.5 */ + 0.02767792568499833914878929, /* 3.0 */ + 0.02374616365629749597132920, /* 3.5 */ + 0.02079067210376509311152277, /* 4.0 */ + 0.01848845053267318523077934, /* 4.5 */ + 0.01664469118982119216319487, /* 5.0 */ + 0.01513497322191737887351255, /* 5.5 */ + 0.01387612882307074799874573, /* 6.0 */ + 0.01281046524292022692424986, /* 6.5 */ + 0.01189670994589177009505572, /* 7.0 */ + 0.01110455975820691732662991, /* 7.5 */ + 0.010411265261972096497478567, /* 8.0 */ + 0.009799416126158803298389475, /* 8.5 */ + 0.009255462182712732917728637, /* 9.0 */ + 0.008768700134139385462952823, /* 9.5 */ + 0.008330563433362871256469318, /* 10.0 */ + 0.007934114564314020547248100, /* 10.5 */ + 0.007573675487951840794972024, /* 11.0 */ + 0.007244554301320383179543912, /* 11.5 */ + 0.006942840107209529865664152, /* 12.0 */ + 0.006665247032707682442354394, /* 12.5 */ + 0.006408994188004207068439631, /* 13.0 */ + 0.006171712263039457647532867, /* 13.5 */ + 0.005951370112758847735624416, /* 14.0 */ + 0.005746216513010115682023589, /* 14.5 */ + 0.005554733551962801371038690 /* 15.0 */ + }; + + private final long numberOfTrials; + private final double probabilityOfSuccess; + + public LongBinomialDistribution(long numberOfTrials, double probabilityOfSuccess) { + this.numberOfTrials = numberOfTrials; + this.probabilityOfSuccess = probabilityOfSuccess; + } + + /** + * For a random variable X whose values are distributed according to this distribution, + * this method returns log(P(X = x)), where log is the natural logarithm. + * In other words, this method represents the logarithm of the probability mass function (PMF) for the distribution. 
+ * Note that due to the floating point precision and under/overflow issues, + * this method will for some distributions be more precise and faster than computing the logarithm of probability(int). + */ + public double logProbability(long x) { + if (numberOfTrials == 0) { + return (x == 0) ? 0. : Double.NEGATIVE_INFINITY; + } + double ret; + if (x < 0 || x > numberOfTrials) { + ret = Double.NEGATIVE_INFINITY; + } else { + ret = logBinomialProbability(x, + numberOfTrials, probabilityOfSuccess, + 1.0 - probabilityOfSuccess); + } + return ret; + } + + /** + * A part of the deviance portion of the saddle point approximation. + * References: + * Catherine Loader (2000). "Fast and Accurate Computation of Binomial Probabilities.". http://www.herine.net/stat/papers/dbinom.pdf + * @param x – the x value. + * @param mu – the average. + * @return : a part of the deviance. + */ + static double getDeviancePart(double x, double mu) { + double ret; + if (FastMath.abs(x - mu) < 0.1 * (x + mu)) { + double d = x - mu; + double v = d / (x + mu); + double s1 = v * d; + double s = Double.NaN; + double ej = 2.0 * x * v; + v *= v; + int j = 1; + while (s1 != s) { + s = s1; + ej *= v; + s1 = s + ej / ((j * 2) + 1); + ++j; + } + ret = s1; + } else { + ret = x * FastMath.log(x / mu) + mu - x; + } + return ret; + } + + /** + * Compute the error of Stirling's series at the given value. + * References: + * Eric W. Weisstein. "Stirling's Series." From MathWorld--A Wolfram Web + * Resource. http://mathworld.wolfram.com/StirlingsSeries.html + * + * @param z the value. + * @return the Stirling's series error. 
+ */ + static double getStirlingError(double z) { + double ret; + if (z < 15.0) { + double z2 = 2.0 * z; + if (FastMath.floor(z2) == z2) { + ret = EXACT_STIRLING_ERRORS[(int) z2]; + } else { + ret = Gamma.logGamma(z + 1.0) - (z + 0.5) * FastMath.log(z) + + z - HALF_LOG_2_PI; + } + } else { + double z2 = z * z; + ret = (0.083333333333333333333 - + (0.00277777777777777777778 - + (0.00079365079365079365079365 - + (0.000595238095238095238095238 - + 0.0008417508417508417508417508 / + z2) / z2) / z2) / z2) / z; + } + return ret; + } + + /** + * Compute the logarithm of the PMF for a binomial distribution using the saddle point expansion. + * Params: + * @param x – the value at which the probability is evaluated. + * @param n – the number of trials. + * @param p – the probability of success. + * @param q – the probability of failure (1 - p). + * @return : log(p(x)). + */ + static double logBinomialProbability(long x, long n, double p, double q) { + double ret; + if (x == 0) { + if (p < 0.1) { + ret = -getDeviancePart(n, n * q) - n * p; + } else { + ret = n * FastMath.log(q); + } + } else if (x == n) { + if (q < 0.1) { + ret = -getDeviancePart(n, n * p) - n * q; + } else { + ret = n * FastMath.log(p); + } + } else { + ret = getStirlingError(n) - getStirlingError(x) - + getStirlingError(n - x) - getDeviancePart(x, n * p) - + getDeviancePart(n - x, n * q); + double f = (MathUtils.TWO_PI * x * (n - x)) / n; + ret = -0.5 * FastMath.log(f) + ret; + } + return ret; + } +} diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/heuristic/MlChiSquaredDistribution.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/heuristic/MlChiSquaredDistribution.java new file mode 100644 index 0000000000000..53ec4d9212414 --- /dev/null +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/heuristic/MlChiSquaredDistribution.java @@ -0,0 +1,27 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. 
under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.ml.aggs.heuristic; + +import org.apache.commons.math3.distribution.GammaDistribution; +import org.apache.commons.math3.special.Gamma; + +public class MlChiSquaredDistribution { + + private final GammaDistribution gamma; + + public MlChiSquaredDistribution(double degreesOfFreedom) { + gamma = new GammaDistribution(degreesOfFreedom / 2, 2); + } + + public double survivalFunction(double x) { + return x <= 0 ? + 1 : + Gamma.regularizedGammaQ(gamma.getShape(), x / gamma.getScale()); + } + +} diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/heuristic/PValueScore.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/heuristic/PValueScore.java new file mode 100644 index 0000000000000..0149d7feec9d1 --- /dev/null +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/heuristic/PValueScore.java @@ -0,0 +1,179 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + + +package org.elasticsearch.xpack.ml.aggs.heuristic; + + +import org.apache.commons.math3.util.FastMath; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.xcontent.ConstructingObjectParser; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.search.aggregations.AggregationExecutionException; +import org.elasticsearch.search.aggregations.bucket.terms.heuristic.NXYSignificanceHeuristic; +import org.elasticsearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic; + +import java.io.IOException; + +import static org.elasticsearch.common.xcontent.ConstructingObjectParser.optionalConstructorArg; + +public class PValueScore extends NXYSignificanceHeuristic { + public static final String NAME = "p_value"; + public static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>(NAME, args -> { + boolean backgroundIsSuperset = args[0] == null || (boolean) args[0]; + return new PValueScore(backgroundIsSuperset); + }); + static { + PARSER.declareBoolean(optionalConstructorArg(), BACKGROUND_IS_SUPERSET); + } + + private static final MlChiSquaredDistribution CHI_SQUARED_DISTRIBUTION = new MlChiSquaredDistribution(1); + + public PValueScore(boolean backgroundIsSuperset) { + super(true, backgroundIsSuperset); + } + + public PValueScore(StreamInput in) throws IOException { + super(true, in.readBoolean()); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeBoolean(backgroundIsSuperset); + } + + @Override + public boolean equals(Object obj) { + if ((obj instanceof PValueScore) == false) { + return false; + } + return super.equals(obj); + } + + @Override + public int hashCode() { + int result = NAME.hashCode(); + result = 31 * result + super.hashCode(); + return result; + } + + @Override + public String getWriteableName() { + 
return NAME; + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(NAME); + builder.field(BACKGROUND_IS_SUPERSET.getPreferredName(), backgroundIsSuperset); + builder.endObject(); + return builder; + } + + public static SignificanceHeuristic parse(XContentParser parser) throws IOException { + return PARSER.apply(parser, null); + } + + /** + * This finds the p-value that the frequency of a category is unchanged on the subset, assuming + * we observe subsetFreq out of subsetSize values in total, relative to the superset, where it accounts + * for supersetFreq out of supersetSize in total. + * + * This assumes that each sample is an independent Bernoulli trial and computes the p-value + * under the null hypothesis that the probabilities are the same. Note that the independence + * assumption is quite strong and can lead to low p-values even if the fractions are very + * similar if there are many trials. We arrange for small differences in frequency to always + * have large p-values. We also artificially increase the p-value when the probability + * of the category is very small. + * + * NOTE: Since in the original calculation of `p-value`, smaller indicates more significance, the actual value returned + * is `-log(p-value)`. 
To get the original p-value from the score, simply calculate `exp(-retval)` + * + * @return -log(p-value) + */ + @Override + public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) { + Frequencies frequencies = computeNxys(subsetFreq, subsetSize, supersetFreq, supersetSize, "PValueScore"); + double docsContainTermInClass = frequencies.N11; + double allDocsInClass = frequencies.N_1; + double docsContainTermNotInClass = frequencies.N10; + double allDocsNotInClass = frequencies.N_0; + + if (docsContainTermInClass * allDocsNotInClass <= allDocsInClass * docsContainTermNotInClass) { + return 0.0; + } + + if (allDocsNotInClass == 0L || allDocsInClass == 0L) { + return 0.0; + } + + // Adjust counts to ignore ratio changes which are less than 5% + // adding 0.5 before casting to `long` rounds to the nearest whole number + docsContainTermNotInClass = (long)(Math.min( + 1.05 * docsContainTermNotInClass, + docsContainTermInClass / allDocsInClass * allDocsNotInClass + ) + 0.5); + + // casting to `long` to round down to nearest whole number + double epsAllDocsInClass = (long)eps(allDocsInClass); + double epsAllDocsNotInClass = (long)eps(allDocsNotInClass); + + if ((allDocsInClass + epsAllDocsInClass) > Long.MAX_VALUE + || (docsContainTermInClass + epsAllDocsInClass) > Long.MAX_VALUE + || (allDocsNotInClass + epsAllDocsNotInClass) > Long.MAX_VALUE + || (docsContainTermNotInClass + epsAllDocsNotInClass) > Long.MAX_VALUE) { + throw new AggregationExecutionException( + "too many documents in background and foreground sets, further restrict sets for execution" + ); + } + + double v1 = new LongBinomialDistribution( + (long)(allDocsInClass + epsAllDocsInClass), + (docsContainTermInClass + epsAllDocsInClass)/(allDocsInClass + epsAllDocsInClass) + ).logProbability((long)(docsContainTermInClass + epsAllDocsInClass)); + + double v2 = new LongBinomialDistribution( + (long)(allDocsNotInClass + epsAllDocsNotInClass), + (docsContainTermNotInClass + 
epsAllDocsNotInClass)/(allDocsNotInClass + epsAllDocsNotInClass) + ).logProbability((long)(docsContainTermNotInClass + epsAllDocsNotInClass)); + + double p2 = (docsContainTermInClass + docsContainTermNotInClass + epsAllDocsNotInClass + epsAllDocsInClass) + / (allDocsInClass + allDocsNotInClass + epsAllDocsNotInClass + epsAllDocsInClass); + + double v3 = new LongBinomialDistribution((long)(allDocsInClass + epsAllDocsInClass), p2) + .logProbability((long)(docsContainTermInClass + epsAllDocsInClass)); + + double v4 = new LongBinomialDistribution((long)(allDocsNotInClass + epsAllDocsNotInClass), p2) + .logProbability((long)(docsContainTermNotInClass + epsAllDocsNotInClass)); + + double logLikelihoodRatio = v1 + v2 - v3 - v4; + double pValue = CHI_SQUARED_DISTRIBUTION.survivalFunction(2.0 * logLikelihoodRatio); + return -FastMath.log(FastMath.max(pValue, Double.MIN_NORMAL)); + } + + private double eps(double value) { + return Math.max(0.05 * value + 0.5, 1.0); + } + + public static class PValueScoreBuilder extends NXYBuilder { + + public PValueScoreBuilder(boolean backgroundIsSuperset) { + super(true, backgroundIsSuperset); + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(NAME); + builder.field(BACKGROUND_IS_SUPERSET.getPreferredName(), backgroundIsSuperset); + builder.endObject(); + return builder; + } + } +} + diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/aggs/heuristic/MlChiSquaredDistributionTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/aggs/heuristic/MlChiSquaredDistributionTests.java new file mode 100644 index 0000000000000..d2e329fc6abdb --- /dev/null +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/aggs/heuristic/MlChiSquaredDistributionTests.java @@ -0,0 +1,28 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.ml.aggs.heuristic; + +import org.elasticsearch.test.ESTestCase; + +import static org.hamcrest.Matchers.closeTo; + +public class MlChiSquaredDistributionTests extends ESTestCase { + + public void testSurvivalFunction() { + double[] inputs = new double[] {0.210212602629, 0.554298076728, 0.831211613487, 1.14547622606, 1.61030798696, + 20.5150056524, 15.0862724694, 12.8325019940, 11.0704976935, 9.23635689978, 0.0, -1.0}; + double[] results = new double[] {0.001, 0.01, 0.025, 0.05, 0.1, 0.999, 0.990, 0.975, 0.950, 0.900, 0.0, 0.0}; + + MlChiSquaredDistribution mlChiSquaredDistribution = new MlChiSquaredDistribution(5.0); + + for (int j = 0; j < inputs.length; j++) { + assertThat(mlChiSquaredDistribution.survivalFunction(inputs[j]), closeTo(1 - results[j], 1e-9)); + } + } + +} diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/aggs/heuristic/PValueScoreTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/aggs/heuristic/PValueScoreTests.java new file mode 100644 index 0000000000000..a885dc41597f4 --- /dev/null +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/aggs/heuristic/PValueScoreTests.java @@ -0,0 +1,185 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.ml.aggs.heuristic; + +import org.apache.commons.math3.util.FastMath; +import org.elasticsearch.common.io.stream.NamedWriteableRegistry; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.xcontent.NamedXContentRegistry; +import org.elasticsearch.search.SearchModule; +import org.elasticsearch.search.aggregations.bucket.AbstractNXYSignificanceHeuristicTestCase; +import org.elasticsearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic; +import org.elasticsearch.xpack.ml.MachineLearning; + +import java.util.Arrays; +import java.util.function.Function; + +import static org.hamcrest.Matchers.allOf; +import static org.hamcrest.Matchers.closeTo; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.greaterThan; +import static org.hamcrest.Matchers.greaterThanOrEqualTo; +import static org.hamcrest.Matchers.lessThanOrEqualTo; + +public class PValueScoreTests extends AbstractNXYSignificanceHeuristicTestCase { + + private static final double eps = 1e-9; + + @Override + protected SignificanceHeuristic getHeuristic() { + return new PValueScore(randomBoolean()); + } + + @Override + protected SignificanceHeuristic getHeuristic(boolean includeNegatives, boolean backgroundIsSuperset) { + return new PValueScore(backgroundIsSuperset); + } + + @Override + public void testAssertions() { + testBackgroundAssertions(new PValueScore(true), new PValueScore(false)); + } + + @Override + protected NamedXContentRegistry xContentRegistry() { + return new NamedXContentRegistry( + new SearchModule(Settings.EMPTY, false, Arrays.asList(new MachineLearning(Settings.EMPTY, null))).getNamedXContents() + ); + } + + @Override + protected NamedWriteableRegistry writableRegistry() { + return new NamedWriteableRegistry( + new SearchModule(Settings.EMPTY, false, Arrays.asList(new MachineLearning(Settings.EMPTY, null))).getNamedWriteables() + ); + } + + public void 
testPValueScore_WhenAllDocsContainTerm() { + PValueScore pValueScore = new PValueScore(randomBoolean()); + long supersetCount = randomNonNegativeLong(); + long subsetCount = randomLongBetween(0L, supersetCount); + assertThat(pValueScore.getScore(subsetCount, subsetCount, supersetCount, supersetCount), equalTo(0.0)); + } + + public void testHighPValueScore() { + boolean backgroundIsSuperset = randomBoolean(); + long supersetCount = randomLongBetween(0L, Long.MAX_VALUE/2); + long subsetCount = randomLongBetween(0L, supersetCount); + if (backgroundIsSuperset) { + supersetCount += subsetCount; + } + + PValueScore pValueScore = new PValueScore(backgroundIsSuperset); + assertThat(pValueScore.getScore(subsetCount, subsetCount, subsetCount, supersetCount), greaterThanOrEqualTo(700.0)); + } + + public void testLowPValueScore() { + boolean backgroundIsSuperset = randomBoolean(); + long supersetCount = randomLongBetween(0L, Long.MAX_VALUE/2); + long subsetCount = randomLongBetween(0L, supersetCount); + long subsetFreqCount = randomLongBetween(0L, subsetCount/5); + if (backgroundIsSuperset) { + supersetCount += subsetCount; + } + + PValueScore pValueScore = new PValueScore(backgroundIsSuperset); + assertThat( + pValueScore.getScore(subsetFreqCount, subsetCount, subsetCount, supersetCount), + allOf(lessThanOrEqualTo(5.0), greaterThanOrEqualTo(0.0)) + ); + } + + public void testPValueScore() { + assertThat( + FastMath.exp(-new PValueScore(false).getScore(10, 100, 100, 1000)), + closeTo(1.0, eps) + ); + assertThat( + FastMath.exp(-new PValueScore(false).getScore(10, 100, 10, 1000)), + closeTo(0.002988594884934073, eps) + ); + assertThat( + FastMath.exp(-new PValueScore(false).getScore(10, 100, 200, 1000)), + closeTo(1.0, eps) + ); + assertThat( + FastMath.exp(-new PValueScore(false).getScore(20, 10000, 5, 10000)), + closeTo(0.6309430298306147, eps) + ); + } + + public void testSmallChanges() { + assertThat( + FastMath.exp(-new PValueScore(false).getScore(1, 4205, 0, 821496)), + 
closeTo(0.9572480202044421, eps) + ); + // Same(ish) ratios + assertThat( + FastMath.exp(-new PValueScore(false).getScore(10, 4205, 195, 82149)), + closeTo(0.9893886454928338, eps) + ); + assertThat( + FastMath.exp(-new PValueScore(false).getScore(10, 4205, 1950, 821496)), + closeTo(0.9867689169546193, eps) + ); + + // 4% vs 0% + assertThat( + FastMath.exp(-new PValueScore(false).getScore(168, 4205, 0, 821496)), + closeTo(1.2680918648731284e-26, eps) + ); + // 4% vs 2% + assertThat( + FastMath.exp(-new PValueScore(false).getScore(168, 4205, 16429, 821496)), + closeTo(4.78464746423625e-06, eps) + ); + // 4% vs 3.5% + assertThat( + FastMath.exp(-new PValueScore(false).getScore(168, 4205, 28752, 821496)), + closeTo(0.4728938449949742, eps) + ); + } + + public void testLargerValues() { + assertThat( + FastMath.exp(-new PValueScore(false).getScore(101000, 1000000, 500000, 5000000)), + closeTo(1.0, eps) + ); + assertThat( + FastMath.exp(-new PValueScore(false).getScore(102000, 1000000, 500000, 5000000)), + closeTo(1.0, eps) + ); + assertThat( + FastMath.exp(-new PValueScore(false).getScore(103000, 1000000, 500000, 5000000)), + closeTo(1.0, eps) + ); + } + + public void testScoreIsZero() { + for (int j = 0; j < 10; j++) { + assertThat( + new PValueScore(false).getScore((j + 1)*5, (j + 10)*100, (j + 1)*10, (j + 10)*100), + equalTo(0.0) + ); + } + } + + public void testIncreasedSubsetIncreasedScore() { + final Function getScore = (subsetFreq) -> + new PValueScore(false).getScore(subsetFreq, 5000, 5, 5000); + double priorScore = getScore.apply(5L); + assertThat(priorScore, greaterThanOrEqualTo(0.0)); + for (int j = 1; j < 11; j++) { + double nextScore = getScore.apply(j*10L); + assertThat(nextScore, greaterThanOrEqualTo(0.0)); + assertThat(nextScore, greaterThan(priorScore)); + priorScore = nextScore; + } + } + +} diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/p_value_significant_term_score.yml 
b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/p_value_significant_term_score.yml new file mode 100644 index 0000000000000..ceb066afe01d3 --- /dev/null +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/p_value_significant_term_score.yml @@ -0,0 +1,108 @@ +setup: + - skip: + features: headers + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + indices.create: + index: store + body: + mappings: + properties: + product: + type: keyword + failed: + type: boolean + + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + Content-Type: application/json + bulk: + index: store + refresh: true + body: | + { "index": {} } + { "product": "TV", "failed": false } + { "index": {} } + { "product": "TV", "failed": false } + { "index": {} } + { "product": "TV", "failed": false } + { "index": {} } + { "product": "TV", "failed": false } + { "index": {} } + { "product": "TV", "failed": true } + { "index": {} } + { "product": "TV", "failed": true } + { "index": {} } + { "product": "TV", "failed": true } + { "index": {} } + { "product": "VCR", "failed": true } + { "index": {} } + { "product": "VCR", "failed": true } + { "index": {} } + { "product": "VCR", "failed": true } + { "index": {} } + { "product": "VCR", "failed": true } + { "index": {} } + { "product": "VCR", "failed": true } + { "index": {} } + { "product": "VCR", "failed": true } + { "index": {} } + { "product": "VCR", "failed": true } + { "index": {} } + { "product": "VCR", "failed": true } + { "index": {} } + { "product": "VCR", "failed": true } + { "index": {} } + { "product": "VCR", "failed": true } + { "index": {} } + { "product": "VCR", "failed": true } + { "index": {} } + { "product": "VCR", "failed": false } + { "index": {} } + { "product": "VCR", "failed": false } + { "index": {} } 
+ { "product": "VCR", "failed": false } +--- +"Test p_value significant terms score": + - skip: + features: close_to + - do: + search: + index: store + body: > + { + "size": 0, + "query": { "bool": {"filter":[{"term": {"failed": true}}]}}, + "aggs": { + "significant_failures": { + "significant_terms": { + "field": "product", + "p_value": {"background_is_superset": true} + } + } + } + } + - close_to: { aggregations.significant_failures.buckets.0.score: {value: 1.957508054106207, error: 0.000001}} + - match: {aggregations.significant_failures.buckets.0.key: VCR} + + - do: + search: + index: store + body: > + { + "size": 0, + "query": { "bool": {"filter":[{"term": {"failed": true}}]}}, + "aggs": { + "significant_failures": { + "significant_terms": { + "field": "product", + "background_filter": {"bool": {"filter":[{"term": {"failed": false}}]}}, + "p_value": {"background_is_superset": false} + } + } + } + } + - close_to: { aggregations.significant_failures.buckets.0.score: {value: 1.957508054106207, error: 0.000001}} + - match: {aggregations.significant_failures.buckets.0.key: VCR}