From 12c67b2ed00910bbca8f537b035c5d68673d92b3 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 20 Aug 2021 15:53:22 +0100 Subject: [PATCH] [ML] Avoid very low p-values if the term is only a tiny fraction of the foreground set (#76764) Whilst testing the p_value scoring heuristic for significant terms introduced in #75313 it became clear we can assign arbitrarily low p-values if the overall counts are high enough for terms which constitute a very small fraction of the foreground set. Even if the difference in their frequency on the foreground and background set is statistically significant they don't explain the majority of the foreground cases and so are not of significant interest (certainly not in the use cases we have for this aggregation). We already have some mitigation for the cases that 1. the term frequency is small on both the foreground and background set, 2. the term frequencies are very similar. These offset the actual term counts by a fixed small fraction of the background counts and make the foreground and background frequencies more similar by a small relative amount, respectively. This change simply applies offsets to the term counts before making frequencies more similar. For frequencies much less than the offset we therefore get equal frequencies on the foreground and background sets and p-value tends to 1. This retains the advantage of being a smooth correction to the p-value so we get no strange discontinuities in the vicinity of the small absolute and difference thresholds for the frequency. --- .../xpack/ml/aggs/heuristic/PValueScore.java | 44 ++++++++++--------- .../ml/aggs/heuristic/PValueScoreTests.java | 17 ++++--- 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/heuristic/PValueScore.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/heuristic/PValueScore.java index 0149d7feec9d1..f365e4758cc94 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/heuristic/PValueScore.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/heuristic/PValueScore.java @@ -113,6 +113,15 @@ public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long return 0.0; } + // casting to `long` to round down to nearest whole number + double epsAllDocsInClass = (long)eps(allDocsInClass); + double epsAllDocsNotInClass = (long)eps(allDocsNotInClass); + + docsContainTermInClass += epsAllDocsInClass; + docsContainTermNotInClass += epsAllDocsNotInClass; + allDocsInClass += epsAllDocsInClass; + allDocsNotInClass += epsAllDocsNotInClass; + // Adjust counts to ignore ratio changes which are less than 5% // casting to `long` to round down to nearest whole number docsContainTermNotInClass = (long)(Math.min( @@ -120,41 +129,34 @@ public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long docsContainTermInClass / allDocsInClass * allDocsNotInClass ) + 0.5); - // casting to `long` to round down to nearest whole number - double epsAllDocsInClass = (long)eps(allDocsInClass); - double epsAllDocsNotInClass = (long)eps(allDocsNotInClass); - - if ((allDocsInClass + epsAllDocsInClass) > Long.MAX_VALUE - || (docsContainTermInClass + epsAllDocsInClass) > Long.MAX_VALUE - || (allDocsNotInClass + epsAllDocsNotInClass) > Long.MAX_VALUE - || (docsContainTermNotInClass + epsAllDocsNotInClass) > Long.MAX_VALUE) { + if (allDocsInClass > Long.MAX_VALUE + || docsContainTermInClass > Long.MAX_VALUE + || allDocsNotInClass > Long.MAX_VALUE + || docsContainTermNotInClass > Long.MAX_VALUE) { throw new AggregationExecutionException( "too many documents in background and foreground sets, further restrict sets for execution" ); } double v1 = new LongBinomialDistribution( - (long)(allDocsInClass + epsAllDocsInClass), - (docsContainTermInClass + epsAllDocsInClass)/(allDocsInClass + epsAllDocsInClass) - ).logProbability((long)(docsContainTermInClass + epsAllDocsInClass)); + (long)allDocsInClass, docsContainTermInClass / allDocsInClass + ).logProbability((long)docsContainTermInClass); double v2 = new LongBinomialDistribution( - (long)(allDocsNotInClass + epsAllDocsNotInClass), - (docsContainTermNotInClass + epsAllDocsNotInClass)/(allDocsNotInClass + epsAllDocsNotInClass) - ).logProbability((long)(docsContainTermNotInClass + epsAllDocsNotInClass)); + (long)allDocsNotInClass, docsContainTermNotInClass / allDocsNotInClass + ).logProbability((long)docsContainTermNotInClass); - double p2 = (docsContainTermInClass + docsContainTermNotInClass + epsAllDocsNotInClass + epsAllDocsInClass) - / (allDocsInClass + allDocsNotInClass + epsAllDocsNotInClass + epsAllDocsInClass); + double p2 = (docsContainTermInClass + docsContainTermNotInClass) / (allDocsInClass + allDocsNotInClass); - double v3 = new LongBinomialDistribution((long)(allDocsInClass + epsAllDocsInClass), p2) - .logProbability((long)(docsContainTermInClass + epsAllDocsInClass)); + double v3 = new LongBinomialDistribution((long)allDocsInClass, p2) + .logProbability((long)docsContainTermInClass); - double v4 = new LongBinomialDistribution((long)(allDocsNotInClass + epsAllDocsNotInClass), p2) - .logProbability((long)(docsContainTermNotInClass + epsAllDocsNotInClass)); + double v4 = new LongBinomialDistribution((long)allDocsNotInClass, p2) + .logProbability((long)docsContainTermNotInClass); double logLikelihoodRatio = v1 + v2 - v3 - v4; double pValue = CHI_SQUARED_DISTRIBUTION.survivalFunction(2.0 * logLikelihoodRatio); - return -FastMath.log(FastMath.max(pValue, Double.MIN_NORMAL)); + return FastMath.max(-FastMath.log(FastMath.max(pValue, Double.MIN_NORMAL)), 0.0); } private double eps(double value) { diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/aggs/heuristic/PValueScoreTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/aggs/heuristic/PValueScoreTests.java index 24119d31c080a..42346627b17d5 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/aggs/heuristic/PValueScoreTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/aggs/heuristic/PValueScoreTests.java @@ -22,7 +22,6 @@ import static org.hamcrest.Matchers.allOf; import static org.hamcrest.Matchers.closeTo; import static org.hamcrest.Matchers.equalTo; -import static org.hamcrest.Matchers.greaterThan; import static org.hamcrest.Matchers.greaterThanOrEqualTo; import static org.hamcrest.Matchers.lessThanOrEqualTo; @@ -110,7 +109,7 @@ public void testPValueScore() { ); assertThat( FastMath.exp(-new PValueScore(false).getScore(10, 100, 10, 1000)), - closeTo(0.002988594884934073, eps) + closeTo(0.003972388976814195, eps) ); assertThat( FastMath.exp(-new PValueScore(false).getScore(10, 100, 200, 1000)), @@ -118,23 +117,23 @@ public void testPValueScore() { ); assertThat( FastMath.exp(-new PValueScore(false).getScore(20, 10000, 5, 10000)), - closeTo(0.6309430298306147, eps) + closeTo(1.0, eps) ); } public void testSmallChanges() { assertThat( FastMath.exp(-new PValueScore(false).getScore(1, 4205, 0, 821496)), - closeTo(0.9572480202044421, eps) + closeTo(0.9999037287868853, eps) ); // Same(ish) ratios assertThat( FastMath.exp(-new PValueScore(false).getScore(10, 4205, 195, 82149)), - closeTo(0.9893886454928338, eps) + closeTo(0.9995943820612134, eps) ); assertThat( FastMath.exp(-new PValueScore(false).getScore(10, 4205, 1950, 821496)), - closeTo(0.9867689169546193, eps) + closeTo(0.9999942565428899, eps) ); // 4% vs 0% @@ -145,12 +144,12 @@ public void testSmallChanges() { // 4% vs 2% assertThat( FastMath.exp(-new PValueScore(false).getScore(168, 4205, 16429, 821496)), - closeTo(4.78464746423625e-06, eps) + closeTo(8.542608559219833e-5, eps) ); // 4% vs 3.5% assertThat( FastMath.exp(-new PValueScore(false).getScore(168, 4205, 28752, 821496)), - closeTo(0.4728938449949742, eps) + closeTo(0.8833950526957098, eps) ); } @@ -186,7 +185,7 @@ public void testIncreasedSubsetIncreasedScore() { for (int j = 1; j < 11; j++) { double nextScore = getScore.apply(j*10L); assertThat(nextScore, greaterThanOrEqualTo(0.0)); - assertThat(nextScore, greaterThan(priorScore)); + assertThat(nextScore, greaterThanOrEqualTo(priorScore)); priorScore = nextScore; } }