Skip to content

Commit

Permalink
[ML] Avoid very low p-values if the term is only a tiny fraction of t…
Browse files Browse the repository at this point in the history
…he foreground set (elastic#76764)

Whilst testing the p_value scoring heuristic for significant terms introduced
in elastic#75313 it became clear we can assign arbitrarily low p-values if the overall
counts are high enough for terms which constitute a very small fraction of the
foreground set. Even if the difference in their frequency on the foreground and
background set is statistically significant they don't explain the majority of the
foreground cases and so are not of significant interest (certainly not in the use
cases we have for this aggregation).

We already have some mitigation for the cases that 1. the term frequency is
small on both the foreground and background set, 2. the term frequencies are
very similar. These offset the actual term counts by a fixed small fraction of
the background counts and make the foreground and background frequencies
more similar by a small relative amount, respectively. This change simply applies
offsets to the term counts before making frequencies more similar. For frequencies
much less than the offset we therefore get equal frequencies on the foreground
and background sets and p-value tends to 1. This retains the advantage of being
a smooth correction to the p-value so we get no strange discontinuities in the
vicinity of the small absolute and difference thresholds for the frequency.
  • Loading branch information
tveasey authored and benwtrent committed Aug 20, 2021
1 parent f9cc151 commit 5747037
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 30 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -113,48 +113,50 @@ public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long
return 0.0;
}

// casting to `long` to round down to nearest whole number
double epsAllDocsInClass = (long)eps(allDocsInClass);
double epsAllDocsNotInClass = (long)eps(allDocsNotInClass);

docsContainTermInClass += epsAllDocsInClass;
docsContainTermNotInClass += epsAllDocsNotInClass;
allDocsInClass += epsAllDocsInClass;
allDocsNotInClass += epsAllDocsNotInClass;

// Adjust counts to ignore ratio changes which are less than 5%
// casting to `long` to round down to nearest whole number
docsContainTermNotInClass = (long)(Math.min(
1.05 * docsContainTermNotInClass,
docsContainTermInClass / allDocsInClass * allDocsNotInClass
) + 0.5);

// casting to `long` to round down to nearest whole number
double epsAllDocsInClass = (long)eps(allDocsInClass);
double epsAllDocsNotInClass = (long)eps(allDocsNotInClass);

if ((allDocsInClass + epsAllDocsInClass) > Long.MAX_VALUE
|| (docsContainTermInClass + epsAllDocsInClass) > Long.MAX_VALUE
|| (allDocsNotInClass + epsAllDocsNotInClass) > Long.MAX_VALUE
|| (docsContainTermNotInClass + epsAllDocsNotInClass) > Long.MAX_VALUE) {
if (allDocsInClass > Long.MAX_VALUE
|| docsContainTermInClass > Long.MAX_VALUE
|| allDocsNotInClass > Long.MAX_VALUE
|| docsContainTermNotInClass > Long.MAX_VALUE) {
throw new AggregationExecutionException(
"too many documents in background and foreground sets, further restrict sets for execution"
);
}

double v1 = new LongBinomialDistribution(
(long)(allDocsInClass + epsAllDocsInClass),
(docsContainTermInClass + epsAllDocsInClass)/(allDocsInClass + epsAllDocsInClass)
).logProbability((long)(docsContainTermInClass + epsAllDocsInClass));
(long)allDocsInClass, docsContainTermInClass / allDocsInClass
).logProbability((long)docsContainTermInClass);

double v2 = new LongBinomialDistribution(
(long)(allDocsNotInClass + epsAllDocsNotInClass),
(docsContainTermNotInClass + epsAllDocsNotInClass)/(allDocsNotInClass + epsAllDocsNotInClass)
).logProbability((long)(docsContainTermNotInClass + epsAllDocsNotInClass));
(long)allDocsNotInClass, docsContainTermNotInClass / allDocsNotInClass
).logProbability((long)docsContainTermNotInClass);

double p2 = (docsContainTermInClass + docsContainTermNotInClass + epsAllDocsNotInClass + epsAllDocsInClass)
/ (allDocsInClass + allDocsNotInClass + epsAllDocsNotInClass + epsAllDocsInClass);
double p2 = (docsContainTermInClass + docsContainTermNotInClass) / (allDocsInClass + allDocsNotInClass);

double v3 = new LongBinomialDistribution((long)(allDocsInClass + epsAllDocsInClass), p2)
.logProbability((long)(docsContainTermInClass + epsAllDocsInClass));
double v3 = new LongBinomialDistribution((long)allDocsInClass, p2)
.logProbability((long)docsContainTermInClass);

double v4 = new LongBinomialDistribution((long)(allDocsNotInClass + epsAllDocsNotInClass), p2)
.logProbability((long)(docsContainTermNotInClass + epsAllDocsNotInClass));
double v4 = new LongBinomialDistribution((long)allDocsNotInClass, p2)
.logProbability((long)docsContainTermNotInClass);

double logLikelihoodRatio = v1 + v2 - v3 - v4;
double pValue = CHI_SQUARED_DISTRIBUTION.survivalFunction(2.0 * logLikelihoodRatio);
return -FastMath.log(FastMath.max(pValue, Double.MIN_NORMAL));
return FastMath.max(-FastMath.log(FastMath.max(pValue, Double.MIN_NORMAL)), 0.0);
}

private double eps(double value) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
import static org.hamcrest.Matchers.allOf;
import static org.hamcrest.Matchers.closeTo;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
import static org.hamcrest.Matchers.lessThanOrEqualTo;

Expand Down Expand Up @@ -110,31 +109,31 @@ public void testPValueScore() {
);
assertThat(
FastMath.exp(-new PValueScore(false).getScore(10, 100, 10, 1000)),
closeTo(0.002988594884934073, eps)
closeTo(0.003972388976814195, eps)
);
assertThat(
FastMath.exp(-new PValueScore(false).getScore(10, 100, 200, 1000)),
closeTo(1.0, eps)
);
assertThat(
FastMath.exp(-new PValueScore(false).getScore(20, 10000, 5, 10000)),
closeTo(0.6309430298306147, eps)
closeTo(1.0, eps)
);
}

public void testSmallChanges() {
assertThat(
FastMath.exp(-new PValueScore(false).getScore(1, 4205, 0, 821496)),
closeTo(0.9572480202044421, eps)
closeTo(0.9999037287868853, eps)
);
// Same(ish) ratios
assertThat(
FastMath.exp(-new PValueScore(false).getScore(10, 4205, 195, 82149)),
closeTo(0.9893886454928338, eps)
closeTo(0.9995943820612134, eps)
);
assertThat(
FastMath.exp(-new PValueScore(false).getScore(10, 4205, 1950, 821496)),
closeTo(0.9867689169546193, eps)
closeTo(0.9999942565428899, eps)
);

// 4% vs 0%
Expand All @@ -145,12 +144,12 @@ public void testSmallChanges() {
// 4% vs 2%
assertThat(
FastMath.exp(-new PValueScore(false).getScore(168, 4205, 16429, 821496)),
closeTo(4.78464746423625e-06, eps)
closeTo(8.542608559219833e-5, eps)
);
// 4% vs 3.5%
assertThat(
FastMath.exp(-new PValueScore(false).getScore(168, 4205, 28752, 821496)),
closeTo(0.4728938449949742, eps)
closeTo(0.8833950526957098, eps)
);
}

Expand Down Expand Up @@ -186,7 +185,7 @@ public void testIncreasedSubsetIncreasedScore() {
for (int j = 1; j < 11; j++) {
double nextScore = getScore.apply(j*10L);
assertThat(nextScore, greaterThanOrEqualTo(0.0));
assertThat(nextScore, greaterThan(priorScore));
assertThat(nextScore, greaterThanOrEqualTo(priorScore));
priorScore = nextScore;
}
}
Expand Down

0 comments on commit 5747037

Please sign in to comment.