Fix threshold frequency computation in Suggesters (#34312)
The `term` and `phrase` suggesters have different options to filter candidates
based on their frequencies. The `popular` mode, for instance, filters out candidate
terms that occur in fewer documents than the original term. However, the threshold
was computed from the total term frequency of a term instead of its document frequency.
This is not in line with the actual filtering, which is always based on the document
frequency. This change fixes the discrepancy and clarifies the meaning of the different
frequencies used by the suggesters. It also ensures that the threshold doesn't overflow
the maximum allowed value (Integer.MAX_VALUE).

Closes #34282
jimczi authored and kcm committed Oct 30, 2018
1 parent 86ae7dd commit ea62c16
Showing 14 changed files with 260 additions and 114 deletions.
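
As context for the diffs below, here is a minimal, self-contained sketch (not the actual Elasticsearch code) of the corrected computation: the `popular` threshold is derived from the document frequency of the original term and clamped so it can never exceed Integer.MAX_VALUE. The names LOG_BASE and frequencyPlateau mirror fields of DirectCandidateGenerator; the inputs in main are made up.

public class ThresholdSketch {
    private static final double LOG_BASE = 5;

    // Mirrors the shape of DirectCandidateGenerator#thresholdTermFrequency: the
    // threshold is derived from the doc frequency of the original term and the
    // result is clamped to Integer.MAX_VALUE so it cannot overflow an int.
    static int thresholdTermFrequency(int docFreq, long frequencyPlateau) {
        if (docFreq <= 0) {
            return 0;
        }
        long raw = Math.round(docFreq * (Math.log10(docFreq - frequencyPlateau) / Math.log10(LOG_BASE)) + 1);
        return (int) Math.min(Math.max(0, raw), Integer.MAX_VALUE);
    }

    public static void main(String[] args) {
        System.out.println(thresholdTermFrequency(100, 0));               // modest doc frequency, modest threshold
        System.out.println(thresholdTermFrequency(Integer.MAX_VALUE, 0)); // clamped instead of overflowing
    }
}

Before this change the same formula was fed the total term frequency, which can be far larger than the number of matching documents, so the resulting threshold could filter out candidates that a document-frequency comparison would have kept.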
5 changes: 0 additions & 5 deletions buildSrc/src/main/resources/checkstyle_suppressions.xml
@@ -372,9 +372,6 @@
<suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]completion[/\\]context[/\\]ContextMapping.java" checks="LineLength" />
<suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]completion[/\\]context[/\\]GeoContextMapping.java" checks="LineLength" />
<suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]completion[/\\]context[/\\]GeoQueryContext.java" checks="LineLength" />
<suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]phrase[/\\]CandidateScorer.java" checks="LineLength" />
<suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]phrase[/\\]NoisyChannelSpellChecker.java" checks="LineLength" />
<suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]phrase[/\\]WordScorer.java" checks="LineLength" />
<suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]snapshots[/\\]RestoreService.java" checks="LineLength" />
<suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]snapshots[/\\]SnapshotShardFailure.java" checks="LineLength" />
<suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]snapshots[/\\]SnapshotShardsService.java" checks="LineLength" />
@@ -564,7 +561,6 @@
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]store[/\\]CorruptedTranslogIT.java" checks="LineLength" />
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]store[/\\]IndexStoreTests.java" checks="LineLength" />
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]store[/\\]StoreTests.java" checks="LineLength" />
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]suggest[/\\]stats[/\\]SuggestStatsIT.java" checks="LineLength" />
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]translog[/\\]TranslogTests.java" checks="LineLength" />
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]indexing[/\\]IndexActionIT.java" checks="LineLength" />
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]indexlifecycle[/\\]IndexLifecycleActionIT.java" checks="LineLength" />
@@ -644,7 +640,6 @@
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]ContextCompletionSuggestSearchIT.java" checks="LineLength" />
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]completion[/\\]CategoryContextMappingTests.java" checks="LineLength" />
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]completion[/\\]GeoContextMappingTests.java" checks="LineLength" />
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]phrase[/\\]NoisyChannelSpellCheckerTests.java" checks="LineLength" />
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]similarity[/\\]SimilarityIT.java" checks="LineLength" />
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]snapshots[/\\]AbstractSnapshotIntegTestCase.java" checks="LineLength" />
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]snapshots[/\\]DedicatedClusterSnapshotRestoreIT.java" checks="LineLength" />
7 changes: 7 additions & 0 deletions docs/reference/migration/migrate_7_0/search.asciidoc
@@ -78,6 +78,13 @@ removed.
* `levenstein` - replaced by `levenshtein`
* `jarowinkler` - replaced by `jaro_winkler`

[float]
==== `popular` mode for Suggesters

The `popular` mode for Suggesters (`term` and `phrase`) now uses the document
frequency (instead of the total term frequency) of the input terms to compute
the frequency threshold for candidate suggestions.

[float]
==== Limiting the number of terms that can be used in a Terms Query request

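
Returning to the `popular` mode note above: the candidate filtering has always compared document frequencies, so the threshold now lives on the same scale. A highly simplified illustration with made-up numbers, not Elasticsearch code; the real comparison happens inside Lucene's DirectSpellChecker, driven by the threshold shown in the sketch above.

public class PopularFilterSketch {
    // Simplified stand-in for the doc-frequency check: a suggestion is only
    // interesting in "popular" mode if it occurs in more documents than the
    // threshold derived from the original term.
    static boolean keep(int candidateDocFreq, int threshold) {
        return candidateDocFreq > threshold;
    }

    public static void main(String[] args) {
        // Made-up stats: the original term appears 3 times in each of 10 documents.
        int originalDocFreq = 10;        // the scale the filter actually uses
        long originalTotalTermFreq = 30; // the scale the old threshold was computed from
        System.out.println(keep(25, originalDocFreq));             // kept
        System.out.println(keep(25, (int) originalTotalTermFreq)); // the old basis would have rejected it
    }
}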
@@ -18,6 +18,7 @@
*/
package org.elasticsearch.search.suggest.phrase;

import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet;
@@ -29,22 +30,22 @@ public abstract class CandidateGenerator {

public abstract boolean isKnownWord(BytesRef term) throws IOException;

public abstract long frequency(BytesRef term) throws IOException;
public abstract TermStats termStats(BytesRef term) throws IOException;

public CandidateSet drawCandidates(BytesRef term) throws IOException {
CandidateSet set = new CandidateSet(Candidate.EMPTY, createCandidate(term, true));
return drawCandidates(set);
}

public Candidate createCandidate(BytesRef term, boolean userInput) throws IOException {
return createCandidate(term, frequency(term), 1.0, userInput);
return createCandidate(term, termStats(term), 1.0, userInput);
}
public Candidate createCandidate(BytesRef term, long frequency, double channelScore) throws IOException {
return createCandidate(term, frequency, channelScore, false);
public Candidate createCandidate(BytesRef term, TermStats termStats, double channelScore) throws IOException {
return createCandidate(term, termStats, channelScore, false);
}

public abstract Candidate createCandidate(BytesRef term, long frequency, double channelScore, boolean userInput) throws IOException;
public abstract Candidate createCandidate(BytesRef term, TermStats termStats,
double channelScore, boolean userInput) throws IOException;

public abstract CandidateSet drawCandidates(CandidateSet set) throws IOException;

}
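
The new CandidateGenerator contract above hands around Lucene's org.apache.lucene.codecs.TermStats (a pair of docFreq and totalTermFreq) instead of a single long. Below is a small standalone sketch of how such stats can be pulled from a TermsEnum, with docFreq as the fallback when the field does not record total term frequencies. It assumes the Lucene APIs used on this branch (MultiFields, RAMDirectory, StandardAnalyzer) and is not Elasticsearch code.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;

public class TermStatsSketch {

    // Same fallback as DirectCandidateGenerator#internalTermStats: totalTermFreq()
    // returns -1 for fields that only index doc ids, so reuse docFreq() there.
    static TermStats termStats(TermsEnum termsEnum, BytesRef term) throws Exception {
        if (termsEnum.seekExact(term) == false) {
            return new TermStats(0, 0);
        }
        long totalTermFreq = termsEnum.totalTermFreq();
        return new TermStats(termsEnum.docFreq(), totalTermFreq == -1 ? termsEnum.docFreq() : totalTermFreq);
    }

    public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            doc.add(new TextField("body", "nobel nobel prize", Field.Store.NO));
            writer.addDocument(doc);
        }
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            TermsEnum termsEnum = MultiFields.getTerms(reader, "body").iterator();
            TermStats stats = termStats(termsEnum, new BytesRef("nobel"));
            // Prints docFreq=1 totalTermFreq=2 for this tiny index.
            System.out.println("docFreq=" + stats.docFreq + " totalTermFreq=" + stats.totalTermFreq);
        }
    }
}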
@@ -77,21 +77,24 @@ public void findCandidates(CandidateSet[] candidates, Candidate[] path, int ord,
} else {
if (numMissspellingsLeft > 0) {
path[ord] = current.originalTerm;
findCandidates(candidates, path, ord + 1, numMissspellingsLeft, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize));
findCandidates(candidates, path, ord + 1, numMissspellingsLeft, corrections, cutoffScore,
pathScore + scorer.score(path, candidates, ord, gramSize));
for (int i = 0; i < current.candidates.length; i++) {
path[ord] = current.candidates[i];
findCandidates(candidates, path, ord + 1, numMissspellingsLeft - 1, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize));
findCandidates(candidates, path, ord + 1, numMissspellingsLeft - 1, corrections, cutoffScore,
pathScore + scorer.score(path, candidates, ord, gramSize));
}
} else {
path[ord] = current.originalTerm;
findCandidates(candidates, path, ord + 1, 0, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize));
findCandidates(candidates, path, ord + 1, 0, corrections, cutoffScore,
pathScore + scorer.score(path, candidates, ord, gramSize));
}
}

}

private void updateTop(CandidateSet[] candidates, Candidate[] path, PriorityQueue<Correction> corrections, double cutoffScore, double score)
throws IOException {
private void updateTop(CandidateSet[] candidates, Candidate[] path,
PriorityQueue<Correction> corrections, double cutoffScore, double score) throws IOException {
score = Math.exp(score);
assert Math.abs(score - score(path, candidates)) < 0.00001 : "cur_score=" + score + ", path_score=" + score(path,candidates);
if (score > cutoffScore) {
@@ -23,6 +23,7 @@
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
@@ -48,6 +49,7 @@

import static java.lang.Math.log10;
import static java.lang.Math.max;
import static java.lang.Math.min;
import static java.lang.Math.round;

public final class DirectCandidateGenerator extends CandidateGenerator {
@@ -57,20 +59,20 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
private final SuggestMode suggestMode;
private final TermsEnum termsEnum;
private final IndexReader reader;
private final long dictSize;
private final long sumTotalTermFreq;
private static final double LOG_BASE = 5;
private final long frequencyPlateau;
private final Analyzer preFilter;
private final Analyzer postFilter;
private final double nonErrorLikelihood;
private final boolean useTotalTermFrequency;
private final CharsRefBuilder spare = new CharsRefBuilder();
private final BytesRefBuilder byteSpare = new BytesRefBuilder();
private final int numCandidates;

public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, SuggestMode suggestMode, IndexReader reader,
double nonErrorLikelihood, int numCandidates) throws IOException {
this(spellchecker, field, suggestMode, reader, nonErrorLikelihood, numCandidates, null, null, MultiFields.getTerms(reader, field));
this(spellchecker, field, suggestMode, reader, nonErrorLikelihood,
numCandidates, null, null, MultiFields.getTerms(reader, field));
}

public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, SuggestMode suggestMode, IndexReader reader,
@@ -83,14 +85,12 @@ public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, S
this.numCandidates = numCandidates;
this.suggestMode = suggestMode;
this.reader = reader;
final long dictSize = terms.getSumTotalTermFreq();
this.useTotalTermFrequency = dictSize != -1;
this.dictSize = dictSize == -1 ? reader.maxDoc() : dictSize;
this.sumTotalTermFreq = terms.getSumTotalTermFreq() == -1 ? reader.maxDoc() : terms.getSumTotalTermFreq();
this.preFilter = preFilter;
this.postFilter = postFilter;
this.nonErrorLikelihood = nonErrorLikelihood;
float thresholdFrequency = spellchecker.getThresholdFrequency();
this.frequencyPlateau = thresholdFrequency >= 1.0f ? (int) thresholdFrequency: (int)(dictSize * thresholdFrequency);
this.frequencyPlateau = thresholdFrequency >= 1.0f ? (int) thresholdFrequency: (int) (reader.maxDoc() * thresholdFrequency);
termsEnum = terms.iterator();
}

@@ -99,24 +99,29 @@ public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, S
*/
@Override
public boolean isKnownWord(BytesRef term) throws IOException {
return frequency(term) > 0;
return termStats(term).docFreq > 0;
}

/* (non-Javadoc)
* @see org.elasticsearch.search.suggest.phrase.CandidateGenerator#frequency(org.apache.lucene.util.BytesRef)
*/
@Override
public long frequency(BytesRef term) throws IOException {
public TermStats termStats(BytesRef term) throws IOException {
term = preFilter(term, spare, byteSpare);
return internalFrequency(term);
return internalTermStats(term);
}


public long internalFrequency(BytesRef term) throws IOException {
public TermStats internalTermStats(BytesRef term) throws IOException {
if (termsEnum.seekExact(term)) {
return useTotalTermFrequency ? termsEnum.totalTermFreq() : termsEnum.docFreq();
return new TermStats(termsEnum.docFreq(),
/**
* We use the {@link TermsEnum#docFreq()} for fields that don't
* record the {@link TermsEnum#totalTermFreq()}.
*/
termsEnum.totalTermFreq() == -1 ? termsEnum.docFreq() : termsEnum.totalTermFreq());
}
return 0;
return new TermStats(0, 0);
}

public String getField() {
@@ -127,15 +132,28 @@ public String getField() {
public CandidateSet drawCandidates(CandidateSet set) throws IOException {
Candidate original = set.originalTerm;
BytesRef term = preFilter(original.term, spare, byteSpare);
final long frequency = original.frequency;
spellchecker.setThresholdFrequency(this.suggestMode == SuggestMode.SUGGEST_ALWAYS ? 0 : thresholdFrequency(frequency, dictSize));
if (suggestMode != SuggestMode.SUGGEST_ALWAYS) {
/**
* We use the {@link TermStats#docFreq} to compute the frequency threshold
* because that's what {@link DirectSpellChecker#suggestSimilar} expects
* when filtering terms.
*/
int threshold = thresholdTermFrequency(original.termStats.docFreq);
if (threshold == Integer.MAX_VALUE) {
// the threshold is the max possible frequency so we can skip the search
return set;
}
spellchecker.setThresholdFrequency(threshold);
}

SuggestWord[] suggestSimilar = spellchecker.suggestSimilar(new Term(field, term), numCandidates, reader, this.suggestMode);
List<Candidate> candidates = new ArrayList<>(suggestSimilar.length);
for (int i = 0; i < suggestSimilar.length; i++) {
SuggestWord suggestWord = suggestSimilar[i];
BytesRef candidate = new BytesRef(suggestWord.string);
postFilter(new Candidate(candidate, internalFrequency(candidate), suggestWord.score,
score(suggestWord.freq, suggestWord.score, dictSize), false), spare, byteSpare, candidates);
TermStats termStats = internalTermStats(candidate);
postFilter(new Candidate(candidate, termStats,
suggestWord.score, score(termStats, suggestWord.score, sumTotalTermFreq), false), spare, byteSpare, candidates);
}
set.addCandidates(candidates);
return set;
@@ -171,28 +189,30 @@ public void nextToken() throws IOException {
BytesRef term = result.toBytesRef();
// We should not use frequency(term) here because it will analyze the term again
// If preFilter and postFilter are the same analyzer it would fail.
long freq = internalFrequency(term);
candidates.add(new Candidate(result.toBytesRef(), freq, candidate.stringDistance,
score(candidate.frequency, candidate.stringDistance, dictSize), false));
TermStats termStats = internalTermStats(term);
candidates.add(new Candidate(result.toBytesRef(), termStats, candidate.stringDistance,
score(candidate.termStats, candidate.stringDistance, sumTotalTermFreq), false));
} else {
candidates.add(new Candidate(result.toBytesRef(), candidate.frequency, nonErrorLikelihood,
score(candidate.frequency, candidate.stringDistance, dictSize), false));
candidates.add(new Candidate(result.toBytesRef(), candidate.termStats, nonErrorLikelihood,
score(candidate.termStats, candidate.stringDistance, sumTotalTermFreq), false));
}
}
}, spare);
}
}

private double score(long frequency, double errorScore, long dictionarySize) {
return errorScore * (((double)frequency + 1) / ((double)dictionarySize +1));
private double score(TermStats termStats, double errorScore, long dictionarySize) {
return errorScore * (((double)termStats.totalTermFreq + 1) / ((double)dictionarySize +1));
}

protected long thresholdFrequency(long termFrequency, long dictionarySize) {
if (termFrequency > 0) {
return max(0, round(termFrequency * (log10(termFrequency - frequencyPlateau) * (1.0 / log10(LOG_BASE))) + 1));
// package protected for test
int thresholdTermFrequency(int docFreq) {
if (docFreq > 0) {
return (int) min(
max(0, round(docFreq * (log10(docFreq - frequencyPlateau) * (1.0 / log10(LOG_BASE))) + 1)), Integer.MAX_VALUE
);
}
return 0;

}

public abstract static class TokenConsumer {
@@ -249,12 +269,12 @@ public static class Candidate implements Comparable<Candidate> {
public static final Candidate[] EMPTY = new Candidate[0];
public final BytesRef term;
public final double stringDistance;
public final long frequency;
public final TermStats termStats;
public final double score;
public final boolean userInput;

public Candidate(BytesRef term, long frequency, double stringDistance, double score, boolean userInput) {
this.frequency = frequency;
public Candidate(BytesRef term, TermStats termStats, double stringDistance, double score, boolean userInput) {
this.termStats = termStats;
this.term = term;
this.stringDistance = stringDistance;
this.score = score;
Expand All @@ -266,7 +286,7 @@ public String toString() {
return "Candidate [term=" + term.utf8ToString()
+ ", stringDistance=" + stringDistance
+ ", score=" + score
+ ", frequency=" + frequency
+ ", termStats=" + termStats
+ (userInput ? ", userInput" : "") + "]";
}

@@ -305,8 +325,8 @@ public int compareTo(Candidate other) {
}

@Override
public Candidate createCandidate(BytesRef term, long frequency, double channelScore, boolean userInput) throws IOException {
return new Candidate(term, frequency, channelScore, score(frequency, channelScore, dictSize), userInput);
public Candidate createCandidate(BytesRef term, TermStats termStats, double channelScore, boolean userInput) throws IOException {
return new Candidate(term, termStats, channelScore, score(termStats, channelScore, sumTotalTermFreq), userInput);
}

public static int analyze(Analyzer analyzer, BytesRef toAnalyze, String field, TokenConsumer consumer, CharsRefBuilder spare)
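
The score assigned to each candidate in the file above is unchanged in spirit: an error (string distance) score scaled by an add-one smoothed unigram likelihood, now expressed with TermStats.totalTermFreq over the field's sumTotalTermFreq (which falls back to maxDoc for fields without term frequencies). A tiny restatement with made-up numbers, not the actual code:

public class CandidateScoreSketch {
    // Mirrors DirectCandidateGenerator#score: errorScore * P(candidate), with
    // P(candidate) smoothed as (totalTermFreq + 1) / (sumTotalTermFreq + 1).
    static double score(long totalTermFreq, double errorScore, long sumTotalTermFreq) {
        return errorScore * (((double) totalTermFreq + 1) / ((double) sumTotalTermFreq + 1));
    }

    public static void main(String[] args) {
        // Made-up numbers: the candidate occurs 500 times in a field with 1,000,000 occurrences overall.
        System.out.println(score(500, 0.8, 1_000_000)); // ~0.0004
    }
}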
@@ -46,7 +46,7 @@ protected double scoreUnigram(Candidate word) throws IOException {
@Override
protected double scoreBigram(Candidate word, Candidate w_1) throws IOException {
join(separator, spare, w_1.term, word.term);
return (alpha + frequency(spare.get())) / (w_1.frequency + alpha * numTerms);
return (alpha + frequency(spare.get())) / (w_1.termStats.totalTermFreq + alpha * numTerms);
}

@Override
@@ -60,7 +60,7 @@ protected double scoreBigram(Candidate word, Candidate w_1) throws IOException {
if (count < 1) {
return unigramLambda * scoreUnigram(word);
}
return bigramLambda * (count / (0.5d + w_1.frequency)) + unigramLambda * scoreUnigram(word);
return bigramLambda * (count / (0.5d + w_1.termStats.totalTermFreq)) + unigramLambda * scoreUnigram(word);
}

@Override
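
The two word scorers above now read the frequency of the preceding word from TermStats.totalTermFreq. A compact restatement of the two bigram formulas they use (add-alpha smoothing and linear interpolation), illustrative only and with made-up parameter values:

public class BigramScorerSketch {
    // Add-alpha (Laplace) smoothing, as in the first hunk above:
    // P(word | w_1) = (alpha + count(w_1, word)) / (totalTermFreq(w_1) + alpha * numTerms)
    static double laplaceBigram(long bigramFreq, long precedingTotalTermFreq, double alpha, long numTerms) {
        return (alpha + bigramFreq) / (precedingTotalTermFreq + alpha * numTerms);
    }

    // Linear interpolation, as in the second hunk above: fall back to the
    // unigram score alone when the bigram was never observed.
    static double interpolatedBigram(long bigramFreq, long precedingTotalTermFreq,
                                     double bigramLambda, double unigramLambda, double unigramScore) {
        if (bigramFreq < 1) {
            return unigramLambda * unigramScore;
        }
        return bigramLambda * (bigramFreq / (0.5d + precedingTotalTermFreq)) + unigramLambda * unigramScore;
    }

    public static void main(String[] args) {
        // Made-up counts and weights for a small index.
        System.out.println(laplaceBigram(12, 300, 0.5, 10_000));
        System.out.println(interpolatedBigram(12, 300, 0.4, 0.6, 0.001));
    }
}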
