From f59c1dddda5e05bb7b9488ede7d40ce6b1c8816c Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Thu, 4 Oct 2018 21:12:58 +0200 Subject: [PATCH] Fix threshold frequency computation in Suggesters The `term` and `phrase` suggesters have different options to filter candidates based on their frequencies. The `popular` mode, for instance, filters out candidate terms that occur in fewer docs than the original term. However, when we compute this threshold we use the total term frequency of a term instead of the document frequency. This is not in line with the actual filtering, which is always based on the document frequency. This change fixes this discrepancy and clarifies the meaning of the different frequencies in use in the suggesters. It also ensures that the threshold doesn't overflow the maximum allowed value (Integer.MAX_VALUE). Closes #34282 --- .../resources/checkstyle_suppressions.xml | 5 - .../suggest/phrase/CandidateGenerator.java | 12 +- .../suggest/phrase/CandidateScorer.java | 13 +- .../phrase/DirectCandidateGenerator.java | 87 +++++++----- .../DirectCandidateGeneratorBuilder.java | 12 +- .../search/suggest/phrase/LaplaceScorer.java | 2 +- .../phrase/LinearInterpolatingScorer.java | 2 +- .../MultiCandidateGeneratorWrapper.java | 9 +- .../phrase/NoisyChannelSpellChecker.java | 13 +- .../suggest/phrase/StupidBackoffScorer.java | 8 +- .../search/suggest/phrase/WordScorer.java | 3 +- .../index/suggest/stats/SuggestStatsIT.java | 9 +- .../phrase/DirectCandidateGeneratorTests.java | 64 ++++++++- .../phrase/NoisyChannelSpellCheckerTests.java | 129 ++++++++++++------ 14 files changed, 248 insertions(+), 120 deletions(-) diff --git a/buildSrc/src/main/resources/checkstyle_suppressions.xml b/buildSrc/src/main/resources/checkstyle_suppressions.xml index 47ff196c0249a..7e15649cc13c3 100644 --- a/buildSrc/src/main/resources/checkstyle_suppressions.xml +++ b/buildSrc/src/main/resources/checkstyle_suppressions.xml @@ -398,9 +398,6 @@ - - - @@ -601,7 +598,6 @@ - @@ -688,7 +684,6 @@ - 
diff --git a/server/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateGenerator.java b/server/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateGenerator.java index f98822296b086..34a7612baf84f 100644 --- a/server/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateGenerator.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateGenerator.java @@ -18,6 +18,7 @@ */ package org.elasticsearch.search.suggest.phrase; +import org.apache.lucene.codecs.TermStats; import org.apache.lucene.util.BytesRef; import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate; import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet; @@ -29,7 +30,7 @@ public abstract class CandidateGenerator { public abstract boolean isKnownWord(BytesRef term) throws IOException; - public abstract long frequency(BytesRef term) throws IOException; + public abstract TermStats termStats(BytesRef term) throws IOException; public CandidateSet drawCandidates(BytesRef term) throws IOException { CandidateSet set = new CandidateSet(Candidate.EMPTY, createCandidate(term, true)); @@ -37,14 +38,13 @@ public CandidateSet drawCandidates(BytesRef term) throws IOException { } public Candidate createCandidate(BytesRef term, boolean userInput) throws IOException { - return createCandidate(term, frequency(term), 1.0, userInput); + return createCandidate(term, termStats(term), 1.0, userInput); } - public Candidate createCandidate(BytesRef term, long frequency, double channelScore) throws IOException { - return createCandidate(term, frequency, channelScore, false); + public Candidate createCandidate(BytesRef term, TermStats termStats, double channelScore) throws IOException { + return createCandidate(term, termStats, channelScore, false); } - public abstract Candidate createCandidate(BytesRef term, long frequency, double channelScore, boolean userInput) throws IOException; + public abstract Candidate 
createCandidate(BytesRef term, TermStats termStats, double channelScore, boolean userInput) throws IOException; public abstract CandidateSet drawCandidates(CandidateSet set) throws IOException; - } diff --git a/server/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateScorer.java b/server/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateScorer.java index 3928a16b7c9a0..d93ef42ee4ff7 100644 --- a/server/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateScorer.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateScorer.java @@ -77,21 +77,24 @@ public void findCandidates(CandidateSet[] candidates, Candidate[] path, int ord, } else { if (numMissspellingsLeft > 0) { path[ord] = current.originalTerm; - findCandidates(candidates, path, ord + 1, numMissspellingsLeft, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize)); + findCandidates(candidates, path, ord + 1, numMissspellingsLeft, corrections, cutoffScore, + pathScore + scorer.score(path, candidates, ord, gramSize)); for (int i = 0; i < current.candidates.length; i++) { path[ord] = current.candidates[i]; - findCandidates(candidates, path, ord + 1, numMissspellingsLeft - 1, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize)); + findCandidates(candidates, path, ord + 1, numMissspellingsLeft - 1, corrections, cutoffScore, + pathScore + scorer.score(path, candidates, ord, gramSize)); } } else { path[ord] = current.originalTerm; - findCandidates(candidates, path, ord + 1, 0, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize)); + findCandidates(candidates, path, ord + 1, 0, corrections, cutoffScore, + pathScore + scorer.score(path, candidates, ord, gramSize)); } } } - private void updateTop(CandidateSet[] candidates, Candidate[] path, PriorityQueue corrections, double cutoffScore, double score) - throws IOException { + private void updateTop(CandidateSet[] 
candidates, Candidate[] path, + PriorityQueue corrections, double cutoffScore, double score) throws IOException { score = Math.exp(score); assert Math.abs(score - score(path, candidates)) < 0.00001 : "cur_score=" + score + ", path_score=" + score(path,candidates); if (score > cutoffScore) { diff --git a/server/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java b/server/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java index 678b00aa13dca..1cc8486c26644 100644 --- a/server/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java @@ -23,6 +23,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.codecs.TermStats; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.Term; @@ -48,6 +49,7 @@ import static java.lang.Math.log10; import static java.lang.Math.max; +import static java.lang.Math.min; import static java.lang.Math.round; public final class DirectCandidateGenerator extends CandidateGenerator { @@ -57,20 +59,20 @@ public final class DirectCandidateGenerator extends CandidateGenerator { private final SuggestMode suggestMode; private final TermsEnum termsEnum; private final IndexReader reader; - private final long dictSize; + private final long sumTotalTermFreq; private static final double LOG_BASE = 5; private final long frequencyPlateau; private final Analyzer preFilter; private final Analyzer postFilter; private final double nonErrorLikelihood; - private final boolean useTotalTermFrequency; private final CharsRefBuilder spare = new CharsRefBuilder(); private final BytesRefBuilder byteSpare = new BytesRefBuilder(); private 
final int numCandidates; public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, SuggestMode suggestMode, IndexReader reader, double nonErrorLikelihood, int numCandidates) throws IOException { - this(spellchecker, field, suggestMode, reader, nonErrorLikelihood, numCandidates, null, null, MultiFields.getTerms(reader, field)); + this(spellchecker, field, suggestMode, reader, nonErrorLikelihood, + numCandidates, null, null, MultiFields.getTerms(reader, field)); } public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, SuggestMode suggestMode, IndexReader reader, @@ -83,14 +85,12 @@ public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, S this.numCandidates = numCandidates; this.suggestMode = suggestMode; this.reader = reader; - final long dictSize = terms.getSumTotalTermFreq(); - this.useTotalTermFrequency = dictSize != -1; - this.dictSize = dictSize == -1 ? reader.maxDoc() : dictSize; + this.sumTotalTermFreq = terms.getSumTotalTermFreq() == -1 ? reader.maxDoc() : terms.getSumTotalTermFreq(); this.preFilter = preFilter; this.postFilter = postFilter; this.nonErrorLikelihood = nonErrorLikelihood; float thresholdFrequency = spellchecker.getThresholdFrequency(); - this.frequencyPlateau = thresholdFrequency >= 1.0f ? (int) thresholdFrequency: (int)(dictSize * thresholdFrequency); + this.frequencyPlateau = thresholdFrequency >= 1.0f ? 
(int) thresholdFrequency: (int) (reader.maxDoc() * thresholdFrequency); termsEnum = terms.iterator(); } @@ -99,24 +99,29 @@ public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, S */ @Override public boolean isKnownWord(BytesRef term) throws IOException { - return frequency(term) > 0; + return termStats(term).docFreq > 0; } /* (non-Javadoc) * @see org.elasticsearch.search.suggest.phrase.CandidateGenerator#frequency(org.apache.lucene.util.BytesRef) */ @Override - public long frequency(BytesRef term) throws IOException { + public TermStats termStats(BytesRef term) throws IOException { term = preFilter(term, spare, byteSpare); - return internalFrequency(term); + return internalTermStats(term); } - public long internalFrequency(BytesRef term) throws IOException { + public TermStats internalTermStats(BytesRef term) throws IOException { if (termsEnum.seekExact(term)) { - return useTotalTermFrequency ? termsEnum.totalTermFreq() : termsEnum.docFreq(); + return new TermStats(termsEnum.docFreq(), + /** + * We use the {@link TermsEnum#docFreq()} for fields that don't + * record the {@link TermsEnum#totalTermFreq()}. + */ + termsEnum.totalTermFreq() == -1 ? termsEnum.docFreq() : termsEnum.totalTermFreq()); } - return 0; + return new TermStats(0, 0); } public String getField() { @@ -127,15 +132,28 @@ public String getField() { public CandidateSet drawCandidates(CandidateSet set) throws IOException { Candidate original = set.originalTerm; BytesRef term = preFilter(original.term, spare, byteSpare); - final long frequency = original.frequency; - spellchecker.setThresholdFrequency(this.suggestMode == SuggestMode.SUGGEST_ALWAYS ? 0 : thresholdFrequency(frequency, dictSize)); + if (suggestMode != SuggestMode.SUGGEST_ALWAYS) { + /** + * We use the {@link TermStats#docFreq} to compute the frequency threshold + * because that's what {@link DirectSpellChecker#suggestSimilar} expects + * when filtering terms. 
+ */ + int threshold = thresholdTermFrequency(original.termStats.docFreq); + if (threshold == Integer.MAX_VALUE) { + // the threshold is the max possible frequency so we can skip the search + return set; + } + spellchecker.setThresholdFrequency(threshold); + } + SuggestWord[] suggestSimilar = spellchecker.suggestSimilar(new Term(field, term), numCandidates, reader, this.suggestMode); List candidates = new ArrayList<>(suggestSimilar.length); for (int i = 0; i < suggestSimilar.length; i++) { SuggestWord suggestWord = suggestSimilar[i]; BytesRef candidate = new BytesRef(suggestWord.string); - postFilter(new Candidate(candidate, internalFrequency(candidate), suggestWord.score, - score(suggestWord.freq, suggestWord.score, dictSize), false), spare, byteSpare, candidates); + TermStats termStats = internalTermStats(candidate); + postFilter(new Candidate(candidate, termStats, + suggestWord.score, score(termStats, suggestWord.score, sumTotalTermFreq), false), spare, byteSpare, candidates); } set.addCandidates(candidates); return set; @@ -171,28 +189,29 @@ public void nextToken() throws IOException { BytesRef term = result.toBytesRef(); // We should not use frequency(term) here because it will analyze the term again // If preFilter and postFilter are the same analyzer it would fail. 
- long freq = internalFrequency(term); - candidates.add(new Candidate(result.toBytesRef(), freq, candidate.stringDistance, - score(candidate.frequency, candidate.stringDistance, dictSize), false)); + TermStats termStats = internalTermStats(term); + candidates.add(new Candidate(result.toBytesRef(), termStats, candidate.stringDistance, + score(candidate.termStats, candidate.stringDistance, sumTotalTermFreq), false)); } else { - candidates.add(new Candidate(result.toBytesRef(), candidate.frequency, nonErrorLikelihood, - score(candidate.frequency, candidate.stringDistance, dictSize), false)); + candidates.add(new Candidate(result.toBytesRef(), candidate.termStats, nonErrorLikelihood, + score(candidate.termStats, candidate.stringDistance, sumTotalTermFreq), false)); } } }, spare); } } - private double score(long frequency, double errorScore, long dictionarySize) { - return errorScore * (((double)frequency + 1) / ((double)dictionarySize +1)); + private double score(TermStats termStats, double errorScore, long dictionarySize) { + return errorScore * (((double)termStats.totalTermFreq + 1) / ((double)dictionarySize +1)); } - protected long thresholdFrequency(long termFrequency, long dictionarySize) { - if (termFrequency > 0) { - return max(0, round(termFrequency * (log10(termFrequency - frequencyPlateau) * (1.0 / log10(LOG_BASE))) + 1)); + protected int thresholdTermFrequency(int docFreq) { + if (docFreq > 0) { + return (int) min( + max(0, round(docFreq * (log10(docFreq - frequencyPlateau) * (1.0 / log10(LOG_BASE))) + 1)), Integer.MAX_VALUE + ); } return 0; - } public abstract static class TokenConsumer { @@ -249,12 +268,12 @@ public static class Candidate implements Comparable { public static final Candidate[] EMPTY = new Candidate[0]; public final BytesRef term; public final double stringDistance; - public final long frequency; + public final TermStats termStats; public final double score; public final boolean userInput; - public Candidate(BytesRef term, long frequency, 
double stringDistance, double score, boolean userInput) { - this.frequency = frequency; + public Candidate(BytesRef term, TermStats termStats, double stringDistance, double score, boolean userInput) { + this.termStats = termStats; this.term = term; this.stringDistance = stringDistance; this.score = score; @@ -266,7 +285,7 @@ public String toString() { return "Candidate [term=" + term.utf8ToString() + ", stringDistance=" + stringDistance + ", score=" + score - + ", frequency=" + frequency + + ", termStats=" + termStats + (userInput ? ", userInput" : "") + "]"; } @@ -305,8 +324,8 @@ public int compareTo(Candidate other) { } @Override - public Candidate createCandidate(BytesRef term, long frequency, double channelScore, boolean userInput) throws IOException { - return new Candidate(term, frequency, channelScore, score(frequency, channelScore, dictSize), userInput); + public Candidate createCandidate(BytesRef term, TermStats termStats, double channelScore, boolean userInput) throws IOException { + return new Candidate(term, termStats, channelScore, score(termStats, channelScore, sumTotalTermFreq), userInput); } public static int analyze(Analyzer analyzer, BytesRef toAnalyze, String field, TokenConsumer consumer, CharsRefBuilder spare) diff --git a/server/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGeneratorBuilder.java b/server/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGeneratorBuilder.java index 6fdff8d18eba0..b9e8c632ec13a 100644 --- a/server/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGeneratorBuilder.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGeneratorBuilder.java @@ -187,9 +187,9 @@ Integer size() { * possible values: *
    *
  1. score - Sort should first be based on score, then - * document frequency and then the term itself. - *
  2. frequency - Sort should first be based on document - * frequency, then score and then the term itself. + * document totalTermFrequency and then the term itself. + *
  3. totalTermFrequency - Sort should first be based on document + * totalTermFrequency, then score and then the term itself. *
*

* What the score is depends on the suggester being used. @@ -268,8 +268,8 @@ Integer maxInspections() { * frequencies. If an value higher than 1 is specified then fractional * can not be specified. Defaults to {@code 0.01}. *

- * This can be used to exclude high frequency terms from being - * suggested. High frequency terms are usually spelled correctly on top + * This can be used to exclude high totalTermFrequency terms from being + * suggested. High totalTermFrequency terms are usually spelled correctly on top * of this this also improves the suggest performance. */ public DirectCandidateGeneratorBuilder maxTermFreq(float maxTermFreq) { @@ -313,7 +313,7 @@ Integer minWordLength() { * Sets a minimal threshold in number of documents a suggested term * should appear in. This can be specified as an absolute number or as a * relative percentage of number of documents. This can improve quality - * by only suggesting high frequency terms. Defaults to 0f and is not + * by only suggesting high totalTermFrequency terms. Defaults to 0f and is not * enabled. If a value higher than 1 is specified then the number cannot * be fractional. */ diff --git a/server/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java b/server/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java index d9797a4207e22..52157a0fe8bde 100644 --- a/server/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java @@ -46,7 +46,7 @@ protected double scoreUnigram(Candidate word) throws IOException { @Override protected double scoreBigram(Candidate word, Candidate w_1) throws IOException { join(separator, spare, w_1.term, word.term); - return (alpha + frequency(spare.get())) / (w_1.frequency + alpha * numTerms); + return (alpha + frequency(spare.get())) / (w_1.termStats.totalTermFreq + alpha * numTerms); } @Override diff --git a/server/src/main/java/org/elasticsearch/search/suggest/phrase/LinearInterpolatingScorer.java b/server/src/main/java/org/elasticsearch/search/suggest/phrase/LinearInterpolatingScorer.java index c6d67fe8cf7cd..b0c9552f8a8d6 100644 --- 
a/server/src/main/java/org/elasticsearch/search/suggest/phrase/LinearInterpolatingScorer.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/phrase/LinearInterpolatingScorer.java @@ -60,7 +60,7 @@ protected double scoreBigram(Candidate word, Candidate w_1) throws IOException { if (count < 1) { return unigramLambda * scoreUnigram(word); } - return bigramLambda * (count / (0.5d + w_1.frequency)) + unigramLambda * scoreUnigram(word); + return bigramLambda * (count / (0.5d + w_1.termStats.totalTermFreq)) + unigramLambda * scoreUnigram(word); } @Override diff --git a/server/src/main/java/org/elasticsearch/search/suggest/phrase/MultiCandidateGeneratorWrapper.java b/server/src/main/java/org/elasticsearch/search/suggest/phrase/MultiCandidateGeneratorWrapper.java index 904822e389483..e0e93e26164d3 100644 --- a/server/src/main/java/org/elasticsearch/search/suggest/phrase/MultiCandidateGeneratorWrapper.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/phrase/MultiCandidateGeneratorWrapper.java @@ -18,6 +18,7 @@ */ package org.elasticsearch.search.suggest.phrase; +import org.apache.lucene.codecs.TermStats; import org.apache.lucene.util.BytesRef; import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate; import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet; @@ -41,8 +42,8 @@ public boolean isKnownWord(BytesRef term) throws IOException { } @Override - public long frequency(BytesRef term) throws IOException { - return candidateGenerator[0].frequency(term); + public TermStats termStats(BytesRef term) throws IOException { + return candidateGenerator[0].termStats(term); } @Override @@ -65,8 +66,8 @@ private CandidateSet reduce(CandidateSet set, int numCandidates) { return set; } @Override - public Candidate createCandidate(BytesRef term, long frequency, double channelScore, boolean userInput) throws IOException { - return candidateGenerator[0].createCandidate(term, frequency, channelScore, 
userInput); + public Candidate createCandidate(BytesRef term, TermStats termStats, double channelScore, boolean userInput) throws IOException { + return candidateGenerator[0].createCandidate(term, termStats, channelScore, userInput); } } diff --git a/server/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java b/server/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java index eb9694c6039b7..635fa64c59b53 100644 --- a/server/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java @@ -23,6 +23,7 @@ import org.apache.lucene.analysis.shingle.ShingleFilter; import org.apache.lucene.analysis.synonym.SynonymFilter; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.codecs.TermStats; import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; @@ -84,9 +85,9 @@ public void nextToken() throws IOException { anyUnigram = true; if (posIncAttr.getPositionIncrement() == 0 && typeAttribute.type() == SynonymFilter.TYPE_SYNONYM) { assert currentSet != null; - long freq = 0; - if ((freq = generator.frequency(term)) > 0) { - currentSet.addOneCandidate(generator.createCandidate(BytesRef.deepCopyOf(term), freq, realWordLikelihood)); + TermStats termStats = generator.termStats(term); + if (termStats.docFreq > 0) { + currentSet.addOneCandidate(generator.createCandidate(BytesRef.deepCopyOf(term), termStats, realWordLikelihood)); } } else { if (currentSet != null) { @@ -131,9 +132,11 @@ public void end() { } public Result getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator, - float maxErrors, int numCorrections, IndexReader reader, String analysisField, WordScorer scorer, float confidence, int gramSize) throws IOException { + float maxErrors, int numCorrections, 
IndexReader reader, String analysisField, + WordScorer scorer, float confidence, int gramSize) throws IOException { - return getCorrections(tokenStream(analyzer, query, new CharsRefBuilder(), analysisField), generator, maxErrors, numCorrections, scorer, confidence, gramSize); + return getCorrections(tokenStream(analyzer, query, new CharsRefBuilder(), analysisField), generator, maxErrors, + numCorrections, scorer, confidence, gramSize); } diff --git a/server/src/main/java/org/elasticsearch/search/suggest/phrase/StupidBackoffScorer.java b/server/src/main/java/org/elasticsearch/search/suggest/phrase/StupidBackoffScorer.java index 166a4182c8f81..d6862f384bebf 100644 --- a/server/src/main/java/org/elasticsearch/search/suggest/phrase/StupidBackoffScorer.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/phrase/StupidBackoffScorer.java @@ -28,8 +28,8 @@ class StupidBackoffScorer extends WordScorer { private final double discount; - StupidBackoffScorer(IndexReader reader, Terms terms,String field, double realWordLikelyhood, BytesRef separator, double discount) - throws IOException { + StupidBackoffScorer(IndexReader reader, Terms terms,String field, + double realWordLikelyhood, BytesRef separator, double discount) throws IOException { super(reader, terms, field, realWordLikelyhood, separator); this.discount = discount; } @@ -45,7 +45,7 @@ protected double scoreBigram(Candidate word, Candidate w_1) throws IOException { if (count < 1) { return discount * scoreUnigram(word); } - return count / (w_1.frequency + 0.00000000001d); + return count / (w_1.termStats.totalTermFreq + 0.00000000001d); } @Override @@ -60,7 +60,7 @@ protected double scoreTrigram(Candidate w, Candidate w_1, Candidate w_2) throws join(separator, spare, w_2.term, w_1.term, w.term); long trigramCount = frequency(spare.get()); if (trigramCount < 1) { - return discount * (bigramCount / (w_1.frequency + 0.00000000001d)); + return discount * (bigramCount / (w_1.termStats.totalTermFreq + 
0.00000000001d)); } return trigramCount / (bigramCount + 0.00000000001d); } diff --git a/server/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java b/server/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java index 22515489ee252..1bdf1c90d7d09 100644 --- a/server/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java @@ -62,7 +62,8 @@ public WordScorer(IndexReader reader, Terms terms, String field, double realWord // division by zero, by scoreUnigram. final long nTerms = terms.size(); this.numTerms = nTerms == -1 ? reader.maxDoc() : nTerms; - this.termsEnum = new FreqTermsEnum(reader, field, !useTotalTermFreq, useTotalTermFreq, null, BigArrays.NON_RECYCLING_INSTANCE); // non recycling for now + this.termsEnum = new FreqTermsEnum(reader, field, !useTotalTermFreq, useTotalTermFreq, null, + BigArrays.NON_RECYCLING_INSTANCE); // non recycling for now this.reader = reader; this.realWordLikelyhood = realWordLikelyHood; this.separator = separator; diff --git a/server/src/test/java/org/elasticsearch/index/suggest/stats/SuggestStatsIT.java b/server/src/test/java/org/elasticsearch/index/suggest/stats/SuggestStatsIT.java index 7b7e7a41783a3..c355725f62acd 100644 --- a/server/src/test/java/org/elasticsearch/index/suggest/stats/SuggestStatsIT.java +++ b/server/src/test/java/org/elasticsearch/index/suggest/stats/SuggestStatsIT.java @@ -106,9 +106,12 @@ public void testSimpleStats() throws Exception { assertThat(suggest.getSuggestCurrent(), equalTo(0L)); // check suggest count - assertThat(suggest.getSuggestCount(), equalTo((long) (suggestAllIdx * totalShards + suggestIdx1 * shardsIdx1 + suggestIdx2 * shardsIdx2))); - assertThat(indicesStats.getIndices().get("test1").getTotal().getSearch().getTotal().getSuggestCount(), equalTo((long) ((suggestAllIdx + suggestIdx1) * shardsIdx1))); - 
assertThat(indicesStats.getIndices().get("test2").getTotal().getSearch().getTotal().getSuggestCount(), equalTo((long) ((suggestAllIdx + suggestIdx2) * shardsIdx2))); + assertThat(suggest.getSuggestCount(), + equalTo((long) (suggestAllIdx * totalShards + suggestIdx1 * shardsIdx1 + suggestIdx2 * shardsIdx2))); + assertThat(indicesStats.getIndices().get("test1").getTotal().getSearch().getTotal().getSuggestCount(), + equalTo((long) ((suggestAllIdx + suggestIdx1) * shardsIdx1))); + assertThat(indicesStats.getIndices().get("test2").getTotal().getSearch().getTotal().getSuggestCount(), + equalTo((long) ((suggestAllIdx + suggestIdx2) * shardsIdx2))); logger.info("iter {}, iter1 {}, iter2 {}, {}", suggestAllIdx, suggestIdx1, suggestIdx2, endTime - startTime); // check suggest time diff --git a/server/src/test/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGeneratorTests.java b/server/src/test/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGeneratorTests.java index ca95310cd501f..bf9751b7b8dba 100644 --- a/server/src/test/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGeneratorTests.java +++ b/server/src/test/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGeneratorTests.java @@ -19,11 +19,20 @@ package org.elasticsearch.search.suggest.phrase; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; import org.apache.lucene.search.spell.DirectSpellChecker; import org.apache.lucene.search.spell.JaroWinklerDistance; import org.apache.lucene.search.spell.LevenshteinDistance; import org.apache.lucene.search.spell.LuceneLevenshteinDistance; import org.apache.lucene.search.spell.NGramDistance; +import org.apache.lucene.search.spell.SuggestMode; +import org.apache.lucene.store.Directory; +import 
org.apache.lucene.util.BytesRef; import org.elasticsearch.common.io.stream.NamedWriteableRegistry; import org.elasticsearch.common.xcontent.ToXContent; import org.elasticsearch.common.xcontent.XContent; @@ -33,7 +42,6 @@ import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.common.xcontent.XContentType; import org.elasticsearch.common.xcontent.json.JsonXContent; -import org.elasticsearch.search.suggest.phrase.PhraseSuggestionContext.DirectCandidateGenerator; import org.elasticsearch.test.ESTestCase; import java.io.IOException; @@ -134,7 +142,8 @@ public void testFromXContent() throws IOException { } } - public static void assertEqualGenerators(DirectCandidateGenerator first, DirectCandidateGenerator second) { + public static void assertEqualGenerators(PhraseSuggestionContext.DirectCandidateGenerator first, + PhraseSuggestionContext.DirectCandidateGenerator second) { assertEquals(first.field(), second.field()); assertEquals(first.accuracy(), second.accuracy(), Float.MIN_VALUE); assertEquals(first.maxTermFreq(), second.maxTermFreq(), Float.MIN_VALUE); @@ -186,6 +195,57 @@ public void testIllegalXContent() throws IOException { "[direct_generator] size doesn't support values of type: START_ARRAY"); } + public void testFrequencyThreshold() throws Exception { + try (Directory dir = newDirectory()) { + IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig()); + int numDocs = randomIntBetween(10, 20); + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + if (i == 0) { + for (int j = 0; j < numDocs; j++) { + doc.add(new TextField("field", "fooz", Field.Store.NO)); + } + } else { + doc.add(new TextField("field", "foo", Field.Store.NO)); + } + writer.addDocument(doc); + } + try (IndexReader reader = DirectoryReader.open(writer)) { + writer.close(); + DirectSpellChecker spellchecker = new DirectSpellChecker(); + DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "field", 
SuggestMode.SUGGEST_MORE_POPULAR, + reader, 0f, 10); + DirectCandidateGenerator.CandidateSet candidateSet = + generator.drawCandidates(new DirectCandidateGenerator.CandidateSet(DirectCandidateGenerator.Candidate.EMPTY, + generator.createCandidate(new BytesRef("fooz"), false))); + assertThat(candidateSet.candidates.length, equalTo(1)); + assertThat(candidateSet.candidates[0].termStats.docFreq, equalTo(numDocs - 1)); + assertThat(candidateSet.candidates[0].termStats.totalTermFreq, equalTo((long) numDocs - 1)); + + spellchecker = new DirectSpellChecker(); + spellchecker.setThresholdFrequency(0.5f); + generator = new DirectCandidateGenerator(spellchecker, "field", SuggestMode.SUGGEST_MORE_POPULAR, + reader, 0f, 10); + candidateSet = + generator.drawCandidates(new DirectCandidateGenerator.CandidateSet(DirectCandidateGenerator.Candidate.EMPTY, + generator.createCandidate(new BytesRef("fooz"), false))); + assertThat(candidateSet.candidates.length, equalTo(1)); + assertThat(candidateSet.candidates[0].termStats.docFreq, equalTo(numDocs - 1)); + assertThat(candidateSet.candidates[0].termStats.totalTermFreq, equalTo((long) numDocs - 1)); + + spellchecker = new DirectSpellChecker(); + spellchecker.setThresholdFrequency(0.5f); + generator = new DirectCandidateGenerator(spellchecker, "field", SuggestMode.SUGGEST_ALWAYS, + reader, 0f, 10); + candidateSet = + generator.drawCandidates(new DirectCandidateGenerator.CandidateSet(DirectCandidateGenerator.Candidate.EMPTY, + generator.createCandidate(new BytesRef("fooz"), false))); + assertThat(candidateSet.candidates.length, equalTo(01)); + } + } + + } + private void assertIllegalXContent(String directGenerator, Class exceptionClass, String exceptionMsg) throws IOException { try (XContentParser parser = createParser(JsonXContent.jsonXContent, directGenerator)) { diff --git a/server/src/test/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellCheckerTests.java 
b/server/src/test/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellCheckerTests.java index 40b2b023334ca..171bb0bf1697f 100644 --- a/server/src/test/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellCheckerTests.java +++ b/server/src/test/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellCheckerTests.java @@ -110,20 +110,24 @@ protected TokenStreamComponents createComponents(String fieldName) { } DirectoryReader ir = DirectoryReader.open(writer); - WordScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5f); + WordScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, + new BytesRef(" "), 0.5f); NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); DirectSpellChecker spellchecker = new DirectSpellChecker(); spellchecker.setMinQueryLength(1); - DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5); - Result result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2); + DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, + ir, 0.95, 5); + Result result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, + ir, "body", wordScorer, 1, 2); Correction[] corrections = result.corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ace")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american ace")); assertThat(result.cutoffScore, greaterThan(0d)); - result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 0, 1); + result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, + 
ir, "body", wordScorer, 0, 1); corrections = result.corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ame")); @@ -131,8 +135,10 @@ protected TokenStreamComponents createComponents(String fieldName) { assertThat(result.cutoffScore, equalTo(Double.MIN_VALUE)); suggester = new NoisyChannelSpellChecker(0.85); - wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f); - corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections; + wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, + new BytesRef(" "), 0.5f); + corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, + ir, "body", wordScorer, 0, 2).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel")); @@ -143,7 +149,8 @@ protected TokenStreamComponents createComponents(String fieldName) { assertThat(corrections[2].join(space, preTag, postTag).utf8ToString(), equalTo("xorn the god jewel")); assertThat(corrections[3].join(space, preTag, postTag).utf8ToString(), equalTo("xorr the got jewel")); - corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 2).corrections; + corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, + 4, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel")); @@ -152,8 +159,10 @@ protected 
TokenStreamComponents createComponents(String fieldName) { // Test some of the highlighting corner cases suggester = new NoisyChannelSpellChecker(0.85); - wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f); - corrections = suggester.getCorrections(wrapper, new BytesRef("Xor teh Got-Jewel"), generator, 4f, 4, ir, "body", wordScorer, 1, 2).corrections; + wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, + new BytesRef(" "), 0.5f); + corrections = suggester.getCorrections(wrapper, new BytesRef("Xor teh Got-Jewel"), generator, 4f, 4, + ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel")); @@ -187,19 +196,25 @@ protected TokenStreamComponents createComponents(String fieldName) { spellchecker.setMinPrefix(1); spellchecker.setMinQueryLength(1); suggester = new NoisyChannelSpellChecker(0.85); - wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f); - corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections; + wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, + new BytesRef(" "), 0.5f); + corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, + ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections[0].join(space).utf8ToString(), equalTo("captain america")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("captain america")); - generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer, MultiFields.getTerms(ir, "body")); - 
corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections; + generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, + 10, null, analyzer, MultiFields.getTerms(ir, "body")); + corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, + ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("captain america")); // Make sure that user supplied text is not marked as highlighted in the presence of a synonym filter - generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer, MultiFields.getTerms(ir, "body")); - corrections = suggester.getCorrections(analyzer, new BytesRef("captain usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections; + generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, + 10, null, analyzer, MultiFields.getTerms(ir, "body")); + corrections = suggester.getCorrections(analyzer, new BytesRef("captain usw"), generator, 2, 4, ir, + "body", wordScorer, 1, 2).corrections; assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("captain america")); } @@ -265,47 +280,58 @@ protected TokenStreamComponents createComponents(String fieldName) { } DirectoryReader ir = DirectoryReader.open(writer); - LaplaceScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5f); + LaplaceScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, + new BytesRef(" "), 0.5f); NoisyChannelSpellChecker 
suggester = new NoisyChannelSpellChecker(); DirectSpellChecker spellchecker = new DirectSpellChecker(); spellchecker.setMinQueryLength(1); - DirectCandidateGenerator forward = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10); - DirectCandidateGenerator reverse = new DirectCandidateGenerator(spellchecker, "body_reverse", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10, wrapper, wrapper, MultiFields.getTerms(ir, "body_reverse")); + DirectCandidateGenerator forward = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_ALWAYS, ir, + 0.95, 10); + DirectCandidateGenerator reverse = new DirectCandidateGenerator(spellchecker, "body_reverse", SuggestMode.SUGGEST_ALWAYS, ir, + 0.95, 10, wrapper, wrapper, MultiFields.getTerms(ir, "body_reverse")); CandidateGenerator generator = new MultiCandidateGeneratorWrapper(10, forward, reverse); - Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections; + Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1, + ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); generator = new MultiCandidateGeneratorWrapper(5, forward, reverse); - corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections; + corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, + "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); - corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir, "body", wordScorer, 1, 2).corrections; + corrections = 
suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir, + "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(0)); // only use forward with constant prefix - corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir, "body", wordScorer, 1, 2).corrections; + corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir, + "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); - corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections; + corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir, + "body", wordScorer, 0, 2).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel")); assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("four the god jewel")); - corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections; + corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, + "body", wordScorer, 1.5f, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); - corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections; + corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, + "body", wordScorer, 1.5f, 
2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); // Test a special case where one of the suggest term is unchanged by the postFilter, 'II' here is unchanged by the reverse analyzer. - corrections = suggester.getCorrections(wrapper, new BytesRef("Quazar II"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections; + corrections = suggester.getCorrections(wrapper, new BytesRef("Quazar II"), generator, 1, 1, ir, + "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("quasar ii")); } @@ -362,22 +388,28 @@ protected TokenStreamComponents createComponents(String fieldName) { } DirectoryReader ir = DirectoryReader.open(writer); - WordScorer wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4, 0.1); + WordScorer wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, + new BytesRef(" "), 0.5, 0.4, 0.1); NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); DirectSpellChecker spellchecker = new DirectSpellChecker(); spellchecker.setMinQueryLength(1); - DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5); - Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 3).corrections; + DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, + 0.95, 5); + Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, + ir, "body", wordScorer, 1, 3).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" 
")).utf8ToString(), equalTo("american ace")); - corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 1).corrections; + corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, + ir, "body", wordScorer, 1, 1).corrections; assertThat(corrections.length, equalTo(0)); // assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ape")); - wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4, 0.1); - corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 3).corrections; + wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, + new BytesRef(" "), 0.5, 0.4, 0.1); + corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, + ir, "body", wordScorer, 0, 3).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); @@ -387,7 +419,8 @@ protected TokenStreamComponents createComponents(String fieldName) { - corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 3).corrections; + corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, + ir, "body", wordScorer, 1, 3).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); @@ -395,7 +428,8 @@ protected TokenStreamComponents createComponents(String 
fieldName) { assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel")); - corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 100, 3).corrections; + corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, + ir, "body", wordScorer, 100, 3).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); @@ -423,17 +457,23 @@ protected TokenStreamComponents createComponents(String fieldName) { spellchecker.setMinPrefix(1); spellchecker.setMinQueryLength(1); suggester = new NoisyChannelSpellChecker(0.95); - wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5, 0.4, 0.1); - corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 3).corrections; + wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, + new BytesRef(" "), 0.5, 0.4, 0.1); + corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, + ir, "body", wordScorer, 1, 3).corrections; assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); - generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 10, null, analyzer, MultiFields.getTerms(ir, "body")); - corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 3).corrections; + generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, + 10, null, analyzer, MultiFields.getTerms(ir, "body")); + corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, + 
ir, "body", wordScorer, 1, 3).corrections; assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); - wordScorer = new StupidBackoffScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.4); - corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 2, ir, "body", wordScorer, 0, 3).corrections; + wordScorer = new StupidBackoffScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, + new BytesRef(" "), 0.4); + corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 2, + ir, "body", wordScorer, 0, 3).corrections; assertThat(corrections.length, equalTo(2)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); @@ -452,11 +492,14 @@ public void testFewDocsEgdeCase() throws Exception { } try (DirectoryReader ir = DirectoryReader.open(dir)) { - WordScorer wordScorer = new StupidBackoffScorer(ir, MultiFields.getTerms(ir, "field"), "field", 0.95d, new BytesRef(" "), 0.4f); + WordScorer wordScorer = new StupidBackoffScorer(ir, MultiFields.getTerms(ir, "field"), "field", 0.95d, + new BytesRef(" "), 0.4f); NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); DirectSpellChecker spellchecker = new DirectSpellChecker(); - DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "field", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5); - Result result = suggester.getCorrections(new StandardAnalyzer(), new BytesRef("valeu"), generator, 1, 1, ir, "field", wordScorer, 1, 2); + DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "field", + SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5); + Result result = suggester.getCorrections(new StandardAnalyzer(), new BytesRef("valeu"), generator, 1, 1, + ir, 
"field", wordScorer, 1, 2); assertThat(result.corrections.length, equalTo(1)); assertThat(result.corrections[0].join(space).utf8ToString(), equalTo("value")); }