diff --git a/buildSrc/src/main/resources/checkstyle_suppressions.xml b/buildSrc/src/main/resources/checkstyle_suppressions.xml index b0ab7f9b7e9ac..212b407f02d0b 100644 --- a/buildSrc/src/main/resources/checkstyle_suppressions.xml +++ b/buildSrc/src/main/resources/checkstyle_suppressions.xml @@ -372,9 +372,6 @@ - - - @@ -564,7 +561,6 @@ - @@ -644,7 +640,6 @@ - diff --git a/docs/reference/migration/migrate_7_0/search.asciidoc b/docs/reference/migration/migrate_7_0/search.asciidoc index 6cf004da6ce8d..44e22c97f2f9b 100644 --- a/docs/reference/migration/migrate_7_0/search.asciidoc +++ b/docs/reference/migration/migrate_7_0/search.asciidoc @@ -78,6 +78,13 @@ removed. * `levenstein` - replaced by `levenshtein` * `jarowinkler` - replaced by `jaro_winkler` +[float] +==== `popular` mode for Suggesters + +The `popular` mode for Suggesters (`term` and `phrase`) now uses the doc frequency +(instead of the total term frequency) of the input terms to compute the frequency +threshold for candidate suggestions. 
+ [float] ==== Limiting the number of terms that can be used in a Terms Query request diff --git a/server/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateGenerator.java b/server/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateGenerator.java index f98822296b086..8730330d39fea 100644 --- a/server/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateGenerator.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateGenerator.java @@ -18,6 +18,7 @@ */ package org.elasticsearch.search.suggest.phrase; +import org.apache.lucene.codecs.TermStats; import org.apache.lucene.util.BytesRef; import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate; import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet; @@ -29,7 +30,7 @@ public abstract class CandidateGenerator { public abstract boolean isKnownWord(BytesRef term) throws IOException; - public abstract long frequency(BytesRef term) throws IOException; + public abstract TermStats termStats(BytesRef term) throws IOException; public CandidateSet drawCandidates(BytesRef term) throws IOException { CandidateSet set = new CandidateSet(Candidate.EMPTY, createCandidate(term, true)); @@ -37,14 +38,14 @@ public CandidateSet drawCandidates(BytesRef term) throws IOException { } public Candidate createCandidate(BytesRef term, boolean userInput) throws IOException { - return createCandidate(term, frequency(term), 1.0, userInput); + return createCandidate(term, termStats(term), 1.0, userInput); } - public Candidate createCandidate(BytesRef term, long frequency, double channelScore) throws IOException { - return createCandidate(term, frequency, channelScore, false); + public Candidate createCandidate(BytesRef term, TermStats termStats, double channelScore) throws IOException { + return createCandidate(term, termStats, channelScore, false); } - public abstract Candidate createCandidate(BytesRef term, long frequency, double 
channelScore, boolean userInput) throws IOException; + public abstract Candidate createCandidate(BytesRef term, TermStats termStats, + double channelScore, boolean userInput) throws IOException; public abstract CandidateSet drawCandidates(CandidateSet set) throws IOException; - } diff --git a/server/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateScorer.java b/server/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateScorer.java index 3928a16b7c9a0..d93ef42ee4ff7 100644 --- a/server/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateScorer.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateScorer.java @@ -77,21 +77,24 @@ public void findCandidates(CandidateSet[] candidates, Candidate[] path, int ord, } else { if (numMissspellingsLeft > 0) { path[ord] = current.originalTerm; - findCandidates(candidates, path, ord + 1, numMissspellingsLeft, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize)); + findCandidates(candidates, path, ord + 1, numMissspellingsLeft, corrections, cutoffScore, + pathScore + scorer.score(path, candidates, ord, gramSize)); for (int i = 0; i < current.candidates.length; i++) { path[ord] = current.candidates[i]; - findCandidates(candidates, path, ord + 1, numMissspellingsLeft - 1, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize)); + findCandidates(candidates, path, ord + 1, numMissspellingsLeft - 1, corrections, cutoffScore, + pathScore + scorer.score(path, candidates, ord, gramSize)); } } else { path[ord] = current.originalTerm; - findCandidates(candidates, path, ord + 1, 0, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize)); + findCandidates(candidates, path, ord + 1, 0, corrections, cutoffScore, + pathScore + scorer.score(path, candidates, ord, gramSize)); } } } - private void updateTop(CandidateSet[] candidates, Candidate[] path, PriorityQueue corrections, double cutoffScore, 
double score) - throws IOException { + private void updateTop(CandidateSet[] candidates, Candidate[] path, + PriorityQueue corrections, double cutoffScore, double score) throws IOException { score = Math.exp(score); assert Math.abs(score - score(path, candidates)) < 0.00001 : "cur_score=" + score + ", path_score=" + score(path,candidates); if (score > cutoffScore) { diff --git a/server/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java b/server/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java index 678b00aa13dca..bc2f6a8d42f65 100644 --- a/server/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java @@ -23,6 +23,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.codecs.TermStats; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.Term; @@ -48,6 +49,7 @@ import static java.lang.Math.log10; import static java.lang.Math.max; +import static java.lang.Math.min; import static java.lang.Math.round; public final class DirectCandidateGenerator extends CandidateGenerator { @@ -57,20 +59,20 @@ public final class DirectCandidateGenerator extends CandidateGenerator { private final SuggestMode suggestMode; private final TermsEnum termsEnum; private final IndexReader reader; - private final long dictSize; + private final long sumTotalTermFreq; private static final double LOG_BASE = 5; private final long frequencyPlateau; private final Analyzer preFilter; private final Analyzer postFilter; private final double nonErrorLikelihood; - private final boolean useTotalTermFrequency; private final CharsRefBuilder spare = new 
CharsRefBuilder(); private final BytesRefBuilder byteSpare = new BytesRefBuilder(); private final int numCandidates; public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, SuggestMode suggestMode, IndexReader reader, double nonErrorLikelihood, int numCandidates) throws IOException { - this(spellchecker, field, suggestMode, reader, nonErrorLikelihood, numCandidates, null, null, MultiFields.getTerms(reader, field)); + this(spellchecker, field, suggestMode, reader, nonErrorLikelihood, + numCandidates, null, null, MultiFields.getTerms(reader, field)); } public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, SuggestMode suggestMode, IndexReader reader, @@ -83,14 +85,12 @@ public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, S this.numCandidates = numCandidates; this.suggestMode = suggestMode; this.reader = reader; - final long dictSize = terms.getSumTotalTermFreq(); - this.useTotalTermFrequency = dictSize != -1; - this.dictSize = dictSize == -1 ? reader.maxDoc() : dictSize; + this.sumTotalTermFreq = terms.getSumTotalTermFreq() == -1 ? reader.maxDoc() : terms.getSumTotalTermFreq(); this.preFilter = preFilter; this.postFilter = postFilter; this.nonErrorLikelihood = nonErrorLikelihood; float thresholdFrequency = spellchecker.getThresholdFrequency(); - this.frequencyPlateau = thresholdFrequency >= 1.0f ? (int) thresholdFrequency: (int)(dictSize * thresholdFrequency); + this.frequencyPlateau = thresholdFrequency >= 1.0f ? 
(int) thresholdFrequency: (int) (reader.maxDoc() * thresholdFrequency); termsEnum = terms.iterator(); } @@ -99,24 +99,29 @@ public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, S */ @Override public boolean isKnownWord(BytesRef term) throws IOException { - return frequency(term) > 0; + return termStats(term).docFreq > 0; } /* (non-Javadoc) * @see org.elasticsearch.search.suggest.phrase.CandidateGenerator#frequency(org.apache.lucene.util.BytesRef) */ @Override - public long frequency(BytesRef term) throws IOException { + public TermStats termStats(BytesRef term) throws IOException { term = preFilter(term, spare, byteSpare); - return internalFrequency(term); + return internalTermStats(term); } - public long internalFrequency(BytesRef term) throws IOException { + public TermStats internalTermStats(BytesRef term) throws IOException { if (termsEnum.seekExact(term)) { - return useTotalTermFrequency ? termsEnum.totalTermFreq() : termsEnum.docFreq(); + return new TermStats(termsEnum.docFreq(), + /** + * We use the {@link TermsEnum#docFreq()} for fields that don't + * record the {@link TermsEnum#totalTermFreq()}. + */ + termsEnum.totalTermFreq() == -1 ? termsEnum.docFreq() : termsEnum.totalTermFreq()); } - return 0; + return new TermStats(0, 0); } public String getField() { @@ -127,15 +132,28 @@ public String getField() { public CandidateSet drawCandidates(CandidateSet set) throws IOException { Candidate original = set.originalTerm; BytesRef term = preFilter(original.term, spare, byteSpare); - final long frequency = original.frequency; - spellchecker.setThresholdFrequency(this.suggestMode == SuggestMode.SUGGEST_ALWAYS ? 0 : thresholdFrequency(frequency, dictSize)); + if (suggestMode != SuggestMode.SUGGEST_ALWAYS) { + /** + * We use the {@link TermStats#docFreq} to compute the frequency threshold + * because that's what {@link DirectSpellChecker#suggestSimilar} expects + * when filtering terms. 
+ */ + int threshold = thresholdTermFrequency(original.termStats.docFreq); + if (threshold == Integer.MAX_VALUE) { + // the threshold is the max possible frequency so we can skip the search + return set; + } + spellchecker.setThresholdFrequency(threshold); + } + SuggestWord[] suggestSimilar = spellchecker.suggestSimilar(new Term(field, term), numCandidates, reader, this.suggestMode); List candidates = new ArrayList<>(suggestSimilar.length); for (int i = 0; i < suggestSimilar.length; i++) { SuggestWord suggestWord = suggestSimilar[i]; BytesRef candidate = new BytesRef(suggestWord.string); - postFilter(new Candidate(candidate, internalFrequency(candidate), suggestWord.score, - score(suggestWord.freq, suggestWord.score, dictSize), false), spare, byteSpare, candidates); + TermStats termStats = internalTermStats(candidate); + postFilter(new Candidate(candidate, termStats, + suggestWord.score, score(termStats, suggestWord.score, sumTotalTermFreq), false), spare, byteSpare, candidates); } set.addCandidates(candidates); return set; @@ -171,28 +189,30 @@ public void nextToken() throws IOException { BytesRef term = result.toBytesRef(); // We should not use frequency(term) here because it will analyze the term again // If preFilter and postFilter are the same analyzer it would fail. 
- long freq = internalFrequency(term); - candidates.add(new Candidate(result.toBytesRef(), freq, candidate.stringDistance, - score(candidate.frequency, candidate.stringDistance, dictSize), false)); + TermStats termStats = internalTermStats(term); + candidates.add(new Candidate(result.toBytesRef(), termStats, candidate.stringDistance, + score(candidate.termStats, candidate.stringDistance, sumTotalTermFreq), false)); } else { - candidates.add(new Candidate(result.toBytesRef(), candidate.frequency, nonErrorLikelihood, - score(candidate.frequency, candidate.stringDistance, dictSize), false)); + candidates.add(new Candidate(result.toBytesRef(), candidate.termStats, nonErrorLikelihood, + score(candidate.termStats, candidate.stringDistance, sumTotalTermFreq), false)); } } }, spare); } } - private double score(long frequency, double errorScore, long dictionarySize) { - return errorScore * (((double)frequency + 1) / ((double)dictionarySize +1)); + private double score(TermStats termStats, double errorScore, long dictionarySize) { + return errorScore * (((double)termStats.totalTermFreq + 1) / ((double)dictionarySize +1)); } - protected long thresholdFrequency(long termFrequency, long dictionarySize) { - if (termFrequency > 0) { - return max(0, round(termFrequency * (log10(termFrequency - frequencyPlateau) * (1.0 / log10(LOG_BASE))) + 1)); + // package protected for test + int thresholdTermFrequency(int docFreq) { + if (docFreq > 0) { + return (int) min( + max(0, round(docFreq * (log10(docFreq - frequencyPlateau) * (1.0 / log10(LOG_BASE))) + 1)), Integer.MAX_VALUE + ); } return 0; - } public abstract static class TokenConsumer { @@ -249,12 +269,12 @@ public static class Candidate implements Comparable { public static final Candidate[] EMPTY = new Candidate[0]; public final BytesRef term; public final double stringDistance; - public final long frequency; + public final TermStats termStats; public final double score; public final boolean userInput; - public Candidate(BytesRef 
term, long frequency, double stringDistance, double score, boolean userInput) { - this.frequency = frequency; + public Candidate(BytesRef term, TermStats termStats, double stringDistance, double score, boolean userInput) { + this.termStats = termStats; this.term = term; this.stringDistance = stringDistance; this.score = score; @@ -266,7 +286,7 @@ public String toString() { return "Candidate [term=" + term.utf8ToString() + ", stringDistance=" + stringDistance + ", score=" + score - + ", frequency=" + frequency + + ", termStats=" + termStats + (userInput ? ", userInput" : "") + "]"; } @@ -305,8 +325,8 @@ public int compareTo(Candidate other) { } @Override - public Candidate createCandidate(BytesRef term, long frequency, double channelScore, boolean userInput) throws IOException { - return new Candidate(term, frequency, channelScore, score(frequency, channelScore, dictSize), userInput); + public Candidate createCandidate(BytesRef term, TermStats termStats, double channelScore, boolean userInput) throws IOException { + return new Candidate(term, termStats, channelScore, score(termStats, channelScore, sumTotalTermFreq), userInput); } public static int analyze(Analyzer analyzer, BytesRef toAnalyze, String field, TokenConsumer consumer, CharsRefBuilder spare) diff --git a/server/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java b/server/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java index d9797a4207e22..52157a0fe8bde 100644 --- a/server/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java @@ -46,7 +46,7 @@ protected double scoreUnigram(Candidate word) throws IOException { @Override protected double scoreBigram(Candidate word, Candidate w_1) throws IOException { join(separator, spare, w_1.term, word.term); - return (alpha + frequency(spare.get())) / (w_1.frequency + alpha * numTerms); + return (alpha + 
frequency(spare.get())) / (w_1.termStats.totalTermFreq + alpha * numTerms); } @Override diff --git a/server/src/main/java/org/elasticsearch/search/suggest/phrase/LinearInterpolatingScorer.java b/server/src/main/java/org/elasticsearch/search/suggest/phrase/LinearInterpolatingScorer.java index c6d67fe8cf7cd..b0c9552f8a8d6 100644 --- a/server/src/main/java/org/elasticsearch/search/suggest/phrase/LinearInterpolatingScorer.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/phrase/LinearInterpolatingScorer.java @@ -60,7 +60,7 @@ protected double scoreBigram(Candidate word, Candidate w_1) throws IOException { if (count < 1) { return unigramLambda * scoreUnigram(word); } - return bigramLambda * (count / (0.5d + w_1.frequency)) + unigramLambda * scoreUnigram(word); + return bigramLambda * (count / (0.5d + w_1.termStats.totalTermFreq)) + unigramLambda * scoreUnigram(word); } @Override diff --git a/server/src/main/java/org/elasticsearch/search/suggest/phrase/MultiCandidateGeneratorWrapper.java b/server/src/main/java/org/elasticsearch/search/suggest/phrase/MultiCandidateGeneratorWrapper.java index 904822e389483..e0e93e26164d3 100644 --- a/server/src/main/java/org/elasticsearch/search/suggest/phrase/MultiCandidateGeneratorWrapper.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/phrase/MultiCandidateGeneratorWrapper.java @@ -18,6 +18,7 @@ */ package org.elasticsearch.search.suggest.phrase; +import org.apache.lucene.codecs.TermStats; import org.apache.lucene.util.BytesRef; import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate; import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet; @@ -41,8 +42,8 @@ public boolean isKnownWord(BytesRef term) throws IOException { } @Override - public long frequency(BytesRef term) throws IOException { - return candidateGenerator[0].frequency(term); + public TermStats termStats(BytesRef term) throws IOException { + return candidateGenerator[0].termStats(term); } 
@Override @@ -65,8 +66,8 @@ private CandidateSet reduce(CandidateSet set, int numCandidates) { return set; } @Override - public Candidate createCandidate(BytesRef term, long frequency, double channelScore, boolean userInput) throws IOException { - return candidateGenerator[0].createCandidate(term, frequency, channelScore, userInput); + public Candidate createCandidate(BytesRef term, TermStats termStats, double channelScore, boolean userInput) throws IOException { + return candidateGenerator[0].createCandidate(term, termStats, channelScore, userInput); } } diff --git a/server/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java b/server/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java index eb9694c6039b7..635fa64c59b53 100644 --- a/server/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java @@ -23,6 +23,7 @@ import org.apache.lucene.analysis.shingle.ShingleFilter; import org.apache.lucene.analysis.synonym.SynonymFilter; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.codecs.TermStats; import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; @@ -84,9 +85,9 @@ public void nextToken() throws IOException { anyUnigram = true; if (posIncAttr.getPositionIncrement() == 0 && typeAttribute.type() == SynonymFilter.TYPE_SYNONYM) { assert currentSet != null; - long freq = 0; - if ((freq = generator.frequency(term)) > 0) { - currentSet.addOneCandidate(generator.createCandidate(BytesRef.deepCopyOf(term), freq, realWordLikelihood)); + TermStats termStats = generator.termStats(term); + if (termStats.docFreq > 0) { + currentSet.addOneCandidate(generator.createCandidate(BytesRef.deepCopyOf(term), termStats, realWordLikelihood)); } } else { if (currentSet != null) { @@ -131,9 
+132,11 @@ public void end() { } public Result getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator, - float maxErrors, int numCorrections, IndexReader reader, String analysisField, WordScorer scorer, float confidence, int gramSize) throws IOException { + float maxErrors, int numCorrections, IndexReader reader, String analysisField, + WordScorer scorer, float confidence, int gramSize) throws IOException { - return getCorrections(tokenStream(analyzer, query, new CharsRefBuilder(), analysisField), generator, maxErrors, numCorrections, scorer, confidence, gramSize); + return getCorrections(tokenStream(analyzer, query, new CharsRefBuilder(), analysisField), generator, maxErrors, + numCorrections, scorer, confidence, gramSize); } diff --git a/server/src/main/java/org/elasticsearch/search/suggest/phrase/StupidBackoffScorer.java b/server/src/main/java/org/elasticsearch/search/suggest/phrase/StupidBackoffScorer.java index 166a4182c8f81..d6862f384bebf 100644 --- a/server/src/main/java/org/elasticsearch/search/suggest/phrase/StupidBackoffScorer.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/phrase/StupidBackoffScorer.java @@ -28,8 +28,8 @@ class StupidBackoffScorer extends WordScorer { private final double discount; - StupidBackoffScorer(IndexReader reader, Terms terms,String field, double realWordLikelyhood, BytesRef separator, double discount) - throws IOException { + StupidBackoffScorer(IndexReader reader, Terms terms,String field, + double realWordLikelyhood, BytesRef separator, double discount) throws IOException { super(reader, terms, field, realWordLikelyhood, separator); this.discount = discount; } @@ -45,7 +45,7 @@ protected double scoreBigram(Candidate word, Candidate w_1) throws IOException { if (count < 1) { return discount * scoreUnigram(word); } - return count / (w_1.frequency + 0.00000000001d); + return count / (w_1.termStats.totalTermFreq + 0.00000000001d); } @Override @@ -60,7 +60,7 @@ protected double 
scoreTrigram(Candidate w, Candidate w_1, Candidate w_2) throws join(separator, spare, w_2.term, w_1.term, w.term); long trigramCount = frequency(spare.get()); if (trigramCount < 1) { - return discount * (bigramCount / (w_1.frequency + 0.00000000001d)); + return discount * (bigramCount / (w_1.termStats.totalTermFreq + 0.00000000001d)); } return trigramCount / (bigramCount + 0.00000000001d); } diff --git a/server/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java b/server/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java index 22515489ee252..1bdf1c90d7d09 100644 --- a/server/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java @@ -62,7 +62,8 @@ public WordScorer(IndexReader reader, Terms terms, String field, double realWord // division by zero, by scoreUnigram. final long nTerms = terms.size(); this.numTerms = nTerms == -1 ? reader.maxDoc() : nTerms; - this.termsEnum = new FreqTermsEnum(reader, field, !useTotalTermFreq, useTotalTermFreq, null, BigArrays.NON_RECYCLING_INSTANCE); // non recycling for now + this.termsEnum = new FreqTermsEnum(reader, field, !useTotalTermFreq, useTotalTermFreq, null, + BigArrays.NON_RECYCLING_INSTANCE); // non recycling for now this.reader = reader; this.realWordLikelyhood = realWordLikelyHood; this.separator = separator; diff --git a/server/src/test/java/org/elasticsearch/index/suggest/stats/SuggestStatsIT.java b/server/src/test/java/org/elasticsearch/index/suggest/stats/SuggestStatsIT.java index 7b7e7a41783a3..c355725f62acd 100644 --- a/server/src/test/java/org/elasticsearch/index/suggest/stats/SuggestStatsIT.java +++ b/server/src/test/java/org/elasticsearch/index/suggest/stats/SuggestStatsIT.java @@ -106,9 +106,12 @@ public void testSimpleStats() throws Exception { assertThat(suggest.getSuggestCurrent(), equalTo(0L)); // check suggest count - assertThat(suggest.getSuggestCount(), equalTo((long) 
(suggestAllIdx * totalShards + suggestIdx1 * shardsIdx1 + suggestIdx2 * shardsIdx2))); - assertThat(indicesStats.getIndices().get("test1").getTotal().getSearch().getTotal().getSuggestCount(), equalTo((long) ((suggestAllIdx + suggestIdx1) * shardsIdx1))); - assertThat(indicesStats.getIndices().get("test2").getTotal().getSearch().getTotal().getSuggestCount(), equalTo((long) ((suggestAllIdx + suggestIdx2) * shardsIdx2))); + assertThat(suggest.getSuggestCount(), + equalTo((long) (suggestAllIdx * totalShards + suggestIdx1 * shardsIdx1 + suggestIdx2 * shardsIdx2))); + assertThat(indicesStats.getIndices().get("test1").getTotal().getSearch().getTotal().getSuggestCount(), + equalTo((long) ((suggestAllIdx + suggestIdx1) * shardsIdx1))); + assertThat(indicesStats.getIndices().get("test2").getTotal().getSearch().getTotal().getSuggestCount(), + equalTo((long) ((suggestAllIdx + suggestIdx2) * shardsIdx2))); logger.info("iter {}, iter1 {}, iter2 {}, {}", suggestAllIdx, suggestIdx1, suggestIdx2, endTime - startTime); // check suggest time diff --git a/server/src/test/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGeneratorTests.java b/server/src/test/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGeneratorTests.java index 4ae19a1b6b04b..5bff24b934837 100644 --- a/server/src/test/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGeneratorTests.java +++ b/server/src/test/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGeneratorTests.java @@ -19,11 +19,20 @@ package org.elasticsearch.search.suggest.phrase; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; import org.apache.lucene.search.spell.DirectSpellChecker; import org.apache.lucene.search.spell.JaroWinklerDistance; import 
org.apache.lucene.search.spell.LevenshteinDistance; import org.apache.lucene.search.spell.LuceneLevenshteinDistance; import org.apache.lucene.search.spell.NGramDistance; +import org.apache.lucene.search.spell.SuggestMode; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.io.stream.NamedWriteableRegistry; import org.elasticsearch.common.xcontent.ToXContent; import org.elasticsearch.common.xcontent.XContentBuilder; @@ -32,7 +41,6 @@ import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.common.xcontent.XContentType; import org.elasticsearch.common.xcontent.json.JsonXContent; -import org.elasticsearch.search.suggest.phrase.PhraseSuggestionContext.DirectCandidateGenerator; import org.elasticsearch.test.ESTestCase; import java.io.IOException; @@ -133,7 +141,8 @@ public void testFromXContent() throws IOException { } } - public static void assertEqualGenerators(DirectCandidateGenerator first, DirectCandidateGenerator second) { + public static void assertEqualGenerators(PhraseSuggestionContext.DirectCandidateGenerator first, + PhraseSuggestionContext.DirectCandidateGenerator second) { assertEquals(first.field(), second.field()); assertEquals(first.accuracy(), second.accuracy(), Float.MIN_VALUE); assertEquals(first.maxTermFreq(), second.maxTermFreq(), Float.MIN_VALUE); @@ -176,6 +185,66 @@ public void testIllegalXContent() throws IOException { "[direct_generator] size doesn't support values of type: START_ARRAY"); } + public void testFrequencyThreshold() throws Exception { + try (Directory dir = newDirectory()) { + IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig()); + int numDocs = randomIntBetween(10, 20); + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + if (i == 0) { + for (int j = 0; j < numDocs; j++) { + doc.add(new TextField("field", "fooz", Field.Store.NO)); + } + } else { + doc.add(new TextField("field", "foo", Field.Store.NO)); + } + 
writer.addDocument(doc); + } + try (IndexReader reader = DirectoryReader.open(writer)) { + writer.close(); + DirectSpellChecker spellchecker = new DirectSpellChecker(); + DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "field", SuggestMode.SUGGEST_MORE_POPULAR, + reader, 0f, 10); + DirectCandidateGenerator.CandidateSet candidateSet = + generator.drawCandidates(new DirectCandidateGenerator.CandidateSet(DirectCandidateGenerator.Candidate.EMPTY, + generator.createCandidate(new BytesRef("fooz"), false))); + assertThat(candidateSet.candidates.length, equalTo(1)); + assertThat(candidateSet.candidates[0].termStats.docFreq, equalTo(numDocs - 1)); + assertThat(candidateSet.candidates[0].termStats.totalTermFreq, equalTo((long) numDocs - 1)); + + // test that it doesn't overflow + assertThat(generator.thresholdTermFrequency(Integer.MAX_VALUE), equalTo(Integer.MAX_VALUE)); + + spellchecker = new DirectSpellChecker(); + spellchecker.setThresholdFrequency(0.5f); + generator = new DirectCandidateGenerator(spellchecker, "field", SuggestMode.SUGGEST_MORE_POPULAR, + reader, 0f, 10); + candidateSet = + generator.drawCandidates(new DirectCandidateGenerator.CandidateSet(DirectCandidateGenerator.Candidate.EMPTY, + generator.createCandidate(new BytesRef("fooz"), false))); + assertThat(candidateSet.candidates.length, equalTo(1)); + assertThat(candidateSet.candidates[0].termStats.docFreq, equalTo(numDocs - 1)); + assertThat(candidateSet.candidates[0].termStats.totalTermFreq, equalTo((long) numDocs - 1)); + + // test that it doesn't overflow + assertThat(generator.thresholdTermFrequency(Integer.MAX_VALUE), equalTo(Integer.MAX_VALUE)); + + spellchecker = new DirectSpellChecker(); + spellchecker.setThresholdFrequency(0.5f); + generator = new DirectCandidateGenerator(spellchecker, "field", SuggestMode.SUGGEST_ALWAYS, + reader, 0f, 10); + candidateSet = + generator.drawCandidates(new DirectCandidateGenerator.CandidateSet(DirectCandidateGenerator.Candidate.EMPTY, 
+ generator.createCandidate(new BytesRef("fooz"), false))); + assertThat(candidateSet.candidates.length, equalTo(01)); + + // test that it doesn't overflow + assertThat(generator.thresholdTermFrequency(Integer.MAX_VALUE), equalTo(Integer.MAX_VALUE)); + } + } + + } + private void assertIllegalXContent(String directGenerator, Class exceptionClass, String exceptionMsg) throws IOException { try (XContentParser parser = createParser(JsonXContent.jsonXContent, directGenerator)) { diff --git a/server/src/test/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellCheckerTests.java b/server/src/test/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellCheckerTests.java index 40b2b023334ca..171bb0bf1697f 100644 --- a/server/src/test/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellCheckerTests.java +++ b/server/src/test/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellCheckerTests.java @@ -110,20 +110,24 @@ protected TokenStreamComponents createComponents(String fieldName) { } DirectoryReader ir = DirectoryReader.open(writer); - WordScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5f); + WordScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, + new BytesRef(" "), 0.5f); NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); DirectSpellChecker spellchecker = new DirectSpellChecker(); spellchecker.setMinQueryLength(1); - DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5); - Result result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2); + DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, + ir, 0.95, 5); + Result result = suggester.getCorrections(wrapper, new BytesRef("american 
ame"), generator, 1, 1, + ir, "body", wordScorer, 1, 2); Correction[] corrections = result.corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ace")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american ace")); assertThat(result.cutoffScore, greaterThan(0d)); - result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 0, 1); + result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, + ir, "body", wordScorer, 0, 1); corrections = result.corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ame")); @@ -131,8 +135,10 @@ protected TokenStreamComponents createComponents(String fieldName) { assertThat(result.cutoffScore, equalTo(Double.MIN_VALUE)); suggester = new NoisyChannelSpellChecker(0.85); - wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f); - corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections; + wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, + new BytesRef(" "), 0.5f); + corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, + ir, "body", wordScorer, 0, 2).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel")); @@ -143,7 +149,8 @@ protected TokenStreamComponents createComponents(String fieldName) { assertThat(corrections[2].join(space, preTag, postTag).utf8ToString(), equalTo("xorn the god jewel")); assertThat(corrections[3].join(space, preTag, 
postTag).utf8ToString(), equalTo("xorr the got jewel")); - corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 2).corrections; + corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, + 4, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel")); @@ -152,8 +159,10 @@ protected TokenStreamComponents createComponents(String fieldName) { // Test some of the highlighting corner cases suggester = new NoisyChannelSpellChecker(0.85); - wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f); - corrections = suggester.getCorrections(wrapper, new BytesRef("Xor teh Got-Jewel"), generator, 4f, 4, ir, "body", wordScorer, 1, 2).corrections; + wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, + new BytesRef(" "), 0.5f); + corrections = suggester.getCorrections(wrapper, new BytesRef("Xor teh Got-Jewel"), generator, 4f, 4, + ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel")); @@ -187,19 +196,25 @@ protected TokenStreamComponents createComponents(String fieldName) { spellchecker.setMinPrefix(1); spellchecker.setMinQueryLength(1); suggester = new NoisyChannelSpellChecker(0.85); - wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f); - corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections; + wordScorer = new 
LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, + new BytesRef(" "), 0.5f); + corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, + ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections[0].join(space).utf8ToString(), equalTo("captain america")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("captain america")); - generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer, MultiFields.getTerms(ir, "body")); - corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections; + generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, + 10, null, analyzer, MultiFields.getTerms(ir, "body")); + corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, + ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("captain america")); // Make sure that user supplied text is not marked as highlighted in the presence of a synonym filter - generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer, MultiFields.getTerms(ir, "body")); - corrections = suggester.getCorrections(analyzer, new BytesRef("captain usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections; + generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, + 10, null, analyzer, MultiFields.getTerms(ir, "body")); + corrections = suggester.getCorrections(analyzer, new BytesRef("captain usw"), generator, 2, 4, ir, + "body", wordScorer, 1, 2).corrections; assertThat(corrections[0].join(new BytesRef(" 
")).utf8ToString(), equalTo("captain america")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("captain america")); } @@ -265,47 +280,58 @@ protected TokenStreamComponents createComponents(String fieldName) { } DirectoryReader ir = DirectoryReader.open(writer); - LaplaceScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5f); + LaplaceScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, + new BytesRef(" "), 0.5f); NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); DirectSpellChecker spellchecker = new DirectSpellChecker(); spellchecker.setMinQueryLength(1); - DirectCandidateGenerator forward = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10); - DirectCandidateGenerator reverse = new DirectCandidateGenerator(spellchecker, "body_reverse", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10, wrapper, wrapper, MultiFields.getTerms(ir, "body_reverse")); + DirectCandidateGenerator forward = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_ALWAYS, ir, + 0.95, 10); + DirectCandidateGenerator reverse = new DirectCandidateGenerator(spellchecker, "body_reverse", SuggestMode.SUGGEST_ALWAYS, ir, + 0.95, 10, wrapper, wrapper, MultiFields.getTerms(ir, "body_reverse")); CandidateGenerator generator = new MultiCandidateGeneratorWrapper(10, forward, reverse); - Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections; + Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1, + ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); generator = new MultiCandidateGeneratorWrapper(5, forward, 
reverse); - corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections; + corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, + "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); - corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir, "body", wordScorer, 1, 2).corrections; + corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir, + "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(0)); // only use forward with constant prefix - corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir, "body", wordScorer, 1, 2).corrections; + corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir, + "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); - corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections; + corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir, + "body", wordScorer, 0, 2).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel")); assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("four the god jewel")); - corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections; + 
corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, + "body", wordScorer, 1.5f, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); - corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections; + corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, + "body", wordScorer, 1.5f, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); // Test a special case where one of the suggest term is unchanged by the postFilter, 'II' here is unchanged by the reverse analyzer. - corrections = suggester.getCorrections(wrapper, new BytesRef("Quazar II"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections; + corrections = suggester.getCorrections(wrapper, new BytesRef("Quazar II"), generator, 1, 1, ir, + "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("quasar ii")); } @@ -362,22 +388,28 @@ protected TokenStreamComponents createComponents(String fieldName) { } DirectoryReader ir = DirectoryReader.open(writer); - WordScorer wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4, 0.1); + WordScorer wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, + new BytesRef(" "), 0.5, 0.4, 0.1); NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); DirectSpellChecker spellchecker = new DirectSpellChecker(); spellchecker.setMinQueryLength(1); - DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", 
SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5); - Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 3).corrections; + DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, + 0.95, 5); + Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, + ir, "body", wordScorer, 1, 3).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); - corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 1).corrections; + corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, + ir, "body", wordScorer, 1, 1).corrections; assertThat(corrections.length, equalTo(0)); // assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ape")); - wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4, 0.1); - corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 3).corrections; + wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, + new BytesRef(" "), 0.5, 0.4, 0.1); + corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, + ir, "body", wordScorer, 0, 3).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); @@ -387,7 +419,8 @@ protected TokenStreamComponents createComponents(String fieldName) { - corrections = 
suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 3).corrections; + corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, + ir, "body", wordScorer, 1, 3).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); @@ -395,7 +428,8 @@ protected TokenStreamComponents createComponents(String fieldName) { assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel")); - corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 100, 3).corrections; + corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, + ir, "body", wordScorer, 100, 3).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); @@ -423,17 +457,23 @@ protected TokenStreamComponents createComponents(String fieldName) { spellchecker.setMinPrefix(1); spellchecker.setMinQueryLength(1); suggester = new NoisyChannelSpellChecker(0.95); - wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5, 0.4, 0.1); - corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 3).corrections; + wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, + new BytesRef(" "), 0.5, 0.4, 0.1); + corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, + ir, "body", wordScorer, 1, 3).corrections; assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), 
equalTo("captain america")); - generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 10, null, analyzer, MultiFields.getTerms(ir, "body")); - corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 3).corrections; + generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, + 10, null, analyzer, MultiFields.getTerms(ir, "body")); + corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, + ir, "body", wordScorer, 1, 3).corrections; assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); - wordScorer = new StupidBackoffScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.4); - corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 2, ir, "body", wordScorer, 0, 3).corrections; + wordScorer = new StupidBackoffScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, + new BytesRef(" "), 0.4); + corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 2, + ir, "body", wordScorer, 0, 3).corrections; assertThat(corrections.length, equalTo(2)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); @@ -452,11 +492,14 @@ public void testFewDocsEgdeCase() throws Exception { } try (DirectoryReader ir = DirectoryReader.open(dir)) { - WordScorer wordScorer = new StupidBackoffScorer(ir, MultiFields.getTerms(ir, "field"), "field", 0.95d, new BytesRef(" "), 0.4f); + WordScorer wordScorer = new StupidBackoffScorer(ir, MultiFields.getTerms(ir, "field"), "field", 0.95d, + new BytesRef(" "), 0.4f); NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); 
DirectSpellChecker spellchecker = new DirectSpellChecker(); - DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "field", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5); - Result result = suggester.getCorrections(new StandardAnalyzer(), new BytesRef("valeu"), generator, 1, 1, ir, "field", wordScorer, 1, 2); + DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "field", + SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5); + Result result = suggester.getCorrections(new StandardAnalyzer(), new BytesRef("valeu"), generator, 1, 1, + ir, "field", wordScorer, 1, 2); assertThat(result.corrections.length, equalTo(1)); assertThat(result.corrections[0].join(space).utf8ToString(), equalTo("value")); }