From c406a91158a8e24622dcc3338094b1ce8c194fc5 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Fri, 27 Oct 2017 16:07:09 +0200 Subject: [PATCH] Fix division by zero in phrase suggester that causes assertion to fail --- .../search/suggest/phrase/WordScorer.java | 5 +++- .../phrase/NoisyChannelSpellCheckerTests.java | 30 +++++++++++++++++-- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java b/core/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java index 32d4feb4b27a2..a1c41e4015130 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java @@ -57,7 +57,10 @@ public WordScorer(IndexReader reader, Terms terms, String field, double realWord final long vocSize = terms.getSumTotalTermFreq(); this.vocabluarySize = vocSize == -1 ? reader.maxDoc() : vocSize; this.useTotalTermFreq = vocSize != -1; - this.numTerms = terms.size(); + long numTerms = terms.size(); + // -1 cannot be used as value, because scoreUnigram(...) can then divide by 0 if vocabluarySize is 1. + // -1 is returned when terms is a MultiTerms instance. + this.numTerms = vocabluarySize + numTerms > 1 ? numTerms : 0; this.termsEnum = new FreqTermsEnum(reader, field, !useTotalTermFreq, useTotalTermFreq, null, BigArrays.NON_RECYCLING_INSTANCE); // non recycling for now this.reader = reader; this.realWordLikelyhood = realWordLikelyHood; diff --git a/core/src/test/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellCheckerTests.java b/core/src/test/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellCheckerTests.java index d66fd8596bb90..40b2b023334ca 100644 --- a/core/src/test/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellCheckerTests.java +++ b/core/src/test/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellCheckerTests.java @@ -26,6 +26,7 @@ import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.analysis.reverse.ReverseStringFilter; import org.apache.lucene.analysis.shingle.ShingleFilter; +import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.synonym.SolrSynonymParser; import org.apache.lucene.analysis.synonym.SynonymFilter; @@ -38,16 +39,14 @@ import org.apache.lucene.index.MultiFields; import org.apache.lucene.search.spell.DirectSpellChecker; import org.apache.lucene.search.spell.SuggestMode; +import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.BytesRef; import org.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.Result; import org.elasticsearch.test.ESTestCase; -import java.io.BufferedReader; import java.io.IOException; -import java.io.InputStreamReader; import java.io.StringReader; -import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.Map; @@ -439,4 +438,29 @@ protected TokenStreamComponents createComponents(String fieldName) { assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); } + + public void testFewDocsEgdeCase() throws Exception { + try (Directory dir = newDirectory()) { + try (IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig())) { + Document document = new Document(); + document.add(new TextField("field", "value", Field.Store.NO)); + iw.addDocument(document); + iw.commit(); + document = new Document(); + document.add(new TextField("other_field", "value", Field.Store.NO)); + iw.addDocument(document); + } + + try (DirectoryReader ir = DirectoryReader.open(dir)) { + WordScorer wordScorer = new StupidBackoffScorer(ir, MultiFields.getTerms(ir, "field"), "field", 0.95d, new BytesRef(" "), 0.4f); + NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); + DirectSpellChecker spellchecker = new DirectSpellChecker(); + DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "field", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5); + Result result = suggester.getCorrections(new StandardAnalyzer(), new BytesRef("valeu"), generator, 1, 1, ir, "field", wordScorer, 1, 2); + assertThat(result.corrections.length, equalTo(1)); + assertThat(result.corrections[0].join(space).utf8ToString(), equalTo("value")); + } + } + } + }