From 1c513bc262efa9f76be2a957d666088998c3661a Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Thu, 6 Jun 2013 11:09:36 +0200 Subject: [PATCH] Fallback to extract terms if MultiPhraseQuery is large Currently if MPQ is very large highlighting can take down a node or cause high CPU / RAM consumption. If the query grows > 16 terms we just extract the terms and do term by term highlighting. Closes #3142 #3128 --- .../vectorhighlight/CustomFieldQuery.java | 15 ++++++ .../highlight/HighlighterSearchTests.java | 51 +++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java b/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java index 0f11d7915da2b..5fccc6a84227f 100644 --- a/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java +++ b/src/main/java/org/apache/lucene/search/vectorhighlight/CustomFieldQuery.java @@ -104,6 +104,21 @@ void flatten(Query sourceQuery, IndexReader reader, Collection flatQuerie } private void convertMultiPhraseQuery(int currentPos, int[] termsIdx, MultiPhraseQuery orig, List terms, int[] pos, IndexReader reader, Collection flatQueries) throws IOException { + if (currentPos == 0) { + // if we have more than 16 terms + int numTerms = 0; + for (Term[] currentPosTerm : terms) { + numTerms += currentPosTerm.length; + } + if (numTerms > 16) { + for (Term[] currentPosTerm : terms) { + for (Term term : currentPosTerm) { + super.flatten(new TermQuery(term), reader, flatQueries); + } + } + return; + } + } /* * we walk all possible ways and for each path down the MPQ we create a PhraseQuery this is what FieldQuery supports. * It seems expensive but most queries will pretty small. 
diff --git a/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java b/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java index da6fa3b3231e1..a192023f1737f 100644 --- a/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java +++ b/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java @@ -54,6 +54,7 @@ import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHighlight; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.startsWith; import static org.testng.Assert.fail; /** @@ -125,6 +126,56 @@ public void testNgramHighlightingWithBrokenPositions() throws ElasticSearchExcep assertHighlight(search, 0, "name.autocomplete", 0, equalTo("ARCOTEL Hotels Deutschland")); } + + + @Test + public void testMultiPhraseCutoff() throws ElasticSearchException, IOException { + /* + * MultiPhraseQuery can literally kill an entire node if there are too many terms in the + * query. 
We cut off and extract terms if there are more than 16 terms in the query + */ + prepareCreate("test") + .addMapping("test", jsonBuilder() + .startObject() + .startObject("test") + .startObject("properties") + .startObject("body") + .field("type", "string") + .field("index_analyzer", "custom_analyzer") + .field("search_analyzer", "custom_analyzer") + .field("term_vector", "with_positions_offsets") + .endObject() + .endObject() + .endObject() + .endObject()) + .setSettings(ImmutableSettings.settingsBuilder() + .put("index.number_of_shards", 1) + .put("index.number_of_replicas", 0) + .put("analysis.filter.wordDelimiter.type", "word_delimiter") + .put("analysis.filter.wordDelimiter.type.split_on_numerics", false) + .put("analysis.filter.wordDelimiter.generate_word_parts", true) + .put("analysis.filter.wordDelimiter.generate_number_parts", true) + .put("analysis.filter.wordDelimiter.catenate_words", true) + .put("analysis.filter.wordDelimiter.catenate_numbers", true) + .put("analysis.filter.wordDelimiter.catenate_all", false) + .put("analysis.analyzer.custom_analyzer.tokenizer", "whitespace") + .putArray("analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")) + .execute().actionGet(); + + ensureGreen(); + client().prepareIndex("test", "test", "1") + .setSource(XContentFactory.jsonBuilder() + .startObject() + .field("body", "Test: http://www.facebook.com http://elasticsearch.org http://xing.com http://cnn.com http://quora.com http://twitter.com this is a test for highlighting feature Test: http://www.facebook.com http://elasticsearch.org http://xing.com http://cnn.com http://quora.com http://twitter.com this is a test for highlighting feature") + .endObject()) + .execute().actionGet(); + refresh(); + SearchResponse search = client().prepareSearch().setQuery(matchQuery("body", "Test: http://www.facebook.com ").type(Type.PHRASE)).addHighlightedField("body").execute().actionGet(); + assertHighlight(search, 0, "body", 0, startsWith("Test: 
http://www.facebook.com")); + search = client().prepareSearch().setQuery(matchQuery("body", "Test: http://www.facebook.com http://elasticsearch.org http://xing.com http://cnn.com http://quora.com http://twitter.com this is a test for highlighting feature Test: http://www.facebook.com http://elasticsearch.org http://xing.com http://cnn.com http://quora.com http://twitter.com this is a test for highlighting feature").type(Type.PHRASE)).addHighlightedField("body").execute().actionGet(); + assertHighlight(search, 0, "body", 0, equalTo("Test: http://www.facebook.com http://elasticsearch.org http://xing.com http://cnn.com http://quora.com")); + } + @Test public void testNgramHighlightingPreLucene42() throws ElasticSearchException, IOException { boolean[] doStore = {true, false};