Commit 931e149

Add query param to limit highlighting to specified length (#67325) (#69016)

Add a `max_analyzed_offset` query parameter to allow users
to limit the highlighting of text fields to a value less than or equal to the
`index.highlight.max_analyzed_offset`, thus avoiding an exception when
the length of the text field exceeds the limit. The highlighting still takes place,
but stops at the length defined by the new parameter.

Closes: #52155
(cherry picked from commit f9af60b)
matriv authored Feb 16, 2021
1 parent 4b8c8f8 commit 931e149
Showing 14 changed files with 564 additions and 214 deletions.
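
In practice, the new parameter is set inside the `highlight` object of a search
request. A minimal sketch, assuming the `test1` index and `field1` field used by
the YAML REST tests further down, with a field value longer than the index-level
limit:

GET /test1/_search
{
  "query": { "match": { "field1": "fox" } },
  "highlight": {
    "type": "unified",
    "fields": { "field1": {} },
    "max_analyzed_offset": 20
  }
}

Analysis for highlighting stops at offset 20: matches beyond that offset are
simply not highlighted, and no error is returned.
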
1 change: 1 addition & 0 deletions docs/reference/index-modules.asciidoc
@@ -223,6 +223,7 @@ specific index module:
The maximum number of tokens that can be produced using the `_analyze` API.
Defaults to `10000`.

[[index-max-analyzed-offset]]
`index.highlight.max_analyzed_offset`::

The maximum number of characters that will be analyzed for a highlight request.
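
For example, the limit can be set at index creation time (a sketch; the index
name `my-index` is illustrative, and the value `30` mirrors the YAML test setup
below):

PUT /my-index
{
  "settings": {
    "index.highlight.max_analyzed_offset": 30
  }
}
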
15 changes: 13 additions & 2 deletions docs/reference/search/search-your-data/highlighting.asciidoc
@@ -117,7 +117,7 @@ needs highlighting. The `plain` highlighter always uses plain highlighting.
Plain highlighting for large texts may require a substantial amount of time and memory.
To protect against this, the maximum number of text characters that will be analyzed has been
limited to 1000000. This default limit can be changed
-for a particular index with the index setting `index.highlight.max_analyzed_offset`.
+for a particular index with the index setting <<index-max-analyzed-offset,`index.highlight.max_analyzed_offset`>>.

[discrete]
[[highlighting-settings]]
@@ -242,6 +242,17 @@ require_field_match:: By default, only fields that contain a query match are
highlighted. Set `require_field_match` to `false` to highlight all fields.
Defaults to `true`.

[[max-analyzed-offset]]
max_analyzed_offset:: By default, the maximum number of characters
analyzed for a highlight request is bounded by the value defined in the
<<index-max-analyzed-offset, `index.highlight.max_analyzed_offset`>> setting,
and an error is returned when the number of characters exceeds this limit. If
this query setting is set to a non-negative value, highlighting stops at the
defined maximum and the rest of the text is not processed (and therefore not
highlighted); no error is returned. The <<max-analyzed-offset, `max_analyzed_offset`>>
query setting does *not* override <<index-max-analyzed-offset, `index.highlight.max_analyzed_offset`>>,
which prevails when it is set to a lower value than the query setting.
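
To illustrate the precedence rule: assume an index created with
`index.highlight.max_analyzed_offset: 30` (as in the YAML tests below) and a
`field1` value longer than 30 characters (a sketch; index and field names are
illustrative):

GET /test1/_search
{
  "query": { "match": { "field1": "fox" } },
  "highlight": {
    "fields": { "field1": {} },
    "max_analyzed_offset": 20
  }
}

With `max_analyzed_offset: 20` highlighting succeeds and stops at offset 20;
with a query value above 30, or with no query value at all, the index-level
limit of 30 prevails and the request fails with an `illegal_argument_exception`.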

tags_schema:: Set to `styled` to use the built-in tag schema. The `styled`
schema defines the following `pre_tags` and defines `post_tags` as
`</em>`.
@@ -1121,4 +1132,4 @@ using the passages' `matchStarts` and `matchEnds` information:
I'll be the <em>only</em> <em>fox</em> in the world for you.

This kind of formatted string is the final result of the highlighter, returned
to the user.
@@ -50,8 +50,8 @@ protected List<Object> loadFieldValues(
     }
 
     @Override
-    protected Analyzer wrapAnalyzer(Analyzer analyzer) {
-        return new AnnotatedHighlighterAnalyzer(super.wrapAnalyzer(analyzer));
+    protected Analyzer wrapAnalyzer(Analyzer analyzer, Integer maxAnalyzedOffset) {
+        return new AnnotatedHighlighterAnalyzer(super.wrapAnalyzer(analyzer, maxAnalyzedOffset));
     }
 
     @Override
@@ -8,6 +8,14 @@
 
 package org.elasticsearch.search.fetch.subphase.highlight;
 
+import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
+import static org.hamcrest.CoreMatchers.equalTo;
+
+import java.net.URLEncoder;
+import java.text.BreakIterator;
+import java.util.ArrayList;
+import java.util.Locale;
+
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
@@ -31,95 +39,96 @@
 import org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter;
 import org.apache.lucene.search.uhighlight.Snippet;
 import org.apache.lucene.search.uhighlight.SplittingBreakIterator;
+import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
 import org.apache.lucene.store.Directory;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedHighlighterAnalyzer;
 import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText;
 import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotationAnalyzerWrapper;
 import org.elasticsearch.test.ESTestCase;
 
-import java.net.URLEncoder;
-import java.text.BreakIterator;
-import java.util.ArrayList;
-import java.util.Locale;
-
-import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
-import static org.hamcrest.CoreMatchers.equalTo;
-
 public class AnnotatedTextHighlighterTests extends ESTestCase {
 
-    private void assertHighlightOneDoc(String fieldName, String []markedUpInputs,
-            Query query, Locale locale, BreakIterator breakIterator,
-            int noMatchSize, String[] expectedPassages) throws Exception {
-
-        // Annotated fields wrap the usual analyzer with one that injects extra tokens
-        Analyzer wrapperAnalyzer = new AnnotationAnalyzerWrapper(new StandardAnalyzer());
-        Directory dir = newDirectory();
-        IndexWriterConfig iwc = newIndexWriterConfig(wrapperAnalyzer);
-        iwc.setMergePolicy(newTieredMergePolicy(random()));
-        RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
-        FieldType ft = new FieldType(TextField.TYPE_STORED);
-        if (randomBoolean()) {
-            ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
-        } else {
-            ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
-        }
-        ft.freeze();
-        Document doc = new Document();
-        for (String input : markedUpInputs) {
-            Field field = new Field(fieldName, "", ft);
-            field.setStringValue(input);
-            doc.add(field);
-        }
-        iw.addDocument(doc);
-        DirectoryReader reader = iw.getReader();
-        IndexSearcher searcher = newSearcher(reader);
-        iw.close();
-
-        AnnotatedText[] annotations = new AnnotatedText[markedUpInputs.length];
-        for (int i = 0; i < markedUpInputs.length; i++) {
-            annotations[i] = AnnotatedText.parse(markedUpInputs[i]);
-        }
-        AnnotatedHighlighterAnalyzer hiliteAnalyzer = new AnnotatedHighlighterAnalyzer(wrapperAnalyzer);
-        hiliteAnalyzer.setAnnotations(annotations);
-        AnnotatedPassageFormatter passageFormatter = new AnnotatedPassageFormatter(new DefaultEncoder());
-        passageFormatter.setAnnotations(annotations);
-
-        ArrayList<Object> plainTextForHighlighter = new ArrayList<>(annotations.length);
-        for (int i = 0; i < annotations.length; i++) {
-            plainTextForHighlighter.add(annotations[i].textMinusMarkup);
-        }
-
-        TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
-        assertThat(topDocs.totalHits.value, equalTo(1L));
-        String rawValue = Strings.collectionToDelimitedString(plainTextForHighlighter, String.valueOf(MULTIVAL_SEP_CHAR));
-        CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(
-            searcher,
-            hiliteAnalyzer,
-            null,
-            passageFormatter,
-            locale,
-            breakIterator,
-            "index",
-            "text",
-            query,
-            noMatchSize,
-            expectedPassages.length,
-            name -> "text".equals(name),
-            Integer.MAX_VALUE
-        );
-        highlighter.setFieldMatcher((name) -> "text".equals(name));
-        final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
-        assertEquals(expectedPassages.length, snippets.length);
-        for (int i = 0; i < snippets.length; i++) {
-            assertEquals(expectedPassages[i], snippets[i].getText());
-        }
-        reader.close();
-        dir.close();
+    private void assertHighlightOneDoc(String fieldName, String[] markedUpInputs,
+                                       Query query, Locale locale, BreakIterator breakIterator,
+                                       int noMatchSize, String[] expectedPassages) throws Exception {
+
+        assertHighlightOneDoc(fieldName, markedUpInputs, query, locale, breakIterator, noMatchSize, expectedPassages,
+            Integer.MAX_VALUE, null);
+    }
+
+    private void assertHighlightOneDoc(String fieldName, String[] markedUpInputs,
+                                       Query query, Locale locale, BreakIterator breakIterator,
+                                       int noMatchSize, String[] expectedPassages,
+                                       int maxAnalyzedOffset, Integer queryMaxAnalyzedOffset) throws Exception {
+
+        try (Directory dir = newDirectory()) {
+            // Annotated fields wrap the usual analyzer with one that injects extra tokens
+            Analyzer wrapperAnalyzer = new AnnotationAnalyzerWrapper(new StandardAnalyzer());
+            IndexWriterConfig iwc = newIndexWriterConfig(wrapperAnalyzer);
+            iwc.setMergePolicy(newTieredMergePolicy(random()));
+            RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+            FieldType ft = new FieldType(TextField.TYPE_STORED);
+            if (randomBoolean()) {
+                ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+            } else {
+                ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+            }
+            ft.freeze();
+            Document doc = new Document();
+            for (String input : markedUpInputs) {
+                Field field = new Field(fieldName, "", ft);
+                field.setStringValue(input);
+                doc.add(field);
+            }
+            iw.addDocument(doc);
+            try (DirectoryReader reader = iw.getReader()) {
+                IndexSearcher searcher = newSearcher(reader);
+                iw.close();
+
+                AnnotatedText[] annotations = new AnnotatedText[markedUpInputs.length];
+                for (int i = 0; i < markedUpInputs.length; i++) {
+                    annotations[i] = AnnotatedText.parse(markedUpInputs[i]);
+                }
+                AnnotatedHighlighterAnalyzer hiliteAnalyzer = new AnnotatedHighlighterAnalyzer(wrapperAnalyzer);
+                hiliteAnalyzer.setAnnotations(annotations);
+                AnnotatedPassageFormatter passageFormatter = new AnnotatedPassageFormatter(new DefaultEncoder());
+                passageFormatter.setAnnotations(annotations);
+
+                ArrayList<Object> plainTextForHighlighter = new ArrayList<>(annotations.length);
+                for (int i = 0; i < annotations.length; i++) {
+                    plainTextForHighlighter.add(annotations[i].textMinusMarkup);
+                }
+
+                TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
+                assertThat(topDocs.totalHits.value, equalTo(1L));
+                String rawValue = Strings.collectionToDelimitedString(plainTextForHighlighter, String.valueOf(MULTIVAL_SEP_CHAR));
+                CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(
+                    searcher,
+                    hiliteAnalyzer,
+                    UnifiedHighlighter.OffsetSource.ANALYSIS,
+                    passageFormatter,
+                    locale,
+                    breakIterator,
+                    "index",
+                    "text",
+                    query,
+                    noMatchSize,
+                    expectedPassages.length,
+                    name -> "text".equals(name),
+                    maxAnalyzedOffset,
+                    queryMaxAnalyzedOffset
+                );
+                highlighter.setFieldMatcher((name) -> "text".equals(name));
+                final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
+                assertEquals(expectedPassages.length, snippets.length);
+                for (int i = 0; i < snippets.length; i++) {
+                    assertEquals(expectedPassages[i], snippets[i].getText());
+                }
+            }
+        }
     }


    public void testAnnotatedTextStructuredMatch() throws Exception {
        // Check that a structured token eg a URL can be highlighted in a query
        // on marked-up
@@ -191,4 +200,65 @@ public void testBadAnnotation() throws Exception {
        assertHighlightOneDoc("text", markedUpInputs, query, Locale.ROOT, breakIterator, 0, expectedPassages);
    }

    public void testExceedMaxAnalyzedOffset() throws Exception {
        TermQuery query = new TermQuery(new Term("text", "exceeds"));
        BreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
        assertHighlightOneDoc("text", new String[] { "[Short Text](Short+Text)" }, query, Locale.ROOT, breakIterator, 0, new String[] {},
            10, null);

        IllegalArgumentException e = expectThrows(
            IllegalArgumentException.class,
            () -> assertHighlightOneDoc(
                "text",
                new String[] { "[Long Text exceeds](Long+Text+exceeds) MAX analyzed offset)" },
                query,
                Locale.ROOT,
                breakIterator,
                0,
                new String[] {},
                20,
                null
            )
        );
        assertEquals(
            "The length [38] of field [text] in doc[0]/index[index] exceeds the [index.highlight.max_analyzed_offset] limit [20]. "
                + "To avoid this error, set the query parameter [max_analyzed_offset] to a value less than index setting [20] and this "
                + "will tolerate long field values by truncating them.",
            e.getMessage()
        );

        final Integer queryMaxOffset = randomIntBetween(21, 1000);
        e = expectThrows(
            IllegalArgumentException.class,
            () -> assertHighlightOneDoc(
                "text",
                new String[] { "[Long Text exceeds](Long+Text+exceeds) MAX analyzed offset)" },
                query,
                Locale.ROOT,
                breakIterator,
                0,
                new String[] {},
                20,
                queryMaxOffset
            )
        );
        assertEquals(
            "The length [38] of field [text] in doc[0]/index[index] exceeds the [index.highlight.max_analyzed_offset] limit [20]. "
                + "To avoid this error, set the query parameter [max_analyzed_offset] to a value less than index setting [20] and this "
                + "will tolerate long field values by truncating them.",
            e.getMessage()
        );

        assertHighlightOneDoc(
            "text",
            new String[] { "[Long Text Exceeds](Long+Text+Exceeds) MAX analyzed offset [Long Text Exceeds](Long+Text+Exceeds)" },
            query,
            Locale.ROOT,
            breakIterator,
            0,
            new String[] { "Long Text [Exceeds](_hit_term=exceeds) MAX analyzed offset [Long Text Exceeds](Long+Text+Exceeds)" },
            20,
            15
        );
    }
}
@@ -6,7 +6,7 @@ setup:
      body:
        settings:
          number_of_shards: 1
-          index.highlight.max_analyzed_offset: 10
+          index.highlight.max_analyzed_offset: 30
        mappings:
          properties:
            field1:
@@ -39,6 +39,20 @@ setup:
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}}}
- match: { error.root_cause.0.type: "illegal_argument_exception" }

---
"Unified highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset with max_analyzed_offset=20 should SUCCEED":

  - skip:
      version: " - 7.11.99"
      reason: max_analyzed_offset query param added in 7.12.0

  - do:
      search:
        rest_total_hits_as_int: true
        index: test1
        body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}, "max_analyzed_offset": "20"}}
  - match: {hits.hits.0.highlight.field1.0: "The quick brown <em>fox</em> went to the forest and saw another fox."}


---
"Plain highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset should FAIL":
@@ -50,9 +64,23 @@
      search:
        rest_total_hits_as_int: true
        index: test1
-        body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}}}
+        body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field1" : {}}}}
  - match: { error.root_cause.0.type: "illegal_argument_exception" }

---
"Plain highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset with max_analyzed_offset=20 should SUCCEED":

  - skip:
      version: " - 7.11.99"
      reason: max_analyzed_offset query param added in 7.12.0

  - do:
      search:
        rest_total_hits_as_int: true
        index: test1
        body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field1" : {}}, "max_analyzed_offset": 20}}
  - match: {hits.hits.0.highlight.field1.0: "The quick brown <em>fox</em> went to the forest and saw another fox."}


---
"Unified highlighter on a field WITH OFFSETS exceeding index.highlight.max_analyzed_offset should SUCCEED":
@@ -79,3 +107,35 @@
        index: test1
        body: {"query" : {"match" : {"field2" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field2" : {}}}}
  - match: { error.root_cause.0.type: "illegal_argument_exception" }

---
"Plain highlighter on a field WITH OFFSETS exceeding index.highlight.max_analyzed_offset with max_analyzed_offset=20 should SUCCEED":

  - skip:
      version: " - 7.11.99"
      reason: max_analyzed_offset query param added in 7.12.0

  - do:
      search:
        rest_total_hits_as_int: true
        index: test1
        body: {"query" : {"match" : {"field2" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field2" : {}}, "max_analyzed_offset": 20}}
  - match: {hits.hits.0.highlight.field2.0: "The quick brown <em>fox</em> went to the forest and saw another fox."}

---
"Plain highlighter with max_analyzed_offset < 0 should FAIL":

  - skip:
      version: " - 7.11.99"
      reason: max_analyzed_offset query param added in 7.12.0

  - do:
      catch: bad_request
      search:
        rest_total_hits_as_int: true
        index: test1
        body: {"query" : {"match" : {"field2" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field2" : {}}, "max_analyzed_offset": -10}}
  - match: { status: 400 }
  - match: { error.root_cause.0.type: "x_content_parse_exception" }
  - match: { error.caused_by.type: "illegal_argument_exception" }
  - match: { error.caused_by.reason: "[max_analyzed_offset] must be a positive integer" }