Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport 2.x] Adds a new parameter, max_analyzer_offset, for the highlighter #4031

Merged
merged 2 commits into from
Jul 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
Expand All @@ -56,6 +55,7 @@
import org.apache.lucene.search.uhighlight.Snippet;
import org.apache.lucene.search.uhighlight.SplittingBreakIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.opensearch.common.Strings;
import org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedHighlighterAnalyzer;
import org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText;
Expand Down Expand Up @@ -136,7 +136,8 @@ private void assertHighlightOneDoc(
noMatchSize,
expectedPassages.length,
name -> "text".equals(name),
Integer.MAX_VALUE
Integer.MAX_VALUE,
null
);
highlighter.setFieldMatcher((name) -> "text".equals(name));
final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,17 @@ setup:
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}}}
- match: { error.root_cause.0.type: "illegal_argument_exception" }

---
"Unified highlighter on a field WITHOUT OFFSETS using max_analyzer_offset should SUCCEED":
- skip:
version: " - 2.1.99"
reason: the max_analyzer_offset parameter is only supported starting with version 2.2
- do:
search:
rest_total_hits_as_int: true
index: test1
body: {"query" : {"match" : {"field1" : "quick"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {"max_analyzer_offset": 10}}}}
- match: {hits.hits.0.highlight.field1.0: "The <em>quick</em> brown fox went to the forest and saw another fox."}

---
"Plain highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset should FAIL":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter.HighlightFlag;
import org.apache.lucene.util.BytesRef;
import org.opensearch.common.CheckedSupplier;
import org.opensearch.common.Nullable;
Expand Down Expand Up @@ -79,6 +78,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
private final int noMatchSize;
private final FieldHighlighter fieldHighlighter;
private final int maxAnalyzedOffset;
private final Integer fieldMaxAnalyzedOffset;

/**
* Creates a new instance of {@link CustomUnifiedHighlighter}
Expand All @@ -99,6 +99,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
* @param fieldMatcher decides which terms should be highlighted
* @param maxAnalyzedOffset if the field is more than this long we'll refuse to use the ANALYZED
* offset source for it because it'd be super slow
* @param fieldMaxAnalyzedOffset limits the length of input that will be ANALYZED; this allows bigger fields to be partially highlighted
*/
public CustomUnifiedHighlighter(
IndexSearcher searcher,
Expand All @@ -113,7 +114,8 @@ public CustomUnifiedHighlighter(
int noMatchSize,
int maxPassages,
Predicate<String> fieldMatcher,
int maxAnalyzedOffset
int maxAnalyzedOffset,
Integer fieldMaxAnalyzedOffset
) throws IOException {
super(searcher, analyzer);
this.offsetSource = offsetSource;
Expand All @@ -126,6 +128,7 @@ public CustomUnifiedHighlighter(
this.setFieldMatcher(fieldMatcher);
this.maxAnalyzedOffset = maxAnalyzedOffset;
fieldHighlighter = getFieldHighlighter(field, query, extractTerms(query), maxPassages);
this.fieldMaxAnalyzedOffset = fieldMaxAnalyzedOffset;
}

/**
Expand All @@ -141,7 +144,21 @@ public Snippet[] highlightField(LeafReader reader, int docId, CheckedSupplier<St
return null;
}
int fieldValueLength = fieldValue.length();
if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValueLength > maxAnalyzedOffset)) {

if (fieldMaxAnalyzedOffset != null && fieldMaxAnalyzedOffset > maxAnalyzedOffset) {
throw new IllegalArgumentException(
"max_analyzer_offset has exceeded ["
+ maxAnalyzedOffset
+ "] - maximum allowed to be analyzed for highlighting. "
+ "This maximum can be set by changing the ["
+ IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey()
+ "] index level setting. "
+ "For large texts, indexing with offsets or term vectors is recommended!"
);
}
// if fieldMaxAnalyzedOffset is not defined, fall back to the previous behavior
if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValueLength > maxAnalyzedOffset && fieldMaxAnalyzedOffset == null)) {
throw new IllegalArgumentException(
"The length of ["
+ field
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@

import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.opensearch.Version;
import org.opensearch.common.ParseField;
import org.opensearch.common.ParsingException;
import org.opensearch.common.Strings;
Expand Down Expand Up @@ -92,6 +93,7 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
public static final ParseField OPTIONS_FIELD = new ParseField("options");
public static final ParseField HIGHLIGHT_QUERY_FIELD = new ParseField("highlight_query");
public static final ParseField MATCHED_FIELDS_FIELD = new ParseField("matched_fields");
public static final ParseField MAX_ANALYZER_OFFSET_FIELD = new ParseField("max_analyzer_offset");

protected String[] preTags;

Expand Down Expand Up @@ -129,6 +131,8 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB

protected Boolean requireFieldMatch;

protected Integer maxAnalyzerOffset = null;

public AbstractHighlighterBuilder() {}

protected AbstractHighlighterBuilder(AbstractHighlighterBuilder<?> template, QueryBuilder queryBuilder) {
Expand All @@ -150,6 +154,7 @@ protected AbstractHighlighterBuilder(AbstractHighlighterBuilder<?> template, Que
phraseLimit = template.phraseLimit;
options = template.options;
requireFieldMatch = template.requireFieldMatch;
maxAnalyzerOffset = template.maxAnalyzerOffset;
}

/**
Expand Down Expand Up @@ -181,7 +186,13 @@ protected AbstractHighlighterBuilder(StreamInput in) throws IOException {
if (in.readBoolean()) {
options(in.readMap());
}

requireFieldMatch(in.readOptionalBoolean());

if (in.getVersion().onOrAfter(Version.V_2_2_0)) {
reta marked this conversation as resolved.
Show resolved Hide resolved
maxAnalyzerOffset(in.readOptionalVInt());
}

}

/**
Expand Down Expand Up @@ -223,6 +234,9 @@ public final void writeTo(StreamOutput out) throws IOException {
out.writeMap(options);
}
out.writeOptionalBoolean(requireFieldMatch);
if (out.getVersion().onOrAfter(Version.V_2_2_0)) {
out.writeOptionalVInt(maxAnalyzerOffset);
}
doWriteTo(out);
}

Expand Down Expand Up @@ -542,6 +556,21 @@ public Integer phraseLimit() {
return this.phraseLimit;
}

/**
 * Defines the upper bound on the character offset that the highlighter will
 * analyze for this field.
 *
 * @param maxAnalyzerOffset the largest offset the highlighter will consider,
 *        or {@code null} to leave the limit unset
 * @return this builder, to allow call chaining
 */
@SuppressWarnings("unchecked")
public HB maxAnalyzerOffset(Integer maxAnalyzerOffset) {
    this.maxAnalyzerOffset = maxAnalyzerOffset;
    return (HB) this;
}

/**
 * Returns the configured maximum analyzer offset, or {@code null} when no
 * limit has been set on this builder.
 */
public Integer maxAnalyzerOffset() {
    return this.maxAnalyzerOffset;
}

/**
* Forces the highlighting to highlight fields based on the source even if fields are stored separately.
*/
Expand Down Expand Up @@ -623,6 +652,10 @@ void commonOptionsToXContent(XContentBuilder builder) throws IOException {
if (phraseLimit != null) {
builder.field(PHRASE_LIMIT_FIELD.getPreferredName(), phraseLimit);
}
if (maxAnalyzerOffset != null) {
builder.field(MAX_ANALYZER_OFFSET_FIELD.getPreferredName(), maxAnalyzerOffset);
}

}

static <HB extends AbstractHighlighterBuilder<HB>> BiFunction<XContentParser, HB, HB> setupParser(ObjectParser<HB, Void> parser) {
Expand All @@ -642,6 +675,7 @@ static <HB extends AbstractHighlighterBuilder<HB>> BiFunction<XContentParser, HB
parser.declareInt(HB::noMatchSize, NO_MATCH_SIZE_FIELD);
parser.declareBoolean(HB::forceSource, FORCE_SOURCE_FIELD);
parser.declareInt(HB::phraseLimit, PHRASE_LIMIT_FIELD);
parser.declareInt(HB::maxAnalyzerOffset, MAX_ANALYZER_OFFSET_FIELD);
parser.declareObject(HB::options, (XContentParser p, Void c) -> {
try {
return p.map();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,10 @@ private static void transferOptions(
if (highlighterBuilder.highlightQuery != null) {
targetOptionsBuilder.highlightQuery(highlighterBuilder.highlightQuery.toQuery(context));
}
if (highlighterBuilder.maxAnalyzerOffset != null) {
targetOptionsBuilder.maxAnalyzerOffset(highlighterBuilder.maxAnalyzerOffset);
}

}

static Character[] convertCharArray(char[] array) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,12 @@ public static class FieldOptions {

private int phraseLimit = -1;

private Integer maxAnalyzerOffset = null;

/**
 * Per-field {@code max_analyzer_offset} value, or {@code null} when the
 * request did not set one for this field.
 */
public Integer maxAnalyzerOffset() {
    return maxAnalyzerOffset;
}

public int fragmentCharSize() {
return fragmentCharSize;
}
Expand Down Expand Up @@ -333,6 +339,15 @@ Builder phraseLimit(int phraseLimit) {
return this;
}

/**
 * Sets the per-field {@code max_analyzer_offset} option.
 *
 * @param maxAnalyzerOffset the offset limit, or {@code null} to leave it unset
 * @throws IllegalArgumentException if a non-null value is not a positive integer
 */
Builder maxAnalyzerOffset(Integer maxAnalyzerOffset) {
    // throws an exception if the value is not a positive integer
    if (maxAnalyzerOffset != null && maxAnalyzerOffset <= 0) {
        throw new IllegalArgumentException("the value [" + maxAnalyzerOffset + "] of max_analyzer_offset is invalid");
    }
    fieldOptions.maxAnalyzerOffset = maxAnalyzerOffset;
    return this;
}

Builder matchedFields(Set<String> matchedFields) {
fieldOptions.matchedFields = matchedFields;
return this;
Expand Down Expand Up @@ -405,6 +420,9 @@ Builder merge(FieldOptions globalOptions) {
if (fieldOptions.phraseLimit == -1) {
fieldOptions.phraseLimit = globalOptions.phraseLimit;
}
if (fieldOptions.maxAnalyzerOffset == null) {
fieldOptions.maxAnalyzerOffset = globalOptions.maxAnalyzerOffset;
}
return this;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
package org.opensearch.search.fetch.subphase.highlight;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.uhighlight.BoundedBreakIteratorScanner;
Expand Down Expand Up @@ -133,13 +135,33 @@ public HighlightField highlight(FieldHighlightContext fieldContext) throws IOExc
return new HighlightField(fieldContext.fieldName, Text.convertFromStringArray(fragments));
}

/**
 * Wraps the given analyzer so that every token stream it produces is passed
 * through a {@link LimitTokenOffsetFilter} with the given limit, so only the
 * leading portion of a large field is analyzed for highlighting.
 *
 * @param analyzer the analyzer to delegate to
 * @param limit the token-offset limit handed to {@code LimitTokenOffsetFilter}
 * @return a wrapper around {@code analyzer} with offset-limited token streams
 */
public AnalyzerWrapper getLimitedOffsetAnalyzer(Analyzer analyzer, int limit) {
    return new AnalyzerWrapper(analyzer.getReuseStrategy()) {
        @Override
        protected Analyzer getWrappedAnalyzer(String fieldName) {
            // same delegate regardless of field name
            return analyzer;
        }

        @Override
        protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
            // cap token offsets so oversized fields are only partially analyzed
            return new TokenStreamComponents(components.getSource(), new LimitTokenOffsetFilter(components.getTokenStream(), limit));
        }

    };

}

CustomUnifiedHighlighter buildHighlighter(FieldHighlightContext fieldContext) throws IOException {
Encoder encoder = fieldContext.field.fieldOptions().encoder().equals("html")
? HighlightUtils.Encoders.HTML
: HighlightUtils.Encoders.DEFAULT;
int maxAnalyzedOffset = fieldContext.context.getIndexSettings().getHighlightMaxAnalyzedOffset();
Integer fieldMaxAnalyzedOffset = fieldContext.field.fieldOptions().maxAnalyzerOffset();
int numberOfFragments = fieldContext.field.fieldOptions().numberOfFragments();
Analyzer analyzer = getAnalyzer(fieldContext.context.mapperService().documentMapper());
if (fieldMaxAnalyzedOffset != null) {
analyzer = getLimitedOffsetAnalyzer(analyzer, fieldMaxAnalyzedOffset);
}
PassageFormatter passageFormatter = getPassageFormatter(fieldContext.hitContext, fieldContext.field, encoder);
IndexSearcher searcher = fieldContext.context.searcher();
OffsetSource offsetSource = getOffsetSource(fieldContext.fieldType);
Expand Down Expand Up @@ -174,7 +196,8 @@ CustomUnifiedHighlighter buildHighlighter(FieldHighlightContext fieldContext) th
fieldContext.field.fieldOptions().noMatchSize(),
higlighterNumberOfFragments,
fieldMatcher(fieldContext),
maxAnalyzedOffset
maxAnalyzedOffset,
fieldMaxAnalyzedOffset
);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.BooleanClause;
Expand All @@ -63,6 +62,7 @@
import org.apache.lucene.search.uhighlight.Snippet;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.opensearch.common.Strings;
import org.opensearch.common.lucene.search.MultiPhrasePrefixQuery;
import org.opensearch.test.OpenSearchTestCase;
Expand Down Expand Up @@ -117,7 +117,8 @@ private void assertHighlightOneDoc(
noMatchSize,
expectedPassages.length,
name -> "text".equals(name),
Integer.MAX_VALUE
Integer.MAX_VALUE,
null
);
final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
assertEquals(snippets.length, expectedPassages.length);
Expand Down