Adds a new parameter, max_analyzer_offset, for the highlighter (#3893)
* #3842 adds a new parameter to the highlighter, max_analyzer_offset. When this parameter is provided, highlighting stops at its value. This prevents the highlighter from going beyond the index maxAnalyzedOffset.

Signed-off-by: Hauck <[email protected]>

* Adds a test for the new parameter

Signed-off-by: Hauck <[email protected]>

* Fix the test added in the previous commit

Signed-off-by: Hauck <[email protected]>

* This was checking against the wrong field

Signed-off-by: Hauck <[email protected]>

* Only runs the test for the correct version

Signed-off-by: Hauck <[email protected]>

* Skips the test in Elasticsearch as well

Signed-off-by: Hauck <[email protected]>

* Remove the Elasticsearch 3.0 skip to test

Signed-off-by: Hauck <[email protected]>

* Skips all versions

Signed-off-by: Hauck <[email protected]>

* Remove unnecessary fields, as pointed out by @reta

Signed-off-by: Hauck <[email protected]>

* Compute fieldMaxAnalyzedIsNotValid in the constructor, as suggested by @reta

Signed-off-by: Hauck <[email protected]>

* As discussed, it is better to throw different exceptions for when fieldMaxAnalyzed is invalid and for when it is disabled

Signed-off-by: Hauck <[email protected]>

* Hint at what to do to allow highlighting of bigger documents
Signed-off-by: Hauck <[email protected]>

* Let the user define the new parameter globally for all highlighted fields

Signed-off-by: Hauck <[email protected]>

* Change fieldMaxAnalyzedOffset to an Integer in order to use null when it is absent from the highlight request. This allows the error messages to be much more precise, reporting all negative numbers as invalid

Signed-off-by: Hauck <[email protected]>

* Update javadocs and implement the stream methods for the new fields

Signed-off-by: Hauck <[email protected]>

* builder.field does not accept null, so a check before calling the method is necessary

Signed-off-by: Hauck <[email protected]>

* Only send and read the new fields if the version supports them

Signed-off-by: Hauck <[email protected]>

* The previous commit was checking the wrong field

Signed-off-by: Hauck <[email protected]>

* Check for version 3.0.0 instead of the current version

Signed-off-by: Hauck <[email protected]>

* Update server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java

Co-authored-by: Andriy Redko <[email protected]>
Signed-off-by: Hauck <[email protected]>

* Execute the test only on versions after 3.0.0

Signed-off-by: Hauck <[email protected]>

Co-authored-by: Andriy Redko <[email protected]>
(cherry picked from commit 931813f)
hauck-jvsh authored and github-actions[bot] committed Jul 22, 2022
1 parent c0b1b59 commit 14bf6a7
Showing 8 changed files with 117 additions and 8 deletions.
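
As a companion to the YAML REST test below, here is a minimal, hedged sketch of the same request built with the Java HighlightBuilder API. The maxAnalyzerOffset setter is the one added by this commit; the index and field names ("test1", "field1") are just the test fixtures:

```java
import org.opensearch.action.search.SearchRequest;
import org.opensearch.index.query.QueryBuilders;
import org.opensearch.search.builder.SearchSourceBuilder;
import org.opensearch.search.fetch.subphase.highlight.HighlightBuilder;

public class MaxAnalyzerOffsetExample {
    public static SearchRequest buildRequest() {
        // Highlight matches in "field1", but analyze only its first 10 characters,
        // mirroring {"max_analyzer_offset": 10} in the REST test body.
        HighlightBuilder highlight = new HighlightBuilder().field(
            new HighlightBuilder.Field("field1").maxAnalyzerOffset(10)
        );
        SearchSourceBuilder source = new SearchSourceBuilder()
            .query(QueryBuilders.matchQuery("field1", "quick"))
            .highlighter(highlight);
        return new SearchRequest("test1").source(source);
    }
}
```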
@@ -41,7 +41,6 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
@@ -56,6 +55,7 @@
import org.apache.lucene.search.uhighlight.Snippet;
import org.apache.lucene.search.uhighlight.SplittingBreakIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.opensearch.common.Strings;
import org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedHighlighterAnalyzer;
import org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText;
@@ -136,7 +136,8 @@ private void assertHighlightOneDoc(
noMatchSize,
expectedPassages.length,
name -> "text".equals(name),
Integer.MAX_VALUE
Integer.MAX_VALUE,
null
);
highlighter.setFieldMatcher((name) -> "text".equals(name));
final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
@@ -36,6 +36,17 @@ setup:
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}}}
- match: { error.root_cause.0.type: "illegal_argument_exception" }

---
"Unified highlighter on a field WITHOUT OFFSETS using max_analyzer_offset should SUCCEED":
- skip:
version: " - 2.99.99"
reason: the max_analyzer_offset parameter is only supported starting with version 3.0
- do:
search:
rest_total_hits_as_int: true
index: test1
body: {"query" : {"match" : {"field1" : "quick"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {"max_analyzer_offset": 10}}}}
- match: {hits.hits.0.highlight.field1.0: "The <em>quick</em> brown fox went to the forest and saw another fox."}

---
"Plain highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset should FAIL":
@@ -43,7 +43,6 @@
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter.HighlightFlag;
import org.apache.lucene.util.BytesRef;
import org.opensearch.common.CheckedSupplier;
import org.opensearch.common.Nullable;
@@ -79,6 +78,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
private final int noMatchSize;
private final FieldHighlighter fieldHighlighter;
private final int maxAnalyzedOffset;
private final Integer fieldMaxAnalyzedOffset;

/**
* Creates a new instance of {@link CustomUnifiedHighlighter}
@@ -99,6 +99,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
* @param fieldMatcher decides which terms should be highlighted
* @param maxAnalyzedOffset if the field is more than this long we'll refuse to use the ANALYZED
* offset source for it because it'd be super slow
* @param fieldMaxAnalyzedOffset this is used to limit the length of input that will be ANALYZED; this allows bigger fields to be partially highlighted
*/
public CustomUnifiedHighlighter(
IndexSearcher searcher,
@@ -113,7 +114,8 @@
int noMatchSize,
int maxPassages,
Predicate<String> fieldMatcher,
int maxAnalyzedOffset
int maxAnalyzedOffset,
Integer fieldMaxAnalyzedOffset
) throws IOException {
super(searcher, analyzer);
this.offsetSource = offsetSource;
@@ -126,6 +128,7 @@
this.setFieldMatcher(fieldMatcher);
this.maxAnalyzedOffset = maxAnalyzedOffset;
fieldHighlighter = getFieldHighlighter(field, query, extractTerms(query), maxPassages);
this.fieldMaxAnalyzedOffset = fieldMaxAnalyzedOffset;
}

/**
@@ -141,7 +144,21 @@ public Snippet[] highlightField(LeafReader reader, int docId, CheckedSupplier<St
return null;
}
int fieldValueLength = fieldValue.length();
if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValueLength > maxAnalyzedOffset)) {

if (fieldMaxAnalyzedOffset != null && fieldMaxAnalyzedOffset > maxAnalyzedOffset) {
throw new IllegalArgumentException(
"max_analyzer_offset has exceeded ["
+ maxAnalyzedOffset
+ "] - maximum allowed to be analyzed for highlighting. "
+ "This maximum can be set by changing the ["
+ IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey()
+ "] index level setting. "
+ "For large texts, indexing with offsets or term vectors is recommended!"
);
}
// if fieldMaxAnalyzedOffset is not defined, fall back to the previous behavior
if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValueLength > maxAnalyzedOffset && fieldMaxAnalyzedOffset == null)) {
throw new IllegalArgumentException(
"The length of ["
+ field
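For orientation, here is a standalone, hedged sketch (not code from the commit) of the decision just added to CustomUnifiedHighlighter.highlightField: a per-request limit above the index-level setting is rejected outright, and when no per-request limit is given the pre-existing over-length error still applies:

```java
/**
 * Standalone illustration of the checks in CustomUnifiedHighlighter.highlightField.
 * maxAnalyzedOffset is the index.highlight.max_analyzed_offset setting;
 * fieldMaxAnalyzedOffset is the optional per-request max_analyzer_offset;
 * analysisOffsetSource is true when offsets come from re-analyzing the text.
 */
public final class HighlightOffsetCheck {
    public static void check(int maxAnalyzedOffset, Integer fieldMaxAnalyzedOffset, int fieldValueLength, boolean analysisOffsetSource) {
        if (fieldMaxAnalyzedOffset != null && fieldMaxAnalyzedOffset > maxAnalyzedOffset) {
            // the per-request limit may not exceed the index-level setting
            throw new IllegalArgumentException("max_analyzer_offset has exceeded [" + maxAnalyzedOffset + "]");
        }
        if (analysisOffsetSource && fieldValueLength > maxAnalyzedOffset && fieldMaxAnalyzedOffset == null) {
            // no per-request limit: keep the pre-3.0 behavior and refuse to analyze the field
            throw new IllegalArgumentException("field length [" + fieldValueLength + "] exceeds [" + maxAnalyzedOffset + "]");
        }
        // otherwise highlighting proceeds, analyzing at most the permitted prefix
    }
}
```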
@@ -34,6 +34,7 @@

import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.opensearch.Version;
import org.opensearch.common.ParseField;
import org.opensearch.common.ParsingException;
import org.opensearch.common.Strings;
@@ -92,6 +93,7 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
public static final ParseField OPTIONS_FIELD = new ParseField("options");
public static final ParseField HIGHLIGHT_QUERY_FIELD = new ParseField("highlight_query");
public static final ParseField MATCHED_FIELDS_FIELD = new ParseField("matched_fields");
public static final ParseField MAX_ANALYZER_OFFSET_FIELD = new ParseField("max_analyzer_offset");

protected String[] preTags;

@@ -129,6 +131,8 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB

protected Boolean requireFieldMatch;

protected Integer maxAnalyzerOffset = null;

public AbstractHighlighterBuilder() {}

protected AbstractHighlighterBuilder(AbstractHighlighterBuilder<?> template, QueryBuilder queryBuilder) {
@@ -150,6 +154,7 @@ protected AbstractHighlighterBuilder(AbstractHighlighterBuilder<?> template, Que
phraseLimit = template.phraseLimit;
options = template.options;
requireFieldMatch = template.requireFieldMatch;
maxAnalyzerOffset = template.maxAnalyzerOffset;
}

/**
@@ -181,7 +186,13 @@ protected AbstractHighlighterBuilder(StreamInput in) throws IOException {
if (in.readBoolean()) {
options(in.readMap());
}

requireFieldMatch(in.readOptionalBoolean());

if (in.getVersion().onOrAfter(Version.V_3_0_0)) {
maxAnalyzerOffset(in.readOptionalVInt());
}

}

/**
@@ -223,6 +234,9 @@ public final void writeTo(StreamOutput out) throws IOException {
out.writeMap(options);
}
out.writeOptionalBoolean(requireFieldMatch);
if (out.getVersion().onOrAfter(Version.V_3_0_0)) {
out.writeOptionalVInt(maxAnalyzerOffset);
}
doWriteTo(out);
}

@@ -542,6 +556,21 @@ public Integer phraseLimit() {
return this.phraseLimit;
}

/**
* Sets the maximum offset for the highlighter
* @param maxAnalyzerOffset the maximum offset that the highlighter will consider
* @return this for chaining
*/
@SuppressWarnings("unchecked")
public HB maxAnalyzerOffset(Integer maxAnalyzerOffset) {
this.maxAnalyzerOffset = maxAnalyzerOffset;
return (HB) this;
}

public Integer maxAnalyzerOffset() {
return this.maxAnalyzerOffset;
}

/**
* Forces the highlighting to highlight fields based on the source even if fields are stored separately.
*/
@@ -623,6 +652,10 @@ void commonOptionsToXContent(XContentBuilder builder) throws IOException {
if (phraseLimit != null) {
builder.field(PHRASE_LIMIT_FIELD.getPreferredName(), phraseLimit);
}
if (maxAnalyzerOffset != null) {
builder.field(MAX_ANALYZER_OFFSET_FIELD.getPreferredName(), maxAnalyzerOffset);
}

}

static <HB extends AbstractHighlighterBuilder<HB>> BiFunction<XContentParser, HB, HB> setupParser(ObjectParser<HB, Void> parser) {
@@ -642,6 +675,7 @@ static <HB extends AbstractHighlighterBuilder<HB>> BiFunction<XContentParser, HB
parser.declareInt(HB::noMatchSize, NO_MATCH_SIZE_FIELD);
parser.declareBoolean(HB::forceSource, FORCE_SOURCE_FIELD);
parser.declareInt(HB::phraseLimit, PHRASE_LIMIT_FIELD);
parser.declareInt(HB::maxAnalyzerOffset, MAX_ANALYZER_OFFSET_FIELD);
parser.declareObject(HB::options, (XContentParser p, Void c) -> {
try {
return p.map();
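Because maxAnalyzerOffset lives on AbstractHighlighterBuilder, it can be set globally on the top-level highlight section as well as per field, and SearchHighlightContext.Builder.merge (further below) fills in the global value wherever a field leaves it unset. A hedged sketch, with illustrative field names and limits:

```java
import org.opensearch.search.fetch.subphase.highlight.HighlightBuilder;

public class GlobalMaxAnalyzerOffsetExample {
    public static HighlightBuilder build() {
        // field1 inherits the global limit of 20; field2 overrides it with 10
        return new HighlightBuilder()
            .maxAnalyzerOffset(20)
            .field(new HighlightBuilder.Field("field1"))
            .field(new HighlightBuilder.Field("field2").maxAnalyzerOffset(10));
    }
}
```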
@@ -399,6 +399,10 @@ private static void transferOptions(
if (highlighterBuilder.highlightQuery != null) {
targetOptionsBuilder.highlightQuery(highlighterBuilder.highlightQuery.toQuery(context));
}
if (highlighterBuilder.maxAnalyzerOffset != null) {
targetOptionsBuilder.maxAnalyzerOffset(highlighterBuilder.maxAnalyzerOffset);
}

}

static Character[] convertCharArray(char[] array) {
@@ -154,6 +154,12 @@ public static class FieldOptions {

private int phraseLimit = -1;

private Integer maxAnalyzerOffset = null;

public Integer maxAnalyzerOffset() {
return maxAnalyzerOffset;
}

public int fragmentCharSize() {
return fragmentCharSize;
}
@@ -333,6 +339,15 @@ Builder phraseLimit(int phraseLimit) {
return this;
}

Builder maxAnalyzerOffset(Integer maxAnalyzerOffset) {
// throw an exception if the value is not a positive integer
if (maxAnalyzerOffset != null && maxAnalyzerOffset <= 0) {
throw new IllegalArgumentException("the value [" + maxAnalyzerOffset + "] of max_analyzer_offset is invalid");
}
fieldOptions.maxAnalyzerOffset = maxAnalyzerOffset;
return this;
}

Builder matchedFields(Set<String> matchedFields) {
fieldOptions.matchedFields = matchedFields;
return this;
@@ -405,6 +420,9 @@ Builder merge(FieldOptions globalOptions) {
if (fieldOptions.phraseLimit == -1) {
fieldOptions.phraseLimit = globalOptions.phraseLimit;
}
if (fieldOptions.maxAnalyzerOffset == null) {
fieldOptions.maxAnalyzerOffset = globalOptions.maxAnalyzerOffset;
}
return this;
}
}
@@ -32,6 +32,8 @@
package org.opensearch.search.fetch.subphase.highlight;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.uhighlight.BoundedBreakIteratorScanner;
@@ -133,13 +135,33 @@ public HighlightField highlight(FieldHighlightContext fieldContext) throws IOExc
return new HighlightField(fieldContext.fieldName, Text.convertFromStringArray(fragments));
}

public AnalyzerWrapper getLimitedOffsetAnalyzer(Analyzer analyzer, int limit) {
return new AnalyzerWrapper(analyzer.getReuseStrategy()) {
@Override
protected Analyzer getWrappedAnalyzer(String fieldName) {
return analyzer;
}

@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
return new TokenStreamComponents(components.getSource(), new LimitTokenOffsetFilter(components.getTokenStream(), limit));
}

};

}

CustomUnifiedHighlighter buildHighlighter(FieldHighlightContext fieldContext) throws IOException {
Encoder encoder = fieldContext.field.fieldOptions().encoder().equals("html")
? HighlightUtils.Encoders.HTML
: HighlightUtils.Encoders.DEFAULT;
int maxAnalyzedOffset = fieldContext.context.getIndexSettings().getHighlightMaxAnalyzedOffset();
Integer fieldMaxAnalyzedOffset = fieldContext.field.fieldOptions().maxAnalyzerOffset();
int numberOfFragments = fieldContext.field.fieldOptions().numberOfFragments();
Analyzer analyzer = getAnalyzer(fieldContext.context.mapperService().documentMapper());
if (fieldMaxAnalyzedOffset != null) {
analyzer = getLimitedOffsetAnalyzer(analyzer, fieldMaxAnalyzedOffset);
}
PassageFormatter passageFormatter = getPassageFormatter(fieldContext.hitContext, fieldContext.field, encoder);
IndexSearcher searcher = fieldContext.context.searcher();
OffsetSource offsetSource = getOffsetSource(fieldContext.fieldType);
@@ -174,7 +196,8 @@ CustomUnifiedHighlighter buildHighlighter(FieldHighlightContext fieldContext) th
fieldContext.field.fieldOptions().noMatchSize(),
higlighterNumberOfFragments,
fieldMatcher(fieldContext),
maxAnalyzedOffset
maxAnalyzedOffset,
fieldMaxAnalyzedOffset
);
}

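The getLimitedOffsetAnalyzer wrapper above delegates the actual truncation to Lucene's LimitTokenOffsetFilter, which stops emitting tokens once their start offset passes the limit. A hedged, self-contained sketch of that filter in isolation (StandardAnalyzer and the sample text are illustrative, not taken from the commit):

```java
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class LimitOffsetDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = new LimitTokenOffsetFilter(
                analyzer.tokenStream("field1", "The quick brown fox went to the forest"), 10)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // only tokens whose start offset is at most 10 are emitted
                System.out.println(term);
            }
            ts.end();
        }
    }
}
```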
@@ -43,7 +43,6 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.BooleanClause;
@@ -63,6 +62,7 @@
import org.apache.lucene.search.uhighlight.Snippet;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.opensearch.common.Strings;
import org.opensearch.common.lucene.search.MultiPhrasePrefixQuery;
import org.opensearch.test.OpenSearchTestCase;
@@ -117,7 +117,8 @@ private void assertHighlightOneDoc(
noMatchSize,
expectedPassages.length,
name -> "text".equals(name),
Integer.MAX_VALUE
Integer.MAX_VALUE,
null
);
final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
assertEquals(snippets.length, expectedPassages.length);
