Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport 2.x] Adds a new parameter, max_analyzer_offset, for the highlighter #4031

Merged
merged 2 commits into from
Jul 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
Expand All @@ -56,6 +55,7 @@
import org.apache.lucene.search.uhighlight.Snippet;
import org.apache.lucene.search.uhighlight.SplittingBreakIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.opensearch.common.Strings;
import org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedHighlighterAnalyzer;
import org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText;
Expand Down Expand Up @@ -136,7 +136,8 @@ private void assertHighlightOneDoc(
noMatchSize,
expectedPassages.length,
name -> "text".equals(name),
Integer.MAX_VALUE
Integer.MAX_VALUE,
null
);
highlighter.setFieldMatcher((name) -> "text".equals(name));
final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,17 @@ setup:
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}}}
- match: { error.root_cause.0.type: "illegal_argument_exception" }

---
"Unified highlighter on a field WITHOUT OFFSETS using max_analyzer_offset should SUCCEED":
- skip:
version: " - 2.1.99"
reason: the max_analyzer_offset parameter is only supported starting with version 2.2
- do:
search:
rest_total_hits_as_int: true
index: test1
body: {"query" : {"match" : {"field1" : "quick"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {"max_analyzer_offset": 10}}}}
- match: {hits.hits.0.highlight.field1.0: "The <em>quick</em> brown fox went to the forest and saw another fox."}

---
"Plain highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset should FAIL":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter.HighlightFlag;
import org.apache.lucene.util.BytesRef;
import org.opensearch.common.CheckedSupplier;
import org.opensearch.common.Nullable;
Expand Down Expand Up @@ -79,6 +78,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
private final int noMatchSize;
private final FieldHighlighter fieldHighlighter;
private final int maxAnalyzedOffset;
private final Integer fieldMaxAnalyzedOffset;

/**
* Creates a new instance of {@link CustomUnifiedHighlighter}
Expand All @@ -99,6 +99,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
* @param fieldMatcher decides which terms should be highlighted
* @param maxAnalyzedOffset if the field is more than this long we'll refuse to use the ANALYZED
* offset source for it because it'd be super slow
* @param fieldMaxAnalyzedOffset limits the length of input that will be ANALYZED; this allows bigger fields to be partially highlighted
*/
public CustomUnifiedHighlighter(
IndexSearcher searcher,
Expand All @@ -113,7 +114,8 @@ public CustomUnifiedHighlighter(
int noMatchSize,
int maxPassages,
Predicate<String> fieldMatcher,
int maxAnalyzedOffset
int maxAnalyzedOffset,
Integer fieldMaxAnalyzedOffset
) throws IOException {
super(searcher, analyzer);
this.offsetSource = offsetSource;
Expand All @@ -126,6 +128,7 @@ public CustomUnifiedHighlighter(
this.setFieldMatcher(fieldMatcher);
this.maxAnalyzedOffset = maxAnalyzedOffset;
fieldHighlighter = getFieldHighlighter(field, query, extractTerms(query), maxPassages);
this.fieldMaxAnalyzedOffset = fieldMaxAnalyzedOffset;
}

/**
Expand All @@ -141,7 +144,21 @@ public Snippet[] highlightField(LeafReader reader, int docId, CheckedSupplier<St
return null;
}
int fieldValueLength = fieldValue.length();
if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValueLength > maxAnalyzedOffset)) {

if (fieldMaxAnalyzedOffset != null && fieldMaxAnalyzedOffset > maxAnalyzedOffset) {
throw new IllegalArgumentException(
"max_analyzer_offset has exceeded ["
+ maxAnalyzedOffset
+ "] - maximum allowed to be analyzed for highlighting. "
+ "This maximum can be set by changing the ["
+ IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey()
+ "] index level setting. "
+ "For large texts, indexing with offsets or term vectors is recommended!"
);
}
// if fieldMaxAnalyzedOffset is not defined, fall back to the previous behavior
if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValueLength > maxAnalyzedOffset && fieldMaxAnalyzedOffset == null)) {
throw new IllegalArgumentException(
"The length of ["
+ field
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@

import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.opensearch.Version;
import org.opensearch.common.ParseField;
import org.opensearch.common.ParsingException;
import org.opensearch.common.Strings;
Expand Down Expand Up @@ -92,6 +93,7 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
public static final ParseField OPTIONS_FIELD = new ParseField("options");
public static final ParseField HIGHLIGHT_QUERY_FIELD = new ParseField("highlight_query");
public static final ParseField MATCHED_FIELDS_FIELD = new ParseField("matched_fields");
public static final ParseField MAX_ANALYZER_OFFSET_FIELD = new ParseField("max_analyzer_offset");

protected String[] preTags;

Expand Down Expand Up @@ -129,6 +131,8 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB

protected Boolean requireFieldMatch;

protected Integer maxAnalyzerOffset = null;

public AbstractHighlighterBuilder() {}

protected AbstractHighlighterBuilder(AbstractHighlighterBuilder<?> template, QueryBuilder queryBuilder) {
Expand All @@ -150,6 +154,7 @@ protected AbstractHighlighterBuilder(AbstractHighlighterBuilder<?> template, Que
phraseLimit = template.phraseLimit;
options = template.options;
requireFieldMatch = template.requireFieldMatch;
maxAnalyzerOffset = template.maxAnalyzerOffset;
}

/**
Expand Down Expand Up @@ -181,7 +186,13 @@ protected AbstractHighlighterBuilder(StreamInput in) throws IOException {
if (in.readBoolean()) {
options(in.readMap());
}

requireFieldMatch(in.readOptionalBoolean());

if (in.getVersion().onOrAfter(Version.V_2_2_0)) {
reta marked this conversation as resolved.
Show resolved Hide resolved
maxAnalyzerOffset(in.readOptionalVInt());
}

}

/**
Expand Down Expand Up @@ -223,6 +234,9 @@ public final void writeTo(StreamOutput out) throws IOException {
out.writeMap(options);
}
out.writeOptionalBoolean(requireFieldMatch);
if (out.getVersion().onOrAfter(Version.V_2_2_0)) {
out.writeOptionalVInt(maxAnalyzerOffset);
}
doWriteTo(out);
}

Expand Down Expand Up @@ -542,6 +556,21 @@ public Integer phraseLimit() {
return this.phraseLimit;
}

/**
 * Defines the upper bound on the character offset that the highlighter will
 * analyze for this field.
 *
 * @param maxAnalyzerOffset the largest offset the highlighter will consider,
 *        or {@code null} to leave the limit unset
 * @return this builder, to allow call chaining
 */
@SuppressWarnings("unchecked")
public HB maxAnalyzerOffset(Integer maxAnalyzerOffset) {
    this.maxAnalyzerOffset = maxAnalyzerOffset;
    return (HB) this;
}

/**
 * Returns the configured maximum analyzer offset, or {@code null} when no
 * limit has been set on this builder.
 */
public Integer maxAnalyzerOffset() {
    return this.maxAnalyzerOffset;
}

/**
* Forces the highlighting to highlight fields based on the source even if fields are stored separately.
*/
Expand Down Expand Up @@ -623,6 +652,10 @@ void commonOptionsToXContent(XContentBuilder builder) throws IOException {
if (phraseLimit != null) {
builder.field(PHRASE_LIMIT_FIELD.getPreferredName(), phraseLimit);
}
if (maxAnalyzerOffset != null) {
builder.field(MAX_ANALYZER_OFFSET_FIELD.getPreferredName(), maxAnalyzerOffset);
}

}

static <HB extends AbstractHighlighterBuilder<HB>> BiFunction<XContentParser, HB, HB> setupParser(ObjectParser<HB, Void> parser) {
Expand All @@ -642,6 +675,7 @@ static <HB extends AbstractHighlighterBuilder<HB>> BiFunction<XContentParser, HB
parser.declareInt(HB::noMatchSize, NO_MATCH_SIZE_FIELD);
parser.declareBoolean(HB::forceSource, FORCE_SOURCE_FIELD);
parser.declareInt(HB::phraseLimit, PHRASE_LIMIT_FIELD);
parser.declareInt(HB::maxAnalyzerOffset, MAX_ANALYZER_OFFSET_FIELD);
parser.declareObject(HB::options, (XContentParser p, Void c) -> {
try {
return p.map();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,10 @@ private static void transferOptions(
if (highlighterBuilder.highlightQuery != null) {
targetOptionsBuilder.highlightQuery(highlighterBuilder.highlightQuery.toQuery(context));
}
if (highlighterBuilder.maxAnalyzerOffset != null) {
targetOptionsBuilder.maxAnalyzerOffset(highlighterBuilder.maxAnalyzerOffset);
}

}

static Character[] convertCharArray(char[] array) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,12 @@ public static class FieldOptions {

private int phraseLimit = -1;

private Integer maxAnalyzerOffset = null;

/**
 * Per-field {@code max_analyzer_offset} value, or {@code null} when the
 * request did not set one for this field.
 */
public Integer maxAnalyzerOffset() {
    return maxAnalyzerOffset;
}

public int fragmentCharSize() {
return fragmentCharSize;
}
Expand Down Expand Up @@ -333,6 +339,15 @@ Builder phraseLimit(int phraseLimit) {
return this;
}

/**
 * Sets the per-field {@code max_analyzer_offset} option.
 *
 * @param maxAnalyzerOffset the offset limit, or {@code null} to leave it unset
 * @throws IllegalArgumentException if a non-null value is not a positive integer
 */
Builder maxAnalyzerOffset(Integer maxAnalyzerOffset) {
    // throws an exception if the value is not a positive integer
    if (maxAnalyzerOffset != null && maxAnalyzerOffset <= 0) {
        throw new IllegalArgumentException("the value [" + maxAnalyzerOffset + "] of max_analyzer_offset is invalid");
    }
    fieldOptions.maxAnalyzerOffset = maxAnalyzerOffset;
    return this;
}

Builder matchedFields(Set<String> matchedFields) {
fieldOptions.matchedFields = matchedFields;
return this;
Expand Down Expand Up @@ -405,6 +420,9 @@ Builder merge(FieldOptions globalOptions) {
if (fieldOptions.phraseLimit == -1) {
fieldOptions.phraseLimit = globalOptions.phraseLimit;
}
if (fieldOptions.maxAnalyzerOffset == null) {
fieldOptions.maxAnalyzerOffset = globalOptions.maxAnalyzerOffset;
}
return this;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
package org.opensearch.search.fetch.subphase.highlight;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.uhighlight.BoundedBreakIteratorScanner;
Expand Down Expand Up @@ -133,13 +135,33 @@ public HighlightField highlight(FieldHighlightContext fieldContext) throws IOExc
return new HighlightField(fieldContext.fieldName, Text.convertFromStringArray(fragments));
}

/**
 * Wraps the given analyzer so that every token stream it produces is passed
 * through a {@link LimitTokenOffsetFilter} with the given limit, so only the
 * leading portion of a large field is analyzed for highlighting.
 *
 * @param analyzer the analyzer to delegate to
 * @param limit the token-offset limit handed to {@code LimitTokenOffsetFilter}
 * @return a wrapper around {@code analyzer} with offset-limited token streams
 */
public AnalyzerWrapper getLimitedOffsetAnalyzer(Analyzer analyzer, int limit) {
    return new AnalyzerWrapper(analyzer.getReuseStrategy()) {
        @Override
        protected Analyzer getWrappedAnalyzer(String fieldName) {
            // same delegate regardless of field name
            return analyzer;
        }

        @Override
        protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
            // cap token offsets so oversized fields are only partially analyzed
            return new TokenStreamComponents(components.getSource(), new LimitTokenOffsetFilter(components.getTokenStream(), limit));
        }

    };

}

CustomUnifiedHighlighter buildHighlighter(FieldHighlightContext fieldContext) throws IOException {
Encoder encoder = fieldContext.field.fieldOptions().encoder().equals("html")
? HighlightUtils.Encoders.HTML
: HighlightUtils.Encoders.DEFAULT;
int maxAnalyzedOffset = fieldContext.context.getIndexSettings().getHighlightMaxAnalyzedOffset();
Integer fieldMaxAnalyzedOffset = fieldContext.field.fieldOptions().maxAnalyzerOffset();
int numberOfFragments = fieldContext.field.fieldOptions().numberOfFragments();
Analyzer analyzer = getAnalyzer(fieldContext.context.mapperService().documentMapper());
if (fieldMaxAnalyzedOffset != null) {
analyzer = getLimitedOffsetAnalyzer(analyzer, fieldMaxAnalyzedOffset);
}
PassageFormatter passageFormatter = getPassageFormatter(fieldContext.hitContext, fieldContext.field, encoder);
IndexSearcher searcher = fieldContext.context.searcher();
OffsetSource offsetSource = getOffsetSource(fieldContext.fieldType);
Expand Down Expand Up @@ -174,7 +196,8 @@ CustomUnifiedHighlighter buildHighlighter(FieldHighlightContext fieldContext) th
fieldContext.field.fieldOptions().noMatchSize(),
higlighterNumberOfFragments,
fieldMatcher(fieldContext),
maxAnalyzedOffset
maxAnalyzedOffset,
fieldMaxAnalyzedOffset
);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.BooleanClause;
Expand All @@ -63,6 +62,7 @@
import org.apache.lucene.search.uhighlight.Snippet;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.opensearch.common.Strings;
import org.opensearch.common.lucene.search.MultiPhrasePrefixQuery;
import org.opensearch.test.OpenSearchTestCase;
Expand Down Expand Up @@ -117,7 +117,8 @@ private void assertHighlightOneDoc(
noMatchSize,
expectedPassages.length,
name -> "text".equals(name),
Integer.MAX_VALUE
Integer.MAX_VALUE,
null
);
final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
assertEquals(snippets.length, expectedPassages.length);
Expand Down