Skip to content

Commit

Permalink
opensearch-project#3842 adds a new parameter to the highlighter, the …
Browse files Browse the repository at this point in the history
…max_analyzer_offset. When this parameter is provided, highlighting stops at its value. This prevents the highlighter from going beyond the index's maxAnalyzedOffset.

Signed-off-by: Hauck <[email protected]>
  • Loading branch information
hauck-jvsh committed Jul 15, 2022
1 parent f165845 commit f0d9805
Show file tree
Hide file tree
Showing 7 changed files with 84 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
Expand All @@ -56,6 +55,7 @@
import org.apache.lucene.search.uhighlight.Snippet;
import org.apache.lucene.search.uhighlight.SplittingBreakIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.opensearch.common.Strings;
import org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedHighlighterAnalyzer;
import org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText;
Expand Down Expand Up @@ -136,7 +136,8 @@ private void assertHighlightOneDoc(
noMatchSize,
expectedPassages.length,
name -> "text".equals(name),
Integer.MAX_VALUE
Integer.MAX_VALUE,
-1
);
highlighter.setFieldMatcher((name) -> "text".equals(name));
final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter.HighlightFlag;
import org.apache.lucene.util.BytesRef;
import org.opensearch.common.CheckedSupplier;
import org.opensearch.common.Nullable;
Expand Down Expand Up @@ -79,6 +78,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
private final int noMatchSize;
private final FieldHighlighter fieldHighlighter;
private final int maxAnalyzedOffset;
private final int fieldMaxAnalyzedOffset;

/**
* Creates a new instance of {@link CustomUnifiedHighlighter}
Expand Down Expand Up @@ -113,7 +113,8 @@ public CustomUnifiedHighlighter(
int noMatchSize,
int maxPassages,
Predicate<String> fieldMatcher,
int maxAnalyzedOffset
int maxAnalyzedOffset,
int fieldMaxAnalyzedOffset
) throws IOException {
super(searcher, analyzer);
this.offsetSource = offsetSource;
Expand All @@ -126,6 +127,7 @@ public CustomUnifiedHighlighter(
this.setFieldMatcher(fieldMatcher);
this.maxAnalyzedOffset = maxAnalyzedOffset;
fieldHighlighter = getFieldHighlighter(field, query, extractTerms(query), maxPassages);
this.fieldMaxAnalyzedOffset = fieldMaxAnalyzedOffset;
}

/**
Expand All @@ -141,7 +143,10 @@ public Snippet[] highlightField(LeafReader reader, int docId, CheckedSupplier<St
return null;
}
int fieldValueLength = fieldValue.length();
if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValueLength > maxAnalyzedOffset)) {

if ((fieldMaxAnalyzedOffset < 0 || fieldMaxAnalyzedOffset > maxAnalyzedOffset)
&& (offsetSource == OffsetSource.ANALYSIS)
&& (fieldValueLength > maxAnalyzedOffset)) {
throw new IllegalArgumentException(
"The length of ["
+ field
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
public static final ParseField OPTIONS_FIELD = new ParseField("options");
public static final ParseField HIGHLIGHT_QUERY_FIELD = new ParseField("highlight_query");
public static final ParseField MATCHED_FIELDS_FIELD = new ParseField("matched_fields");
public static final ParseField MAX_ANALYZER_OFFSET_FIELD = new ParseField("max_analyzer_offset");

protected String[] preTags;

Expand Down Expand Up @@ -129,6 +130,8 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB

protected Boolean requireFieldMatch;

protected int maxAnalyzerOffset;

public AbstractHighlighterBuilder() {}

protected AbstractHighlighterBuilder(AbstractHighlighterBuilder<?> template, QueryBuilder queryBuilder) {
Expand All @@ -150,6 +153,7 @@ protected AbstractHighlighterBuilder(AbstractHighlighterBuilder<?> template, Que
phraseLimit = template.phraseLimit;
options = template.options;
requireFieldMatch = template.requireFieldMatch;
maxAnalyzerOffset = template.maxAnalyzerOffset;
}

/**
Expand Down Expand Up @@ -542,6 +546,21 @@ public Integer phraseLimit() {
return this.phraseLimit;
}

/**
 * Stores the maximum analyzer offset requested for highlighting.
 *
 * @param maxAnalyzerOffset the offset limit to record on this builder
 * @return this builder, cast to its concrete type, so calls can be chained
 */
@SuppressWarnings("unchecked")
public HB maxAnalyzerOffset(int maxAnalyzerOffset) {
    final HB self = (HB) this;
    this.maxAnalyzerOffset = maxAnalyzerOffset;
    return self;
}

/**
 * Reads back the maximum analyzer offset held by this builder.
 *
 * @return the value currently stored in the {@code maxAnalyzerOffset} field
 */
public int maxAnalyzerOffset() {
    return maxAnalyzerOffset;
}

/**
* Forces the highlighting to highlight fields based on the source even if fields are stored separately.
*/
Expand Down Expand Up @@ -623,6 +642,9 @@ void commonOptionsToXContent(XContentBuilder builder) throws IOException {
if (phraseLimit != null) {
builder.field(PHRASE_LIMIT_FIELD.getPreferredName(), phraseLimit);
}
if (maxAnalyzerOffset > 0) {
builder.field(MAX_ANALYZER_OFFSET_FIELD.getPreferredName(), maxAnalyzerOffset);
}
}

static <HB extends AbstractHighlighterBuilder<HB>> BiFunction<XContentParser, HB, HB> setupParser(ObjectParser<HB, Void> parser) {
Expand All @@ -642,6 +664,7 @@ static <HB extends AbstractHighlighterBuilder<HB>> BiFunction<XContentParser, HB
parser.declareInt(HB::noMatchSize, NO_MATCH_SIZE_FIELD);
parser.declareBoolean(HB::forceSource, FORCE_SOURCE_FIELD);
parser.declareInt(HB::phraseLimit, PHRASE_LIMIT_FIELD);
parser.declareInt(HB::maxAnalyzerOffset, MAX_ANALYZER_OFFSET_FIELD);
parser.declareObject(HB::options, (XContentParser p, Void c) -> {
try {
return p.map();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ public class HighlightBuilder extends AbstractHighlighterBuilder<HighlightBuilde
static final String[] DEFAULT_PRE_TAGS = new String[] { "<em>" };
/** the default closing tag */
static final String[] DEFAULT_POST_TAGS = new String[] { "</em>" };
static final int DEFAULT_MAX_ANALYZER_OFFSET = -1;

/** the default opening tags when {@code tag_schema = "styled"} */
public static final String[] DEFAULT_STYLED_PRE_TAG = {
Expand Down Expand Up @@ -126,6 +127,7 @@ public class HighlightBuilder extends AbstractHighlighterBuilder<HighlightBuilde
.boundaryScannerLocale(Locale.ROOT)
.noMatchSize(DEFAULT_NO_MATCH_SIZE)
.phraseLimit(DEFAULT_PHRASE_LIMIT)
.maxAnalyzerOffset(DEFAULT_MAX_ANALYZER_OFFSET)
.build();

private final List<Field> fields;
Expand Down Expand Up @@ -399,6 +401,9 @@ private static void transferOptions(
if (highlighterBuilder.highlightQuery != null) {
targetOptionsBuilder.highlightQuery(highlighterBuilder.highlightQuery.toQuery(context));
}
if (highlighterBuilder.maxAnalyzerOffset > 0) {
targetOptionsBuilder.maxAnalyzerOffset(highlighterBuilder.maxAnalyzerOffset);
}
}

static Character[] convertCharArray(char[] array) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,12 @@ public static class FieldOptions {

private int phraseLimit = -1;

private int maxAnalyzerOffset = -1;

/**
 * Exposes the maximum analyzer offset configured for this field.
 *
 * @return the stored offset limit; the field is initialized to {@code -1}
 */
public int maxAnalyzerOffset() {
    return this.maxAnalyzerOffset;
}

public int fragmentCharSize() {
return fragmentCharSize;
}
Expand Down Expand Up @@ -333,6 +339,11 @@ Builder phraseLimit(int phraseLimit) {
return this;
}

/**
 * Records the maximum analyzer offset on the field options being built.
 *
 * @param maxAnalyzerOffset the offset limit to store
 * @return this builder, for chaining
 */
Builder maxAnalyzerOffset(int maxAnalyzerOffset) {
fieldOptions.maxAnalyzerOffset = maxAnalyzerOffset;
return this;
}

Builder matchedFields(Set<String> matchedFields) {
fieldOptions.matchedFields = matchedFields;
return this;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
package org.opensearch.search.fetch.subphase.highlight;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.uhighlight.BoundedBreakIteratorScanner;
Expand Down Expand Up @@ -133,13 +135,40 @@ public HighlightField highlight(FieldHighlightContext fieldContext) throws IOExc
return new HighlightField(fieldContext.fieldName, Text.convertFromStringArray(fragments));
}

/**
 * Wraps an analyzer so that every token stream it produces is passed through a
 * {@link LimitTokenOffsetFilter}.
 *
 * @param a     the analyzer to delegate to; its reuse strategy is reused by the wrapper
 * @param limit the offset bound handed to the {@link LimitTokenOffsetFilter}
 * @return an {@link AnalyzerWrapper} that delegates to {@code a} for every field
 */
public AnalyzerWrapper getLimitedOffsetAnalyzer(Analyzer a, int limit) {
    return new AnalyzerWrapper(a.getReuseStrategy()) {

        @Override
        protected Analyzer getWrappedAnalyzer(String fieldName) {
            // The same delegate is used regardless of the field name.
            return a;
        }

        @Override
        protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
            // Keep the original source and only decorate the resulting token stream.
            final LimitTokenOffsetFilter bounded = new LimitTokenOffsetFilter(components.getTokenStream(), limit);
            return new TokenStreamComponents(components.getSource(), bounded);
        }
    };
}

CustomUnifiedHighlighter buildHighlighter(FieldHighlightContext fieldContext) throws IOException {
Encoder encoder = fieldContext.field.fieldOptions().encoder().equals("html")
? HighlightUtils.Encoders.HTML
: HighlightUtils.Encoders.DEFAULT;
int maxAnalyzedOffset = fieldContext.context.getIndexSettings().getHighlightMaxAnalyzedOffset();
int fieldMaxAnalyzedOffset = fieldContext.field.fieldOptions().maxAnalyzerOffset();
int numberOfFragments = fieldContext.field.fieldOptions().numberOfFragments();
Analyzer analyzer = getAnalyzer(fieldContext.context.mapperService().documentMapper());
if (fieldMaxAnalyzedOffset > 0) {
analyzer = getLimitedOffsetAnalyzer(analyzer, fieldMaxAnalyzedOffset);
}
PassageFormatter passageFormatter = getPassageFormatter(fieldContext.hitContext, fieldContext.field, encoder);
IndexSearcher searcher = fieldContext.context.searcher();
OffsetSource offsetSource = getOffsetSource(fieldContext.fieldType);
Expand Down Expand Up @@ -174,7 +203,8 @@ CustomUnifiedHighlighter buildHighlighter(FieldHighlightContext fieldContext) th
fieldContext.field.fieldOptions().noMatchSize(),
higlighterNumberOfFragments,
fieldMatcher(fieldContext),
maxAnalyzedOffset
maxAnalyzedOffset,
fieldMaxAnalyzedOffset
);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.BooleanClause;
Expand All @@ -63,6 +62,7 @@
import org.apache.lucene.search.uhighlight.Snippet;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.opensearch.common.Strings;
import org.opensearch.common.lucene.search.MultiPhrasePrefixQuery;
import org.opensearch.test.OpenSearchTestCase;
Expand Down Expand Up @@ -117,7 +117,8 @@ private void assertHighlightOneDoc(
noMatchSize,
expectedPassages.length,
name -> "text".equals(name),
Integer.MAX_VALUE
Integer.MAX_VALUE,
-1
);
final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
assertEquals(snippets.length, expectedPassages.length);
Expand Down

0 comments on commit f0d9805

Please sign in to comment.