Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Include all sentences smaller than fragment_size in the unified highlighter #28132

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,23 @@
import java.util.Locale;

/**
* A custom break iterator that scans text to find break-delimited passages bounded by
* a provided maximum length. This class delegates the boundary search to a first level
* break iterator. When this break iterator finds a passage greater than the maximum length
* A custom break iterator that is used to find break-delimited passages bounded by
* a provided maximum length in the {@link UnifiedHighlighter} context.
* This class uses a {@link BreakIterator} to find the last break after the provided offset
* that would create a passage smaller than <code>maxLen</code>.
* If the {@link BreakIterator} cannot find a passage smaller than the maximum length,
* a secondary break iterator is used to re-split the passage at the first boundary after
* maximum length.
*
* This is useful to split passages created by {@link BreakIterator}s like <code>sentence</code> that
* can create big outliers on semi-structured text.
*
*
* WARNING: This break iterator is designed to work with the {@link UnifiedHighlighter}.
*
* TODO: We should be able to create passages incrementally, starting from the offset of the first match and expanding or not
* depending on the offsets of subsequent matches. This is currently impossible because {@link FieldHighlighter} uses
* only the first matching offset to derive the start and end of each passage.
**/
public class BoundedBreakIteratorScanner extends BreakIterator {
private final BreakIterator mainBreak;
Expand Down Expand Up @@ -93,7 +101,15 @@ public int preceding(int offset) {
innerEnd = windowEnd;
} else {
windowStart = innerStart = mainBreak.preceding(offset);
windowEnd = innerEnd = mainBreak.following(offset-1);
windowEnd = innerEnd = mainBreak.following(offset - 1);
// expand to next break until we reach maxLen
while (innerEnd - innerStart < maxLen) {
int newEnd = mainBreak.following(innerEnd);
if (newEnd == DONE || (newEnd - innerStart) > maxLen) {
break;
}
windowEnd = innerEnd = newEnd;
}
}

if (innerEnd - innerStart > maxLen) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,20 @@ public void testSentenceBoundedBreakIterator() throws Exception {
BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs);
}

/**
 * Highlights the term "sentence" in a document made of a short sentence, a long sentence and
 * another short sentence, using the sentence break iterator bounded to 20 characters, and
 * hands the expected passages to {@code assertHighlightOneDoc} for comparison.
 */
public void testSmallSentenceBoundedBreakIterator() throws Exception {
    final String field = "text";
    final int maxLen = 20;

    final String[] docs = new String[] {
        "A short sentence. Followed by a bigger sentence that should be truncated. And a last short sentence."
    };
    // One expected passage per sentence of the document; the middle one is cut short.
    final String[] expectedPassages = new String[] {
        "A short <b>sentence</b>.",
        "Followed by a bigger <b>sentence</b>",
        "And a last short <b>sentence</b>"
    };

    final TermQuery sentenceQuery = new TermQuery(new Term(field, "sentence"));
    final StandardAnalyzer analyzer = new StandardAnalyzer();
    assertHighlightOneDoc(
        field,
        docs,
        analyzer,
        sentenceQuery,
        Locale.ROOT,
        BoundedBreakIteratorScanner.getSentence(Locale.ROOT, maxLen),
        0,
        expectedPassages);
}

public void testRepeat() throws Exception {
final String[] inputs = {
"Fun fun fun fun fun fun fun fun fun fun"
Expand All @@ -205,4 +219,25 @@ public void testRepeat() throws Exception {
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs);
}

/**
 * Highlights the terms "one", "two", "three" and "words" (all optional clauses) using the
 * sentence break iterator bounded to 20 characters, and hands the expected passages to
 * {@code assertHighlightOneDoc} for comparison. In the expected output the three one-word
 * sentences "One. Two. Three." appear together as a single passage.
 */
public void testGroupSentences() throws Exception {
    final String field = "text";
    final int maxLen = 20;

    final String[] docs = new String[] {
        "Two words. Followed by many words in a big sentence. One. Two. Three. And more words."
    };
    final String[] expectedPassages = new String[] {
        "<b>Two</b> <b>words</b>.",
        "Followed by many <b>words</b>",
        "<b>One</b>. <b>Two</b>. <b>Three</b>.",
        "And more <b>words</b>."
    };

    // Each term becomes a SHOULD clause, added in the listed order.
    final BooleanQuery.Builder clauses = new BooleanQuery.Builder();
    for (String termText : new String[] {"one", "two", "three", "words"}) {
        clauses.add(new TermQuery(new Term(field, termText)), BooleanClause.Occur.SHOULD);
    }
    final BooleanQuery anyTermQuery = clauses.build();

    final StandardAnalyzer analyzer = new StandardAnalyzer();
    assertHighlightOneDoc(
        field,
        docs,
        analyzer,
        anyTermQuery,
        Locale.ROOT,
        BoundedBreakIteratorScanner.getSentence(Locale.ROOT, maxLen),
        0,
        expectedPassages);
}

}
Loading