elastic · jimczi · Jan 11, 2018 · Jan 8, 2018 · Jan 8, 2018
diff --git a/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java b/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java
@@ -23,15 +23,23 @@
 import java.util.Locale;
 
 /**
- * A custom break iterator that scans text to find break-delimited passages bounded by
- * a provided maximum length. This class delegates the boundary search to a first level
- * break iterator. When this break iterator finds a passage greater than the maximum length
+ * A custom break iterator that is used to find break-delimited passages bounded by
+ * a provided maximum length in the {@link UnifiedHighlighter} context.
+ * This class uses a {@link BreakIterator} to find the last break after the provided offset
+ * that would create a passage no longer than <code>maxLen</code>.
+ * If the {@link BreakIterator} cannot find a passage that fits within the maximum length,
  * a secondary break iterator is used to re-split the passage at the first boundary after
  * maximum length.
+ *
  * This is useful to split passages created by {@link BreakIterator}s like `sentence` that
  * can create big outliers on semi-structured text.
  *
+ *
  * WARNING: This break iterator is designed to work with the {@link UnifiedHighlighter}.
+ *
+ * TODO: We should be able to create passages incrementally, starting from the offset of the first match and expanding or not
+ * depending on the offsets of subsequent matches. This is currently impossible because {@link FieldHighlighter} uses
+ * only the first matching offset to derive the start and end of each passage.
  **/
 public class BoundedBreakIteratorScanner extends BreakIterator {
     private final BreakIterator mainBreak;
@@ -93,7 +101,15 @@ public int preceding(int offset) {
             innerEnd = windowEnd;
         } else {
             windowStart = innerStart = mainBreak.preceding(offset);
-            windowEnd = innerEnd = mainBreak.following(offset-1);
+            windowEnd = innerEnd = mainBreak.following(offset - 1);
+            // keep extending the passage to the next break while it still fits within maxLen
+            while (innerEnd - innerStart < maxLen) {
+                int newEnd = mainBreak.following(innerEnd);
+                if (newEnd == DONE || (newEnd - innerStart) > maxLen) {
+                    break;
+                }
+                windowEnd = innerEnd = newEnd;
+            }
         }
 
         if (innerEnd - innerStart > maxLen) {

diff --git a/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java b/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java
@@ -184,6 +184,20 @@ public void testSentenceBoundedBreakIterator() throws Exception {
             BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs);
     }
 
+    public void testSmallSentenceBoundedBreakIterator() throws Exception { // short and long sentences mixed in one value
+        final String[] inputs = { // a single field value made of three sentences
+            "A short sentence. Followed by a bigger sentence that should be truncated. And a last short sentence."
+        };
+        final String[] outputs = { // expected passages, one per input sentence, with matches wrapped in <b> tags
+            "A short <b>sentence</b>.", // the first sentence is returned whole
+            "Followed by a bigger <b>sentence</b>", // cut right after the match rather than at the end of the sentence
+            "And a last short <b>sentence</b>" // also cut after the match: the closing period is not part of the passage
+        };
+        TermQuery query = new TermQuery(new Term("text", "sentence")); // highlight the term sentence in the text field
+        assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
+            BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 20), 0, outputs); // 20 is presumably maxLen -- TODO confirm
+    }
+
     public void testRepeat() throws Exception {
         final String[] inputs = {
             "Fun  fun fun  fun  fun  fun  fun  fun  fun  fun"
@@ -205,4 +219,25 @@ public void testRepeat() throws Exception {
         assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
             BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs);
     }
+
+    public void testGroupSentences() throws Exception { // several short sentences are expected to share one passage
+        final String[] inputs = { // a single field value made of six sentences, three of them one word long
+            "Two words. Followed by many words in a big sentence. One. Two. Three. And more words."
+        };
+        final String[] outputs = { // expected passages with matches wrapped in <b> tags
+            "<b>Two</b> <b>words</b>.", // both words of the first sentence are query terms
+            "Followed by many <b>words</b>", // cut right after the match rather than at the end of the sentence
+            "<b>One</b>. <b>Two</b>. <b>Three</b>.", // three consecutive one-word sentences grouped into one passage
+            "And more <b>words</b>.",
+        };
+        BooleanQuery query = new BooleanQuery.Builder() // four optional (SHOULD) term clauses on the text field
+            .add(new TermQuery(new Term("text", "one")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("text", "two")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("text", "three")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("text", "words")), BooleanClause.Occur.SHOULD)
+            .build();
+        assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
+            BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 20), 0, outputs); // 20 is presumably maxLen -- TODO confirm
+    }
+
 }