diff --git a/src/lib.rs b/src/lib.rs
index a727636..27d2f78 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -158,8 +158,8 @@ where
     cursor: usize,
     /// Reusable container for next sections to avoid extra allocations
     next_sections: Vec<(usize, &'text str)>,
-    /// Previous item's byte range
-    prev_item_range: Option<Range<usize>>,
+    /// Previous item's end byte offset
+    prev_item_end: usize,
     /// Splitter used for determining semantic levels.
     semantic_split: SemanticSplitRanges<Level>,
     /// Original text to iterate over and generate chunks from
@@ -179,7 +179,7 @@ where
             chunk_sizer: chunk_config.memoized_sizer(),
             cursor: 0,
             next_sections: Vec::new(),
-            prev_item_range: None,
+            prev_item_end: 0,
             semantic_split: SemanticSplitRanges::new(Level::offsets(text).collect()),
             text,
         }
@@ -423,12 +423,12 @@ where
                 // cases where we trim.
                 (_, "") => continue,
                 c => {
-                    let item_range = Some(c.0..c.0 + c.1.len());
-                    // Skip because we've emitted a duplicate chunk
-                    if item_range == self.prev_item_range {
+                    let item_end = c.0 + c.1.len();
+                    // Skip because we've already emitted a chunk containing this content
+                    if item_end <= self.prev_item_end {
                         continue;
                     }
-                    self.prev_item_range = item_range;
+                    self.prev_item_end = item_end;
                     return Some(c);
                 }
             }
diff --git a/tests/text_splitter.rs b/tests/text_splitter.rs
index 3b3d65e..2cf4c3e 100644
--- a/tests/text_splitter.rs
+++ b/tests/text_splitter.rs
@@ -125,7 +125,7 @@ fn chunk_overlap_words() {
 
     let chunks = splitter.chunks(text).collect::<Vec<_>>();
 
-    assert_eq!(chunks, ["An ", " ", "appl", "pple", " a ", "a ", " day"]);
+    assert_eq!(chunks, ["An ", "appl", "pple", " a ", " day"]);
 }
 
 #[test]
@@ -137,3 +137,13 @@ fn chunk_overlap_words_trim() {
 
     assert_eq!(chunks, ["An", "appl", "pple", "a", "day"]);
 }
+
+#[test]
+fn chunk_overlap_paragraph() {
+    let splitter = TextSplitter::new(ChunkConfig::new(14).with_overlap(7).unwrap());
+    let text = "Item 1\nItem 2\nItem 3";
+
+    let chunks = splitter.chunks(text).collect::<Vec<_>>();
+
+    assert_eq!(chunks, ["Item 1\nItem 2", "Item 2\nItem 3"]);
+}
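
The core of the change is in the `src/lib.rs` hunk at line 423: instead of comparing the candidate's full byte range against the previous chunk's range (which only catches exact duplicates), the iterator now tracks just the previous chunk's end offset and skips any candidate that does not advance past it, so chunks whose content is entirely contained in already-emitted text are dropped. Below is a minimal standalone sketch of that rule, not the crate's internals; `dedup_contained` is a hypothetical helper, and the offsets mirror the `chunk_overlap_words` test on `"An apple a day"`.

```rust
/// Hypothetical helper (not part of text-splitter): keep only candidate
/// chunks whose end offset advances past the previously emitted chunk's end.
/// Assumes candidates arrive in order of non-decreasing start offset.
fn dedup_contained<'a>(candidates: &[(usize, &'a str)]) -> Vec<(usize, &'a str)> {
    let mut prev_item_end = 0;
    let mut emitted = Vec::new();
    for &(offset, chunk) in candidates {
        let item_end = offset + chunk.len();
        // Content entirely covered by an already-emitted chunk: skip it.
        if item_end <= prev_item_end {
            continue;
        }
        prev_item_end = item_end;
        emitted.push((offset, chunk));
    }
    emitted
}

fn main() {
    // " " (2..3) and "a " (9..11) end at or before the previous chunk's end,
    // leaving exactly the five chunks asserted in the updated test.
    let candidates = [
        (0, "An "),   // 0..3
        (2, " "),     // 2..3  -> skipped
        (3, "appl"),  // 3..7
        (4, "pple"),  // 4..8
        (8, " a "),   // 8..11
        (9, "a "),    // 9..11 -> skipped
        (10, " day"), // 10..14
    ];
    let kept: Vec<&str> = dedup_contained(&candidates)
        .iter()
        .map(|&(_, s)| s)
        .collect();
    assert_eq!(kept, ["An ", "appl", "pple", " a ", " day"]);
}
```

The same rule explains the new `chunk_overlap_paragraph` test: with a chunk size of 14 and an overlap of 7, "Item 2" alone is never emitted as its own chunk because it ends no later than "Item 1\nItem 2" does.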