Skip to content

Commit

Permalink
Make sure there aren't chunks emitted whose entire content was already emitted
Browse files Browse the repository at this point in the history
  • Loading branch information
benbrandt committed Apr 27, 2024
1 parent 83de62f commit c7f9c33
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 8 deletions.
14 changes: 7 additions & 7 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,8 @@ where
cursor: usize,
/// Reusable container for next sections to avoid extra allocations
next_sections: Vec<(usize, &'text str)>,
/// Previous item's byte range
prev_item_range: Option<Range<usize>>,
/// Previous item's end byte offset
prev_item_end: usize,
/// Splitter used for determining semantic levels.
semantic_split: SemanticSplitRanges<Level>,
/// Original text to iterate over and generate chunks from
Expand All @@ -179,7 +179,7 @@ where
chunk_sizer: chunk_config.memoized_sizer(),
cursor: 0,
next_sections: Vec::new(),
prev_item_range: None,
prev_item_end: 0,
semantic_split: SemanticSplitRanges::new(Level::offsets(text).collect()),
text,
}
Expand Down Expand Up @@ -423,12 +423,12 @@ where
// cases where we trim.
(_, "") => continue,
c => {
let item_range = Some(c.0..c.0 + c.1.len());
// Skip because we've emitted a duplicate chunk
if item_range == self.prev_item_range {
let item_end = c.0 + c.1.len();
// Skip because we've emitted a chunk whose content we've already emitted
if item_end <= self.prev_item_end {
continue;
}
self.prev_item_range = item_range;
self.prev_item_end = item_end;
return Some(c);
}
}
Expand Down
12 changes: 11 additions & 1 deletion tests/text_splitter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ fn chunk_overlap_words() {

let chunks = splitter.chunks(text).collect::<Vec<_>>();

assert_eq!(chunks, ["An ", " ", "appl", "pple", " a ", "a ", " day"]);
assert_eq!(chunks, ["An ", "appl", "pple", " a ", " day"]);
}

#[test]
Expand All @@ -137,3 +137,13 @@ fn chunk_overlap_words_trim() {

assert_eq!(chunks, ["An", "appl", "pple", "a", "day"]);
}

/// Splits three newline-separated items using `ChunkConfig::new(14)` with an
/// overlap of 7 and checks the exact chunks produced.
///
/// The expected output has the middle line ("Item 2") shared by both chunks,
/// and contains no extra chunk made up only of text that an earlier chunk
/// already covered.
#[test]
fn chunk_overlap_paragraph() {
    let text = "Item 1\nItem 2\nItem 3";
    let config = ChunkConfig::new(14).with_overlap(7).unwrap();
    let splitter = TextSplitter::new(config);

    let chunks: Vec<_> = splitter.chunks(text).collect();

    // Exactly two chunks, overlapping on the middle line.
    assert_eq!(chunks, ["Item 1\nItem 2", "Item 2\nItem 3"]);
}

0 comments on commit c7f9c33

Please sign in to comment.