Preserve markdown indentation (#109)
If a chunk includes newlines, preserve any indentation inside the chunk even when trimming
benbrandt authored Mar 8, 2024
1 parent 9876af1 commit 28e19c0
Showing 29 changed files with 3,809 additions and 3,356 deletions.
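A minimal sketch of the effect (mirroring the new test in tests/markdown.rs below; assumes the `markdown` feature and the `MarkdownSplitter` builder API used in the diff):

use text_splitter::MarkdownSplitter;

fn main() {
    // With trimming enabled, leading whitespace is still removed from a chunk,
    // but when the trimmed chunk spans multiple lines its block-level
    // indentation is now preserved.
    let splitter = MarkdownSplitter::default().with_trim_chunks(true);
    let text = "* Really long list item that is too big to fit\n\n * Some Indented Text\n\n * More Indented Text\n\n";
    let chunks = splitter.chunks(text, 48).collect::<Vec<_>>();

    assert_eq!(
        vec![
            "* Really long list item that is too big to fit",
            " * Some Indented Text\n\n * More Indented Text",
        ],
        chunks
    );
}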
src/lib.rs: 29 additions & 10 deletions
@@ -197,6 +197,7 @@ impl ChunkCapacity for RangeToInclusive<usize> {
 }
 
 /// How a particular semantic level relates to surrounding text elements.
+#[allow(dead_code)]
 #[derive(Copy, Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
 enum SemanticSplitPosition {
     /// The semantic level should be included in the previous chunk.
@@ -210,6 +211,11 @@ enum SemanticSplitPosition {
 /// Information required by generic Semantic Levels
 trait Level: fmt::Debug {
     fn split_position(&self) -> SemanticSplitPosition;
+
+    /// Whether or not when splitting ranges, whitespace should be included as previous.
+    fn treat_whitespace_as_previous(&self) -> bool {
+        false
+    }
 }
 
 /// Implementation that dictates the semantic split points available.
@@ -268,6 +274,18 @@ trait SemanticSplit {
         text: &'text str,
         semantic_level: Self::Level,
     ) -> impl Iterator<Item = (usize, &'text str)> + 'splitter;
+
+    /// Trim the str and adjust the offset if necessary.
+    /// This is the default behavior, but custom semantic levels may need different behavior.
+    fn trim_chunk<'splitter, 'text: 'splitter>(
+        &'splitter self,
+        offset: usize,
+        chunk: &'text str,
+    ) -> (usize, &'text str) {
+        // Figure out how many bytes we lose trimming the beginning
+        let diff = chunk.len() - chunk.trim_start().len();
+        (offset + diff, chunk.trim())
+    }
 }
 
 /// Returns chunks of text with their byte offsets as an iterator.
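For reference, a standalone sketch (not part of the diff) of the offset arithmetic in the default trim: the returned byte offset advances by however many bytes were trimmed from the front of the chunk.

fn main() {
    let offset = 10;
    let chunk = "  hello world ";

    // Bytes lost trimming the beginning: the two leading spaces.
    let diff = chunk.len() - chunk.trim_start().len();
    assert_eq!(diff, 2);

    // The chunk's byte offset moves forward by the same amount.
    assert_eq!((offset + diff, chunk.trim()), (12, "hello world"));
}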
@@ -314,9 +332,7 @@ where
     /// If trim chunks is on, trim the str and adjust the offset
     fn trim_chunk(&self, offset: usize, chunk: &'text str) -> (usize, &'text str) {
         if self.trim_chunks {
-            // Figure out how many bytes we lose trimming the beginning
-            let diff = chunk.len() - chunk.trim_start().len();
-            (offset + diff, chunk.trim())
+            self.semantic_split.trim_chunk(offset, chunk)
         } else {
             (offset, chunk)
         }
@@ -429,13 +445,7 @@ where
         let chunk = self.text.get(start..self.cursor)?;
 
         // Trim whitespace if user requested it
-        Some(if self.trim_chunks {
-            // Figure out how many bytes we lose trimming the beginning
-            let offset = chunk.len() - chunk.trim_start().len();
-            (start + offset, chunk.trim())
-        } else {
-            (start, chunk)
-        })
+        Some(self.trim_chunk(start, chunk))
     }
 
     /// Find the ideal next sections, breaking it up until we find the largest chunk.
@@ -551,6 +561,15 @@ fn split_str_by_separator<L: Level>(
             let prev_section = text
                 .get(cursor..range.start)
                 .expect("invalid character sequence");
+            if prev_section.trim().is_empty()
+                && level.treat_whitespace_as_previous()
+            {
+                let section = text
+                    .get(cursor..range.end)
+                    .expect("invalid character sequence");
+                cursor = range.end;
+                return Some(Either::Left(once((offset, section))));
+            }
             let separator = text
                 .get(range.start..range.end)
                 .expect("invalid character sequence");
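The doc examples in src/unstable_markdown.rs below show the other visible effect of this change: a whitespace-only gap between block-level elements now travels with the following chunk instead of becoming a chunk of its own. A minimal sketch, assuming a default (non-trimming) `MarkdownSplitter` as in those doc examples:

use text_splitter::MarkdownSplitter;

fn main() {
    let splitter = MarkdownSplitter::default();
    let text = "Some text\n\nfrom a\ndocument";

    // The blank line after "Some text\n" is attached to the next chunk
    // ("\nfrom a\n") rather than emitted as a standalone "\n" chunk.
    let chunks = splitter.chunk_indices(text, 10).collect::<Vec<_>>();
    assert_eq!(
        vec![(0, "Some text\n"), (10, "\nfrom a\n"), (18, "document")],
        chunks
    );
}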
src/unstable_markdown.rs: 37 additions & 2 deletions
@@ -105,7 +105,7 @@ where
     /// let text = "Some text\n\nfrom a\ndocument";
     /// let chunks = splitter.chunks(text, 10).collect::<Vec<_>>();
     ///
-    /// assert_eq!(vec!["Some text\n", "\n", "from a\n", "document"], chunks);
+    /// assert_eq!(vec!["Some text\n", "\nfrom a\n", "document"], chunks);
     /// ```
     pub fn chunks<'splitter, 'text: 'splitter>(
         &'splitter self,
@@ -127,7 +127,7 @@ where
     /// let text = "Some text\n\nfrom a\ndocument";
     /// let chunks = splitter.chunk_indices(text, 10).collect::<Vec<_>>();
     ///
-    /// assert_eq!(vec![(0, "Some text\n"), (10, "\n"), (11, "from a\n"), (18, "document")], chunks);
+    /// assert_eq!(vec![(0, "Some text\n"), (10, "\nfrom a\n"), (18, "document")], chunks);
     pub fn chunk_indices<'splitter, 'text: 'splitter>(
         &'splitter self,
         text: &'text str,
@@ -215,6 +215,24 @@ impl Level for SemanticLevel {
             SemanticLevel::Heading(_) => SemanticSplitPosition::Next,
         }
     }
+
+    fn treat_whitespace_as_previous(&self) -> bool {
+        match self {
+            SemanticLevel::Char
+            | SemanticLevel::GraphemeCluster
+            | SemanticLevel::Word
+            | SemanticLevel::Sentence
+            | SemanticLevel::SoftBreak
+            | SemanticLevel::Text
+            | SemanticLevel::InlineElement(_)
+            | SemanticLevel::Rule
+            | SemanticLevel::Heading(_)
+            | SemanticLevel::Metadata => false,
+            SemanticLevel::Block
+            | SemanticLevel::ContainerBlock(_)
+            | SemanticLevel::MetaContainer => true,
+        }
+    }
 }
 
 /// Captures information about markdown structure for a given text, and their
@@ -225,6 +243,8 @@ struct Markdown {
     ranges: Vec<(SemanticLevel, Range<usize>)>,
 }
 
+const NEWLINES: [char; 2] = ['\n', '\r'];
+
 impl SemanticSplit for Markdown {
     type Level = SemanticLevel;
 
@@ -337,6 +357,21 @@ impl SemanticSplit for Markdown {
             .map(move |(i, str)| (offset + i, str)),
         }
     }
+
+    fn trim_chunk<'splitter, 'text: 'splitter>(
+        &'splitter self,
+        offset: usize,
+        chunk: &'text str,
+    ) -> (usize, &'text str) {
+        // Preserve indentation if we have newlines inside the element
+        if chunk.trim().contains(NEWLINES) {
+            let diff = chunk.len() - chunk.trim_start_matches(NEWLINES).len();
+            (offset + diff, chunk.trim_start_matches(NEWLINES).trim_end())
+        } else {
+            let diff = chunk.len() - chunk.trim_start().len();
+            (offset + diff, chunk.trim())
+        }
+    }
 }
 
 #[cfg(test)]
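To make the new trimming rule concrete, a standalone sketch (not part of the diff): plain `str::trim` removes the indentation in front of the first line, while the markdown-aware trim only strips leading newlines and trailing whitespace when the trimmed chunk spans multiple lines.

fn main() {
    const NEWLINES: [char; 2] = ['\n', '\r'];

    // A chunk that begins with a newline followed by an indented list item.
    let chunk = "\n * Some Indented Text\n\n * More Indented Text\n";

    // Generic trimming also strips the indentation of the first item.
    assert_eq!(
        chunk.trim(),
        "* Some Indented Text\n\n * More Indented Text"
    );

    // The markdown-specific trim removes only the leading newlines and the
    // trailing whitespace, so the indentation of the first item survives.
    assert_eq!(
        chunk.trim_start_matches(NEWLINES).trim_end(),
        " * Some Indented Text\n\n * More Indented Text"
    );
}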
tests/markdown.rs: 38 additions & 4 deletions
@@ -47,10 +47,7 @@ fn fallsback_to_normal_text_split_if_no_markdown_content() {
     let chunk_size = 10;
     let chunks = splitter.chunks(text, chunk_size).collect::<Vec<_>>();
 
-    assert_eq!(
-        ["Some text\n", "\n", "from a\n", "document"].to_vec(),
-        chunks
-    );
+    assert_eq!(["Some text\n", "\nfrom a\n", "document"].to_vec(), chunks);
 }
 
 #[cfg(feature = "markdown")]
@@ -110,3 +107,40 @@ fn subheadings_grouped_with_top_header() {
         chunks
     );
 }
+
+#[cfg(feature = "markdown")]
+#[test]
+fn trimming_doesnt_trim_block_level_indentation_if_multiple_items() {
+    let splitter = MarkdownSplitter::default().with_trim_chunks(true);
+    let text = "* Really long list item that is too big to fit\n\n * Some Indented Text\n\n * More Indented Text\n\n";
+    let chunk_size = 48;
+    let chunks = splitter.chunks(text, chunk_size).collect::<Vec<_>>();
+
+    assert_eq!(
+        [
+            "* Really long list item that is too big to fit",
+            " * Some Indented Text\n\n * More Indented Text"
+        ]
+        .to_vec(),
+        chunks
+    );
+}
+
+#[cfg(feature = "markdown")]
+#[test]
+fn trimming_does_trim_block_level_indentation_if_only_one_item() {
+    let splitter = MarkdownSplitter::default().with_trim_chunks(true);
+    let text = "1. Really long list item\n\n 1. Some Indented Text\n\n 2. More Indented Text\n\n";
+    let chunk_size = 30;
+    let chunks = splitter.chunks(text, chunk_size).collect::<Vec<_>>();
+
+    assert_eq!(
+        [
+            "1. Really long list item",
+            "1. Some Indented Text",
+            "2. More Indented Text"
+        ]
+        .to_vec(),
+        chunks
+    );
+}