Skip to content

Commit

Permalink
UnDRY the split by separator code
Browse files Browse the repository at this point in the history
I should have reflected more when the compiler told me there was dead code if the Markdown feature wasn't enabled around the `SemanticSplitPosition`.

I was trying to overfit too much structure in order to reuse one loop. This was already vastly different between text and markdown, and made the code harder to reason about. Just split the two implementations, and also got rid of the `Level` trait which I also didn't feel great about because the methods there were only ever really needed for Markdown anyway.
  • Loading branch information
benbrandt committed Mar 23, 2024
1 parent ecf0c78 commit f898ebc
Show file tree
Hide file tree
Showing 3 changed files with 151 additions and 126 deletions.
103 changes: 1 addition & 102 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,9 @@

use std::{
cmp::Ordering,
fmt,
iter::once,
ops::{Range, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive},
};

use either::Either;
use itertools::Itertools;

mod characters;
Expand Down Expand Up @@ -198,36 +195,14 @@ impl ChunkCapacity for RangeToInclusive<usize> {
}
}

/// How a particular semantic level relates to surrounding text elements.
#[allow(dead_code)]
#[derive(Copy, Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
enum SemanticSplitPosition {
/// The semantic level should be included in the previous chunk.
Prev,
/// The semantic level should be treated as its own chunk.
Own,
/// The semantic level should be included in the next chunk.
Next,
}

/// Information required by generic Semantic Levels
trait Level: fmt::Debug {
fn split_position(&self) -> SemanticSplitPosition;

/// Whether or not when splitting ranges, whitespace should be included as previous.
fn treat_whitespace_as_previous(&self) -> bool {
false
}
}

/// Implementation that dictates the semantic split points available.
/// For plain text, this goes from characters, to grapheme clusters, to words,
/// to sentences, to linebreaks.
/// For something like Markdown, this would also include things like headers,
/// lists, and code blocks.
trait SemanticSplit {
/// Internal type used to represent the level of semantic splitting.
type Level: Copy + Level + Ord + PartialOrd + 'static;
type Level: Copy + Ord + PartialOrd + 'static;

/// Levels that are always considered in splitting text, because they are always present.
const PERSISTENT_LEVELS: &'static [Self::Level];
Expand Down Expand Up @@ -524,82 +499,6 @@ where
}
}

/// Given a list of separator ranges, construct the sections of the text
fn split_str_by_separator<L: Level>(
text: &str,
separator_ranges: impl Iterator<Item = (L, Range<usize>)>,
) -> impl Iterator<Item = (usize, &str)> {
let mut cursor = 0;
let mut final_match = false;
separator_ranges
.batching(move |it| {
loop {
match it.next() {
// If we've hit the end, actually return None
None if final_match => return None,
// First time we hit None, return the final section of the text
None => {
final_match = true;
return text.get(cursor..).map(|t| Either::Left(once((cursor, t))));
}
// Return text preceding match + the match
Some((level, range)) => {
let offset = cursor;
match level.split_position() {
SemanticSplitPosition::Prev => {
if range.end < cursor {
continue;
}
let section = text
.get(cursor..range.end)
.expect("invalid character sequence");
cursor = range.end;
return Some(Either::Left(once((offset, section))));
}
SemanticSplitPosition::Own => {
if range.start < cursor {
continue;
}
let prev_section = text
.get(cursor..range.start)
.expect("invalid character sequence");
if prev_section.trim().is_empty()
&& level.treat_whitespace_as_previous()
{
let section = text
.get(cursor..range.end)
.expect("invalid character sequence");
cursor = range.end;
return Some(Either::Left(once((offset, section))));
}
let separator = text
.get(range.start..range.end)
.expect("invalid character sequence");
cursor = range.end;
return Some(Either::Right(
[(offset, prev_section), (range.start, separator)].into_iter(),
));
}
SemanticSplitPosition::Next => {
if range.start < cursor {
continue;
}
let prev_section = text
.get(cursor..range.start)
.expect("invalid character sequence");
// Separator will be part of the next chunk
cursor = range.start;
return Some(Either::Left(once((offset, prev_section))));
}
}
}
}
}
})
.flatten()
.filter(|(_, s)| !s.is_empty())
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
109 changes: 99 additions & 10 deletions src/markdown.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,15 @@ Semantic splitting of Markdown documents. Tries to use as many semantic units fr
as possible, according to the Common Mark specification.
*/

use std::ops::Range;
use std::{iter::once, ops::Range};

use auto_enums::auto_enum;
use either::Either;
use itertools::Itertools;
use pulldown_cmark::{Event, Options, Parser, Tag};
use unicode_segmentation::UnicodeSegmentation;

use crate::{
split_str_by_separator, Characters, ChunkCapacity, ChunkSizer, Level, SemanticSplit,
SemanticSplitPosition, TextChunks,
};
use crate::{Characters, ChunkCapacity, ChunkSizer, SemanticSplit, TextChunks};

/// Markdown splitter. Recursively splits chunks into the largest
/// semantic units that fit within the chunk size. Also will
Expand Down Expand Up @@ -169,6 +168,17 @@ impl From<pulldown_cmark::HeadingLevel> for HeadingLevel {
}
}

/// How a particular semantic level relates to surrounding text elements.
#[derive(Copy, Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
enum SemanticSplitPosition {
/// The semantic level should be included in the previous chunk.
Prev,
/// The semantic level should be treated as its own chunk.
Own,
/// The semantic level should be included in the next chunk.
Next,
}

/// Different semantic levels that text can be split by.
/// Each level provides a method of splitting text into chunks of a given level
/// as well as a fallback in case a given fallback is too large.
Expand Down Expand Up @@ -204,8 +214,8 @@ enum SemanticLevel {
Metadata,
}

impl Level for SemanticLevel {
fn split_position(&self) -> SemanticSplitPosition {
impl SemanticLevel {
fn split_position(self) -> SemanticSplitPosition {
match self {
SemanticLevel::Char
| SemanticLevel::GraphemeCluster
Expand All @@ -217,13 +227,13 @@ impl Level for SemanticLevel {
| SemanticLevel::MetaContainer
| SemanticLevel::Rule
| SemanticLevel::Metadata => SemanticSplitPosition::Own,
SemanticLevel::InlineElement(p) | SemanticLevel::ContainerBlock(p) => *p,
SemanticLevel::InlineElement(p) | SemanticLevel::ContainerBlock(p) => p,
// Attach it to the next text
SemanticLevel::Heading(_) => SemanticSplitPosition::Next,
}
}

fn treat_whitespace_as_previous(&self) -> bool {
fn treat_whitespace_as_previous(self) -> bool {
match self {
SemanticLevel::Char
| SemanticLevel::GraphemeCluster
Expand All @@ -250,6 +260,85 @@ struct Markdown {
ranges: Vec<(SemanticLevel, Range<usize>)>,
}

impl Markdown {
/// Given a list of separator ranges, construct the sections of the text
fn split_str_by_separator(
text: &str,
separator_ranges: impl Iterator<Item = (SemanticLevel, Range<usize>)>,
) -> impl Iterator<Item = (usize, &str)> {
let mut cursor = 0;
let mut final_match = false;
separator_ranges
.batching(move |it| {
loop {
match it.next() {
// If we've hit the end, actually return None
None if final_match => return None,
// First time we hit None, return the final section of the text
None => {
final_match = true;
return text.get(cursor..).map(|t| Either::Left(once((cursor, t))));
}
// Return text preceding match + the match
Some((level, range)) => {
let offset = cursor;
match level.split_position() {
SemanticSplitPosition::Prev => {
if range.end < cursor {
continue;
}
let section = text
.get(cursor..range.end)
.expect("invalid character sequence");
cursor = range.end;
return Some(Either::Left(once((offset, section))));
}
SemanticSplitPosition::Own => {
if range.start < cursor {
continue;
}
let prev_section = text
.get(cursor..range.start)
.expect("invalid character sequence");
if prev_section.trim().is_empty()
&& level.treat_whitespace_as_previous()
{
let section = text
.get(cursor..range.end)
.expect("invalid character sequence");
cursor = range.end;
return Some(Either::Left(once((offset, section))));
}
let separator = text
.get(range.start..range.end)
.expect("invalid character sequence");
cursor = range.end;
return Some(Either::Right(
[(offset, prev_section), (range.start, separator)]
.into_iter(),
));
}
SemanticSplitPosition::Next => {
if range.start < cursor {
continue;
}
let prev_section = text
.get(cursor..range.start)
.expect("invalid character sequence");
// Separator will be part of the next chunk
cursor = range.start;
return Some(Either::Left(once((offset, prev_section))));
}
}
}
}
}
})
.flatten()
.filter(|(_, s)| !s.is_empty())
}
}

const NEWLINES: [char; 2] = ['\n', '\r'];

impl SemanticSplit for Markdown {
Expand Down Expand Up @@ -356,7 +445,7 @@ impl SemanticSplit for Markdown {
| SemanticLevel::MetaContainer
| SemanticLevel::Heading(_)
| SemanticLevel::Rule
| SemanticLevel::Metadata => split_str_by_separator(
| SemanticLevel::Metadata => Self::split_str_by_separator(
text,
self.ranges_after_offset(offset, semantic_level)
.map(move |(l, sep)| (*l, sep.start - offset..sep.end - offset)),
Expand Down
65 changes: 51 additions & 14 deletions src/text.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,16 @@
Semantic splitting of text documents.
*/

use std::ops::Range;
use std::{iter::once, ops::Range};

use auto_enums::auto_enum;
use either::Either;
use itertools::Itertools;
use once_cell::sync::Lazy;
use regex::Regex;
use unicode_segmentation::UnicodeSegmentation;

use crate::{
split_str_by_separator, Characters, ChunkCapacity, ChunkSizer, Level, SemanticSplit,
SemanticSplitPosition, TextChunks,
};
use crate::{Characters, ChunkCapacity, ChunkSizer, SemanticSplit, TextChunks};

/// Default plain-text splitter. Recursively splits chunks into the largest
/// semantic units that fit within the chunk size. Also will attempt to merge
Expand Down Expand Up @@ -164,13 +163,6 @@ enum SemanticLevel {
LineBreak(usize),
}

impl Level for SemanticLevel {
/// All of these levels should be treated as their own chunk
fn split_position(&self) -> SemanticSplitPosition {
SemanticSplitPosition::Own
}
}

// Lazy so that we don't have to compile them more than once
static LINEBREAKS: Lazy<Regex> = Lazy::new(|| Regex::new(r"(\r\n)+|\r+|\n+").unwrap());

Expand All @@ -182,6 +174,51 @@ struct LineBreaks {
line_breaks: Vec<(SemanticLevel, Range<usize>)>,
}

impl LineBreaks {
/// Given a list of separator ranges, construct the sections of the text
fn split_str_by_separator(
text: &str,
separator_ranges: impl Iterator<Item = Range<usize>>,
) -> impl Iterator<Item = (usize, &str)> {
let mut cursor = 0;
let mut final_match = false;
separator_ranges
.batching(move |it| {
loop {
match it.next() {
// If we've hit the end, actually return None
None if final_match => return None,
// First time we hit None, return the final section of the text
None => {
final_match = true;
return text.get(cursor..).map(|t| Either::Left(once((cursor, t))));
}
// Return text preceding match + the match
Some(range) => {
if range.start < cursor {
continue;
}

let offset = cursor;
let prev_section = text
.get(offset..range.start)
.expect("invalid character sequence");
let separator = text
.get(range.start..range.end)
.expect("invalid character sequence");
cursor = range.end;
return Some(Either::Right(
[(offset, prev_section), (range.start, separator)].into_iter(),
));
}
}
}
})
.flatten()
.filter(|(_, s)| !s.is_empty())
}
}

impl SemanticSplit for LineBreaks {
type Level = SemanticLevel;

Expand Down Expand Up @@ -245,10 +282,10 @@ impl SemanticSplit for LineBreaks {
SemanticLevel::Sentence => text
.split_sentence_bound_indices()
.map(move |(i, str)| (offset + i, str)),
SemanticLevel::LineBreak(_) => split_str_by_separator(
SemanticLevel::LineBreak(_) => Self::split_str_by_separator(
text,
self.ranges_after_offset(offset, semantic_level)
.map(move |(l, sep)| (*l, sep.start - offset..sep.end - offset)),
.map(move |(_, sep)| sep.start - offset..sep.end - offset),
)
.map(move |(i, str)| (offset + i, str)),
}
Expand Down

0 comments on commit f898ebc

Please sign in to comment.