Skip to content

Commit

Permalink
Pulldown v0.10 and Headings (#100)
Browse files Browse the repository at this point in the history
* Update markdown lib

* add headings
  • Loading branch information
benbrandt authored Feb 18, 2024
1 parent 756f8e2 commit 3e8a756
Show file tree
Hide file tree
Showing 52 changed files with 942 additions and 912 deletions.
26 changes: 14 additions & 12 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ exclude = [
"/tests/inputs/**",
"/tests/tokenizers/**",
"*.yml",
"*.yaml"
"*.yaml",
]
rust-version = "1.75.0"

Expand All @@ -29,22 +29,24 @@ rustdoc-args = ["--cfg", "docsrs"]
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
auto_enums = "0.8.3"
either = "1.9.0"
itertools = "0.12.0"
once_cell = "1.18.0"
pulldown-cmark = { version = "0.9.6", default-features = false, optional = true }
regex = "1.10.2"
tiktoken-rs = { version = "0.5.6", optional = true }
tokenizers = { version = "0.15.0", default_features = false, features = ["onig"], optional = true }
unicode-segmentation = "1.10.1"
auto_enums = "0.8.5"
either = "1.10.0"
itertools = "0.12.1"
once_cell = "1.19.0"
pulldown-cmark = { version = "0.10.0", default-features = false, optional = true }
regex = "1.10.3"
tiktoken-rs = { version = "0.5.8", optional = true }
# Default features are disabled; only the `onig` feature is enabled.
# Uses the hyphenated `default-features` key, matching the other entries.
tokenizers = { version = "0.15.2", default-features = false, features = [
  "onig",
], optional = true }
unicode-segmentation = "1.11.0"

[dev-dependencies]
criterion = "0.5.1"
fake = "2.9.1"
fake = "2.9.2"
insta = { version = "1.34.0", features = ["glob", "yaml"] }
more-asserts = "0.3.1"
tokenizers = { version = "0.15.0", default-features = false, features = [
tokenizers = { version = "0.15.2", default-features = false, features = [
"onig",
"http",
] }
Expand Down
106 changes: 93 additions & 13 deletions src/unstable_markdown.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,31 @@ where
}
}

/// The six heading levels available in markdown.
///
/// The variants are intentionally declared from `H6` down to `H1`: the derived
/// `PartialOrd`/`Ord` implementations follow declaration order, so `H1`
/// compares as the greatest level and `H6` as the smallest.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
enum HeadingLevel {
    /// Sixth-level heading; smallest in the derived ordering.
    H6,
    /// Fifth-level heading.
    H5,
    /// Fourth-level heading.
    H4,
    /// Third-level heading.
    H3,
    /// Second-level heading.
    H2,
    /// Top-level heading; greatest in the derived ordering.
    H1,
}

/// Converts the parser's heading level into this module's own
/// [`HeadingLevel`], mapping each variant to the one with the same name.
impl From<pulldown_cmark::HeadingLevel> for HeadingLevel {
    fn from(value: pulldown_cmark::HeadingLevel) -> Self {
        // One-to-one mapping; the match is exhaustive over the variants
        // of the parser's enum listed here.
        match value {
            pulldown_cmark::HeadingLevel::H1 => Self::H1,
            pulldown_cmark::HeadingLevel::H2 => Self::H2,
            pulldown_cmark::HeadingLevel::H3 => Self::H3,
            pulldown_cmark::HeadingLevel::H4 => Self::H4,
            pulldown_cmark::HeadingLevel::H5 => Self::H5,
            pulldown_cmark::HeadingLevel::H6 => Self::H6,
        }
    }
}

/// Different semantic levels that text can be split by.
/// Each level provides a method of splitting text into chunks of a given level
/// as well as a fallback in case a chunk at that level is too large.
Expand Down Expand Up @@ -166,6 +191,8 @@ enum SemanticLevel {
Block,
/// thematic break/horizontal rule
Rule,
/// Heading levels in markdown
Heading(HeadingLevel),
}

impl Level for SemanticLevel {
Expand All @@ -181,6 +208,8 @@ impl Level for SemanticLevel {
| SemanticLevel::Block
| SemanticLevel::Rule => SemanticSplitPosition::Own,
SemanticLevel::InlineElement(p) | SemanticLevel::Item(p) => *p,
// Attach it to the next text
SemanticLevel::Heading(_) => SemanticSplitPosition::Next,
}
}
}
Expand Down Expand Up @@ -213,13 +242,13 @@ impl SemanticSplit for Markdown {
Tag::Emphasis
| Tag::Strong
| Tag::Strikethrough
| Tag::Link(_, _, _)
| Tag::Image(_, _, _)
| Tag::Link { .. }
| Tag::Image { .. }
| Tag::TableCell,
)
| Event::Code(_)
| Event::HardBreak
| Event::Html(_) => Some((
| Event::InlineHtml(_) => Some((
SemanticLevel::InlineElement(SemanticSplitPosition::Own),
range,
)),
Expand All @@ -240,16 +269,21 @@ impl SemanticSplit for Markdown {
Event::Start(Tag::TableRow | Tag::Item) => {
Some((SemanticLevel::Item(SemanticSplitPosition::Own), range))
}
Event::Start(
Event::Html(_)
| Event::Start(
Tag::List(_)
| Tag::Table(_)
| Tag::BlockQuote
| Tag::CodeBlock(_)
| Tag::HtmlBlock
| Tag::FootnoteDefinition(_),
) => Some((SemanticLevel::Block, range)),
Event::Rule => Some((SemanticLevel::Rule, range)),
Event::Start(Tag::Heading { level, .. }) => {
Some((SemanticLevel::Heading(level.into()), range))
}
// End events are identical to start, so no need to grab them.
Event::Start(Tag::Heading(_, _, _)) | Event::End(_) => None,
Event::Start(Tag::MetadataBlock(_)) | Event::End(_) => None,
})
.collect::<Vec<_>>();

Expand Down Expand Up @@ -301,6 +335,7 @@ impl SemanticSplit for Markdown {
| SemanticLevel::Item(_)
| SemanticLevel::Paragraph
| SemanticLevel::Block
| SemanticLevel::Heading(_)
| SemanticLevel::Rule => split_str_by_separator(
text,
self.ranges_after_offset(offset, semantic_level)
Expand Down Expand Up @@ -653,20 +688,39 @@ mod tests {
}

#[test]
fn test_html() {
let markdown = Markdown::new("<div>Some text</div>");
fn test_inline_html() {
let markdown = Markdown::new("<span>Some text</span>");

assert_eq!(
vec![&(
SemanticLevel::InlineElement(SemanticSplitPosition::Own),
0..20
),],
vec![
&(SemanticLevel::Paragraph, 0..22),
&(
SemanticLevel::InlineElement(SemanticSplitPosition::Own),
0..6
),
&(SemanticLevel::Text, 6..15),
&(
SemanticLevel::InlineElement(SemanticSplitPosition::Own),
15..22
),
],
markdown.ranges().collect::<Vec<_>>()
);
assert_eq!(SemanticLevel::Paragraph, markdown.max_level());
}

#[test]
fn test_html() {
let markdown = Markdown::new("<div>Some text</div>");

assert_eq!(
SemanticLevel::InlineElement(SemanticSplitPosition::Own),
markdown.max_level()
vec![
&(SemanticLevel::Block, 0..20),
&(SemanticLevel::Block, 0..20)
],
markdown.ranges().collect::<Vec<_>>()
);
assert_eq!(SemanticLevel::Block, markdown.max_level());
}

#[test]
Expand Down Expand Up @@ -795,4 +849,30 @@ mod tests {
);
assert_eq!(SemanticLevel::Rule, markdown.max_level());
}

#[test]
/// Every heading marker from `#` to `######` should yield a heading range
/// covering the whole line, followed by a text range for the title, and the
/// heading should be reported as the document's maximum level.
fn test_heading() {
    let levels = [
        HeadingLevel::H1,
        HeadingLevel::H2,
        HeadingLevel::H3,
        HeadingLevel::H4,
        HeadingLevel::H5,
        HeadingLevel::H6,
    ];

    for (extra_hashes, level) in levels.into_iter().enumerate() {
        // H1 uses a single `#`; each deeper level adds one more.
        let marker = "#".repeat(extra_hashes + 1);
        let source = format!("{marker} Heading");
        let markdown = Markdown::new(&source);

        // The heading spans the full line; the title text begins right
        // after the marker and the single space that follows it.
        let line_end = source.len();
        let title_start = marker.len() + 1;
        let heading = (SemanticLevel::Heading(level), 0..line_end);
        let title = (SemanticLevel::Text, title_start..line_end);

        assert_eq!(
            vec![&heading, &title],
            markdown.ranges().collect::<Vec<_>>()
        );
        assert_eq!(SemanticLevel::Heading(level), markdown.max_level());
    }
}
}
Loading

0 comments on commit 3e8a756

Please sign in to comment.