diff --git a/src/unstable_markdown.rs b/src/unstable_markdown.rs index ebed450..314cbdb 100644 --- a/src/unstable_markdown.rs +++ b/src/unstable_markdown.rs @@ -193,6 +193,8 @@ enum SemanticLevel { Rule, /// Heading levels in markdown Heading(HeadingLevel), + /// Metadata for the entire document + Metadata, } impl Level for SemanticLevel { @@ -206,7 +208,8 @@ impl Level for SemanticLevel { | SemanticLevel::Text | SemanticLevel::Paragraph | SemanticLevel::Block - | SemanticLevel::Rule => SemanticSplitPosition::Own, + | SemanticLevel::Rule + | SemanticLevel::Metadata => SemanticSplitPosition::Own, SemanticLevel::InlineElement(p) | SemanticLevel::Item(p) => *p, // Attach it to the next text SemanticLevel::Heading(_) => SemanticSplitPosition::Next, @@ -282,8 +285,9 @@ impl SemanticSplit for Markdown { Event::Start(Tag::Heading { level, .. }) => { Some((SemanticLevel::Heading(level.into()), range)) } + Event::Start(Tag::MetadataBlock(_)) => Some((SemanticLevel::Metadata, range)), // End events are identical to start, so no need to grab them. - Event::Start(Tag::MetadataBlock(_)) | Event::End(_) => None, + Event::End(_) => None, }) .collect::>(); @@ -336,7 +340,8 @@ impl SemanticSplit for Markdown { | SemanticLevel::Paragraph | SemanticLevel::Block | SemanticLevel::Heading(_) - | SemanticLevel::Rule => split_str_by_separator( + | SemanticLevel::Rule + | SemanticLevel::Metadata => split_str_by_separator( text, self.ranges_after_offset(offset, semantic_level) .map(move |(l, sep)| (*l, sep.start - offset..sep.end - offset)), diff --git a/tests/inputs/markdown/blog_frontmatter.md b/tests/inputs/markdown/blog_frontmatter.md new file mode 100644 index 0000000..93021fb --- /dev/null +++ b/tests/inputs/markdown/blog_frontmatter.md @@ -0,0 +1,52 @@ +--- +layout: page +title: Philosophy +permalink: /philosophy/ +--- + +Jekyll offers a unique philosophy when approaching the problem of static +site generation. This core philosophy drives development and product +decisions. When a contributor, maintainer, or user asks herself what Jekyll +is about, the following principles should come to mind: + +### 1. No Magic + +Jekyll is not magic. A user should be able to understand the underlying +processes that make up the Jekyll build without much reading. It should +do only what you ask it to and nothing more. When a user takes a certain +action, the outcome should be easily understandable and focused. + +### 2. It "Just Works" + +The out-of-the-box experience should be that it "just works." Run +`gem install jekyll` and it should build any Jekyll site that it's given. +Features like auto-regeneration and settings like the markdown renderer +should represent sane defaults that work perfectly for the vast majority of +cases. The burden of initial configuration should not be placed on the user. + +### 3. Content is King + +Why is Jekyll so loved by content creators? It focuses on content first and +foremost, making the process of publishing content on the Web easy. Users +should find the management of their content enjoyable and simple. + +### 4. Stability + +If a user's site builds today, it should build tomorrow. +Backwards-compatibility should be strongly preferred over breaking changes. +Breaking changes should be made to support a strong practical goal, and +breaking changes should never be made to drive forward "purity" of the +codebase, or other changes purely to make the maintainers' lives easier. +Breaking changes provide a significant amount of friction between upgrades +and reduce the confidence of users in this software, and should thus be +avoided unless absolutely necessary. +Upon breaking changes, provide a clear path for users to upgrade. + +### 5. Small & Extensible + +The core of Jekyll should be simple and small, and extensibility should be +a first-class feature to provide added functionality from community +contributors. The core should be kept to features used by at least 90% of +users–everything else should be provided as a plugin. New features should +be shipped as plugins and focus should be put on creating extensible core +API's to support rich plugins. diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@blog_frontmatter.md-2.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@blog_frontmatter.md-2.snap new file mode 100644 index 0000000..f8a7111 --- /dev/null +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@blog_frontmatter.md-2.snap @@ -0,0 +1,14 @@ +--- +source: tests/text_splitter_snapshots.rs +expression: chunks +input_file: tests/inputs/markdown/blog_frontmatter.md +--- +- "---\nlayout: page\ntitle: Philosophy\npermalink: /philosophy/\n---" +- "\n\nJekyll offers a unique philosophy when approaching the problem of static\nsite generation. This core philosophy drives development and product\ndecisions. When a contributor, maintainer, or user asks herself what Jekyll\nis about, the following principles should come to mind:\n\n" +- "### 1. No Magic\n\nJekyll is not magic. A user should be able to understand the underlying\nprocesses that make up the Jekyll build without much reading. It should\ndo only what you ask it to and nothing more. When a user takes a certain\naction, the outcome should be easily understandable and focused.\n\n" +- "### 2. It \"Just Works\"\n\nThe out-of-the-box experience should be that it \"just works.\" Run\n`gem install jekyll` and it should build any Jekyll site that it's given.\nFeatures like auto-regeneration and settings like the markdown renderer\nshould represent sane defaults that work perfectly for the vast majority of\ncases. The burden of initial configuration should not be placed on the user.\n\n" +- "### 3. Content is King\n\nWhy is Jekyll so loved by content creators? It focuses on content first and\nforemost, making the process of publishing content on the Web easy. Users\nshould find the management of their content enjoyable and simple.\n\n" +- "### 4. Stability\n\n" +- "If a user's site builds today, it should build tomorrow.\nBackwards-compatibility should be strongly preferred over breaking changes.\nBreaking changes should be made to support a strong practical goal, and\nbreaking changes should never be made to drive forward \"purity\" of the\ncodebase, or other changes purely to make the maintainers' lives easier.\nBreaking changes provide a significant amount of friction between upgrades\nand reduce the confidence of users in this software, and should thus be\navoided unless absolutely necessary.\n" +- "Upon breaking changes, provide a clear path for users to upgrade.\n\n" +- "### 5. Small & Extensible\n\nThe core of Jekyll should be simple and small, and extensibility should be\na first-class feature to provide added functionality from community\ncontributors. The core should be kept to features used by at least 90% of\nusers–everything else should be provided as a plugin. New features should\nbe shipped as plugins and focus should be put on creating extensible core\nAPI's to support rich plugins.\n" diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@blog_frontmatter.md-3.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@blog_frontmatter.md-3.snap new file mode 100644 index 0000000..9933f0e --- /dev/null +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@blog_frontmatter.md-3.snap @@ -0,0 +1,6 @@ +--- +source: tests/text_splitter_snapshots.rs +expression: chunks +input_file: tests/inputs/markdown/blog_frontmatter.md +--- +- "---\nlayout: page\ntitle: Philosophy\npermalink: /philosophy/\n---\n\nJekyll offers a unique philosophy when approaching the problem of static\nsite generation. This core philosophy drives development and product\ndecisions. When a contributor, maintainer, or user asks herself what Jekyll\nis about, the following principles should come to mind:\n\n### 1. No Magic\n\nJekyll is not magic. A user should be able to understand the underlying\nprocesses that make up the Jekyll build without much reading. It should\ndo only what you ask it to and nothing more. When a user takes a certain\naction, the outcome should be easily understandable and focused.\n\n### 2. It \"Just Works\"\n\nThe out-of-the-box experience should be that it \"just works.\" Run\n`gem install jekyll` and it should build any Jekyll site that it's given.\nFeatures like auto-regeneration and settings like the markdown renderer\nshould represent sane defaults that work perfectly for the vast majority of\ncases. The burden of initial configuration should not be placed on the user.\n\n### 3. Content is King\n\nWhy is Jekyll so loved by content creators? It focuses on content first and\nforemost, making the process of publishing content on the Web easy. Users\nshould find the management of their content enjoyable and simple.\n\n### 4. Stability\n\nIf a user's site builds today, it should build tomorrow.\nBackwards-compatibility should be strongly preferred over breaking changes.\nBreaking changes should be made to support a strong practical goal, and\nbreaking changes should never be made to drive forward \"purity\" of the\ncodebase, or other changes purely to make the maintainers' lives easier.\nBreaking changes provide a significant amount of friction between upgrades\nand reduce the confidence of users in this software, and should thus be\navoided unless absolutely necessary.\nUpon breaking changes, provide a clear path for users to upgrade.\n\n### 5. Small & Extensible\n\nThe core of Jekyll should be simple and small, and extensibility should be\na first-class feature to provide added functionality from community\ncontributors. The core should be kept to features used by at least 90% of\nusers–everything else should be provided as a plugin. New features should\nbe shipped as plugins and focus should be put on creating extensible core\nAPI's to support rich plugins.\n" diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@blog_frontmatter.md.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@blog_frontmatter.md.snap new file mode 100644 index 0000000..1663d6e --- /dev/null +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@blog_frontmatter.md.snap @@ -0,0 +1,73 @@ +--- +source: tests/text_splitter_snapshots.rs +expression: chunks +input_file: tests/inputs/markdown/blog_frontmatter.md +--- +- "---\nlayout: page\ntitle: Philosophy\n" +- "permalink: /philosophy/\n---\n\n" +- "Jekyll offers a unique philosophy when approaching the " +- "problem of static\n" +- "site generation. This core philosophy drives development and product\n" +- "decisions. " +- "When a contributor, maintainer, or user asks " +- "herself what Jekyll\n" +- "is about, the following principles should come to mind" +- ":\n\n" +- "### 1. No Magic\n\n" +- "Jekyll is not magic. " +- "A user should be able to understand the underlying\n" +- "processes that make up the Jekyll build without " +- "much reading. It should\n" +- do only what you ask it to and nothing more +- ". When a user takes a certain\n" +- "action, the outcome should be easily understandable and " +- "focused.\n\n" +- "### 2. It \"Just Works\"\n\n" +- "The out-of-the-box experience should " +- "be that it \"just works.\" Run\n" +- "`gem install jekyll` and " +- "it should build any Jekyll site that " +- "it's given.\n" +- "Features like auto-regeneration and settings like the " +- "markdown renderer\n" +- "should represent sane defaults that work perfectly for " +- "the vast majority of\n" +- "cases. " +- "The burden of initial configuration should not be placed on " +- "the user.\n\n" +- "### 3. Content is King\n\n" +- Why is Jekyll so loved by content creators +- "? It focuses on content first and\n" +- "foremost, making the process of publishing content on the " +- "Web easy. Users\n" +- should find the management of their content enjoyable and simple +- ".\n\n" +- "### 4. Stability\n\n" +- "If a user's site builds today, it " +- "should build tomorrow.\n" +- "Backwards-compatibility should be strongly preferred over " +- "breaking changes.\n" +- "Breaking changes should be made to support a strong practical " +- "goal, and\n" +- "breaking changes should never be made to drive forward \"" +- "purity\" of the\n" +- "codebase, or other changes purely to make the " +- "maintainers' lives easier.\n" +- "Breaking changes provide a significant amount of friction between upgrades\n" +- "and reduce the confidence of users in this software, " +- "and should thus be\navoided unless absolutely necessary.\n" +- "Upon breaking changes, provide a clear path for users " +- "to upgrade.\n\n" +- "### 5. Small & Extensible\n\n" +- "The core of Jekyll should be simple and " +- "small, and extensibility should be\n" +- "a first-class feature to provide added functionality from " +- "community\n" +- "contributors. " +- "The core should be kept to features used by at " +- "least 90% of\n" +- "users–everything else should be provided as a " +- "plugin. New features should\n" +- "be shipped as plugins and focus should be put " +- "on creating extensible core\nAPI'" +- "s to support rich plugins.\n" diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@blog_frontmatter.md-2.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@blog_frontmatter.md-2.snap new file mode 100644 index 0000000..b855ce3 --- /dev/null +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@blog_frontmatter.md-2.snap @@ -0,0 +1,14 @@ +--- +source: tests/text_splitter_snapshots.rs +expression: chunks +input_file: tests/inputs/markdown/blog_frontmatter.md +--- +- "---\nlayout: page\ntitle: Philosophy\npermalink: /philosophy/\n---" +- "Jekyll offers a unique philosophy when approaching the problem of static\nsite generation. This core philosophy drives development and product\ndecisions. When a contributor, maintainer, or user asks herself what Jekyll\nis about, the following principles should come to mind:" +- "### 1. No Magic\n\nJekyll is not magic. A user should be able to understand the underlying\nprocesses that make up the Jekyll build without much reading. It should\ndo only what you ask it to and nothing more. When a user takes a certain\naction, the outcome should be easily understandable and focused." +- "### 2. It \"Just Works\"\n\nThe out-of-the-box experience should be that it \"just works.\" Run\n`gem install jekyll` and it should build any Jekyll site that it's given.\nFeatures like auto-regeneration and settings like the markdown renderer\nshould represent sane defaults that work perfectly for the vast majority of\ncases. The burden of initial configuration should not be placed on the user." +- "### 3. Content is King\n\nWhy is Jekyll so loved by content creators? It focuses on content first and\nforemost, making the process of publishing content on the Web easy. Users\nshould find the management of their content enjoyable and simple." +- "### 4. Stability" +- "If a user's site builds today, it should build tomorrow.\nBackwards-compatibility should be strongly preferred over breaking changes.\nBreaking changes should be made to support a strong practical goal, and\nbreaking changes should never be made to drive forward \"purity\" of the\ncodebase, or other changes purely to make the maintainers' lives easier.\nBreaking changes provide a significant amount of friction between upgrades\nand reduce the confidence of users in this software, and should thus be\navoided unless absolutely necessary." +- "Upon breaking changes, provide a clear path for users to upgrade." +- "### 5. Small & Extensible\n\nThe core of Jekyll should be simple and small, and extensibility should be\na first-class feature to provide added functionality from community\ncontributors. The core should be kept to features used by at least 90% of\nusers–everything else should be provided as a plugin. New features should\nbe shipped as plugins and focus should be put on creating extensible core\nAPI's to support rich plugins." diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@blog_frontmatter.md-3.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@blog_frontmatter.md-3.snap new file mode 100644 index 0000000..11eb241 --- /dev/null +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@blog_frontmatter.md-3.snap @@ -0,0 +1,6 @@ +--- +source: tests/text_splitter_snapshots.rs +expression: chunks +input_file: tests/inputs/markdown/blog_frontmatter.md +--- +- "---\nlayout: page\ntitle: Philosophy\npermalink: /philosophy/\n---\n\nJekyll offers a unique philosophy when approaching the problem of static\nsite generation. This core philosophy drives development and product\ndecisions. When a contributor, maintainer, or user asks herself what Jekyll\nis about, the following principles should come to mind:\n\n### 1. No Magic\n\nJekyll is not magic. A user should be able to understand the underlying\nprocesses that make up the Jekyll build without much reading. It should\ndo only what you ask it to and nothing more. When a user takes a certain\naction, the outcome should be easily understandable and focused.\n\n### 2. It \"Just Works\"\n\nThe out-of-the-box experience should be that it \"just works.\" Run\n`gem install jekyll` and it should build any Jekyll site that it's given.\nFeatures like auto-regeneration and settings like the markdown renderer\nshould represent sane defaults that work perfectly for the vast majority of\ncases. The burden of initial configuration should not be placed on the user.\n\n### 3. Content is King\n\nWhy is Jekyll so loved by content creators? It focuses on content first and\nforemost, making the process of publishing content on the Web easy. Users\nshould find the management of their content enjoyable and simple.\n\n### 4. Stability\n\nIf a user's site builds today, it should build tomorrow.\nBackwards-compatibility should be strongly preferred over breaking changes.\nBreaking changes should be made to support a strong practical goal, and\nbreaking changes should never be made to drive forward \"purity\" of the\ncodebase, or other changes purely to make the maintainers' lives easier.\nBreaking changes provide a significant amount of friction between upgrades\nand reduce the confidence of users in this software, and should thus be\navoided unless absolutely necessary.\nUpon breaking changes, provide a clear path for users to upgrade.\n\n### 5. Small & Extensible\n\nThe core of Jekyll should be simple and small, and extensibility should be\na first-class feature to provide added functionality from community\ncontributors. The core should be kept to features used by at least 90% of\nusers–everything else should be provided as a plugin. New features should\nbe shipped as plugins and focus should be put on creating extensible core\nAPI's to support rich plugins." diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@blog_frontmatter.md.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@blog_frontmatter.md.snap new file mode 100644 index 0000000..37cffe6 --- /dev/null +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@blog_frontmatter.md.snap @@ -0,0 +1,73 @@ +--- +source: tests/text_splitter_snapshots.rs +expression: chunks +input_file: tests/inputs/markdown/blog_frontmatter.md +--- +- "---\nlayout: page\ntitle: Philosophy" +- "permalink: /philosophy/\n---" +- Jekyll offers a unique philosophy when approaching the +- problem of static +- site generation. This core philosophy drives development and product +- decisions. +- "When a contributor, maintainer, or user asks" +- herself what Jekyll +- "is about, the following principles should come to mind" +- ":" +- "### 1. No Magic" +- Jekyll is not magic. +- A user should be able to understand the underlying +- processes that make up the Jekyll build without +- much reading. It should +- do only what you ask it to and nothing more +- ". When a user takes a certain" +- "action, the outcome should be easily understandable and" +- focused. +- "### 2. It \"Just Works\"" +- The out-of-the-box experience should +- "be that it \"just works.\" Run" +- "`gem install jekyll` and" +- it should build any Jekyll site that +- "it's given." +- Features like auto-regeneration and settings like the +- markdown renderer +- should represent sane defaults that work perfectly for +- the vast majority of +- cases. +- The burden of initial configuration should not be placed on +- the user. +- "### 3. Content is King" +- Why is Jekyll so loved by content creators +- "? It focuses on content first and" +- "foremost, making the process of publishing content on the" +- Web easy. Users +- should find the management of their content enjoyable and simple +- "." +- "### 4. Stability" +- "If a user's site builds today, it" +- should build tomorrow. +- Backwards-compatibility should be strongly preferred over +- breaking changes. +- Breaking changes should be made to support a strong practical +- "goal, and" +- "breaking changes should never be made to drive forward \"" +- "purity\" of the" +- "codebase, or other changes purely to make the" +- "maintainers' lives easier." +- Breaking changes provide a significant amount of friction between upgrades +- "and reduce the confidence of users in this software," +- "and should thus be\navoided unless absolutely necessary." +- "Upon breaking changes, provide a clear path for users" +- to upgrade. +- "### 5. Small & Extensible" +- The core of Jekyll should be simple and +- "small, and extensibility should be" +- a first-class feature to provide added functionality from +- community +- contributors. +- The core should be kept to features used by at +- least 90% of +- users–everything else should be provided as a +- plugin. New features should +- be shipped as plugins and focus should be put +- "on creating extensible core\nAPI'" +- s to support rich plugins. diff --git a/tests/snapshots/text_splitter_snapshots__markdown@blog_frontmatter.md-2.snap b/tests/snapshots/text_splitter_snapshots__markdown@blog_frontmatter.md-2.snap new file mode 100644 index 0000000..289fd6d --- /dev/null +++ b/tests/snapshots/text_splitter_snapshots__markdown@blog_frontmatter.md-2.snap @@ -0,0 +1,44 @@ +--- +source: tests/text_splitter_snapshots.rs +expression: chunks +input_file: tests/inputs/markdown/blog_frontmatter.md +--- +- "---\nlayout: page\ntitle: Philosophy\npermalink: /philosophy/\n---" +- "\n\n" +- "Jekyll offers a unique philosophy when approaching the problem of static\nsite generation. " +- "This core philosophy drives development and product\n" +- "decisions. When a contributor, maintainer, or user asks herself what Jekyll\n" +- "is about, the following principles should come to mind:\n\n" +- "### 1. No Magic\n\n" +- "Jekyll is not magic. A user should be able to understand the underlying\n" +- "processes that make up the Jekyll build without much reading. It should\n" +- "do only what you ask it to and nothing more. When a user takes a certain\n" +- "action, the outcome should be easily understandable and focused.\n\n" +- "### 2. It \"Just Works\"\n\n" +- "The out-of-the-box experience should be that it \"just works.\" Run\n" +- "`gem install jekyll`" +- " and it should build any Jekyll site that it's given.\n" +- "Features like auto-regeneration and settings like the markdown renderer\n" +- "should represent sane defaults that work perfectly for the vast majority of\n" +- "cases. The burden of initial configuration should not be placed on the user.\n\n" +- "### 3. Content is King\n\n" +- "Why is Jekyll so loved by content creators? It focuses on content first and\n" +- "foremost, making the process of publishing content on the Web easy. Users\n" +- "should find the management of their content enjoyable and simple.\n\n" +- "### 4. Stability\n\n" +- "If a user's site builds today, it should build tomorrow.\n" +- "Backwards-compatibility should be strongly preferred over breaking changes.\n" +- "Breaking changes should be made to support a strong practical goal, and\n" +- "breaking changes should never be made to drive forward \"purity\" of the\n" +- "codebase, or other changes purely to make the maintainers' lives easier.\n" +- "Breaking changes provide a significant amount of friction between upgrades\n" +- "and reduce the confidence of users in this software, and should thus be\n" +- "avoided unless absolutely necessary.\n" +- "Upon breaking changes, provide a clear path for users to upgrade.\n\n" +- "### 5. Small & Extensible\n\n" +- "The core of Jekyll should be simple and small, and extensibility should be\n" +- "a first-class feature to provide added functionality from community\n" +- "contributors. The core should be kept to features used by at least 90% of\n" +- "users–everything else should be provided as a plugin. New features should\n" +- "be shipped as plugins and focus should be put on creating extensible core\nAPI'" +- "s to support rich plugins.\n" diff --git a/tests/snapshots/text_splitter_snapshots__markdown@blog_frontmatter.md-3.snap b/tests/snapshots/text_splitter_snapshots__markdown@blog_frontmatter.md-3.snap new file mode 100644 index 0000000..ef3f8e8 --- /dev/null +++ b/tests/snapshots/text_splitter_snapshots__markdown@blog_frontmatter.md-3.snap @@ -0,0 +1,9 @@ +--- +source: tests/text_splitter_snapshots.rs +expression: chunks +input_file: tests/inputs/markdown/blog_frontmatter.md +--- +- "---\nlayout: page\ntitle: Philosophy\npermalink: /philosophy/\n---" +- "\n\nJekyll offers a unique philosophy when approaching the problem of static\nsite generation. This core philosophy drives development and product\ndecisions. When a contributor, maintainer, or user asks herself what Jekyll\nis about, the following principles should come to mind:\n\n### 1. No Magic\n\nJekyll is not magic. A user should be able to understand the underlying\nprocesses that make up the Jekyll build without much reading. It should\ndo only what you ask it to and nothing more. When a user takes a certain\naction, the outcome should be easily understandable and focused.\n\n### 2. It \"Just Works\"\n\nThe out-of-the-box experience should be that it \"just works.\" Run\n`gem install jekyll` and it should build any Jekyll site that it's given.\nFeatures like auto-regeneration and settings like the markdown renderer\nshould represent sane defaults that work perfectly for the vast majority of\ncases. The burden of initial configuration should not be placed on the user.\n\n" +- "### 3. Content is King\n\nWhy is Jekyll so loved by content creators? It focuses on content first and\nforemost, making the process of publishing content on the Web easy. Users\nshould find the management of their content enjoyable and simple.\n\n### 4. Stability\n\nIf a user's site builds today, it should build tomorrow.\nBackwards-compatibility should be strongly preferred over breaking changes.\nBreaking changes should be made to support a strong practical goal, and\nbreaking changes should never be made to drive forward \"purity\" of the\ncodebase, or other changes purely to make the maintainers' lives easier.\nBreaking changes provide a significant amount of friction between upgrades\nand reduce the confidence of users in this software, and should thus be\navoided unless absolutely necessary.\nUpon breaking changes, provide a clear path for users to upgrade.\n\n" +- "### 5. Small & Extensible\n\nThe core of Jekyll should be simple and small, and extensibility should be\na first-class feature to provide added functionality from community\ncontributors. The core should be kept to features used by at least 90% of\nusers–everything else should be provided as a plugin. New features should\nbe shipped as plugins and focus should be put on creating extensible core\nAPI's to support rich plugins.\n" diff --git a/tests/snapshots/text_splitter_snapshots__markdown@blog_frontmatter.md.snap b/tests/snapshots/text_splitter_snapshots__markdown@blog_frontmatter.md.snap new file mode 100644 index 0000000..c88f488 --- /dev/null +++ b/tests/snapshots/text_splitter_snapshots__markdown@blog_frontmatter.md.snap @@ -0,0 +1,297 @@ +--- +source: tests/text_splitter_snapshots.rs +expression: chunks +input_file: tests/inputs/markdown/blog_frontmatter.md +--- +- "---\n" +- "layout: " +- "page\n" +- "title: " +- Philosophy +- "\n" +- "permalink:" +- " /" +- philosophy +- "/\n---\n\n" +- "Jekyll " +- "offers a " +- "unique " +- philosophy +- " when " +- approachin +- "g the " +- problem of +- " static\n" +- "site " +- generation +- ". " +- "This core " +- philosophy +- " drives " +- developmen +- "t and " +- "product\n" +- decisions. +- " When a " +- contributo +- "r, " +- maintainer +- ", or user " +- "asks " +- "herself " +- "what " +- "Jekyll\n" +- "is about, " +- "the " +- "following " +- principles +- " should " +- "come to " +- "mind:\n\n" +- "### " +- "1. " +- "No Magic\n\n" +- "Jekyll is " +- not magic. +- " A user " +- "should be " +- "able to " +- understand +- " the " +- underlying +- "\n" +- "processes " +- "that make " +- "up the " +- "Jekyll " +- "build " +- "without " +- "much " +- "reading. " +- "It should\n" +- "do only " +- "what you " +- "ask it to " +- "and " +- "nothing " +- "more. " +- "When a " +- user takes +- " a certain" +- "\n" +- "action, " +- "the " +- "outcome " +- "should be " +- "easily " +- understand +- "able and " +- "focused.\n\n" +- "### 2. It " +- "\"Just " +- "Works\"\n\n" +- The out-of +- "-the-box " +- experience +- " should be" +- " that it \"" +- just works +- ".\" Run\n" +- "`gem " +- "install " +- "jekyll` " +- "and it " +- "should " +- "build any " +- "Jekyll " +- "site that " +- "it's given" +- ".\n" +- "Features " +- like auto- +- regenerati +- "on and " +- "settings " +- "like the " +- "markdown " +- "renderer\n" +- "should " +- "represent " +- "sane " +- "defaults " +- "that work " +- "perfectly " +- "for the " +- "vast " +- "majority " +- "of\n" +- "cases. " +- The burden +- " of " +- "initial " +- configurat +- ion should +- " not be " +- "placed on " +- "the user.\n" +- "\n" +- "### " +- "3. " +- Content is +- " King\n\n" +- "Why is " +- "Jekyll so " +- "loved by " +- "content " +- "creators? " +- It focuses +- " on " +- "content " +- "first and\n" +- "foremost, " +- making the +- " process " +- "of " +- publishing +- " content " +- on the Web +- " easy. " +- "Users\n" +- "should " +- "find the " +- management +- " of their " +- "content " +- "enjoyable " +- and simple +- ".\n\n" +- "### " +- "4. " +- "Stability\n" +- "\n" +- "If a " +- "user's " +- "site " +- "builds " +- "today, it " +- "should " +- "build " +- "tomorrow.\n" +- Backwards- +- compatibil +- ity should +- " be " +- "strongly " +- "preferred " +- "over " +- "breaking " +- "changes.\n" +- "Breaking " +- "changes " +- "should be " +- "made to " +- "support a " +- "strong " +- "practical " +- "goal, and\n" +- "breaking " +- "changes " +- "should " +- "never be " +- "made to " +- "drive " +- "forward \"" +- "purity\" of" +- " the\n" +- "codebase, " +- "or other " +- "changes " +- "purely to " +- "make the " +- maintainer +- "s' lives " +- "easier.\n" +- "Breaking " +- "changes " +- "provide a " +- significan +- "t amount " +- "of " +- "friction " +- "between " +- "upgrades\n" +- and reduce +- " the " +- confidence +- " of users " +- "in this " +- "software, " +- and should +- " thus be\n" +- "avoided " +- "unless " +- absolutely +- " necessary" +- ".\n" +- "Upon " +- "breaking " +- "changes, " +- "provide a " +- clear path +- " for users" +- " to " +- "upgrade.\n\n" +- "### " +- "5. " +- "Small & " +- Extensible +- "\n\n" +- "The core " +- "of Jekyll " +- "should be " +- simple and +- " small, " +- "and " +- extensibil +- ity should +- " be\n" +- a first- +- "class " +- feature to +- " provide " +- "added " +- functional +- "ity from " +- "community\n" +- contributo +- "rs. " +- "The core " +- "should be " +- "kept to " +- "features " +- used by at +- " least 90%" +- " of\n" +- users– +- everything +- " else " +- "should be " +- "provided " +- "as a " +- "plugin. " +- "New " +- "features " +- "should\n" +- be shipped +- " as " +- "plugins " +- "and focus " +- "should be " +- "put on " +- "creating " +- extensible +- " core\nAPI'" +- "s to " +- "support " +- "rich " +- "plugins.\n" diff --git a/tests/snapshots/text_splitter_snapshots__markdown_trim@blog_frontmatter.md-2.snap b/tests/snapshots/text_splitter_snapshots__markdown_trim@blog_frontmatter.md-2.snap new file mode 100644 index 0000000..476dd45 --- /dev/null +++ b/tests/snapshots/text_splitter_snapshots__markdown_trim@blog_frontmatter.md-2.snap @@ -0,0 +1,43 @@ +--- +source: tests/text_splitter_snapshots.rs +expression: chunks +input_file: tests/inputs/markdown/blog_frontmatter.md +--- +- "---\nlayout: page\ntitle: Philosophy\npermalink: /philosophy/\n---" +- "Jekyll offers a unique philosophy when approaching the problem of static\nsite generation." +- This core philosophy drives development and product +- "decisions. When a contributor, maintainer, or user asks herself what Jekyll" +- "is about, the following principles should come to mind:" +- "### 1. No Magic" +- Jekyll is not magic. A user should be able to understand the underlying +- processes that make up the Jekyll build without much reading. It should +- do only what you ask it to and nothing more. When a user takes a certain +- "action, the outcome should be easily understandable and focused." +- "### 2. It \"Just Works\"" +- "The out-of-the-box experience should be that it \"just works.\" Run" +- "`gem install jekyll`" +- "and it should build any Jekyll site that it's given." +- Features like auto-regeneration and settings like the markdown renderer +- should represent sane defaults that work perfectly for the vast majority of +- cases. The burden of initial configuration should not be placed on the user. +- "### 3. Content is King" +- Why is Jekyll so loved by content creators? It focuses on content first and +- "foremost, making the process of publishing content on the Web easy. Users" +- should find the management of their content enjoyable and simple. +- "### 4. Stability" +- "If a user's site builds today, it should build tomorrow." +- Backwards-compatibility should be strongly preferred over breaking changes. +- "Breaking changes should be made to support a strong practical goal, and" +- "breaking changes should never be made to drive forward \"purity\" of the" +- "codebase, or other changes purely to make the maintainers' lives easier." +- Breaking changes provide a significant amount of friction between upgrades +- "and reduce the confidence of users in this software, and should thus be" +- avoided unless absolutely necessary. +- "Upon breaking changes, provide a clear path for users to upgrade." +- "### 5. Small & Extensible" +- "The core of Jekyll should be simple and small, and extensibility should be" +- a first-class feature to provide added functionality from community +- contributors. The core should be kept to features used by at least 90% of +- users–everything else should be provided as a plugin. New features should +- "be shipped as plugins and focus should be put on creating extensible core\nAPI'" +- s to support rich plugins. diff --git a/tests/snapshots/text_splitter_snapshots__markdown_trim@blog_frontmatter.md-3.snap b/tests/snapshots/text_splitter_snapshots__markdown_trim@blog_frontmatter.md-3.snap new file mode 100644 index 0000000..f7265cf --- /dev/null +++ b/tests/snapshots/text_splitter_snapshots__markdown_trim@blog_frontmatter.md-3.snap @@ -0,0 +1,9 @@ +--- +source: tests/text_splitter_snapshots.rs +expression: chunks +input_file: tests/inputs/markdown/blog_frontmatter.md +--- +- "---\nlayout: page\ntitle: Philosophy\npermalink: /philosophy/\n---" +- "Jekyll offers a unique philosophy when approaching the problem of static\nsite generation. This core philosophy drives development and product\ndecisions. When a contributor, maintainer, or user asks herself what Jekyll\nis about, the following principles should come to mind:\n\n### 1. No Magic\n\nJekyll is not magic. A user should be able to understand the underlying\nprocesses that make up the Jekyll build without much reading. It should\ndo only what you ask it to and nothing more. When a user takes a certain\naction, the outcome should be easily understandable and focused.\n\n### 2. It \"Just Works\"\n\nThe out-of-the-box experience should be that it \"just works.\" Run\n`gem install jekyll` and it should build any Jekyll site that it's given.\nFeatures like auto-regeneration and settings like the markdown renderer\nshould represent sane defaults that work perfectly for the vast majority of\ncases. The burden of initial configuration should not be placed on the user." +- "### 3. Content is King\n\nWhy is Jekyll so loved by content creators? It focuses on content first and\nforemost, making the process of publishing content on the Web easy. Users\nshould find the management of their content enjoyable and simple.\n\n### 4. Stability\n\nIf a user's site builds today, it should build tomorrow.\nBackwards-compatibility should be strongly preferred over breaking changes.\nBreaking changes should be made to support a strong practical goal, and\nbreaking changes should never be made to drive forward \"purity\" of the\ncodebase, or other changes purely to make the maintainers' lives easier.\nBreaking changes provide a significant amount of friction between upgrades\nand reduce the confidence of users in this software, and should thus be\navoided unless absolutely necessary.\nUpon breaking changes, provide a clear path for users to upgrade." +- "### 5. Small & Extensible\n\nThe core of Jekyll should be simple and small, and extensibility should be\na first-class feature to provide added functionality from community\ncontributors. The core should be kept to features used by at least 90% of\nusers–everything else should be provided as a plugin. New features should\nbe shipped as plugins and focus should be put on creating extensible core\nAPI's to support rich plugins." diff --git a/tests/snapshots/text_splitter_snapshots__markdown_trim@blog_frontmatter.md.snap b/tests/snapshots/text_splitter_snapshots__markdown_trim@blog_frontmatter.md.snap new file mode 100644 index 0000000..5db2fe4 --- /dev/null +++ b/tests/snapshots/text_splitter_snapshots__markdown_trim@blog_frontmatter.md.snap @@ -0,0 +1,285 @@ +--- +source: tests/text_splitter_snapshots.rs +expression: chunks +input_file: tests/inputs/markdown/blog_frontmatter.md +--- +- "---" +- "layout:" +- page +- "title:" +- Philosophy +- "permalink:" +- / +- philosophy +- "/\n---" +- Jekyll +- offers a +- unique +- philosophy +- when +- approachin +- g the +- problem of +- static +- site +- generation +- "." +- This core +- philosophy +- drives +- developmen +- t and +- product +- decisions. +- When a +- contributo +- "r," +- maintainer +- ", or user" +- asks +- herself +- what +- Jekyll +- "is about," +- the +- following +- principles +- should +- come to +- "mind:" +- "###" +- "1." +- No Magic +- Jekyll is +- not magic. +- A user +- should be +- able to +- understand +- the +- underlying +- processes +- that make +- up the +- Jekyll +- build +- without +- much +- reading. +- It should +- do only +- what you +- ask it to +- and +- nothing +- more. +- When a +- user takes +- a certain +- "action," +- the +- outcome +- should be +- easily +- understand +- able and +- focused. +- "### 2. It" +- "\"Just" +- "Works\"" +- The out-of +- "-the-box" +- experience +- should be +- "that it \"" +- just works +- ".\" Run" +- "`gem" +- install +- "jekyll`" +- and it +- should +- build any +- Jekyll +- site that +- "it's given" +- "." +- Features +- like auto- +- regenerati +- on and +- settings +- like the +- markdown +- renderer +- should +- represent +- sane +- defaults +- that work +- perfectly +- for the +- vast +- majority +- of +- cases. +- The burden +- of initial +- configurat +- ion should +- not be +- placed on +- the user. +- "###" +- "3." +- Content is +- King +- Why is +- Jekyll so +- loved by +- content +- creators? +- It focuses +- on content +- first and +- "foremost," +- making the +- process of +- publishing +- content on +- the Web +- easy. +- Users +- should +- find the +- management +- of their +- content +- enjoyable +- and simple +- "." +- "###" +- "4." +- Stability +- If a +- "user's" +- site +- builds +- "today, it" +- should +- build +- tomorrow. +- Backwards- +- compatibil +- ity should +- be +- strongly +- preferred +- over +- breaking +- changes. +- Breaking +- changes +- should be +- made to +- support a +- strong +- practical +- "goal, and" +- breaking +- changes +- should +- never be +- made to +- drive +- "forward \"" +- "purity\" of" +- the +- "codebase," +- or other +- changes +- purely to +- make the +- maintainer +- "s' lives" +- easier. +- Breaking +- changes +- provide a +- significan +- t amount +- of +- friction +- between +- upgrades +- and reduce +- the +- confidence +- of users +- in this +- "software," +- and should +- thus be +- avoided +- unless +- absolutely +- necessary. +- Upon +- breaking +- "changes," +- provide a +- clear path +- for users +- to upgrade +- "." +- "###" +- "5." +- Small & +- Extensible +- The core +- of Jekyll +- should be +- simple and +- "small, and" +- extensibil +- ity should +- be +- a first- +- class +- feature to +- provide +- added +- functional +- ity from +- community +- contributo +- rs. +- The core +- should be +- kept to +- features +- used by at +- least 90% +- of +- users– +- everything +- else +- should be +- provided +- as a +- plugin. +- New +- features +- should +- be shipped +- as plugins +- and focus +- should be +- put on +- creating +- extensible +- "core\nAPI'" +- s to +- support +- rich +- plugins. diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@blog_frontmatter.md-2.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@blog_frontmatter.md-2.snap new file mode 100644 index 0000000..2d8f904 --- /dev/null +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@blog_frontmatter.md-2.snap @@ -0,0 +1,14 @@ +--- +source: tests/text_splitter_snapshots.rs +expression: chunks +input_file: tests/inputs/markdown/blog_frontmatter.md +--- +- "---\nlayout: page\ntitle: Philosophy\npermalink: /philosophy/\n---" +- "\n\nJekyll offers a unique philosophy when approaching the problem of static\nsite generation. This core philosophy drives development and product\ndecisions. When a contributor, maintainer, or user asks herself what Jekyll\nis about, the following principles should come to mind:\n\n" +- "### 1. No Magic\n\nJekyll is not magic. A user should be able to understand the underlying\nprocesses that make up the Jekyll build without much reading. It should\ndo only what you ask it to and nothing more. When a user takes a certain\naction, the outcome should be easily understandable and focused.\n\n" +- "### 2. It \"Just Works\"\n\nThe out-of-the-box experience should be that it \"just works.\" Run\n`gem install jekyll` and it should build any Jekyll site that it's given.\nFeatures like auto-regeneration and settings like the markdown renderer\nshould represent sane defaults that work perfectly for the vast majority of\ncases. The burden of initial configuration should not be placed on the user.\n\n" +- "### 3. Content is King\n\nWhy is Jekyll so loved by content creators? It focuses on content first and\nforemost, making the process of publishing content on the Web easy. Users\nshould find the management of their content enjoyable and simple.\n\n" +- "### 4. Stability\n\n" +- "If a user's site builds today, it should build tomorrow.\nBackwards-compatibility should be strongly preferred over breaking changes.\nBreaking changes should be made to support a strong practical goal, and\nbreaking changes should never be made to drive forward \"purity\" of the\ncodebase, or other changes purely to make the maintainers' lives easier.\nBreaking changes provide a significant amount of friction between upgrades\nand reduce the confidence of users in this software, and should thus be\n" +- "avoided unless absolutely necessary.\nUpon breaking changes, provide a clear path for users to upgrade.\n\n" +- "### 5. Small & Extensible\n\nThe core of Jekyll should be simple and small, and extensibility should be\na first-class feature to provide added functionality from community\ncontributors. The core should be kept to features used by at least 90% of\nusers–everything else should be provided as a plugin. New features should\nbe shipped as plugins and focus should be put on creating extensible core\nAPI's to support rich plugins.\n" diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@blog_frontmatter.md-3.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@blog_frontmatter.md-3.snap new file mode 100644 index 0000000..9933f0e --- /dev/null +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@blog_frontmatter.md-3.snap @@ -0,0 +1,6 @@ +--- +source: tests/text_splitter_snapshots.rs +expression: chunks +input_file: tests/inputs/markdown/blog_frontmatter.md +--- +- "---\nlayout: page\ntitle: Philosophy\npermalink: /philosophy/\n---\n\nJekyll offers a unique philosophy when approaching the problem of static\nsite generation. This core philosophy drives development and product\ndecisions. When a contributor, maintainer, or user asks herself what Jekyll\nis about, the following principles should come to mind:\n\n### 1. No Magic\n\nJekyll is not magic. A user should be able to understand the underlying\nprocesses that make up the Jekyll build without much reading. It should\ndo only what you ask it to and nothing more. When a user takes a certain\naction, the outcome should be easily understandable and focused.\n\n### 2. It \"Just Works\"\n\nThe out-of-the-box experience should be that it \"just works.\" Run\n`gem install jekyll` and it should build any Jekyll site that it's given.\nFeatures like auto-regeneration and settings like the markdown renderer\nshould represent sane defaults that work perfectly for the vast majority of\ncases. The burden of initial configuration should not be placed on the user.\n\n### 3. Content is King\n\nWhy is Jekyll so loved by content creators? It focuses on content first and\nforemost, making the process of publishing content on the Web easy. Users\nshould find the management of their content enjoyable and simple.\n\n### 4. Stability\n\nIf a user's site builds today, it should build tomorrow.\nBackwards-compatibility should be strongly preferred over breaking changes.\nBreaking changes should be made to support a strong practical goal, and\nbreaking changes should never be made to drive forward \"purity\" of the\ncodebase, or other changes purely to make the maintainers' lives easier.\nBreaking changes provide a significant amount of friction between upgrades\nand reduce the confidence of users in this software, and should thus be\navoided unless absolutely necessary.\nUpon breaking changes, provide a clear path for users to upgrade.\n\n### 5. Small & Extensible\n\nThe core of Jekyll should be simple and small, and extensibility should be\na first-class feature to provide added functionality from community\ncontributors. The core should be kept to features used by at least 90% of\nusers–everything else should be provided as a plugin. New features should\nbe shipped as plugins and focus should be put on creating extensible core\nAPI's to support rich plugins.\n" diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@blog_frontmatter.md.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@blog_frontmatter.md.snap new file mode 100644 index 0000000..5f13c35 --- /dev/null +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@blog_frontmatter.md.snap @@ -0,0 +1,75 @@ +--- +source: tests/text_splitter_snapshots.rs +expression: chunks +input_file: tests/inputs/markdown/blog_frontmatter.md +--- +- "---\nlayout: page\ntitle: Philosophy\n" +- "permalink: /philosophy/\n---\n\n" +- Jekyll offers a unique philosophy when approaching the problem +- " of static\n" +- site generation. This core philosophy drives development and product +- "\n" +- "decisions. " +- "When a contributor, maintainer, or user asks" +- " herself what Jekyll\n" +- "is about, the following principles should come to mind" +- ":\n\n" +- "### 1. No Magic\n\n" +- "Jekyll is not magic. " +- "A user should be able to understand the underlying\n" +- processes that make up the Jekyll build without +- " much reading. It should\n" +- do only what you ask it to and nothing more +- ". When a user takes a certain\n" +- "action, the outcome should be easily understandable and focused" +- ".\n\n" +- "### 2. It \"Just Works\"\n\n" +- The out-of-the-box experience should be that it +- " \"just works.\" Run\n" +- "`gem install jekyll` and it should build" +- " any Jekyll site that it's given.\n" +- Features like auto-regeneration and settings like the markdown +- " renderer\n" +- should represent sane defaults that work perfectly for the vast +- " majority of\n" +- "cases. " +- The burden of initial configuration should not be placed on +- " the user.\n\n" +- "### 3. Content is King\n\n" +- Why is Jekyll so loved by content creators? +- " It focuses on content first and\n" +- "foremost, making the process of publishing content on" +- " the Web easy. Users\n" +- should find the management of their content enjoyable and simple +- ".\n\n" +- "### 4. Stability\n\n" +- "If a user's site builds today, it should" +- " build tomorrow.\n" +- Backwards-compatibility should be strongly preferred over breaking +- " changes.\n" +- Breaking changes should be made to support a strong practical +- " goal, and\n" +- "breaking changes should never be made to drive forward \"" +- "purity\" of the\n" +- "codebase, or other changes purely to make the" +- " maintainers' lives easier.\n" +- Breaking changes provide a significant amount of friction between upgrades +- "\n" +- "and reduce the confidence of users in this software," +- " and should thus be\n" +- "avoided unless absolutely necessary.\n" +- "Upon breaking changes, provide a clear path for users" +- " to upgrade.\n\n" +- "### 5. Small & Extensible\n\n" +- The core of Jekyll should be simple and small +- ", and extensibility should be\n" +- a first-class feature to provide added functionality from community +- "\n" +- "contributors. " +- The core should be kept to features used by at +- " least 90% of\n" +- users–everything else should be provided as a plugin +- ". New features should\n" +- be shipped as plugins and focus should be put on +- " creating extensible core\nAPI'" +- "s to support rich plugins.\n" diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@blog_frontmatter.md-2.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@blog_frontmatter.md-2.snap new file mode 100644 index 0000000..d502a9d --- /dev/null +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@blog_frontmatter.md-2.snap @@ -0,0 +1,14 @@ +--- +source: tests/text_splitter_snapshots.rs +expression: chunks +input_file: tests/inputs/markdown/blog_frontmatter.md +--- +- "---\nlayout: page\ntitle: Philosophy\npermalink: /philosophy/\n---" +- "Jekyll offers a unique philosophy when approaching the problem of static\nsite generation. This core philosophy drives development and product\ndecisions. When a contributor, maintainer, or user asks herself what Jekyll\nis about, the following principles should come to mind:" +- "### 1. No Magic\n\nJekyll is not magic. A user should be able to understand the underlying\nprocesses that make up the Jekyll build without much reading. It should\ndo only what you ask it to and nothing more. When a user takes a certain\naction, the outcome should be easily understandable and focused." +- "### 2. It \"Just Works\"\n\nThe out-of-the-box experience should be that it \"just works.\" Run\n`gem install jekyll` and it should build any Jekyll site that it's given.\nFeatures like auto-regeneration and settings like the markdown renderer\nshould represent sane defaults that work perfectly for the vast majority of\ncases. The burden of initial configuration should not be placed on the user." +- "### 3. Content is King\n\nWhy is Jekyll so loved by content creators? It focuses on content first and\nforemost, making the process of publishing content on the Web easy. Users\nshould find the management of their content enjoyable and simple." +- "### 4. Stability" +- "If a user's site builds today, it should build tomorrow.\nBackwards-compatibility should be strongly preferred over breaking changes.\nBreaking changes should be made to support a strong practical goal, and\nbreaking changes should never be made to drive forward \"purity\" of the\ncodebase, or other changes purely to make the maintainers' lives easier.\nBreaking changes provide a significant amount of friction between upgrades\nand reduce the confidence of users in this software, and should thus be" +- "avoided unless absolutely necessary.\nUpon breaking changes, provide a clear path for users to upgrade." +- "### 5. Small & Extensible\n\nThe core of Jekyll should be simple and small, and extensibility should be\na first-class feature to provide added functionality from community\ncontributors. The core should be kept to features used by at least 90% of\nusers–everything else should be provided as a plugin. New features should\nbe shipped as plugins and focus should be put on creating extensible core\nAPI's to support rich plugins." diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@blog_frontmatter.md-3.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@blog_frontmatter.md-3.snap new file mode 100644 index 0000000..11eb241 --- /dev/null +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@blog_frontmatter.md-3.snap @@ -0,0 +1,6 @@ +--- +source: tests/text_splitter_snapshots.rs +expression: chunks +input_file: tests/inputs/markdown/blog_frontmatter.md +--- +- "---\nlayout: page\ntitle: Philosophy\npermalink: /philosophy/\n---\n\nJekyll offers a unique philosophy when approaching the problem of static\nsite generation. This core philosophy drives development and product\ndecisions. When a contributor, maintainer, or user asks herself what Jekyll\nis about, the following principles should come to mind:\n\n### 1. No Magic\n\nJekyll is not magic. A user should be able to understand the underlying\nprocesses that make up the Jekyll build without much reading. It should\ndo only what you ask it to and nothing more. When a user takes a certain\naction, the outcome should be easily understandable and focused.\n\n### 2. It \"Just Works\"\n\nThe out-of-the-box experience should be that it \"just works.\" Run\n`gem install jekyll` and it should build any Jekyll site that it's given.\nFeatures like auto-regeneration and settings like the markdown renderer\nshould represent sane defaults that work perfectly for the vast majority of\ncases. The burden of initial configuration should not be placed on the user.\n\n### 3. Content is King\n\nWhy is Jekyll so loved by content creators? It focuses on content first and\nforemost, making the process of publishing content on the Web easy. Users\nshould find the management of their content enjoyable and simple.\n\n### 4. Stability\n\nIf a user's site builds today, it should build tomorrow.\nBackwards-compatibility should be strongly preferred over breaking changes.\nBreaking changes should be made to support a strong practical goal, and\nbreaking changes should never be made to drive forward \"purity\" of the\ncodebase, or other changes purely to make the maintainers' lives easier.\nBreaking changes provide a significant amount of friction between upgrades\nand reduce the confidence of users in this software, and should thus be\navoided unless absolutely necessary.\nUpon breaking changes, provide a clear path for users to upgrade.\n\n### 5. Small & Extensible\n\nThe core of Jekyll should be simple and small, and extensibility should be\na first-class feature to provide added functionality from community\ncontributors. The core should be kept to features used by at least 90% of\nusers–everything else should be provided as a plugin. New features should\nbe shipped as plugins and focus should be put on creating extensible core\nAPI's to support rich plugins." diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@blog_frontmatter.md.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@blog_frontmatter.md.snap new file mode 100644 index 0000000..4211716 --- /dev/null +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@blog_frontmatter.md.snap @@ -0,0 +1,72 @@ +--- +source: tests/text_splitter_snapshots.rs +expression: chunks +input_file: tests/inputs/markdown/blog_frontmatter.md +--- +- "---\nlayout: page\ntitle: Philosophy" +- "permalink: /philosophy/\n---" +- Jekyll offers a unique philosophy when approaching the problem +- of static +- site generation. This core philosophy drives development and product +- decisions. +- "When a contributor, maintainer, or user asks" +- herself what Jekyll +- "is about, the following principles should come to mind" +- ":" +- "### 1. No Magic" +- Jekyll is not magic. +- A user should be able to understand the underlying +- processes that make up the Jekyll build without +- much reading. It should +- do only what you ask it to and nothing more +- ". When a user takes a certain" +- "action, the outcome should be easily understandable and focused" +- "." +- "### 2. It \"Just Works\"" +- The out-of-the-box experience should be that it +- "\"just works.\" Run" +- "`gem install jekyll` and it should build" +- "any Jekyll site that it's given." +- Features like auto-regeneration and settings like the markdown +- renderer +- should represent sane defaults that work perfectly for the vast +- majority of +- cases. +- The burden of initial configuration should not be placed on +- the user. +- "### 3. Content is King" +- Why is Jekyll so loved by content creators? +- It focuses on content first and +- "foremost, making the process of publishing content on" +- the Web easy. Users +- should find the management of their content enjoyable and simple +- "." +- "### 4. Stability" +- "If a user's site builds today, it should" +- build tomorrow. +- Backwards-compatibility should be strongly preferred over breaking +- changes. +- Breaking changes should be made to support a strong practical +- "goal, and" +- "breaking changes should never be made to drive forward \"" +- "purity\" of the" +- "codebase, or other changes purely to make the" +- "maintainers' lives easier." +- Breaking changes provide a significant amount of friction between upgrades +- "and reduce the confidence of users in this software," +- and should thus be +- avoided unless absolutely necessary. +- "Upon breaking changes, provide a clear path for users" +- to upgrade. +- "### 5. Small & Extensible" +- The core of Jekyll should be simple and small +- ", and extensibility should be" +- a first-class feature to provide added functionality from community +- contributors. +- The core should be kept to features used by at +- least 90% of +- users–everything else should be provided as a plugin +- ". New features should" +- be shipped as plugins and focus should be put on +- "creating extensible core\nAPI'" +- s to support rich plugins.