diff --git a/src/unstable_markdown.rs b/src/unstable_markdown.rs index 2badd60b..ebed450b 100644 --- a/src/unstable_markdown.rs +++ b/src/unstable_markdown.rs @@ -137,6 +137,31 @@ where } } +/// Heading levels in markdown. +/// Declared from H6 down to H1 so that the derived `Ord` ranks H1 as the highest level. +#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd)] +enum HeadingLevel { + H6, + H5, + H4, + H3, + H2, + H1, +} + +impl From<pulldown_cmark::HeadingLevel> for HeadingLevel { + fn from(value: pulldown_cmark::HeadingLevel) -> Self { + match value { + pulldown_cmark::HeadingLevel::H1 => HeadingLevel::H1, + pulldown_cmark::HeadingLevel::H2 => HeadingLevel::H2, + pulldown_cmark::HeadingLevel::H3 => HeadingLevel::H3, + pulldown_cmark::HeadingLevel::H4 => HeadingLevel::H4, + pulldown_cmark::HeadingLevel::H5 => HeadingLevel::H5, + pulldown_cmark::HeadingLevel::H6 => HeadingLevel::H6, + } + } +} + /// Different semantic levels that text can be split by. /// Each level provides a method of splitting text into chunks of a given level /// as well as a fallback in case a given fallback is too large. 
@@ -166,6 +191,8 @@ enum SemanticLevel { Block, /// thematic break/horizontal rule Rule, + /// Heading levels in markdown + Heading(HeadingLevel), } impl Level for SemanticLevel { @@ -181,6 +208,8 @@ impl Level for SemanticLevel { | SemanticLevel::Block | SemanticLevel::Rule => SemanticSplitPosition::Own, SemanticLevel::InlineElement(p) | SemanticLevel::Item(p) => *p, + // Attach it to the next text + SemanticLevel::Heading(_) => SemanticSplitPosition::Next, } } } @@ -219,7 +248,7 @@ impl SemanticSplit for Markdown { ) | Event::Code(_) | Event::HardBreak - | Event::Html(_) => Some(( + | Event::InlineHtml(_) => Some(( SemanticLevel::InlineElement(SemanticSplitPosition::Own), range, )), @@ -240,18 +269,21 @@ impl SemanticSplit for Markdown { Event::Start(Tag::TableRow | Tag::Item) => { Some((SemanticLevel::Item(SemanticSplitPosition::Own), range)) } - Event::Start( + Event::Html(_) + | Event::Start( Tag::List(_) | Tag::Table(_) | Tag::BlockQuote | Tag::CodeBlock(_) + | Tag::HtmlBlock | Tag::FootnoteDefinition(_), ) => Some((SemanticLevel::Block, range)), Event::Rule => Some((SemanticLevel::Rule, range)), + Event::Start(Tag::Heading { level, .. }) => { + Some((SemanticLevel::Heading(level.into()), range)) + } // End events are identical to start, so no need to grab them. - Event::Start(Tag::Heading { .. } | Tag::HtmlBlock | Tag::MetadataBlock(_)) - | Event::InlineHtml(_) - | Event::End(_) => None, + Event::Start(Tag::MetadataBlock(_)) | Event::End(_) => None, }) .collect::<Vec<_>>(); @@ -303,6 +335,7 @@ impl SemanticSplit for Markdown { | SemanticLevel::Item(_) | SemanticLevel::Paragraph | SemanticLevel::Block + | SemanticLevel::Heading(_) | SemanticLevel::Rule => split_str_by_separator( text, self.ranges_after_offset(offset, semantic_level) @@ -655,20 +688,39 @@ mod tests { } #[test] - fn test_html() { - let markdown = Markdown::new("<div>Some text</div>"); + fn test_inline_html() { + let markdown = Markdown::new("<span>Some text</span>"); assert_eq!( - vec![&( - SemanticLevel::InlineElement(SemanticSplitPosition::Own), - 0..20 - ),], + vec![ + &(SemanticLevel::Paragraph, 0..22), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 0..6 + ), + &(SemanticLevel::Text, 6..15), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 15..22 + ), + ], markdown.ranges().collect::<Vec<_>>() ); + assert_eq!(SemanticLevel::Paragraph, markdown.max_level()); + } + + #[test] + fn test_html() { + let markdown = Markdown::new("
Some text
"); + assert_eq!( - SemanticLevel::InlineElement(SemanticSplitPosition::Own), - markdown.max_level() + vec![ + &(SemanticLevel::Block, 0..20), + &(SemanticLevel::Block, 0..20) + ], + markdown.ranges().collect::>() ); + assert_eq!(SemanticLevel::Block, markdown.max_level()); } #[test] @@ -797,4 +849,30 @@ mod tests { ); assert_eq!(SemanticLevel::Rule, markdown.max_level()); } + + #[test] + fn test_heading() { + for (index, (heading, level)) in [ + ("#", HeadingLevel::H1), + ("##", HeadingLevel::H2), + ("###", HeadingLevel::H3), + ("####", HeadingLevel::H4), + ("#####", HeadingLevel::H5), + ("######", HeadingLevel::H6), + ] + .into_iter() + .enumerate() + { + let markdown = Markdown::new(&format!("{heading} Heading")); + + assert_eq!( + vec![ + &(SemanticLevel::Heading(level), 0..9 + index), + &(SemanticLevel::Text, 2 + index..9 + index) + ], + markdown.ranges().collect::>() + ); + assert_eq!(SemanticLevel::Heading(level), markdown.max_level()); + } + } } diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md-2.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md-2.snap index ee485518..20ec8bd7 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md-2.snap @@ -3,14 +3,12 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/github_flavored.md --- -- "# Headers\n\n```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n```" -- "\n\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n\n------\n" -- "\n# Emphasis\n\n" -- "```\nEmphasis, aka 
italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n" -- "*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n```\n\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\n" -- "Strikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n\n------\n" -- "\n# Lists\n\n" -- "```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. " +- "# Headers\n\n```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n```\n\n" +- "# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n\n------\n\n" +- "# Emphasis\n\n```\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. 
~~Scratch this.~~\n\n**This is bold text**\n\n" +- "__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n```\n\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\n" +- "Strikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n\n------\n\n" +- "# Lists\n\n```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. " - "Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n" - "* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n+ Create a list by starting a line with `+`, `-`, or `*`\n" - "+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n```\n\n" @@ -18,34 +16,30 @@ input_file: tests/inputs/markdown/github_flavored.md - "⋅⋅⋅You can have properly indented paragraphs within list items. 
Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n" - "⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n" - "1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n" -- "+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n\n------\n" -- "\n# Task lists\n\n" -- "```\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [x] this is a complete item\n- [ ] this is an incomplete item\n" -- "```\n\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [ ] this is a complete item\n- [ ] this is an incomplete item\n\n" -- "------\n\n# Ignoring Markdown formatting\n\nYou can tell GitHub to ignore (or escape) Markdown formatting by using \\ before the Markdown character.\n\n```\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n```\n\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n\n------\n" -- "\n# Links\n\n" -- "```\n[I'm an inline-style 
link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n" +- "+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n\n------\n\n" +- "# Task lists\n\n```\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [x] this is a complete item\n" +- "- [ ] this is an incomplete item\n```\n\n" +- "- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [ ] this is a complete item\n- [ ] this is an incomplete item\n\n" +- "------\n\n# Ignoring Markdown formatting\n\nYou can tell GitHub to ignore (or escape) Markdown formatting by using \\ before the Markdown character.\n\n```\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n```\n\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n\n------\n\n" +- "# Links\n\n```\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n" - "[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\n" - "example.com (but 
not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n```\n\n" - "[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n" - "[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\n" - "URLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n" -- "\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n\n------\n" -- "\n# Images\n\n" -- "```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n" +- "\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n\n------\n\n" +- "# Images\n\n```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n" - "[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![" - "Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, 
Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n" - "[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n```\n\nHere's our logo (hover to see the title text):\n\n" - "Inline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n" - "\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n" - "![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\n" -- "With a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n\n------\n" -- "\n# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n" -- "```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n```" -- "\n\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n\n------\n" -- "\n# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\nInline `code` has `back-ticks around` it.\n\n" -- "```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n [Obsolete(\"...\")]\n class Program : IInterface\n {\n public static List JustDoIt(int count)\n {\n Console.WriteLine($\"Hello {Name}!\");\n" -- " return new List(new int[] { 1, 
2, 3 })\n }\n }\n}\n```\n\n" +- "With a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n\n------\n\n" +- "# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n" +- "[^second]: Footnote text.\n```\n\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n\n------\n\n" +- "# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\nInline `code` has `back-ticks around` it.\n\n```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n [Obsolete(\"...\")]\n class Program : IInterface\n {\n" +- " public static List JustDoIt(int count)\n {\n Console.WriteLine($\"Hello {Name}!\");\n return new List(new int[] { 1, 2, 3 })\n }\n }\n}\n```\n\n" - "```css\n@font-face {\n font-family: Chunkfive; src: url('Chunkfive.otf');\n}\n\nbody, .usertext {\n color: #F0F0F0; background: #600;\n font-family: Chunkfive, sans;\n}\n\n@import url(print.css);\n@media print {\n a[href^=http]::after {\n" - " content: attr(href)\n }\n}\n```\n\n" - "```javascript\nfunction $initHighlight(block, cls) {\n try {\n if (cls.search(/\\bno\\-highlight\\b/) != -1)\n return process(block, true, 0x0F) +\n ` class=\"${cls}\"`;\n } catch (e) {\n /* handle exception */\n }\n" @@ -53,9 +47,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "```php\nrequire_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\nabstract class URI extends BaseURI 
implements Factory\n{\n abstract function test();\n\n public static $st1 = 1;\n const ME = \"Yo\";\n var $list = NULL;\n private $var;\n\n /**\n * Returns a URI\n *\n" - " * @return URI\n */\n static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? $uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check\n" - " if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }\n\n $this->var = 0 - self::$st;\n $this->list = list(Array(\"1\"=> 2, 2=>self::ME, 3 => \\Location\\Web\\URI::class));\n\n return [\n" -- " 'uri' => $uri,\n 'value' => null,\n ];\n }\n}\n\necho URI::ME . URI::$st1;\n\n__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------\n" -- "\n# Tables\n\n" -- "```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\n" +- " 'uri' => $uri,\n 'value' => null,\n ];\n }\n}\n\necho URI::ME . URI::$st1;\n\n__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------\n\n" +- "# Tables\n\n```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\n" - "There must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. 
You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n" - "| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n" - "| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n" @@ -64,17 +57,15 @@ input_file: tests/inputs/markdown/github_flavored.md - "\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n" - "| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n" - "| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n" -- "| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------\n" -- "\n# Blockquotes\n\n" -- "```\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be 
quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n" +- "| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------\n\n" +- "# Blockquotes\n\n```\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n" - "> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n```\n\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n" - "> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n" -- "> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n\n------\n" -- "\n# Inline HTML\n\n```\n
\n
Definition list
\n
Is something people use sometimes.
\n\n
Markdown in HTML
\n
Does *not* work **very** well. Use HTML tags.
\n
\n```" -- "\n\n
\n
Definition list
\n
Is something people use sometimes.
\n\n
Markdown in HTML
\n
Does *not* work **very** well. Use HTML tags.
\n
\n\n------\n" -- "\n# Horizontal Rules\n\n```\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n```\n\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n\n------\n" -- "\n# YouTube Videos\n\n" -- "```\n\n" +- "> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n\n------\n\n" +- "# Inline HTML\n\n```\n
\n
Definition list
\n
Is something people use sometimes.
\n\n
Markdown in HTML
\n
Does *not* work **very** well. Use HTML tags.
\n
\n```\n\n" +- "
\n
Definition list
\n
Is something people use sometimes.
\n\n
Markdown in HTML
\n
Does *not* work **very** well. Use HTML tags.
\n
\n\n------\n\n" +- "# Horizontal Rules\n\n```\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n```\n\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n\n------\n\n" +- "# YouTube Videos\n\n```\n
\n" - "\"IMAGE\n\n```\n\n" - "\n" - "\"IMAGE\n\n\n" diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md-3.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md-3.snap index e1017c0d..a7094a61 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md-3.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md-3.snap @@ -3,11 +3,11 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/github_flavored.md --- -- "# Headers\n\n```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n```\n\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n\n------\n\n# Emphasis\n\n```\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n```\n\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n\n------\n" -- "\n# Lists\n\n```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. 
And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n```\n\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. 
Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n\n------\n\n# Task lists\n\n```\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [x] this is a complete item\n- [ ] this is an incomplete item\n```\n\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [ ] this is a complete item\n- [ ] this is an incomplete item\n\n------\n\n# Ignoring Markdown formatting\n\nYou can tell GitHub to ignore (or escape) Markdown formatting by using \\ before the Markdown character.\n\n```\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n```\n\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n\n------\n" -- "\n# Links\n\n```\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive 
reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n```\n\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n\n------\n" -- "\n# Images\n\n```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n```\n\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 
2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n\n------\n\n# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n```\n\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n\n------\n" -- "\n# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\nInline `code` has `back-ticks around` it.\n\n```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n [Obsolete(\"...\")]\n class Program : IInterface\n {\n public static List JustDoIt(int count)\n {\n Console.WriteLine($\"Hello {Name}!\");\n return new List(new int[] { 1, 2, 3 })\n }\n }\n}\n```\n\n```css\n@font-face {\n font-family: Chunkfive; src: url('Chunkfive.otf');\n}\n\nbody, .usertext {\n color: #F0F0F0; background: #600;\n font-family: Chunkfive, sans;\n}\n\n@import url(print.css);\n@media print {\n a[href^=http]::after {\n content: attr(href)\n }\n}\n```\n\n```javascript\nfunction $initHighlight(block, cls) {\n try {\n if (cls.search(/\\bno\\-highlight\\b/) != -1)\n return process(block, true, 0x0F) +\n ` class=\"${cls}\"`;\n } catch (e) {\n /* handle exception */\n }\n for (var i = 0 / 2; i < classes.length; i++) {\n if 
(checkCondition(classes[i]) === undefined)\n console.log('undefined');\n }\n}\n\nexport $initHighlight;\n```\n\n```php\nrequire_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory\n{\n abstract function test();\n\n public static $st1 = 1;\n const ME = \"Yo\";\n var $list = NULL;\n private $var;\n\n /**\n * Returns a URI\n *\n * @return URI\n */\n static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? $uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check\n if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }\n\n $this->var = 0 - self::$st;\n $this->list = list(Array(\"1\"=> 2, 2=>self::ME, 3 => \\Location\\Web\\URI::class));\n\n return [\n 'uri' => $uri,\n 'value' => null,\n ];\n }\n}\n\necho URI::ME . URI::$st1;\n\n__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------\n" -- "\n# Tables\n\n```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. 
You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n```\n\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. 
You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------\n" -- "\n# Blockquotes\n\n```\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n```\n\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n\n------\n\n# Inline HTML\n\n```\n
\n
Definition list
\n
Is something people use sometimes.
\n\n
Markdown in HTML
\n
Does *not* work **very** well. Use HTML tags.
\n
\n```\n\n
\n
Definition list
\n
Is something people use sometimes.
\n\n
Markdown in HTML
\n
Does *not* work **very** well. Use HTML tags.
\n
\n\n------\n\n# Horizontal Rules\n\n```\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n```\n\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n\n------\n\n# YouTube Videos\n\n```\n\n\"IMAGE\n\n```\n\n\n\"IMAGE\n\n\n```\n[![IMAGE ALT TEXT HERE](http://img.youtube.com/vi/YOUTUBE_VIDEO_ID_HERE/0.jpg)](http://www.youtube.com/watch?v=YOUTUBE_VIDEO_ID_HERE)\n```\n\n[![IMAGE ALT TEXT HERE](https://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/YouTube_logo_2015.svg/1200px-YouTube_logo_2015.svg.png)](https://www.youtube.com/watch?v=ciawICBvQoE)\n" +- "# Headers\n\n```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n```\n\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n\n------\n\n# Emphasis\n\n```\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n```\n\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n\n------\n\n" +- "# Lists\n\n```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. 
And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n```\n\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. 
Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n\n------\n\n# Task lists\n\n```\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [x] this is a complete item\n- [ ] this is an incomplete item\n```\n\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [ ] this is a complete item\n- [ ] this is an incomplete item\n\n------\n\n# Ignoring Markdown formatting\n\nYou can tell GitHub to ignore (or escape) Markdown formatting by using \\ before the Markdown character.\n\n```\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n```\n\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n\n------\n\n" +- "# Links\n\n```\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive 
reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n```\n\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n\n------\n\n" +- "# Images\n\n```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n```\n\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 
2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n\n------\n\n# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n```\n\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n\n------\n\n" +- "# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\nInline `code` has `back-ticks around` it.\n\n```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n [Obsolete(\"...\")]\n class Program : IInterface\n {\n public static List JustDoIt(int count)\n {\n Console.WriteLine($\"Hello {Name}!\");\n return new List(new int[] { 1, 2, 3 })\n }\n }\n}\n```\n\n```css\n@font-face {\n font-family: Chunkfive; src: url('Chunkfive.otf');\n}\n\nbody, .usertext {\n color: #F0F0F0; background: #600;\n font-family: Chunkfive, sans;\n}\n\n@import url(print.css);\n@media print {\n a[href^=http]::after {\n content: attr(href)\n }\n}\n```\n\n```javascript\nfunction $initHighlight(block, cls) {\n try {\n if (cls.search(/\\bno\\-highlight\\b/) != -1)\n return process(block, true, 0x0F) +\n ` class=\"${cls}\"`;\n } catch (e) {\n /* handle exception */\n }\n for (var i = 0 / 2; i < classes.length; i++) {\n if 
(checkCondition(classes[i]) === undefined)\n console.log('undefined');\n }\n}\n\nexport $initHighlight;\n```\n\n```php\nrequire_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory\n{\n abstract function test();\n\n public static $st1 = 1;\n const ME = \"Yo\";\n var $list = NULL;\n private $var;\n\n /**\n * Returns a URI\n *\n * @return URI\n */\n static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? $uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check\n if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }\n\n $this->var = 0 - self::$st;\n $this->list = list(Array(\"1\"=> 2, 2=>self::ME, 3 => \\Location\\Web\\URI::class));\n\n return [\n 'uri' => $uri,\n 'value' => null,\n ];\n }\n}\n\necho URI::ME . URI::$st1;\n\n__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------\n\n" +- "# Tables\n\n```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. 
You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n```\n\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. 
You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------\n\n" +- "# Blockquotes\n\n```\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n```\n\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n\n------\n\n# Inline HTML\n\n```\n
\n
Definition list
\n
Is something people use sometimes.
\n\n
Markdown in HTML
\n
Does *not* work **very** well. Use HTML tags.
\n
\n```\n\n
\n
Definition list
\n
Is something people use sometimes.
\n\n
Markdown in HTML
\n
Does *not* work **very** well. Use HTML tags.
\n
\n\n------\n\n# Horizontal Rules\n\n```\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n```\n\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n\n------\n\n# YouTube Videos\n\n```\n\n\"IMAGE\n\n```\n\n\n\"IMAGE\n\n\n```\n[![IMAGE ALT TEXT HERE](http://img.youtube.com/vi/YOUTUBE_VIDEO_ID_HERE/0.jpg)](http://www.youtube.com/watch?v=YOUTUBE_VIDEO_ID_HERE)\n```\n\n[![IMAGE ALT TEXT HERE](https://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/YouTube_logo_2015.svg/1200px-YouTube_logo_2015.svg.png)](https://www.youtube.com/watch?v=ciawICBvQoE)\n" diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md.snap index 2170b4ec..f17cdb27 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md.snap @@ -3,8 +3,7 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/github_flavored.md --- -- "# Headers\n\n" -- "```\n" +- "# Headers\n\n```\n" - "# h1 Heading 8-)\n" - "## h2 Heading\n" - "### h3 Heading\n" @@ -15,20 +14,19 @@ input_file: tests/inputs/markdown/github_flavored.md - "underline-ish style:\n\n" - "Alt-H1\n======\n\n" - "Alt-H2\n------\n" -- "```\n\n# " -- "h1 Heading 8-)\n## " -- "h2 Heading\n### " -- "h3 Heading\n#### " -- "h4 Heading\n##### " -- "h5 Heading\n###### " -- "h6 Heading\n\n" +- "```\n\n" +- "# h1 Heading 8-)\n" +- "## h2 Heading\n" +- "### h3 Heading\n" +- "#### h4 Heading\n" +- "##### h5 Heading\n" +- "###### h6 Heading\n\n" - "Alternatively, for H1 and H2, an " - "underline-ish style:\n\n" - "Alt-H1\n======\n\n" - "Alt-H2\n------\n\n" -- "------\n" -- "\n# Emphasis\n\n" -- "```\n" +- "------\n\n" +- "# Emphasis\n\n```\n" - "Emphasis, aka italics, with *" - "asterisks* or " - "_underscores_.\n\n" @@ 
-59,10 +57,9 @@ input_file: tests/inputs/markdown/github_flavored.md - "*This is italic text*\n\n" - "_This is italic text_\n\n" - "~~Strikethrough~~\n\n" -- "------\n" -- "\n# Lists\n\n" -- "```\n1. First ordered list item\n" -- "2. Another item\n" +- "------\n\n" +- "# Lists\n\n```\n1. " +- "First ordered list item\n2. Another item\n" - "⋅⋅* Unordered sub-list.\n" - "1. " - "Actual numbers don't matter, just that " @@ -153,9 +150,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "nisl aliquet\n " - "- Nulla volutpat " - "aliquam velit\n+ Very easy!\n\n" -- "------\n" -- "\n# Task lists\n\n" -- "```\n" +- "------\n\n" +- "# Task lists\n\n```\n" - "- [x] Finish my changes\n" - "- [ ] Push my commits to " - "GitHub\n" @@ -181,8 +177,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "unordered or ordered list supported)\n" - "- [ ] this is a complete item\n" - "- [ ] this is an incomplete item\n\n" -- "------\n" -- "\n# Ignoring Markdown formatting\n\n" +- "------\n\n" +- "# Ignoring Markdown formatting\n\n" - You can tell GitHub to ignore ( - "or escape) Markdown formatting by using \\ " - "before the Markdown character.\n\n" @@ -193,9 +189,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "Let's rename \\*our-new" - "-project\\* to \\*our-old" - "-project\\*.\n\n" -- "------\n" -- "\n# Links\n\n" -- "```\n" +- "------\n\n" +- "# Links\n\n```\n" - "[I'm an inline-style link" - "](https://" - "www.google.com)\n\n" @@ -264,9 +259,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "slashdot.org\n" - "[link text itself]: http://" - "www.reddit.com\n\n" -- "------\n" -- "\n# Images\n\n" -- "```\n" +- "------\n\n" +- "# Images\n\n```\n" - "Here's our logo (hover to see " - "the title text):\n\nInline-style:\n" - "![" @@ -336,9 +330,8 @@ input_file: tests/inputs/markdown/github_flavored.md - octodex.github.com - "/images/dojocat.jpg " - "\"The Dojocat\"\n\n" -- "------\n" -- "\n# " -- "[Footnotes](https://" +- "------\n\n" +- "# 
[Footnotes](https://" - github.com/markdown-it - "/markdown-it-footnote)\n\n" - "```\n" @@ -361,8 +354,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "[^first]: Footnote **can " - "have markup**\n\n and multiple paragraphs.\n\n" - "[^second]: Footnote text.\n\n" -- "------\n" -- "\n# Code and Syntax Highlighting\n\n" +- "------\n\n" +- "# Code and Syntax Highlighting\n\n" - "```\n" - "Inline `code` has `back-" - "ticks around` it.\n```\n\n" @@ -457,9 +450,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "URI::$st1;\n\n" - "__halt_compiler () ; datahere\n" - "datahere\ndatahere */\ndatahere\n" -- "```\n\n------\n" -- "\n# Tables\n\n" -- "```\n" +- "```\n\n------\n\n" +- "# Tables\n\n```\n" - "Colons can be used to align columns.\n\n" - "| Tables | Are | Cool |\n" - "| ---------" @@ -568,9 +560,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "| --- | --- |\n" - "| Backtick | ` |\n" - "| Pipe | \\| |\n\n" -- "------\n" -- "\n# Blockquotes\n\n" -- "```\n" +- "------\n\n" +- "# Blockquotes\n\n```\n" - "> Blockquotes are very handy in email " - "to emulate reply text.\n" - "> This line is part of the same quote.\n\n" @@ -605,9 +596,9 @@ input_file: tests/inputs/markdown/github_flavored.md - signs right next to each other... - "\n> > " - "> ...or with spaces between arrows.\n\n" -- "------\n" -- "\n# Inline HTML\n\n" -- "```\n
\n" +- "------\n\n" +- "# Inline HTML\n\n```\n" +- "
\n" - "
Definition list\n" - "
Is something people use sometimes." @@ -631,17 +622,16 @@ input_file: tests/inputs/markdown/github_flavored.md - Use HTML tags.
\n" - "
\n\n" -- "------\n" -- "\n# Horizontal Rules\n\n" -- "```\nThree or more...\n\n" -- "---\n\nHyphens\n\n***\n\n" +- "------\n\n" +- "# Horizontal Rules\n\n```\n" +- "Three or more...\n\n---\n\n" +- "Hyphens\n\n***\n\n" - "Asterisks\n\n___\n\nUnderscores\n" - "```\n\nThree or more...\n\n" - "---\n\nHyphens\n\n***\n" - "\nAsterisks\n\n___\n\nUnderscores\n\n" -- "------\n" -- "\n# YouTube Videos\n\n" -- "```\n" +- "------\n\n" +- "# YouTube Videos\n\n```\n" - "\n
  • Main
  • \n" - "
  • Basics
  • \n
  • Syntax
  • \n" -- "
  • License
  • \n
  • Dingus
  • \n\n" -- "\n\nGetting the Gist of Markdown's Formatting Syntax\n------------------------------------------------\n\n" +- "
  • License
  • \n
  • Dingus
  • \n\n\n\n" +- "Getting the Gist of Markdown's Formatting Syntax\n------------------------------------------------\n\n" - "This page offers a brief overview of what it's like to use Markdown.\nThe [syntax page] [s] provides complete, detailed documentation for\nevery feature, but Markdown should be very easy to pick up simply by\nlooking at a few examples of it in action. The examples on this page\nare written in a before/after style, showing example syntax and the\nHTML output produced by Markdown.\n\n" - "It's also helpful to simply try Markdown out; the [Dingus] [d] is a\nweb application that allows you type your own Markdown-formatted text\nand translate it to XHTML.\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL] [src].\n" -- "\n [s]: /projects/markdown/syntax \"Markdown Syntax\"\n [d]: /projects/markdown/dingus \"Markdown Dingus\"\n [src]: /projects/markdown/basics.text\n\n\n## Paragraphs, Headers, Blockquotes ##\n\n" -- "A paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like\na blank line -- a line containing nothing but spaces or tabs is\nconsidered blank.) Normal paragraphs should not be indented with\nspaces or tabs.\n\n" +- "\n [s]: /projects/markdown/syntax \"Markdown Syntax\"\n [d]: /projects/markdown/dingus \"Markdown Dingus\"\n [src]: /projects/markdown/basics.text\n\n\n" +- "## Paragraphs, Headers, Blockquotes ##\n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like\na blank line -- a line containing nothing but spaces or tabs is\nconsidered blank.) Normal paragraphs should not be indented with\nspaces or tabs.\n\n" - "Markdown offers two styles of headers: *Setext* and *atx*.\nSetext-style headers for `

    ` and `

    ` are created by\n\"underlining\" with equal signs (`=`) and hyphens (`-`), respectively.\nTo create an atx-style header, you put 1-6 hash marks (`#`) at the\n" - "beginning of the line -- the number of hashes equals the resulting\nHTML header level.\n\nBlockquotes are indicated using email-style '`>`' angle brackets.\n\nMarkdown:\n\n " - "A First Level Header\n ====================\n\n A Second Level Header\n ---------------------\n\n Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.\n\n The quick brown fox jumped over the lazy\n dog's back.\n\n ### Header 3\n\n" - " > This is a blockquote.\n >\n > This is the second paragraph in the blockquote.\n >\n > ## This is an H2 in a blockquote\n\n\nOutput:\n\n " - "

    A First Level Header

    \n\n

    A Second Level Header

    \n\n

    Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.

    \n\n

    The quick brown fox jumped over the lazy\n dog's back.

    \n\n

    Header 3

    \n\n
    \n" -- "

    This is a blockquote.

    \n\n

    This is the second paragraph in the blockquote.

    \n\n

    This is an H2 in a blockquote

    \n
    \n\n\n\n### Phrase Emphasis ###\n\nMarkdown uses asterisks and underscores to indicate spans of emphasis.\n\nMarkdown:\n\n " -- "Some of these words *are emphasized*.\n Some of these words _are emphasized also_.\n\n Use two asterisks for **strong emphasis**.\n Or, if you prefer, __use two underscores instead__.\n\nOutput:\n\n " -- "

    Some of these words are emphasized.\n Some of these words are emphasized also.

    \n\n

    Use two asterisks for strong emphasis.\n Or, if you prefer, use two underscores instead.

    \n" -- "\n\n\n## Lists ##\n\nUnordered (bulleted) lists use asterisks, pluses, and hyphens (`*`,\n`+`, and `-`) as list markers. These three markers are\ninterchangable; this:\n\n * Candy.\n * Gum.\n * Booze.\n\nthis:\n\n + Candy.\n + Gum.\n + Booze.\n\nand this:\n\n - Candy.\n - Gum.\n - Booze.\n" +- "

    This is a blockquote.

    \n\n

    This is the second paragraph in the blockquote.

    \n\n

    This is an H2 in a blockquote

    \n \n\n\n\n" +- "### Phrase Emphasis ###\n\nMarkdown uses asterisks and underscores to indicate spans of emphasis.\n\nMarkdown:\n\n Some of these words *are emphasized*.\n Some of these words _are emphasized also_.\n\n Use two asterisks for **strong emphasis**.\n Or, if you prefer, __use two underscores instead__.\n\nOutput:\n\n " +- "

    Some of these words are emphasized.\n Some of these words are emphasized also.

    \n\n

    Use two asterisks for strong emphasis.\n Or, if you prefer, use two underscores instead.

    \n\n\n\n" +- "## Lists ##\n\nUnordered (bulleted) lists use asterisks, pluses, and hyphens (`*`,\n`+`, and `-`) as list markers. These three markers are\ninterchangable; this:\n\n * Candy.\n * Gum.\n * Booze.\n\nthis:\n\n + Candy.\n + Gum.\n + Booze.\n\nand this:\n\n - Candy.\n - Gum.\n - Booze.\n" - "\nall produce the same output:\n\n
      \n
    • Candy.
    • \n
    • Gum.
    • \n
    • Booze.
    • \n
    \n\nOrdered (numbered) lists use regular numbers, followed by periods, as\nlist markers:\n\n 1. Red\n 2. Green\n 3. Blue\n\nOutput:\n\n " - "
      \n
    1. Red
    2. \n
    3. Green
    4. \n
    5. Blue
    6. \n
    \n\nIf you put blank lines between items, you'll get `

    ` tags for the\nlist item text. You can create multi-paragraph list items by indenting\nthe paragraphs by 4 spaces or 1 tab:\n\n " -- "* A list item.\n\n With multiple paragraphs.\n\n * Another item in the list.\n\nOutput:\n\n

      \n
    • A list item.

      \n

      With multiple paragraphs.

    • \n
    • Another item in the list.

    • \n
    \n" -- "\n\n\n### Links ###\n\nMarkdown supports two styles for creating links: *inline* and\n*reference*. With both styles, you use square brackets to delimit the\ntext you want to turn into a link.\n\nInline-style links use parentheses immediately after the link text.\nFor example:\n\n This is an [example link](http://example.com/).\n\nOutput:\n\n " +- "* A list item.\n\n With multiple paragraphs.\n\n * Another item in the list.\n\nOutput:\n\n
      \n
    • A list item.

      \n

      With multiple paragraphs.

    • \n
    • Another item in the list.

    • \n
    \n\n\n\n" +- "### Links ###\n\nMarkdown supports two styles for creating links: *inline* and\n*reference*. With both styles, you use square brackets to delimit the\ntext you want to turn into a link.\n\nInline-style links use parentheses immediately after the link text.\nFor example:\n\n This is an [example link](http://example.com/).\n\nOutput:\n\n " - "

    This is an \n example link.

    \n\nOptionally, you may include a title attribute in the parentheses:\n\n This is an [example link](http://example.com/ \"With a Title\").\n\nOutput:\n\n " - "

    This is an \n example link.

    \n\nReference-style links allow you to refer to your links by names, which\nyou define elsewhere in your document:\n\n " - "I get 10 times more traffic from [Google][1] than from\n [Yahoo][2] or [MSN][3].\n\n [1]: http://google.com/ \"Google\"\n [2]: http://search.yahoo.com/ \"Yahoo Search\"\n [3]: http://search.msn.com/ \"MSN Search\"\n\nOutput:\n\n " - "

    I get 10 times more traffic from Google than from Yahoo or MSN.

    \n\nThe title attribute is optional. Link names may contain letters,\nnumbers and spaces, but are *not* case sensitive:\n\n I start my morning with a cup of coffee and\n [The New York Times][NY Times].\n\n [ny times]: http://www.nytimes.com/\n\nOutput:\n\n " -- "

    I start my morning with a cup of coffee and\n The New York Times.

    \n\n\n### Images ###\n\nImage syntax is very much like link syntax.\n\nInline (titles are optional):\n\n ![alt text](/path/to/img.jpg \"Title\")\n\nReference-style:\n\n " -- "![alt text][id]\n\n [id]: /path/to/img.jpg \"Title\"\n\nBoth of the above examples produce the same output:\n\n \"alt\n" -- "\n\n\n### Code ###\n\nIn a regular paragraph, you can create code span by wrapping text in\nbacktick quotes. Any ampersands (`&`) and angle brackets (`<` or\n`>`) will automatically be translated into HTML entities. This makes\nit easy to use Markdown to write about HTML example code:\n\n " +- "

    I start my morning with a cup of coffee and\n The New York Times.

    \n\n\n" +- "### Images ###\n\nImage syntax is very much like link syntax.\n\nInline (titles are optional):\n\n ![alt text](/path/to/img.jpg \"Title\")\n\nReference-style:\n\n ![alt text][id]\n\n [id]: /path/to/img.jpg \"Title\"\n\nBoth of the above examples produce the same output:\n\n " +- "\"alt\n\n\n\n" +- "### Code ###\n\nIn a regular paragraph, you can create code span by wrapping text in\nbacktick quotes. Any ampersands (`&`) and angle brackets (`<` or\n`>`) will automatically be translated into HTML entities. This makes\nit easy to use Markdown to write about HTML example code:\n\n " - "I strongly recommend against using any `` tags.\n\n I wish SmartyPants used named entities like `—`\n instead of decimal-encoded entities like `—`.\n\nOutput:\n\n " - "

    I strongly recommend against using any\n <blink> tags.

    \n\n

    I wish SmartyPants used named entities like\n &mdash; instead of decimal-encoded\n entities like &#8212;.

    \n" - "\n\nTo specify an entire block of pre-formatted code, indent every line of\nthe block by 4 spaces or 1 tab. Just like with code spans, `&`, `<`,\nand `>` characters will be escaped automatically.\n\nMarkdown:\n\n " diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@markdown_basics.md-3.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@markdown_basics.md-3.snap index 6553ed1c..2d2586c1 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@markdown_basics.md-3.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@markdown_basics.md-3.snap @@ -3,7 +3,8 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/markdown_basics.md --- -- "Markdown: Basics\n================\n\n\n\n\nGetting the Gist of Markdown's Formatting Syntax\n------------------------------------------------\n\nThis page offers a brief overview of what it's like to use Markdown.\nThe [syntax page] [s] provides complete, detailed documentation for\nevery feature, but Markdown should be very easy to pick up simply by\nlooking at a few examples of it in action. The examples on this page\nare written in a before/after style, showing example syntax and the\nHTML output produced by Markdown.\n\nIt's also helpful to simply try Markdown out; the [Dingus] [d] is a\nweb application that allows you type your own Markdown-formatted text\nand translate it to XHTML.\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL] [src].\n\n [s]: /projects/markdown/syntax \"Markdown Syntax\"\n [d]: /projects/markdown/dingus \"Markdown Dingus\"\n [src]: /projects/markdown/basics.text\n\n\n## Paragraphs, Headers, Blockquotes ##\n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. 
(A blank line is any line that looks like\na blank line -- a line containing nothing but spaces or tabs is\nconsidered blank.) Normal paragraphs should not be indented with\nspaces or tabs.\n\nMarkdown offers two styles of headers: *Setext* and *atx*.\nSetext-style headers for `

    ` and `

    ` are created by\n\"underlining\" with equal signs (`=`) and hyphens (`-`), respectively.\nTo create an atx-style header, you put 1-6 hash marks (`#`) at the\nbeginning of the line -- the number of hashes equals the resulting\nHTML header level.\n\nBlockquotes are indicated using email-style '`>`' angle brackets.\n\nMarkdown:\n\n A First Level Header\n ====================\n\n A Second Level Header\n ---------------------\n\n Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.\n\n The quick brown fox jumped over the lazy\n dog's back.\n\n ### Header 3\n\n > This is a blockquote.\n >\n > This is the second paragraph in the blockquote.\n >\n > ## This is an H2 in a blockquote\n\n\nOutput:\n\n " -- "

    A First Level Header

    \n\n

    A Second Level Header

    \n\n

    Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.

    \n\n

    The quick brown fox jumped over the lazy\n dog's back.

    \n\n

    Header 3

    \n\n
    \n

    This is a blockquote.

    \n\n

    This is the second paragraph in the blockquote.

    \n\n

    This is an H2 in a blockquote

    \n
    \n\n\n\n### Phrase Emphasis ###\n\nMarkdown uses asterisks and underscores to indicate spans of emphasis.\n\nMarkdown:\n\n Some of these words *are emphasized*.\n Some of these words _are emphasized also_.\n\n Use two asterisks for **strong emphasis**.\n Or, if you prefer, __use two underscores instead__.\n\nOutput:\n\n

    Some of these words are emphasized.\n Some of these words are emphasized also.

    \n\n

    Use two asterisks for strong emphasis.\n Or, if you prefer, use two underscores instead.

    \n\n\n\n## Lists ##\n\nUnordered (bulleted) lists use asterisks, pluses, and hyphens (`*`,\n`+`, and `-`) as list markers. These three markers are\ninterchangable; this:\n\n * Candy.\n * Gum.\n * Booze.\n\nthis:\n\n + Candy.\n + Gum.\n + Booze.\n\nand this:\n\n - Candy.\n - Gum.\n - Booze.\n\nall produce the same output:\n\n
      \n
    • Candy.
    • \n
    • Gum.
    • \n
    • Booze.
    • \n
    \n\nOrdered (numbered) lists use regular numbers, followed by periods, as\nlist markers:\n\n 1. Red\n 2. Green\n 3. Blue\n\nOutput:\n\n
      \n
    1. Red
    2. \n
    3. Green
    4. \n
    5. Blue
    6. \n
    \n\nIf you put blank lines between items, you'll get `

    ` tags for the\nlist item text. You can create multi-paragraph list items by indenting\nthe paragraphs by 4 spaces or 1 tab:\n\n * A list item.\n\n With multiple paragraphs.\n\n * Another item in the list.\n\nOutput:\n\n

      \n
    • A list item.

      \n

      With multiple paragraphs.

    • \n
    • Another item in the list.

    • \n
    \n\n\n\n### Links ###\n\nMarkdown supports two styles for creating links: *inline* and\n*reference*. With both styles, you use square brackets to delimit the\ntext you want to turn into a link.\n\nInline-style links use parentheses immediately after the link text.\nFor example:\n\n This is an [example link](http://example.com/).\n\nOutput:\n\n

    This is an \n example link.

    \n\nOptionally, you may include a title attribute in the parentheses:\n\n This is an [example link](http://example.com/ \"With a Title\").\n\nOutput:\n\n

    This is an \n example link.

    \n\nReference-style links allow you to refer to your links by names, which\nyou define elsewhere in your document:\n\n I get 10 times more traffic from [Google][1] than from\n [Yahoo][2] or [MSN][3].\n\n [1]: http://google.com/ \"Google\"\n [2]: http://search.yahoo.com/ \"Yahoo Search\"\n [3]: http://search.msn.com/ \"MSN Search\"\n\nOutput:\n\n " -- "

    I get 10 times more traffic from Google than from Yahoo or MSN.

    \n\nThe title attribute is optional. Link names may contain letters,\nnumbers and spaces, but are *not* case sensitive:\n\n I start my morning with a cup of coffee and\n [The New York Times][NY Times].\n\n [ny times]: http://www.nytimes.com/\n\nOutput:\n\n

    I start my morning with a cup of coffee and\n The New York Times.

    \n\n\n### Images ###\n\nImage syntax is very much like link syntax.\n\nInline (titles are optional):\n\n ![alt text](/path/to/img.jpg \"Title\")\n\nReference-style:\n\n ![alt text][id]\n\n [id]: /path/to/img.jpg \"Title\"\n\nBoth of the above examples produce the same output:\n\n \"alt\n\n\n\n### Code ###\n\nIn a regular paragraph, you can create code span by wrapping text in\nbacktick quotes. Any ampersands (`&`) and angle brackets (`<` or\n`>`) will automatically be translated into HTML entities. This makes\nit easy to use Markdown to write about HTML example code:\n\n I strongly recommend against using any `` tags.\n\n I wish SmartyPants used named entities like `—`\n instead of decimal-encoded entities like `—`.\n\nOutput:\n\n

    I strongly recommend against using any\n <blink> tags.

    \n\n

    I wish SmartyPants used named entities like\n &mdash; instead of decimal-encoded\n entities like &#8212;.

    \n\n\nTo specify an entire block of pre-formatted code, indent every line of\nthe block by 4 spaces or 1 tab. Just like with code spans, `&`, `<`,\nand `>` characters will be escaped automatically.\n\nMarkdown:\n\n If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:\n\n
    \n

    For example.

    \n
    \n\nOutput:\n\n

    If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:

    \n\n
    <blockquote>\n        <p>For example.</p>\n    </blockquote>\n    
    \n" +- "Markdown: Basics\n================\n\n\n\n\nGetting the Gist of Markdown's Formatting Syntax\n------------------------------------------------\n\nThis page offers a brief overview of what it's like to use Markdown.\nThe [syntax page] [s] provides complete, detailed documentation for\nevery feature, but Markdown should be very easy to pick up simply by\nlooking at a few examples of it in action. The examples on this page\nare written in a before/after style, showing example syntax and the\nHTML output produced by Markdown.\n\nIt's also helpful to simply try Markdown out; the [Dingus] [d] is a\nweb application that allows you type your own Markdown-formatted text\nand translate it to XHTML.\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL] [src].\n\n [s]: /projects/markdown/syntax \"Markdown Syntax\"\n [d]: /projects/markdown/dingus \"Markdown Dingus\"\n [src]: /projects/markdown/basics.text\n\n\n## Paragraphs, Headers, Blockquotes ##\n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like\na blank line -- a line containing nothing but spaces or tabs is\nconsidered blank.) Normal paragraphs should not be indented with\nspaces or tabs.\n\nMarkdown offers two styles of headers: *Setext* and *atx*.\nSetext-style headers for `

    ` and `

    ` are created by\n\"underlining\" with equal signs (`=`) and hyphens (`-`), respectively.\nTo create an atx-style header, you put 1-6 hash marks (`#`) at the\nbeginning of the line -- the number of hashes equals the resulting\nHTML header level.\n\nBlockquotes are indicated using email-style '`>`' angle brackets.\n\nMarkdown:\n\n A First Level Header\n ====================\n\n A Second Level Header\n ---------------------\n\n Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.\n\n The quick brown fox jumped over the lazy\n dog's back.\n\n ### Header 3\n\n > This is a blockquote.\n >\n > This is the second paragraph in the blockquote.\n >\n > ## This is an H2 in a blockquote\n\n\nOutput:\n\n

    A First Level Header

    \n\n

    A Second Level Header

    \n\n

    Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.

    \n\n

    The quick brown fox jumped over the lazy\n dog's back.

    \n\n

    Header 3

    \n\n
    \n

    This is a blockquote.

    \n\n

    This is the second paragraph in the blockquote.

    \n\n" +- "

    This is an H2 in a blockquote

    \n
    \n\n\n\n### Phrase Emphasis ###\n\nMarkdown uses asterisks and underscores to indicate spans of emphasis.\n\nMarkdown:\n\n Some of these words *are emphasized*.\n Some of these words _are emphasized also_.\n\n Use two asterisks for **strong emphasis**.\n Or, if you prefer, __use two underscores instead__.\n\nOutput:\n\n

    Some of these words are emphasized.\n Some of these words are emphasized also.

    \n\n

    Use two asterisks for strong emphasis.\n Or, if you prefer, use two underscores instead.

    \n\n\n\n" +- "## Lists ##\n\nUnordered (bulleted) lists use asterisks, pluses, and hyphens (`*`,\n`+`, and `-`) as list markers. These three markers are\ninterchangable; this:\n\n * Candy.\n * Gum.\n * Booze.\n\nthis:\n\n + Candy.\n + Gum.\n + Booze.\n\nand this:\n\n - Candy.\n - Gum.\n - Booze.\n\nall produce the same output:\n\n
      \n
    • Candy.
    • \n
    • Gum.
    • \n
    • Booze.
    • \n
    \n\nOrdered (numbered) lists use regular numbers, followed by periods, as\nlist markers:\n\n 1. Red\n 2. Green\n 3. Blue\n\nOutput:\n\n
      \n
    1. Red
    2. \n
    3. Green
    4. \n
    5. Blue
    6. \n
    \n\nIf you put blank lines between items, you'll get `

    ` tags for the\nlist item text. You can create multi-paragraph list items by indenting\nthe paragraphs by 4 spaces or 1 tab:\n\n * A list item.\n\n With multiple paragraphs.\n\n * Another item in the list.\n\nOutput:\n\n

      \n
    • A list item.

      \n

      With multiple paragraphs.

    • \n
    • Another item in the list.

    • \n
    \n\n\n\n### Links ###\n\nMarkdown supports two styles for creating links: *inline* and\n*reference*. With both styles, you use square brackets to delimit the\ntext you want to turn into a link.\n\nInline-style links use parentheses immediately after the link text.\nFor example:\n\n This is an [example link](http://example.com/).\n\nOutput:\n\n

    This is an \n example link.

    \n\nOptionally, you may include a title attribute in the parentheses:\n\n This is an [example link](http://example.com/ \"With a Title\").\n\nOutput:\n\n

    This is an \n example link.

    \n\nReference-style links allow you to refer to your links by names, which\nyou define elsewhere in your document:\n\n I get 10 times more traffic from [Google][1] than from\n [Yahoo][2] or [MSN][3].\n\n [1]: http://google.com/ \"Google\"\n [2]: http://search.yahoo.com/ \"Yahoo Search\"\n [3]: http://search.msn.com/ \"MSN Search\"\n\nOutput:\n\n

    I get 10 times more traffic from Google than from Yahoo or MSN.

    \n\nThe title attribute is optional. Link names may contain letters,\nnumbers and spaces, but are *not* case sensitive:\n\n I start my morning with a cup of coffee and\n [The New York Times][NY Times].\n\n [ny times]: http://www.nytimes.com/\n\nOutput:\n\n

    I start my morning with a cup of coffee and\n The New York Times.

    \n\n\n" +- "### Images ###\n\nImage syntax is very much like link syntax.\n\nInline (titles are optional):\n\n ![alt text](/path/to/img.jpg \"Title\")\n\nReference-style:\n\n ![alt text][id]\n\n [id]: /path/to/img.jpg \"Title\"\n\nBoth of the above examples produce the same output:\n\n \"alt\n\n\n\n### Code ###\n\nIn a regular paragraph, you can create code span by wrapping text in\nbacktick quotes. Any ampersands (`&`) and angle brackets (`<` or\n`>`) will automatically be translated into HTML entities. This makes\nit easy to use Markdown to write about HTML example code:\n\n I strongly recommend against using any `` tags.\n\n I wish SmartyPants used named entities like `—`\n instead of decimal-encoded entities like `—`.\n\nOutput:\n\n

    I strongly recommend against using any\n <blink> tags.

    \n\n

    I wish SmartyPants used named entities like\n &mdash; instead of decimal-encoded\n entities like &#8212;.

    \n\n\nTo specify an entire block of pre-formatted code, indent every line of\nthe block by 4 spaces or 1 tab. Just like with code spans, `&`, `<`,\nand `>` characters will be escaped automatically.\n\nMarkdown:\n\n If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:\n\n
    \n

    For example.

    \n
    \n\nOutput:\n\n

    If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:

    \n\n
    <blockquote>\n        <p>For example.</p>\n    </blockquote>\n    
    \n" diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@markdown_basics.md.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@markdown_basics.md.snap index 1ba7cdf4..487211c1 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@markdown_basics.md.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@markdown_basics.md.snap @@ -3,8 +3,7 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/markdown_basics.md --- -- "Markdown: Basics" -- "\n" +- "Markdown: Basics\n" - "==========" - "======\n\n" - "
      " - Dingus\n
    \n" -- "\n\nGetting the Gist of Markdown'" -- s Formatting Syntax -- "\n" +- ">\n\n\n\n" +- "Getting the Gist of Markdown's " +- "Formatting Syntax\n" - "----------" - "----------" - "----------" @@ -67,9 +65,9 @@ input_file: tests/inputs/markdown/markdown_basics.md - " [d]: /projects/markdown/" - "dingus \"Markdown Dingus\"\n" - " [src]: /projects/markdown" -- "/basics.text\n\n\n## " -- "Paragraphs, Headers, Blockquotes " -- "##\n\n" +- "/basics.text\n\n\n" +- "## Paragraphs, Headers, " +- "Blockquotes ##\n\n" - "A paragraph is simply one or more consecutive lines of " - "text, separated\n" - "by one or more blank lines. " @@ -132,8 +130,9 @@ input_file: tests/inputs/markdown/markdown_basics.md - "blockquote.

    \n\n " - "

    This is an H2 in " - "a blockquote

    \n " -- "\n\n\n\n### " -- "Phrase Emphasis ###\n\n" +- "\n\n\n\n" +- "### Phrase Emphasis ##" +- "#\n\n" - "Markdown uses asterisks and " - "underscores to indicate spans of emphasis.\n\n" - "Markdown:\n\n " @@ -151,7 +150,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - "strong emphasis.\n " - "Or, if you prefer, use " - two underscores instead. -- "

    \n\n\n\n## Lists ##\n\n" +- "

    \n\n\n\n" +- "## Lists ##\n\n" - "Unordered (bulleted) lists use " - "asterisks, pluses, and " - "hyphens (`*`,\n" @@ -198,8 +198,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - "p>\n " - "
  • Another item in " - the list.

    \n \n\n\n\n### " -- "Links ###\n\n" +- "li>\n \n\n\n\n" +- "### Links ###\n\n" - "Markdown supports two styles for creating links: *" - "inline* and\n*reference*" - ". " @@ -270,8 +270,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - "The New York Times." -- "

    \n\n\n### Images" -- " ###\n\n" +- "

    \n\n\n" +- "### Images ###\n\n" - "Image syntax is very much like link syntax.\n\n" - "Inline (titles are optional):\n\n " - "![" @@ -285,8 +285,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - "\n\n\n\n### Code" -- " ###\n\n" +- "\"Title\" />\n\n\n\n" +- "### Code ###\n\n" - "In a regular paragraph, you can create code span " - "by wrapping text in\n" - backtick quotes. Any ampersands ( diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@markdown_syntax.md-2.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@markdown_syntax.md-2.snap index fc0e69b5..e6e7e73a 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@markdown_syntax.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@markdown_syntax.md-2.snap @@ -9,46 +9,48 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "* [Overview](#overview)\n * [Philosophy](#philosophy)\n * [Inline HTML](#html)\n * [Automatic Escaping for Special Characters](#autoescape)\n* [Block Elements](#block)\n * [Paragraphs and Line Breaks](#p)\n * [Headers](#header)\n * [Blockquotes](#blockquote)\n * [Lists](#list)\n" - " * [Code Blocks](#precode)\n * [Horizontal Rules](#hr)\n* [Span Elements](#span)\n * [Links](#link)\n * [Emphasis](#em)\n * [Code](#code)\n * [Images](#img)\n* [Miscellaneous](#misc)\n " - "* [Backslash Escapes](#backslash)\n * [Automatic Links](#autolink)\n\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL][src].\n\n [src]: /projects/markdown/syntax.text\n\n* * *\n" -- "\n

    Overview

    \n\n

    Philosophy

    \n\nMarkdown is intended to be as easy-to-read and easy-to-write as is feasible.\n\n" +- "\n

    Overview

    \n\n

    Philosophy

    \n" +- "\nMarkdown is intended to be as easy-to-read and easy-to-write as is feasible.\n\n" - "Readability, however, is emphasized above all else. A Markdown-formatted\ndocument should be publishable as-is, as plain text, without looking\nlike it's been marked up with tags or formatting instructions. While\nMarkdown's syntax has been influenced by several existing text-to-HTML\n" - "filters -- including [Setext][1], [atx][2], [Textile][3], [reStructuredText][4],\n[Grutatext][5], and [EtText][6]" - " -- the single biggest source of\ninspiration for Markdown's syntax is the format of plain text email." - "\n\n [1]: http://docutils.sourceforge.net/mirror/setext.html\n [2]: http://www.aaronsw.com/2002/atx/\n [3]: https://web.archive.org/web/20021226035527/http://textism.com/tools/textile/\n" - " [4]: http://docutils.sourceforge.net/rst.html\n [5]: http://www.triptico.com/software/grutatxt.html\n [6]: http://ettext.taint.org/doc/\n\n" -- "To this end, Markdown's syntax is comprised entirely of punctuation\ncharacters, which punctuation characters have been carefully chosen so\nas to look like what they mean. E.g., asterisks around a word actually\nlook like \\*emphasis\\*. Markdown lists look like, well, lists. Even\nblockquotes look like quoted passages of text, assuming you've ever\nused email.\n" -- "\n\n\n

    Inline HTML

    \n\nMarkdown's syntax is intended for one purpose: to be used as a\nformat for *writing* for the web.\n\n" +- "To this end, Markdown's syntax is comprised entirely of punctuation\ncharacters, which punctuation characters have been carefully chosen so\nas to look like what they mean. E.g., asterisks around a word actually\nlook like \\*emphasis\\*. Markdown lists look like, well, lists. Even\nblockquotes look like quoted passages of text, assuming you've ever\nused email.\n\n\n\n" +- "

    Inline HTML

    \n" +- "\nMarkdown's syntax is intended for one purpose: to be used as a\nformat for *writing* for the web.\n\n" - "Markdown is not a replacement for HTML, or even close to it. Its\nsyntax is very small, corresponding only to a very small subset of\nHTML tags. The idea is *not* to create a syntax that makes it easier\nto insert HTML tags. In my opinion, HTML tags are already easy to\ninsert. The idea for Markdown is to make it easy to read, write, and\nedit prose. " - "HTML is a *publishing* format; Markdown is a *writing*\nformat. Thus, Markdown's formatting syntax only addresses issues that\ncan be conveyed in plain text.\n\nFor any markup that is not covered by Markdown's syntax, you simply\nuse HTML itself. There's no need to preface it or delimit it to\nindicate that you're switching from Markdown to HTML; you just use\nthe tags.\n\n" - "The only restrictions are that block-level HTML elements -- e.g. `
    `,\n``, `
    `, `

    `, etc. -- must be separated from surrounding\ncontent by blank lines, and the start and end tags of the block should\nnot be indented with tabs or spaces. Markdown is smart enough not\n" - "to add extra (unwanted) `

    ` tags around HTML block-level tags.\n\nFor example, to add an HTML table to a Markdown article:\n\n This is a regular paragraph.\n\n

    \n \n \n \n
    Foo
    \n\n This is another regular paragraph.\n" - "\nNote that Markdown formatting syntax is not processed within block-level\nHTML tags. E.g., you can't use Markdown-style `*emphasis*` inside an\nHTML block.\n\n" - "Span-level HTML tags -- e.g. ``, ``, or `` -- can be\nused anywhere in a Markdown paragraph, list item, or header. If you\nwant, you can even use HTML tags instead of Markdown formatting; e.g. if\n" -- "you'd prefer to use HTML `` or `` tags instead of Markdown's\nlink or image syntax, go right ahead.\n\nUnlike block-level HTML tags, Markdown syntax *is* processed within\nspan-level tags.\n\n\n

    Automatic Escaping for Special Characters

    \n\n" -- "In HTML, there are two characters that demand special treatment: `<`\nand `&`. Left angle brackets are used to start tags; ampersands are\nused to denote HTML entities. If you want to use them as literal\ncharacters, you must escape them as entities, e.g. `<`, and\n`&`.\n\n" +- "you'd prefer to use HTML `
    ` or `` tags instead of Markdown's\nlink or image syntax, go right ahead.\n\nUnlike block-level HTML tags, Markdown syntax *is* processed within\nspan-level tags.\n\n\n

    Automatic Escaping for Special Characters

    \n" +- "\nIn HTML, there are two characters that demand special treatment: `<`\nand `&`. Left angle brackets are used to start tags; ampersands are\nused to denote HTML entities. If you want to use them as literal\ncharacters, you must escape them as entities, e.g. `<`, and\n`&`.\n\n" - "Ampersands in particular are bedeviling for web writers. If you want to\nwrite about 'AT&T', you need to write '`AT&T`'. You even need to\nescape ampersands within URLs. Thus, if you want to link to:\n\n http://images.google.com/images?num=30&q=larry+bird\n\nyou need to encode the URL as:\n\n " - "http://images.google.com/images?num=30&q=larry+bird\n" - "\nin your anchor tag `href` attribute. Needless to say, this is easy to\nforget, and is probably the single most common source of HTML validation\nerrors in otherwise well-marked-up web sites.\n\nMarkdown allows you to use these characters naturally, taking care of\nall the necessary escaping for you. If you use an ampersand as part of\nan HTML entity, it remains unchanged; otherwise it will be translated\ninto `&`.\n\n" - "So, if you want to include a copyright symbol in your article, you can write:\n\n ©\n\nand Markdown will leave it alone. But if you write:\n\n AT&T\n\nMarkdown will translate it to:\n\n AT&T\n\nSimilarly, because Markdown supports [inline HTML](#html), if you use\nangle brackets as delimiters for HTML tags, Markdown will treat them as\nsuch. But if you write:\n\n 4 < 5\n" - "\nMarkdown will translate it to:\n\n 4 < 5\n\nHowever, inside Markdown code spans and blocks, angle brackets and\nampersands are *always* encoded automatically. This makes it easy to use\nMarkdown to write about HTML code. (As opposed to raw HTML, which is a\nterrible format for writing about HTML syntax, because every single `<`\nand `&` in your example code needs to be escaped.)\n\n\n* * *\n" -- "\n\n

    Block Elements

    \n\n\n

    Paragraphs and Line Breaks

    \n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like a\nblank line -- a line containing nothing but spaces or tabs is considered\nblank.) Normal paragraphs should not be indented with spaces or tabs.\n\n" +- "\n\n

    Block Elements

    \n\n\n

    Paragraphs and Line Breaks

    \n" +- "\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like a\nblank line -- a line containing nothing but spaces or tabs is considered\nblank.) Normal paragraphs should not be indented with spaces or tabs.\n\n" - "The implication of the \"one or more consecutive lines of text\" rule is\nthat Markdown supports \"hard-wrapped\" text paragraphs. This differs\nsignificantly from most other text-to-HTML formatters (including Movable\nType's \"Convert Line Breaks\" option) which translate every line break\ncharacter in a paragraph into a `
    ` tag.\n\n" - "When you *do* want to insert a `
    ` break tag using Markdown, you\nend a line with two or more spaces, then type return.\n\n" - "Yes, this takes a tad more effort to create a `
    `, but a simplistic\n\"every line break is a `
    `\" rule wouldn't work for Markdown.\nMarkdown's email-style [blockquoting][bq] and multi-paragraph [list items][l]\nwork best -- and look better -- when you format them with hard breaks.\n" - "\n [bq]: #blockquote\n [l]: #list\n\n\n\n

    Headers

    \n\nMarkdown supports two styles of headers, [Setext] [1] and [atx] [2].\n\nSetext-style headers are \"underlined\" using equal signs (for first-level\nheaders) and dashes (for second-level headers). For example:\n\n " - "This is an H1\n =============\n\n This is an H2\n -------------\n\nAny number of underlining `=`'s or `-`'s will work.\n\nAtx-style headers use 1-6 hash characters at the start of the line,\ncorresponding to header levels 1-6. For example:\n\n " - "# This is an H1\n\n ## This is an H2\n\n ###### This is an H6\n\nOptionally, you may \"close\" atx-style headers. This is purely\ncosmetic -- you can use this if you think it looks better. The\nclosing hashes don't even need to match the number of hashes\nused to open the header. (The number of opening hashes\ndetermines the header level.) :\n\n " -- "# This is an H1 #\n\n ## This is an H2 ##\n\n ### This is an H3 ######\n" -- "\n\n

    Blockquotes

    \n\nMarkdown uses email-style `>` characters for blockquoting. If you're\nfamiliar with quoting passages of text in an email message, then you\nknow how to create a blockquote in Markdown. It looks best if you hard\nwrap the text and put a `>` before every line:\n\n " +- "# This is an H1 #\n\n ## This is an H2 ##\n\n ### This is an H3 ######\n\n\n

    Blockquotes

    \n" +- "\nMarkdown uses email-style `>` characters for blockquoting. If you're\nfamiliar with quoting passages of text in an email message, then you\nknow how to create a blockquote in Markdown. It looks best if you hard\nwrap the text and put a `>` before every line:\n\n " - "> This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n > consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n > Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n >\n > Donec sit amet nisl. " - "Aliquam semper ipsum sit amet velit. Suspendisse\n > id sem consectetuer libero luctus adipiscing.\n\nMarkdown allows you to be lazy and only put the `>` before the first\nline of a hard-wrapped paragraph:\n\n " - "> This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n\n > Donec sit amet nisl. " - "Aliquam semper ipsum sit amet velit. Suspendisse\n id sem consectetuer libero luctus adipiscing.\n\nBlockquotes can be nested (i.e. a blockquote-in-a-blockquote) by\nadding additional levels of `>`:\n\n " -- "> This is the first level of quoting.\n >\n > > This is nested blockquote.\n >\n > Back to the first level.\n\nBlockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:\n\n" -- "\t> ## This is a header.\n\t>\n\t> 1. This is the first list item.\n\t> 2. This is the second list item.\n\t>\n\t> Here's some example code:\n\t>\n\t> return shell_exec(\"echo $input | $markdown_script\");\n" -- "\nAny decent text editor should make email-style quoting easy. For\nexample, with BBEdit, you can make a selection and choose Increase\nQuote Level from the Text menu.\n\n\n

    Lists

    \n\nMarkdown supports ordered (numbered) and unordered (bulleted) lists.\n\n" -- "Unordered lists use asterisks, pluses, and hyphens -- interchangably\n-- as list markers:\n\n * Red\n * Green\n * Blue\n\nis equivalent to:\n\n + Red\n + Green\n + Blue\n\nand:\n\n - Red\n - Green\n - Blue\n\nOrdered lists use numbers followed by periods:\n\n 1. Bird\n 2. McHale\n 3. Parish\n" -- "\nIt's important to note that the actual numbers you use to mark the\nlist have no effect on the HTML output Markdown produces. The HTML\nMarkdown produces from the above list is:\n\n
      \n
    1. Bird
    2. \n
    3. McHale
    4. \n
    5. Parish
    6. \n
    \n\nIf you instead wrote the list in Markdown like this:\n\n " -- "1. Bird\n 1. McHale\n 1. Parish\n\nor even:\n\n 3. Bird\n 1. McHale\n 8. Parish\n" +- "> This is the first level of quoting.\n >\n > > This is nested blockquote.\n >\n > Back to the first level.\n\nBlockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:\n\n\t> " +- "## This is a header.\n\t>\n\t> 1. This is the first list item.\n\t> 2. This is the second list item.\n\t>\n\t> Here's some example code:\n\t>\n\t> return shell_exec(\"echo $input | $markdown_script\");\n\nAny decent text editor should make email-style quoting easy. For\nexample, with BBEdit, you can make a selection and choose Increase\nQuote Level from the Text menu.\n\n\n" +- "

    Lists

    \n\nMarkdown supports ordered (numbered) and unordered (bulleted) lists.\n\nUnordered lists use asterisks, pluses, and hyphens -- interchangably\n-- as list markers:\n\n * Red\n * Green\n * Blue\n\nis equivalent to:\n\n + Red\n + Green\n + Blue\n\nand:\n\n - Red\n - Green\n - Blue\n\nOrdered lists use numbers followed by periods:\n\n " +- "1. Bird\n 2. McHale\n 3. Parish\n\nIt's important to note that the actual numbers you use to mark the\nlist have no effect on the HTML output Markdown produces. The HTML\nMarkdown produces from the above list is:\n\n
      \n
    1. Bird
    2. \n
    3. McHale
    4. \n
    5. Parish
    6. \n
    \n" +- "\nIf you instead wrote the list in Markdown like this:\n\n 1. Bird\n 1. McHale\n 1. Parish\n\nor even:\n\n 3. Bird\n 1. McHale\n 8. Parish\n" - "\nyou'd get the exact same HTML output. The point is, if you want to,\nyou can use ordinal numbers in your ordered Markdown lists, so that\nthe numbers in your source match the numbers in your published HTML.\nBut if you want to be lazy, you don't have to.\n\n" - "If you do use lazy list numbering, however, you should still start the\nlist with the number 1. At some point in the future, Markdown may support\nstarting ordered lists at an arbitrary number.\n\nList markers typically start at the left margin, but may be indented by\nup to three spaces. List markers must be followed by one or more spaces\nor a tab.\n\nTo make lists look nice, you can wrap items with hanging indents:\n\n " - "* Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n" @@ -60,18 +62,18 @@ input_file: tests/inputs/markdown/markdown_syntax.md - " sit amet velit.\n\n 2. Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIt looks nice if you indent every line of the subsequent\nparagraphs, but here again, Markdown will allow you to be\nlazy:\n\n " - "* This is a list item with two paragraphs.\n\n This is the second paragraph in the list item. You're\n only required to indent the first line. 
Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit.\n\n * Another item in the same list.\n\nTo put a blockquote within a list item, the blockquote's `>`\ndelimiters need to be indented:\n\n " - "* A list item with a blockquote:\n\n > This is a blockquote\n > inside a list item.\n\nTo put a code block within a list item, the code block needs\nto be indented *twice* -- 8 spaces or two tabs:\n\n * A list item with a code block:\n\n \n\n\nIt's worth noting that it's possible to trigger an ordered list by\naccident, by writing something like this:\n\n " -- "1986. What a great season.\n\nIn other words, a *number-period-space* sequence at the beginning of a\nline. To avoid this, you can backslash-escape the period:\n\n 1986\\. What a great season.\n" -- "\n\n\n

    Code Blocks

    \n\nPre-formatted code blocks are used for writing about programming or\nmarkup source code. Rather than forming normal paragraphs, the lines\nof a code block are interpreted literally. Markdown wraps a code block\nin both `
    ` and `` tags.\n\n"
    -- "To produce a code block in Markdown, simply indent every line of the\nblock by at least 4 spaces or 1 tab. For example, given this input:\n\n    This is a normal paragraph:\n\n        This is a code block.\n\nMarkdown will generate:\n\n    

    This is a normal paragraph:

    \n\n
    This is a code block.\n    
    \n" -- "\nOne level of indentation -- 4 spaces or 1 tab -- is removed from each\nline of the code block. For example, this:\n\n Here is an example of AppleScript:\n\n tell application \"Foo\"\n beep\n end tell\n\nwill turn into:\n\n

    Here is an example of AppleScript:

    \n\n
    tell application \"Foo\"\n        beep\n    end tell\n    
    \n" +- "1986. What a great season.\n\nIn other words, a *number-period-space* sequence at the beginning of a\nline. To avoid this, you can backslash-escape the period:\n\n 1986\\. What a great season.\n\n\n\n

    Code Blocks

    \n" +- "\nPre-formatted code blocks are used for writing about programming or\nmarkup source code. Rather than forming normal paragraphs, the lines\nof a code block are interpreted literally. Markdown wraps a code block\nin both `
    ` and `` tags.\n\nTo produce a code block in Markdown, simply indent every line of the\nblock by at least 4 spaces or 1 tab. For example, given this input:\n\n    "
    +- "This is a normal paragraph:\n\n        This is a code block.\n\nMarkdown will generate:\n\n    

    This is a normal paragraph:

    \n\n
    This is a code block.\n    
    \n\nOne level of indentation -- 4 spaces or 1 tab -- is removed from each\nline of the code block. For example, this:\n\n Here is an example of AppleScript:\n\n tell application \"Foo\"\n beep\n end tell\n" +- "\nwill turn into:\n\n

    Here is an example of AppleScript:

    \n\n
    tell application \"Foo\"\n        beep\n    end tell\n    
    \n" - "\nA code block continues until it reaches a line that is not indented\n(or the end of the article).\n\n" - "Within a code block, ampersands (`&`) and angle brackets (`<` and `>`)\nare automatically converted into HTML entities. This makes it very\neasy to include example HTML source code using Markdown -- just paste\nit and indent it, and Markdown will handle the hassle of encoding the\nampersands and angle brackets. For example, this:\n\n " - "
    \n © 2004 Foo Corporation\n
    \n\nwill turn into:\n\n
    <div class=\"footer\">\n        &copy; 2004 Foo Corporation\n    </div>\n    
    \n" -- "\nRegular Markdown syntax is not processed within code blocks. E.g.,\nasterisks are just literal asterisks within a code block. This means\nit's also easy to use Markdown to write about Markdown's own syntax.\n\n\n\n

    Horizontal Rules

    \n\n" -- "You can produce a horizontal rule tag (`
    `) by placing three or\nmore hyphens, asterisks, or underscores on a line by themselves. If you\nwish, you may use spaces between the hyphens or asterisks. Each of the\nfollowing lines will produce a horizontal rule:\n\n " +- "\nRegular Markdown syntax is not processed within code blocks. E.g.,\nasterisks are just literal asterisks within a code block. This means\nit's also easy to use Markdown to write about Markdown's own syntax.\n\n\n\n

    Horizontal Rules

    \n" +- "\nYou can produce a horizontal rule tag (`
    `) by placing three or\nmore hyphens, asterisks, or underscores on a line by themselves. If you\nwish, you may use spaces between the hyphens or asterisks. Each of the\nfollowing lines will produce a horizontal rule:\n\n " - "* * *\n\n ***\n\n *****\n\n - - -\n\n ---------------------------------------\n\n\n* * *\n" -- "\n

    Span Elements

    \n\n

    Links

    \n\nMarkdown supports two style of links: *inline* and *reference*.\n\nIn both styles, the link text is delimited by [square brackets].\n\n" -- "To create an inline link, use a set of regular parentheses immediately\nafter the link text's closing square bracket. Inside the parentheses,\nput the URL where you want the link to point, along with an *optional*\ntitle for the link, surrounded in quotes. For example:\n\n " +- "\n

    Span Elements

    \n\n

    Links

    \n" +- "\nMarkdown supports two style of links: *inline* and *reference*.\n\nIn both styles, the link text is delimited by [square brackets].\n\nTo create an inline link, use a set of regular parentheses immediately\nafter the link text's closing square bracket. Inside the parentheses,\nput the URL where you want the link to point, along with an *optional*\ntitle for the link, surrounded in quotes. For example:\n\n " - "This is [an example](http://example.com/ \"Title\") inline link.\n\n [This link](http://example.net/) has no title attribute.\n\nWill produce:\n\n " - "

    This is \n an example inline link.

    \n\n

    This link has no\n title attribute.

    \n\nIf you're referring to a local resource on the same server, you can\nuse relative paths:\n\n " - "See my [About](/about/) page for details.\n\nReference-style links use a second set of square brackets, inside\nwhich you place a label of your choosing to identify the link:\n\n This is [an example][id] reference-style link.\n\nYou can optionally use a space to separate the sets of brackets:\n\n This is [an example] [id] reference-style link.\n" @@ -93,28 +95,28 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "I get 10 times more traffic from [Google](http://google.com/ \"Google\")\n than from [Yahoo](http://search.yahoo.com/ \"Yahoo Search\") or\n [MSN](http://search.msn.com/ \"MSN Search\").\n" - "\n" - "The point of reference-style links is not that they're easier to\nwrite. The point is that with reference-style links, your document\nsource is vastly more readable. Compare the above examples: using\nreference-style links, the paragraph itself is only 81 characters\nlong; with inline-style links, it's 176 characters; and as raw HTML,\nit's 234 characters. In the raw HTML, there's more markup than there\n" -- "is text.\n\nWith Markdown's reference-style links, a source document much more\nclosely resembles the final output, as rendered in a browser. By\nallowing you to move the markup-related metadata out of the paragraph,\nyou can add links without interrupting the narrative flow of your\nprose.\n\n\n

    Emphasis

    \n\n" -- "Markdown treats asterisks (`*`) and underscores (`_`) as indicators of\nemphasis. Text wrapped with one `*` or `_` will be wrapped with an\nHTML `` tag; double `*`'s or `_`'s will be wrapped with an HTML\n`` tag. E.g., this input:\n\n " +- "is text.\n\nWith Markdown's reference-style links, a source document much more\nclosely resembles the final output, as rendered in a browser. By\nallowing you to move the markup-related metadata out of the paragraph,\nyou can add links without interrupting the narrative flow of your\nprose.\n\n\n

    Emphasis

    \n" +- "\nMarkdown treats asterisks (`*`) and underscores (`_`) as indicators of\nemphasis. Text wrapped with one `*` or `_` will be wrapped with an\nHTML `` tag; double `*`'s or `_`'s will be wrapped with an HTML\n`` tag. E.g., this input:\n\n " - "*single asterisks*\n\n _single underscores_\n\n **double asterisks**\n\n __double underscores__\n\nwill produce:\n\n single asterisks\n\n single underscores\n\n double asterisks\n\n double underscores\n" - "\nYou can use whichever style you prefer; the lone restriction is that\nthe same character must be used to open and close an emphasis span.\n\nEmphasis can be used in the middle of a word:\n\n un*frigging*believable\n" -- "\nBut if you surround an `*` or `_` with spaces, it'll be treated as a\nliteral asterisk or underscore.\n\nTo produce a literal asterisk or underscore at a position where it\nwould otherwise be used as an emphasis delimiter, you can backslash\nescape it:\n\n \\*this text is surrounded by literal asterisks\\*\n" -- "\n\n\n

    Code

    \n\nTo indicate a span of code, wrap it with backtick quotes (`` ` ``).\nUnlike a pre-formatted code block, a code span indicates code within a\nnormal paragraph. For example:\n\n Use the `printf()` function.\n\nwill produce:\n\n

    Use the printf() function.

    \n" +- "\nBut if you surround an `*` or `_` with spaces, it'll be treated as a\nliteral asterisk or underscore.\n\nTo produce a literal asterisk or underscore at a position where it\nwould otherwise be used as an emphasis delimiter, you can backslash\nescape it:\n\n \\*this text is surrounded by literal asterisks\\*\n\n\n\n

    Code

    \n" +- "\nTo indicate a span of code, wrap it with backtick quotes (`` ` ``).\nUnlike a pre-formatted code block, a code span indicates code within a\nnormal paragraph. For example:\n\n Use the `printf()` function.\n\nwill produce:\n\n

    Use the printf() function.

    \n" - "\nTo include a literal backtick character within a code span, you can use\nmultiple backticks as the opening and closing delimiters:\n\n ``There is a literal backtick (`) here.``\n\nwhich will produce this:\n\n

    There is a literal backtick (`) here.

    \n" - "\nThe backtick delimiters surrounding a code span may include spaces --\none after the opening, one before the closing. This allows you to place\nliteral backtick characters at the beginning or end of a code span:\n\n\tA single backtick in a code span: `` ` ``\n\n\tA backtick-delimited string in a code span: `` `foo` ``\n\nwill produce:\n\n\t" - "

    A single backtick in a code span: `

    \n\n\t

    A backtick-delimited string in a code span: `foo`

    \n\nWith a code span, ampersands and angle brackets are encoded as HTML\nentities automatically, which makes it easy to include example HTML\ntags. Markdown will turn this:\n\n " - "Please don't use any `` tags.\n\ninto:\n\n

    Please don't use any <blink> tags.

    \n\nYou can write this:\n\n `—` is the decimal-encoded equivalent of `—`.\n\nto produce:\n\n " -- "

    &#8212; is the decimal-encoded\n equivalent of &mdash;.

    \n" -- "\n\n\n

    Images

    \n\nAdmittedly, it's fairly difficult to devise a \"natural\" syntax for\nplacing images into a plain text document format.\n\nMarkdown uses an image syntax that is intended to resemble the syntax\nfor links, allowing for two styles: *inline* and *reference*.\n\nInline image syntax looks like this:\n\n " +- "

    &#8212; is the decimal-encoded\n equivalent of &mdash;.

    \n\n\n\n

    Images

    \n" +- "\nAdmittedly, it's fairly difficult to devise a \"natural\" syntax for\nplacing images into a plain text document format.\n\nMarkdown uses an image syntax that is intended to resemble the syntax\nfor links, allowing for two styles: *inline* and *reference*.\n\nInline image syntax looks like this:\n\n " - "![Alt text](/path/to/img.jpg)\n\n ![Alt text](/path/to/img.jpg \"Optional title\")\n\nThat is:\n\n" - "* An exclamation mark: `!`;\n* followed by a set of square brackets, containing the `alt`\n attribute text for the image;\n* followed by a set of parentheses, containing the URL or path to\n the image, and an optional `title` attribute enclosed in double\n or single quotes.\n\nReference-style image syntax looks like this:\n\n ![Alt text][id]\n" - "\nWhere \"id\" is the name of a defined image reference. Image references\nare defined using syntax identical to link references:\n\n [id]: url/to/image \"Optional title attribute\"\n\nAs of this writing, Markdown has no syntax for specifying the\ndimensions of an image; if this is important to you, you can simply\nuse regular HTML `` tags.\n\n\n* * *\n" -- "\n\n

    Miscellaneous

    \n\n

    Automatic Links

    \n\n" -- "Markdown supports a shortcut style for creating \"automatic\" links for URLs and email addresses: simply surround the URL or email address with angle brackets. What this means is that if you want to show the actual text of a URL or email address, and also have it be a clickable link, you can do this:\n\n \n\nMarkdown will turn this into:\n\n " +- "\n\n

    Miscellaneous

    \n\n

    Automatic Links

    \n" +- "\nMarkdown supports a shortcut style for creating \"automatic\" links for URLs and email addresses: simply surround the URL or email address with angle brackets. What this means is that if you want to show the actual text of a URL or email address, and also have it be a clickable link, you can do this:\n\n \n\nMarkdown will turn this into:\n\n " - "http://example.com/\n\nAutomatic links for email addresses work similarly, except that\nMarkdown will also perform a bit of randomized decimal and hex\nentity-encoding to help obscure your address from address-harvesting\nspambots. For example, Markdown will turn this:\n\n \n\ninto something like this:\n\n " - "address@exa\n mple.com\n\n" -- "which will render in a browser as a clickable link to \"address@example.com\".\n\n(This sort of entity-encoding trick will indeed fool many, if not\nmost, address-harvesting bots, but it definitely won't fool all of\nthem. It's better than nothing, but an address published in this way\nwill probably eventually start receiving spam.)\n\n\n\n

    Backslash Escapes

    \n\n" -- "Markdown allows you to use backslash escapes to generate literal\ncharacters which would otherwise have special meaning in Markdown's\nformatting syntax. For example, if you wanted to surround a word\nwith literal asterisks (instead of an HTML `` tag), you can use\nbackslashes before the asterisks, like this:\n\n \\*literal asterisks\\*\n\nMarkdown provides backslash escapes for the following characters:\n\n " +- "which will render in a browser as a clickable link to \"address@example.com\".\n\n(This sort of entity-encoding trick will indeed fool many, if not\nmost, address-harvesting bots, but it definitely won't fool all of\nthem. It's better than nothing, but an address published in this way\nwill probably eventually start receiving spam.)\n\n\n\n

    Backslash Escapes

    \n" +- "\nMarkdown allows you to use backslash escapes to generate literal\ncharacters which would otherwise have special meaning in Markdown's\nformatting syntax. For example, if you wanted to surround a word\nwith literal asterisks (instead of an HTML `` tag), you can use\nbackslashes before the asterisks, like this:\n\n \\*literal asterisks\\*\n\nMarkdown provides backslash escapes for the following characters:\n\n " - "\\ backslash\n ` backtick\n * asterisk\n _ underscore\n {} curly braces\n [] square brackets\n () parentheses\n # hash mark\n\t+\tplus sign\n\t-\tminus sign (hyphen)\n . dot\n ! exclamation mark\n" diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@markdown_syntax.md-3.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@markdown_syntax.md-3.snap index 9e4f15dd..a1bd6deb 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@markdown_syntax.md-3.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@markdown_syntax.md-3.snap @@ -3,11 +3,12 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/markdown_syntax.md --- -- "Markdown: Syntax\n================\n\n\n\n\n* [Overview](#overview)\n * [Philosophy](#philosophy)\n * [Inline HTML](#html)\n * [Automatic Escaping for Special Characters](#autoescape)\n* [Block Elements](#block)\n * [Paragraphs and Line Breaks](#p)\n * [Headers](#header)\n * [Blockquotes](#blockquote)\n * [Lists](#list)\n * [Code Blocks](#precode)\n * [Horizontal Rules](#hr)\n* [Span Elements](#span)\n * [Links](#link)\n * [Emphasis](#em)\n * [Code](#code)\n * [Images](#img)\n* [Miscellaneous](#misc)\n * [Backslash Escapes](#backslash)\n * [Automatic Links](#autolink)\n\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL][src].\n\n [src]: /projects/markdown/syntax.text\n\n* * *\n" -- "\n

    Overview

    \n\n

    Philosophy

    \n\nMarkdown is intended to be as easy-to-read and easy-to-write as is feasible.\n\nReadability, however, is emphasized above all else. A Markdown-formatted\ndocument should be publishable as-is, as plain text, without looking\nlike it's been marked up with tags or formatting instructions. While\nMarkdown's syntax has been influenced by several existing text-to-HTML\nfilters -- including [Setext][1], [atx][2], [Textile][3], [reStructuredText][4],\n[Grutatext][5], and [EtText][6] -- the single biggest source of\ninspiration for Markdown's syntax is the format of plain text email.\n\n [1]: http://docutils.sourceforge.net/mirror/setext.html\n [2]: http://www.aaronsw.com/2002/atx/\n [3]: https://web.archive.org/web/20021226035527/http://textism.com/tools/textile/\n [4]: http://docutils.sourceforge.net/rst.html\n [5]: http://www.triptico.com/software/grutatxt.html\n [6]: http://ettext.taint.org/doc/\n\nTo this end, Markdown's syntax is comprised entirely of punctuation\ncharacters, which punctuation characters have been carefully chosen so\nas to look like what they mean. E.g., asterisks around a word actually\nlook like \\*emphasis\\*. Markdown lists look like, well, lists. Even\nblockquotes look like quoted passages of text, assuming you've ever\nused email.\n\n\n\n

    Inline HTML

    \n\nMarkdown's syntax is intended for one purpose: to be used as a\nformat for *writing* for the web.\n\nMarkdown is not a replacement for HTML, or even close to it. Its\nsyntax is very small, corresponding only to a very small subset of\nHTML tags. The idea is *not* to create a syntax that makes it easier\nto insert HTML tags. In my opinion, HTML tags are already easy to\ninsert. The idea for Markdown is to make it easy to read, write, and\nedit prose. HTML is a *publishing* format; Markdown is a *writing*\nformat. Thus, Markdown's formatting syntax only addresses issues that\ncan be conveyed in plain text.\n\nFor any markup that is not covered by Markdown's syntax, you simply\nuse HTML itself. There's no need to preface it or delimit it to\nindicate that you're switching from Markdown to HTML; you just use\nthe tags.\n\nThe only restrictions are that block-level HTML elements -- e.g. `
    `,\n``, `
    `, `

    `, etc. -- must be separated from surrounding\ncontent by blank lines, and the start and end tags of the block should\nnot be indented with tabs or spaces. Markdown is smart enough not\nto add extra (unwanted) `

    ` tags around HTML block-level tags.\n\nFor example, to add an HTML table to a Markdown article:\n\n This is a regular paragraph.\n\n

    \n \n \n \n
    Foo
    \n\n This is another regular paragraph.\n" -- "\nNote that Markdown formatting syntax is not processed within block-level\nHTML tags. E.g., you can't use Markdown-style `*emphasis*` inside an\nHTML block.\n\nSpan-level HTML tags -- e.g. ``, ``, or `` -- can be\nused anywhere in a Markdown paragraph, list item, or header. If you\nwant, you can even use HTML tags instead of Markdown formatting; e.g. if\nyou'd prefer to use HTML `` or `` tags instead of Markdown's\nlink or image syntax, go right ahead.\n\nUnlike block-level HTML tags, Markdown syntax *is* processed within\nspan-level tags.\n\n\n

    Automatic Escaping for Special Characters

    \n\nIn HTML, there are two characters that demand special treatment: `<`\nand `&`. Left angle brackets are used to start tags; ampersands are\nused to denote HTML entities. If you want to use them as literal\ncharacters, you must escape them as entities, e.g. `<`, and\n`&`.\n\nAmpersands in particular are bedeviling for web writers. If you want to\nwrite about 'AT&T', you need to write '`AT&T`'. You even need to\nescape ampersands within URLs. Thus, if you want to link to:\n\n http://images.google.com/images?num=30&q=larry+bird\n\nyou need to encode the URL as:\n\n http://images.google.com/images?num=30&q=larry+bird\n\nin your anchor tag `href` attribute. Needless to say, this is easy to\nforget, and is probably the single most common source of HTML validation\nerrors in otherwise well-marked-up web sites.\n\nMarkdown allows you to use these characters naturally, taking care of\nall the necessary escaping for you. If you use an ampersand as part of\nan HTML entity, it remains unchanged; otherwise it will be translated\ninto `&`.\n\nSo, if you want to include a copyright symbol in your article, you can write:\n\n ©\n\nand Markdown will leave it alone. But if you write:\n\n AT&T\n\nMarkdown will translate it to:\n\n AT&T\n\nSimilarly, because Markdown supports [inline HTML](#html), if you use\nangle brackets as delimiters for HTML tags, Markdown will treat them as\nsuch. But if you write:\n\n 4 < 5\n\nMarkdown will translate it to:\n\n 4 < 5\n\nHowever, inside Markdown code spans and blocks, angle brackets and\nampersands are *always* encoded automatically. This makes it easy to use\nMarkdown to write about HTML code. (As opposed to raw HTML, which is a\nterrible format for writing about HTML syntax, because every single `<`\nand `&` in your example code needs to be escaped.)\n\n\n* * *\n" +- "Markdown: Syntax\n================\n\n
    \n\n\n* [Overview](#overview)\n * [Philosophy](#philosophy)\n * [Inline HTML](#html)\n * [Automatic Escaping for Special Characters](#autoescape)\n* [Block Elements](#block)\n * [Paragraphs and Line Breaks](#p)\n * [Headers](#header)\n * [Blockquotes](#blockquote)\n * [Lists](#list)\n * [Code Blocks](#precode)\n * [Horizontal Rules](#hr)\n* [Span Elements](#span)\n * [Links](#link)\n * [Emphasis](#em)\n * [Code](#code)\n * [Images](#img)\n* [Miscellaneous](#misc)\n * [Backslash Escapes](#backslash)\n * [Automatic Links](#autolink)\n\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL][src].\n\n [src]: /projects/markdown/syntax.text\n\n* * *\n\n

    Overview

    \n\n

    Philosophy

    \n\nMarkdown is intended to be as easy-to-read and easy-to-write as is feasible.\n\nReadability, however, is emphasized above all else. A Markdown-formatted\ndocument should be publishable as-is, as plain text, without looking\nlike it's been marked up with tags or formatting instructions. While\nMarkdown's syntax has been influenced by several existing text-to-HTML\nfilters -- including [Setext][1], [atx][2], [Textile][3], [reStructuredText][4],\n[Grutatext][5], and [EtText][6] -- the single biggest source of\ninspiration for Markdown's syntax is the format of plain text email.\n\n [1]: http://docutils.sourceforge.net/mirror/setext.html\n [2]: http://www.aaronsw.com/2002/atx/\n [3]: https://web.archive.org/web/20021226035527/http://textism.com/tools/textile/\n [4]: http://docutils.sourceforge.net/rst.html\n [5]: http://www.triptico.com/software/grutatxt.html\n [6]: http://ettext.taint.org/doc/\n\nTo this end, Markdown's syntax is comprised entirely of punctuation\ncharacters, which punctuation characters have been carefully chosen so\nas to look like what they mean. E.g., asterisks around a word actually\nlook like \\*emphasis\\*. Markdown lists look like, well, lists. Even\nblockquotes look like quoted passages of text, assuming you've ever\nused email.\n\n\n\n

    Inline HTML

    \n\nMarkdown's syntax is intended for one purpose: to be used as a\nformat for *writing* for the web.\n\nMarkdown is not a replacement for HTML, or even close to it. " +- "Its\nsyntax is very small, corresponding only to a very small subset of\nHTML tags. The idea is *not* to create a syntax that makes it easier\nto insert HTML tags. In my opinion, HTML tags are already easy to\ninsert. The idea for Markdown is to make it easy to read, write, and\nedit prose. HTML is a *publishing* format; Markdown is a *writing*\nformat. Thus, Markdown's formatting syntax only addresses issues that\ncan be conveyed in plain text.\n\nFor any markup that is not covered by Markdown's syntax, you simply\nuse HTML itself. There's no need to preface it or delimit it to\nindicate that you're switching from Markdown to HTML; you just use\nthe tags.\n\nThe only restrictions are that block-level HTML elements -- e.g. `
    `,\n``, `
    `, `

    `, etc. -- must be separated from surrounding\ncontent by blank lines, and the start and end tags of the block should\nnot be indented with tabs or spaces. Markdown is smart enough not\nto add extra (unwanted) `

    ` tags around HTML block-level tags.\n\nFor example, to add an HTML table to a Markdown article:\n\n This is a regular paragraph.\n\n

    \n \n \n \n
    Foo
    \n\n This is another regular paragraph.\n\nNote that Markdown formatting syntax is not processed within block-level\nHTML tags. E.g., you can't use Markdown-style `*emphasis*` inside an\nHTML block.\n\nSpan-level HTML tags -- e.g. ``, ``, or `` -- can be\nused anywhere in a Markdown paragraph, list item, or header. If you\nwant, you can even use HTML tags instead of Markdown formatting; e.g. if\nyou'd prefer to use HTML `` or `` tags instead of Markdown's\nlink or image syntax, go right ahead.\n\nUnlike block-level HTML tags, Markdown syntax *is* processed within\nspan-level tags.\n\n\n

    Automatic Escaping for Special Characters

    \n\nIn HTML, there are two characters that demand special treatment: `<`\nand `&`. Left angle brackets are used to start tags; ampersands are\nused to denote HTML entities. If you want to use them as literal\ncharacters, you must escape them as entities, e.g. `<`, and\n`&`.\n\nAmpersands in particular are bedeviling for web writers. If you want to\nwrite about 'AT&T', you need to write '`AT&T`'. You even need to\nescape ampersands within URLs. Thus, if you want to link to:\n\n http://images.google.com/images?num=30&q=larry+bird\n\nyou need to encode the URL as:\n\n http://images.google.com/images?num=30&q=larry+bird\n\nin your anchor tag `href` attribute. Needless to say, this is easy to\nforget, and is probably the single most common source of HTML validation\nerrors in otherwise well-marked-up web sites.\n\nMarkdown allows you to use these characters naturally, taking care of\nall the necessary escaping for you. If you use an ampersand as part of\nan HTML entity, it remains unchanged; otherwise it will be translated\ninto `&`.\n\nSo, if you want to include a copyright symbol in your article, you can write:\n\n ©\n\nand Markdown will leave it alone. But if you write:\n\n AT&T\n\nMarkdown will translate it to:\n\n AT&T\n\nSimilarly, because Markdown supports [inline HTML](#html), if you use\nangle brackets as delimiters for HTML tags, Markdown will treat them as\nsuch. But if you write:\n\n 4 < 5\n\nMarkdown will translate it to:\n\n 4 < 5\n" +- "\nHowever, inside Markdown code spans and blocks, angle brackets and\nampersands are *always* encoded automatically. This makes it easy to use\nMarkdown to write about HTML code. (As opposed to raw HTML, which is a\nterrible format for writing about HTML syntax, because every single `<`\nand `&` in your example code needs to be escaped.)\n\n\n* * *\n" - "\n\n

    Block Elements

    \n\n\n

    Paragraphs and Line Breaks

    \n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like a\nblank line -- a line containing nothing but spaces or tabs is considered\nblank.) Normal paragraphs should not be indented with spaces or tabs.\n\nThe implication of the \"one or more consecutive lines of text\" rule is\nthat Markdown supports \"hard-wrapped\" text paragraphs. This differs\nsignificantly from most other text-to-HTML formatters (including Movable\nType's \"Convert Line Breaks\" option) which translate every line break\ncharacter in a paragraph into a `
    ` tag.\n\nWhen you *do* want to insert a `
    ` break tag using Markdown, you\nend a line with two or more spaces, then type return.\n\nYes, this takes a tad more effort to create a `
    `, but a simplistic\n\"every line break is a `
    `\" rule wouldn't work for Markdown.\nMarkdown's email-style [blockquoting][bq] and multi-paragraph [list items][l]\nwork best -- and look better -- when you format them with hard breaks.\n\n [bq]: #blockquote\n [l]: #list\n\n\n\n

    Headers

    \n\nMarkdown supports two styles of headers, [Setext] [1] and [atx] [2].\n\nSetext-style headers are \"underlined\" using equal signs (for first-level\nheaders) and dashes (for second-level headers). For example:\n\n This is an H1\n =============\n\n This is an H2\n -------------\n\nAny number of underlining `=`'s or `-`'s will work.\n\nAtx-style headers use 1-6 hash characters at the start of the line,\ncorresponding to header levels 1-6. For example:\n\n # This is an H1\n\n ## This is an H2\n\n ###### This is an H6\n\nOptionally, you may \"close\" atx-style headers. This is purely\ncosmetic -- you can use this if you think it looks better. The\nclosing hashes don't even need to match the number of hashes\nused to open the header. (The number of opening hashes\ndetermines the header level.) :\n\n # This is an H1 #\n\n ## This is an H2 ##\n\n ### This is an H3 ######\n\n\n

    Blockquotes

    \n\nMarkdown uses email-style `>` characters for blockquoting. If you're\nfamiliar with quoting passages of text in an email message, then you\nknow how to create a blockquote in Markdown. It looks best if you hard\nwrap the text and put a `>` before every line:\n\n > This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n > consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n > Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n >\n > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse\n > id sem consectetuer libero luctus adipiscing.\n\nMarkdown allows you to be lazy and only put the `>` before the first\nline of a hard-wrapped paragraph:\n\n > This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n\n > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse\n id sem consectetuer libero luctus adipiscing.\n" -- "\nBlockquotes can be nested (i.e. a blockquote-in-a-blockquote) by\nadding additional levels of `>`:\n\n > This is the first level of quoting.\n >\n > > This is nested blockquote.\n >\n > Back to the first level.\n\nBlockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:\n\n\t> ## This is a header.\n\t>\n\t> 1. This is the first list item.\n\t> 2. This is the second list item.\n\t>\n\t> Here's some example code:\n\t>\n\t> return shell_exec(\"echo $input | $markdown_script\");\n\nAny decent text editor should make email-style quoting easy. For\nexample, with BBEdit, you can make a selection and choose Increase\nQuote Level from the Text menu.\n\n\n

    Lists

    \n\nMarkdown supports ordered (numbered) and unordered (bulleted) lists.\n\nUnordered lists use asterisks, pluses, and hyphens -- interchangably\n-- as list markers:\n\n * Red\n * Green\n * Blue\n\nis equivalent to:\n\n + Red\n + Green\n + Blue\n\nand:\n\n - Red\n - Green\n - Blue\n\nOrdered lists use numbers followed by periods:\n\n 1. Bird\n 2. McHale\n 3. Parish\n\nIt's important to note that the actual numbers you use to mark the\nlist have no effect on the HTML output Markdown produces. The HTML\nMarkdown produces from the above list is:\n\n
      \n
    1. Bird
    2. \n
    3. McHale
    4. \n
    5. Parish
    6. \n
    \n\nIf you instead wrote the list in Markdown like this:\n\n 1. Bird\n 1. McHale\n 1. Parish\n\nor even:\n\n 3. Bird\n 1. McHale\n 8. Parish\n\nyou'd get the exact same HTML output. The point is, if you want to,\nyou can use ordinal numbers in your ordered Markdown lists, so that\nthe numbers in your source match the numbers in your published HTML.\nBut if you want to be lazy, you don't have to.\n\nIf you do use lazy list numbering, however, you should still start the\nlist with the number 1. At some point in the future, Markdown may support\nstarting ordered lists at an arbitrary number.\n\nList markers typically start at the left margin, but may be indented by\nup to three spaces. List markers must be followed by one or more spaces\nor a tab.\n\nTo make lists look nice, you can wrap items with hanging indents:\n\n * Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n\nBut if you want to be lazy, you don't have to:\n\n * Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIf list items are separated by blank lines, Markdown will wrap the\nitems in `

    ` tags in the HTML output. For example, this input:\n\n * Bird\n * Magic\n\nwill turn into:\n\n

      \n
    • Bird
    • \n
    • Magic
    • \n
    \n\nBut this:\n\n * Bird\n\n * Magic\n\nwill turn into:\n\n
      \n
    • Bird

    • \n
    • Magic

    • \n
    \n\nList items may consist of multiple paragraphs. Each subsequent\nparagraph in a list item must be indented by either 4 spaces\nor one tab:\n\n " +- "\nBlockquotes can be nested (i.e. a blockquote-in-a-blockquote) by\nadding additional levels of `>`:\n\n > This is the first level of quoting.\n >\n > > This is nested blockquote.\n >\n > Back to the first level.\n\nBlockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:\n\n\t> " +- "## This is a header.\n\t>\n\t> 1. This is the first list item.\n\t> 2. This is the second list item.\n\t>\n\t> Here's some example code:\n\t>\n\t> return shell_exec(\"echo $input | $markdown_script\");\n\nAny decent text editor should make email-style quoting easy. For\nexample, with BBEdit, you can make a selection and choose Increase\nQuote Level from the Text menu.\n\n\n

    Lists

    \n\nMarkdown supports ordered (numbered) and unordered (bulleted) lists.\n\nUnordered lists use asterisks, pluses, and hyphens -- interchangably\n-- as list markers:\n\n * Red\n * Green\n * Blue\n\nis equivalent to:\n\n + Red\n + Green\n + Blue\n\nand:\n\n - Red\n - Green\n - Blue\n\nOrdered lists use numbers followed by periods:\n\n 1. Bird\n 2. McHale\n 3. Parish\n\nIt's important to note that the actual numbers you use to mark the\nlist have no effect on the HTML output Markdown produces. The HTML\nMarkdown produces from the above list is:\n\n
      \n
    1. Bird
    2. \n
    3. McHale
    4. \n
    5. Parish
    6. \n
    \n\nIf you instead wrote the list in Markdown like this:\n\n 1. Bird\n 1. McHale\n 1. Parish\n\nor even:\n\n 3. Bird\n 1. McHale\n 8. Parish\n\nyou'd get the exact same HTML output. The point is, if you want to,\nyou can use ordinal numbers in your ordered Markdown lists, so that\nthe numbers in your source match the numbers in your published HTML.\nBut if you want to be lazy, you don't have to.\n\nIf you do use lazy list numbering, however, you should still start the\nlist with the number 1. At some point in the future, Markdown may support\nstarting ordered lists at an arbitrary number.\n\nList markers typically start at the left margin, but may be indented by\nup to three spaces. List markers must be followed by one or more spaces\nor a tab.\n\nTo make lists look nice, you can wrap items with hanging indents:\n\n * Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n\nBut if you want to be lazy, you don't have to:\n\n * Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIf list items are separated by blank lines, Markdown will wrap the\nitems in `

    ` tags in the HTML output. For example, this input:\n\n * Bird\n * Magic\n\nwill turn into:\n\n

      \n
    • Bird
    • \n
    • Magic
    • \n
    \n\nBut this:\n\n * Bird\n\n * Magic\n\nwill turn into:\n\n
      \n
    • Bird

    • \n
    • Magic

    • \n
    \n\nList items may consist of multiple paragraphs. Each subsequent\nparagraph in a list item must be indented by either 4 spaces\nor one tab:\n\n " - "1. This is a list item with two paragraphs. Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit. Aliquam hendrerit\n mi posuere lectus.\n\n Vestibulum enim wisi, viverra nec, fringilla in, laoreet\n vitae, risus. Donec sit amet nisl. Aliquam semper ipsum\n sit amet velit.\n\n 2. Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIt looks nice if you indent every line of the subsequent\nparagraphs, but here again, Markdown will allow you to be\nlazy:\n\n * This is a list item with two paragraphs.\n\n This is the second paragraph in the list item. You're\n only required to indent the first line. Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit.\n\n * Another item in the same list.\n\nTo put a blockquote within a list item, the blockquote's `>`\ndelimiters need to be indented:\n\n * A list item with a blockquote:\n\n > This is a blockquote\n > inside a list item.\n\nTo put a code block within a list item, the code block needs\nto be indented *twice* -- 8 spaces or two tabs:\n\n * A list item with a code block:\n\n \n\n\nIt's worth noting that it's possible to trigger an ordered list by\naccident, by writing something like this:\n\n 1986. What a great season.\n\nIn other words, a *number-period-space* sequence at the beginning of a\nline. To avoid this, you can backslash-escape the period:\n\n 1986\\. What a great season.\n\n\n\n

    Code Blocks

    \n\nPre-formatted code blocks are used for writing about programming or\nmarkup source code. Rather than forming normal paragraphs, the lines\nof a code block are interpreted literally. Markdown wraps a code block\nin both `
    ` and `` tags.\n\nTo produce a code block in Markdown, simply indent every line of the\nblock by at least 4 spaces or 1 tab. For example, given this input:\n\n    This is a normal paragraph:\n\n        This is a code block.\n\nMarkdown will generate:\n\n    

    This is a normal paragraph:

    \n\n
    This is a code block.\n    
    \n\nOne level of indentation -- 4 spaces or 1 tab -- is removed from each\nline of the code block. For example, this:\n\n Here is an example of AppleScript:\n\n tell application \"Foo\"\n beep\n end tell\n\nwill turn into:\n\n

    Here is an example of AppleScript:

    \n\n
    tell application \"Foo\"\n        beep\n    end tell\n    
    \n\nA code block continues until it reaches a line that is not indented\n(or the end of the article).\n\nWithin a code block, ampersands (`&`) and angle brackets (`<` and `>`)\nare automatically converted into HTML entities. This makes it very\neasy to include example HTML source code using Markdown -- just paste\nit and indent it, and Markdown will handle the hassle of encoding the\nampersands and angle brackets. For example, this:\n\n
    \n © 2004 Foo Corporation\n
    \n\nwill turn into:\n\n
    <div class=\"footer\">\n        &copy; 2004 Foo Corporation\n    </div>\n    
    \n\nRegular Markdown syntax is not processed within code blocks. E.g.,\nasterisks are just literal asterisks within a code block. This means\nit's also easy to use Markdown to write about Markdown's own syntax.\n\n\n\n

    Horizontal Rules

    \n\nYou can produce a horizontal rule tag (`
    `) by placing three or\nmore hyphens, asterisks, or underscores on a line by themselves. If you\nwish, you may use spaces between the hyphens or asterisks. Each of the\nfollowing lines will produce a horizontal rule:\n\n " - "* * *\n\n ***\n\n *****\n\n - - -\n\n ---------------------------------------\n\n\n* * *\n" - "\n

    Span Elements

    \n\n

    Links

    \n\nMarkdown supports two style of links: *inline* and *reference*.\n\nIn both styles, the link text is delimited by [square brackets].\n\nTo create an inline link, use a set of regular parentheses immediately\nafter the link text's closing square bracket. Inside the parentheses,\nput the URL where you want the link to point, along with an *optional*\ntitle for the link, surrounded in quotes. For example:\n\n This is [an example](http://example.com/ \"Title\") inline link.\n\n [This link](http://example.net/) has no title attribute.\n\nWill produce:\n\n

    This is \n an example inline link.

    \n\n

    This link has no\n title attribute.

    \n\nIf you're referring to a local resource on the same server, you can\nuse relative paths:\n\n See my [About](/about/) page for details.\n\nReference-style links use a second set of square brackets, inside\nwhich you place a label of your choosing to identify the link:\n\n This is [an example][id] reference-style link.\n\nYou can optionally use a space to separate the sets of brackets:\n\n This is [an example] [id] reference-style link.\n\nThen, anywhere in the document, you define your link label like this,\non a line by itself:\n\n [id]: http://example.com/ \"Optional Title Here\"\n\nThat is:\n\n* Square brackets containing the link identifier (optionally\n indented from the left margin using up to three spaces);\n* followed by a colon;\n* followed by one or more spaces (or tabs);\n* followed by the URL for the link;\n* optionally followed by a title attribute for the link, enclosed\n in double or single quotes, or enclosed in parentheses.\n\nThe following three link definitions are equivalent:\n\n\t[foo]: http://example.com/ \"Optional Title Here\"\n\t[foo]: http://example.com/ 'Optional Title Here'\n\t[foo]: http://example.com/ (Optional Title Here)\n\n**Note:** There is a known bug in Markdown.pl 1.0.1 which prevents\nsingle quotes from being used to delimit link titles.\n\nThe link URL may, optionally, be surrounded by angle brackets:\n\n [id]: \"Optional Title Here\"\n\nYou can put the title attribute on the next line and use extra spaces\nor tabs for padding, which tends to look better with longer URLs:\n\n [id]: http://example.com/longish/path/to/resource/here\n \"Optional Title Here\"\n\nLink definitions are only used for creating links during Markdown\nprocessing, and are stripped from your document in the HTML output.\n\nLink definition names may consist of letters, numbers, spaces, and\npunctuation -- but they are *not* case sensitive. E.g. 
these two\nlinks:\n\n\t[link text][a]\n\t[link text][A]\n\nare equivalent.\n\nThe *implicit link name* shortcut allows you to omit the name of the\nlink, in which case the link text itself is used as the name.\nJust use an empty set of square brackets -- e.g., to link the word\n\"Google\" to the google.com web site, you could simply write:\n\n\t[Google][]\n\nAnd then define the link:\n\n\t[Google]: http://google.com/\n\nBecause link names may contain spaces, this shortcut even works for\nmultiple words in the link text:\n\n\tVisit [Daring Fireball][] for more information.\n\nAnd then define the link:\n\n\t[Daring Fireball]: http://daringfireball.net/\n\nLink definitions can be placed anywhere in your Markdown document. I\ntend to put them immediately after each paragraph in which they're\nused, but if you want, you can put them all at the end of your\ndocument, sort of like footnotes.\n\nHere's an example of reference links in action:\n\n " diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@markdown_syntax.md.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@markdown_syntax.md.snap index 19fc2848..bd0110f4 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@markdown_syntax.md.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@markdown_syntax.md.snap @@ -3,8 +3,7 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/markdown_syntax.md --- -- "Markdown: Syntax" -- "\n" +- "Markdown: Syntax\n" - "==========" - "======\n\n" - "
      \n > Back to the first level.\n\n" - Blockquotes can contain other Markdown elements - ", including headers, lists,\n" -- "and code blocks:\n\n" -- "\t> ## This is a header.\n\t>\n" -- "\t>" +- "and code blocks:\n\n\t> " +- "## This is a header.\n\t>\n\t>" - " 1. This is the first list item.\n" - "\t>" - " 2. This is the second list item.\n\t>\n" @@ -585,8 +583,7 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "line. " - "To avoid this, you can backslash-escape " - "the period:\n\n " -- "1986\\. What a great season.\n" -- "\n\n\n" +- "1986\\. What a great season.\n\n\n\n" - "

      " - "Code Blocks

      \n\n" - "Pre-formatted code blocks are used for writing " diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md-2.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md-2.snap index 31da3887..f5fd2d00 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md-2.snap @@ -5,12 +5,10 @@ input_file: tests/inputs/markdown/github_flavored.md --- - "# Headers\n\n```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n```" - "# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n\n------" -- "# Emphasis" -- "```\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__" -- "*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n```\n\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**." +- "# Emphasis\n\n```\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. 
~~Scratch this.~~\n\n**This is bold text**" +- "__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n```\n\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**." - "Strikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n\n------" -- "# Lists" -- "```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items." +- "# Lists\n\n```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items." - "Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)" - "* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. 
Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n+ Create a list by starting a line with `+`, `-`, or `*`" - "+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n```" @@ -19,20 +17,18 @@ input_file: tests/inputs/markdown/github_flavored.md - "⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses" - "1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback" - "+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n\n------" -- "# Task lists" -- "```\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [x] this is a complete item\n- [ ] this is an incomplete item" -- "```\n\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [ ] this is a complete item\n- [ ] this is an incomplete item" +- "# Task lists\n\n```\n- [x] Finish my changes\n- [ ] Push my 
commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [x] this is a complete item" +- "- [ ] this is an incomplete item\n```" +- "- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [ ] this is a complete item\n- [ ] this is an incomplete item" - "------\n\n# Ignoring Markdown formatting\n\nYou can tell GitHub to ignore (or escape) Markdown formatting by using \\ before the Markdown character.\n\n```\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n```\n\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n\n------" -- "# Links" -- "```\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]" +- "# Links\n\n```\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]" - "[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes" - "example.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n```" - "[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with 
title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]" - "[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself]." - "URLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later." - "[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n\n------" -- "# Images" -- "```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]" +- "# Images\n\n```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]" - "[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![" - "Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:" - "[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n```\n\nHere's our logo (hover to see the title text):" @@ -40,12 +36,10 @@ input_file: tests/inputs/markdown/github_flavored.md - "[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"" - 
"![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]" - "With a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n\n------" -- "# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)" -- "```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n```" -- "Footnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n\n------" -- "# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\nInline `code` has `back-ticks around` it." -- "```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n [Obsolete(\"...\")]\n class Program : IInterface\n {\n public static List JustDoIt(int count)\n {\n Console.WriteLine($\"Hello {Name}!\");" -- "return new List(new int[] { 1, 2, 3 })\n }\n }\n}\n```" +- "# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs." 
+- "[^second]: Footnote text.\n```\n\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n\n------" +- "# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\nInline `code` has `back-ticks around` it.\n\n```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n [Obsolete(\"...\")]\n class Program : IInterface\n {" +- "public static List JustDoIt(int count)\n {\n Console.WriteLine($\"Hello {Name}!\");\n return new List(new int[] { 1, 2, 3 })\n }\n }\n}\n```" - "```css\n@font-face {\n font-family: Chunkfive; src: url('Chunkfive.otf');\n}\n\nbody, .usertext {\n color: #F0F0F0; background: #600;\n font-family: Chunkfive, sans;\n}\n\n@import url(print.css);\n@media print {\n a[href^=http]::after {" - "content: attr(href)\n }\n}\n```" - "```javascript\nfunction $initHighlight(block, cls) {\n try {\n if (cls.search(/\\bno\\-highlight\\b/) != -1)\n return process(block, true, 0x0F) +\n ` class=\"${cls}\"`;\n } catch (e) {\n /* handle exception */\n }" @@ -54,8 +48,7 @@ input_file: tests/inputs/markdown/github_flavored.md - "* @return URI\n */\n static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? $uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check" - "if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }\n\n $this->var = 0 - self::$st;\n $this->list = list(Array(\"1\"=> 2, 2=>self::ME, 3 => \\Location\\Web\\URI::class));\n\n return [" - "'uri' => $uri,\n 'value' => null,\n ];\n }\n}\n\necho URI::ME . 
URI::$st1;\n\n__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------" -- "# Tables" -- "```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |" +- "# Tables\n\n```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |" - "There must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |" - "| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |" - "| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |" @@ -65,16 +58,14 @@ input_file: tests/inputs/markdown/github_flavored.md - "| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |" - "| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |" - "| Left-aligned | 
Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------" -- "# Blockquotes" -- "```\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote." +- "# Blockquotes\n\n```\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote." - "> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n```\n\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break." - "> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote." - "> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n\n------" - "# Inline HTML\n\n```\n
      \n
      Definition list
      \n
      Is something people use sometimes.
      \n\n
      Markdown in HTML
      \n
      Does *not* work **very** well. Use HTML tags.
      \n
      \n```" - "
      \n
      Definition list
      \n
      Is something people use sometimes.
      \n\n
      Markdown in HTML
      \n
      Does *not* work **very** well. Use HTML tags.
      \n
      \n\n------" - "# Horizontal Rules\n\n```\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n```\n\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n\n------" -- "# YouTube Videos" -- "```\n" +- "# YouTube Videos\n\n```\n" - "\"IMAGE\n\n```" - "" - "\"IMAGE\n" diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md.snap index 5a233863..eed9f527 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md.snap @@ -3,8 +3,7 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/github_flavored.md --- -- "# Headers" -- "```" +- "# Headers\n\n```" - "# h1 Heading 8-)" - "## h2 Heading" - "### h3 Heading" @@ -15,20 +14,19 @@ input_file: tests/inputs/markdown/github_flavored.md - "underline-ish style:" - "Alt-H1\n======" - "Alt-H2\n------" -- "```\n\n#" -- "h1 Heading 8-)\n##" -- "h2 Heading\n###" -- "h3 Heading\n####" -- "h4 Heading\n#####" -- "h5 Heading\n######" -- h6 Heading +- "```" +- "# h1 Heading 8-)" +- "## h2 Heading" +- "### h3 Heading" +- "#### h4 Heading" +- "##### h5 Heading" +- "###### h6 Heading" - "Alternatively, for H1 and H2, an" - "underline-ish style:" - "Alt-H1\n======" - "Alt-H2\n------" - "------" -- "# Emphasis" -- "```" +- "# Emphasis\n\n```" - "Emphasis, aka italics, with *" - asterisks* or - _underscores_. @@ -60,9 +58,8 @@ input_file: tests/inputs/markdown/github_flavored.md - _This is italic text_ - ~~Strikethrough~~ - "------" -- "# Lists" -- "```\n1. First ordered list item" -- 2. Another item +- "# Lists\n\n```\n1." +- "First ordered list item\n2. Another item" - ⋅⋅* Unordered sub-list. - "1." 
- "Actual numbers don't matter, just that" @@ -153,8 +150,7 @@ input_file: tests/inputs/markdown/github_flavored.md - "- Nulla volutpat" - "aliquam velit\n+ Very easy!" - "------" -- "# Task lists" -- "```" +- "# Task lists\n\n```" - "- [x] Finish my changes" - "- [ ] Push my commits to" - GitHub @@ -193,8 +189,7 @@ input_file: tests/inputs/markdown/github_flavored.md - "-project\\* to \\*our-old" - "-project\\*." - "------" -- "# Links" -- "```" +- "# Links\n\n```" - "[I'm an inline-style link" - "](https://" - www.google.com) @@ -264,8 +259,7 @@ input_file: tests/inputs/markdown/github_flavored.md - "[link text itself]: http://" - www.reddit.com - "------" -- "# Images" -- "```" +- "# Images\n\n```" - "Here's our logo (hover to see" - "the title text):\n\nInline-style:" - "![" @@ -336,8 +330,7 @@ input_file: tests/inputs/markdown/github_flavored.md - /images/dojocat.jpg - "\"The Dojocat\"" - "------" -- "#" -- "[Footnotes](https://" +- "# [Footnotes](https://" - github.com/markdown-it - /markdown-it-footnote) - "```" @@ -457,8 +450,7 @@ input_file: tests/inputs/markdown/github_flavored.md - __halt_compiler () ; datahere - "datahere\ndatahere */\ndatahere" - "```\n\n------" -- "# Tables" -- "```" +- "# Tables\n\n```" - Colons can be used to align columns. - "| Tables | Are | Cool |" - "| ---------" @@ -568,8 +560,7 @@ input_file: tests/inputs/markdown/github_flavored.md - "| Backtick | ` |" - "| Pipe | \\| |" - "------" -- "# Blockquotes" -- "```" +- "# Blockquotes\n\n```" - "> Blockquotes are very handy in email" - to emulate reply text. - "> This line is part of the same quote." @@ -605,8 +596,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "> >" - "> ...or with spaces between arrows." - "------" -- "# Inline HTML" -- "```\n
      " +- "# Inline HTML\n\n```" +- "
      " - "
      Definition list - "
      Is something people use sometimes." @@ -631,16 +622,15 @@ input_file: tests/inputs/markdown/github_flavored.md - em>.
      - "
      " - "------" -- "# Horizontal Rules" -- "```\nThree or more..." -- "---\n\nHyphens\n\n***" +- "# Horizontal Rules\n\n```" +- "Three or more...\n\n---" +- "Hyphens\n\n***" - "Asterisks\n\n___\n\nUnderscores" - "```\n\nThree or more..." - "---\n\nHyphens\n\n***" - "Asterisks\n\n___\n\nUnderscores" - "------" -- "# YouTube Videos" -- "```" +- "# YouTube Videos\n\n```" - "` and `

      ` are created by\n\"underlining\" with equal signs (`=`) and hyphens (`-`), respectively.\nTo create an atx-style header, you put 1-6 hash marks (`#`) at the" - "beginning of the line -- the number of hashes equals the resulting\nHTML header level.\n\nBlockquotes are indicated using email-style '`>`' angle brackets.\n\nMarkdown:" - "A First Level Header\n ====================\n\n A Second Level Header\n ---------------------\n\n Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.\n\n The quick brown fox jumped over the lazy\n dog's back.\n\n ### Header 3" - "> This is a blockquote.\n >\n > This is the second paragraph in the blockquote.\n >\n > ## This is an H2 in a blockquote\n\n\nOutput:" - "

      A First Level Header

      \n\n

      A Second Level Header

      \n\n

      Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.

      \n\n

      The quick brown fox jumped over the lazy\n dog's back.

      \n\n

      Header 3

      \n\n
      " -- "

      This is a blockquote.

      \n\n

      This is the second paragraph in the blockquote.

      \n\n

      This is an H2 in a blockquote

      \n
      \n\n\n\n### Phrase Emphasis ###\n\nMarkdown uses asterisks and underscores to indicate spans of emphasis.\n\nMarkdown:" -- "Some of these words *are emphasized*.\n Some of these words _are emphasized also_.\n\n Use two asterisks for **strong emphasis**.\n Or, if you prefer, __use two underscores instead__.\n\nOutput:" +- "

      This is a blockquote.

      \n\n

      This is the second paragraph in the blockquote.

      \n\n

      This is an H2 in a blockquote

      \n " +- "### Phrase Emphasis ###\n\nMarkdown uses asterisks and underscores to indicate spans of emphasis.\n\nMarkdown:\n\n Some of these words *are emphasized*.\n Some of these words _are emphasized also_.\n\n Use two asterisks for **strong emphasis**.\n Or, if you prefer, __use two underscores instead__.\n\nOutput:" - "

      Some of these words are emphasized.\n Some of these words are emphasized also.

      \n\n

      Use two asterisks for strong emphasis.\n Or, if you prefer, use two underscores instead.

      " - "## Lists ##\n\nUnordered (bulleted) lists use asterisks, pluses, and hyphens (`*`,\n`+`, and `-`) as list markers. These three markers are\ninterchangable; this:\n\n * Candy.\n * Gum.\n * Booze.\n\nthis:\n\n + Candy.\n + Gum.\n + Booze.\n\nand this:\n\n - Candy.\n - Gum.\n - Booze." - "all produce the same output:\n\n
        \n
      • Candy.
      • \n
      • Gum.
      • \n
      • Booze.
      • \n
      \n\nOrdered (numbered) lists use regular numbers, followed by periods, as\nlist markers:\n\n 1. Red\n 2. Green\n 3. Blue\n\nOutput:" @@ -29,8 +29,9 @@ input_file: tests/inputs/markdown/markdown_basics.md - "I get 10 times more traffic from [Google][1] than from\n [Yahoo][2] or [MSN][3].\n\n [1]: http://google.com/ \"Google\"\n [2]: http://search.yahoo.com/ \"Yahoo Search\"\n [3]: http://search.msn.com/ \"MSN Search\"\n\nOutput:" - "

      I get 10 times more traffic from Google than from Yahoo or MSN.

      \n\nThe title attribute is optional. Link names may contain letters,\nnumbers and spaces, but are *not* case sensitive:\n\n I start my morning with a cup of coffee and\n [The New York Times][NY Times].\n\n [ny times]: http://www.nytimes.com/\n\nOutput:" -- "

      I start my morning with a cup of coffee and\n The New York Times.

      \n\n\n### Images ###\n\nImage syntax is very much like link syntax.\n\nInline (titles are optional):\n\n ![alt text](/path/to/img.jpg \"Title\")\n\nReference-style:" -- "![alt text][id]\n\n [id]: /path/to/img.jpg \"Title\"\n\nBoth of the above examples produce the same output:\n\n \"alt" +- "

      I start my morning with a cup of coffee and\n The New York Times.

      " +- "### Images ###\n\nImage syntax is very much like link syntax.\n\nInline (titles are optional):\n\n ![alt text](/path/to/img.jpg \"Title\")\n\nReference-style:\n\n ![alt text][id]\n\n [id]: /path/to/img.jpg \"Title\"\n\nBoth of the above examples produce the same output:" +- "\"alt" - "### Code ###\n\nIn a regular paragraph, you can create code span by wrapping text in\nbacktick quotes. Any ampersands (`&`) and angle brackets (`<` or\n`>`) will automatically be translated into HTML entities. This makes\nit easy to use Markdown to write about HTML example code:" - "I strongly recommend against using any `` tags.\n\n I wish SmartyPants used named entities like `—`\n instead of decimal-encoded entities like `—`.\n\nOutput:" - "

      I strongly recommend against using any\n <blink> tags.

      \n\n

      I wish SmartyPants used named entities like\n &mdash; instead of decimal-encoded\n entities like &#8212;.

      " diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@markdown_basics.md-3.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@markdown_basics.md-3.snap index 65ffb01a..cffdfe98 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@markdown_basics.md-3.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@markdown_basics.md-3.snap @@ -3,7 +3,8 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/markdown_basics.md --- -- "Markdown: Basics\n================\n\n\n\n\nGetting the Gist of Markdown's Formatting Syntax\n------------------------------------------------\n\nThis page offers a brief overview of what it's like to use Markdown.\nThe [syntax page] [s] provides complete, detailed documentation for\nevery feature, but Markdown should be very easy to pick up simply by\nlooking at a few examples of it in action. The examples on this page\nare written in a before/after style, showing example syntax and the\nHTML output produced by Markdown.\n\nIt's also helpful to simply try Markdown out; the [Dingus] [d] is a\nweb application that allows you type your own Markdown-formatted text\nand translate it to XHTML.\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL] [src].\n\n [s]: /projects/markdown/syntax \"Markdown Syntax\"\n [d]: /projects/markdown/dingus \"Markdown Dingus\"\n [src]: /projects/markdown/basics.text\n\n\n## Paragraphs, Headers, Blockquotes ##\n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like\na blank line -- a line containing nothing but spaces or tabs is\nconsidered blank.) Normal paragraphs should not be indented with\nspaces or tabs.\n\nMarkdown offers two styles of headers: *Setext* and *atx*.\nSetext-style headers for `

      ` and `

      ` are created by\n\"underlining\" with equal signs (`=`) and hyphens (`-`), respectively.\nTo create an atx-style header, you put 1-6 hash marks (`#`) at the\nbeginning of the line -- the number of hashes equals the resulting\nHTML header level.\n\nBlockquotes are indicated using email-style '`>`' angle brackets.\n\nMarkdown:\n\n A First Level Header\n ====================\n\n A Second Level Header\n ---------------------\n\n Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.\n\n The quick brown fox jumped over the lazy\n dog's back.\n\n ### Header 3\n\n > This is a blockquote.\n >\n > This is the second paragraph in the blockquote.\n >\n > ## This is an H2 in a blockquote\n\n\nOutput:" -- "

      A First Level Header

      \n\n

      A Second Level Header

      \n\n

      Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.

      \n\n

      The quick brown fox jumped over the lazy\n dog's back.

      \n\n

      Header 3

      \n\n
      \n

      This is a blockquote.

      \n\n

      This is the second paragraph in the blockquote.

      \n\n

      This is an H2 in a blockquote

      \n
      \n\n\n\n### Phrase Emphasis ###\n\nMarkdown uses asterisks and underscores to indicate spans of emphasis.\n\nMarkdown:\n\n Some of these words *are emphasized*.\n Some of these words _are emphasized also_.\n\n Use two asterisks for **strong emphasis**.\n Or, if you prefer, __use two underscores instead__.\n\nOutput:\n\n

      Some of these words are emphasized.\n Some of these words are emphasized also.

      \n\n

      Use two asterisks for strong emphasis.\n Or, if you prefer, use two underscores instead.

      \n\n\n\n## Lists ##\n\nUnordered (bulleted) lists use asterisks, pluses, and hyphens (`*`,\n`+`, and `-`) as list markers. These three markers are\ninterchangable; this:\n\n * Candy.\n * Gum.\n * Booze.\n\nthis:\n\n + Candy.\n + Gum.\n + Booze.\n\nand this:\n\n - Candy.\n - Gum.\n - Booze.\n\nall produce the same output:\n\n
        \n
      • Candy.
      • \n
      • Gum.
      • \n
      • Booze.
      • \n
      \n\nOrdered (numbered) lists use regular numbers, followed by periods, as\nlist markers:\n\n 1. Red\n 2. Green\n 3. Blue\n\nOutput:\n\n
        \n
      1. Red
      2. \n
      3. Green
      4. \n
      5. Blue
      6. \n
      \n\nIf you put blank lines between items, you'll get `

      ` tags for the\nlist item text. You can create multi-paragraph list items by indenting\nthe paragraphs by 4 spaces or 1 tab:\n\n * A list item.\n\n With multiple paragraphs.\n\n * Another item in the list.\n\nOutput:\n\n

        \n
      • A list item.

        \n

        With multiple paragraphs.

      • \n
      • Another item in the list.

      • \n
      \n\n\n\n### Links ###\n\nMarkdown supports two styles for creating links: *inline* and\n*reference*. With both styles, you use square brackets to delimit the\ntext you want to turn into a link.\n\nInline-style links use parentheses immediately after the link text.\nFor example:\n\n This is an [example link](http://example.com/).\n\nOutput:\n\n

      This is an \n example link.

      \n\nOptionally, you may include a title attribute in the parentheses:\n\n This is an [example link](http://example.com/ \"With a Title\").\n\nOutput:\n\n

      This is an \n example link.

      \n\nReference-style links allow you to refer to your links by names, which\nyou define elsewhere in your document:\n\n I get 10 times more traffic from [Google][1] than from\n [Yahoo][2] or [MSN][3].\n\n [1]: http://google.com/ \"Google\"\n [2]: http://search.yahoo.com/ \"Yahoo Search\"\n [3]: http://search.msn.com/ \"MSN Search\"\n\nOutput:" -- "

      I get 10 times more traffic from Google than from Yahoo or MSN.

      \n\nThe title attribute is optional. Link names may contain letters,\nnumbers and spaces, but are *not* case sensitive:\n\n I start my morning with a cup of coffee and\n [The New York Times][NY Times].\n\n [ny times]: http://www.nytimes.com/\n\nOutput:\n\n

      I start my morning with a cup of coffee and\n The New York Times.

      \n\n\n### Images ###\n\nImage syntax is very much like link syntax.\n\nInline (titles are optional):\n\n ![alt text](/path/to/img.jpg \"Title\")\n\nReference-style:\n\n ![alt text][id]\n\n [id]: /path/to/img.jpg \"Title\"\n\nBoth of the above examples produce the same output:\n\n \"alt\n\n\n\n### Code ###\n\nIn a regular paragraph, you can create code span by wrapping text in\nbacktick quotes. Any ampersands (`&`) and angle brackets (`<` or\n`>`) will automatically be translated into HTML entities. This makes\nit easy to use Markdown to write about HTML example code:\n\n I strongly recommend against using any `` tags.\n\n I wish SmartyPants used named entities like `—`\n instead of decimal-encoded entities like `—`.\n\nOutput:\n\n

      I strongly recommend against using any\n <blink> tags.

      \n\n

      I wish SmartyPants used named entities like\n &mdash; instead of decimal-encoded\n entities like &#8212;.

      \n\n\nTo specify an entire block of pre-formatted code, indent every line of\nthe block by 4 spaces or 1 tab. Just like with code spans, `&`, `<`,\nand `>` characters will be escaped automatically.\n\nMarkdown:\n\n If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:\n\n
      \n

      For example.

      \n
      \n\nOutput:\n\n

      If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:

      \n\n
      <blockquote>\n        <p>For example.</p>\n    </blockquote>\n    
      " +- "Markdown: Basics\n================\n\n\n\n\nGetting the Gist of Markdown's Formatting Syntax\n------------------------------------------------\n\nThis page offers a brief overview of what it's like to use Markdown.\nThe [syntax page] [s] provides complete, detailed documentation for\nevery feature, but Markdown should be very easy to pick up simply by\nlooking at a few examples of it in action. The examples on this page\nare written in a before/after style, showing example syntax and the\nHTML output produced by Markdown.\n\nIt's also helpful to simply try Markdown out; the [Dingus] [d] is a\nweb application that allows you type your own Markdown-formatted text\nand translate it to XHTML.\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL] [src].\n\n [s]: /projects/markdown/syntax \"Markdown Syntax\"\n [d]: /projects/markdown/dingus \"Markdown Dingus\"\n [src]: /projects/markdown/basics.text\n\n\n## Paragraphs, Headers, Blockquotes ##\n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like\na blank line -- a line containing nothing but spaces or tabs is\nconsidered blank.) Normal paragraphs should not be indented with\nspaces or tabs.\n\nMarkdown offers two styles of headers: *Setext* and *atx*.\nSetext-style headers for `

      ` and `

      ` are created by\n\"underlining\" with equal signs (`=`) and hyphens (`-`), respectively.\nTo create an atx-style header, you put 1-6 hash marks (`#`) at the\nbeginning of the line -- the number of hashes equals the resulting\nHTML header level.\n\nBlockquotes are indicated using email-style '`>`' angle brackets.\n\nMarkdown:\n\n A First Level Header\n ====================\n\n A Second Level Header\n ---------------------\n\n Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.\n\n The quick brown fox jumped over the lazy\n dog's back.\n\n ### Header 3\n\n > This is a blockquote.\n >\n > This is the second paragraph in the blockquote.\n >\n > ## This is an H2 in a blockquote\n\n\nOutput:\n\n

      A First Level Header

      \n\n

      A Second Level Header

      \n\n

      Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.

      \n\n

      The quick brown fox jumped over the lazy\n dog's back.

      \n\n

      Header 3

      \n\n
      \n

      This is a blockquote.

      \n\n

      This is the second paragraph in the blockquote.

      " +- "

      This is an H2 in a blockquote

      \n
      \n\n\n\n### Phrase Emphasis ###\n\nMarkdown uses asterisks and underscores to indicate spans of emphasis.\n\nMarkdown:\n\n Some of these words *are emphasized*.\n Some of these words _are emphasized also_.\n\n Use two asterisks for **strong emphasis**.\n Or, if you prefer, __use two underscores instead__.\n\nOutput:\n\n

      Some of these words are emphasized.\n Some of these words are emphasized also.

      \n\n

      Use two asterisks for strong emphasis.\n Or, if you prefer, use two underscores instead.

      " +- "## Lists ##\n\nUnordered (bulleted) lists use asterisks, pluses, and hyphens (`*`,\n`+`, and `-`) as list markers. These three markers are\ninterchangable; this:\n\n * Candy.\n * Gum.\n * Booze.\n\nthis:\n\n + Candy.\n + Gum.\n + Booze.\n\nand this:\n\n - Candy.\n - Gum.\n - Booze.\n\nall produce the same output:\n\n
        \n
      • Candy.
      • \n
      • Gum.
      • \n
      • Booze.
      • \n
      \n\nOrdered (numbered) lists use regular numbers, followed by periods, as\nlist markers:\n\n 1. Red\n 2. Green\n 3. Blue\n\nOutput:\n\n
        \n
      1. Red
      2. \n
      3. Green
      4. \n
      5. Blue
      6. \n
      \n\nIf you put blank lines between items, you'll get `

      ` tags for the\nlist item text. You can create multi-paragraph list items by indenting\nthe paragraphs by 4 spaces or 1 tab:\n\n * A list item.\n\n With multiple paragraphs.\n\n * Another item in the list.\n\nOutput:\n\n

        \n
      • A list item.

        \n

        With multiple paragraphs.

      • \n
      • Another item in the list.

      • \n
      \n\n\n\n### Links ###\n\nMarkdown supports two styles for creating links: *inline* and\n*reference*. With both styles, you use square brackets to delimit the\ntext you want to turn into a link.\n\nInline-style links use parentheses immediately after the link text.\nFor example:\n\n This is an [example link](http://example.com/).\n\nOutput:\n\n

      This is an \n example link.

      \n\nOptionally, you may include a title attribute in the parentheses:\n\n This is an [example link](http://example.com/ \"With a Title\").\n\nOutput:\n\n

      This is an \n example link.

      \n\nReference-style links allow you to refer to your links by names, which\nyou define elsewhere in your document:\n\n I get 10 times more traffic from [Google][1] than from\n [Yahoo][2] or [MSN][3].\n\n [1]: http://google.com/ \"Google\"\n [2]: http://search.yahoo.com/ \"Yahoo Search\"\n [3]: http://search.msn.com/ \"MSN Search\"\n\nOutput:\n\n

      I get 10 times more traffic from Google than from Yahoo or MSN.

      \n\nThe title attribute is optional. Link names may contain letters,\nnumbers and spaces, but are *not* case sensitive:\n\n I start my morning with a cup of coffee and\n [The New York Times][NY Times].\n\n [ny times]: http://www.nytimes.com/\n\nOutput:\n\n

      I start my morning with a cup of coffee and\n The New York Times.

      " +- "### Images ###\n\nImage syntax is very much like link syntax.\n\nInline (titles are optional):\n\n ![alt text](/path/to/img.jpg \"Title\")\n\nReference-style:\n\n ![alt text][id]\n\n [id]: /path/to/img.jpg \"Title\"\n\nBoth of the above examples produce the same output:\n\n \"alt\n\n\n\n### Code ###\n\nIn a regular paragraph, you can create code span by wrapping text in\nbacktick quotes. Any ampersands (`&`) and angle brackets (`<` or\n`>`) will automatically be translated into HTML entities. This makes\nit easy to use Markdown to write about HTML example code:\n\n I strongly recommend against using any `` tags.\n\n I wish SmartyPants used named entities like `—`\n instead of decimal-encoded entities like `—`.\n\nOutput:\n\n

      I strongly recommend against using any\n <blink> tags.

      \n\n

      I wish SmartyPants used named entities like\n &mdash; instead of decimal-encoded\n entities like &#8212;.

      \n\n\nTo specify an entire block of pre-formatted code, indent every line of\nthe block by 4 spaces or 1 tab. Just like with code spans, `&`, `<`,\nand `>` characters will be escaped automatically.\n\nMarkdown:\n\n If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:\n\n
      \n

      For example.

      \n
      \n\nOutput:\n\n

      If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:

      \n\n
      <blockquote>\n        <p>For example.</p>\n    </blockquote>\n    
      " diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@markdown_basics.md.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@markdown_basics.md.snap index fc111a62..84c513ac 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@markdown_basics.md.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@markdown_basics.md.snap @@ -30,8 +30,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - "title=\"Online Markdown Web Form\">" - Dingus\n
    " -- "Getting the Gist of Markdown'" -- s Formatting Syntax +- "Getting the Gist of Markdown's" +- Formatting Syntax - "----------" - "----------" - "----------" @@ -64,9 +64,9 @@ input_file: tests/inputs/markdown/markdown_basics.md - "[d]: /projects/markdown/" - "dingus \"Markdown Dingus\"" - "[src]: /projects/markdown" -- "/basics.text\n\n\n##" -- "Paragraphs, Headers, Blockquotes" -- "##" +- /basics.text +- "## Paragraphs, Headers," +- "Blockquotes ##" - A paragraph is simply one or more consecutive lines of - "text, separated" - by one or more blank lines. @@ -129,8 +129,9 @@ input_file: tests/inputs/markdown/markdown_basics.md - blockquote.

    - "

    This is an H2 in" - a blockquote

    -- "\n\n\n\n###" -- "Phrase Emphasis ###" +- "" +- "### Phrase Emphasis ##" +- "#" - Markdown uses asterisks and - underscores to indicate spans of emphasis. - "Markdown:" @@ -148,7 +149,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - "strong emphasis." - "Or, if you prefer, use" - two underscores instead. -- "

    \n\n\n\n## Lists ##" +- "

    " +- "## Lists ##" - Unordered (bulleted) lists use - "asterisks, pluses, and" - "hyphens (`*`," @@ -195,8 +197,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - p>
  • - "
  • Another item in" - the list.

    \n \n\n\n\n###" -- "Links ###" +- "li>\n " +- "### Links ###" - "Markdown supports two styles for creating links: *" - "inline* and\n*reference*" - "." @@ -267,8 +269,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - "The New York Times." -- "

    \n\n\n### Images" -- "###" +- "

    " +- "### Images ###" - Image syntax is very much like link syntax. - "Inline (titles are optional):" - "![" @@ -282,8 +284,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - "\n\n\n\n### Code" -- "###" +- "\"Title\" />" +- "### Code ###" - "In a regular paragraph, you can create code span" - by wrapping text in - backtick quotes. Any ampersands ( diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@markdown_syntax.md-2.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@markdown_syntax.md-2.snap index 981ef7db..064e472e 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@markdown_syntax.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@markdown_syntax.md-2.snap @@ -9,14 +9,16 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "* [Overview](#overview)\n * [Philosophy](#philosophy)\n * [Inline HTML](#html)\n * [Automatic Escaping for Special Characters](#autoescape)\n* [Block Elements](#block)\n * [Paragraphs and Line Breaks](#p)\n * [Headers](#header)\n * [Blockquotes](#blockquote)\n * [Lists](#list)" - "* [Code Blocks](#precode)\n * [Horizontal Rules](#hr)\n* [Span Elements](#span)\n * [Links](#link)\n * [Emphasis](#em)\n * [Code](#code)\n * [Images](#img)\n* [Miscellaneous](#misc)" - "* [Backslash Escapes](#backslash)\n * [Automatic Links](#autolink)\n\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL][src].\n\n [src]: /projects/markdown/syntax.text\n\n* * *" -- "

    Overview

    \n\n

    Philosophy

    \n\nMarkdown is intended to be as easy-to-read and easy-to-write as is feasible." +- "

    Overview

    \n\n

    Philosophy

    " +- Markdown is intended to be as easy-to-read and easy-to-write as is feasible. - "Readability, however, is emphasized above all else. A Markdown-formatted\ndocument should be publishable as-is, as plain text, without looking\nlike it's been marked up with tags or formatting instructions. While\nMarkdown's syntax has been influenced by several existing text-to-HTML" - "filters -- including [Setext][1], [atx][2], [Textile][3], [reStructuredText][4],\n[Grutatext][5], and [EtText][6]" - "-- the single biggest source of\ninspiration for Markdown's syntax is the format of plain text email." - "[1]: http://docutils.sourceforge.net/mirror/setext.html\n [2]: http://www.aaronsw.com/2002/atx/\n [3]: https://web.archive.org/web/20021226035527/http://textism.com/tools/textile/" - "[4]: http://docutils.sourceforge.net/rst.html\n [5]: http://www.triptico.com/software/grutatxt.html\n [6]: http://ettext.taint.org/doc/" - "To this end, Markdown's syntax is comprised entirely of punctuation\ncharacters, which punctuation characters have been carefully chosen so\nas to look like what they mean. E.g., asterisks around a word actually\nlook like \\*emphasis\\*. Markdown lists look like, well, lists. Even\nblockquotes look like quoted passages of text, assuming you've ever\nused email." -- "

    Inline HTML

    \n\nMarkdown's syntax is intended for one purpose: to be used as a\nformat for *writing* for the web." +- "

    Inline HTML

    " +- "Markdown's syntax is intended for one purpose: to be used as a\nformat for *writing* for the web." - "Markdown is not a replacement for HTML, or even close to it. Its\nsyntax is very small, corresponding only to a very small subset of\nHTML tags. The idea is *not* to create a syntax that makes it easier\nto insert HTML tags. In my opinion, HTML tags are already easy to\ninsert. The idea for Markdown is to make it easy to read, write, and\nedit prose." - "HTML is a *publishing* format; Markdown is a *writing*\nformat. Thus, Markdown's formatting syntax only addresses issues that\ncan be conveyed in plain text.\n\nFor any markup that is not covered by Markdown's syntax, you simply\nuse HTML itself. There's no need to preface it or delimit it to\nindicate that you're switching from Markdown to HTML; you just use\nthe tags." - "The only restrictions are that block-level HTML elements -- e.g. `
    `,\n``, `
    `, `

    `, etc. -- must be separated from surrounding\ncontent by blank lines, and the start and end tags of the block should\nnot be indented with tabs or spaces. Markdown is smart enough not" @@ -30,25 +32,25 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "in your anchor tag `href` attribute. Needless to say, this is easy to\nforget, and is probably the single most common source of HTML validation\nerrors in otherwise well-marked-up web sites.\n\nMarkdown allows you to use these characters naturally, taking care of\nall the necessary escaping for you. If you use an ampersand as part of\nan HTML entity, it remains unchanged; otherwise it will be translated\ninto `&`." - "So, if you want to include a copyright symbol in your article, you can write:\n\n ©\n\nand Markdown will leave it alone. But if you write:\n\n AT&T\n\nMarkdown will translate it to:\n\n AT&T\n\nSimilarly, because Markdown supports [inline HTML](#html), if you use\nangle brackets as delimiters for HTML tags, Markdown will treat them as\nsuch. But if you write:\n\n 4 < 5" - "Markdown will translate it to:\n\n 4 < 5\n\nHowever, inside Markdown code spans and blocks, angle brackets and\nampersands are *always* encoded automatically. This makes it easy to use\nMarkdown to write about HTML code. (As opposed to raw HTML, which is a\nterrible format for writing about HTML syntax, because every single `<`\nand `&` in your example code needs to be escaped.)\n\n\n* * *" -- "

    Block Elements

    \n\n\n

    Paragraphs and Line Breaks

    \n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like a\nblank line -- a line containing nothing but spaces or tabs is considered\nblank.) Normal paragraphs should not be indented with spaces or tabs." +- "

    Block Elements

    \n\n\n

    Paragraphs and Line Breaks

    " +- "A paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like a\nblank line -- a line containing nothing but spaces or tabs is considered\nblank.) Normal paragraphs should not be indented with spaces or tabs." - "The implication of the \"one or more consecutive lines of text\" rule is\nthat Markdown supports \"hard-wrapped\" text paragraphs. This differs\nsignificantly from most other text-to-HTML formatters (including Movable\nType's \"Convert Line Breaks\" option) which translate every line break\ncharacter in a paragraph into a `
    ` tag." - "When you *do* want to insert a `
    ` break tag using Markdown, you\nend a line with two or more spaces, then type return." - "Yes, this takes a tad more effort to create a `
    `, but a simplistic\n\"every line break is a `
    `\" rule wouldn't work for Markdown.\nMarkdown's email-style [blockquoting][bq] and multi-paragraph [list items][l]\nwork best -- and look better -- when you format them with hard breaks." - "[bq]: #blockquote\n [l]: #list\n\n\n\n

    Headers

    \n\nMarkdown supports two styles of headers, [Setext] [1] and [atx] [2].\n\nSetext-style headers are \"underlined\" using equal signs (for first-level\nheaders) and dashes (for second-level headers). For example:" - "This is an H1\n =============\n\n This is an H2\n -------------\n\nAny number of underlining `=`'s or `-`'s will work.\n\nAtx-style headers use 1-6 hash characters at the start of the line,\ncorresponding to header levels 1-6. For example:" - "# This is an H1\n\n ## This is an H2\n\n ###### This is an H6\n\nOptionally, you may \"close\" atx-style headers. This is purely\ncosmetic -- you can use this if you think it looks better. The\nclosing hashes don't even need to match the number of hashes\nused to open the header. (The number of opening hashes\ndetermines the header level.) :" -- "# This is an H1 #\n\n ## This is an H2 ##\n\n ### This is an H3 ######" -- "

    Blockquotes

    \n\nMarkdown uses email-style `>` characters for blockquoting. If you're\nfamiliar with quoting passages of text in an email message, then you\nknow how to create a blockquote in Markdown. It looks best if you hard\nwrap the text and put a `>` before every line:" +- "# This is an H1 #\n\n ## This is an H2 ##\n\n ### This is an H3 ######\n\n\n

    Blockquotes

    " +- "Markdown uses email-style `>` characters for blockquoting. If you're\nfamiliar with quoting passages of text in an email message, then you\nknow how to create a blockquote in Markdown. It looks best if you hard\nwrap the text and put a `>` before every line:" - "> This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n > consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n > Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n >\n > Donec sit amet nisl." - "Aliquam semper ipsum sit amet velit. Suspendisse\n > id sem consectetuer libero luctus adipiscing.\n\nMarkdown allows you to be lazy and only put the `>` before the first\nline of a hard-wrapped paragraph:" - "> This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n\n > Donec sit amet nisl." - "Aliquam semper ipsum sit amet velit. Suspendisse\n id sem consectetuer libero luctus adipiscing.\n\nBlockquotes can be nested (i.e. a blockquote-in-a-blockquote) by\nadding additional levels of `>`:" -- "> This is the first level of quoting.\n >\n > > This is nested blockquote.\n >\n > Back to the first level.\n\nBlockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:" -- "> ## This is a header.\n\t>\n\t> 1. This is the first list item.\n\t> 2. This is the second list item.\n\t>\n\t> Here's some example code:\n\t>\n\t> return shell_exec(\"echo $input | $markdown_script\");" -- "Any decent text editor should make email-style quoting easy. For\nexample, with BBEdit, you can make a selection and choose Increase\nQuote Level from the Text menu.\n\n\n

    Lists

    \n\nMarkdown supports ordered (numbered) and unordered (bulleted) lists." -- "Unordered lists use asterisks, pluses, and hyphens -- interchangably\n-- as list markers:\n\n * Red\n * Green\n * Blue\n\nis equivalent to:\n\n + Red\n + Green\n + Blue\n\nand:\n\n - Red\n - Green\n - Blue\n\nOrdered lists use numbers followed by periods:\n\n 1. Bird\n 2. McHale\n 3. Parish" -- "It's important to note that the actual numbers you use to mark the\nlist have no effect on the HTML output Markdown produces. The HTML\nMarkdown produces from the above list is:\n\n
      \n
    1. Bird
    2. \n
    3. McHale
    4. \n
    5. Parish
    6. \n
    \n\nIf you instead wrote the list in Markdown like this:" -- "1. Bird\n 1. McHale\n 1. Parish\n\nor even:\n\n 3. Bird\n 1. McHale\n 8. Parish" +- "> This is the first level of quoting.\n >\n > > This is nested blockquote.\n >\n > Back to the first level.\n\nBlockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:\n\n\t>" +- "## This is a header.\n\t>\n\t> 1. This is the first list item.\n\t> 2. This is the second list item.\n\t>\n\t> Here's some example code:\n\t>\n\t> return shell_exec(\"echo $input | $markdown_script\");\n\nAny decent text editor should make email-style quoting easy. For\nexample, with BBEdit, you can make a selection and choose Increase\nQuote Level from the Text menu." +- "

    Lists

    \n\nMarkdown supports ordered (numbered) and unordered (bulleted) lists.\n\nUnordered lists use asterisks, pluses, and hyphens -- interchangably\n-- as list markers:\n\n * Red\n * Green\n * Blue\n\nis equivalent to:\n\n + Red\n + Green\n + Blue\n\nand:\n\n - Red\n - Green\n - Blue\n\nOrdered lists use numbers followed by periods:" +- "1. Bird\n 2. McHale\n 3. Parish\n\nIt's important to note that the actual numbers you use to mark the\nlist have no effect on the HTML output Markdown produces. The HTML\nMarkdown produces from the above list is:\n\n
      \n
    1. Bird
    2. \n
    3. McHale
    4. \n
    5. Parish
    6. \n
    " +- "If you instead wrote the list in Markdown like this:\n\n 1. Bird\n 1. McHale\n 1. Parish\n\nor even:\n\n 3. Bird\n 1. McHale\n 8. Parish" - "you'd get the exact same HTML output. The point is, if you want to,\nyou can use ordinal numbers in your ordered Markdown lists, so that\nthe numbers in your source match the numbers in your published HTML.\nBut if you want to be lazy, you don't have to." - "If you do use lazy list numbering, however, you should still start the\nlist with the number 1. At some point in the future, Markdown may support\nstarting ordered lists at an arbitrary number.\n\nList markers typically start at the left margin, but may be indented by\nup to three spaces. List markers must be followed by one or more spaces\nor a tab.\n\nTo make lists look nice, you can wrap items with hanging indents:" - "* Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit." @@ -60,18 +62,18 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "sit amet velit.\n\n 2. Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIt looks nice if you indent every line of the subsequent\nparagraphs, but here again, Markdown will allow you to be\nlazy:" - "* This is a list item with two paragraphs.\n\n This is the second paragraph in the list item. You're\n only required to indent the first line. 
Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit.\n\n * Another item in the same list.\n\nTo put a blockquote within a list item, the blockquote's `>`\ndelimiters need to be indented:" - "* A list item with a blockquote:\n\n > This is a blockquote\n > inside a list item.\n\nTo put a code block within a list item, the code block needs\nto be indented *twice* -- 8 spaces or two tabs:\n\n * A list item with a code block:\n\n \n\n\nIt's worth noting that it's possible to trigger an ordered list by\naccident, by writing something like this:" -- "1986. What a great season.\n\nIn other words, a *number-period-space* sequence at the beginning of a\nline. To avoid this, you can backslash-escape the period:\n\n 1986\\. What a great season." -- "

    Code Blocks

    \n\nPre-formatted code blocks are used for writing about programming or\nmarkup source code. Rather than forming normal paragraphs, the lines\nof a code block are interpreted literally. Markdown wraps a code block\nin both `
    ` and `` tags."
    -- "To produce a code block in Markdown, simply indent every line of the\nblock by at least 4 spaces or 1 tab. For example, given this input:\n\n    This is a normal paragraph:\n\n        This is a code block.\n\nMarkdown will generate:\n\n    

    This is a normal paragraph:

    \n\n
    This is a code block.\n    
    " -- "One level of indentation -- 4 spaces or 1 tab -- is removed from each\nline of the code block. For example, this:\n\n Here is an example of AppleScript:\n\n tell application \"Foo\"\n beep\n end tell\n\nwill turn into:\n\n

    Here is an example of AppleScript:

    \n\n
    tell application \"Foo\"\n        beep\n    end tell\n    
    " +- "1986. What a great season.\n\nIn other words, a *number-period-space* sequence at the beginning of a\nline. To avoid this, you can backslash-escape the period:\n\n 1986\\. What a great season.\n\n\n\n

    Code Blocks

    " +- "Pre-formatted code blocks are used for writing about programming or\nmarkup source code. Rather than forming normal paragraphs, the lines\nof a code block are interpreted literally. Markdown wraps a code block\nin both `
    ` and `` tags.\n\nTo produce a code block in Markdown, simply indent every line of the\nblock by at least 4 spaces or 1 tab. For example, given this input:"
    +- "This is a normal paragraph:\n\n        This is a code block.\n\nMarkdown will generate:\n\n    

    This is a normal paragraph:

    \n\n
    This is a code block.\n    
    \n\nOne level of indentation -- 4 spaces or 1 tab -- is removed from each\nline of the code block. For example, this:\n\n Here is an example of AppleScript:\n\n tell application \"Foo\"\n beep\n end tell" +- "will turn into:\n\n

    Here is an example of AppleScript:

    \n\n
    tell application \"Foo\"\n        beep\n    end tell\n    
    " - "A code block continues until it reaches a line that is not indented\n(or the end of the article)." - "Within a code block, ampersands (`&`) and angle brackets (`<` and `>`)\nare automatically converted into HTML entities. This makes it very\neasy to include example HTML source code using Markdown -- just paste\nit and indent it, and Markdown will handle the hassle of encoding the\nampersands and angle brackets. For example, this:" - "
    \n © 2004 Foo Corporation\n
    \n\nwill turn into:\n\n
    <div class=\"footer\">\n        &copy; 2004 Foo Corporation\n    </div>\n    
    " - "Regular Markdown syntax is not processed within code blocks. E.g.,\nasterisks are just literal asterisks within a code block. This means\nit's also easy to use Markdown to write about Markdown's own syntax.\n\n\n\n

    Horizontal Rules

    " - "You can produce a horizontal rule tag (`
    `) by placing three or\nmore hyphens, asterisks, or underscores on a line by themselves. If you\nwish, you may use spaces between the hyphens or asterisks. Each of the\nfollowing lines will produce a horizontal rule:" - "* * *\n\n ***\n\n *****\n\n - - -\n\n ---------------------------------------\n\n\n* * *" -- "

    Span Elements

    \n\n

    Links

    \n\nMarkdown supports two style of links: *inline* and *reference*.\n\nIn both styles, the link text is delimited by [square brackets]." -- "To create an inline link, use a set of regular parentheses immediately\nafter the link text's closing square bracket. Inside the parentheses,\nput the URL where you want the link to point, along with an *optional*\ntitle for the link, surrounded in quotes. For example:" +- "

    Span Elements

    \n\n

    Links

    " +- "Markdown supports two style of links: *inline* and *reference*.\n\nIn both styles, the link text is delimited by [square brackets].\n\nTo create an inline link, use a set of regular parentheses immediately\nafter the link text's closing square bracket. Inside the parentheses,\nput the URL where you want the link to point, along with an *optional*\ntitle for the link, surrounded in quotes. For example:" - "This is [an example](http://example.com/ \"Title\") inline link.\n\n [This link](http://example.net/) has no title attribute.\n\nWill produce:" - "

    This is \n an example inline link.

    \n\n

    This link has no\n title attribute.

    \n\nIf you're referring to a local resource on the same server, you can\nuse relative paths:" - "See my [About](/about/) page for details.\n\nReference-style links use a second set of square brackets, inside\nwhich you place a label of your choosing to identify the link:\n\n This is [an example][id] reference-style link.\n\nYou can optionally use a space to separate the sets of brackets:\n\n This is [an example] [id] reference-style link." @@ -96,14 +98,14 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "Markdown treats asterisks (`*`) and underscores (`_`) as indicators of\nemphasis. Text wrapped with one `*` or `_` will be wrapped with an\nHTML `` tag; double `*`'s or `_`'s will be wrapped with an HTML\n`` tag. E.g., this input:" - "*single asterisks*\n\n _single underscores_\n\n **double asterisks**\n\n __double underscores__\n\nwill produce:\n\n single asterisks\n\n single underscores\n\n double asterisks\n\n double underscores" - "You can use whichever style you prefer; the lone restriction is that\nthe same character must be used to open and close an emphasis span.\n\nEmphasis can be used in the middle of a word:\n\n un*frigging*believable" -- "But if you surround an `*` or `_` with spaces, it'll be treated as a\nliteral asterisk or underscore.\n\nTo produce a literal asterisk or underscore at a position where it\nwould otherwise be used as an emphasis delimiter, you can backslash\nescape it:\n\n \\*this text is surrounded by literal asterisks\\*" -- "

    Code

    \n\nTo indicate a span of code, wrap it with backtick quotes (`` ` ``).\nUnlike a pre-formatted code block, a code span indicates code within a\nnormal paragraph. For example:\n\n Use the `printf()` function.\n\nwill produce:\n\n

    Use the printf() function.

    " +- "But if you surround an `*` or `_` with spaces, it'll be treated as a\nliteral asterisk or underscore.\n\nTo produce a literal asterisk or underscore at a position where it\nwould otherwise be used as an emphasis delimiter, you can backslash\nescape it:\n\n \\*this text is surrounded by literal asterisks\\*\n\n\n\n

    Code

    " +- "To indicate a span of code, wrap it with backtick quotes (`` ` ``).\nUnlike a pre-formatted code block, a code span indicates code within a\nnormal paragraph. For example:\n\n Use the `printf()` function.\n\nwill produce:\n\n

    Use the printf() function.

    " - "To include a literal backtick character within a code span, you can use\nmultiple backticks as the opening and closing delimiters:\n\n ``There is a literal backtick (`) here.``\n\nwhich will produce this:\n\n

    There is a literal backtick (`) here.

    " - "The backtick delimiters surrounding a code span may include spaces --\none after the opening, one before the closing. This allows you to place\nliteral backtick characters at the beginning or end of a code span:\n\n\tA single backtick in a code span: `` ` ``\n\n\tA backtick-delimited string in a code span: `` `foo` ``\n\nwill produce:" - "

    A single backtick in a code span: `

    \n\n\t

    A backtick-delimited string in a code span: `foo`

    \n\nWith a code span, ampersands and angle brackets are encoded as HTML\nentities automatically, which makes it easy to include example HTML\ntags. Markdown will turn this:" - "Please don't use any `` tags.\n\ninto:\n\n

    Please don't use any <blink> tags.

    \n\nYou can write this:\n\n `—` is the decimal-encoded equivalent of `—`.\n\nto produce:" -- "

    &#8212; is the decimal-encoded\n equivalent of &mdash;.

    " -- "

    Images

    \n\nAdmittedly, it's fairly difficult to devise a \"natural\" syntax for\nplacing images into a plain text document format.\n\nMarkdown uses an image syntax that is intended to resemble the syntax\nfor links, allowing for two styles: *inline* and *reference*.\n\nInline image syntax looks like this:" +- "

    &#8212; is the decimal-encoded\n equivalent of &mdash;.

    \n\n\n\n

    Images

    " +- "Admittedly, it's fairly difficult to devise a \"natural\" syntax for\nplacing images into a plain text document format.\n\nMarkdown uses an image syntax that is intended to resemble the syntax\nfor links, allowing for two styles: *inline* and *reference*.\n\nInline image syntax looks like this:" - "![Alt text](/path/to/img.jpg)\n\n ![Alt text](/path/to/img.jpg \"Optional title\")\n\nThat is:" - "* An exclamation mark: `!`;\n* followed by a set of square brackets, containing the `alt`\n attribute text for the image;\n* followed by a set of parentheses, containing the URL or path to\n the image, and an optional `title` attribute enclosed in double\n or single quotes.\n\nReference-style image syntax looks like this:\n\n ![Alt text][id]" - "Where \"id\" is the name of a defined image reference. Image references\nare defined using syntax identical to link references:\n\n [id]: url/to/image \"Optional title attribute\"\n\nAs of this writing, Markdown has no syntax for specifying the\ndimensions of an image; if this is important to you, you can simply\nuse regular HTML `` tags.\n\n\n* * *" diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@markdown_syntax.md-3.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@markdown_syntax.md-3.snap index 841dd3f7..fa839bdc 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@markdown_syntax.md-3.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@markdown_syntax.md-3.snap @@ -3,11 +3,12 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/markdown_syntax.md --- -- "Markdown: Syntax\n================\n\n\n\n\n* [Overview](#overview)\n * [Philosophy](#philosophy)\n * [Inline HTML](#html)\n * [Automatic Escaping for Special Characters](#autoescape)\n* [Block Elements](#block)\n * [Paragraphs and Line Breaks](#p)\n * [Headers](#header)\n * [Blockquotes](#blockquote)\n * 
[Lists](#list)\n * [Code Blocks](#precode)\n * [Horizontal Rules](#hr)\n* [Span Elements](#span)\n * [Links](#link)\n * [Emphasis](#em)\n * [Code](#code)\n * [Images](#img)\n* [Miscellaneous](#misc)\n * [Backslash Escapes](#backslash)\n * [Automatic Links](#autolink)\n\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL][src].\n\n [src]: /projects/markdown/syntax.text\n\n* * *" -- "

    Overview

    \n\n

    Philosophy

    \n\nMarkdown is intended to be as easy-to-read and easy-to-write as is feasible.\n\nReadability, however, is emphasized above all else. A Markdown-formatted\ndocument should be publishable as-is, as plain text, without looking\nlike it's been marked up with tags or formatting instructions. While\nMarkdown's syntax has been influenced by several existing text-to-HTML\nfilters -- including [Setext][1], [atx][2], [Textile][3], [reStructuredText][4],\n[Grutatext][5], and [EtText][6] -- the single biggest source of\ninspiration for Markdown's syntax is the format of plain text email.\n\n [1]: http://docutils.sourceforge.net/mirror/setext.html\n [2]: http://www.aaronsw.com/2002/atx/\n [3]: https://web.archive.org/web/20021226035527/http://textism.com/tools/textile/\n [4]: http://docutils.sourceforge.net/rst.html\n [5]: http://www.triptico.com/software/grutatxt.html\n [6]: http://ettext.taint.org/doc/\n\nTo this end, Markdown's syntax is comprised entirely of punctuation\ncharacters, which punctuation characters have been carefully chosen so\nas to look like what they mean. E.g., asterisks around a word actually\nlook like \\*emphasis\\*. Markdown lists look like, well, lists. Even\nblockquotes look like quoted passages of text, assuming you've ever\nused email.\n\n\n\n

    Inline HTML

    \n\nMarkdown's syntax is intended for one purpose: to be used as a\nformat for *writing* for the web.\n\nMarkdown is not a replacement for HTML, or even close to it. Its\nsyntax is very small, corresponding only to a very small subset of\nHTML tags. The idea is *not* to create a syntax that makes it easier\nto insert HTML tags. In my opinion, HTML tags are already easy to\ninsert. The idea for Markdown is to make it easy to read, write, and\nedit prose. HTML is a *publishing* format; Markdown is a *writing*\nformat. Thus, Markdown's formatting syntax only addresses issues that\ncan be conveyed in plain text.\n\nFor any markup that is not covered by Markdown's syntax, you simply\nuse HTML itself. There's no need to preface it or delimit it to\nindicate that you're switching from Markdown to HTML; you just use\nthe tags.\n\nThe only restrictions are that block-level HTML elements -- e.g. `
    `,\n`
    `, `
    `, `

    `, etc. -- must be separated from surrounding\ncontent by blank lines, and the start and end tags of the block should\nnot be indented with tabs or spaces. Markdown is smart enough not\nto add extra (unwanted) `

    ` tags around HTML block-level tags.\n\nFor example, to add an HTML table to a Markdown article:\n\n This is a regular paragraph.\n\n

    \n \n \n \n
    Foo
    \n\n This is another regular paragraph." -- "Note that Markdown formatting syntax is not processed within block-level\nHTML tags. E.g., you can't use Markdown-style `*emphasis*` inside an\nHTML block.\n\nSpan-level HTML tags -- e.g. ``, ``, or `` -- can be\nused anywhere in a Markdown paragraph, list item, or header. If you\nwant, you can even use HTML tags instead of Markdown formatting; e.g. if\nyou'd prefer to use HTML `` or `` tags instead of Markdown's\nlink or image syntax, go right ahead.\n\nUnlike block-level HTML tags, Markdown syntax *is* processed within\nspan-level tags.\n\n\n

    Automatic Escaping for Special Characters

    \n\nIn HTML, there are two characters that demand special treatment: `<`\nand `&`. Left angle brackets are used to start tags; ampersands are\nused to denote HTML entities. If you want to use them as literal\ncharacters, you must escape them as entities, e.g. `<`, and\n`&`.\n\nAmpersands in particular are bedeviling for web writers. If you want to\nwrite about 'AT&T', you need to write '`AT&T`'. You even need to\nescape ampersands within URLs. Thus, if you want to link to:\n\n http://images.google.com/images?num=30&q=larry+bird\n\nyou need to encode the URL as:\n\n http://images.google.com/images?num=30&q=larry+bird\n\nin your anchor tag `href` attribute. Needless to say, this is easy to\nforget, and is probably the single most common source of HTML validation\nerrors in otherwise well-marked-up web sites.\n\nMarkdown allows you to use these characters naturally, taking care of\nall the necessary escaping for you. If you use an ampersand as part of\nan HTML entity, it remains unchanged; otherwise it will be translated\ninto `&`.\n\nSo, if you want to include a copyright symbol in your article, you can write:\n\n ©\n\nand Markdown will leave it alone. But if you write:\n\n AT&T\n\nMarkdown will translate it to:\n\n AT&T\n\nSimilarly, because Markdown supports [inline HTML](#html), if you use\nangle brackets as delimiters for HTML tags, Markdown will treat them as\nsuch. But if you write:\n\n 4 < 5\n\nMarkdown will translate it to:\n\n 4 < 5\n\nHowever, inside Markdown code spans and blocks, angle brackets and\nampersands are *always* encoded automatically. This makes it easy to use\nMarkdown to write about HTML code. (As opposed to raw HTML, which is a\nterrible format for writing about HTML syntax, because every single `<`\nand `&` in your example code needs to be escaped.)\n\n\n* * *" +- "Markdown: Syntax\n================\n\n
    \n\n\n* [Overview](#overview)\n * [Philosophy](#philosophy)\n * [Inline HTML](#html)\n * [Automatic Escaping for Special Characters](#autoescape)\n* [Block Elements](#block)\n * [Paragraphs and Line Breaks](#p)\n * [Headers](#header)\n * [Blockquotes](#blockquote)\n * [Lists](#list)\n * [Code Blocks](#precode)\n * [Horizontal Rules](#hr)\n* [Span Elements](#span)\n * [Links](#link)\n * [Emphasis](#em)\n * [Code](#code)\n * [Images](#img)\n* [Miscellaneous](#misc)\n * [Backslash Escapes](#backslash)\n * [Automatic Links](#autolink)\n\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL][src].\n\n [src]: /projects/markdown/syntax.text\n\n* * *\n\n

    Overview

    \n\n

    Philosophy

    \n\nMarkdown is intended to be as easy-to-read and easy-to-write as is feasible.\n\nReadability, however, is emphasized above all else. A Markdown-formatted\ndocument should be publishable as-is, as plain text, without looking\nlike it's been marked up with tags or formatting instructions. While\nMarkdown's syntax has been influenced by several existing text-to-HTML\nfilters -- including [Setext][1], [atx][2], [Textile][3], [reStructuredText][4],\n[Grutatext][5], and [EtText][6] -- the single biggest source of\ninspiration for Markdown's syntax is the format of plain text email.\n\n [1]: http://docutils.sourceforge.net/mirror/setext.html\n [2]: http://www.aaronsw.com/2002/atx/\n [3]: https://web.archive.org/web/20021226035527/http://textism.com/tools/textile/\n [4]: http://docutils.sourceforge.net/rst.html\n [5]: http://www.triptico.com/software/grutatxt.html\n [6]: http://ettext.taint.org/doc/\n\nTo this end, Markdown's syntax is comprised entirely of punctuation\ncharacters, which punctuation characters have been carefully chosen so\nas to look like what they mean. E.g., asterisks around a word actually\nlook like \\*emphasis\\*. Markdown lists look like, well, lists. Even\nblockquotes look like quoted passages of text, assuming you've ever\nused email.\n\n\n\n

    Inline HTML

    \n\nMarkdown's syntax is intended for one purpose: to be used as a\nformat for *writing* for the web.\n\nMarkdown is not a replacement for HTML, or even close to it." +- "Its\nsyntax is very small, corresponding only to a very small subset of\nHTML tags. The idea is *not* to create a syntax that makes it easier\nto insert HTML tags. In my opinion, HTML tags are already easy to\ninsert. The idea for Markdown is to make it easy to read, write, and\nedit prose. HTML is a *publishing* format; Markdown is a *writing*\nformat. Thus, Markdown's formatting syntax only addresses issues that\ncan be conveyed in plain text.\n\nFor any markup that is not covered by Markdown's syntax, you simply\nuse HTML itself. There's no need to preface it or delimit it to\nindicate that you're switching from Markdown to HTML; you just use\nthe tags.\n\nThe only restrictions are that block-level HTML elements -- e.g. `
    `,\n``, `
    `, `

    `, etc. -- must be separated from surrounding\ncontent by blank lines, and the start and end tags of the block should\nnot be indented with tabs or spaces. Markdown is smart enough not\nto add extra (unwanted) `

    ` tags around HTML block-level tags.\n\nFor example, to add an HTML table to a Markdown article:\n\n This is a regular paragraph.\n\n

    \n \n \n \n
    Foo
    \n\n This is another regular paragraph.\n\nNote that Markdown formatting syntax is not processed within block-level\nHTML tags. E.g., you can't use Markdown-style `*emphasis*` inside an\nHTML block.\n\nSpan-level HTML tags -- e.g. ``, ``, or `` -- can be\nused anywhere in a Markdown paragraph, list item, or header. If you\nwant, you can even use HTML tags instead of Markdown formatting; e.g. if\nyou'd prefer to use HTML `` or `` tags instead of Markdown's\nlink or image syntax, go right ahead.\n\nUnlike block-level HTML tags, Markdown syntax *is* processed within\nspan-level tags.\n\n\n

    Automatic Escaping for Special Characters

    \n\nIn HTML, there are two characters that demand special treatment: `<`\nand `&`. Left angle brackets are used to start tags; ampersands are\nused to denote HTML entities. If you want to use them as literal\ncharacters, you must escape them as entities, e.g. `<`, and\n`&`.\n\nAmpersands in particular are bedeviling for web writers. If you want to\nwrite about 'AT&T', you need to write '`AT&T`'. You even need to\nescape ampersands within URLs. Thus, if you want to link to:\n\n http://images.google.com/images?num=30&q=larry+bird\n\nyou need to encode the URL as:\n\n http://images.google.com/images?num=30&q=larry+bird\n\nin your anchor tag `href` attribute. Needless to say, this is easy to\nforget, and is probably the single most common source of HTML validation\nerrors in otherwise well-marked-up web sites.\n\nMarkdown allows you to use these characters naturally, taking care of\nall the necessary escaping for you. If you use an ampersand as part of\nan HTML entity, it remains unchanged; otherwise it will be translated\ninto `&`.\n\nSo, if you want to include a copyright symbol in your article, you can write:\n\n ©\n\nand Markdown will leave it alone. But if you write:\n\n AT&T\n\nMarkdown will translate it to:\n\n AT&T\n\nSimilarly, because Markdown supports [inline HTML](#html), if you use\nangle brackets as delimiters for HTML tags, Markdown will treat them as\nsuch. But if you write:\n\n 4 < 5\n\nMarkdown will translate it to:\n\n 4 < 5" +- "However, inside Markdown code spans and blocks, angle brackets and\nampersands are *always* encoded automatically. This makes it easy to use\nMarkdown to write about HTML code. (As opposed to raw HTML, which is a\nterrible format for writing about HTML syntax, because every single `<`\nand `&` in your example code needs to be escaped.)\n\n\n* * *" - "

    Block Elements

    \n\n\n

    Paragraphs and Line Breaks

    \n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like a\nblank line -- a line containing nothing but spaces or tabs is considered\nblank.) Normal paragraphs should not be indented with spaces or tabs.\n\nThe implication of the \"one or more consecutive lines of text\" rule is\nthat Markdown supports \"hard-wrapped\" text paragraphs. This differs\nsignificantly from most other text-to-HTML formatters (including Movable\nType's \"Convert Line Breaks\" option) which translate every line break\ncharacter in a paragraph into a `
    ` tag.\n\nWhen you *do* want to insert a `
    ` break tag using Markdown, you\nend a line with two or more spaces, then type return.\n\nYes, this takes a tad more effort to create a `
    `, but a simplistic\n\"every line break is a `
    `\" rule wouldn't work for Markdown.\nMarkdown's email-style [blockquoting][bq] and multi-paragraph [list items][l]\nwork best -- and look better -- when you format them with hard breaks.\n\n [bq]: #blockquote\n [l]: #list\n\n\n\n

    Headers

    \n\nMarkdown supports two styles of headers, [Setext] [1] and [atx] [2].\n\nSetext-style headers are \"underlined\" using equal signs (for first-level\nheaders) and dashes (for second-level headers). For example:\n\n This is an H1\n =============\n\n This is an H2\n -------------\n\nAny number of underlining `=`'s or `-`'s will work.\n\nAtx-style headers use 1-6 hash characters at the start of the line,\ncorresponding to header levels 1-6. For example:\n\n # This is an H1\n\n ## This is an H2\n\n ###### This is an H6\n\nOptionally, you may \"close\" atx-style headers. This is purely\ncosmetic -- you can use this if you think it looks better. The\nclosing hashes don't even need to match the number of hashes\nused to open the header. (The number of opening hashes\ndetermines the header level.) :\n\n # This is an H1 #\n\n ## This is an H2 ##\n\n ### This is an H3 ######\n\n\n

    Blockquotes

    \n\nMarkdown uses email-style `>` characters for blockquoting. If you're\nfamiliar with quoting passages of text in an email message, then you\nknow how to create a blockquote in Markdown. It looks best if you hard\nwrap the text and put a `>` before every line:\n\n > This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n > consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n > Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n >\n > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse\n > id sem consectetuer libero luctus adipiscing.\n\nMarkdown allows you to be lazy and only put the `>` before the first\nline of a hard-wrapped paragraph:\n\n > This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n\n > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse\n id sem consectetuer libero luctus adipiscing." -- "Blockquotes can be nested (i.e. a blockquote-in-a-blockquote) by\nadding additional levels of `>`:\n\n > This is the first level of quoting.\n >\n > > This is nested blockquote.\n >\n > Back to the first level.\n\nBlockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:\n\n\t> ## This is a header.\n\t>\n\t> 1. This is the first list item.\n\t> 2. This is the second list item.\n\t>\n\t> Here's some example code:\n\t>\n\t> return shell_exec(\"echo $input | $markdown_script\");\n\nAny decent text editor should make email-style quoting easy. For\nexample, with BBEdit, you can make a selection and choose Increase\nQuote Level from the Text menu.\n\n\n

    Lists

    \n\nMarkdown supports ordered (numbered) and unordered (bulleted) lists.\n\nUnordered lists use asterisks, pluses, and hyphens -- interchangably\n-- as list markers:\n\n * Red\n * Green\n * Blue\n\nis equivalent to:\n\n + Red\n + Green\n + Blue\n\nand:\n\n - Red\n - Green\n - Blue\n\nOrdered lists use numbers followed by periods:\n\n 1. Bird\n 2. McHale\n 3. Parish\n\nIt's important to note that the actual numbers you use to mark the\nlist have no effect on the HTML output Markdown produces. The HTML\nMarkdown produces from the above list is:\n\n
      \n
    1. Bird
    2. \n
    3. McHale
    4. \n
    5. Parish
    6. \n
    \n\nIf you instead wrote the list in Markdown like this:\n\n 1. Bird\n 1. McHale\n 1. Parish\n\nor even:\n\n 3. Bird\n 1. McHale\n 8. Parish\n\nyou'd get the exact same HTML output. The point is, if you want to,\nyou can use ordinal numbers in your ordered Markdown lists, so that\nthe numbers in your source match the numbers in your published HTML.\nBut if you want to be lazy, you don't have to.\n\nIf you do use lazy list numbering, however, you should still start the\nlist with the number 1. At some point in the future, Markdown may support\nstarting ordered lists at an arbitrary number.\n\nList markers typically start at the left margin, but may be indented by\nup to three spaces. List markers must be followed by one or more spaces\nor a tab.\n\nTo make lists look nice, you can wrap items with hanging indents:\n\n * Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n\nBut if you want to be lazy, you don't have to:\n\n * Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIf list items are separated by blank lines, Markdown will wrap the\nitems in `

    ` tags in the HTML output. For example, this input:\n\n * Bird\n * Magic\n\nwill turn into:\n\n

      \n
    • Bird
    • \n
    • Magic
    • \n
    \n\nBut this:\n\n * Bird\n\n * Magic\n\nwill turn into:\n\n
      \n
    • Bird

    • \n
    • Magic

    • \n
    \n\nList items may consist of multiple paragraphs. Each subsequent\nparagraph in a list item must be indented by either 4 spaces\nor one tab:" +- "Blockquotes can be nested (i.e. a blockquote-in-a-blockquote) by\nadding additional levels of `>`:\n\n > This is the first level of quoting.\n >\n > > This is nested blockquote.\n >\n > Back to the first level.\n\nBlockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:\n\n\t>" +- "## This is a header.\n\t>\n\t> 1. This is the first list item.\n\t> 2. This is the second list item.\n\t>\n\t> Here's some example code:\n\t>\n\t> return shell_exec(\"echo $input | $markdown_script\");\n\nAny decent text editor should make email-style quoting easy. For\nexample, with BBEdit, you can make a selection and choose Increase\nQuote Level from the Text menu.\n\n\n

    Lists

    \n\nMarkdown supports ordered (numbered) and unordered (bulleted) lists.\n\nUnordered lists use asterisks, pluses, and hyphens -- interchangably\n-- as list markers:\n\n * Red\n * Green\n * Blue\n\nis equivalent to:\n\n + Red\n + Green\n + Blue\n\nand:\n\n - Red\n - Green\n - Blue\n\nOrdered lists use numbers followed by periods:\n\n 1. Bird\n 2. McHale\n 3. Parish\n\nIt's important to note that the actual numbers you use to mark the\nlist have no effect on the HTML output Markdown produces. The HTML\nMarkdown produces from the above list is:\n\n
      \n
    1. Bird
    2. \n
    3. McHale
    4. \n
    5. Parish
    6. \n
    \n\nIf you instead wrote the list in Markdown like this:\n\n 1. Bird\n 1. McHale\n 1. Parish\n\nor even:\n\n 3. Bird\n 1. McHale\n 8. Parish\n\nyou'd get the exact same HTML output. The point is, if you want to,\nyou can use ordinal numbers in your ordered Markdown lists, so that\nthe numbers in your source match the numbers in your published HTML.\nBut if you want to be lazy, you don't have to.\n\nIf you do use lazy list numbering, however, you should still start the\nlist with the number 1. At some point in the future, Markdown may support\nstarting ordered lists at an arbitrary number.\n\nList markers typically start at the left margin, but may be indented by\nup to three spaces. List markers must be followed by one or more spaces\nor a tab.\n\nTo make lists look nice, you can wrap items with hanging indents:\n\n * Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n\nBut if you want to be lazy, you don't have to:\n\n * Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIf list items are separated by blank lines, Markdown will wrap the\nitems in `

    ` tags in the HTML output. For example, this input:\n\n * Bird\n * Magic\n\nwill turn into:\n\n

      \n
    • Bird
    • \n
    • Magic
    • \n
    \n\nBut this:\n\n * Bird\n\n * Magic\n\nwill turn into:\n\n
      \n
    • Bird

    • \n
    • Magic

    • \n
    \n\nList items may consist of multiple paragraphs. Each subsequent\nparagraph in a list item must be indented by either 4 spaces\nor one tab:" - "1. This is a list item with two paragraphs. Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit. Aliquam hendrerit\n mi posuere lectus.\n\n Vestibulum enim wisi, viverra nec, fringilla in, laoreet\n vitae, risus. Donec sit amet nisl. Aliquam semper ipsum\n sit amet velit.\n\n 2. Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIt looks nice if you indent every line of the subsequent\nparagraphs, but here again, Markdown will allow you to be\nlazy:\n\n * This is a list item with two paragraphs.\n\n This is the second paragraph in the list item. You're\n only required to indent the first line. Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit.\n\n * Another item in the same list.\n\nTo put a blockquote within a list item, the blockquote's `>`\ndelimiters need to be indented:\n\n * A list item with a blockquote:\n\n > This is a blockquote\n > inside a list item.\n\nTo put a code block within a list item, the code block needs\nto be indented *twice* -- 8 spaces or two tabs:\n\n * A list item with a code block:\n\n \n\n\nIt's worth noting that it's possible to trigger an ordered list by\naccident, by writing something like this:\n\n 1986. What a great season.\n\nIn other words, a *number-period-space* sequence at the beginning of a\nline. To avoid this, you can backslash-escape the period:\n\n 1986\\. What a great season.\n\n\n\n

    Code Blocks

    \n\nPre-formatted code blocks are used for writing about programming or\nmarkup source code. Rather than forming normal paragraphs, the lines\nof a code block are interpreted literally. Markdown wraps a code block\nin both `
    ` and `` tags.\n\nTo produce a code block in Markdown, simply indent every line of the\nblock by at least 4 spaces or 1 tab. For example, given this input:\n\n    This is a normal paragraph:\n\n        This is a code block.\n\nMarkdown will generate:\n\n    

    This is a normal paragraph:

    \n\n
    This is a code block.\n    
    \n\nOne level of indentation -- 4 spaces or 1 tab -- is removed from each\nline of the code block. For example, this:\n\n Here is an example of AppleScript:\n\n tell application \"Foo\"\n beep\n end tell\n\nwill turn into:\n\n

    Here is an example of AppleScript:

    \n\n
    tell application \"Foo\"\n        beep\n    end tell\n    
    \n\nA code block continues until it reaches a line that is not indented\n(or the end of the article).\n\nWithin a code block, ampersands (`&`) and angle brackets (`<` and `>`)\nare automatically converted into HTML entities. This makes it very\neasy to include example HTML source code using Markdown -- just paste\nit and indent it, and Markdown will handle the hassle of encoding the\nampersands and angle brackets. For example, this:\n\n
    \n © 2004 Foo Corporation\n
    \n\nwill turn into:\n\n
    <div class=\"footer\">\n        &copy; 2004 Foo Corporation\n    </div>\n    
    \n\nRegular Markdown syntax is not processed within code blocks. E.g.,\nasterisks are just literal asterisks within a code block. This means\nit's also easy to use Markdown to write about Markdown's own syntax.\n\n\n\n

    Horizontal Rules

    \n\nYou can produce a horizontal rule tag (`
    `) by placing three or\nmore hyphens, asterisks, or underscores on a line by themselves. If you\nwish, you may use spaces between the hyphens or asterisks. Each of the\nfollowing lines will produce a horizontal rule:" - "* * *\n\n ***\n\n *****\n\n - - -\n\n ---------------------------------------\n\n\n* * *" - "

    Span Elements

    \n\n

    Links

    \n\nMarkdown supports two style of links: *inline* and *reference*.\n\nIn both styles, the link text is delimited by [square brackets].\n\nTo create an inline link, use a set of regular parentheses immediately\nafter the link text's closing square bracket. Inside the parentheses,\nput the URL where you want the link to point, along with an *optional*\ntitle for the link, surrounded in quotes. For example:\n\n This is [an example](http://example.com/ \"Title\") inline link.\n\n [This link](http://example.net/) has no title attribute.\n\nWill produce:\n\n

    This is \n an example inline link.

    \n\n

    This link has no\n title attribute.

    \n\nIf you're referring to a local resource on the same server, you can\nuse relative paths:\n\n See my [About](/about/) page for details.\n\nReference-style links use a second set of square brackets, inside\nwhich you place a label of your choosing to identify the link:\n\n This is [an example][id] reference-style link.\n\nYou can optionally use a space to separate the sets of brackets:\n\n This is [an example] [id] reference-style link.\n\nThen, anywhere in the document, you define your link label like this,\non a line by itself:\n\n [id]: http://example.com/ \"Optional Title Here\"\n\nThat is:\n\n* Square brackets containing the link identifier (optionally\n indented from the left margin using up to three spaces);\n* followed by a colon;\n* followed by one or more spaces (or tabs);\n* followed by the URL for the link;\n* optionally followed by a title attribute for the link, enclosed\n in double or single quotes, or enclosed in parentheses.\n\nThe following three link definitions are equivalent:\n\n\t[foo]: http://example.com/ \"Optional Title Here\"\n\t[foo]: http://example.com/ 'Optional Title Here'\n\t[foo]: http://example.com/ (Optional Title Here)\n\n**Note:** There is a known bug in Markdown.pl 1.0.1 which prevents\nsingle quotes from being used to delimit link titles.\n\nThe link URL may, optionally, be surrounded by angle brackets:\n\n [id]: \"Optional Title Here\"\n\nYou can put the title attribute on the next line and use extra spaces\nor tabs for padding, which tends to look better with longer URLs:\n\n [id]: http://example.com/longish/path/to/resource/here\n \"Optional Title Here\"\n\nLink definitions are only used for creating links during Markdown\nprocessing, and are stripped from your document in the HTML output.\n\nLink definition names may consist of letters, numbers, spaces, and\npunctuation -- but they are *not* case sensitive. E.g. 
these two\nlinks:\n\n\t[link text][a]\n\t[link text][A]\n\nare equivalent.\n\nThe *implicit link name* shortcut allows you to omit the name of the\nlink, in which case the link text itself is used as the name.\nJust use an empty set of square brackets -- e.g., to link the word\n\"Google\" to the google.com web site, you could simply write:\n\n\t[Google][]\n\nAnd then define the link:\n\n\t[Google]: http://google.com/\n\nBecause link names may contain spaces, this shortcut even works for\nmultiple words in the link text:\n\n\tVisit [Daring Fireball][] for more information.\n\nAnd then define the link:\n\n\t[Daring Fireball]: http://daringfireball.net/\n\nLink definitions can be placed anywhere in your Markdown document. I\ntend to put them immediately after each paragraph in which they're\nused, but if you want, you can put them all at the end of your\ndocument, sort of like footnotes.\n\nHere's an example of reference links in action:" diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@markdown_syntax.md.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@markdown_syntax.md.snap index 83b1137a..6faecb30 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@markdown_syntax.md.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@markdown_syntax.md.snap @@ -403,9 +403,8 @@ input_file: tests/inputs/markdown/markdown_syntax.md - ".\n >\n > Back to the first level." - Blockquotes can contain other Markdown elements - ", including headers, lists," -- "and code blocks:" -- "> ## This is a header.\n\t>" -- ">" +- "and code blocks:\n\n\t>" +- "## This is a header.\n\t>\n\t>" - 1. This is the first list item. - ">" - "2. 
This is the second list item.\n\t>" diff --git a/tests/snapshots/text_splitter_snapshots__markdown@github_flavored.md-2.snap b/tests/snapshots/text_splitter_snapshots__markdown@github_flavored.md-2.snap index ea583029..3ba4b323 100644 --- a/tests/snapshots/text_splitter_snapshots__markdown@github_flavored.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__markdown@github_flavored.md-2.snap @@ -3,14 +3,12 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/github_flavored.md --- -- "# Headers\n\n" -- "```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n" +- "# Headers\n\n```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n" - "###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n" -- "------\n```\n\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### " -- "h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n\n" -- "------\n" -- "\n# Emphasis\n\n" -- "```\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\n" +- "------\n```\n\n" +- "# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\n" +- "Alternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n\n------\n\n" +- "# Emphasis\n\n```\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\n" - "Strong emphasis, aka bold, with **asterisks** or __underscores__.\n\n" - "Combined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. 
" - "~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n" @@ -19,9 +17,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "Strong emphasis, aka bold, with **asterisks** or __underscores__.\n\n" - "Combined emphasis with **asterisks and _underscores_**.\n\n" - "Strikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n" -- "*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n\n------\n" -- "\n# Lists\n\n" -- "```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. " +- "*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n\n------\n\n" +- "# Lists\n\n```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. " - "Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n" - "⋅⋅⋅You can have properly indented paragraphs within list items. " - "Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also " @@ -51,22 +48,20 @@ input_file: tests/inputs/markdown/github_flavored.md - "+ Create a list by starting a line with `+`, `-`, or `*`\n" - "+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n" - " * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n " -- "- Nulla volutpat aliquam velit\n+ Very easy!\n\n------\n" -- "\n# Task lists\n\n" -- "```\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n" +- "- Nulla volutpat aliquam velit\n+ Very easy!\n\n------\n\n" +- "# Task lists\n\n```\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n" - "- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n" - "- [x] list syntax required (any unordered or ordered list supported)\n- [x] this is a complete item\n" - "- [ ] this is an incomplete item\n```\n\n" - "- [x] Finish my 
changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n" - "- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n" - "- [x] list syntax required (any unordered or ordered list supported)\n- [ ] this is a complete item\n" -- "- [ ] this is an incomplete item\n\n------\n" -- "\n# Ignoring Markdown formatting\n\n" +- "- [ ] this is an incomplete item\n\n------\n\n" +- "# Ignoring Markdown formatting\n\n" - "You can tell GitHub to ignore (or escape) Markdown formatting by using \\ before the Markdown " - "character.\n\n```\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n```" -- "\n\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n\n------\n" -- "\n# Links\n\n" -- "```\n[I'm an inline-style link](https://www.google.com)\n\n" +- "\n\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n\n------\n\n" +- "# Links\n\n```\n[I'm an inline-style link](https://www.google.com)\n\n" - "[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n" - "[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n" - "[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n" @@ -88,9 +83,8 @@ input_file: tests/inputs/markdown/github_flavored.md - " and sometimes\nexample.com (but not on Github, for example).\n\n" - "Some text to show that the reference links can follow later.\n" - "\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n" -- "[link text itself]: http://www.reddit.com\n\n------\n" -- "\n# Images\n\n" -- "```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![" +- "[link text itself]: http://www.reddit.com\n\n------\n\n" +- "# Images\n\n```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![" - "alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo " - "Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n" - "[logo]: 
https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title " @@ -110,17 +104,16 @@ input_file: tests/inputs/markdown/github_flavored.md - "Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\n" - "Like links, Images also have a footnote style syntax\n\n![Alt text][id]\n\n" - "With a reference later in the document defining the URL location:\n" -- "\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n\n------\n" -- "\n# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n" -- "```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\n" -- "Inline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n" -- "[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n```\n" -- "\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\n" -- "Inline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n" -- "[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n\n" -- "------\n" -- "\n# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```" -- "\n\nInline `code` has `back-ticks around` it.\n\n" +- "\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n\n------\n\n" +- "# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n```\nFootnote 1 link[^first].\n\n" +- "Footnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\n" +- "Duplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n" +- " and multiple paragraphs.\n\n[^second]: Footnote text.\n```\n\nFootnote 1 link[^first].\n\n" +- "Footnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\n" +- "Duplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n " +- "and multiple 
paragraphs.\n\n[^second]: Footnote text.\n\n------\n\n" +- "# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\n" +- "Inline `code` has `back-ticks around` it.\n\n" - "```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n" - " [Obsolete(\"...\")]\n class Program : IInterface\n {\n" - " public static List JustDoIt(int count)\n {\n" @@ -146,9 +139,8 @@ input_file: tests/inputs/markdown/github_flavored.md - " $this->var = 0 - self::$st;\n" - " $this->list = list(Array(\"1\"=> 2, 2=>self::ME, 3 => \\Location\\Web\\URI::class));\n\n" - " return [\n 'uri' => $uri,\n 'value' => null,\n ];\n }\n}\n\n" -- "echo URI::ME . URI::$st1;\n\n__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------\n" -- "\n# Tables\n\n" -- "```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n" +- "echo URI::ME . URI::$st1;\n\n__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------\n\n" +- "# Tables\n\n```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n" - "| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n" - "| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\n" - "There must be at least 3 dashes separating each header cell.\n" @@ -183,9 +175,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "| git status | git status | git status |\n| git diff | git diff | git diff |\n" - "\n" - "| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n" -- "\n------\n" -- "\n# Blockquotes\n\n" -- "```\n> Blockquotes are very handy in email to emulate reply text.\n" +- "\n------\n\n" +- "# Blockquotes\n\n```\n> Blockquotes are very handy in email to emulate reply text.\n" - "> This line is part of the same quote.\n\nQuote break.\n\n" - "> This is a very long line that will still be quoted properly when it wraps. 
" - "Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. " @@ -201,17 +192,15 @@ input_file: tests/inputs/markdown/github_flavored.md - "> Blockquotes can also be nested...\n" - ">" - "> ...by using additional greater-than signs right next to each other...\n" -- "> > > ...or with spaces between arrows.\n\n------\n" -- "\n# Inline HTML\n\n" -- "```\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n" +- "> > > ...or with spaces between arrows.\n\n------\n\n" +- "# Inline HTML\n\n```\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n" - "
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n" - "```\n\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n " - "
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n\n" -- "------\n" -- "\n# Horizontal Rules\n\n```\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n```" -- "\n\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n\n------\n" -- "\n# YouTube Videos\n\n" -- "```\n\n" - "\"IMAGE\n\n```\n\n" diff --git a/tests/snapshots/text_splitter_snapshots__markdown@github_flavored.md-3.snap b/tests/snapshots/text_splitter_snapshots__markdown@github_flavored.md-3.snap index 6b819573..0a5192af 100644 --- a/tests/snapshots/text_splitter_snapshots__markdown@github_flavored.md-3.snap +++ b/tests/snapshots/text_splitter_snapshots__markdown@github_flavored.md-3.snap @@ -3,27 +3,25 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/github_flavored.md --- -- "# Headers\n\n```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n```\n\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n\n------\n" -- "\n# Emphasis\n\n```\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n```\n\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. 
~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n\n------\n" -- "\n# Lists\n\n" -- "```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n+ Create a list by starting a line with `+`, `-`, or `*`\n" +- "# Headers\n\n```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n```\n\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n\n------\n\n" +- "# Emphasis\n\n```\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. 
~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n```\n\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n\n------\n\n" +- "# Lists\n\n```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n+ Create a list by starting a line with `+`, `-`, or `*`\n" - "+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n```\n\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. 
Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n" -- "1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n\n------\n" -- "\n# Task lists\n\n```\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [x] this is a complete item\n- [ ] this is an incomplete item\n```\n\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [ ] this is a complete item\n- [ ] this is an incomplete item\n\n------\n\n# Ignoring Markdown formatting\n\nYou can tell GitHub to ignore (or escape) Markdown formatting by using \\ before the Markdown character.\n\n```\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n```\n\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n\n------\n" -- "\n# Links\n\n```\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link 
with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n```" -- "\n\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n\n------\n" -- "\n# Images\n\n```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The 
Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n```" -- "\n\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n\n------\n" -- "\n# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n```\n\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n\n------\n" -- "\n# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\nInline `code` has `back-ticks around` it.\n\n```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n [Obsolete(\"...\")]\n class Program : IInterface\n {\n public static List JustDoIt(int count)\n {\n Console.WriteLine($\"Hello {Name}!\");\n return new List(new int[] { 1, 2, 3 })\n 
}\n }\n}\n```\n\n```css\n@font-face {\n font-family: Chunkfive; src: url('Chunkfive.otf');\n}\n\nbody, .usertext {\n color: #F0F0F0; background: #600;\n font-family: Chunkfive, sans;\n}\n\n@import url(print.css);\n@media print {\n a[href^=http]::after {\n content: attr(href)\n }\n}\n```\n\n" -- "```javascript\nfunction $initHighlight(block, cls) {\n try {\n if (cls.search(/\\bno\\-highlight\\b/) != -1)\n return process(block, true, 0x0F) +\n ` class=\"${cls}\"`;\n } catch (e) {\n /* handle exception */\n }\n for (var i = 0 / 2; i < classes.length; i++) {\n if (checkCondition(classes[i]) === undefined)\n console.log('undefined');\n }\n}\n\nexport $initHighlight;\n```\n\n" +- "1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n\n------\n\n" +- "# Task lists\n\n```\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [x] this is a complete item\n- [ ] this is an incomplete item\n```\n\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [ ] this is a complete item\n- [ ] this is an incomplete item\n\n------\n\n# Ignoring Markdown formatting\n\nYou can tell GitHub to ignore (or escape) Markdown formatting by using \\ before the Markdown character.\n\n```\nLet's rename 
\\*our-new-project\\* to \\*our-old-project\\*.\n```\n\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n\n------\n\n" +- "# Links\n\n```\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n```\n\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n" +- "[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n\n------\n\n" +- "# Images\n\n```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: 
https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n```\n\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n" +- "[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n\n------\n\n" +- "# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n```\n\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n\n------\n\n" +- "# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\nInline `code` has `back-ticks around` it.\n\n```c#\nusing System.IO.Compression;\n\n#pragma 
warning disable 414, 3021\n\nnamespace MyApplication\n{\n [Obsolete(\"...\")]\n class Program : IInterface\n {\n public static List JustDoIt(int count)\n {\n Console.WriteLine($\"Hello {Name}!\");\n return new List(new int[] { 1, 2, 3 })\n }\n }\n}\n```\n\n```css\n@font-face {\n font-family: Chunkfive; src: url('Chunkfive.otf');\n}\n\nbody, .usertext {\n color: #F0F0F0; background: #600;\n font-family: Chunkfive, sans;\n}\n\n@import url(print.css);\n@media print {\n a[href^=http]::after {\n content: attr(href)\n }\n}\n```\n\n```javascript\nfunction $initHighlight(block, cls) {\n try {\n if (cls.search(/\\bno\\-highlight\\b/) != -1)\n return process(block, true, 0x0F) +\n ` class=\"${cls}\"`;\n } catch (e) {\n /* handle exception */\n }\n" +- " for (var i = 0 / 2; i < classes.length; i++) {\n if (checkCondition(classes[i]) === undefined)\n console.log('undefined');\n }\n}\n\nexport $initHighlight;\n```\n\n" - "```php\nrequire_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory\n{\n abstract function test();\n\n public static $st1 = 1;\n const ME = \"Yo\";\n var $list = NULL;\n private $var;\n\n /**\n * Returns a URI\n *\n * @return URI\n */\n static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? $uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check\n if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }\n\n $this->var = 0 - self::$st;\n $this->list = list(Array(\"1\"=> 2, 2=>self::ME, 3 => \\Location\\Web\\URI::class));\n\n return [\n 'uri' => $uri,\n 'value' => null,\n ];\n }\n}\n\necho URI::ME . 
URI::$st1;\n\n" -- "__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------\n" -- "\n# Tables\n\n" -- "```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n" +- "__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------\n\n" +- "# Tables\n\n```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. 
You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n" - "| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n```\n\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. 
You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n" -- "| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------\n" -- "\n# Blockquotes\n\n```\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n```\n\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n\n------\n" -- "\n# Inline HTML\n\n```\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n```\n\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n\n------\n\n# Horizontal Rules\n\n```\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n```\n\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n\n------\n" -- "\n# YouTube Videos\n\n```\n\n\"IMAGE\n\n```\n\n\n\"IMAGE\n\n\n```\n[![IMAGE ALT TEXT HERE](http://img.youtube.com/vi/YOUTUBE_VIDEO_ID_HERE/0.jpg)](http://www.youtube.com/watch?v=YOUTUBE_VIDEO_ID_HERE)\n```\n\n[![IMAGE ALT TEXT HERE](https://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/YouTube_logo_2015.svg/1200px-YouTube_logo_2015.svg.png)](https://www.youtube.com/watch?v=ciawICBvQoE)\n" +- "| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------\n\n" +- "# Blockquotes\n\n```\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n```\n\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. 
Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n\n------\n\n" +- "# Inline HTML\n\n```\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n```\n\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n\n------\n\n# Horizontal Rules\n\n```\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n```\n\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n\n------\n\n" +- "# YouTube Videos\n\n```\n\n\"IMAGE\n\n```\n\n\n\"IMAGE\n\n\n```\n[![IMAGE ALT TEXT HERE](http://img.youtube.com/vi/YOUTUBE_VIDEO_ID_HERE/0.jpg)](http://www.youtube.com/watch?v=YOUTUBE_VIDEO_ID_HERE)\n```\n\n[![IMAGE ALT TEXT HERE](https://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/YouTube_logo_2015.svg/1200px-YouTube_logo_2015.svg.png)](https://www.youtube.com/watch?v=ciawICBvQoE)\n" diff --git a/tests/snapshots/text_splitter_snapshots__markdown@github_flavored.md.snap b/tests/snapshots/text_splitter_snapshots__markdown@github_flavored.md.snap index d0277250..6410722e 100644 --- a/tests/snapshots/text_splitter_snapshots__markdown@github_flavored.md.snap +++ b/tests/snapshots/text_splitter_snapshots__markdown@github_flavored.md.snap @@ -3,8 +3,8 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/github_flavored.md --- -- "# Headers" -- "\n\n" +- "# Headers\n" +- "\n" - "```\n" - "# h1 " - Heading 8- @@ -29,32 +29,34 @@ input_file: tests/inputs/markdown/github_flavored.md - "======\n\n" - "Alt-H2\n" - "------\n" -- "```\n\n# " -- h1 Heading -- " 8-)\n## " -- h2 Heading -- "\n### " -- h3 Heading -- "\n#### " -- h4 Heading -- "\n##### " -- h5 Heading -- "\n###### " -- h6 Heading -- "\n\n" +- "```\n\n" +- "# h1 " +- Heading 8- +- ")\n" +- "## h2 " +- "Heading\n" +- "### h3 " +- "Heading\n" +- "#### h4 " +- "Heading\n" +- "##### h5 " +- "Heading\n" +- "###### h6 " +- "Heading\n\n" - Alternativ - "ely, for " - "H1 and H2," - " an " - underline- - "ish style:" -- "\n\nAlt-H1" -- "\n======\n\n" +- "\n\n" +- "Alt-H1\n" +- "======\n\n" - Alt-H2 - "\n------\n\n" -- "------\n" -- "\n# " -- "Emphasis\n\n" +- "------\n\n" +- "# Emphasis" +- "\n\n" - "```\n" - "Emphasis, " - "aka " @@ -145,8 +147,8 @@ 
input_file: tests/inputs/markdown/github_flavored.md - ~~ - Strikethro - "ugh~~\n\n" -- "------\n" -- "\n# Lists\n\n" +- "------\n\n" +- "# Lists\n\n" - "```\n1. " - "First " - "ordered " @@ -460,10 +462,9 @@ input_file: tests/inputs/markdown/github_flavored.md - "velit\n" - "+ Very " - "easy!\n\n" -- "------\n" -- "\n# " -- Task lists -- "\n\n" +- "------\n\n" +- "# Task " +- "lists\n\n" - "```\n" - "- [x] " - "Finish my " @@ -540,10 +541,9 @@ input_file: tests/inputs/markdown/github_flavored.md - " is an " - incomplete - " item\n\n" -- "------\n" -- "\n# " -- "Ignoring " -- "Markdown " +- "------\n\n" +- "# Ignoring" +- " Markdown " - formatting - "\n\n" - "You can " @@ -575,8 +575,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "to \\*our-" - old- - "project\\*." -- "\n\n------\n" -- "\n# Links\n\n" +- "\n\n------\n\n" +- "# Links\n\n" - "```\n" - "[I'm an " - inline- @@ -778,9 +778,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "http://" - www.reddit - ".com\n\n" -- "------\n" -- "\n# Images" -- "\n\n" +- "------\n\n" +- "# Images\n\n" - "```\n" - "Here's our" - " logo (" @@ -950,10 +949,10 @@ input_file: tests/inputs/markdown/github_flavored.md - dojocat.jp - "g \"The " - "Dojocat\"\n\n" -- "------\n" -- "\n# " -- "[Footnotes" -- "](https://" +- "------\n\n" +- "# [" +- "Footnotes]" +- "(https://" - github.com - /markdown- - it/ @@ -1021,10 +1020,9 @@ input_file: tests/inputs/markdown/github_flavored.md - "[^second]:" - " Footnote " - "text.\n\n" -- "------\n" -- "\n# " -- "Code and " -- "Syntax " +- "------\n\n" +- "# Code and" +- " Syntax " - Highlighti - "ng\n\n" - "```\n" @@ -1299,9 +1297,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "/\n" - "datahere\n" - "```\n\n" -- "------\n" -- "\n# Tables" -- "\n\n" +- "------\n\n" +- "# Tables\n\n" - "```\n" - Colons can - " be used " @@ -1627,8 +1624,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "| Pipe" - " | \\|" - " |\n" -- "\n------\n" -- "\n# " +- 
"\n------\n\n" +- "# " - Blockquote - "s\n\n" - "```\n" @@ -1750,9 +1747,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "spaces " - "between " - "arrows.\n\n" -- "------\n" -- "\n# " -- "Inline " +- "------\n\n" +- "# Inline " - "HTML\n\n" - "```\n
    \n" - "
    " @@ -1777,7 +1773,8 @@ input_file: tests/inputs/markdown/github_flavored.md - em>tags.\n" - "
    \n```\n" -- "\n
    \n" +- "\n" +- "
    \n" - "
    " - Definition - " list
    " @@ -1800,8 +1797,8 @@ input_file: tests/inputs/markdown/github_flavored.md - em>tags.\n" - "
    \n\n" -- "------\n" -- "\n# " +- "------\n\n" +- "# " - Horizontal - " Rules\n\n" - "```\n" @@ -1825,8 +1822,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "\n" - Underscore - "s\n\n------\n" -- "\n# " -- "YouTube " +- "\n" +- "# YouTube " - "Videos\n\n" - "```\n" - "Syntax
  • \n" - "
  • License\n
  • Dingus
  • \n" -- "\n" -- "\n\nGetting the Gist of Markdown's Formatting Syntax" -- "\n------------------------------------------------\n\n" +- "\n\n\n" +- "Getting the Gist of Markdown's Formatting Syntax\n------------------------------------------------\n\n" - "This page offers a brief overview of what it's like to use Markdown.\n" - "The [syntax page] [s]" - " provides complete, detailed documentation for\n" @@ -25,8 +24,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - "can [see the source for it by adding '.text' to the URL] [src]" - "." - "\n\n [s]: /projects/markdown/syntax \"Markdown Syntax\"\n" -- " [d]: /projects/markdown/dingus \"Markdown Dingus\"\n [src]: /projects/markdown/basics.text\n\n\n## " -- "Paragraphs, Headers, Blockquotes ##\n\n" +- " [d]: /projects/markdown/dingus \"Markdown Dingus\"\n [src]: /projects/markdown/basics.text\n\n\n" +- "## Paragraphs, Headers, Blockquotes ##\n\n" - "A paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. " - "(A blank line is any line that looks like\na blank line --" - " a line containing nothing but spaces or tabs is\n" @@ -46,15 +45,17 @@ input_file: tests/inputs/markdown/markdown_basics.md - "regular paragraph.

    \n\n

    The quick brown fox jumped over the lazy\n dog's back.

    \n\n " - "

    Header 3

    \n\n
    \n

    This is a blockquote.

    \n\n " - "

    This is the second paragraph in the blockquote.

    \n\n " -- "

    This is an H2 in a blockquote

    \n
    \n\n\n\n### Phrase Emphasis ###\n\n" -- "Markdown uses asterisks and underscores to indicate spans of emphasis.\n\nMarkdown:\n\n " +- "

    This is an H2 in a blockquote

    \n \n\n\n\n" +- "### Phrase Emphasis ###\n\nMarkdown uses asterisks and underscores to indicate spans of emphasis.\n\n" +- "Markdown:\n\n " - "Some of these words *are emphasized*.\n Some of these words _are emphasized also_.\n\n" - " Use two asterisks for **strong emphasis**.\n " - "Or, if you prefer, __use two underscores instead__.\n\nOutput:\n\n " - "

    Some of these words are emphasized.\n" - " Some of these words are emphasized also.

    \n\n " - "

    Use two asterisks for strong emphasis.\n " -- "Or, if you prefer, use two underscores instead.

    \n\n\n\n## Lists ##\n\n" +- "Or, if you prefer, use two underscores instead.

    \n\n\n\n" +- "## Lists ##\n\n" - "Unordered (bulleted) lists use asterisks, pluses, and hyphens (`*`,\n`+`, and `-`) as list markers. " - "These three markers are\ninterchangable; this:\n\n * Candy.\n * Gum.\n * Booze.\n" - "\nthis:\n\n + Candy.\n + Gum.\n + Booze.\n\nand this:\n\n " @@ -69,7 +70,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - "* A list item.\n\n With multiple paragraphs.\n\n * Another item in the list.\n" - "\nOutput:\n\n " - "
      \n
    • A list item.

      \n

      With multiple paragraphs.

    • \n" -- "
    • Another item in the list.

    • \n
    \n\n\n\n### Links ###\n\n" +- "
  • Another item in the list.

  • \n \n\n\n\n" +- "### Links ###\n\n" - "Markdown supports two styles for creating links: *inline* and\n*reference*. " - "With both styles, you use square brackets to delimit the\ntext you want to turn into a link.\n\n" - "Inline-style links use parentheses immediately after the link text.\nFor example:\n\n " @@ -93,13 +95,13 @@ input_file: tests/inputs/markdown/markdown_basics.md - "I start my morning with a cup of coffee and\n [The New York Times][NY Times].\n\n" - " [ny times]: http://www.nytimes.com/\n\nOutput:\n\n " - "

    I start my morning with a cup of coffee and\n" -- " The New York Times.

    \n\n\n### Images ###\n\n" -- "Image syntax is very much like link syntax.\n\nInline (titles are optional):\n\n " +- " The New York Times.

    \n\n\n" +- "### Images ###\n\nImage syntax is very much like link syntax.\n\nInline (titles are optional):\n\n " - "![alt text](/path/to/img.jpg \"Title\")\n\nReference-style:\n\n " - "![alt text][id]\n\n [id]: /path/to/img.jpg \"Title\"\n" - "\nBoth of the above examples produce the same output:\n\n " -- "\"alt\n" -- "\n\n\n### Code ###\n\n" +- "\"alt\n\n\n\n" +- "### Code ###\n\n" - "In a regular paragraph, you can create code span by wrapping text in\nbacktick quotes. " - "Any ampersands (`&`) and angle brackets (`<` or\n`>`" - ") will automatically be translated into HTML entities. This makes\n" diff --git a/tests/snapshots/text_splitter_snapshots__markdown@markdown_basics.md-3.snap b/tests/snapshots/text_splitter_snapshots__markdown@markdown_basics.md-3.snap index 6ef4ca56..3b1e6ea6 100644 --- a/tests/snapshots/text_splitter_snapshots__markdown@markdown_basics.md-3.snap +++ b/tests/snapshots/text_splitter_snapshots__markdown@markdown_basics.md-3.snap @@ -4,12 +4,14 @@ expression: chunks input_file: tests/inputs/markdown/markdown_basics.md --- - "Markdown: Basics\n================\n\n\n\n\nGetting the Gist of Markdown's Formatting Syntax\n------------------------------------------------\n\nThis page offers a brief overview of what it's like to use Markdown.\nThe [syntax page] [s] provides complete, detailed documentation for\nevery feature, but Markdown should be very easy to pick up simply by\nlooking at a few examples of it in action. 
The examples on this page\nare written in a before/after style, showing example syntax and the\nHTML output produced by Markdown.\n\n" -- "It's also helpful to simply try Markdown out; the [Dingus] [d] is a\nweb application that allows you type your own Markdown-formatted text\nand translate it to XHTML.\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL] [src].\n\n [s]: /projects/markdown/syntax \"Markdown Syntax\"\n [d]: /projects/markdown/dingus \"Markdown Dingus\"\n [src]: /projects/markdown/basics.text\n\n\n## Paragraphs, Headers, Blockquotes ##\n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like\na blank line -- a line containing nothing but spaces or tabs is\nconsidered blank.) Normal paragraphs should not be indented with\nspaces or tabs.\n\n" -- "Markdown offers two styles of headers: *Setext* and *atx*.\nSetext-style headers for `

    ` and `

    ` are created by\n\"underlining\" with equal signs (`=`) and hyphens (`-`), respectively.\nTo create an atx-style header, you put 1-6 hash marks (`#`) at the\nbeginning of the line -- the number of hashes equals the resulting\nHTML header level.\n\nBlockquotes are indicated using email-style '`>`' angle brackets.\n\nMarkdown:\n\n A First Level Header\n ====================\n\n A Second Level Header\n ---------------------\n\n Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.\n\n The quick brown fox jumped over the lazy\n dog's back.\n\n ### Header 3\n\n > This is a blockquote.\n >\n > This is the second paragraph in the blockquote.\n >\n > ## This is an H2 in a blockquote\n\n\nOutput:\n\n " -- "

    A First Level Header

    \n\n

    A Second Level Header

    \n\n

    Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.

    \n\n

    The quick brown fox jumped over the lazy\n dog's back.

    \n\n

    Header 3

    \n\n
    \n

    This is a blockquote.

    \n\n

    This is the second paragraph in the blockquote.

    \n\n

    This is an H2 in a blockquote

    \n
    \n\n\n\n### Phrase Emphasis ###\n\nMarkdown uses asterisks and underscores to indicate spans of emphasis.\n\nMarkdown:\n\n Some of these words *are emphasized*.\n Some of these words _are emphasized also_.\n\n Use two asterisks for **strong emphasis**.\n Or, if you prefer, __use two underscores instead__.\n\nOutput:\n\n " -- "

    Some of these words are emphasized.\n Some of these words are emphasized also.

    \n\n

    Use two asterisks for strong emphasis.\n Or, if you prefer, use two underscores instead.

    \n\n\n\n## Lists ##\n\nUnordered (bulleted) lists use asterisks, pluses, and hyphens (`*`,\n`+`, and `-`) as list markers. These three markers are\ninterchangable; this:\n\n * Candy.\n * Gum.\n * Booze.\n\nthis:\n\n + Candy.\n + Gum.\n + Booze.\n\nand this:\n\n - Candy.\n - Gum.\n - Booze.\n\nall produce the same output:\n\n
      \n
    • Candy.
    • \n
    • Gum.
    • \n
    • Booze.
    • \n
    \n\nOrdered (numbered) lists use regular numbers, followed by periods, as\nlist markers:\n\n 1. Red\n 2. Green\n 3. Blue\n\nOutput:\n\n
      \n
    1. Red
    2. \n
    3. Green
    4. \n
    5. Blue
    6. \n
    \n" -- "\nIf you put blank lines between items, you'll get `

    ` tags for the\nlist item text. You can create multi-paragraph list items by indenting\nthe paragraphs by 4 spaces or 1 tab:\n\n * A list item.\n\n With multiple paragraphs.\n\n * Another item in the list.\n\nOutput:\n\n

      \n
    • A list item.

      \n

      With multiple paragraphs.

    • \n
    • Another item in the list.

    • \n
    \n\n\n\n### Links ###\n\nMarkdown supports two styles for creating links: *inline* and\n*reference*. With both styles, you use square brackets to delimit the\ntext you want to turn into a link.\n\nInline-style links use parentheses immediately after the link text.\nFor example:\n\n This is an [example link](http://example.com/).\n\nOutput:\n\n

    This is an \n example link.

    \n\nOptionally, you may include a title attribute in the parentheses:\n\n This is an [example link](http://example.com/ \"With a Title\").\n\nOutput:\n\n " -- "

    This is an \n example link.

    \n\nReference-style links allow you to refer to your links by names, which\nyou define elsewhere in your document:\n\n I get 10 times more traffic from [Google][1] than from\n [Yahoo][2] or [MSN][3].\n\n [1]: http://google.com/ \"Google\"\n [2]: http://search.yahoo.com/ \"Yahoo Search\"\n [3]: http://search.msn.com/ \"MSN Search\"\n\nOutput:\n\n

    I get 10 times more traffic from Google than from Yahoo or MSN.

    \n\nThe title attribute is optional. Link names may contain letters,\nnumbers and spaces, but are *not* case sensitive:\n\n I start my morning with a cup of coffee and\n [The New York Times][NY Times].\n\n [ny times]: http://www.nytimes.com/\n\nOutput:\n\n " -- "

    I start my morning with a cup of coffee and\n The New York Times.

    \n\n\n### Images ###\n\nImage syntax is very much like link syntax.\n\nInline (titles are optional):\n\n ![alt text](/path/to/img.jpg \"Title\")\n\nReference-style:\n\n ![alt text][id]\n\n [id]: /path/to/img.jpg \"Title\"\n\nBoth of the above examples produce the same output:\n\n \"alt\n\n\n\n### Code ###\n\nIn a regular paragraph, you can create code span by wrapping text in\nbacktick quotes. Any ampersands (`&`) and angle brackets (`<` or\n`>`) will automatically be translated into HTML entities. This makes\nit easy to use Markdown to write about HTML example code:\n\n I strongly recommend against using any `` tags.\n\n I wish SmartyPants used named entities like `—`\n instead of decimal-encoded entities like `—`.\n\nOutput:\n\n " -- "

    I strongly recommend against using any\n <blink> tags.

    \n\n

    I wish SmartyPants used named entities like\n &mdash; instead of decimal-encoded\n entities like &#8212;.

    \n\n\nTo specify an entire block of pre-formatted code, indent every line of\nthe block by 4 spaces or 1 tab. Just like with code spans, `&`, `<`,\nand `>` characters will be escaped automatically.\n\nMarkdown:\n\n If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:\n\n
    \n

    For example.

    \n
    \n\nOutput:\n\n

    If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:

    \n\n
    <blockquote>\n        <p>For example.</p>\n    </blockquote>\n    
    \n" +- "It's also helpful to simply try Markdown out; the [Dingus] [d] is a\nweb application that allows you type your own Markdown-formatted text\nand translate it to XHTML.\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL] [src].\n\n [s]: /projects/markdown/syntax \"Markdown Syntax\"\n [d]: /projects/markdown/dingus \"Markdown Dingus\"\n [src]: /projects/markdown/basics.text\n\n\n" +- "## Paragraphs, Headers, Blockquotes ##\n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like\na blank line -- a line containing nothing but spaces or tabs is\nconsidered blank.) Normal paragraphs should not be indented with\nspaces or tabs.\n\nMarkdown offers two styles of headers: *Setext* and *atx*.\nSetext-style headers for `

    ` and `

    ` are created by\n\"underlining\" with equal signs (`=`) and hyphens (`-`), respectively.\nTo create an atx-style header, you put 1-6 hash marks (`#`) at the\nbeginning of the line -- the number of hashes equals the resulting\nHTML header level.\n\nBlockquotes are indicated using email-style '`>`' angle brackets.\n\nMarkdown:\n\n " +- "A First Level Header\n ====================\n\n A Second Level Header\n ---------------------\n\n Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.\n\n The quick brown fox jumped over the lazy\n dog's back.\n\n ### Header 3\n\n > This is a blockquote.\n >\n > This is the second paragraph in the blockquote.\n >\n > ## This is an H2 in a blockquote\n\n\nOutput:\n\n

    A First Level Header

    \n\n

    A Second Level Header

    \n\n

    Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.

    \n\n

    The quick brown fox jumped over the lazy\n dog's back.

    \n\n

    Header 3

    \n\n
    \n

    This is a blockquote.

    \n\n

    This is the second paragraph in the blockquote.

    \n\n

    This is an H2 in a blockquote

    \n
    \n\n\n\n" +- "### Phrase Emphasis ###\n\nMarkdown uses asterisks and underscores to indicate spans of emphasis.\n\nMarkdown:\n\n Some of these words *are emphasized*.\n Some of these words _are emphasized also_.\n\n Use two asterisks for **strong emphasis**.\n Or, if you prefer, __use two underscores instead__.\n\nOutput:\n\n

    Some of these words are emphasized.\n Some of these words are emphasized also.

    \n\n

    Use two asterisks for strong emphasis.\n Or, if you prefer, use two underscores instead.

    \n\n\n\n" +- "## Lists ##\n\nUnordered (bulleted) lists use asterisks, pluses, and hyphens (`*`,\n`+`, and `-`) as list markers. These three markers are\ninterchangable; this:\n\n * Candy.\n * Gum.\n * Booze.\n\nthis:\n\n + Candy.\n + Gum.\n + Booze.\n\nand this:\n\n - Candy.\n - Gum.\n - Booze.\n\nall produce the same output:\n\n
      \n
    • Candy.
    • \n
    • Gum.
    • \n
    • Booze.
    • \n
    \n\nOrdered (numbered) lists use regular numbers, followed by periods, as\nlist markers:\n\n 1. Red\n 2. Green\n 3. Blue\n\nOutput:\n\n
      \n
    1. Red
    2. \n
    3. Green
    4. \n
    5. Blue
    6. \n
    \n\nIf you put blank lines between items, you'll get `

    ` tags for the\nlist item text. You can create multi-paragraph list items by indenting\nthe paragraphs by 4 spaces or 1 tab:\n\n * A list item.\n\n With multiple paragraphs.\n\n * Another item in the list.\n\nOutput:\n\n " +- "

      \n
    • A list item.

      \n

      With multiple paragraphs.

    • \n
    • Another item in the list.

    • \n
    \n\n\n\n" +- "### Links ###\n\nMarkdown supports two styles for creating links: *inline* and\n*reference*. With both styles, you use square brackets to delimit the\ntext you want to turn into a link.\n\nInline-style links use parentheses immediately after the link text.\nFor example:\n\n This is an [example link](http://example.com/).\n\nOutput:\n\n

    This is an \n example link.

    \n\nOptionally, you may include a title attribute in the parentheses:\n\n This is an [example link](http://example.com/ \"With a Title\").\n\nOutput:\n\n

    This is an \n example link.

    \n\nReference-style links allow you to refer to your links by names, which\nyou define elsewhere in your document:\n\n I get 10 times more traffic from [Google][1] than from\n [Yahoo][2] or [MSN][3].\n\n [1]: http://google.com/ \"Google\"\n [2]: http://search.yahoo.com/ \"Yahoo Search\"\n [3]: http://search.msn.com/ \"MSN Search\"\n\nOutput:\n\n " +- "

    I get 10 times more traffic from Google than from Yahoo or MSN.

    \n\nThe title attribute is optional. Link names may contain letters,\nnumbers and spaces, but are *not* case sensitive:\n\n I start my morning with a cup of coffee and\n [The New York Times][NY Times].\n\n [ny times]: http://www.nytimes.com/\n\nOutput:\n\n

    I start my morning with a cup of coffee and\n The New York Times.

    \n\n\n### Images ###\n\nImage syntax is very much like link syntax.\n\nInline (titles are optional):\n\n ![alt text](/path/to/img.jpg \"Title\")\n\nReference-style:\n\n ![alt text][id]\n\n [id]: /path/to/img.jpg \"Title\"\n\nBoth of the above examples produce the same output:\n\n \"alt\n\n\n\n" +- "### Code ###\n\nIn a regular paragraph, you can create code span by wrapping text in\nbacktick quotes. Any ampersands (`&`) and angle brackets (`<` or\n`>`) will automatically be translated into HTML entities. This makes\nit easy to use Markdown to write about HTML example code:\n\n I strongly recommend against using any `` tags.\n\n I wish SmartyPants used named entities like `—`\n instead of decimal-encoded entities like `—`.\n\nOutput:\n\n

    I strongly recommend against using any\n <blink> tags.

    \n\n

    I wish SmartyPants used named entities like\n &mdash; instead of decimal-encoded\n entities like &#8212;.

    \n\n\nTo specify an entire block of pre-formatted code, indent every line of\nthe block by 4 spaces or 1 tab. Just like with code spans, `&`, `<`,\nand `>` characters will be escaped automatically.\n\nMarkdown:\n\n " +- "If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:\n\n
    \n

    For example.

    \n
    \n\nOutput:\n\n

    If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:

    \n\n
    <blockquote>\n        <p>For example.</p>\n    </blockquote>\n    
    \n" diff --git a/tests/snapshots/text_splitter_snapshots__markdown@markdown_basics.md.snap b/tests/snapshots/text_splitter_snapshots__markdown@markdown_basics.md.snap index 96229305..9c850dcd 100644 --- a/tests/snapshots/text_splitter_snapshots__markdown@markdown_basics.md.snap +++ b/tests/snapshots/text_splitter_snapshots__markdown@markdown_basics.md.snap @@ -62,8 +62,7 @@ input_file: tests/inputs/markdown/markdown_basics.md - "Web Form\">" - Dingus - "\n" -- "\n" -- "\n\n" +- "\n\n\n" - "Getting " - "the Gist " - "of " @@ -178,7 +177,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - projects/ - markdown/ - basics.tex -- "t\n\n\n## " +- "t\n\n\n" +- "## " - Paragraphs - ", Headers," - " " @@ -385,10 +385,10 @@ input_file: tests/inputs/markdown/markdown_basics.md - "\n " - "\n\n\n\n### " -- "Phrase " -- "Emphasis #" -- "##\n\n" +- ">\n\n\n\n" +- "### Phrase" +- " Emphasis " +- "###\n\n" - "Markdown " - "uses " - "asterisks " @@ -457,8 +457,9 @@ input_file: tests/inputs/markdown/markdown_basics.md - underscore - s instead< - /strong>.< -- "/p>\n\n\n\n## " -- "Lists ##\n\n" +- "/p>\n\n\n\n" +- "## Lists #" +- "#\n\n" - "Unordered " - (bulleted) - " lists use" @@ -589,9 +590,9 @@ input_file: tests/inputs/markdown/markdown_basics.md - the list.< - "/p>\n" - " \n" -- "\n\n\n### " -- Links -- " ###\n\n" +- "\n\n\n" +- "### Links " +- "###\n\n" - "Markdown " - "supports " - two styles @@ -787,9 +788,9 @@ input_file: tests/inputs/markdown/markdown_basics.md - "The New " - York Times - ".

    \n" -- "\n\n### " -- "Images ###" - "\n\n" +- "### Images" +- " ###\n\n" - "Image " - "syntax is " - "very much " @@ -828,8 +829,9 @@ input_file: tests/inputs/markdown/markdown_basics.md - "text\" " - "title=\"" - "Title\" />\n" -- "\n\n\n### " -- "Code ###\n\n" +- "\n\n\n" +- "### Code #" +- "##\n\n" - "In a " - "regular " - "paragraph," diff --git a/tests/snapshots/text_splitter_snapshots__markdown@markdown_syntax.md-2.snap b/tests/snapshots/text_splitter_snapshots__markdown@markdown_syntax.md-2.snap index 1743feb0..0ecf49f5 100644 --- a/tests/snapshots/text_splitter_snapshots__markdown@markdown_syntax.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__markdown@markdown_syntax.md-2.snap @@ -20,8 +20,8 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "**Note:** This document is itself written using Markdown; you\n" - "can [see the source for it by adding '.text' to the URL][src]" - ".\n\n [src]: /projects/markdown/syntax.text\n\n* * *\n" -- "\n

    Overview

    \n\n

    Philosophy

    \n\n" -- "Markdown is intended to be as easy-to-read and easy-to-write as is feasible.\n\n" +- "\n

    Overview

    \n\n

    Philosophy

    \n" +- "\nMarkdown is intended to be as easy-to-read and easy-to-write as is feasible.\n\n" - "Readability, however, is emphasized above all else. A Markdown-formatted\n" - "document should be publishable as-is, as plain text, without looking\nlike it'" - "s been marked up with tags or formatting instructions. While\nMarkdown'" @@ -38,8 +38,8 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "as to look like what they mean. E.g., asterisks around a word actually\nlook like \\*emphasis\\" - "*. Markdown lists look like, well, lists. Even\n" - "blockquotes look like quoted passages of text, assuming you've ever\nused email.\n\n\n\n" -- "

    Inline HTML

    \n\n" -- "Markdown's syntax is intended for one purpose: to be used as a\nformat for *writing* for the web.\n\n" +- "

    Inline HTML

    \n" +- "\nMarkdown's syntax is intended for one purpose: to be used as a\nformat for *writing* for the web.\n\n" - "Markdown is not a replacement for HTML, or even close to it. Its\n" - "syntax is very small, corresponding only to a very small subset of\nHTML tags. The idea is *not*" - " to create a syntax that makes it easier\n" @@ -65,8 +65,9 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "used anywhere in a Markdown paragraph, list item, or header. If you\n" - "want, you can even use HTML tags instead of Markdown formatting; e.g. if\nyou'd prefer to use HTML " - "`` or `` tags instead of Markdown's\nlink or image syntax, go right ahead.\n\n" -- "Unlike block-level HTML tags, Markdown syntax *is* processed within\nspan-level tags.\n" -- "\n\n

    Automatic Escaping for Special Characters

    \n\n" +- "Unlike block-level HTML tags, Markdown syntax *is* processed within\nspan-level tags.\n\n\n" +- "

    Automatic Escaping for Special Characters

    \n" +- "\n" - "In HTML, there are two characters that demand special treatment: `<`\nand `&`. " - "Left angle brackets are used to start tags; ampersands are\n" - "used to denote HTML entities. If you want to use them as literal\n" @@ -97,7 +98,8 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "Markdown to write about HTML code. (As opposed to raw HTML, which is a\n" - "terrible format for writing about HTML syntax, because every single `<`\nand `&`" - " in your example code needs to be escaped.)\n\n\n* * *\n" -- "\n\n

    Block Elements

    \n\n\n

    Paragraphs and Line Breaks

    \n\n" +- "\n\n

    Block Elements

    \n\n\n

    Paragraphs and Line Breaks

    \n" +- "\n" - "A paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. " - "(A blank line is any line that looks like a\nblank line --" - " a line containing nothing but spaces or tabs is considered\n" @@ -112,8 +114,8 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "\"every line break is a `
    `\" rule wouldn't work for Markdown.\nMarkdown's email-style " - "[blockquoting][bq] and multi-paragraph [list items][l]" - "\nwork best -- and look better -- when you format them with hard breaks." -- "\n\n [bq]: #blockquote\n [l]: #list\n\n\n\n

    Headers

    \n\n" -- "Markdown supports two styles of headers, [Setext] [1] and [atx] [2].\n\n" +- "\n\n [bq]: #blockquote\n [l]: #list\n\n\n\n

    Headers

    \n" +- "\nMarkdown supports two styles of headers, [Setext] [1] and [atx] [2].\n\n" - "Setext-style headers are \"underlined\" using equal signs (for first-level\n" - "headers) and dashes (for second-level headers). For example:\n\n " - "This is an H1\n =============\n\n This is an H2\n -------------\n" @@ -126,8 +128,9 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "cosmetic -- you can use this if you think it looks better. The\nclosing hashes don'" - "t even need to match the number of hashes\nused to open the header. (The number of opening hashes\n" - "determines the header level.) :\n\n " -- "# This is an H1 #\n\n ## This is an H2 ##\n\n ### This is an H3 ######\n" -- "\n\n

    Blockquotes

    \n\n" +- "# This is an H1 #\n\n ## This is an H2 ##\n\n ### This is an H3 ######\n\n\n" +- "

    Blockquotes

    \n" +- "\n" - "Markdown uses email-style `>` characters for blockquoting. If you're\n" - "familiar with quoting passages of text in an email message, then you\n" - "know how to create a blockquote in Markdown. It looks best if you hard\nwrap the text and put a `>`" @@ -148,14 +151,15 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "\n " - "> This is the first level of quoting.\n >\n > > This is nested blockquote.\n >\n" - " > Back to the first level.\n\n" -- "Blockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:\n\n" -- "\t> ## This is a header.\n\t>\n\t> 1. This is the first list item.\n\t> 2. " -- "This is the second list item.\n\t>\n\t> Here's some example code:\n\t>\n\t> " -- " return shell_exec(\"echo $input | $markdown_script\");\n" +- "Blockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:\n\n\t> " +- "## This is a header.\n\t>\n\t>" +- " 1. This is the first list item.\n\t> 2. This is the second list item.\n\t>\n" +- "\t> Here's some example code:\n\t>\n\t> return shell_exec(\"echo $input | $markdown_script\");\n" - "\n" - "Any decent text editor should make email-style quoting easy. For\n" - "example, with BBEdit, you can make a selection and choose Increase\nQuote Level from the Text menu." -- "\n\n\n

    Lists

    \n\nMarkdown supports ordered (numbered) and unordered (bulleted) lists.\n\n" +- "\n\n\n

    Lists

    \n" +- "\nMarkdown supports ordered (numbered) and unordered (bulleted) lists.\n\n" - "Unordered lists use asterisks, pluses, and hyphens -- interchangably\n-- as list markers:\n\n " - "* Red\n * Green\n * Blue\n\nis equivalent to:\n\n + Red\n + Green\n + Blue\n" - "\nand:\n\n - Red\n - Green\n - Blue\n\nOrdered lists use numbers followed by periods:\n\n " @@ -218,8 +222,9 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "accident, by writing something like this:\n\n 1986. What a great season.\n" - "\n" - "In other words, a *number-period-space* sequence at the beginning of a\nline. " -- "To avoid this, you can backslash-escape the period:\n\n 1986\\. What a great season.\n" -- "\n\n\n

    Code Blocks

    \n\n" +- "To avoid this, you can backslash-escape the period:\n\n 1986\\. What a great season.\n\n\n\n" +- "

    Code Blocks

    \n" +- "\n" - "Pre-formatted code blocks are used for writing about programming or\nmarkup source code. " - "Rather than forming normal paragraphs, the lines\n" - "of a code block are interpreted literally. Markdown wraps a code block\nin both `
    ` and ``"
    @@ -248,14 +253,15 @@ input_file: tests/inputs/markdown/markdown_syntax.md
     - "Regular Markdown syntax is not processed within code blocks. E.g.,\n"
     - "asterisks are just literal asterisks within a code block. This means\nit'"
     - "s also easy to use Markdown to write about Markdown's own syntax.\n\n\n\n"
    -- "

    Horizontal Rules

    \n\n" +- "

    Horizontal Rules

    \n" +- "\n" - "You can produce a horizontal rule tag (`
    `) by placing three or\n" - "more hyphens, asterisks, or underscores on a line by themselves. If you\n" - "wish, you may use spaces between the hyphens or asterisks. Each of the\n" - "following lines will produce a horizontal rule:\n\n " - "* * *\n\n ***\n\n *****\n\n - - -\n\n ---------------------------------------\n\n\n* * *\n" -- "\n

    Span Elements

    \n\n

    Links

    \n\n" -- "Markdown supports two style of links: *inline* and *reference*.\n\n" +- "\n

    Span Elements

    \n\n

    Links

    \n" +- "\nMarkdown supports two style of links: *inline* and *reference*.\n\n" - "In both styles, the link text is delimited by [square brackets].\n\n" - "To create an inline link, use a set of regular parentheses immediately\n" - "after the link text's closing square bracket. Inside the parentheses,\n" @@ -340,7 +346,8 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "closely resembles the final output, as rendered in a browser. By\n" - "allowing you to move the markup-related metadata out of the paragraph,\n" - "you can add links without interrupting the narrative flow of your\nprose.\n\n\n" -- "

    Emphasis

    \n\n" +- "

    Emphasis

    \n" +- "\n" - "Markdown treats asterisks (`*`) and underscores (`_`) as indicators of\nemphasis. " - "Text wrapped with one `*` or `_` will be wrapped with an\nHTML `` tag; double `*`'s or `_`" - "'s will be wrapped with an HTML\n`` tag. E.g., this input:\n\n " @@ -356,8 +363,8 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "literal asterisk or underscore.\n\n" - "To produce a literal asterisk or underscore at a position where it\n" - "would otherwise be used as an emphasis delimiter, you can backslash\nescape it:\n\n " -- "\\*this text is surrounded by literal asterisks\\*\n" -- "\n\n\n

    Code

    \n\n" +- "\\*this text is surrounded by literal asterisks\\*\n\n\n\n

    Code

    \n" +- "\n" - "To indicate a span of code, wrap it with backtick quotes (`` ` ``).\n" - "Unlike a pre-formatted code block, a code span indicates code within a\n" - "normal paragraph. For example:\n\n Use the `printf()` function.\n\nwill produce:\n\n " @@ -381,7 +388,8 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "

    Please don't use any <blink> tags.

    \n\nYou can write this:\n\n " - "`—` is the decimal-encoded equivalent of `—`.\n\nto produce:\n\n " - "

    &#8212; is the decimal-encoded\n equivalent of &mdash;.

    \n" -- "\n\n\n

    Images

    \n\n" +- "\n\n\n

    Images

    \n" +- "\n" - "Admittedly, it's fairly difficult to devise a \"natural\" syntax for\n" - "placing images into a plain text document format.\n\n" - "Markdown uses an image syntax that is intended to resemble the syntax\n" @@ -400,7 +408,8 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "As of this writing, Markdown has no syntax for specifying the\n" - "dimensions of an image; if this is important to you, you can simply\nuse regular HTML ``" - " tags.\n\n\n* * *\n" -- "\n\n

    Miscellaneous

    \n\n

    Automatic Links

    \n\n" +- "\n\n

    Miscellaneous

    \n\n

    Automatic Links

    \n" +- "\n" - "Markdown supports a shortcut style for creating \"automatic\" links for URLs and email addresses: " - "simply surround the URL or email address with angle brackets. " - "What this means is that if you want to show the actual text of a URL or email address, and also have" @@ -420,7 +429,8 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "(This sort of entity-encoding trick will indeed fool many, if not\n" - "most, address-harvesting bots, but it definitely won't fool all of\nthem. It'" - "s better than nothing, but an address published in this way\n" -- "will probably eventually start receiving spam.)\n\n\n\n

    Backslash Escapes

    \n\n" +- "will probably eventually start receiving spam.)\n\n\n\n

    Backslash Escapes

    \n" +- "\n" - "Markdown allows you to use backslash escapes to generate literal\n" - "characters which would otherwise have special meaning in Markdown's\n" - "formatting syntax. For example, if you wanted to surround a word\n" diff --git a/tests/snapshots/text_splitter_snapshots__markdown@markdown_syntax.md-3.snap b/tests/snapshots/text_splitter_snapshots__markdown@markdown_syntax.md-3.snap index 04dee708..24095e3d 100644 --- a/tests/snapshots/text_splitter_snapshots__markdown@markdown_syntax.md-3.snap +++ b/tests/snapshots/text_splitter_snapshots__markdown@markdown_syntax.md-3.snap @@ -3,37 +3,41 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/markdown_syntax.md --- -- "Markdown: Syntax\n================\n\n
    \n\n\n" -- "* [Overview](#overview)\n * [Philosophy](#philosophy)\n * [Inline HTML](#html)\n * [Automatic Escaping for Special Characters](#autoescape)\n* [Block Elements](#block)\n * [Paragraphs and Line Breaks](#p)\n * [Headers](#header)\n * [Blockquotes](#blockquote)\n * [Lists](#list)\n * [Code Blocks](#precode)\n * [Horizontal Rules](#hr)\n* [Span Elements](#span)\n * [Links](#link)\n * [Emphasis](#em)\n * [Code](#code)\n * [Images](#img)\n* [Miscellaneous](#misc)\n * [Backslash Escapes](#backslash)\n * [Automatic Links](#autolink)\n\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL][src].\n\n [src]: /projects/markdown/syntax.text\n\n* * *\n" -- "\n

    Overview

    \n\n

    Philosophy

    \n\nMarkdown is intended to be as easy-to-read and easy-to-write as is feasible.\n\nReadability, however, is emphasized above all else. A Markdown-formatted\ndocument should be publishable as-is, as plain text, without looking\nlike it's been marked up with tags or formatting instructions. While\nMarkdown's syntax has been influenced by several existing text-to-HTML\nfilters -- including [Setext][1], [atx][2], [Textile][3], [reStructuredText][4],\n[Grutatext][5], and [EtText][6] -- the single biggest source of\ninspiration for Markdown's syntax is the format of plain text email.\n\n [1]: http://docutils.sourceforge.net/mirror/setext.html\n [2]: http://www.aaronsw.com/2002/atx/\n [3]: https://web.archive.org/web/20021226035527/http://textism.com/tools/textile/\n [4]: http://docutils.sourceforge.net/rst.html\n [5]: http://www.triptico.com/software/grutatxt.html\n [6]: http://ettext.taint.org/doc/\n\n" -- "To this end, Markdown's syntax is comprised entirely of punctuation\ncharacters, which punctuation characters have been carefully chosen so\nas to look like what they mean. E.g., asterisks around a word actually\nlook like \\*emphasis\\*. Markdown lists look like, well, lists. Even\nblockquotes look like quoted passages of text, assuming you've ever\nused email.\n\n\n\n

    Inline HTML

    \n\nMarkdown's syntax is intended for one purpose: to be used as a\nformat for *writing* for the web.\n\nMarkdown is not a replacement for HTML, or even close to it. Its\nsyntax is very small, corresponding only to a very small subset of\nHTML tags. The idea is *not* to create a syntax that makes it easier\nto insert HTML tags. In my opinion, HTML tags are already easy to\ninsert. The idea for Markdown is to make it easy to read, write, and\nedit prose. HTML is a *publishing* format; Markdown is a *writing*\nformat. Thus, Markdown's formatting syntax only addresses issues that\ncan be conveyed in plain text.\n\n" -- "For any markup that is not covered by Markdown's syntax, you simply\nuse HTML itself. There's no need to preface it or delimit it to\nindicate that you're switching from Markdown to HTML; you just use\nthe tags.\n\nThe only restrictions are that block-level HTML elements -- e.g. `
    `,\n``, `
    `, `

    `, etc. -- must be separated from surrounding\ncontent by blank lines, and the start and end tags of the block should\nnot be indented with tabs or spaces. Markdown is smart enough not\nto add extra (unwanted) `

    ` tags around HTML block-level tags.\n\nFor example, to add an HTML table to a Markdown article:\n\n This is a regular paragraph.\n\n

    \n \n \n \n
    Foo
    \n\n This is another regular paragraph.\n" -- "\nNote that Markdown formatting syntax is not processed within block-level\nHTML tags. E.g., you can't use Markdown-style `*emphasis*` inside an\nHTML block.\n\nSpan-level HTML tags -- e.g. ``, ``, or `` -- can be\nused anywhere in a Markdown paragraph, list item, or header. If you\nwant, you can even use HTML tags instead of Markdown formatting; e.g. if\nyou'd prefer to use HTML `` or `` tags instead of Markdown's\nlink or image syntax, go right ahead.\n\nUnlike block-level HTML tags, Markdown syntax *is* processed within\nspan-level tags.\n\n\n

    Automatic Escaping for Special Characters

    \n\nIn HTML, there are two characters that demand special treatment: `<`\nand `&`. Left angle brackets are used to start tags; ampersands are\nused to denote HTML entities. If you want to use them as literal\ncharacters, you must escape them as entities, e.g. `<`, and\n`&`.\n\n" -- "Ampersands in particular are bedeviling for web writers. If you want to\nwrite about 'AT&T', you need to write '`AT&T`'. You even need to\nescape ampersands within URLs. Thus, if you want to link to:\n\n http://images.google.com/images?num=30&q=larry+bird\n\nyou need to encode the URL as:\n\n http://images.google.com/images?num=30&q=larry+bird\n\nin your anchor tag `href` attribute. Needless to say, this is easy to\nforget, and is probably the single most common source of HTML validation\nerrors in otherwise well-marked-up web sites.\n\nMarkdown allows you to use these characters naturally, taking care of\nall the necessary escaping for you. If you use an ampersand as part of\nan HTML entity, it remains unchanged; otherwise it will be translated\ninto `&`.\n\nSo, if you want to include a copyright symbol in your article, you can write:\n\n ©\n\nand Markdown will leave it alone. But if you write:\n\n AT&T\n\nMarkdown will translate it to:\n\n AT&T\n" -- "\nSimilarly, because Markdown supports [inline HTML](#html), if you use\nangle brackets as delimiters for HTML tags, Markdown will treat them as\nsuch. But if you write:\n\n 4 < 5\n\nMarkdown will translate it to:\n\n 4 < 5\n\nHowever, inside Markdown code spans and blocks, angle brackets and\nampersands are *always* encoded automatically. This makes it easy to use\nMarkdown to write about HTML code. (As opposed to raw HTML, which is a\nterrible format for writing about HTML syntax, because every single `<`\nand `&` in your example code needs to be escaped.)\n\n\n* * *\n" -- "\n\n

    Block Elements

    \n\n\n

    Paragraphs and Line Breaks

    \n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like a\nblank line -- a line containing nothing but spaces or tabs is considered\nblank.) Normal paragraphs should not be indented with spaces or tabs.\n\nThe implication of the \"one or more consecutive lines of text\" rule is\nthat Markdown supports \"hard-wrapped\" text paragraphs. This differs\nsignificantly from most other text-to-HTML formatters (including Movable\nType's \"Convert Line Breaks\" option) which translate every line break\ncharacter in a paragraph into a `
    ` tag.\n\nWhen you *do* want to insert a `
    ` break tag using Markdown, you\nend a line with two or more spaces, then type return.\n\n" +- "Markdown: Syntax\n================\n\n
    \n\n\n* [Overview](#overview)\n * [Philosophy](#philosophy)\n * [Inline HTML](#html)\n * [Automatic Escaping for Special Characters](#autoescape)\n* [Block Elements](#block)\n * [Paragraphs and Line Breaks](#p)\n * [Headers](#header)\n * [Blockquotes](#blockquote)\n * [Lists](#list)\n * [Code Blocks](#precode)\n * [Horizontal Rules](#hr)\n* [Span Elements](#span)\n * [Links](#link)\n * [Emphasis](#em)\n * [Code](#code)\n" +- " * [Images](#img)\n* [Miscellaneous](#misc)\n * [Backslash Escapes](#backslash)\n * [Automatic Links](#autolink)\n\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL][src].\n\n [src]: /projects/markdown/syntax.text\n\n* * *\n" +- "\n

    Overview

    \n\n

    Philosophy

    \n" +- "\nMarkdown is intended to be as easy-to-read and easy-to-write as is feasible.\n\nReadability, however, is emphasized above all else. A Markdown-formatted\ndocument should be publishable as-is, as plain text, without looking\nlike it's been marked up with tags or formatting instructions. While\nMarkdown's syntax has been influenced by several existing text-to-HTML\nfilters -- including [Setext][1], [atx][2], [Textile][3], [reStructuredText][4],\n[Grutatext][5], and [EtText][6] -- the single biggest source of\ninspiration for Markdown's syntax is the format of plain text email.\n\n [1]: http://docutils.sourceforge.net/mirror/setext.html\n [2]: http://www.aaronsw.com/2002/atx/\n [3]: https://web.archive.org/web/20021226035527/http://textism.com/tools/textile/\n [4]: http://docutils.sourceforge.net/rst.html\n [5]: http://www.triptico.com/software/grutatxt.html\n [6]: http://ettext.taint.org/doc/\n\n" +- "To this end, Markdown's syntax is comprised entirely of punctuation\ncharacters, which punctuation characters have been carefully chosen so\nas to look like what they mean. E.g., asterisks around a word actually\nlook like \\*emphasis\\*. Markdown lists look like, well, lists. Even\nblockquotes look like quoted passages of text, assuming you've ever\nused email.\n\n\n\n

    Inline HTML

    \n" +- "\nMarkdown's syntax is intended for one purpose: to be used as a\nformat for *writing* for the web.\n\nMarkdown is not a replacement for HTML, or even close to it. Its\nsyntax is very small, corresponding only to a very small subset of\nHTML tags. The idea is *not* to create a syntax that makes it easier\nto insert HTML tags. In my opinion, HTML tags are already easy to\ninsert. The idea for Markdown is to make it easy to read, write, and\nedit prose. HTML is a *publishing* format; Markdown is a *writing*\nformat. Thus, Markdown's formatting syntax only addresses issues that\ncan be conveyed in plain text.\n\nFor any markup that is not covered by Markdown's syntax, you simply\nuse HTML itself. There's no need to preface it or delimit it to\nindicate that you're switching from Markdown to HTML; you just use\nthe tags.\n\n" +- "The only restrictions are that block-level HTML elements -- e.g. `
    `,\n``, `
    `, `

    `, etc. -- must be separated from surrounding\ncontent by blank lines, and the start and end tags of the block should\nnot be indented with tabs or spaces. Markdown is smart enough not\nto add extra (unwanted) `

    ` tags around HTML block-level tags.\n\nFor example, to add an HTML table to a Markdown article:\n\n This is a regular paragraph.\n\n

    \n \n \n \n
    Foo
    \n\n This is another regular paragraph.\n" +- "\nNote that Markdown formatting syntax is not processed within block-level\nHTML tags. E.g., you can't use Markdown-style `*emphasis*` inside an\nHTML block.\n\nSpan-level HTML tags -- e.g. ``, ``, or `` -- can be\nused anywhere in a Markdown paragraph, list item, or header. If you\nwant, you can even use HTML tags instead of Markdown formatting; e.g. if\nyou'd prefer to use HTML `` or `` tags instead of Markdown's\nlink or image syntax, go right ahead.\n\nUnlike block-level HTML tags, Markdown syntax *is* processed within\nspan-level tags.\n\n\n

    Automatic Escaping for Special Characters

    \n" +- "\nIn HTML, there are two characters that demand special treatment: `<`\nand `&`. Left angle brackets are used to start tags; ampersands are\nused to denote HTML entities. If you want to use them as literal\ncharacters, you must escape them as entities, e.g. `<`, and\n`&`.\n\nAmpersands in particular are bedeviling for web writers. If you want to\nwrite about 'AT&T', you need to write '`AT&T`'. You even need to\nescape ampersands within URLs. Thus, if you want to link to:\n\n http://images.google.com/images?num=30&q=larry+bird\n\nyou need to encode the URL as:\n\n http://images.google.com/images?num=30&q=larry+bird\n" +- "\nin your anchor tag `href` attribute. Needless to say, this is easy to\nforget, and is probably the single most common source of HTML validation\nerrors in otherwise well-marked-up web sites.\n\nMarkdown allows you to use these characters naturally, taking care of\nall the necessary escaping for you. If you use an ampersand as part of\nan HTML entity, it remains unchanged; otherwise it will be translated\ninto `&`.\n\nSo, if you want to include a copyright symbol in your article, you can write:\n\n ©\n\nand Markdown will leave it alone. But if you write:\n\n AT&T\n\nMarkdown will translate it to:\n\n AT&T\n\nSimilarly, because Markdown supports [inline HTML](#html), if you use\nangle brackets as delimiters for HTML tags, Markdown will treat them as\nsuch. But if you write:\n\n 4 < 5\n\nMarkdown will translate it to:\n\n 4 < 5\n" +- "\nHowever, inside Markdown code spans and blocks, angle brackets and\nampersands are *always* encoded automatically. This makes it easy to use\nMarkdown to write about HTML code. (As opposed to raw HTML, which is a\nterrible format for writing about HTML syntax, because every single `<`\nand `&` in your example code needs to be escaped.)\n\n\n* * *\n" +- "\n\n

    Block Elements

    \n\n\n

    Paragraphs and Line Breaks

    \n" +- "\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like a\nblank line -- a line containing nothing but spaces or tabs is considered\nblank.) Normal paragraphs should not be indented with spaces or tabs.\n\nThe implication of the \"one or more consecutive lines of text\" rule is\nthat Markdown supports \"hard-wrapped\" text paragraphs. This differs\nsignificantly from most other text-to-HTML formatters (including Movable\nType's \"Convert Line Breaks\" option) which translate every line break\ncharacter in a paragraph into a `
    ` tag.\n\nWhen you *do* want to insert a `
    ` break tag using Markdown, you\nend a line with two or more spaces, then type return.\n\n" - "Yes, this takes a tad more effort to create a `
    `, but a simplistic\n\"every line break is a `
    `\" rule wouldn't work for Markdown.\nMarkdown's email-style [blockquoting][bq] and multi-paragraph [list items][l]\nwork best -- and look better -- when you format them with hard breaks.\n\n [bq]: #blockquote\n [l]: #list\n\n\n\n

    Headers

    \n\nMarkdown supports two styles of headers, [Setext] [1] and [atx] [2].\n\nSetext-style headers are \"underlined\" using equal signs (for first-level\nheaders) and dashes (for second-level headers). For example:\n\n This is an H1\n =============\n\n This is an H2\n -------------\n\nAny number of underlining `=`'s or `-`'s will work.\n\nAtx-style headers use 1-6 hash characters at the start of the line,\ncorresponding to header levels 1-6. For example:\n\n # This is an H1\n\n ## This is an H2\n\n ###### This is an H6\n" - "\nOptionally, you may \"close\" atx-style headers. This is purely\ncosmetic -- you can use this if you think it looks better. The\nclosing hashes don't even need to match the number of hashes\nused to open the header. (The number of opening hashes\ndetermines the header level.) :\n\n # This is an H1 #\n\n ## This is an H2 ##\n\n ### This is an H3 ######\n\n\n

    Blockquotes

    \n\nMarkdown uses email-style `>` characters for blockquoting. If you're\nfamiliar with quoting passages of text in an email message, then you\nknow how to create a blockquote in Markdown. It looks best if you hard\nwrap the text and put a `>` before every line:\n\n " - "> This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n > consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n > Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n >\n > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse\n > id sem consectetuer libero luctus adipiscing.\n\nMarkdown allows you to be lazy and only put the `>` before the first\nline of a hard-wrapped paragraph:\n\n > This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n\n > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse\n id sem consectetuer libero luctus adipiscing.\n\nBlockquotes can be nested (i.e. a blockquote-in-a-blockquote) by\nadding additional levels of `>`:\n\n " -- "> This is the first level of quoting.\n >\n > > This is nested blockquote.\n >\n > Back to the first level.\n\nBlockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:\n\n\t> ## This is a header.\n\t>\n\t> 1. This is the first list item.\n\t> 2. This is the second list item.\n\t>\n\t> Here's some example code:\n\t>\n\t> return shell_exec(\"echo $input | $markdown_script\");\n\nAny decent text editor should make email-style quoting easy. For\nexample, with BBEdit, you can make a selection and choose Increase\nQuote Level from the Text menu.\n\n\n

    Lists

    \n\nMarkdown supports ordered (numbered) and unordered (bulleted) lists.\n\nUnordered lists use asterisks, pluses, and hyphens -- interchangably\n-- as list markers:\n\n * Red\n * Green\n * Blue\n\nis equivalent to:\n\n + Red\n + Green\n + Blue\n\nand:\n\n - Red\n - Green\n - Blue\n\nOrdered lists use numbers followed by periods:\n\n 1. Bird\n 2. McHale\n 3. Parish\n" -- "\nIt's important to note that the actual numbers you use to mark the\nlist have no effect on the HTML output Markdown produces. The HTML\nMarkdown produces from the above list is:\n\n
      \n
    1. Bird
    2. \n
    3. McHale
    4. \n
    5. Parish
    6. \n
    \n\nIf you instead wrote the list in Markdown like this:\n\n 1. Bird\n 1. McHale\n 1. Parish\n\nor even:\n\n 3. Bird\n 1. McHale\n 8. Parish\n" -- "\nyou'd get the exact same HTML output. The point is, if you want to,\nyou can use ordinal numbers in your ordered Markdown lists, so that\nthe numbers in your source match the numbers in your published HTML.\nBut if you want to be lazy, you don't have to.\n\nIf you do use lazy list numbering, however, you should still start the\nlist with the number 1. At some point in the future, Markdown may support\nstarting ordered lists at an arbitrary number.\n\nList markers typically start at the left margin, but may be indented by\nup to three spaces. List markers must be followed by one or more spaces\nor a tab.\n\nTo make lists look nice, you can wrap items with hanging indents:\n\n * Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n" -- "\nBut if you want to be lazy, you don't have to:\n\n * Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIf list items are separated by blank lines, Markdown will wrap the\nitems in `

    ` tags in the HTML output. For example, this input:\n\n * Bird\n * Magic\n\nwill turn into:\n\n

      \n
    • Bird
    • \n
    • Magic
    • \n
    \n\nBut this:\n\n * Bird\n\n * Magic\n\nwill turn into:\n\n
      \n
    • Bird

    • \n
    • Magic

    • \n
    \n\nList items may consist of multiple paragraphs. Each subsequent\nparagraph in a list item must be indented by either 4 spaces\nor one tab:\n\n " -- "1. This is a list item with two paragraphs. Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit. Aliquam hendrerit\n mi posuere lectus.\n\n Vestibulum enim wisi, viverra nec, fringilla in, laoreet\n vitae, risus. Donec sit amet nisl. Aliquam semper ipsum\n sit amet velit.\n\n 2. Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIt looks nice if you indent every line of the subsequent\nparagraphs, but here again, Markdown will allow you to be\nlazy:\n\n * This is a list item with two paragraphs.\n\n This is the second paragraph in the list item. You're\n only required to indent the first line. Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit.\n\n * Another item in the same list.\n\nTo put a blockquote within a list item, the blockquote's `>`\ndelimiters need to be indented:\n\n * A list item with a blockquote:\n\n > This is a blockquote\n > inside a list item.\n" -- "\nTo put a code block within a list item, the code block needs\nto be indented *twice* -- 8 spaces or two tabs:\n\n * A list item with a code block:\n\n \n\n\nIt's worth noting that it's possible to trigger an ordered list by\naccident, by writing something like this:\n\n 1986. What a great season.\n\nIn other words, a *number-period-space* sequence at the beginning of a\nline. To avoid this, you can backslash-escape the period:\n\n 1986\\. What a great season.\n\n\n\n

    Code Blocks

    \n\nPre-formatted code blocks are used for writing about programming or\nmarkup source code. Rather than forming normal paragraphs, the lines\nof a code block are interpreted literally. Markdown wraps a code block\nin both `
    ` and `` tags.\n\nTo produce a code block in Markdown, simply indent every line of the\nblock by at least 4 spaces or 1 tab. For example, given this input:\n\n    This is a normal paragraph:\n\n        This is a code block.\n\nMarkdown will generate:\n\n    "
    -- "

    This is a normal paragraph:

    \n\n
    This is a code block.\n    
    \n\nOne level of indentation -- 4 spaces or 1 tab -- is removed from each\nline of the code block. For example, this:\n\n Here is an example of AppleScript:\n\n tell application \"Foo\"\n beep\n end tell\n\nwill turn into:\n\n

    Here is an example of AppleScript:

    \n\n
    tell application \"Foo\"\n        beep\n    end tell\n    
    \n\nA code block continues until it reaches a line that is not indented\n(or the end of the article).\n\nWithin a code block, ampersands (`&`) and angle brackets (`<` and `>`)\nare automatically converted into HTML entities. This makes it very\neasy to include example HTML source code using Markdown -- just paste\nit and indent it, and Markdown will handle the hassle of encoding the\nampersands and angle brackets. For example, this:\n\n
    \n © 2004 Foo Corporation\n
    \n\nwill turn into:\n\n " -- "
    <div class=\"footer\">\n        &copy; 2004 Foo Corporation\n    </div>\n    
    \n\nRegular Markdown syntax is not processed within code blocks. E.g.,\nasterisks are just literal asterisks within a code block. This means\nit's also easy to use Markdown to write about Markdown's own syntax.\n\n\n\n

    Horizontal Rules

    \n\nYou can produce a horizontal rule tag (`
    `) by placing three or\nmore hyphens, asterisks, or underscores on a line by themselves. If you\nwish, you may use spaces between the hyphens or asterisks. Each of the\nfollowing lines will produce a horizontal rule:\n\n * * *\n\n ***\n\n *****\n\n - - -\n\n ---------------------------------------\n\n\n* * *\n" +- "> This is the first level of quoting.\n >\n > > This is nested blockquote.\n >\n > Back to the first level.\n\nBlockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:\n\n\t> " +- "## This is a header.\n\t>\n\t> 1. This is the first list item.\n\t> 2. This is the second list item.\n\t>\n\t> Here's some example code:\n\t>\n\t> return shell_exec(\"echo $input | $markdown_script\");\n\nAny decent text editor should make email-style quoting easy. For\nexample, with BBEdit, you can make a selection and choose Increase\nQuote Level from the Text menu.\n\n\n

    Lists

    \n\nMarkdown supports ordered (numbered) and unordered (bulleted) lists.\n\nUnordered lists use asterisks, pluses, and hyphens -- interchangably\n-- as list markers:\n\n * Red\n * Green\n * Blue\n\nis equivalent to:\n\n + Red\n + Green\n + Blue\n\nand:\n\n - Red\n - Green\n - Blue\n\nOrdered lists use numbers followed by periods:\n\n 1. Bird\n 2. McHale\n 3. Parish\n\nIt's important to note that the actual numbers you use to mark the\nlist have no effect on the HTML output Markdown produces. The HTML\nMarkdown produces from the above list is:\n\n " +- "
      \n
    1. Bird
    2. \n
    3. McHale
    4. \n
    5. Parish
    6. \n
    \n\nIf you instead wrote the list in Markdown like this:\n\n 1. Bird\n 1. McHale\n 1. Parish\n\nor even:\n\n 3. Bird\n 1. McHale\n 8. Parish\n\nyou'd get the exact same HTML output. The point is, if you want to,\nyou can use ordinal numbers in your ordered Markdown lists, so that\nthe numbers in your source match the numbers in your published HTML.\nBut if you want to be lazy, you don't have to.\n\nIf you do use lazy list numbering, however, you should still start the\nlist with the number 1. At some point in the future, Markdown may support\nstarting ordered lists at an arbitrary number.\n\nList markers typically start at the left margin, but may be indented by\nup to three spaces. List markers must be followed by one or more spaces\nor a tab.\n\nTo make lists look nice, you can wrap items with hanging indents:\n\n " +- "* Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n\nBut if you want to be lazy, you don't have to:\n\n * Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIf list items are separated by blank lines, Markdown will wrap the\nitems in `

    ` tags in the HTML output. For example, this input:\n\n * Bird\n * Magic\n\nwill turn into:\n\n

      \n
    • Bird
    • \n
    • Magic
    • \n
    \n\nBut this:\n\n * Bird\n\n * Magic\n\nwill turn into:\n\n " +- "
      \n
    • Bird

    • \n
    • Magic

    • \n
    \n\nList items may consist of multiple paragraphs. Each subsequent\nparagraph in a list item must be indented by either 4 spaces\nor one tab:\n\n 1. This is a list item with two paragraphs. Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit. Aliquam hendrerit\n mi posuere lectus.\n\n Vestibulum enim wisi, viverra nec, fringilla in, laoreet\n vitae, risus. Donec sit amet nisl. Aliquam semper ipsum\n sit amet velit.\n\n 2. Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIt looks nice if you indent every line of the subsequent\nparagraphs, but here again, Markdown will allow you to be\nlazy:\n\n * This is a list item with two paragraphs.\n\n This is the second paragraph in the list item. You're\n only required to indent the first line. Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit.\n\n * Another item in the same list.\n" +- "\nTo put a blockquote within a list item, the blockquote's `>`\ndelimiters need to be indented:\n\n * A list item with a blockquote:\n\n > This is a blockquote\n > inside a list item.\n\nTo put a code block within a list item, the code block needs\nto be indented *twice* -- 8 spaces or two tabs:\n\n * A list item with a code block:\n\n \n\n\nIt's worth noting that it's possible to trigger an ordered list by\naccident, by writing something like this:\n\n 1986. What a great season.\n\nIn other words, a *number-period-space* sequence at the beginning of a\nline. To avoid this, you can backslash-escape the period:\n\n 1986\\. What a great season.\n\n\n\n

    Code Blocks

    \n" +- "\nPre-formatted code blocks are used for writing about programming or\nmarkup source code. Rather than forming normal paragraphs, the lines\nof a code block are interpreted literally. Markdown wraps a code block\nin both `
    ` and `` tags.\n\nTo produce a code block in Markdown, simply indent every line of the\nblock by at least 4 spaces or 1 tab. For example, given this input:\n\n    This is a normal paragraph:\n\n        This is a code block.\n\nMarkdown will generate:\n\n    

    This is a normal paragraph:

    \n\n
    This is a code block.\n    
    \n\nOne level of indentation -- 4 spaces or 1 tab -- is removed from each\nline of the code block. For example, this:\n\n Here is an example of AppleScript:\n\n tell application \"Foo\"\n beep\n end tell\n\nwill turn into:\n\n

    Here is an example of AppleScript:

    \n\n
    tell application \"Foo\"\n        beep\n    end tell\n    
    \n" +- "\nA code block continues until it reaches a line that is not indented\n(or the end of the article).\n\nWithin a code block, ampersands (`&`) and angle brackets (`<` and `>`)\nare automatically converted into HTML entities. This makes it very\neasy to include example HTML source code using Markdown -- just paste\nit and indent it, and Markdown will handle the hassle of encoding the\nampersands and angle brackets. For example, this:\n\n
    \n © 2004 Foo Corporation\n
    \n\nwill turn into:\n\n
    <div class=\"footer\">\n        &copy; 2004 Foo Corporation\n    </div>\n    
    \n\nRegular Markdown syntax is not processed within code blocks. E.g.,\nasterisks are just literal asterisks within a code block. This means\nit's also easy to use Markdown to write about Markdown's own syntax.\n\n\n\n

    Horizontal Rules

    \n" +- "\nYou can produce a horizontal rule tag (`
    `) by placing three or\nmore hyphens, asterisks, or underscores on a line by themselves. If you\nwish, you may use spaces between the hyphens or asterisks. Each of the\nfollowing lines will produce a horizontal rule:\n\n * * *\n\n ***\n\n *****\n\n - - -\n\n ---------------------------------------\n\n\n* * *\n" - "\n

    Span Elements

    \n\n

    Links

    \n\nMarkdown supports two style of links: *inline* and *reference*.\n\nIn both styles, the link text is delimited by [square brackets].\n\nTo create an inline link, use a set of regular parentheses immediately\nafter the link text's closing square bracket. Inside the parentheses,\nput the URL where you want the link to point, along with an *optional*\ntitle for the link, surrounded in quotes. For example:\n\n This is [an example](http://example.com/ \"Title\") inline link.\n\n [This link](http://example.net/) has no title attribute.\n\nWill produce:\n\n

    This is \n an example inline link.

    \n\n

    This link has no\n title attribute.

    \n\nIf you're referring to a local resource on the same server, you can\nuse relative paths:\n\n See my [About](/about/) page for details.\n" - "\nReference-style links use a second set of square brackets, inside\nwhich you place a label of your choosing to identify the link:\n\n This is [an example][id] reference-style link.\n\nYou can optionally use a space to separate the sets of brackets:\n\n This is [an example] [id] reference-style link.\n\nThen, anywhere in the document, you define your link label like this,\non a line by itself:\n\n [id]: http://example.com/ \"Optional Title Here\"\n\nThat is:\n\n* Square brackets containing the link identifier (optionally\n indented from the left margin using up to three spaces);\n* followed by a colon;\n* followed by one or more spaces (or tabs);\n* followed by the URL for the link;\n* optionally followed by a title attribute for the link, enclosed\n in double or single quotes, or enclosed in parentheses.\n\nThe following three link definitions are equivalent:\n\n\t" - "[foo]: http://example.com/ \"Optional Title Here\"\n\t[foo]: http://example.com/ 'Optional Title Here'\n\t[foo]: http://example.com/ (Optional Title Here)\n\n**Note:** There is a known bug in Markdown.pl 1.0.1 which prevents\nsingle quotes from being used to delimit link titles.\n\nThe link URL may, optionally, be surrounded by angle brackets:\n\n [id]: \"Optional Title Here\"\n\nYou can put the title attribute on the next line and use extra spaces\nor tabs for padding, which tends to look better with longer URLs:\n\n [id]: http://example.com/longish/path/to/resource/here\n \"Optional Title Here\"\n\nLink definitions are only used for creating links during Markdown\nprocessing, and are stripped from your document in the HTML output.\n\nLink definition names may consist of letters, numbers, spaces, and\npunctuation -- but they are *not* case sensitive. E.g. 
these two\nlinks:\n\n\t[link text][a]\n\t[link text][A]\n" - "\nare equivalent.\n\nThe *implicit link name* shortcut allows you to omit the name of the\nlink, in which case the link text itself is used as the name.\nJust use an empty set of square brackets -- e.g., to link the word\n\"Google\" to the google.com web site, you could simply write:\n\n\t[Google][]\n\nAnd then define the link:\n\n\t[Google]: http://google.com/\n\nBecause link names may contain spaces, this shortcut even works for\nmultiple words in the link text:\n\n\tVisit [Daring Fireball][] for more information.\n\nAnd then define the link:\n\n\t[Daring Fireball]: http://daringfireball.net/\n\nLink definitions can be placed anywhere in your Markdown document. I\ntend to put them immediately after each paragraph in which they're\nused, but if you want, you can put them all at the end of your\ndocument, sort of like footnotes.\n\nHere's an example of reference links in action:\n\n " - "I get 10 times more traffic from [Google] [1] than from\n [Yahoo] [2] or [MSN] [3].\n\n [1]: http://google.com/ \"Google\"\n [2]: http://search.yahoo.com/ \"Yahoo Search\"\n [3]: http://search.msn.com/ \"MSN Search\"\n\nUsing the implicit link name shortcut, you could instead write:\n\n I get 10 times more traffic from [Google][] than from\n [Yahoo][] or [MSN][].\n\n [google]: http://google.com/ \"Google\"\n [yahoo]: http://search.yahoo.com/ \"Yahoo Search\"\n [msn]: http://search.msn.com/ \"MSN Search\"\n\nBoth of the above examples will produce the following HTML output:\n\n

    I get 10 times more traffic from Google than from\n Yahoo\n or MSN.

    \n\nFor comparison, here is the same paragraph written using\nMarkdown's inline link style:\n\n " -- "I get 10 times more traffic from [Google](http://google.com/ \"Google\")\n than from [Yahoo](http://search.yahoo.com/ \"Yahoo Search\") or\n [MSN](http://search.msn.com/ \"MSN Search\").\n" -- "\nThe point of reference-style links is not that they're easier to\nwrite. The point is that with reference-style links, your document\nsource is vastly more readable. Compare the above examples: using\nreference-style links, the paragraph itself is only 81 characters\nlong; with inline-style links, it's 176 characters; and as raw HTML,\nit's 234 characters. In the raw HTML, there's more markup than there\nis text.\n\nWith Markdown's reference-style links, a source document much more\nclosely resembles the final output, as rendered in a browser. By\nallowing you to move the markup-related metadata out of the paragraph,\nyou can add links without interrupting the narrative flow of your\nprose.\n\n\n

    Emphasis

    \n\nMarkdown treats asterisks (`*`) and underscores (`_`) as indicators of\nemphasis. Text wrapped with one `*` or `_` will be wrapped with an\nHTML `` tag; double `*`'s or `_`'s will be wrapped with an HTML\n`` tag. E.g., this input:\n\n " -- "*single asterisks*\n\n _single underscores_\n\n **double asterisks**\n\n __double underscores__\n\nwill produce:\n\n single asterisks\n\n single underscores\n\n double asterisks\n\n double underscores\n\nYou can use whichever style you prefer; the lone restriction is that\nthe same character must be used to open and close an emphasis span.\n\nEmphasis can be used in the middle of a word:\n\n un*frigging*believable\n\nBut if you surround an `*` or `_` with spaces, it'll be treated as a\nliteral asterisk or underscore.\n\nTo produce a literal asterisk or underscore at a position where it\nwould otherwise be used as an emphasis delimiter, you can backslash\nescape it:\n\n \\*this text is surrounded by literal asterisks\\*\n\n\n\n

    Code

    \n\nTo indicate a span of code, wrap it with backtick quotes (`` ` ``).\nUnlike a pre-formatted code block, a code span indicates code within a\nnormal paragraph. For example:\n\n " -- "Use the `printf()` function.\n\nwill produce:\n\n

    Use the printf() function.

    \n\nTo include a literal backtick character within a code span, you can use\nmultiple backticks as the opening and closing delimiters:\n\n ``There is a literal backtick (`) here.``\n\nwhich will produce this:\n\n

    There is a literal backtick (`) here.

    \n\nThe backtick delimiters surrounding a code span may include spaces --\none after the opening, one before the closing. This allows you to place\nliteral backtick characters at the beginning or end of a code span:\n\n\tA single backtick in a code span: `` ` ``\n\n\tA backtick-delimited string in a code span: `` `foo` ``\n\nwill produce:\n\n\t

    A single backtick in a code span: `

    \n\n\t

    A backtick-delimited string in a code span: `foo`

    \n\nWith a code span, ampersands and angle brackets are encoded as HTML\nentities automatically, which makes it easy to include example HTML\ntags. Markdown will turn this:\n\n " -- "Please don't use any `` tags.\n\ninto:\n\n

    Please don't use any <blink> tags.

    \n\nYou can write this:\n\n `—` is the decimal-encoded equivalent of `—`.\n\nto produce:\n\n

    &#8212; is the decimal-encoded\n equivalent of &mdash;.

    \n\n\n\n

    Images

    \n\nAdmittedly, it's fairly difficult to devise a \"natural\" syntax for\nplacing images into a plain text document format.\n\nMarkdown uses an image syntax that is intended to resemble the syntax\nfor links, allowing for two styles: *inline* and *reference*.\n\nInline image syntax looks like this:\n\n ![Alt text](/path/to/img.jpg)\n\n ![Alt text](/path/to/img.jpg \"Optional title\")\n\nThat is:\n\n" -- "* An exclamation mark: `!`;\n* followed by a set of square brackets, containing the `alt`\n attribute text for the image;\n* followed by a set of parentheses, containing the URL or path to\n the image, and an optional `title` attribute enclosed in double\n or single quotes.\n\nReference-style image syntax looks like this:\n\n ![Alt text][id]\n\nWhere \"id\" is the name of a defined image reference. Image references\nare defined using syntax identical to link references:\n\n [id]: url/to/image \"Optional title attribute\"\n\nAs of this writing, Markdown has no syntax for specifying the\ndimensions of an image; if this is important to you, you can simply\nuse regular HTML `` tags.\n\n\n* * *\n" +- "I get 10 times more traffic from [Google](http://google.com/ \"Google\")\n than from [Yahoo](http://search.yahoo.com/ \"Yahoo Search\") or\n [MSN](http://search.msn.com/ \"MSN Search\").\n\nThe point of reference-style links is not that they're easier to\nwrite. The point is that with reference-style links, your document\nsource is vastly more readable. Compare the above examples: using\nreference-style links, the paragraph itself is only 81 characters\nlong; with inline-style links, it's 176 characters; and as raw HTML,\nit's 234 characters. In the raw HTML, there's more markup than there\nis text.\n\nWith Markdown's reference-style links, a source document much more\nclosely resembles the final output, as rendered in a browser. 
By\nallowing you to move the markup-related metadata out of the paragraph,\nyou can add links without interrupting the narrative flow of your\nprose.\n\n\n

    Emphasis

    \n" +- "\nMarkdown treats asterisks (`*`) and underscores (`_`) as indicators of\nemphasis. Text wrapped with one `*` or `_` will be wrapped with an\nHTML `` tag; double `*`'s or `_`'s will be wrapped with an HTML\n`` tag. E.g., this input:\n\n *single asterisks*\n\n _single underscores_\n\n **double asterisks**\n\n __double underscores__\n\nwill produce:\n\n single asterisks\n\n single underscores\n\n double asterisks\n\n double underscores\n\nYou can use whichever style you prefer; the lone restriction is that\nthe same character must be used to open and close an emphasis span.\n\nEmphasis can be used in the middle of a word:\n\n un*frigging*believable\n\nBut if you surround an `*` or `_` with spaces, it'll be treated as a\nliteral asterisk or underscore.\n\nTo produce a literal asterisk or underscore at a position where it\nwould otherwise be used as an emphasis delimiter, you can backslash\nescape it:\n\n " +- "\\*this text is surrounded by literal asterisks\\*\n\n\n\n

    Code

    \n\nTo indicate a span of code, wrap it with backtick quotes (`` ` ``).\nUnlike a pre-formatted code block, a code span indicates code within a\nnormal paragraph. For example:\n\n Use the `printf()` function.\n\nwill produce:\n\n

    Use the printf() function.

    \n\nTo include a literal backtick character within a code span, you can use\nmultiple backticks as the opening and closing delimiters:\n\n ``There is a literal backtick (`) here.``\n\nwhich will produce this:\n\n

    There is a literal backtick (`) here.

    \n\nThe backtick delimiters surrounding a code span may include spaces --\none after the opening, one before the closing. This allows you to place\nliteral backtick characters at the beginning or end of a code span:\n\n\tA single backtick in a code span: `` ` ``\n\n\tA backtick-delimited string in a code span: `` `foo` ``\n\nwill produce:\n\n\t" +- "

    A single backtick in a code span: `

    \n\n\t

    A backtick-delimited string in a code span: `foo`

    \n\nWith a code span, ampersands and angle brackets are encoded as HTML\nentities automatically, which makes it easy to include example HTML\ntags. Markdown will turn this:\n\n Please don't use any `` tags.\n\ninto:\n\n

    Please don't use any <blink> tags.

    \n\nYou can write this:\n\n `—` is the decimal-encoded equivalent of `—`.\n\nto produce:\n\n

    &#8212; is the decimal-encoded\n equivalent of &mdash;.

    \n\n\n\n

    Images

    \n\nAdmittedly, it's fairly difficult to devise a \"natural\" syntax for\nplacing images into a plain text document format.\n\nMarkdown uses an image syntax that is intended to resemble the syntax\nfor links, allowing for two styles: *inline* and *reference*.\n\nInline image syntax looks like this:\n\n " +- "![Alt text](/path/to/img.jpg)\n\n ![Alt text](/path/to/img.jpg \"Optional title\")\n\nThat is:\n\n* An exclamation mark: `!`;\n* followed by a set of square brackets, containing the `alt`\n attribute text for the image;\n* followed by a set of parentheses, containing the URL or path to\n the image, and an optional `title` attribute enclosed in double\n or single quotes.\n\nReference-style image syntax looks like this:\n\n ![Alt text][id]\n\nWhere \"id\" is the name of a defined image reference. Image references\nare defined using syntax identical to link references:\n\n [id]: url/to/image \"Optional title attribute\"\n\nAs of this writing, Markdown has no syntax for specifying the\ndimensions of an image; if this is important to you, you can simply\nuse regular HTML `` tags.\n\n\n* * *\n" - "\n\n

    Miscellaneous

    \n\n

    Automatic Links

    \n\nMarkdown supports a shortcut style for creating \"automatic\" links for URLs and email addresses: simply surround the URL or email address with angle brackets. What this means is that if you want to show the actual text of a URL or email address, and also have it be a clickable link, you can do this:\n\n \n\nMarkdown will turn this into:\n\n http://example.com/\n\nAutomatic links for email addresses work similarly, except that\nMarkdown will also perform a bit of randomized decimal and hex\nentity-encoding to help obscure your address from address-harvesting\nspambots. For example, Markdown will turn this:\n\n \n\ninto something like this:\n\n " - "address@exa\n mple.com\n\nwhich will render in a browser as a clickable link to \"address@example.com\".\n\n(This sort of entity-encoding trick will indeed fool many, if not\nmost, address-harvesting bots, but it definitely won't fool all of\nthem. It's better than nothing, but an address published in this way\nwill probably eventually start receiving spam.)\n\n\n\n

    Backslash Escapes

    \n\nMarkdown allows you to use backslash escapes to generate literal\ncharacters which would otherwise have special meaning in Markdown's\nformatting syntax. For example, if you wanted to surround a word\nwith literal asterisks (instead of an HTML `` tag), you can use\nbackslashes before the asterisks, like this:\n\n \\*literal asterisks\\*\n" - "\nMarkdown provides backslash escapes for the following characters:\n\n \\ backslash\n ` backtick\n * asterisk\n _ underscore\n {} curly braces\n [] square brackets\n () parentheses\n # hash mark\n\t+\tplus sign\n\t-\tminus sign (hyphen)\n . dot\n ! exclamation mark\n" diff --git a/tests/snapshots/text_splitter_snapshots__markdown_trim@github_flavored.md-2.snap b/tests/snapshots/text_splitter_snapshots__markdown_trim@github_flavored.md-2.snap index f3ebfc55..918dba5a 100644 --- a/tests/snapshots/text_splitter_snapshots__markdown_trim@github_flavored.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__markdown_trim@github_flavored.md-2.snap @@ -3,13 +3,12 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/github_flavored.md --- -- "# Headers" -- "```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading" -- "Alternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n```\n\n#" -- "h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading" +- "# Headers\n\n```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading" +- "###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2" +- "------\n```" +- "# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading" - "Alternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n\n------" -- "# Emphasis" -- "```\nEmphasis, aka italics, with *asterisks* or 
_underscores_." +- "# Emphasis\n\n```\nEmphasis, aka italics, with *asterisks* or _underscores_." - "Strong emphasis, aka bold, with **asterisks** or __underscores__." - "Combined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes." - "~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*" @@ -19,8 +18,7 @@ input_file: tests/inputs/markdown/github_flavored.md - Combined emphasis with **asterisks and _underscores_**. - "Strikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__" - "*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n\n------" -- "# Lists" -- "```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1." +- "# Lists\n\n```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1." - "Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item." - ⋅⋅⋅You can have properly indented paragraphs within list items. 
- "Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also" @@ -50,8 +48,7 @@ input_file: tests/inputs/markdown/github_flavored.md - "+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:" - "- Marker character change forces new list start:\n * Ac tristique libero volutpat at" - "+ Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n\n------" -- "# Task lists" -- "```\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request" +- "# Task lists\n\n```\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request" - "- [x] @mentions, #refs, [links](), **formatting**, and tags supported" - "- [x] list syntax required (any unordered or ordered list supported)\n- [x] this is a complete item" - "- [ ] this is an incomplete item\n```" @@ -63,8 +60,7 @@ input_file: tests/inputs/markdown/github_flavored.md - "You can tell GitHub to ignore (or escape) Markdown formatting by using \\ before the Markdown" - "character.\n\n```\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n```" - "Let's rename \\*our-new-project\\* to \\*our-old-project\\*.\n\n------" -- "# Links" -- "```\n[I'm an inline-style link](https://www.google.com)" +- "# Links\n\n```\n[I'm an inline-style link](https://www.google.com)" - "[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")" - "[I'm a reference-style link][Arbitrary case-insensitive reference text]" - "[I'm a relative reference to a repository file](../blob/master/LICENSE)" @@ -87,8 +83,7 @@ input_file: tests/inputs/markdown/github_flavored.md - Some text to show that the reference links can follow later. 
- "[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org" - "[link text itself]: http://www.reddit.com\n\n------" -- "# Images" -- "```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![" +- "# Images\n\n```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![" - "alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo" - "Title Text 1\")\n\nReference-style:\n![alt text][logo]" - "[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title" @@ -108,14 +103,13 @@ input_file: tests/inputs/markdown/github_flavored.md - "Like links, Images also have a footnote style syntax\n\n![Alt text][id]" - "With a reference later in the document defining the URL location:" - "[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n\n------" -- "# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)" -- "```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second]." -- "Inline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second]." -- "[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n```" -- "Footnote 1 link[^first].\n\nFootnote 2 link[^second]." -- "Inline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second]." -- "[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text." -- "------" +- "# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n```\nFootnote 1 link[^first]." +- "Footnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition." +- "Duplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**" +- "and multiple paragraphs.\n\n[^second]: Footnote text.\n```\n\nFootnote 1 link[^first]." +- "Footnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition." 
+- "Duplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**" +- "and multiple paragraphs.\n\n[^second]: Footnote text.\n\n------" - "# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```" - "Inline `code` has `back-ticks around` it." - "```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{" @@ -142,8 +136,7 @@ input_file: tests/inputs/markdown/github_flavored.md - "$this->list = list(Array(\"1\"=> 2, 2=>self::ME, 3 => \\Location\\Web\\URI::class));\n\n return [" - "'uri' => $uri,\n 'value' => null,\n ];\n }\n}\n\necho URI::ME . URI::$st1;" - "__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------" -- "# Tables" -- "```\nColons can be used to align columns.\n\n| Tables | Are | Cool |" +- "# Tables\n\n```\nColons can be used to align columns.\n\n| Tables | Are | Cool |" - "| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |" - "| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |" - There must be at least 3 dashes separating each header cell. @@ -177,8 +170,7 @@ input_file: tests/inputs/markdown/github_flavored.md - "| git status | git status | git status |\n| git diff | git diff | git diff |" - "| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |" - "------" -- "# Blockquotes" -- "```\n> Blockquotes are very handy in email to emulate reply text." +- "# Blockquotes\n\n```\n> Blockquotes are very handy in email to emulate reply text." - "> This line is part of the same quote.\n\nQuote break." - "> This is a very long line that will still be quoted properly when it wraps." - "Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone." @@ -195,16 +187,14 @@ input_file: tests/inputs/markdown/github_flavored.md - ">" - "> ...by using additional greater-than signs right next to each other..." 
- "> > > ...or with spaces between arrows.\n\n------" -- "# Inline HTML" -- "```\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    " +- "# Inline HTML\n\n```\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    " - "
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    " - "```\n\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    " - "
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    " - "------" - "# Horizontal Rules\n\n```\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n```" - "Three or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n\n------" -- "# YouTube Videos" -- "```\n" - "\"IMAGE\n\n```" diff --git a/tests/snapshots/text_splitter_snapshots__markdown_trim@github_flavored.md-3.snap b/tests/snapshots/text_splitter_snapshots__markdown_trim@github_flavored.md-3.snap index 674fca44..b632d4b8 100644 --- a/tests/snapshots/text_splitter_snapshots__markdown_trim@github_flavored.md-3.snap +++ b/tests/snapshots/text_splitter_snapshots__markdown_trim@github_flavored.md-3.snap @@ -5,22 +5,20 @@ input_file: tests/inputs/markdown/github_flavored.md --- - "# Headers\n\n```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n```\n\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n\n------" - "# Emphasis\n\n```\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n```\n\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n\n------" -- "# Lists" -- "```\n1. First ordered list item\n2. 
Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n+ Create a list by starting a line with `+`, `-`, or `*`" +- "# Lists\n\n```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. 
Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n+ Create a list by starting a line with `+`, `-`, or `*`" - "+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n```\n\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses" - "1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. 
Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n\n------" - "# Task lists\n\n```\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [x] this is a complete item\n- [ ] this is an incomplete item\n```\n\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [ ] this is a complete item\n- [ ] this is an incomplete item\n\n------\n\n# Ignoring Markdown formatting\n\nYou can tell GitHub to ignore (or escape) Markdown formatting by using \\ before the Markdown character.\n\n```\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n```\n\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n\n------" -- "# Links\n\n```\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive 
reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n```" -- "[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n\n------" -- "# Images\n\n```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n```" -- "Here's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 
2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n\n------" +- "# Links\n\n```\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n```\n\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")" +- "[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n\n------" +- "# 
Images\n\n```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n```\n\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]" +- "[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n\n------" - "# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n```\n\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple 
paragraphs.\n\n[^second]: Footnote text.\n\n------" -- "# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\nInline `code` has `back-ticks around` it.\n\n```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n [Obsolete(\"...\")]\n class Program : IInterface\n {\n public static List JustDoIt(int count)\n {\n Console.WriteLine($\"Hello {Name}!\");\n return new List(new int[] { 1, 2, 3 })\n }\n }\n}\n```\n\n```css\n@font-face {\n font-family: Chunkfive; src: url('Chunkfive.otf');\n}\n\nbody, .usertext {\n color: #F0F0F0; background: #600;\n font-family: Chunkfive, sans;\n}\n\n@import url(print.css);\n@media print {\n a[href^=http]::after {\n content: attr(href)\n }\n}\n```" -- "```javascript\nfunction $initHighlight(block, cls) {\n try {\n if (cls.search(/\\bno\\-highlight\\b/) != -1)\n return process(block, true, 0x0F) +\n ` class=\"${cls}\"`;\n } catch (e) {\n /* handle exception */\n }\n for (var i = 0 / 2; i < classes.length; i++) {\n if (checkCondition(classes[i]) === undefined)\n console.log('undefined');\n }\n}\n\nexport $initHighlight;\n```" +- "# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\nInline `code` has `back-ticks around` it.\n\n```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n [Obsolete(\"...\")]\n class Program : IInterface\n {\n public static List JustDoIt(int count)\n {\n Console.WriteLine($\"Hello {Name}!\");\n return new List(new int[] { 1, 2, 3 })\n }\n }\n}\n```\n\n```css\n@font-face {\n font-family: Chunkfive; src: url('Chunkfive.otf');\n}\n\nbody, .usertext {\n color: #F0F0F0; background: #600;\n font-family: Chunkfive, sans;\n}\n\n@import url(print.css);\n@media print {\n a[href^=http]::after {\n content: attr(href)\n }\n}\n```\n\n```javascript\nfunction $initHighlight(block, cls) {\n try {\n if (cls.search(/\\bno\\-highlight\\b/) != -1)\n return process(block, 
true, 0x0F) +\n ` class=\"${cls}\"`;\n } catch (e) {\n /* handle exception */\n }" +- "for (var i = 0 / 2; i < classes.length; i++) {\n if (checkCondition(classes[i]) === undefined)\n console.log('undefined');\n }\n}\n\nexport $initHighlight;\n```" - "```php\nrequire_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory\n{\n abstract function test();\n\n public static $st1 = 1;\n const ME = \"Yo\";\n var $list = NULL;\n private $var;\n\n /**\n * Returns a URI\n *\n * @return URI\n */\n static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? $uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check\n if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }\n\n $this->var = 0 - self::$st;\n $this->list = list(Array(\"1\"=> 2, 2=>self::ME, 3 => \\Location\\Web\\URI::class));\n\n return [\n 'uri' => $uri,\n 'value' => null,\n ];\n }\n}\n\necho URI::ME . URI::$st1;" - "__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------" -- "# Tables" -- "```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. 
You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |" +- "# Tables\n\n```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |" - "| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n```\n\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are 
neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |" - "| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------" - "# Blockquotes\n\n```\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n```\n\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. 
Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n\n------" diff --git a/tests/snapshots/text_splitter_snapshots__markdown_trim@github_flavored.md.snap b/tests/snapshots/text_splitter_snapshots__markdown_trim@github_flavored.md.snap index 4a1b11ee..56b50bc0 100644 --- a/tests/snapshots/text_splitter_snapshots__markdown_trim@github_flavored.md.snap +++ b/tests/snapshots/text_splitter_snapshots__markdown_trim@github_flavored.md.snap @@ -30,16 +30,17 @@ input_file: tests/inputs/markdown/github_flavored.md - "------\n```" - "# h1" - Heading 8- -- ")\n##" -- h2 Heading -- "###" -- h3 Heading -- "####" -- h4 Heading -- "#####" -- h5 Heading -- "######" -- h6 Heading +- ) +- "## h2" +- Heading +- "### h3" +- Heading +- "#### h4" +- Heading +- "##### h5" +- Heading +- "###### h6" +- Heading - Alternativ - "ely, for" - "H1 and H2," @@ -430,8 +431,8 @@ input_file: tests/inputs/markdown/github_flavored.md - + Very - easy! - "------" -- "#" -- Task lists +- "# Task" +- lists - "```" - "- [x]" - Finish my @@ -504,8 +505,7 @@ input_file: tests/inputs/markdown/github_flavored.md - incomplete - item - "------" -- "#" -- Ignoring +- "# Ignoring" - Markdown - formatting - You can @@ -898,9 +898,9 @@ input_file: tests/inputs/markdown/github_flavored.md - "g \"The" - "Dojocat\"" - "------" -- "#" -- "[Footnotes" -- "](https://" +- "# [" +- "Footnotes]" +- "(https://" - github.com - /markdown- - it/ @@ -965,8 +965,7 @@ input_file: tests/inputs/markdown/github_flavored.md - Footnote - text. - "------" -- "#" -- Code and +- "# Code and" - Syntax - Highlighti - ng @@ -1618,8 +1617,7 @@ input_file: tests/inputs/markdown/github_flavored.md - between - arrows. - "------" -- "#" -- Inline +- "# Inline" - HTML - "```\n
    " - "
    " @@ -1686,8 +1684,7 @@ input_file: tests/inputs/markdown/github_flavored.md - ___ - Underscore - "s\n\n------" -- "#" -- YouTube +- "# YouTube" - Videos - "```" - "\n\n

    The quick brown fox jumped over the lazy\n dog's back.

    " - "

    Header 3

    \n\n
    \n

    This is a blockquote.

    " - "

    This is the second paragraph in the blockquote.

    " -- "

    This is an H2 in a blockquote

    \n
    \n\n\n\n### Phrase Emphasis ###" -- "Markdown uses asterisks and underscores to indicate spans of emphasis.\n\nMarkdown:" +- "

    This is an H2 in a blockquote

    \n " +- "### Phrase Emphasis ###\n\nMarkdown uses asterisks and underscores to indicate spans of emphasis." +- "Markdown:" - "Some of these words *are emphasized*.\n Some of these words _are emphasized also_." - "Use two asterisks for **strong emphasis**.\n Or, if you prefer, __use two underscores instead__." - "Output:" - "

    Some of these words are emphasized." - Some of these words are emphasized also.

    - "

    Use two asterisks for strong emphasis." -- "Or, if you prefer, use two underscores instead.

    \n\n\n\n## Lists ##" +- "Or, if you prefer, use two underscores instead.

    " +- "## Lists ##" - "Unordered (bulleted) lists use asterisks, pluses, and hyphens (`*`,\n`+`, and `-`) as list markers." - "These three markers are\ninterchangable; this:\n\n * Candy.\n * Gum.\n * Booze.\n\nthis:" - "+ Candy.\n + Gum.\n + Booze.\n\nand this:\n\n - Candy.\n - Gum.\n - Booze." @@ -65,7 +67,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - "You can create multi-paragraph list items by indenting\nthe paragraphs by 4 spaces or 1 tab:" - "* A list item.\n\n With multiple paragraphs.\n\n * Another item in the list.\n\nOutput:" - "
      \n
    • A list item.

      \n

      With multiple paragraphs.

    • " -- "
    • Another item in the list.

    • \n
    \n\n\n\n### Links ###" +- "
  • Another item in the list.

  • \n " +- "### Links ###" - "Markdown supports two styles for creating links: *inline* and\n*reference*." - "With both styles, you use square brackets to delimit the\ntext you want to turn into a link." - "Inline-style links use parentheses immediately after the link text.\nFor example:" @@ -88,8 +91,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - "I start my morning with a cup of coffee and\n [The New York Times][NY Times]." - "[ny times]: http://www.nytimes.com/\n\nOutput:" - "

    I start my morning with a cup of coffee and" -- "The New York Times.

    \n\n\n### Images ###" -- "Image syntax is very much like link syntax.\n\nInline (titles are optional):" +- "The New York Times.

    " +- "### Images ###\n\nImage syntax is very much like link syntax.\n\nInline (titles are optional):" - "![alt text](/path/to/img.jpg \"Title\")\n\nReference-style:" - "![alt text][id]\n\n [id]: /path/to/img.jpg \"Title\"" - "Both of the above examples produce the same output:" diff --git a/tests/snapshots/text_splitter_snapshots__markdown_trim@markdown_basics.md-3.snap b/tests/snapshots/text_splitter_snapshots__markdown_trim@markdown_basics.md-3.snap index cf73df9e..27e9342c 100644 --- a/tests/snapshots/text_splitter_snapshots__markdown_trim@markdown_basics.md-3.snap +++ b/tests/snapshots/text_splitter_snapshots__markdown_trim@markdown_basics.md-3.snap @@ -4,12 +4,14 @@ expression: chunks input_file: tests/inputs/markdown/markdown_basics.md --- - "Markdown: Basics\n================\n\n\n\n\nGetting the Gist of Markdown's Formatting Syntax\n------------------------------------------------\n\nThis page offers a brief overview of what it's like to use Markdown.\nThe [syntax page] [s] provides complete, detailed documentation for\nevery feature, but Markdown should be very easy to pick up simply by\nlooking at a few examples of it in action. The examples on this page\nare written in a before/after style, showing example syntax and the\nHTML output produced by Markdown." -- "It's also helpful to simply try Markdown out; the [Dingus] [d] is a\nweb application that allows you type your own Markdown-formatted text\nand translate it to XHTML.\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL] [src].\n\n [s]: /projects/markdown/syntax \"Markdown Syntax\"\n [d]: /projects/markdown/dingus \"Markdown Dingus\"\n [src]: /projects/markdown/basics.text\n\n\n## Paragraphs, Headers, Blockquotes ##\n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. 
(A blank line is any line that looks like\na blank line -- a line containing nothing but spaces or tabs is\nconsidered blank.) Normal paragraphs should not be indented with\nspaces or tabs." -- "Markdown offers two styles of headers: *Setext* and *atx*.\nSetext-style headers for `

    ` and `

    ` are created by\n\"underlining\" with equal signs (`=`) and hyphens (`-`), respectively.\nTo create an atx-style header, you put 1-6 hash marks (`#`) at the\nbeginning of the line -- the number of hashes equals the resulting\nHTML header level.\n\nBlockquotes are indicated using email-style '`>`' angle brackets.\n\nMarkdown:\n\n A First Level Header\n ====================\n\n A Second Level Header\n ---------------------\n\n Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.\n\n The quick brown fox jumped over the lazy\n dog's back.\n\n ### Header 3\n\n > This is a blockquote.\n >\n > This is the second paragraph in the blockquote.\n >\n > ## This is an H2 in a blockquote\n\n\nOutput:" -- "

    A First Level Header

    \n\n

    A Second Level Header

    \n\n

    Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.

    \n\n

    The quick brown fox jumped over the lazy\n dog's back.

    \n\n

    Header 3

    \n\n
    \n

    This is a blockquote.

    \n\n

    This is the second paragraph in the blockquote.

    \n\n

    This is an H2 in a blockquote

    \n
    \n\n\n\n### Phrase Emphasis ###\n\nMarkdown uses asterisks and underscores to indicate spans of emphasis.\n\nMarkdown:\n\n Some of these words *are emphasized*.\n Some of these words _are emphasized also_.\n\n Use two asterisks for **strong emphasis**.\n Or, if you prefer, __use two underscores instead__.\n\nOutput:" -- "

    Some of these words are emphasized.\n Some of these words are emphasized also.

    \n\n

    Use two asterisks for strong emphasis.\n Or, if you prefer, use two underscores instead.

    \n\n\n\n## Lists ##\n\nUnordered (bulleted) lists use asterisks, pluses, and hyphens (`*`,\n`+`, and `-`) as list markers. These three markers are\ninterchangable; this:\n\n * Candy.\n * Gum.\n * Booze.\n\nthis:\n\n + Candy.\n + Gum.\n + Booze.\n\nand this:\n\n - Candy.\n - Gum.\n - Booze.\n\nall produce the same output:\n\n
      \n
    • Candy.
    • \n
    • Gum.
    • \n
    • Booze.
    • \n
    \n\nOrdered (numbered) lists use regular numbers, followed by periods, as\nlist markers:\n\n 1. Red\n 2. Green\n 3. Blue\n\nOutput:\n\n
      \n
    1. Red
    2. \n
    3. Green
    4. \n
    5. Blue
    6. \n
    " -- "If you put blank lines between items, you'll get `

    ` tags for the\nlist item text. You can create multi-paragraph list items by indenting\nthe paragraphs by 4 spaces or 1 tab:\n\n * A list item.\n\n With multiple paragraphs.\n\n * Another item in the list.\n\nOutput:\n\n

      \n
    • A list item.

      \n

      With multiple paragraphs.

    • \n
    • Another item in the list.

    • \n
    \n\n\n\n### Links ###\n\nMarkdown supports two styles for creating links: *inline* and\n*reference*. With both styles, you use square brackets to delimit the\ntext you want to turn into a link.\n\nInline-style links use parentheses immediately after the link text.\nFor example:\n\n This is an [example link](http://example.com/).\n\nOutput:\n\n

    This is an \n example link.

    \n\nOptionally, you may include a title attribute in the parentheses:\n\n This is an [example link](http://example.com/ \"With a Title\").\n\nOutput:" -- "

    This is an \n example link.

    \n\nReference-style links allow you to refer to your links by names, which\nyou define elsewhere in your document:\n\n I get 10 times more traffic from [Google][1] than from\n [Yahoo][2] or [MSN][3].\n\n [1]: http://google.com/ \"Google\"\n [2]: http://search.yahoo.com/ \"Yahoo Search\"\n [3]: http://search.msn.com/ \"MSN Search\"\n\nOutput:\n\n

    I get 10 times more traffic from Google than from Yahoo or MSN.

    \n\nThe title attribute is optional. Link names may contain letters,\nnumbers and spaces, but are *not* case sensitive:\n\n I start my morning with a cup of coffee and\n [The New York Times][NY Times].\n\n [ny times]: http://www.nytimes.com/\n\nOutput:" -- "

    I start my morning with a cup of coffee and\n The New York Times.

    \n\n\n### Images ###\n\nImage syntax is very much like link syntax.\n\nInline (titles are optional):\n\n ![alt text](/path/to/img.jpg \"Title\")\n\nReference-style:\n\n ![alt text][id]\n\n [id]: /path/to/img.jpg \"Title\"\n\nBoth of the above examples produce the same output:\n\n \"alt\n\n\n\n### Code ###\n\nIn a regular paragraph, you can create code span by wrapping text in\nbacktick quotes. Any ampersands (`&`) and angle brackets (`<` or\n`>`) will automatically be translated into HTML entities. This makes\nit easy to use Markdown to write about HTML example code:\n\n I strongly recommend against using any `` tags.\n\n I wish SmartyPants used named entities like `—`\n instead of decimal-encoded entities like `—`.\n\nOutput:" -- "

    I strongly recommend against using any\n <blink> tags.

    \n\n

    I wish SmartyPants used named entities like\n &mdash; instead of decimal-encoded\n entities like &#8212;.

    \n\n\nTo specify an entire block of pre-formatted code, indent every line of\nthe block by 4 spaces or 1 tab. Just like with code spans, `&`, `<`,\nand `>` characters will be escaped automatically.\n\nMarkdown:\n\n If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:\n\n
    \n

    For example.

    \n
    \n\nOutput:\n\n

    If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:

    \n\n
    <blockquote>\n        <p>For example.</p>\n    </blockquote>\n    
    " +- "It's also helpful to simply try Markdown out; the [Dingus] [d] is a\nweb application that allows you type your own Markdown-formatted text\nand translate it to XHTML.\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL] [src].\n\n [s]: /projects/markdown/syntax \"Markdown Syntax\"\n [d]: /projects/markdown/dingus \"Markdown Dingus\"\n [src]: /projects/markdown/basics.text" +- "## Paragraphs, Headers, Blockquotes ##\n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like\na blank line -- a line containing nothing but spaces or tabs is\nconsidered blank.) Normal paragraphs should not be indented with\nspaces or tabs.\n\nMarkdown offers two styles of headers: *Setext* and *atx*.\nSetext-style headers for `

    ` and `

    ` are created by\n\"underlining\" with equal signs (`=`) and hyphens (`-`), respectively.\nTo create an atx-style header, you put 1-6 hash marks (`#`) at the\nbeginning of the line -- the number of hashes equals the resulting\nHTML header level.\n\nBlockquotes are indicated using email-style '`>`' angle brackets.\n\nMarkdown:" +- "A First Level Header\n ====================\n\n A Second Level Header\n ---------------------\n\n Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.\n\n The quick brown fox jumped over the lazy\n dog's back.\n\n ### Header 3\n\n > This is a blockquote.\n >\n > This is the second paragraph in the blockquote.\n >\n > ## This is an H2 in a blockquote\n\n\nOutput:\n\n

    A First Level Header

    \n\n

    A Second Level Header

    \n\n

    Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.

    \n\n

    The quick brown fox jumped over the lazy\n dog's back.

    \n\n

    Header 3

    \n\n
    \n

    This is a blockquote.

    \n\n

    This is the second paragraph in the blockquote.

    \n\n

    This is an H2 in a blockquote

    \n
    " +- "### Phrase Emphasis ###\n\nMarkdown uses asterisks and underscores to indicate spans of emphasis.\n\nMarkdown:\n\n Some of these words *are emphasized*.\n Some of these words _are emphasized also_.\n\n Use two asterisks for **strong emphasis**.\n Or, if you prefer, __use two underscores instead__.\n\nOutput:\n\n

    Some of these words are emphasized.\n Some of these words are emphasized also.

    \n\n

    Use two asterisks for strong emphasis.\n Or, if you prefer, use two underscores instead.

    " +- "## Lists ##\n\nUnordered (bulleted) lists use asterisks, pluses, and hyphens (`*`,\n`+`, and `-`) as list markers. These three markers are\ninterchangable; this:\n\n * Candy.\n * Gum.\n * Booze.\n\nthis:\n\n + Candy.\n + Gum.\n + Booze.\n\nand this:\n\n - Candy.\n - Gum.\n - Booze.\n\nall produce the same output:\n\n
      \n
    • Candy.
    • \n
    • Gum.
    • \n
    • Booze.
    • \n
    \n\nOrdered (numbered) lists use regular numbers, followed by periods, as\nlist markers:\n\n 1. Red\n 2. Green\n 3. Blue\n\nOutput:\n\n
      \n
    1. Red
    2. \n
    3. Green
    4. \n
    5. Blue
    6. \n
    \n\nIf you put blank lines between items, you'll get `

    ` tags for the\nlist item text. You can create multi-paragraph list items by indenting\nthe paragraphs by 4 spaces or 1 tab:\n\n * A list item.\n\n With multiple paragraphs.\n\n * Another item in the list.\n\nOutput:" +- "

      \n
    • A list item.

      \n

      With multiple paragraphs.

    • \n
    • Another item in the list.

    • \n
    " +- "### Links ###\n\nMarkdown supports two styles for creating links: *inline* and\n*reference*. With both styles, you use square brackets to delimit the\ntext you want to turn into a link.\n\nInline-style links use parentheses immediately after the link text.\nFor example:\n\n This is an [example link](http://example.com/).\n\nOutput:\n\n

    This is an \n example link.

    \n\nOptionally, you may include a title attribute in the parentheses:\n\n This is an [example link](http://example.com/ \"With a Title\").\n\nOutput:\n\n

    This is an \n example link.

    \n\nReference-style links allow you to refer to your links by names, which\nyou define elsewhere in your document:\n\n I get 10 times more traffic from [Google][1] than from\n [Yahoo][2] or [MSN][3].\n\n [1]: http://google.com/ \"Google\"\n [2]: http://search.yahoo.com/ \"Yahoo Search\"\n [3]: http://search.msn.com/ \"MSN Search\"\n\nOutput:" +- "

    I get 10 times more traffic from Google than from Yahoo or MSN.

    \n\nThe title attribute is optional. Link names may contain letters,\nnumbers and spaces, but are *not* case sensitive:\n\n I start my morning with a cup of coffee and\n [The New York Times][NY Times].\n\n [ny times]: http://www.nytimes.com/\n\nOutput:\n\n

    I start my morning with a cup of coffee and\n The New York Times.

    \n\n\n### Images ###\n\nImage syntax is very much like link syntax.\n\nInline (titles are optional):\n\n ![alt text](/path/to/img.jpg \"Title\")\n\nReference-style:\n\n ![alt text][id]\n\n [id]: /path/to/img.jpg \"Title\"\n\nBoth of the above examples produce the same output:\n\n \"alt" +- "### Code ###\n\nIn a regular paragraph, you can create code span by wrapping text in\nbacktick quotes. Any ampersands (`&`) and angle brackets (`<` or\n`>`) will automatically be translated into HTML entities. This makes\nit easy to use Markdown to write about HTML example code:\n\n I strongly recommend against using any `` tags.\n\n I wish SmartyPants used named entities like `—`\n instead of decimal-encoded entities like `—`.\n\nOutput:\n\n

    I strongly recommend against using any\n <blink> tags.

    \n\n

    I wish SmartyPants used named entities like\n &mdash; instead of decimal-encoded\n entities like &#8212;.

    \n\n\nTo specify an entire block of pre-formatted code, indent every line of\nthe block by 4 spaces or 1 tab. Just like with code spans, `&`, `<`,\nand `>` characters will be escaped automatically.\n\nMarkdown:" +- "If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:\n\n
    \n

    For example.

    \n
    \n\nOutput:\n\n

    If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:

    \n\n
    <blockquote>\n        <p>For example.</p>\n    </blockquote>\n    
    " diff --git a/tests/snapshots/text_splitter_snapshots__markdown_trim@markdown_basics.md.snap b/tests/snapshots/text_splitter_snapshots__markdown_trim@markdown_basics.md.snap index 811d9af9..0bf431e5 100644 --- a/tests/snapshots/text_splitter_snapshots__markdown_trim@markdown_basics.md.snap +++ b/tests/snapshots/text_splitter_snapshots__markdown_trim@markdown_basics.md.snap @@ -172,7 +172,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - projects/ - markdown/ - basics.tex -- "t\n\n\n##" +- t +- "##" - Paragraphs - ", Headers," - Blockquote @@ -362,8 +363,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - "" - "\n\n\n\n###" -- Phrase +- ">" +- "### Phrase" - "Emphasis #" - "##" - Markdown @@ -426,8 +427,9 @@ input_file: tests/inputs/markdown/markdown_basics.md - underscore - s instead< - /strong>.< -- "/p>\n\n\n\n##" -- "Lists ##" +- /p> +- "## Lists #" +- "#" - Unordered - (bulleted) - lists use @@ -758,8 +760,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - "text\"" - "title=\"" - "Title\" />" -- "### Code" -- "###" +- "### Code #" +- "##" - In a - regular - "paragraph," diff --git a/tests/snapshots/text_splitter_snapshots__markdown_trim@markdown_syntax.md-2.snap b/tests/snapshots/text_splitter_snapshots__markdown_trim@markdown_syntax.md-2.snap index dbe25a1b..e6ba5ca0 100644 --- a/tests/snapshots/text_splitter_snapshots__markdown_trim@markdown_syntax.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__markdown_trim@markdown_syntax.md-2.snap @@ -140,12 +140,14 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "Blockquotes can be nested (i.e. a blockquote-in-a-blockquote) by\nadding additional levels of `>`:" - "> This is the first level of quoting.\n >\n > > This is nested blockquote.\n >" - "> Back to the first level." -- "Blockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:" -- "> ## This is a header.\n\t>\n\t> 1. This is the first list item.\n\t> 2. 
This is the second list item." -- ">\n\t> Here's some example code:\n\t>\n\t> return shell_exec(\"echo $input | $markdown_script\");" +- "Blockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:\n\n\t>" +- "## This is a header.\n\t>\n\t>" +- "1. This is the first list item.\n\t> 2. This is the second list item.\n\t>" +- "> Here's some example code:\n\t>\n\t> return shell_exec(\"echo $input | $markdown_script\");" - Any decent text editor should make email-style quoting easy. For - "example, with BBEdit, you can make a selection and choose Increase\nQuote Level from the Text menu." -- "

    Lists

    \n\nMarkdown supports ordered (numbered) and unordered (bulleted) lists." +- "

    Lists

    " +- Markdown supports ordered (numbered) and unordered (bulleted) lists. - "Unordered lists use asterisks, pluses, and hyphens -- interchangably\n-- as list markers:" - "* Red\n * Green\n * Blue\n\nis equivalent to:\n\n + Red\n + Green\n + Blue\n\nand:" - "- Red\n - Green\n - Blue\n\nOrdered lists use numbers followed by periods:" @@ -328,8 +330,7 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "But if you surround an `*` or `_` with spaces, it'll be treated as a\nliteral asterisk or underscore." - To produce a literal asterisk or underscore at a position where it - "would otherwise be used as an emphasis delimiter, you can backslash\nescape it:" -- "\\*this text is surrounded by literal asterisks\\*" -- "

    Code

    " +- "\\*this text is surrounded by literal asterisks\\*\n\n\n\n

    Code

    " - "To indicate a span of code, wrap it with backtick quotes (`` ` ``)." - "Unlike a pre-formatted code block, a code span indicates code within a" - "normal paragraph. For example:\n\n Use the `printf()` function.\n\nwill produce:" diff --git a/tests/snapshots/text_splitter_snapshots__markdown_trim@markdown_syntax.md-3.snap b/tests/snapshots/text_splitter_snapshots__markdown_trim@markdown_syntax.md-3.snap index 9b4e4d98..3009df12 100644 --- a/tests/snapshots/text_splitter_snapshots__markdown_trim@markdown_syntax.md-3.snap +++ b/tests/snapshots/text_splitter_snapshots__markdown_trim@markdown_syntax.md-3.snap @@ -3,37 +3,41 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/markdown_syntax.md --- -- "Markdown: Syntax\n================\n\n" -- "* [Overview](#overview)\n * [Philosophy](#philosophy)\n * [Inline HTML](#html)\n * [Automatic Escaping for Special Characters](#autoescape)\n* [Block Elements](#block)\n * [Paragraphs and Line Breaks](#p)\n * [Headers](#header)\n * [Blockquotes](#blockquote)\n * [Lists](#list)\n * [Code Blocks](#precode)\n * [Horizontal Rules](#hr)\n* [Span Elements](#span)\n * [Links](#link)\n * [Emphasis](#em)\n * [Code](#code)\n * [Images](#img)\n* [Miscellaneous](#misc)\n * [Backslash Escapes](#backslash)\n * [Automatic Links](#autolink)\n\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL][src].\n\n [src]: /projects/markdown/syntax.text\n\n* * *" -- "

    Overview

    \n\n

    Philosophy

    \n\nMarkdown is intended to be as easy-to-read and easy-to-write as is feasible.\n\nReadability, however, is emphasized above all else. A Markdown-formatted\ndocument should be publishable as-is, as plain text, without looking\nlike it's been marked up with tags or formatting instructions. While\nMarkdown's syntax has been influenced by several existing text-to-HTML\nfilters -- including [Setext][1], [atx][2], [Textile][3], [reStructuredText][4],\n[Grutatext][5], and [EtText][6] -- the single biggest source of\ninspiration for Markdown's syntax is the format of plain text email.\n\n [1]: http://docutils.sourceforge.net/mirror/setext.html\n [2]: http://www.aaronsw.com/2002/atx/\n [3]: https://web.archive.org/web/20021226035527/http://textism.com/tools/textile/\n [4]: http://docutils.sourceforge.net/rst.html\n [5]: http://www.triptico.com/software/grutatxt.html\n [6]: http://ettext.taint.org/doc/" -- "To this end, Markdown's syntax is comprised entirely of punctuation\ncharacters, which punctuation characters have been carefully chosen so\nas to look like what they mean. E.g., asterisks around a word actually\nlook like \\*emphasis\\*. Markdown lists look like, well, lists. Even\nblockquotes look like quoted passages of text, assuming you've ever\nused email.\n\n\n\n

    Inline HTML

    \n\nMarkdown's syntax is intended for one purpose: to be used as a\nformat for *writing* for the web.\n\nMarkdown is not a replacement for HTML, or even close to it. Its\nsyntax is very small, corresponding only to a very small subset of\nHTML tags. The idea is *not* to create a syntax that makes it easier\nto insert HTML tags. In my opinion, HTML tags are already easy to\ninsert. The idea for Markdown is to make it easy to read, write, and\nedit prose. HTML is a *publishing* format; Markdown is a *writing*\nformat. Thus, Markdown's formatting syntax only addresses issues that\ncan be conveyed in plain text." -- "For any markup that is not covered by Markdown's syntax, you simply\nuse HTML itself. There's no need to preface it or delimit it to\nindicate that you're switching from Markdown to HTML; you just use\nthe tags.\n\nThe only restrictions are that block-level HTML elements -- e.g. `
    `,\n``, `
    `, `

    `, etc. -- must be separated from surrounding\ncontent by blank lines, and the start and end tags of the block should\nnot be indented with tabs or spaces. Markdown is smart enough not\nto add extra (unwanted) `

    ` tags around HTML block-level tags.\n\nFor example, to add an HTML table to a Markdown article:\n\n This is a regular paragraph.\n\n

    \n \n \n \n
    Foo
    \n\n This is another regular paragraph." -- "Note that Markdown formatting syntax is not processed within block-level\nHTML tags. E.g., you can't use Markdown-style `*emphasis*` inside an\nHTML block.\n\nSpan-level HTML tags -- e.g. ``, ``, or `` -- can be\nused anywhere in a Markdown paragraph, list item, or header. If you\nwant, you can even use HTML tags instead of Markdown formatting; e.g. if\nyou'd prefer to use HTML `` or `` tags instead of Markdown's\nlink or image syntax, go right ahead.\n\nUnlike block-level HTML tags, Markdown syntax *is* processed within\nspan-level tags.\n\n\n

    Automatic Escaping for Special Characters

    \n\nIn HTML, there are two characters that demand special treatment: `<`\nand `&`. Left angle brackets are used to start tags; ampersands are\nused to denote HTML entities. If you want to use them as literal\ncharacters, you must escape them as entities, e.g. `<`, and\n`&`." -- "Ampersands in particular are bedeviling for web writers. If you want to\nwrite about 'AT&T', you need to write '`AT&T`'. You even need to\nescape ampersands within URLs. Thus, if you want to link to:\n\n http://images.google.com/images?num=30&q=larry+bird\n\nyou need to encode the URL as:\n\n http://images.google.com/images?num=30&q=larry+bird\n\nin your anchor tag `href` attribute. Needless to say, this is easy to\nforget, and is probably the single most common source of HTML validation\nerrors in otherwise well-marked-up web sites.\n\nMarkdown allows you to use these characters naturally, taking care of\nall the necessary escaping for you. If you use an ampersand as part of\nan HTML entity, it remains unchanged; otherwise it will be translated\ninto `&`.\n\nSo, if you want to include a copyright symbol in your article, you can write:\n\n ©\n\nand Markdown will leave it alone. But if you write:\n\n AT&T\n\nMarkdown will translate it to:\n\n AT&T" -- "Similarly, because Markdown supports [inline HTML](#html), if you use\nangle brackets as delimiters for HTML tags, Markdown will treat them as\nsuch. But if you write:\n\n 4 < 5\n\nMarkdown will translate it to:\n\n 4 < 5\n\nHowever, inside Markdown code spans and blocks, angle brackets and\nampersands are *always* encoded automatically. This makes it easy to use\nMarkdown to write about HTML code. (As opposed to raw HTML, which is a\nterrible format for writing about HTML syntax, because every single `<`\nand `&` in your example code needs to be escaped.)\n\n\n* * *" -- "

    Block Elements

    \n\n\n

    Paragraphs and Line Breaks

    \n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like a\nblank line -- a line containing nothing but spaces or tabs is considered\nblank.) Normal paragraphs should not be indented with spaces or tabs.\n\nThe implication of the \"one or more consecutive lines of text\" rule is\nthat Markdown supports \"hard-wrapped\" text paragraphs. This differs\nsignificantly from most other text-to-HTML formatters (including Movable\nType's \"Convert Line Breaks\" option) which translate every line break\ncharacter in a paragraph into a `
    ` tag.\n\nWhen you *do* want to insert a `
    ` break tag using Markdown, you\nend a line with two or more spaces, then type return." +- "Markdown: Syntax\n================\n\n
    \n\n\n* [Overview](#overview)\n * [Philosophy](#philosophy)\n * [Inline HTML](#html)\n * [Automatic Escaping for Special Characters](#autoescape)\n* [Block Elements](#block)\n * [Paragraphs and Line Breaks](#p)\n * [Headers](#header)\n * [Blockquotes](#blockquote)\n * [Lists](#list)\n * [Code Blocks](#precode)\n * [Horizontal Rules](#hr)\n* [Span Elements](#span)\n * [Links](#link)\n * [Emphasis](#em)\n * [Code](#code)" +- "* [Images](#img)\n* [Miscellaneous](#misc)\n * [Backslash Escapes](#backslash)\n * [Automatic Links](#autolink)\n\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL][src].\n\n [src]: /projects/markdown/syntax.text\n\n* * *" +- "

    Overview

    \n\n

    Philosophy

    " +- "Markdown is intended to be as easy-to-read and easy-to-write as is feasible.\n\nReadability, however, is emphasized above all else. A Markdown-formatted\ndocument should be publishable as-is, as plain text, without looking\nlike it's been marked up with tags or formatting instructions. While\nMarkdown's syntax has been influenced by several existing text-to-HTML\nfilters -- including [Setext][1], [atx][2], [Textile][3], [reStructuredText][4],\n[Grutatext][5], and [EtText][6] -- the single biggest source of\ninspiration for Markdown's syntax is the format of plain text email.\n\n [1]: http://docutils.sourceforge.net/mirror/setext.html\n [2]: http://www.aaronsw.com/2002/atx/\n [3]: https://web.archive.org/web/20021226035527/http://textism.com/tools/textile/\n [4]: http://docutils.sourceforge.net/rst.html\n [5]: http://www.triptico.com/software/grutatxt.html\n [6]: http://ettext.taint.org/doc/" +- "To this end, Markdown's syntax is comprised entirely of punctuation\ncharacters, which punctuation characters have been carefully chosen so\nas to look like what they mean. E.g., asterisks around a word actually\nlook like \\*emphasis\\*. Markdown lists look like, well, lists. Even\nblockquotes look like quoted passages of text, assuming you've ever\nused email.\n\n\n\n

    Inline HTML

    " +- "Markdown's syntax is intended for one purpose: to be used as a\nformat for *writing* for the web.\n\nMarkdown is not a replacement for HTML, or even close to it. Its\nsyntax is very small, corresponding only to a very small subset of\nHTML tags. The idea is *not* to create a syntax that makes it easier\nto insert HTML tags. In my opinion, HTML tags are already easy to\ninsert. The idea for Markdown is to make it easy to read, write, and\nedit prose. HTML is a *publishing* format; Markdown is a *writing*\nformat. Thus, Markdown's formatting syntax only addresses issues that\ncan be conveyed in plain text.\n\nFor any markup that is not covered by Markdown's syntax, you simply\nuse HTML itself. There's no need to preface it or delimit it to\nindicate that you're switching from Markdown to HTML; you just use\nthe tags." +- "The only restrictions are that block-level HTML elements -- e.g. `
    `,\n``, `
    `, `

    `, etc. -- must be separated from surrounding\ncontent by blank lines, and the start and end tags of the block should\nnot be indented with tabs or spaces. Markdown is smart enough not\nto add extra (unwanted) `

    ` tags around HTML block-level tags.\n\nFor example, to add an HTML table to a Markdown article:\n\n This is a regular paragraph.\n\n

    \n \n \n \n
    Foo
    \n\n This is another regular paragraph." +- "Note that Markdown formatting syntax is not processed within block-level\nHTML tags. E.g., you can't use Markdown-style `*emphasis*` inside an\nHTML block.\n\nSpan-level HTML tags -- e.g. ``, ``, or `` -- can be\nused anywhere in a Markdown paragraph, list item, or header. If you\nwant, you can even use HTML tags instead of Markdown formatting; e.g. if\nyou'd prefer to use HTML `` or `` tags instead of Markdown's\nlink or image syntax, go right ahead.\n\nUnlike block-level HTML tags, Markdown syntax *is* processed within\nspan-level tags.\n\n\n

    Automatic Escaping for Special Characters

    " +- "In HTML, there are two characters that demand special treatment: `<`\nand `&`. Left angle brackets are used to start tags; ampersands are\nused to denote HTML entities. If you want to use them as literal\ncharacters, you must escape them as entities, e.g. `<`, and\n`&`.\n\nAmpersands in particular are bedeviling for web writers. If you want to\nwrite about 'AT&T', you need to write '`AT&T`'. You even need to\nescape ampersands within URLs. Thus, if you want to link to:\n\n http://images.google.com/images?num=30&q=larry+bird\n\nyou need to encode the URL as:\n\n http://images.google.com/images?num=30&q=larry+bird" +- "in your anchor tag `href` attribute. Needless to say, this is easy to\nforget, and is probably the single most common source of HTML validation\nerrors in otherwise well-marked-up web sites.\n\nMarkdown allows you to use these characters naturally, taking care of\nall the necessary escaping for you. If you use an ampersand as part of\nan HTML entity, it remains unchanged; otherwise it will be translated\ninto `&`.\n\nSo, if you want to include a copyright symbol in your article, you can write:\n\n ©\n\nand Markdown will leave it alone. But if you write:\n\n AT&T\n\nMarkdown will translate it to:\n\n AT&T\n\nSimilarly, because Markdown supports [inline HTML](#html), if you use\nangle brackets as delimiters for HTML tags, Markdown will treat them as\nsuch. But if you write:\n\n 4 < 5\n\nMarkdown will translate it to:\n\n 4 < 5" +- "However, inside Markdown code spans and blocks, angle brackets and\nampersands are *always* encoded automatically. This makes it easy to use\nMarkdown to write about HTML code. (As opposed to raw HTML, which is a\nterrible format for writing about HTML syntax, because every single `<`\nand `&` in your example code needs to be escaped.)\n\n\n* * *" +- "

    Block Elements

    \n\n\n

    Paragraphs and Line Breaks

    " +- "A paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like a\nblank line -- a line containing nothing but spaces or tabs is considered\nblank.) Normal paragraphs should not be indented with spaces or tabs.\n\nThe implication of the \"one or more consecutive lines of text\" rule is\nthat Markdown supports \"hard-wrapped\" text paragraphs. This differs\nsignificantly from most other text-to-HTML formatters (including Movable\nType's \"Convert Line Breaks\" option) which translate every line break\ncharacter in a paragraph into a `
    ` tag.\n\nWhen you *do* want to insert a `
    ` break tag using Markdown, you\nend a line with two or more spaces, then type return." - "Yes, this takes a tad more effort to create a `
    `, but a simplistic\n\"every line break is a `
    `\" rule wouldn't work for Markdown.\nMarkdown's email-style [blockquoting][bq] and multi-paragraph [list items][l]\nwork best -- and look better -- when you format them with hard breaks.\n\n [bq]: #blockquote\n [l]: #list\n\n\n\n

    Headers

    \n\nMarkdown supports two styles of headers, [Setext] [1] and [atx] [2].\n\nSetext-style headers are \"underlined\" using equal signs (for first-level\nheaders) and dashes (for second-level headers). For example:\n\n This is an H1\n =============\n\n This is an H2\n -------------\n\nAny number of underlining `=`'s or `-`'s will work.\n\nAtx-style headers use 1-6 hash characters at the start of the line,\ncorresponding to header levels 1-6. For example:\n\n # This is an H1\n\n ## This is an H2\n\n ###### This is an H6" - "Optionally, you may \"close\" atx-style headers. This is purely\ncosmetic -- you can use this if you think it looks better. The\nclosing hashes don't even need to match the number of hashes\nused to open the header. (The number of opening hashes\ndetermines the header level.) :\n\n # This is an H1 #\n\n ## This is an H2 ##\n\n ### This is an H3 ######\n\n\n

    Blockquotes

    \n\nMarkdown uses email-style `>` characters for blockquoting. If you're\nfamiliar with quoting passages of text in an email message, then you\nknow how to create a blockquote in Markdown. It looks best if you hard\nwrap the text and put a `>` before every line:" - "> This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n > consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n > Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n >\n > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse\n > id sem consectetuer libero luctus adipiscing.\n\nMarkdown allows you to be lazy and only put the `>` before the first\nline of a hard-wrapped paragraph:\n\n > This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n\n > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse\n id sem consectetuer libero luctus adipiscing.\n\nBlockquotes can be nested (i.e. a blockquote-in-a-blockquote) by\nadding additional levels of `>`:" -- "> This is the first level of quoting.\n >\n > > This is nested blockquote.\n >\n > Back to the first level.\n\nBlockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:\n\n\t> ## This is a header.\n\t>\n\t> 1. This is the first list item.\n\t> 2. This is the second list item.\n\t>\n\t> Here's some example code:\n\t>\n\t> return shell_exec(\"echo $input | $markdown_script\");\n\nAny decent text editor should make email-style quoting easy. For\nexample, with BBEdit, you can make a selection and choose Increase\nQuote Level from the Text menu.\n\n\n

    Lists

    \n\nMarkdown supports ordered (numbered) and unordered (bulleted) lists.\n\nUnordered lists use asterisks, pluses, and hyphens -- interchangably\n-- as list markers:\n\n * Red\n * Green\n * Blue\n\nis equivalent to:\n\n + Red\n + Green\n + Blue\n\nand:\n\n - Red\n - Green\n - Blue\n\nOrdered lists use numbers followed by periods:\n\n 1. Bird\n 2. McHale\n 3. Parish" -- "It's important to note that the actual numbers you use to mark the\nlist have no effect on the HTML output Markdown produces. The HTML\nMarkdown produces from the above list is:\n\n
      \n
    1. Bird
    2. \n
    3. McHale
    4. \n
    5. Parish
    6. \n
    \n\nIf you instead wrote the list in Markdown like this:\n\n 1. Bird\n 1. McHale\n 1. Parish\n\nor even:\n\n 3. Bird\n 1. McHale\n 8. Parish" -- "you'd get the exact same HTML output. The point is, if you want to,\nyou can use ordinal numbers in your ordered Markdown lists, so that\nthe numbers in your source match the numbers in your published HTML.\nBut if you want to be lazy, you don't have to.\n\nIf you do use lazy list numbering, however, you should still start the\nlist with the number 1. At some point in the future, Markdown may support\nstarting ordered lists at an arbitrary number.\n\nList markers typically start at the left margin, but may be indented by\nup to three spaces. List markers must be followed by one or more spaces\nor a tab.\n\nTo make lists look nice, you can wrap items with hanging indents:\n\n * Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing." -- "But if you want to be lazy, you don't have to:\n\n * Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIf list items are separated by blank lines, Markdown will wrap the\nitems in `

    ` tags in the HTML output. For example, this input:\n\n * Bird\n * Magic\n\nwill turn into:\n\n

      \n
    • Bird
    • \n
    • Magic
    • \n
    \n\nBut this:\n\n * Bird\n\n * Magic\n\nwill turn into:\n\n
      \n
    • Bird

    • \n
    • Magic

    • \n
    \n\nList items may consist of multiple paragraphs. Each subsequent\nparagraph in a list item must be indented by either 4 spaces\nor one tab:" -- "1. This is a list item with two paragraphs. Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit. Aliquam hendrerit\n mi posuere lectus.\n\n Vestibulum enim wisi, viverra nec, fringilla in, laoreet\n vitae, risus. Donec sit amet nisl. Aliquam semper ipsum\n sit amet velit.\n\n 2. Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIt looks nice if you indent every line of the subsequent\nparagraphs, but here again, Markdown will allow you to be\nlazy:\n\n * This is a list item with two paragraphs.\n\n This is the second paragraph in the list item. You're\n only required to indent the first line. Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit.\n\n * Another item in the same list.\n\nTo put a blockquote within a list item, the blockquote's `>`\ndelimiters need to be indented:\n\n * A list item with a blockquote:\n\n > This is a blockquote\n > inside a list item." -- "To put a code block within a list item, the code block needs\nto be indented *twice* -- 8 spaces or two tabs:\n\n * A list item with a code block:\n\n \n\n\nIt's worth noting that it's possible to trigger an ordered list by\naccident, by writing something like this:\n\n 1986. What a great season.\n\nIn other words, a *number-period-space* sequence at the beginning of a\nline. To avoid this, you can backslash-escape the period:\n\n 1986\\. What a great season.\n\n\n\n

    Code Blocks

    \n\nPre-formatted code blocks are used for writing about programming or\nmarkup source code. Rather than forming normal paragraphs, the lines\nof a code block are interpreted literally. Markdown wraps a code block\nin both `
    ` and `` tags.\n\nTo produce a code block in Markdown, simply indent every line of the\nblock by at least 4 spaces or 1 tab. For example, given this input:\n\n    This is a normal paragraph:\n\n        This is a code block.\n\nMarkdown will generate:"
    -- "

    This is a normal paragraph:

    \n\n
    This is a code block.\n    
    \n\nOne level of indentation -- 4 spaces or 1 tab -- is removed from each\nline of the code block. For example, this:\n\n Here is an example of AppleScript:\n\n tell application \"Foo\"\n beep\n end tell\n\nwill turn into:\n\n

    Here is an example of AppleScript:

    \n\n
    tell application \"Foo\"\n        beep\n    end tell\n    
    \n\nA code block continues until it reaches a line that is not indented\n(or the end of the article).\n\nWithin a code block, ampersands (`&`) and angle brackets (`<` and `>`)\nare automatically converted into HTML entities. This makes it very\neasy to include example HTML source code using Markdown -- just paste\nit and indent it, and Markdown will handle the hassle of encoding the\nampersands and angle brackets. For example, this:\n\n
    \n © 2004 Foo Corporation\n
    \n\nwill turn into:" -- "
    <div class=\"footer\">\n        &copy; 2004 Foo Corporation\n    </div>\n    
    \n\nRegular Markdown syntax is not processed within code blocks. E.g.,\nasterisks are just literal asterisks within a code block. This means\nit's also easy to use Markdown to write about Markdown's own syntax.\n\n\n\n

    Horizontal Rules

    \n\nYou can produce a horizontal rule tag (`
    `) by placing three or\nmore hyphens, asterisks, or underscores on a line by themselves. If you\nwish, you may use spaces between the hyphens or asterisks. Each of the\nfollowing lines will produce a horizontal rule:\n\n * * *\n\n ***\n\n *****\n\n - - -\n\n ---------------------------------------\n\n\n* * *" +- "> This is the first level of quoting.\n >\n > > This is nested blockquote.\n >\n > Back to the first level.\n\nBlockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:\n\n\t>" +- "## This is a header.\n\t>\n\t> 1. This is the first list item.\n\t> 2. This is the second list item.\n\t>\n\t> Here's some example code:\n\t>\n\t> return shell_exec(\"echo $input | $markdown_script\");\n\nAny decent text editor should make email-style quoting easy. For\nexample, with BBEdit, you can make a selection and choose Increase\nQuote Level from the Text menu.\n\n\n

    Lists

    \n\nMarkdown supports ordered (numbered) and unordered (bulleted) lists.\n\nUnordered lists use asterisks, pluses, and hyphens -- interchangably\n-- as list markers:\n\n * Red\n * Green\n * Blue\n\nis equivalent to:\n\n + Red\n + Green\n + Blue\n\nand:\n\n - Red\n - Green\n - Blue\n\nOrdered lists use numbers followed by periods:\n\n 1. Bird\n 2. McHale\n 3. Parish\n\nIt's important to note that the actual numbers you use to mark the\nlist have no effect on the HTML output Markdown produces. The HTML\nMarkdown produces from the above list is:" +- "
      \n
    1. Bird
    2. \n
    3. McHale
    4. \n
    5. Parish
    6. \n
    \n\nIf you instead wrote the list in Markdown like this:\n\n 1. Bird\n 1. McHale\n 1. Parish\n\nor even:\n\n 3. Bird\n 1. McHale\n 8. Parish\n\nyou'd get the exact same HTML output. The point is, if you want to,\nyou can use ordinal numbers in your ordered Markdown lists, so that\nthe numbers in your source match the numbers in your published HTML.\nBut if you want to be lazy, you don't have to.\n\nIf you do use lazy list numbering, however, you should still start the\nlist with the number 1. At some point in the future, Markdown may support\nstarting ordered lists at an arbitrary number.\n\nList markers typically start at the left margin, but may be indented by\nup to three spaces. List markers must be followed by one or more spaces\nor a tab.\n\nTo make lists look nice, you can wrap items with hanging indents:" +- "* Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n\nBut if you want to be lazy, you don't have to:\n\n * Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIf list items are separated by blank lines, Markdown will wrap the\nitems in `

    ` tags in the HTML output. For example, this input:\n\n * Bird\n * Magic\n\nwill turn into:\n\n

      \n
    • Bird
    • \n
    • Magic
    • \n
    \n\nBut this:\n\n * Bird\n\n * Magic\n\nwill turn into:" +- "
      \n
    • Bird

    • \n
    • Magic

    • \n
    \n\nList items may consist of multiple paragraphs. Each subsequent\nparagraph in a list item must be indented by either 4 spaces\nor one tab:\n\n 1. This is a list item with two paragraphs. Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit. Aliquam hendrerit\n mi posuere lectus.\n\n Vestibulum enim wisi, viverra nec, fringilla in, laoreet\n vitae, risus. Donec sit amet nisl. Aliquam semper ipsum\n sit amet velit.\n\n 2. Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIt looks nice if you indent every line of the subsequent\nparagraphs, but here again, Markdown will allow you to be\nlazy:\n\n * This is a list item with two paragraphs.\n\n This is the second paragraph in the list item. You're\n only required to indent the first line. Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit.\n\n * Another item in the same list." +- "To put a blockquote within a list item, the blockquote's `>`\ndelimiters need to be indented:\n\n * A list item with a blockquote:\n\n > This is a blockquote\n > inside a list item.\n\nTo put a code block within a list item, the code block needs\nto be indented *twice* -- 8 spaces or two tabs:\n\n * A list item with a code block:\n\n \n\n\nIt's worth noting that it's possible to trigger an ordered list by\naccident, by writing something like this:\n\n 1986. What a great season.\n\nIn other words, a *number-period-space* sequence at the beginning of a\nline. To avoid this, you can backslash-escape the period:\n\n 1986\\. What a great season.\n\n\n\n

    Code Blocks

    " +- "Pre-formatted code blocks are used for writing about programming or\nmarkup source code. Rather than forming normal paragraphs, the lines\nof a code block are interpreted literally. Markdown wraps a code block\nin both `
    ` and `` tags.\n\nTo produce a code block in Markdown, simply indent every line of the\nblock by at least 4 spaces or 1 tab. For example, given this input:\n\n    This is a normal paragraph:\n\n        This is a code block.\n\nMarkdown will generate:\n\n    

    This is a normal paragraph:

    \n\n
    This is a code block.\n    
    \n\nOne level of indentation -- 4 spaces or 1 tab -- is removed from each\nline of the code block. For example, this:\n\n Here is an example of AppleScript:\n\n tell application \"Foo\"\n beep\n end tell\n\nwill turn into:\n\n

    Here is an example of AppleScript:

    \n\n
    tell application \"Foo\"\n        beep\n    end tell\n    
    " +- "A code block continues until it reaches a line that is not indented\n(or the end of the article).\n\nWithin a code block, ampersands (`&`) and angle brackets (`<` and `>`)\nare automatically converted into HTML entities. This makes it very\neasy to include example HTML source code using Markdown -- just paste\nit and indent it, and Markdown will handle the hassle of encoding the\nampersands and angle brackets. For example, this:\n\n
    \n © 2004 Foo Corporation\n
    \n\nwill turn into:\n\n
    <div class=\"footer\">\n        &copy; 2004 Foo Corporation\n    </div>\n    
    \n\nRegular Markdown syntax is not processed within code blocks. E.g.,\nasterisks are just literal asterisks within a code block. This means\nit's also easy to use Markdown to write about Markdown's own syntax.\n\n\n\n

    Horizontal Rules

    " +- "You can produce a horizontal rule tag (`
    `) by placing three or\nmore hyphens, asterisks, or underscores on a line by themselves. If you\nwish, you may use spaces between the hyphens or asterisks. Each of the\nfollowing lines will produce a horizontal rule:\n\n * * *\n\n ***\n\n *****\n\n - - -\n\n ---------------------------------------\n\n\n* * *" - "

    Span Elements

    \n\n

    Links

    \n\nMarkdown supports two style of links: *inline* and *reference*.\n\nIn both styles, the link text is delimited by [square brackets].\n\nTo create an inline link, use a set of regular parentheses immediately\nafter the link text's closing square bracket. Inside the parentheses,\nput the URL where you want the link to point, along with an *optional*\ntitle for the link, surrounded in quotes. For example:\n\n This is [an example](http://example.com/ \"Title\") inline link.\n\n [This link](http://example.net/) has no title attribute.\n\nWill produce:\n\n

    This is \n an example inline link.

    \n\n

    This link has no\n title attribute.

    \n\nIf you're referring to a local resource on the same server, you can\nuse relative paths:\n\n See my [About](/about/) page for details." - "Reference-style links use a second set of square brackets, inside\nwhich you place a label of your choosing to identify the link:\n\n This is [an example][id] reference-style link.\n\nYou can optionally use a space to separate the sets of brackets:\n\n This is [an example] [id] reference-style link.\n\nThen, anywhere in the document, you define your link label like this,\non a line by itself:\n\n [id]: http://example.com/ \"Optional Title Here\"\n\nThat is:\n\n* Square brackets containing the link identifier (optionally\n indented from the left margin using up to three spaces);\n* followed by a colon;\n* followed by one or more spaces (or tabs);\n* followed by the URL for the link;\n* optionally followed by a title attribute for the link, enclosed\n in double or single quotes, or enclosed in parentheses.\n\nThe following three link definitions are equivalent:" - "[foo]: http://example.com/ \"Optional Title Here\"\n\t[foo]: http://example.com/ 'Optional Title Here'\n\t[foo]: http://example.com/ (Optional Title Here)\n\n**Note:** There is a known bug in Markdown.pl 1.0.1 which prevents\nsingle quotes from being used to delimit link titles.\n\nThe link URL may, optionally, be surrounded by angle brackets:\n\n [id]: \"Optional Title Here\"\n\nYou can put the title attribute on the next line and use extra spaces\nor tabs for padding, which tends to look better with longer URLs:\n\n [id]: http://example.com/longish/path/to/resource/here\n \"Optional Title Here\"\n\nLink definitions are only used for creating links during Markdown\nprocessing, and are stripped from your document in the HTML output.\n\nLink definition names may consist of letters, numbers, spaces, and\npunctuation -- but they are *not* case sensitive. E.g. 
these two\nlinks:\n\n\t[link text][a]\n\t[link text][A]" - "are equivalent.\n\nThe *implicit link name* shortcut allows you to omit the name of the\nlink, in which case the link text itself is used as the name.\nJust use an empty set of square brackets -- e.g., to link the word\n\"Google\" to the google.com web site, you could simply write:\n\n\t[Google][]\n\nAnd then define the link:\n\n\t[Google]: http://google.com/\n\nBecause link names may contain spaces, this shortcut even works for\nmultiple words in the link text:\n\n\tVisit [Daring Fireball][] for more information.\n\nAnd then define the link:\n\n\t[Daring Fireball]: http://daringfireball.net/\n\nLink definitions can be placed anywhere in your Markdown document. I\ntend to put them immediately after each paragraph in which they're\nused, but if you want, you can put them all at the end of your\ndocument, sort of like footnotes.\n\nHere's an example of reference links in action:" - "I get 10 times more traffic from [Google] [1] than from\n [Yahoo] [2] or [MSN] [3].\n\n [1]: http://google.com/ \"Google\"\n [2]: http://search.yahoo.com/ \"Yahoo Search\"\n [3]: http://search.msn.com/ \"MSN Search\"\n\nUsing the implicit link name shortcut, you could instead write:\n\n I get 10 times more traffic from [Google][] than from\n [Yahoo][] or [MSN][].\n\n [google]: http://google.com/ \"Google\"\n [yahoo]: http://search.yahoo.com/ \"Yahoo Search\"\n [msn]: http://search.msn.com/ \"MSN Search\"\n\nBoth of the above examples will produce the following HTML output:\n\n

    I get 10 times more traffic from Google than from\n Yahoo\n or MSN.

    \n\nFor comparison, here is the same paragraph written using\nMarkdown's inline link style:" -- "I get 10 times more traffic from [Google](http://google.com/ \"Google\")\n than from [Yahoo](http://search.yahoo.com/ \"Yahoo Search\") or\n [MSN](http://search.msn.com/ \"MSN Search\")." -- "The point of reference-style links is not that they're easier to\nwrite. The point is that with reference-style links, your document\nsource is vastly more readable. Compare the above examples: using\nreference-style links, the paragraph itself is only 81 characters\nlong; with inline-style links, it's 176 characters; and as raw HTML,\nit's 234 characters. In the raw HTML, there's more markup than there\nis text.\n\nWith Markdown's reference-style links, a source document much more\nclosely resembles the final output, as rendered in a browser. By\nallowing you to move the markup-related metadata out of the paragraph,\nyou can add links without interrupting the narrative flow of your\nprose.\n\n\n

    Emphasis

    \n\nMarkdown treats asterisks (`*`) and underscores (`_`) as indicators of\nemphasis. Text wrapped with one `*` or `_` will be wrapped with an\nHTML `` tag; double `*`'s or `_`'s will be wrapped with an HTML\n`` tag. E.g., this input:" -- "*single asterisks*\n\n _single underscores_\n\n **double asterisks**\n\n __double underscores__\n\nwill produce:\n\n single asterisks\n\n single underscores\n\n double asterisks\n\n double underscores\n\nYou can use whichever style you prefer; the lone restriction is that\nthe same character must be used to open and close an emphasis span.\n\nEmphasis can be used in the middle of a word:\n\n un*frigging*believable\n\nBut if you surround an `*` or `_` with spaces, it'll be treated as a\nliteral asterisk or underscore.\n\nTo produce a literal asterisk or underscore at a position where it\nwould otherwise be used as an emphasis delimiter, you can backslash\nescape it:\n\n \\*this text is surrounded by literal asterisks\\*\n\n\n\n

    Code

    \n\nTo indicate a span of code, wrap it with backtick quotes (`` ` ``).\nUnlike a pre-formatted code block, a code span indicates code within a\nnormal paragraph. For example:" -- "Use the `printf()` function.\n\nwill produce:\n\n

    Use the printf() function.

    \n\nTo include a literal backtick character within a code span, you can use\nmultiple backticks as the opening and closing delimiters:\n\n ``There is a literal backtick (`) here.``\n\nwhich will produce this:\n\n

    There is a literal backtick (`) here.

    \n\nThe backtick delimiters surrounding a code span may include spaces --\none after the opening, one before the closing. This allows you to place\nliteral backtick characters at the beginning or end of a code span:\n\n\tA single backtick in a code span: `` ` ``\n\n\tA backtick-delimited string in a code span: `` `foo` ``\n\nwill produce:\n\n\t

    A single backtick in a code span: `

    \n\n\t

    A backtick-delimited string in a code span: `foo`

    \n\nWith a code span, ampersands and angle brackets are encoded as HTML\nentities automatically, which makes it easy to include example HTML\ntags. Markdown will turn this:" -- "Please don't use any `` tags.\n\ninto:\n\n

    Please don't use any <blink> tags.

    \n\nYou can write this:\n\n `—` is the decimal-encoded equivalent of `—`.\n\nto produce:\n\n

    &#8212; is the decimal-encoded\n equivalent of &mdash;.

    \n\n\n\n

    Images

    \n\nAdmittedly, it's fairly difficult to devise a \"natural\" syntax for\nplacing images into a plain text document format.\n\nMarkdown uses an image syntax that is intended to resemble the syntax\nfor links, allowing for two styles: *inline* and *reference*.\n\nInline image syntax looks like this:\n\n ![Alt text](/path/to/img.jpg)\n\n ![Alt text](/path/to/img.jpg \"Optional title\")\n\nThat is:" -- "* An exclamation mark: `!`;\n* followed by a set of square brackets, containing the `alt`\n attribute text for the image;\n* followed by a set of parentheses, containing the URL or path to\n the image, and an optional `title` attribute enclosed in double\n or single quotes.\n\nReference-style image syntax looks like this:\n\n ![Alt text][id]\n\nWhere \"id\" is the name of a defined image reference. Image references\nare defined using syntax identical to link references:\n\n [id]: url/to/image \"Optional title attribute\"\n\nAs of this writing, Markdown has no syntax for specifying the\ndimensions of an image; if this is important to you, you can simply\nuse regular HTML `` tags.\n\n\n* * *" +- "I get 10 times more traffic from [Google](http://google.com/ \"Google\")\n than from [Yahoo](http://search.yahoo.com/ \"Yahoo Search\") or\n [MSN](http://search.msn.com/ \"MSN Search\").\n\nThe point of reference-style links is not that they're easier to\nwrite. The point is that with reference-style links, your document\nsource is vastly more readable. Compare the above examples: using\nreference-style links, the paragraph itself is only 81 characters\nlong; with inline-style links, it's 176 characters; and as raw HTML,\nit's 234 characters. In the raw HTML, there's more markup than there\nis text.\n\nWith Markdown's reference-style links, a source document much more\nclosely resembles the final output, as rendered in a browser. 
By\nallowing you to move the markup-related metadata out of the paragraph,\nyou can add links without interrupting the narrative flow of your\nprose.\n\n\n

    Emphasis

    " +- "Markdown treats asterisks (`*`) and underscores (`_`) as indicators of\nemphasis. Text wrapped with one `*` or `_` will be wrapped with an\nHTML `` tag; double `*`'s or `_`'s will be wrapped with an HTML\n`` tag. E.g., this input:\n\n *single asterisks*\n\n _single underscores_\n\n **double asterisks**\n\n __double underscores__\n\nwill produce:\n\n single asterisks\n\n single underscores\n\n double asterisks\n\n double underscores\n\nYou can use whichever style you prefer; the lone restriction is that\nthe same character must be used to open and close an emphasis span.\n\nEmphasis can be used in the middle of a word:\n\n un*frigging*believable\n\nBut if you surround an `*` or `_` with spaces, it'll be treated as a\nliteral asterisk or underscore.\n\nTo produce a literal asterisk or underscore at a position where it\nwould otherwise be used as an emphasis delimiter, you can backslash\nescape it:" +- "\\*this text is surrounded by literal asterisks\\*\n\n\n\n

    Code

    \n\nTo indicate a span of code, wrap it with backtick quotes (`` ` ``).\nUnlike a pre-formatted code block, a code span indicates code within a\nnormal paragraph. For example:\n\n Use the `printf()` function.\n\nwill produce:\n\n

    Use the printf() function.

    \n\nTo include a literal backtick character within a code span, you can use\nmultiple backticks as the opening and closing delimiters:\n\n ``There is a literal backtick (`) here.``\n\nwhich will produce this:\n\n

    There is a literal backtick (`) here.

    \n\nThe backtick delimiters surrounding a code span may include spaces --\none after the opening, one before the closing. This allows you to place\nliteral backtick characters at the beginning or end of a code span:\n\n\tA single backtick in a code span: `` ` ``\n\n\tA backtick-delimited string in a code span: `` `foo` ``\n\nwill produce:" +- "

    A single backtick in a code span: `

    \n\n\t

    A backtick-delimited string in a code span: `foo`

    \n\nWith a code span, ampersands and angle brackets are encoded as HTML\nentities automatically, which makes it easy to include example HTML\ntags. Markdown will turn this:\n\n Please don't use any `` tags.\n\ninto:\n\n

    Please don't use any <blink> tags.

    \n\nYou can write this:\n\n `—` is the decimal-encoded equivalent of `—`.\n\nto produce:\n\n

    &#8212; is the decimal-encoded\n equivalent of &mdash;.

    \n\n\n\n

    Images

    \n\nAdmittedly, it's fairly difficult to devise a \"natural\" syntax for\nplacing images into a plain text document format.\n\nMarkdown uses an image syntax that is intended to resemble the syntax\nfor links, allowing for two styles: *inline* and *reference*.\n\nInline image syntax looks like this:" +- "![Alt text](/path/to/img.jpg)\n\n ![Alt text](/path/to/img.jpg \"Optional title\")\n\nThat is:\n\n* An exclamation mark: `!`;\n* followed by a set of square brackets, containing the `alt`\n attribute text for the image;\n* followed by a set of parentheses, containing the URL or path to\n the image, and an optional `title` attribute enclosed in double\n or single quotes.\n\nReference-style image syntax looks like this:\n\n ![Alt text][id]\n\nWhere \"id\" is the name of a defined image reference. Image references\nare defined using syntax identical to link references:\n\n [id]: url/to/image \"Optional title attribute\"\n\nAs of this writing, Markdown has no syntax for specifying the\ndimensions of an image; if this is important to you, you can simply\nuse regular HTML `` tags.\n\n\n* * *" - "

    Miscellaneous

    \n\n

    Automatic Links

    \n\nMarkdown supports a shortcut style for creating \"automatic\" links for URLs and email addresses: simply surround the URL or email address with angle brackets. What this means is that if you want to show the actual text of a URL or email address, and also have it be a clickable link, you can do this:\n\n \n\nMarkdown will turn this into:\n\n http://example.com/\n\nAutomatic links for email addresses work similarly, except that\nMarkdown will also perform a bit of randomized decimal and hex\nentity-encoding to help obscure your address from address-harvesting\nspambots. For example, Markdown will turn this:\n\n \n\ninto something like this:" - "address@exa\n mple.com\n\nwhich will render in a browser as a clickable link to \"address@example.com\".\n\n(This sort of entity-encoding trick will indeed fool many, if not\nmost, address-harvesting bots, but it definitely won't fool all of\nthem. It's better than nothing, but an address published in this way\nwill probably eventually start receiving spam.)\n\n\n\n

    Backslash Escapes

    \n\nMarkdown allows you to use backslash escapes to generate literal\ncharacters which would otherwise have special meaning in Markdown's\nformatting syntax. For example, if you wanted to surround a word\nwith literal asterisks (instead of an HTML `` tag), you can use\nbackslashes before the asterisks, like this:\n\n \\*literal asterisks\\*" - "Markdown provides backslash escapes for the following characters:\n\n \\ backslash\n ` backtick\n * asterisk\n _ underscore\n {} curly braces\n [] square brackets\n () parentheses\n # hash mark\n\t+\tplus sign\n\t-\tminus sign (hyphen)\n . dot\n ! exclamation mark" diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md-2.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md-2.snap index b8af2209..e27079b8 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md-2.snap @@ -3,13 +3,12 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/github_flavored.md --- -- "# Headers\n\n```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n```" -- "\n\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n\n------\n" -- "\n# Emphasis\n\n" -- "```\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. 
~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n```" -- "\n\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n\n------\n" -- "\n# Lists\n\n" -- "```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. " +- "# Headers\n\n```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n```\n\n" +- "# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n\n------\n\n" +- "# Emphasis\n\n```\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n" +- "~~Strikethrough~~\n```\n\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. 
~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n" +- "~~Strikethrough~~\n\n------\n\n" +- "# Lists\n\n```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. " - "Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n" - "⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n" - " * Mention all the members of my team\n * Ask for feedback\n\n+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n```\n\n" @@ -17,32 +16,29 @@ input_file: tests/inputs/markdown/github_flavored.md - "⋅⋅⋅You can have properly indented paragraphs within list items. 
Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n" - "⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n" - "* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n" -- "+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n\n------\n" -- "\n# Task lists\n\n```\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [x] this is a complete item\n- [ ] this is an incomplete item\n```\n\n" -- "- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [ ] this is a complete item\n- [ ] this is an incomplete item\n\n------\n" -- "\n# Ignoring Markdown formatting\n\nYou can tell GitHub to ignore (or escape) Markdown formatting by using \\ before the Markdown character.\n\n```\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n```\n\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n\n------\n" -- "\n# Links\n\n" -- "```\n[I'm an inline-style 
link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\n" +- "+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n\n------\n\n" +- "# Task lists\n\n```\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [x] this is a complete item\n- [ ] this is an incomplete item\n```\n\n" +- "- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [ ] this is a complete item\n- [ ] this is an incomplete item\n\n------\n\n" +- "# Ignoring Markdown formatting\n\nYou can tell GitHub to ignore (or escape) Markdown formatting by using \\ before the Markdown character.\n\n```\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n```\n\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n\n------\n\n" +- "# Links\n\n```\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave 
it empty and use the [link text itself].\n\n" - "URLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n```\n\n" - "[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\n" -- "URLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n\n------\n" -- "\n# Images\n\n" -- "```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![" +- "URLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n\n------\n\n" +- "# 
Images\n\n```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![" - "Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n```\n\n" - "Here's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n" -- "![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n\n------\n" -- "\n# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n```" -- "\n\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have 
markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n\n------\n" -- "\n# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\nInline `code` has `back-ticks around` it.\n\n" -- "```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n [Obsolete(\"...\")]\n class Program : IInterface\n {\n public static List JustDoIt(int count)\n {\n Console.WriteLine($\"Hello {Name}!\");\n return new List(new int[] { 1, 2, 3 })\n }\n }\n}\n```\n\n" +- "![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n\n------\n\n" +- "# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n```\n\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\n" +- "Inline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n\n------\n\n" +- "# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\nInline `code` has `back-ticks around` it.\n\n```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n [Obsolete(\"...\")]\n class Program : IInterface\n {\n public static List JustDoIt(int count)\n {\n Console.WriteLine($\"Hello {Name}!\");\n" +- " return new List(new int[] { 1, 2, 3 })\n }\n }\n}\n```\n\n" - 
"```css\n@font-face {\n font-family: Chunkfive; src: url('Chunkfive.otf');\n}\n\nbody, .usertext {\n color: #F0F0F0; background: #600;\n font-family: Chunkfive, sans;\n}\n\n@import url(print.css);\n@media print {\n a[href^=http]::after {\n content: attr(href)\n }\n}\n```\n\n" - "```javascript\nfunction $initHighlight(block, cls) {\n try {\n if (cls.search(/\\bno\\-highlight\\b/) != -1)\n return process(block, true, 0x0F) +\n ` class=\"${cls}\"`;\n } catch (e) {\n /* handle exception */\n }\n for (var i = 0 / 2; i < classes.length; i++) {\n if (checkCondition(classes[i]) === undefined)\n" - " console.log('undefined');\n }\n}\n\nexport $initHighlight;\n```\n\n" - "```php\nrequire_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory\n{\n abstract function test();\n\n public static $st1 = 1;\n const ME = \"Yo\";\n var $list = NULL;\n private $var;\n\n /**\n * Returns a URI\n *\n * @return URI\n */\n" - " static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? $uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check\n if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }\n\n" -- " $this->var = 0 - self::$st;\n $this->list = list(Array(\"1\"=> 2, 2=>self::ME, 3 => \\Location\\Web\\URI::class));\n\n return [\n 'uri' => $uri,\n 'value' => null,\n ];\n }\n}\n\necho URI::ME . 
URI::$st1;\n\n__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------\n" -- "\n# Tables\n\n" -- "```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\n" +- " $this->var = 0 - self::$st;\n $this->list = list(Array(\"1\"=> 2, 2=>self::ME, 3 => \\Location\\Web\\URI::class));\n\n return [\n 'uri' => $uri,\n 'value' => null,\n ];\n }\n}\n\necho URI::ME . URI::$st1;\n\n__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------\n\n" +- "# Tables\n\n```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\n" - "The outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. 
You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n" - "| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n" - "| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n```\n\nColons can be used to align columns.\n\n" @@ -50,16 +46,14 @@ input_file: tests/inputs/markdown/github_flavored.md - "\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. 
You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n" - "| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n" - "| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n" -- "| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------\n" -- "\n# Blockquotes\n\n" -- "```\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n" +- "| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------\n\n" +- "# Blockquotes\n\n```\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. 
Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n" - ">> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n```\n\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n" -- "> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n\n------\n" -- "\n# Inline HTML\n\n```\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n```" -- "\n\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n\n------\n\n# Horizontal Rules\n\n```\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n```\n\nThree or more...\n\n---\n\nHyphens\n\n" -- "***\n\nAsterisks\n\n___\n\nUnderscores\n\n------\n" -- "\n# YouTube Videos\n\n```\n\n\"IMAGE\n\n```" -- "\n\n\n\"IMAGE\n\n\n" +- "> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n\n------\n\n" +- "# Inline HTML\n\n```\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n```\n\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n" +- "
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n\n------\n\n# Horizontal Rules\n\n```\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n```\n\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n\n------\n\n" +- "# YouTube Videos\n\n```\n\n\"IMAGE\n\n```\n\n\n\"IMAGE\n\n\n" - "```\n[![IMAGE ALT TEXT HERE](http://img.youtube.com/vi/YOUTUBE_VIDEO_ID_HERE/0.jpg)](http://www.youtube.com/watch?v=YOUTUBE_VIDEO_ID_HERE)\n```" - "\n\n[![IMAGE ALT TEXT HERE](https://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/YouTube_logo_2015.svg/1200px-YouTube_logo_2015.svg.png)](https://www.youtube.com/watch?v=ciawICBvQoE)\n" diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md-3.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md-3.snap index b75bb211..1513972e 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md-3.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md-3.snap @@ -3,10 +3,10 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/github_flavored.md --- -- "# Headers\n\n```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n```\n\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n\n------\n\n# Emphasis\n\n```\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. 
~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n```\n\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n\n------\n" -- "\n# Lists\n\n```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n```\n\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. 
Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n\n------\n\n# Task lists\n\n```\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [x] this is a complete item\n- [ ] this is an incomplete item\n```\n\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [ ] this is a complete item\n- [ ] this is an incomplete item\n\n------\n\n# Ignoring Markdown formatting\n\nYou can tell GitHub to ignore (or escape) Markdown formatting by using \\ before the Markdown character.\n\n```\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n```\n\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n\n------\n" -- "\n# Links\n\n```\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with 
title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n```\n\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n\n------\n\n# Images\n\n```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike 
links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n```\n\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n\n------\n\n# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n```\n\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n\n------\n" -- "\n# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\nInline `code` has `back-ticks around` it.\n\n```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n [Obsolete(\"...\")]\n class Program : IInterface\n {\n public static List JustDoIt(int count)\n {\n Console.WriteLine($\"Hello {Name}!\");\n return new List(new int[] { 1, 2, 3 })\n }\n }\n}\n```\n\n```css\n@font-face 
{\n font-family: Chunkfive; src: url('Chunkfive.otf');\n}\n\nbody, .usertext {\n color: #F0F0F0; background: #600;\n font-family: Chunkfive, sans;\n}\n\n@import url(print.css);\n@media print {\n a[href^=http]::after {\n content: attr(href)\n }\n}\n```\n\n```javascript\nfunction $initHighlight(block, cls) {\n try {\n if (cls.search(/\\bno\\-highlight\\b/) != -1)\n return process(block, true, 0x0F) +\n ` class=\"${cls}\"`;\n } catch (e) {\n /* handle exception */\n }\n for (var i = 0 / 2; i < classes.length; i++) {\n if (checkCondition(classes[i]) === undefined)\n console.log('undefined');\n }\n}\n\nexport $initHighlight;\n```\n\n```php\nrequire_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory\n{\n abstract function test();\n\n public static $st1 = 1;\n const ME = \"Yo\";\n var $list = NULL;\n private $var;\n\n /**\n * Returns a URI\n *\n * @return URI\n */\n static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? $uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check\n if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }\n\n $this->var = 0 - self::$st;\n $this->list = list(Array(\"1\"=> 2, 2=>self::ME, 3 => \\Location\\Web\\URI::class));\n\n return [\n 'uri' => $uri,\n 'value' => null,\n ];\n }\n}\n\necho URI::ME . URI::$st1;\n\n__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------\n" -- "\n# Tables\n\n```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. 
You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n```\n\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. 
You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------\n\n# Blockquotes\n\n```\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n```\n\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n\n------\n" -- "\n# Inline HTML\n\n```\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n```\n\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n\n------\n\n# Horizontal Rules\n\n```\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n```\n\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n\n------\n\n# YouTube Videos\n\n```\n\n\"IMAGE\n\n```\n\n\n\"IMAGE\n\n\n```\n[![IMAGE ALT TEXT HERE](http://img.youtube.com/vi/YOUTUBE_VIDEO_ID_HERE/0.jpg)](http://www.youtube.com/watch?v=YOUTUBE_VIDEO_ID_HERE)\n```\n\n[![IMAGE ALT TEXT HERE](https://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/YouTube_logo_2015.svg/1200px-YouTube_logo_2015.svg.png)](https://www.youtube.com/watch?v=ciawICBvQoE)\n" +- "# Headers\n\n```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n```\n\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n\n------\n\n# Emphasis\n\n```\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n```\n\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n\n------\n\n" +- "# Lists\n\n```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. 
And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n```\n\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. 
Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n\n------\n\n# Task lists\n\n```\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [x] this is a complete item\n- [ ] this is an incomplete item\n```\n\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [ ] this is a complete item\n- [ ] this is an incomplete item\n\n------\n\n# Ignoring Markdown formatting\n\nYou can tell GitHub to ignore (or escape) Markdown formatting by using \\ before the Markdown character.\n\n```\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n```\n\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n\n------\n\n" +- "# Links\n\n```\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive 
reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n```\n\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n\n------\n\n# Images\n\n```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n```\n\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 
2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n\n------\n\n# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n```\n\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n\n------\n\n" +- "# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\nInline `code` has `back-ticks around` it.\n\n```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n [Obsolete(\"...\")]\n class Program : IInterface\n {\n public static List JustDoIt(int count)\n {\n Console.WriteLine($\"Hello {Name}!\");\n return new List(new int[] { 1, 2, 3 })\n }\n }\n}\n```\n\n```css\n@font-face {\n font-family: Chunkfive; src: url('Chunkfive.otf');\n}\n\nbody, .usertext {\n color: #F0F0F0; background: #600;\n font-family: Chunkfive, sans;\n}\n\n@import url(print.css);\n@media print {\n a[href^=http]::after {\n content: attr(href)\n }\n}\n```\n\n```javascript\nfunction $initHighlight(block, cls) {\n try {\n if (cls.search(/\\bno\\-highlight\\b/) != -1)\n return process(block, true, 0x0F) +\n ` class=\"${cls}\"`;\n } catch (e) {\n /* handle exception */\n }\n for (var i = 0 / 2; i < classes.length; i++) {\n if 
(checkCondition(classes[i]) === undefined)\n console.log('undefined');\n }\n}\n\nexport $initHighlight;\n```\n\n```php\nrequire_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory\n{\n abstract function test();\n\n public static $st1 = 1;\n const ME = \"Yo\";\n var $list = NULL;\n private $var;\n\n /**\n * Returns a URI\n *\n * @return URI\n */\n static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? $uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check\n if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }\n\n $this->var = 0 - self::$st;\n $this->list = list(Array(\"1\"=> 2, 2=>self::ME, 3 => \\Location\\Web\\URI::class));\n\n return [\n 'uri' => $uri,\n 'value' => null,\n ];\n }\n}\n\necho URI::ME . URI::$st1;\n\n__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------\n\n" +- "# Tables\n\n```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. 
You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n```\n\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. 
You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------\n\n# Blockquotes\n\n```\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n```\n\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n\n------\n\n" +- "# Inline HTML\n\n```\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n```\n\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n\n------\n\n# Horizontal Rules\n\n```\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n```\n\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n\n------\n\n# YouTube Videos\n\n```\n\n\"IMAGE\n\n```\n\n\n\"IMAGE\n\n\n```\n[![IMAGE ALT TEXT HERE](http://img.youtube.com/vi/YOUTUBE_VIDEO_ID_HERE/0.jpg)](http://www.youtube.com/watch?v=YOUTUBE_VIDEO_ID_HERE)\n```\n\n[![IMAGE ALT TEXT HERE](https://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/YouTube_logo_2015.svg/1200px-YouTube_logo_2015.svg.png)](https://www.youtube.com/watch?v=ciawICBvQoE)\n" diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md.snap index 2a646144..32dad0b1 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md.snap @@ -3,23 +3,25 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/github_flavored.md --- -- "# Headers\n\n" -- "```\n# h1 Heading 8-)\n" +- "# Headers\n\n```\n" +- "# h1 Heading 8-)\n" - "## h2 Heading\n### h3 Heading\n" - "#### h4 Heading\n##### h5 Heading\n" - "###### h6 Heading\n\n" - "Alternatively, for H1 and H2, an" - " underline-ish style:\n\nAlt-H1\n======\n\n" -- "Alt-H2\n------\n```\n\n# " -- "h1 Heading 8-)\n## " -- "h2 Heading\n### h3 Heading" -- "\n#### h4 Heading\n##### h5 Heading" -- "\n###### h6 Heading\n\n" +- "Alt-H2\n------\n```\n\n" +- "# h1 Heading 8-)\n" +- "## h2 Heading\n" +- "### h3 Heading\n" +- "#### h4 Heading\n" +- "##### h5 Heading\n" +- "###### h6 Heading\n\n" - "Alternatively, for H1 and H2, an" -- " underline-ish style:\n\nAlt-H1\n======\n\n" -- "Alt-H2\n------\n\n------\n" -- "\n# Emphasis\n\n" -- "```\n" +- " underline-ish style:\n\n" +- "Alt-H1\n======\n\nAlt-H2\n" +- "------\n\n------\n\n" +- "# 
Emphasis\n\n```\n" - "Emphasis, aka italics, with *" - "asterisks* or _underscores_.\n\n" - "Strong emphasis, aka bold, with **asterisks" @@ -45,10 +47,9 @@ input_file: tests/inputs/markdown/github_flavored.md - "__This is bold text__\n\n" - "*This is italic text*\n\n" - "_This is italic text_\n\n" -- "~~Strikethrough~~\n\n------\n" -- "\n# Lists\n\n" -- "```\n1. First ordered list item\n" -- "2. Another item\n" +- "~~Strikethrough~~\n\n------\n\n" +- "# Lists\n\n```\n1. " +- "First ordered list item\n2. Another item\n" - ⋅⋅* Unordered sub - "-list.\n1. " - "Actual numbers don't matter, just that it's" @@ -139,9 +140,9 @@ input_file: tests/inputs/markdown/github_flavored.md - "+ Facilisis in pretium nisl " - "aliquet\n " - "- Nulla volutpat aliquam velit\n" -- "+ Very easy!\n\n------\n" -- "\n# Task lists\n\n" -- "```\n- [x] Finish my changes\n" +- "+ Very easy!\n\n------\n\n" +- "# Task lists\n\n```\n" +- "- [x] Finish my changes\n" - "- [ ] Push my commits to GitHub\n" - "- [ ] Open a pull request\n" - "- [x] @mentions, #refs," @@ -161,8 +162,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "- [x] list syntax required (any unordered" - " or ordered list supported)\n" - "- [ ] this is a complete item\n" -- "- [ ] this is an incomplete item\n\n------\n" -- "\n# Ignoring Markdown formatting\n\n" +- "- [ ] this is an incomplete item\n\n------\n\n" +- "# Ignoring Markdown formatting\n\n" - You can tell GitHub to ignore (or escape) - " Markdown formatting by using \\ before the Markdown character.\n\n" - "```\n" @@ -170,9 +171,8 @@ input_file: tests/inputs/markdown/github_flavored.md - " to \\*our-old-project\\*.\n" - "```\n\n" - "Let's rename \\*our-new-project\\*" -- " to \\*our-old-project\\*.\n\n------\n" -- "\n# Links\n\n" -- "```\n" +- " to \\*our-old-project\\*.\n\n------\n\n" +- "# Links\n\n```\n" - "[I'm an inline-style link](https://" - "www.google.com)\n\n" - "[I'm an inline-style link with title](https" @@ -222,9 +222,8 @@ input_file: 
tests/inputs/markdown/github_flavored.md - "://www.mozilla.org\n" - "[1]: http://slashdot.org\n" - "[link text itself]: http://www.reddit.com" -- "\n\n------\n" -- "\n# Images\n\n" -- "```\n" +- "\n\n------\n\n" +- "# Images\n\n```\n" - "Here's our logo (hover to see the title" - " text):\n\nInline-style:\n![" - "alt text](https://github.com/adam-p" @@ -273,10 +272,9 @@ input_file: tests/inputs/markdown/github_flavored.md - " location:\n\n" - "[id]: https://octodex.github.com/images" - "/dojocat.jpg \"The " -- "Dojocat\"\n\n------\n" -- "\n# " -- "[Footnotes](https://github.com/markdown" -- "-it/markdown-it-footnote)\n\n" +- "Dojocat\"\n\n------\n\n" +- "# [Footnotes](https://github.com/" +- "markdown-it/markdown-it-footnote)\n\n" - "```\nFootnote 1 link[^first].\n\n" - "Footnote 2 link[^second].\n\n" - "Inline footnote^[Text of inline footnote] definition.\n\n" @@ -290,9 +288,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "Duplicated footnote reference[^second].\n\n" - "[^first]: Footnote **can have markup**\n\n" - " and multiple paragraphs.\n\n" -- "[^second]: Footnote text.\n\n------\n" -- "\n# Code and Syntax Highlighting\n\n" -- "```\n" +- "[^second]: Footnote text.\n\n------\n\n" +- "# Code and Syntax Highlighting\n\n```\n" - "Inline `code` has `back-ticks around" - "` it.\n```\n\n" - "Inline `code` has `back-ticks around" @@ -370,9 +367,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "URI::$st1;\n\n" - "__halt_compiler () ; datahere\n" - "datahere\ndatahere */\ndatahere\n" -- "```\n\n------\n" -- "\n# Tables\n\n" -- "```\n" +- "```\n\n------\n\n" +- "# Tables\n\n```\n" - "Colons can be used to align columns.\n\n" - "| Tables | Are | Cool |\n" - "| ------------- |:-------------:| -----:|\n" @@ -461,9 +457,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "| Name | Character |\n" - "| --- | --- |\n" - "| Backtick | ` |\n" -- "| Pipe | \\| |\n\n------\n" -- "\n# Blockquotes\n\n" -- "```\n" +- "| Pipe | \\| |\n\n------\n\n" +- 
"# Blockquotes\n\n```\n" - "> Blockquotes are very handy in email to emulate" - " reply text.\n" - "> This line is part of the same quote.\n\n" @@ -492,9 +487,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "> Blockquotes can also be nested...\n" - ">> ...by using additional greater-than signs right next" - " to each other...\n> > " -- "> ...or with spaces between arrows.\n\n------\n" -- "\n# Inline HTML\n\n" -- "```\n
    \n" +- "> ...or with spaces between arrows.\n\n------\n\n" +- "# Inline HTML\n\n```\n
    \n" - "
    Definition list
    \n" - "
    Is something people use sometimes.\n\n" @@ -502,7 +496,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "
    Does *not* work **" - "very** well. " - Use HTML tags.
    \n
    \n```\n\n
    \n" +- ">\n
    \n```\n\n" +- "
    \n" - "
    Definition list
    \n" - "
    Is something people use sometimes.\n\n " @@ -510,15 +505,14 @@ input_file: tests/inputs/markdown/github_flavored.md - "
    Does *not* work **" - "very** well. " - Use HTML tags.
    \n
    \n\n------\n" -- "\n# Horizontal Rules\n\n" -- "```\nThree or more...\n\n---\n\n" -- "Hyphens\n\n***\n\nAsterisks\n\n___\n\n" -- "Underscores\n```\n\nThree or more...\n\n" +- ">\n
    \n\n------\n\n" +- "# Horizontal Rules\n\n```\nThree or more...\n\n" - "---\n\nHyphens\n\n***\n\nAsterisks\n\n" -- "___\n\nUnderscores\n\n------\n" -- "\n# YouTube Videos\n\n" -- "```\n" +- "___\n\nUnderscores\n```\n\n" +- "Three or more...\n\n---\n\nHyphens\n\n***\n" +- "\nAsterisks\n\n___\n\nUnderscores\n\n" +- "------\n\n" +- "# YouTube Videos\n\n```\n" - "\n
  • Main
  • \n
  • Basics
  • \n
  • Syntax
  • \n" -- "
  • License
  • \n
  • Dingus
  • \n\n\n\nGetting the Gist of Markdown's Formatting Syntax\n------------------------------------------------\n\n" -- "This page offers a brief overview of what it's like to use Markdown.\nThe [syntax page] [s] provides complete, detailed documentation for\nevery feature, but Markdown should be very easy to pick up simply by\nlooking at a few examples of it in action. The examples on this page\nare written in a before/after style, showing example syntax and the\nHTML output produced by Markdown.\n\n" +- "
  • License
  • \n
  • Dingus
  • \n\n\n\n" +- "Getting the Gist of Markdown's Formatting Syntax\n------------------------------------------------\n\nThis page offers a brief overview of what it's like to use Markdown.\nThe [syntax page] [s] provides complete, detailed documentation for\nevery feature, but Markdown should be very easy to pick up simply by\nlooking at a few examples of it in action. The examples on this page\nare written in a before/after style, showing example syntax and the\nHTML output produced by Markdown.\n\n" - "It's also helpful to simply try Markdown out; the [Dingus] [d] is a\nweb application that allows you type your own Markdown-formatted text\nand translate it to XHTML.\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL] [src].\n" -- "\n [s]: /projects/markdown/syntax \"Markdown Syntax\"\n [d]: /projects/markdown/dingus \"Markdown Dingus\"\n [src]: /projects/markdown/basics.text\n\n\n## Paragraphs, Headers, Blockquotes ##\n\n" -- "A paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like\na blank line -- a line containing nothing but spaces or tabs is\nconsidered blank.) Normal paragraphs should not be indented with\nspaces or tabs.\n\n" +- "\n [s]: /projects/markdown/syntax \"Markdown Syntax\"\n [d]: /projects/markdown/dingus \"Markdown Dingus\"\n [src]: /projects/markdown/basics.text\n\n\n" +- "## Paragraphs, Headers, Blockquotes ##\n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like\na blank line -- a line containing nothing but spaces or tabs is\nconsidered blank.) Normal paragraphs should not be indented with\nspaces or tabs.\n\n" - "Markdown offers two styles of headers: *Setext* and *atx*.\nSetext-style headers for `

    ` and `

    ` are created by\n\"underlining\" with equal signs (`=`) and hyphens (`-`), respectively.\nTo create an atx-style header, you put 1-6 hash marks (`#`) at the\nbeginning of the line -- the number of hashes equals the resulting\nHTML header level.\n\n" - "Blockquotes are indicated using email-style '`>`' angle brackets.\n\nMarkdown:\n\n " - "A First Level Header\n ====================\n\n A Second Level Header\n ---------------------\n\n Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.\n\n The quick brown fox jumped over the lazy\n dog's back.\n\n ### Header 3\n\n > This is a blockquote.\n >\n > This is the second paragraph in the blockquote.\n >\n" - " > ## This is an H2 in a blockquote\n\n\nOutput:\n\n " - "

    A First Level Header

    \n\n

    A Second Level Header

    \n\n

    Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.

    \n\n

    The quick brown fox jumped over the lazy\n dog's back.

    \n\n

    Header 3

    \n\n
    \n" -- "

    This is a blockquote.

    \n\n

    This is the second paragraph in the blockquote.

    \n\n

    This is an H2 in a blockquote

    \n
    \n\n\n\n### Phrase Emphasis ###\n\nMarkdown uses asterisks and underscores to indicate spans of emphasis.\n\nMarkdown:\n\n " -- "Some of these words *are emphasized*.\n Some of these words _are emphasized also_.\n\n Use two asterisks for **strong emphasis**.\n Or, if you prefer, __use two underscores instead__.\n\nOutput:\n\n " -- "

    Some of these words are emphasized.\n Some of these words are emphasized also.

    \n\n

    Use two asterisks for strong emphasis.\n Or, if you prefer, use two underscores instead.

    \n" -- "\n\n\n## Lists ##\n\nUnordered (bulleted) lists use asterisks, pluses, and hyphens (`*`,\n`+`, and `-`) as list markers. These three markers are\ninterchangable; this:\n\n * Candy.\n * Gum.\n * Booze.\n\nthis:\n\n + Candy.\n + Gum.\n + Booze.\n\nand this:\n\n - Candy.\n - Gum.\n - Booze.\n" +- "

    This is a blockquote.

    \n\n

    This is the second paragraph in the blockquote.

    \n\n

    This is an H2 in a blockquote

    \n \n\n\n\n" +- "### Phrase Emphasis ###\n\nMarkdown uses asterisks and underscores to indicate spans of emphasis.\n\nMarkdown:\n\n Some of these words *are emphasized*.\n Some of these words _are emphasized also_.\n\n Use two asterisks for **strong emphasis**.\n Or, if you prefer, __use two underscores instead__.\n\nOutput:\n\n " +- "

    Some of these words are emphasized.\n Some of these words are emphasized also.

    \n\n

    Use two asterisks for strong emphasis.\n Or, if you prefer, use two underscores instead.

    \n\n\n\n" +- "## Lists ##\n\nUnordered (bulleted) lists use asterisks, pluses, and hyphens (`*`,\n`+`, and `-`) as list markers. These three markers are\ninterchangable; this:\n\n * Candy.\n * Gum.\n * Booze.\n\nthis:\n\n + Candy.\n + Gum.\n + Booze.\n\nand this:\n\n - Candy.\n - Gum.\n - Booze.\n" - "\nall produce the same output:\n\n
      \n
    • Candy.
    • \n
    • Gum.
    • \n
    • Booze.
    • \n
    \n\nOrdered (numbered) lists use regular numbers, followed by periods, as\nlist markers:\n\n 1. Red\n 2. Green\n 3. Blue\n\nOutput:\n\n " - "
      \n
    1. Red
    2. \n
    3. Green
    4. \n
    5. Blue
    6. \n
    \n\nIf you put blank lines between items, you'll get `

    ` tags for the\nlist item text. You can create multi-paragraph list items by indenting\nthe paragraphs by 4 spaces or 1 tab:\n\n * A list item.\n\n With multiple paragraphs.\n\n * Another item in the list.\n\nOutput:\n\n " -- "

      \n
    • A list item.

      \n

      With multiple paragraphs.

    • \n
    • Another item in the list.

    • \n
    \n" -- "\n\n\n### Links ###\n\nMarkdown supports two styles for creating links: *inline* and\n*reference*. With both styles, you use square brackets to delimit the\ntext you want to turn into a link.\n\nInline-style links use parentheses immediately after the link text.\nFor example:\n\n This is an [example link](http://example.com/).\n\nOutput:\n\n

    This is an \n example link.

    \n" +- "
      \n
    • A list item.

      \n

      With multiple paragraphs.

    • \n
    • Another item in the list.

    • \n
    \n\n\n\n" +- "### Links ###\n\nMarkdown supports two styles for creating links: *inline* and\n*reference*. With both styles, you use square brackets to delimit the\ntext you want to turn into a link.\n\nInline-style links use parentheses immediately after the link text.\nFor example:\n\n This is an [example link](http://example.com/).\n\nOutput:\n\n

    This is an \n example link.

    \n" - "\nOptionally, you may include a title attribute in the parentheses:\n\n This is an [example link](http://example.com/ \"With a Title\").\n\nOutput:\n\n

    This is an \n example link.

    \n\nReference-style links allow you to refer to your links by names, which\nyou define elsewhere in your document:\n\n " - "I get 10 times more traffic from [Google][1] than from\n [Yahoo][2] or [MSN][3].\n\n [1]: http://google.com/ \"Google\"\n [2]: http://search.yahoo.com/ \"Yahoo Search\"\n [3]: http://search.msn.com/ \"MSN Search\"\n\nOutput:\n\n " - "

    I get 10 times more traffic from Google than from Yahoo or MSN.

    \n\nThe title attribute is optional. Link names may contain letters,\nnumbers and spaces, but are *not* case sensitive:\n\n " -- "I start my morning with a cup of coffee and\n [The New York Times][NY Times].\n\n [ny times]: http://www.nytimes.com/\n\nOutput:\n\n

    I start my morning with a cup of coffee and\n The New York Times.

    \n\n\n### Images ###\n\nImage syntax is very much like link syntax.\n\nInline (titles are optional):\n\n " -- "![alt text](/path/to/img.jpg \"Title\")\n\nReference-style:\n\n ![alt text][id]\n\n [id]: /path/to/img.jpg \"Title\"\n\nBoth of the above examples produce the same output:\n\n \"alt\n" -- "\n\n\n### Code ###\n\nIn a regular paragraph, you can create code span by wrapping text in\nbacktick quotes. Any ampersands (`&`) and angle brackets (`<` or\n`>`) will automatically be translated into HTML entities. This makes\nit easy to use Markdown to write about HTML example code:\n\n " +- "I start my morning with a cup of coffee and\n [The New York Times][NY Times].\n\n [ny times]: http://www.nytimes.com/\n\nOutput:\n\n

    I start my morning with a cup of coffee and\n The New York Times.

    \n\n\n" +- "### Images ###\n\nImage syntax is very much like link syntax.\n\nInline (titles are optional):\n\n ![alt text](/path/to/img.jpg \"Title\")\n\nReference-style:\n\n ![alt text][id]\n\n [id]: /path/to/img.jpg \"Title\"\n\nBoth of the above examples produce the same output:\n\n \"alt\n\n\n\n" +- "### Code ###\n\nIn a regular paragraph, you can create code span by wrapping text in\nbacktick quotes. Any ampersands (`&`) and angle brackets (`<` or\n`>`) will automatically be translated into HTML entities. This makes\nit easy to use Markdown to write about HTML example code:\n\n " - "I strongly recommend against using any `` tags.\n\n I wish SmartyPants used named entities like `—`\n instead of decimal-encoded entities like `—`.\n\nOutput:\n\n " - "

    I strongly recommend against using any\n <blink> tags.

    \n\n

    I wish SmartyPants used named entities like\n &mdash; instead of decimal-encoded\n entities like &#8212;.

    \n" - "\n\nTo specify an entire block of pre-formatted code, indent every line of\nthe block by 4 spaces or 1 tab. Just like with code spans, `&`, `<`,\nand `>` characters will be escaped automatically.\n\nMarkdown:\n\n If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:\n\n
    \n

    For example.

    \n
    \n\nOutput:\n\n " diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@markdown_basics.md-3.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@markdown_basics.md-3.snap index 1e94f9be..ca2b1799 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@markdown_basics.md-3.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@markdown_basics.md-3.snap @@ -3,7 +3,7 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/markdown_basics.md --- -- "Markdown: Basics\n================\n\n\n\n\nGetting the Gist of Markdown's Formatting Syntax\n------------------------------------------------\n\nThis page offers a brief overview of what it's like to use Markdown.\nThe [syntax page] [s] provides complete, detailed documentation for\nevery feature, but Markdown should be very easy to pick up simply by\nlooking at a few examples of it in action. The examples on this page\nare written in a before/after style, showing example syntax and the\nHTML output produced by Markdown.\n\nIt's also helpful to simply try Markdown out; the [Dingus] [d] is a\nweb application that allows you type your own Markdown-formatted text\nand translate it to XHTML.\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL] [src].\n\n [s]: /projects/markdown/syntax \"Markdown Syntax\"\n [d]: /projects/markdown/dingus \"Markdown Dingus\"\n [src]: /projects/markdown/basics.text\n\n\n## Paragraphs, Headers, Blockquotes ##\n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like\na blank line -- a line containing nothing but spaces or tabs is\nconsidered blank.) Normal paragraphs should not be indented with\nspaces or tabs.\n\nMarkdown offers two styles of headers: *Setext* and *atx*.\nSetext-style headers for `

    ` and `

    ` are created by\n\"underlining\" with equal signs (`=`) and hyphens (`-`), respectively.\nTo create an atx-style header, you put 1-6 hash marks (`#`) at the\nbeginning of the line -- the number of hashes equals the resulting\nHTML header level.\n\nBlockquotes are indicated using email-style '`>`' angle brackets.\n\nMarkdown:\n\n A First Level Header\n ====================\n\n A Second Level Header\n ---------------------\n\n Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.\n\n The quick brown fox jumped over the lazy\n dog's back.\n\n ### Header 3\n\n > This is a blockquote.\n >\n > This is the second paragraph in the blockquote.\n >\n > ## This is an H2 in a blockquote\n\n\nOutput:\n\n

    A First Level Header

    \n\n

    A Second Level Header

    \n\n

    Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.

    \n\n

    The quick brown fox jumped over the lazy\n dog's back.

    \n\n

    Header 3

    \n\n
    \n

    This is a blockquote.

    \n\n

    This is the second paragraph in the blockquote.

    \n\n

    This is an H2 in a blockquote

    \n
    \n\n\n\n### Phrase Emphasis ###\n\nMarkdown uses asterisks and underscores to indicate spans of emphasis.\n\nMarkdown:\n\n Some of these words *are emphasized*.\n Some of these words _are emphasized also_.\n\n Use two asterisks for **strong emphasis**.\n Or, if you prefer, __use two underscores instead__.\n\nOutput:\n\n

    Some of these words are emphasized.\n Some of these words are emphasized also.

    \n\n

    Use two asterisks for strong emphasis.\n Or, if you prefer, use two underscores instead.

    \n\n\n\n## Lists ##\n\nUnordered (bulleted) lists use asterisks, pluses, and hyphens (`*`,\n`+`, and `-`) as list markers. These three markers are\ninterchangable; this:\n\n * Candy.\n * Gum.\n * Booze.\n\nthis:\n\n + Candy.\n + Gum.\n + Booze.\n" -- "\nand this:\n\n - Candy.\n - Gum.\n - Booze.\n\nall produce the same output:\n\n
      \n
    • Candy.
    • \n
    • Gum.
    • \n
    • Booze.
    • \n
    \n\nOrdered (numbered) lists use regular numbers, followed by periods, as\nlist markers:\n\n 1. Red\n 2. Green\n 3. Blue\n\nOutput:\n\n
      \n
    1. Red
    2. \n
    3. Green
    4. \n
    5. Blue
    6. \n
    \n\nIf you put blank lines between items, you'll get `

    ` tags for the\nlist item text. You can create multi-paragraph list items by indenting\nthe paragraphs by 4 spaces or 1 tab:\n\n * A list item.\n\n With multiple paragraphs.\n\n * Another item in the list.\n\nOutput:\n\n

      \n
    • A list item.

      \n

      With multiple paragraphs.

    • \n
    • Another item in the list.

    • \n
    \n\n\n\n### Links ###\n\nMarkdown supports two styles for creating links: *inline* and\n*reference*. With both styles, you use square brackets to delimit the\ntext you want to turn into a link.\n\nInline-style links use parentheses immediately after the link text.\nFor example:\n\n This is an [example link](http://example.com/).\n\nOutput:\n\n

    This is an \n example link.

    \n\nOptionally, you may include a title attribute in the parentheses:\n\n This is an [example link](http://example.com/ \"With a Title\").\n\nOutput:\n\n

    This is an \n example link.

    \n\nReference-style links allow you to refer to your links by names, which\nyou define elsewhere in your document:\n\n I get 10 times more traffic from [Google][1] than from\n [Yahoo][2] or [MSN][3].\n\n [1]: http://google.com/ \"Google\"\n [2]: http://search.yahoo.com/ \"Yahoo Search\"\n [3]: http://search.msn.com/ \"MSN Search\"\n\nOutput:\n\n

    I get 10 times more traffic from Google than from Yahoo or MSN.

    \n\nThe title attribute is optional. Link names may contain letters,\nnumbers and spaces, but are *not* case sensitive:\n\n I start my morning with a cup of coffee and\n [The New York Times][NY Times].\n\n [ny times]: http://www.nytimes.com/\n\nOutput:\n\n

    I start my morning with a cup of coffee and\n The New York Times.

    \n\n\n### Images ###\n\nImage syntax is very much like link syntax.\n\nInline (titles are optional):\n\n ![alt text](/path/to/img.jpg \"Title\")\n\nReference-style:\n\n ![alt text][id]\n\n [id]: /path/to/img.jpg \"Title\"\n\nBoth of the above examples produce the same output:\n\n \"alt\n\n\n\n### Code ###\n\nIn a regular paragraph, you can create code span by wrapping text in\nbacktick quotes. Any ampersands (`&`) and angle brackets (`<` or\n`>`) will automatically be translated into HTML entities. This makes\nit easy to use Markdown to write about HTML example code:\n\n I strongly recommend against using any `` tags.\n\n I wish SmartyPants used named entities like `—`\n instead of decimal-encoded entities like `—`.\n\nOutput:\n\n

    I strongly recommend against using any\n <blink> tags.

    \n\n

    I wish SmartyPants used named entities like\n &mdash; instead of decimal-encoded\n entities like &#8212;.

    \n\n\nTo specify an entire block of pre-formatted code, indent every line of\nthe block by 4 spaces or 1 tab. Just like with code spans, `&`, `<`,\nand `>` characters will be escaped automatically.\n\nMarkdown:\n\n " -- "If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:\n\n
    \n

    For example.

    \n
    \n\nOutput:\n\n

    If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:

    \n\n
    <blockquote>\n        <p>For example.</p>\n    </blockquote>\n    
    \n" +- "Markdown: Basics\n================\n\n\n\n\nGetting the Gist of Markdown's Formatting Syntax\n------------------------------------------------\n\nThis page offers a brief overview of what it's like to use Markdown.\nThe [syntax page] [s] provides complete, detailed documentation for\nevery feature, but Markdown should be very easy to pick up simply by\nlooking at a few examples of it in action. The examples on this page\nare written in a before/after style, showing example syntax and the\nHTML output produced by Markdown.\n\nIt's also helpful to simply try Markdown out; the [Dingus] [d] is a\nweb application that allows you type your own Markdown-formatted text\nand translate it to XHTML.\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL] [src].\n\n [s]: /projects/markdown/syntax \"Markdown Syntax\"\n [d]: /projects/markdown/dingus \"Markdown Dingus\"\n [src]: /projects/markdown/basics.text\n\n\n## Paragraphs, Headers, Blockquotes ##\n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like\na blank line -- a line containing nothing but spaces or tabs is\nconsidered blank.) Normal paragraphs should not be indented with\nspaces or tabs.\n\nMarkdown offers two styles of headers: *Setext* and *atx*.\nSetext-style headers for `

    ` and `

    ` are created by\n\"underlining\" with equal signs (`=`) and hyphens (`-`), respectively.\nTo create an atx-style header, you put 1-6 hash marks (`#`) at the\nbeginning of the line -- the number of hashes equals the resulting\nHTML header level.\n\nBlockquotes are indicated using email-style '`>`' angle brackets.\n\nMarkdown:\n\n A First Level Header\n ====================\n\n A Second Level Header\n ---------------------\n\n Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.\n\n The quick brown fox jumped over the lazy\n dog's back.\n\n ### Header 3\n\n > This is a blockquote.\n >\n > This is the second paragraph in the blockquote.\n >\n > ## This is an H2 in a blockquote\n\n\nOutput:\n\n

    A First Level Header

    \n\n

    A Second Level Header

    \n\n

    Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.

    \n\n

    The quick brown fox jumped over the lazy\n dog's back.

    \n\n

    Header 3

    \n\n
    \n

    This is a blockquote.

    \n\n

    This is the second paragraph in the blockquote.

    \n\n

    This is an H2 in a blockquote

    \n
    \n\n\n\n### Phrase Emphasis ###\n\nMarkdown uses asterisks and underscores to indicate spans of emphasis.\n\nMarkdown:\n\n Some of these words *are emphasized*.\n Some of these words _are emphasized also_.\n\n Use two asterisks for **strong emphasis**.\n Or, if you prefer, __use two underscores instead__.\n\nOutput:\n\n

    Some of these words are emphasized.\n Some of these words are emphasized also.

    \n\n

    Use two asterisks for strong emphasis.\n Or, if you prefer, use two underscores instead.

    \n\n\n\n## Lists ##\n\nUnordered (bulleted) lists use asterisks, pluses, and hyphens (`*`,\n`+`, and `-`) as list markers. These three markers are\ninterchangable; this:\n\n * Candy.\n * Gum.\n * Booze.\n\nthis:\n\n + Candy.\n + Gum.\n + Booze.\n\n" +- "and this:\n\n - Candy.\n - Gum.\n - Booze.\n\nall produce the same output:\n\n
      \n
    • Candy.
    • \n
    • Gum.
    • \n
    • Booze.
    • \n
    \n\nOrdered (numbered) lists use regular numbers, followed by periods, as\nlist markers:\n\n 1. Red\n 2. Green\n 3. Blue\n\nOutput:\n\n
      \n
    1. Red
    2. \n
    3. Green
    4. \n
    5. Blue
    6. \n
    \n\nIf you put blank lines between items, you'll get `

    ` tags for the\nlist item text. You can create multi-paragraph list items by indenting\nthe paragraphs by 4 spaces or 1 tab:\n\n * A list item.\n\n With multiple paragraphs.\n\n * Another item in the list.\n\nOutput:\n\n

      \n
    • A list item.

      \n

      With multiple paragraphs.

    • \n
    • Another item in the list.

    • \n
    \n\n\n\n### Links ###\n\nMarkdown supports two styles for creating links: *inline* and\n*reference*. With both styles, you use square brackets to delimit the\ntext you want to turn into a link.\n\nInline-style links use parentheses immediately after the link text.\nFor example:\n\n This is an [example link](http://example.com/).\n\nOutput:\n\n

    This is an \n example link.

    \n\nOptionally, you may include a title attribute in the parentheses:\n\n This is an [example link](http://example.com/ \"With a Title\").\n\nOutput:\n\n

    This is an \n example link.

    \n\nReference-style links allow you to refer to your links by names, which\nyou define elsewhere in your document:\n\n I get 10 times more traffic from [Google][1] than from\n [Yahoo][2] or [MSN][3].\n\n [1]: http://google.com/ \"Google\"\n [2]: http://search.yahoo.com/ \"Yahoo Search\"\n [3]: http://search.msn.com/ \"MSN Search\"\n\nOutput:\n\n

    I get 10 times more traffic from Google than from Yahoo or MSN.

    \n\nThe title attribute is optional. Link names may contain letters,\nnumbers and spaces, but are *not* case sensitive:\n\n I start my morning with a cup of coffee and\n [The New York Times][NY Times].\n\n [ny times]: http://www.nytimes.com/\n\nOutput:\n\n

    I start my morning with a cup of coffee and\n The New York Times.

    \n\n\n### Images ###\n\nImage syntax is very much like link syntax.\n\nInline (titles are optional):\n\n ![alt text](/path/to/img.jpg \"Title\")\n\nReference-style:\n\n ![alt text][id]\n\n [id]: /path/to/img.jpg \"Title\"\n\nBoth of the above examples produce the same output:\n\n \"alt\n\n\n\n" +- "### Code ###\n\nIn a regular paragraph, you can create code span by wrapping text in\nbacktick quotes. Any ampersands (`&`) and angle brackets (`<` or\n`>`) will automatically be translated into HTML entities. This makes\nit easy to use Markdown to write about HTML example code:\n\n I strongly recommend against using any `` tags.\n\n I wish SmartyPants used named entities like `—`\n instead of decimal-encoded entities like `—`.\n\nOutput:\n\n

    I strongly recommend against using any\n <blink> tags.

    \n\n

    I wish SmartyPants used named entities like\n &mdash; instead of decimal-encoded\n entities like &#8212;.

    \n\n\nTo specify an entire block of pre-formatted code, indent every line of\nthe block by 4 spaces or 1 tab. Just like with code spans, `&`, `<`,\nand `>` characters will be escaped automatically.\n\nMarkdown:\n\n If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:\n\n
    \n

    For example.

    \n
    \n\nOutput:\n\n

    If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:

    \n\n
    <blockquote>\n        <p>For example.</p>\n    </blockquote>\n    
    \n" diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@markdown_basics.md.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@markdown_basics.md.snap index de44e823..b4471494 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@markdown_basics.md.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@markdown_basics.md.snap @@ -20,8 +20,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - "
  • Dingus
  • \n" -- "\n" -- "\n\nGetting the Gist of Markdown's Formatting Syntax" +- "\n\n\n" +- "Getting the Gist of Markdown's Formatting Syntax" - "\n------------------------------------------------\n\n" - "This page offers a brief overview of what it's" - " like to use Markdown.\nThe [syntax page]" @@ -50,8 +50,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - " [d]: /projects/markdown/" - "dingus \"Markdown Dingus\"\n" - " [src]: /projects/markdown/" -- "basics.text\n\n\n## " -- "Paragraphs, Headers, Blockquotes ##\n\n" +- "basics.text\n\n\n" +- "## Paragraphs, Headers, Blockquotes ##\n\n" - A paragraph is simply one or more consecutive lines of - " text, separated\n" - "by one or more blank lines. " @@ -107,7 +107,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - " blockquote.

    \n\n " - "

    This is an H2 in" - " a blockquote

    \n " -- "\n\n\n\n### Phrase Emphasis ###\n\n" +- "\n\n\n\n" +- "### Phrase Emphasis ###\n\n" - Markdown uses asterisks and underscores to indicate spans of - " emphasis.\n\nMarkdown:\n\n " - "Some of these words *are emphasized*.\n" @@ -123,8 +124,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - "

    Use two asterisks for " - "strong emphasis.\n " - "Or, if you prefer, use" -- " two underscores instead.

    \n\n\n\n## Lists" -- " ##\n\n" +- " two underscores instead
    .

    \n\n\n\n" +- "## Lists ##\n\n" - Unordered (bulleted) lists use asterisks - ", pluses, and hyphens (`*" - "`,\n`+`, and `-`" @@ -167,8 +168,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - ">\n " - "

    With multiple paragraphs.

    \n" - "
  • Another item in the" -- " list.

  • \n \n" -- "\n\n\n### Links ###\n\n" +- " list.

    \n \n\n\n\n" +- "### Links ###\n\n" - "Markdown supports two styles for creating links: *inline" - "* and\n*reference*" - ". " diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@markdown_syntax.md-2.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@markdown_syntax.md-2.snap index 58ca39c7..6167025a 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@markdown_syntax.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@markdown_syntax.md-2.snap @@ -8,42 +8,44 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "* [Overview](#overview)\n * [Philosophy](#philosophy)\n * [Inline HTML](#html)\n * [Automatic Escaping for Special Characters](#autoescape)\n* [Block Elements](#block)\n * [Paragraphs and Line Breaks](#p)\n * [Headers](#header)\n * [Blockquotes](#blockquote)\n * [Lists](#list)\n" - " * [Code Blocks](#precode)\n * [Horizontal Rules](#hr)\n* [Span Elements](#span)\n * [Links](#link)\n * [Emphasis](#em)\n * [Code](#code)\n * [Images](#img)\n* [Miscellaneous](#misc)\n " - "* [Backslash Escapes](#backslash)\n * [Automatic Links](#autolink)\n\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL][src].\n\n [src]: /projects/markdown/syntax.text\n\n* * *\n" -- "\n

    Overview

    \n\n

    Philosophy

    \n\nMarkdown is intended to be as easy-to-read and easy-to-write as is feasible.\n\n" +- "\n

    Overview

    \n\n

    Philosophy

    \n" +- "\nMarkdown is intended to be as easy-to-read and easy-to-write as is feasible.\n\n" - "Readability, however, is emphasized above all else. A Markdown-formatted\ndocument should be publishable as-is, as plain text, without looking\nlike it's been marked up with tags or formatting instructions. While\nMarkdown's syntax has been influenced by several existing text-to-HTML\nfilters -- including [Setext][1], [atx][2], [Textile][3], [reStructuredText][4],\n" - "[Grutatext][5], and [EtText][6]" - " -- the single biggest source of\ninspiration for Markdown's syntax is the format of plain text email." - "\n\n [1]: http://docutils.sourceforge.net/mirror/setext.html\n [2]: http://www.aaronsw.com/2002/atx/\n [3]: https://web.archive.org/web/20021226035527/http://textism.com/tools/textile/\n [4]: http://docutils.sourceforge.net/rst.html\n [5]: http://www.triptico.com/software/grutatxt.html\n" -- " [6]: http://ettext.taint.org/doc/\n\nTo this end, Markdown's syntax is comprised entirely of punctuation\ncharacters, which punctuation characters have been carefully chosen so\nas to look like what they mean. E.g., asterisks around a word actually\nlook like \\*emphasis\\*. Markdown lists look like, well, lists. Even\nblockquotes look like quoted passages of text, assuming you've ever\nused email.\n" -- "\n\n\n

    Inline HTML

    \n\nMarkdown's syntax is intended for one purpose: to be used as a\nformat for *writing* for the web.\n\n" +- " [6]: http://ettext.taint.org/doc/\n\nTo this end, Markdown's syntax is comprised entirely of punctuation\ncharacters, which punctuation characters have been carefully chosen so\nas to look like what they mean. E.g., asterisks around a word actually\nlook like \\*emphasis\\*. Markdown lists look like, well, lists. Even\nblockquotes look like quoted passages of text, assuming you've ever\nused email.\n\n\n\n" +- "

    Inline HTML

    \n" +- "\nMarkdown's syntax is intended for one purpose: to be used as a\nformat for *writing* for the web.\n\n" - "Markdown is not a replacement for HTML, or even close to it. Its\nsyntax is very small, corresponding only to a very small subset of\nHTML tags. The idea is *not* to create a syntax that makes it easier\nto insert HTML tags. In my opinion, HTML tags are already easy to\ninsert. The idea for Markdown is to make it easy to read, write, and\nedit prose. HTML is a *publishing* format; Markdown is a *writing*\n" - "format. Thus, Markdown's formatting syntax only addresses issues that\ncan be conveyed in plain text.\n\nFor any markup that is not covered by Markdown's syntax, you simply\nuse HTML itself. There's no need to preface it or delimit it to\nindicate that you're switching from Markdown to HTML; you just use\nthe tags.\n\n" - "The only restrictions are that block-level HTML elements -- e.g. `
    `,\n``, `
    `, `

    `, etc. -- must be separated from surrounding\ncontent by blank lines, and the start and end tags of the block should\nnot be indented with tabs or spaces. Markdown is smart enough not\nto add extra (unwanted) `

    ` tags around HTML block-level tags.\n\nFor example, to add an HTML table to a Markdown article:\n" - "\n This is a regular paragraph.\n\n

    \n \n \n \n
    Foo
    \n\n This is another regular paragraph.\n" - "\nNote that Markdown formatting syntax is not processed within block-level\nHTML tags. E.g., you can't use Markdown-style `*emphasis*` inside an\nHTML block.\n\n" - "Span-level HTML tags -- e.g. ``, ``, or `` -- can be\nused anywhere in a Markdown paragraph, list item, or header. If you\nwant, you can even use HTML tags instead of Markdown formatting; e.g. if\nyou'd prefer to use HTML `` or `` tags instead of Markdown's\nlink or image syntax, go right ahead.\n\n" -- "Unlike block-level HTML tags, Markdown syntax *is* processed within\nspan-level tags.\n\n\n

    Automatic Escaping for Special Characters

    \n\n" -- "In HTML, there are two characters that demand special treatment: `<`\nand `&`. Left angle brackets are used to start tags; ampersands are\nused to denote HTML entities. If you want to use them as literal\ncharacters, you must escape them as entities, e.g. `<`, and\n`&`.\n\n" +- "Unlike block-level HTML tags, Markdown syntax *is* processed within\nspan-level tags.\n\n\n

    Automatic Escaping for Special Characters

    \n" +- "\nIn HTML, there are two characters that demand special treatment: `<`\nand `&`. Left angle brackets are used to start tags; ampersands are\nused to denote HTML entities. If you want to use them as literal\ncharacters, you must escape them as entities, e.g. `<`, and\n`&`.\n\n" - "Ampersands in particular are bedeviling for web writers. If you want to\nwrite about 'AT&T', you need to write '`AT&T`'. You even need to\nescape ampersands within URLs. Thus, if you want to link to:\n\n http://images.google.com/images?num=30&q=larry+bird\n\nyou need to encode the URL as:\n\n http://images.google.com/images?num=30&q=larry+bird\n" - "\nin your anchor tag `href` attribute. Needless to say, this is easy to\nforget, and is probably the single most common source of HTML validation\nerrors in otherwise well-marked-up web sites.\n\nMarkdown allows you to use these characters naturally, taking care of\nall the necessary escaping for you. If you use an ampersand as part of\nan HTML entity, it remains unchanged; otherwise it will be translated\ninto `&`.\n\n" - "So, if you want to include a copyright symbol in your article, you can write:\n\n ©\n\nand Markdown will leave it alone. But if you write:\n\n AT&T\n\nMarkdown will translate it to:\n\n AT&T\n\nSimilarly, because Markdown supports [inline HTML](#html), if you use\nangle brackets as delimiters for HTML tags, Markdown will treat them as\nsuch. But if you write:\n\n 4 < 5\n" - "\nMarkdown will translate it to:\n\n 4 < 5\n\nHowever, inside Markdown code spans and blocks, angle brackets and\nampersands are *always* encoded automatically. This makes it easy to use\nMarkdown to write about HTML code. (As opposed to raw HTML, which is a\nterrible format for writing about HTML syntax, because every single `<`\nand `&` in your example code needs to be escaped.)\n\n\n* * *\n" -- "\n\n

    Block Elements

    \n\n\n

    Paragraphs and Line Breaks

    \n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like a\nblank line -- a line containing nothing but spaces or tabs is considered\nblank.) Normal paragraphs should not be indented with spaces or tabs.\n\n" +- "\n\n

    Block Elements

    \n\n\n

    Paragraphs and Line Breaks

    \n" +- "\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like a\nblank line -- a line containing nothing but spaces or tabs is considered\nblank.) Normal paragraphs should not be indented with spaces or tabs.\n\n" - "The implication of the \"one or more consecutive lines of text\" rule is\nthat Markdown supports \"hard-wrapped\" text paragraphs. This differs\nsignificantly from most other text-to-HTML formatters (including Movable\nType's \"Convert Line Breaks\" option) which translate every line break\ncharacter in a paragraph into a `
    ` tag.\n\n" - "When you *do* want to insert a `
    ` break tag using Markdown, you\nend a line with two or more spaces, then type return.\n\n" -- "Yes, this takes a tad more effort to create a `
    `, but a simplistic\n\"every line break is a `
    `\" rule wouldn't work for Markdown.\nMarkdown's email-style [blockquoting][bq] and multi-paragraph [list items][l]\nwork best -- and look better -- when you format them with hard breaks.\n" -- "\n [bq]: #blockquote\n [l]: #list\n\n\n\n

    Headers

    \n\nMarkdown supports two styles of headers, [Setext] [1] and [atx] [2].\n\nSetext-style headers are \"underlined\" using equal signs (for first-level\nheaders) and dashes (for second-level headers). For example:\n\n " -- "This is an H1\n =============\n\n This is an H2\n -------------\n\nAny number of underlining `=`'s or `-`'s will work.\n\nAtx-style headers use 1-6 hash characters at the start of the line,\ncorresponding to header levels 1-6. For example:\n\n # This is an H1\n\n ## This is an H2\n\n ###### This is an H6\n" -- "\nOptionally, you may \"close\" atx-style headers. This is purely\ncosmetic -- you can use this if you think it looks better. The\nclosing hashes don't even need to match the number of hashes\nused to open the header. (The number of opening hashes\ndetermines the header level.) :\n\n # This is an H1 #\n\n ## This is an H2 ##\n\n ### This is an H3 ######\n" -- "\n\n

    Blockquotes

    \n\nMarkdown uses email-style `>` characters for blockquoting. If you're\nfamiliar with quoting passages of text in an email message, then you\nknow how to create a blockquote in Markdown. It looks best if you hard\nwrap the text and put a `>` before every line:\n\n " +- "Yes, this takes a tad more effort to create a `
    `, but a simplistic\n\"every line break is a `
    `\" rule wouldn't work for Markdown.\nMarkdown's email-style [blockquoting][bq] and multi-paragraph [list items][l]\nwork best -- and look better -- when you format them with hard breaks.\n\n [bq]: #blockquote\n [l]: #list\n\n\n\n" +- "

    Headers

    \n\nMarkdown supports two styles of headers, [Setext] [1] and [atx] [2].\n\nSetext-style headers are \"underlined\" using equal signs (for first-level\nheaders) and dashes (for second-level headers). For example:\n\n This is an H1\n =============\n\n This is an H2\n -------------\n" +- "\nAny number of underlining `=`'s or `-`'s will work.\n\nAtx-style headers use 1-6 hash characters at the start of the line,\ncorresponding to header levels 1-6. For example:\n\n # This is an H1\n\n ## This is an H2\n\n ###### This is an H6\n" +- "\nOptionally, you may \"close\" atx-style headers. This is purely\ncosmetic -- you can use this if you think it looks better. The\nclosing hashes don't even need to match the number of hashes\nused to open the header. (The number of opening hashes\ndetermines the header level.) :\n\n # This is an H1 #\n\n ## This is an H2 ##\n\n ### This is an H3 ######\n\n\n" +- "

    Blockquotes

    \n\nMarkdown uses email-style `>` characters for blockquoting. If you're\nfamiliar with quoting passages of text in an email message, then you\nknow how to create a blockquote in Markdown. It looks best if you hard\nwrap the text and put a `>` before every line:\n\n " - "> This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n > consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n > Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n >\n > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse\n > id sem consectetuer libero luctus adipiscing.\n" - "\nMarkdown allows you to be lazy and only put the `>` before the first\nline of a hard-wrapped paragraph:\n\n " - "> This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n\n > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse\n id sem consectetuer libero luctus adipiscing.\n" -- "\nBlockquotes can be nested (i.e. a blockquote-in-a-blockquote) by\nadding additional levels of `>`:\n\n > This is the first level of quoting.\n >\n > > This is nested blockquote.\n >\n > Back to the first level.\n\nBlockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:\n\n" -- "\t> ## This is a header.\n\t>\n\t> 1. This is the first list item.\n\t> 2. This is the second list item.\n\t>\n\t> Here's some example code:\n\t>\n\t> return shell_exec(\"echo $input | $markdown_script\");\n" -- "\nAny decent text editor should make email-style quoting easy. For\nexample, with BBEdit, you can make a selection and choose Increase\nQuote Level from the Text menu.\n\n\n

    Lists

    \n\nMarkdown supports ordered (numbered) and unordered (bulleted) lists.\n\nUnordered lists use asterisks, pluses, and hyphens -- interchangably\n-- as list markers:\n\n " -- "* Red\n * Green\n * Blue\n\nis equivalent to:\n\n + Red\n + Green\n + Blue\n\nand:\n\n - Red\n - Green\n - Blue\n\nOrdered lists use numbers followed by periods:\n\n 1. Bird\n 2. McHale\n 3. Parish\n" -- "\nIt's important to note that the actual numbers you use to mark the\nlist have no effect on the HTML output Markdown produces. The HTML\nMarkdown produces from the above list is:\n\n
      \n
    1. Bird
    2. \n
    3. McHale
    4. \n
    5. Parish
    6. \n
    \n\nIf you instead wrote the list in Markdown like this:\n\n " -- "1. Bird\n 1. McHale\n 1. Parish\n\nor even:\n\n 3. Bird\n 1. McHale\n 8. Parish\n" +- "\nBlockquotes can be nested (i.e. a blockquote-in-a-blockquote) by\nadding additional levels of `>`:\n\n > This is the first level of quoting.\n >\n > > This is nested blockquote.\n >\n > Back to the first level.\n\nBlockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:\n\n\t> " +- "## This is a header.\n\t>\n\t> 1. This is the first list item.\n\t> 2. This is the second list item.\n\t>\n\t> Here's some example code:\n\t>\n\t> return shell_exec(\"echo $input | $markdown_script\");\n\nAny decent text editor should make email-style quoting easy. For\nexample, with BBEdit, you can make a selection and choose Increase\nQuote Level from the Text menu.\n\n\n" +- "

    Lists

    \n\nMarkdown supports ordered (numbered) and unordered (bulleted) lists.\n\nUnordered lists use asterisks, pluses, and hyphens -- interchangably\n-- as list markers:\n\n * Red\n * Green\n * Blue\n\nis equivalent to:\n\n + Red\n + Green\n + Blue\n\nand:\n\n " +- "- Red\n - Green\n - Blue\n\nOrdered lists use numbers followed by periods:\n\n 1. Bird\n 2. McHale\n 3. Parish\n\nIt's important to note that the actual numbers you use to mark the\nlist have no effect on the HTML output Markdown produces. The HTML\nMarkdown produces from the above list is:\n\n " +- "
      \n
    1. Bird
    2. \n
    3. McHale
    4. \n
    5. Parish
    6. \n
    \n\nIf you instead wrote the list in Markdown like this:\n\n 1. Bird\n 1. McHale\n 1. Parish\n\nor even:\n\n 3. Bird\n 1. McHale\n 8. Parish\n" - "\nyou'd get the exact same HTML output. The point is, if you want to,\nyou can use ordinal numbers in your ordered Markdown lists, so that\nthe numbers in your source match the numbers in your published HTML.\nBut if you want to be lazy, you don't have to.\n\n" - "If you do use lazy list numbering, however, you should still start the\nlist with the number 1. At some point in the future, Markdown may support\nstarting ordered lists at an arbitrary number.\n\nList markers typically start at the left margin, but may be indented by\nup to three spaces. List markers must be followed by one or more spaces\nor a tab.\n\nTo make lists look nice, you can wrap items with hanging indents:\n\n " - "* Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n" @@ -55,20 +57,21 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIt looks nice if you indent every line of the subsequent\nparagraphs, but here again, Markdown will allow you to be\nlazy:\n\n " - "* This is a list item with two paragraphs.\n\n This is the second paragraph in the list item. You're\n only required to indent the first line. 
Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit.\n\n * Another item in the same list.\n\nTo put a blockquote within a list item, the blockquote's `>`\ndelimiters need to be indented:\n\n " - "* A list item with a blockquote:\n\n > This is a blockquote\n > inside a list item.\n\nTo put a code block within a list item, the code block needs\nto be indented *twice* -- 8 spaces or two tabs:\n\n * A list item with a code block:\n\n \n\n\nIt's worth noting that it's possible to trigger an ordered list by\naccident, by writing something like this:\n\n " -- "1986. What a great season.\n\nIn other words, a *number-period-space* sequence at the beginning of a\nline. To avoid this, you can backslash-escape the period:\n\n 1986\\. What a great season.\n" -- "\n\n\n

    Code Blocks

    \n\nPre-formatted code blocks are used for writing about programming or\nmarkup source code. Rather than forming normal paragraphs, the lines\nof a code block are interpreted literally. Markdown wraps a code block\nin both `
    ` and `` tags.\n\nTo produce a code block in Markdown, simply indent every line of the\nblock by at least 4 spaces or 1 tab. For example, given this input:\n\n    "
    -- "This is a normal paragraph:\n\n        This is a code block.\n\nMarkdown will generate:\n\n    

    This is a normal paragraph:

    \n\n
    This is a code block.\n    
    \n\nOne level of indentation -- 4 spaces or 1 tab -- is removed from each\nline of the code block. For example, this:\n\n Here is an example of AppleScript:\n\n tell application \"Foo\"\n beep\n end tell\n" -- "\nwill turn into:\n\n

    Here is an example of AppleScript:

    \n\n
    tell application \"Foo\"\n        beep\n    end tell\n    
    \n" +- "1986. What a great season.\n\nIn other words, a *number-period-space* sequence at the beginning of a\nline. To avoid this, you can backslash-escape the period:\n\n 1986\\. What a great season.\n\n\n\n

    Code Blocks

    \n" +- "\nPre-formatted code blocks are used for writing about programming or\nmarkup source code. Rather than forming normal paragraphs, the lines\nof a code block are interpreted literally. Markdown wraps a code block\nin both `
    ` and `` tags.\n\nTo produce a code block in Markdown, simply indent every line of the\nblock by at least 4 spaces or 1 tab. For example, given this input:\n\n    This is a normal paragraph:\n\n        This is a code block.\n"
    +- "\nMarkdown will generate:\n\n    

    This is a normal paragraph:

    \n\n
    This is a code block.\n    
    \n\nOne level of indentation -- 4 spaces or 1 tab -- is removed from each\nline of the code block. For example, this:\n\n Here is an example of AppleScript:\n\n tell application \"Foo\"\n beep\n end tell\n\nwill turn into:\n\n " +- "

    Here is an example of AppleScript:

    \n\n
    tell application \"Foo\"\n        beep\n    end tell\n    
    \n" - "\nA code block continues until it reaches a line that is not indented\n(or the end of the article).\n\nWithin a code block, ampersands (`&`) and angle brackets (`<` and `>`)\nare automatically converted into HTML entities. This makes it very\neasy to include example HTML source code using Markdown -- just paste\nit and indent it, and Markdown will handle the hassle of encoding the\nampersands and angle brackets. For example, this:\n\n " - "
    \n © 2004 Foo Corporation\n
    \n\nwill turn into:\n\n
    <div class=\"footer\">\n        &copy; 2004 Foo Corporation\n    </div>\n    
    \n" -- "\nRegular Markdown syntax is not processed within code blocks. E.g.,\nasterisks are just literal asterisks within a code block. This means\nit's also easy to use Markdown to write about Markdown's own syntax.\n\n\n\n

    Horizontal Rules

    \n\n" -- "You can produce a horizontal rule tag (`
    `) by placing three or\nmore hyphens, asterisks, or underscores on a line by themselves. If you\nwish, you may use spaces between the hyphens or asterisks. Each of the\nfollowing lines will produce a horizontal rule:\n\n * * *\n\n ***\n\n *****\n\n - - -\n\n ---------------------------------------\n\n\n* * *\n" -- "\n

    Span Elements

    \n\n

    Links

    \n\nMarkdown supports two style of links: *inline* and *reference*.\n\nIn both styles, the link text is delimited by [square brackets].\n\n" -- "To create an inline link, use a set of regular parentheses immediately\nafter the link text's closing square bracket. Inside the parentheses,\nput the URL where you want the link to point, along with an *optional*\ntitle for the link, surrounded in quotes. For example:\n\n This is [an example](http://example.com/ \"Title\") inline link.\n\n [This link](http://example.net/) has no title attribute.\n\nWill produce:\n\n " -- "

    This is \n an example inline link.

    \n\n

    This link has no\n title attribute.

    \n\nIf you're referring to a local resource on the same server, you can\nuse relative paths:\n\n See my [About](/about/) page for details.\n" -- "\nReference-style links use a second set of square brackets, inside\nwhich you place a label of your choosing to identify the link:\n\n This is [an example][id] reference-style link.\n\nYou can optionally use a space to separate the sets of brackets:\n\n This is [an example] [id] reference-style link.\n\nThen, anywhere in the document, you define your link label like this,\non a line by itself:\n\n " -- "[id]: http://example.com/ \"Optional Title Here\"\n\nThat is:\n\n* Square brackets containing the link identifier (optionally\n indented from the left margin using up to three spaces);\n* followed by a colon;\n* followed by one or more spaces (or tabs);\n* followed by the URL for the link;\n* optionally followed by a title attribute for the link, enclosed\n in double or single quotes, or enclosed in parentheses.\n\n" -- "The following three link definitions are equivalent:\n\n\t[foo]: http://example.com/ \"Optional Title Here\"\n\t[foo]: http://example.com/ 'Optional Title Here'\n\t[foo]: http://example.com/ (Optional Title Here)\n\n**Note:** There is a known bug in Markdown.pl 1.0.1 which prevents\nsingle quotes from being used to delimit link titles.\n\nThe link URL may, optionally, be surrounded by angle brackets:\n\n " +- "\nRegular Markdown syntax is not processed within code blocks. E.g.,\nasterisks are just literal asterisks within a code block. This means\nit's also easy to use Markdown to write about Markdown's own syntax.\n\n\n\n

    Horizontal Rules

    \n" +- "\nYou can produce a horizontal rule tag (`
    `) by placing three or\nmore hyphens, asterisks, or underscores on a line by themselves. If you\nwish, you may use spaces between the hyphens or asterisks. Each of the\nfollowing lines will produce a horizontal rule:\n\n * * *\n\n ***\n\n *****\n\n - - -\n\n ---------------------------------------\n\n\n* * *\n" +- "\n

    Span Elements

    \n\n

    Links

    \n" +- "\nMarkdown supports two style of links: *inline* and *reference*.\n\nIn both styles, the link text is delimited by [square brackets].\n\nTo create an inline link, use a set of regular parentheses immediately\nafter the link text's closing square bracket. Inside the parentheses,\nput the URL where you want the link to point, along with an *optional*\ntitle for the link, surrounded in quotes. For example:\n\n " +- "This is [an example](http://example.com/ \"Title\") inline link.\n\n [This link](http://example.net/) has no title attribute.\n\nWill produce:\n\n

    This is \n an example inline link.

    \n\n

    This link has no\n title attribute.

    \n" +- "\nIf you're referring to a local resource on the same server, you can\nuse relative paths:\n\n See my [About](/about/) page for details.\n\nReference-style links use a second set of square brackets, inside\nwhich you place a label of your choosing to identify the link:\n\n This is [an example][id] reference-style link.\n\nYou can optionally use a space to separate the sets of brackets:\n\n This is [an example] [id] reference-style link.\n" +- "\nThen, anywhere in the document, you define your link label like this,\non a line by itself:\n\n [id]: http://example.com/ \"Optional Title Here\"\n\nThat is:\n\n" +- "* Square brackets containing the link identifier (optionally\n indented from the left margin using up to three spaces);\n* followed by a colon;\n* followed by one or more spaces (or tabs);\n* followed by the URL for the link;\n* optionally followed by a title attribute for the link, enclosed\n in double or single quotes, or enclosed in parentheses.\n\nThe following three link definitions are equivalent:\n\n\t" +- "[foo]: http://example.com/ \"Optional Title Here\"\n\t[foo]: http://example.com/ 'Optional Title Here'\n\t[foo]: http://example.com/ (Optional Title Here)\n\n**Note:** There is a known bug in Markdown.pl 1.0.1 which prevents\nsingle quotes from being used to delimit link titles.\n\nThe link URL may, optionally, be surrounded by angle brackets:\n\n " - "[id]: \"Optional Title Here\"\n\nYou can put the title attribute on the next line and use extra spaces\nor tabs for padding, which tends to look better with longer URLs:\n\n [id]: http://example.com/longish/path/to/resource/here\n \"Optional Title Here\"\n" - "\nLink definitions are only used for creating links during Markdown\nprocessing, and are stripped from your document in the HTML output.\n\nLink definition names may consist of letters, numbers, spaces, and\npunctuation -- but they are *not* case sensitive. E.g. 
these two\nlinks:\n\n\t[link text][a]\n\t[link text][A]\n" - "\nare equivalent.\n\nThe *implicit link name* shortcut allows you to omit the name of the\nlink, in which case the link text itself is used as the name.\nJust use an empty set of square brackets -- e.g., to link the word\n\"Google\" to the google.com web site, you could simply write:\n\n\t[Google][]\n\nAnd then define the link:\n\n\t[Google]: http://google.com/\n" @@ -79,23 +82,23 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "

    I get 10 times more traffic from Google than from\n Yahoo\n or MSN.

    \n\nFor comparison, here is the same paragraph written using\nMarkdown's inline link style:\n\n " - "I get 10 times more traffic from [Google](http://google.com/ \"Google\")\n than from [Yahoo](http://search.yahoo.com/ \"Yahoo Search\") or\n [MSN](http://search.msn.com/ \"MSN Search\").\n" - "\nThe point of reference-style links is not that they're easier to\nwrite. The point is that with reference-style links, your document\nsource is vastly more readable. Compare the above examples: using\nreference-style links, the paragraph itself is only 81 characters\nlong; with inline-style links, it's 176 characters; and as raw HTML,\nit's 234 characters. In the raw HTML, there's more markup than there\nis text.\n\n" -- "With Markdown's reference-style links, a source document much more\nclosely resembles the final output, as rendered in a browser. By\nallowing you to move the markup-related metadata out of the paragraph,\nyou can add links without interrupting the narrative flow of your\nprose.\n\n\n

    Emphasis

    \n\n" -- "Markdown treats asterisks (`*`) and underscores (`_`) as indicators of\nemphasis. Text wrapped with one `*` or `_` will be wrapped with an\nHTML `` tag; double `*`'s or `_`'s will be wrapped with an HTML\n`` tag. E.g., this input:\n\n *single asterisks*\n\n _single underscores_\n\n **double asterisks**\n\n __double underscores__\n\nwill produce:\n\n " +- "With Markdown's reference-style links, a source document much more\nclosely resembles the final output, as rendered in a browser. By\nallowing you to move the markup-related metadata out of the paragraph,\nyou can add links without interrupting the narrative flow of your\nprose.\n\n\n

    Emphasis

    \n" +- "\nMarkdown treats asterisks (`*`) and underscores (`_`) as indicators of\nemphasis. Text wrapped with one `*` or `_` will be wrapped with an\nHTML `` tag; double `*`'s or `_`'s will be wrapped with an HTML\n`` tag. E.g., this input:\n\n *single asterisks*\n\n _single underscores_\n\n **double asterisks**\n\n __double underscores__\n\nwill produce:\n\n " - "single asterisks\n\n single underscores\n\n double asterisks\n\n double underscores\n\nYou can use whichever style you prefer; the lone restriction is that\nthe same character must be used to open and close an emphasis span.\n\nEmphasis can be used in the middle of a word:\n\n un*frigging*believable\n" -- "\nBut if you surround an `*` or `_` with spaces, it'll be treated as a\nliteral asterisk or underscore.\n\nTo produce a literal asterisk or underscore at a position where it\nwould otherwise be used as an emphasis delimiter, you can backslash\nescape it:\n\n \\*this text is surrounded by literal asterisks\\*\n" -- "\n\n\n

    Code

    \n\nTo indicate a span of code, wrap it with backtick quotes (`` ` ``).\nUnlike a pre-formatted code block, a code span indicates code within a\nnormal paragraph. For example:\n\n Use the `printf()` function.\n\nwill produce:\n\n

    Use the printf() function.

    \n" -- "\nTo include a literal backtick character within a code span, you can use\nmultiple backticks as the opening and closing delimiters:\n\n ``There is a literal backtick (`) here.``\n\nwhich will produce this:\n\n

    There is a literal backtick (`) here.

    \n" -- "\nThe backtick delimiters surrounding a code span may include spaces --\none after the opening, one before the closing. This allows you to place\nliteral backtick characters at the beginning or end of a code span:\n\n\tA single backtick in a code span: `` ` ``\n\n\tA backtick-delimited string in a code span: `` `foo` ``\n\nwill produce:\n\n\t" -- "

    A single backtick in a code span: `

    \n\n\t

    A backtick-delimited string in a code span: `foo`

    \n\nWith a code span, ampersands and angle brackets are encoded as HTML\nentities automatically, which makes it easy to include example HTML\ntags. Markdown will turn this:\n\n Please don't use any `` tags.\n\ninto:\n\n " -- "

    Please don't use any <blink> tags.

    \n\nYou can write this:\n\n `—` is the decimal-encoded equivalent of `—`.\n\nto produce:\n\n

    &#8212; is the decimal-encoded\n equivalent of &mdash;.

    \n" -- "\n\n\n

    Images

    \n\nAdmittedly, it's fairly difficult to devise a \"natural\" syntax for\nplacing images into a plain text document format.\n\nMarkdown uses an image syntax that is intended to resemble the syntax\nfor links, allowing for two styles: *inline* and *reference*.\n\nInline image syntax looks like this:\n\n ![Alt text](/path/to/img.jpg)\n\n ![Alt text](/path/to/img.jpg \"Optional title\")\n" -- "\nThat is:\n\n* An exclamation mark: `!`;\n* followed by a set of square brackets, containing the `alt`\n attribute text for the image;\n* followed by a set of parentheses, containing the URL or path to\n the image, and an optional `title` attribute enclosed in double\n or single quotes.\n\nReference-style image syntax looks like this:\n\n ![Alt text][id]\n" +- "\nBut if you surround an `*` or `_` with spaces, it'll be treated as a\nliteral asterisk or underscore.\n\nTo produce a literal asterisk or underscore at a position where it\nwould otherwise be used as an emphasis delimiter, you can backslash\nescape it:\n\n \\*this text is surrounded by literal asterisks\\*\n\n\n\n

    Code

    \n" +- "\nTo indicate a span of code, wrap it with backtick quotes (`` ` ``).\nUnlike a pre-formatted code block, a code span indicates code within a\nnormal paragraph. For example:\n\n Use the `printf()` function.\n\nwill produce:\n\n

    Use the printf() function.

    \n\nTo include a literal backtick character within a code span, you can use\nmultiple backticks as the opening and closing delimiters:\n\n " +- "``There is a literal backtick (`) here.``\n\nwhich will produce this:\n\n

    There is a literal backtick (`) here.

    \n\nThe backtick delimiters surrounding a code span may include spaces --\none after the opening, one before the closing. This allows you to place\nliteral backtick characters at the beginning or end of a code span:\n\n\t" +- "A single backtick in a code span: `` ` ``\n\n\tA backtick-delimited string in a code span: `` `foo` ``\n\nwill produce:\n\n\t

    A single backtick in a code span: `

    \n\n\t

    A backtick-delimited string in a code span: `foo`

    \n" +- "\nWith a code span, ampersands and angle brackets are encoded as HTML\nentities automatically, which makes it easy to include example HTML\ntags. Markdown will turn this:\n\n Please don't use any `` tags.\n\ninto:\n\n

    Please don't use any <blink> tags.

    \n\nYou can write this:\n\n `—` is the decimal-encoded equivalent of `—`.\n\nto produce:\n\n " +- "

    &#8212; is the decimal-encoded\n equivalent of &mdash;.

    \n\n\n\n

    Images

    \n" +- "\nAdmittedly, it's fairly difficult to devise a \"natural\" syntax for\nplacing images into a plain text document format.\n\nMarkdown uses an image syntax that is intended to resemble the syntax\nfor links, allowing for two styles: *inline* and *reference*.\n\nInline image syntax looks like this:\n\n ![Alt text](/path/to/img.jpg)\n\n ![Alt text](/path/to/img.jpg \"Optional title\")\n\nThat is:\n\n" +- "* An exclamation mark: `!`;\n* followed by a set of square brackets, containing the `alt`\n attribute text for the image;\n* followed by a set of parentheses, containing the URL or path to\n the image, and an optional `title` attribute enclosed in double\n or single quotes.\n\nReference-style image syntax looks like this:\n\n ![Alt text][id]\n" - "\nWhere \"id\" is the name of a defined image reference. Image references\nare defined using syntax identical to link references:\n\n [id]: url/to/image \"Optional title attribute\"\n\nAs of this writing, Markdown has no syntax for specifying the\ndimensions of an image; if this is important to you, you can simply\nuse regular HTML `` tags.\n\n\n* * *\n" - "\n\n

    Miscellaneous

    \n\n

    Automatic Links

    \n\nMarkdown supports a shortcut style for creating \"automatic\" links for URLs and email addresses: simply surround the URL or email address with angle brackets. What this means is that if you want to show the actual text of a URL or email address, and also have it be a clickable link, you can do this:\n\n \n" - "\nMarkdown will turn this into:\n\n http://example.com/\n\nAutomatic links for email addresses work similarly, except that\nMarkdown will also perform a bit of randomized decimal and hex\nentity-encoding to help obscure your address from address-harvesting\nspambots. For example, Markdown will turn this:\n\n \n\ninto something like this:\n\n " - "address@exa\n mple.com\n\nwhich will render in a browser as a clickable link to \"address@example.com\".\n\n" -- "(This sort of entity-encoding trick will indeed fool many, if not\nmost, address-harvesting bots, but it definitely won't fool all of\nthem. It's better than nothing, but an address published in this way\nwill probably eventually start receiving spam.)\n\n\n\n

    Backslash Escapes

    \n\n" -- "Markdown allows you to use backslash escapes to generate literal\ncharacters which would otherwise have special meaning in Markdown's\nformatting syntax. For example, if you wanted to surround a word\nwith literal asterisks (instead of an HTML `` tag), you can use\nbackslashes before the asterisks, like this:\n\n \\*literal asterisks\\*\n\nMarkdown provides backslash escapes for the following characters:\n\n " +- "(This sort of entity-encoding trick will indeed fool many, if not\nmost, address-harvesting bots, but it definitely won't fool all of\nthem. It's better than nothing, but an address published in this way\nwill probably eventually start receiving spam.)\n\n\n\n

    Backslash Escapes

    \n" +- "\nMarkdown allows you to use backslash escapes to generate literal\ncharacters which would otherwise have special meaning in Markdown's\nformatting syntax. For example, if you wanted to surround a word\nwith literal asterisks (instead of an HTML `` tag), you can use\nbackslashes before the asterisks, like this:\n\n \\*literal asterisks\\*\n\nMarkdown provides backslash escapes for the following characters:\n\n " - "\\ backslash\n ` backtick\n * asterisk\n _ underscore\n {} curly braces\n [] square brackets\n () parentheses\n # hash mark\n\t+\tplus sign\n\t-\tminus sign (hyphen)\n . dot\n ! exclamation mark\n" diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@markdown_syntax.md-3.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@markdown_syntax.md-3.snap index d386fad3..ca5a5a6c 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@markdown_syntax.md-3.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@markdown_syntax.md-3.snap @@ -3,12 +3,11 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/markdown_syntax.md --- -- "Markdown: Syntax\n================\n\n\n\n\n* [Overview](#overview)\n * [Philosophy](#philosophy)\n * [Inline HTML](#html)\n * [Automatic Escaping for Special Characters](#autoescape)\n* [Block Elements](#block)\n * [Paragraphs and Line Breaks](#p)\n * [Headers](#header)\n * [Blockquotes](#blockquote)\n * [Lists](#list)\n * [Code Blocks](#precode)\n * [Horizontal Rules](#hr)\n* [Span Elements](#span)\n * [Links](#link)\n * [Emphasis](#em)\n * [Code](#code)\n * [Images](#img)\n* [Miscellaneous](#misc)\n * [Backslash Escapes](#backslash)\n * [Automatic Links](#autolink)\n\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL][src].\n\n [src]: /projects/markdown/syntax.text\n\n* * *\n" -- "\n

    Overview

    \n\n

    Philosophy

    \n\nMarkdown is intended to be as easy-to-read and easy-to-write as is feasible.\n\nReadability, however, is emphasized above all else. A Markdown-formatted\ndocument should be publishable as-is, as plain text, without looking\nlike it's been marked up with tags or formatting instructions. While\nMarkdown's syntax has been influenced by several existing text-to-HTML\nfilters -- including [Setext][1], [atx][2], [Textile][3], [reStructuredText][4],\n[Grutatext][5], and [EtText][6] -- the single biggest source of\ninspiration for Markdown's syntax is the format of plain text email.\n\n [1]: http://docutils.sourceforge.net/mirror/setext.html\n [2]: http://www.aaronsw.com/2002/atx/\n [3]: https://web.archive.org/web/20021226035527/http://textism.com/tools/textile/\n [4]: http://docutils.sourceforge.net/rst.html\n [5]: http://www.triptico.com/software/grutatxt.html\n [6]: http://ettext.taint.org/doc/\n\nTo this end, Markdown's syntax is comprised entirely of punctuation\ncharacters, which punctuation characters have been carefully chosen so\nas to look like what they mean. E.g., asterisks around a word actually\nlook like \\*emphasis\\*. Markdown lists look like, well, lists. Even\nblockquotes look like quoted passages of text, assuming you've ever\nused email.\n\n\n\n

    Inline HTML

    \n\nMarkdown's syntax is intended for one purpose: to be used as a\nformat for *writing* for the web.\n\nMarkdown is not a replacement for HTML, or even close to it. Its\nsyntax is very small, corresponding only to a very small subset of\nHTML tags. The idea is *not* to create a syntax that makes it easier\nto insert HTML tags. In my opinion, HTML tags are already easy to\ninsert. The idea for Markdown is to make it easy to read, write, and\nedit prose. HTML is a *publishing* format; Markdown is a *writing*\nformat. Thus, Markdown's formatting syntax only addresses issues that\ncan be conveyed in plain text.\n\nFor any markup that is not covered by Markdown's syntax, you simply\nuse HTML itself. There's no need to preface it or delimit it to\nindicate that you're switching from Markdown to HTML; you just use\nthe tags.\n\nThe only restrictions are that block-level HTML elements -- e.g. `
    `,\n``, `
    `, `

    `, etc. -- must be separated from surrounding\ncontent by blank lines, and the start and end tags of the block should\nnot be indented with tabs or spaces. Markdown is smart enough not\nto add extra (unwanted) `

    ` tags around HTML block-level tags.\n\nFor example, to add an HTML table to a Markdown article:\n\n This is a regular paragraph.\n\n

    \n \n \n \n
    Foo
    \n\n This is another regular paragraph.\n\nNote that Markdown formatting syntax is not processed within block-level\nHTML tags. E.g., you can't use Markdown-style `*emphasis*` inside an\nHTML block.\n\nSpan-level HTML tags -- e.g. ``, ``, or `` -- can be\nused anywhere in a Markdown paragraph, list item, or header. If you\nwant, you can even use HTML tags instead of Markdown formatting; e.g. if\nyou'd prefer to use HTML `` or `` tags instead of Markdown's\nlink or image syntax, go right ahead.\n\nUnlike block-level HTML tags, Markdown syntax *is* processed within\nspan-level tags.\n\n\n

    Automatic Escaping for Special Characters

    \n\nIn HTML, there are two characters that demand special treatment: `<`\nand `&`. Left angle brackets are used to start tags; ampersands are\nused to denote HTML entities. If you want to use them as literal\ncharacters, you must escape them as entities, e.g. `<`, and\n`&`.\n\nAmpersands in particular are bedeviling for web writers. If you want to\nwrite about 'AT&T', you need to write '`AT&T`'. You even need to\nescape ampersands within URLs. Thus, if you want to link to:\n\n http://images.google.com/images?num=30&q=larry+bird\n" -- "\nyou need to encode the URL as:\n\n http://images.google.com/images?num=30&q=larry+bird\n\nin your anchor tag `href` attribute. Needless to say, this is easy to\nforget, and is probably the single most common source of HTML validation\nerrors in otherwise well-marked-up web sites.\n\nMarkdown allows you to use these characters naturally, taking care of\nall the necessary escaping for you. If you use an ampersand as part of\nan HTML entity, it remains unchanged; otherwise it will be translated\ninto `&`.\n\nSo, if you want to include a copyright symbol in your article, you can write:\n\n ©\n\nand Markdown will leave it alone. But if you write:\n\n AT&T\n\nMarkdown will translate it to:\n\n AT&T\n\nSimilarly, because Markdown supports [inline HTML](#html), if you use\nangle brackets as delimiters for HTML tags, Markdown will treat them as\nsuch. But if you write:\n\n 4 < 5\n\nMarkdown will translate it to:\n\n 4 < 5\n\nHowever, inside Markdown code spans and blocks, angle brackets and\nampersands are *always* encoded automatically. This makes it easy to use\nMarkdown to write about HTML code. (As opposed to raw HTML, which is a\nterrible format for writing about HTML syntax, because every single `<`\nand `&` in your example code needs to be escaped.)\n\n\n* * *\n" -- "\n\n

    Block Elements

    \n\n\n

    Paragraphs and Line Breaks

    \n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like a\nblank line -- a line containing nothing but spaces or tabs is considered\nblank.) Normal paragraphs should not be indented with spaces or tabs.\n\nThe implication of the \"one or more consecutive lines of text\" rule is\nthat Markdown supports \"hard-wrapped\" text paragraphs. This differs\nsignificantly from most other text-to-HTML formatters (including Movable\nType's \"Convert Line Breaks\" option) which translate every line break\ncharacter in a paragraph into a `
    ` tag.\n\nWhen you *do* want to insert a `
    ` break tag using Markdown, you\nend a line with two or more spaces, then type return.\n\nYes, this takes a tad more effort to create a `
    `, but a simplistic\n\"every line break is a `
    `\" rule wouldn't work for Markdown.\nMarkdown's email-style [blockquoting][bq] and multi-paragraph [list items][l]\nwork best -- and look better -- when you format them with hard breaks.\n\n [bq]: #blockquote\n [l]: #list\n\n\n\n

    Headers

    \n\nMarkdown supports two styles of headers, [Setext] [1] and [atx] [2].\n\nSetext-style headers are \"underlined\" using equal signs (for first-level\nheaders) and dashes (for second-level headers). For example:\n\n This is an H1\n =============\n\n This is an H2\n -------------\n\nAny number of underlining `=`'s or `-`'s will work.\n\nAtx-style headers use 1-6 hash characters at the start of the line,\ncorresponding to header levels 1-6. For example:\n\n # This is an H1\n\n ## This is an H2\n\n ###### This is an H6\n\nOptionally, you may \"close\" atx-style headers. This is purely\ncosmetic -- you can use this if you think it looks better. The\nclosing hashes don't even need to match the number of hashes\nused to open the header. (The number of opening hashes\ndetermines the header level.) :\n\n # This is an H1 #\n\n ## This is an H2 ##\n\n ### This is an H3 ######\n\n\n

    Blockquotes

    \n\nMarkdown uses email-style `>` characters for blockquoting. If you're\nfamiliar with quoting passages of text in an email message, then you\nknow how to create a blockquote in Markdown. It looks best if you hard\nwrap the text and put a `>` before every line:\n\n > This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n > consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n > Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n >\n > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse\n > id sem consectetuer libero luctus adipiscing.\n\nMarkdown allows you to be lazy and only put the `>` before the first\nline of a hard-wrapped paragraph:\n\n > This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n\n > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse\n id sem consectetuer libero luctus adipiscing.\n\nBlockquotes can be nested (i.e. a blockquote-in-a-blockquote) by\nadding additional levels of `>`:\n\n > This is the first level of quoting.\n >\n > > This is nested blockquote.\n >\n > Back to the first level.\n\nBlockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:\n\n\t> ## This is a header.\n\t>\n\t> 1. This is the first list item.\n\t> 2. This is the second list item.\n\t>\n\t> Here's some example code:\n\t>\n\t> return shell_exec(\"echo $input | $markdown_script\");\n" -- "\nAny decent text editor should make email-style quoting easy. For\nexample, with BBEdit, you can make a selection and choose Increase\nQuote Level from the Text menu.\n\n\n

    Lists

    \n\nMarkdown supports ordered (numbered) and unordered (bulleted) lists.\n\nUnordered lists use asterisks, pluses, and hyphens -- interchangably\n-- as list markers:\n\n * Red\n * Green\n * Blue\n\nis equivalent to:\n\n + Red\n + Green\n + Blue\n\nand:\n\n - Red\n - Green\n - Blue\n\nOrdered lists use numbers followed by periods:\n\n 1. Bird\n 2. McHale\n 3. Parish\n\nIt's important to note that the actual numbers you use to mark the\nlist have no effect on the HTML output Markdown produces. The HTML\nMarkdown produces from the above list is:\n\n
      \n
    1. Bird
    2. \n
    3. McHale
    4. \n
    5. Parish
    6. \n
    \n\nIf you instead wrote the list in Markdown like this:\n\n 1. Bird\n 1. McHale\n 1. Parish\n\nor even:\n\n 3. Bird\n 1. McHale\n 8. Parish\n\nyou'd get the exact same HTML output. The point is, if you want to,\nyou can use ordinal numbers in your ordered Markdown lists, so that\nthe numbers in your source match the numbers in your published HTML.\nBut if you want to be lazy, you don't have to.\n\nIf you do use lazy list numbering, however, you should still start the\nlist with the number 1. At some point in the future, Markdown may support\nstarting ordered lists at an arbitrary number.\n\nList markers typically start at the left margin, but may be indented by\nup to three spaces. List markers must be followed by one or more spaces\nor a tab.\n\nTo make lists look nice, you can wrap items with hanging indents:\n\n * Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n\nBut if you want to be lazy, you don't have to:\n\n * Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIf list items are separated by blank lines, Markdown will wrap the\nitems in `

    ` tags in the HTML output. For example, this input:\n\n * Bird\n * Magic\n\nwill turn into:\n\n

      \n
    • Bird
    • \n
    • Magic
    • \n
    \n\nBut this:\n\n * Bird\n\n * Magic\n\nwill turn into:\n\n
      \n
    • Bird

    • \n
    • Magic

    • \n
    \n\nList items may consist of multiple paragraphs. Each subsequent\nparagraph in a list item must be indented by either 4 spaces\nor one tab:\n\n 1. This is a list item with two paragraphs. Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit. Aliquam hendrerit\n mi posuere lectus.\n\n Vestibulum enim wisi, viverra nec, fringilla in, laoreet\n vitae, risus. Donec sit amet nisl. Aliquam semper ipsum\n sit amet velit.\n\n 2. Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIt looks nice if you indent every line of the subsequent\nparagraphs, but here again, Markdown will allow you to be\nlazy:\n\n * This is a list item with two paragraphs.\n\n This is the second paragraph in the list item. You're\n only required to indent the first line. Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit.\n\n * Another item in the same list.\n" -- "\nTo put a blockquote within a list item, the blockquote's `>`\ndelimiters need to be indented:\n\n * A list item with a blockquote:\n\n > This is a blockquote\n > inside a list item.\n\nTo put a code block within a list item, the code block needs\nto be indented *twice* -- 8 spaces or two tabs:\n\n * A list item with a code block:\n\n \n\n\nIt's worth noting that it's possible to trigger an ordered list by\naccident, by writing something like this:\n\n 1986. What a great season.\n\nIn other words, a *number-period-space* sequence at the beginning of a\nline. To avoid this, you can backslash-escape the period:\n\n 1986\\. What a great season.\n\n\n\n

    Code Blocks

    \n\nPre-formatted code blocks are used for writing about programming or\nmarkup source code. Rather than forming normal paragraphs, the lines\nof a code block are interpreted literally. Markdown wraps a code block\nin both `
    ` and `` tags.\n\nTo produce a code block in Markdown, simply indent every line of the\nblock by at least 4 spaces or 1 tab. For example, given this input:\n\n    This is a normal paragraph:\n\n        This is a code block.\n\nMarkdown will generate:\n\n    

    This is a normal paragraph:

    \n\n
    This is a code block.\n    
    \n\nOne level of indentation -- 4 spaces or 1 tab -- is removed from each\nline of the code block. For example, this:\n\n Here is an example of AppleScript:\n\n tell application \"Foo\"\n beep\n end tell\n\nwill turn into:\n\n

    Here is an example of AppleScript:

    \n\n
    tell application \"Foo\"\n        beep\n    end tell\n    
    \n\nA code block continues until it reaches a line that is not indented\n(or the end of the article).\n\nWithin a code block, ampersands (`&`) and angle brackets (`<` and `>`)\nare automatically converted into HTML entities. This makes it very\neasy to include example HTML source code using Markdown -- just paste\nit and indent it, and Markdown will handle the hassle of encoding the\nampersands and angle brackets. For example, this:\n\n
    \n © 2004 Foo Corporation\n
    \n\nwill turn into:\n\n
    <div class=\"footer\">\n        &copy; 2004 Foo Corporation\n    </div>\n    
    \n\nRegular Markdown syntax is not processed within code blocks. E.g.,\nasterisks are just literal asterisks within a code block. This means\nit's also easy to use Markdown to write about Markdown's own syntax.\n\n\n\n

    Horizontal Rules

    \n\nYou can produce a horizontal rule tag (`
    `) by placing three or\nmore hyphens, asterisks, or underscores on a line by themselves. If you\nwish, you may use spaces between the hyphens or asterisks. Each of the\nfollowing lines will produce a horizontal rule:\n\n * * *\n\n ***\n\n *****\n\n - - -\n\n ---------------------------------------\n\n\n* * *\n" +- "Markdown: Syntax\n================\n\n
    \n\n\n* [Overview](#overview)\n * [Philosophy](#philosophy)\n * [Inline HTML](#html)\n * [Automatic Escaping for Special Characters](#autoescape)\n* [Block Elements](#block)\n * [Paragraphs and Line Breaks](#p)\n * [Headers](#header)\n * [Blockquotes](#blockquote)\n * [Lists](#list)\n * [Code Blocks](#precode)\n * [Horizontal Rules](#hr)\n* [Span Elements](#span)\n * [Links](#link)\n * [Emphasis](#em)\n * [Code](#code)\n * [Images](#img)\n* [Miscellaneous](#misc)\n * [Backslash Escapes](#backslash)\n * [Automatic Links](#autolink)\n\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL][src].\n\n [src]: /projects/markdown/syntax.text\n\n* * *\n\n

    Overview

    \n\n

    Philosophy

    \n\nMarkdown is intended to be as easy-to-read and easy-to-write as is feasible.\n\nReadability, however, is emphasized above all else. A Markdown-formatted\ndocument should be publishable as-is, as plain text, without looking\nlike it's been marked up with tags or formatting instructions. While\nMarkdown's syntax has been influenced by several existing text-to-HTML\nfilters -- including [Setext][1], [atx][2], [Textile][3], [reStructuredText][4],\n[Grutatext][5], and [EtText][6] -- the single biggest source of\ninspiration for Markdown's syntax is the format of plain text email.\n\n [1]: http://docutils.sourceforge.net/mirror/setext.html\n [2]: http://www.aaronsw.com/2002/atx/\n [3]: https://web.archive.org/web/20021226035527/http://textism.com/tools/textile/\n [4]: http://docutils.sourceforge.net/rst.html\n [5]: http://www.triptico.com/software/grutatxt.html\n [6]: http://ettext.taint.org/doc/\n\nTo this end, Markdown's syntax is comprised entirely of punctuation\ncharacters, which punctuation characters have been carefully chosen so\nas to look like what they mean. E.g., asterisks around a word actually\nlook like \\*emphasis\\*. Markdown lists look like, well, lists. Even\nblockquotes look like quoted passages of text, assuming you've ever\nused email.\n\n\n\n

    Inline HTML

    \n\nMarkdown's syntax is intended for one purpose: to be used as a\nformat for *writing* for the web.\n\nMarkdown is not a replacement for HTML, or even close to it. Its\nsyntax is very small, corresponding only to a very small subset of\nHTML tags. The idea is *not* to create a syntax that makes it easier\nto insert HTML tags. In my opinion, HTML tags are already easy to\ninsert. The idea for Markdown is to make it easy to read, write, and\nedit prose. HTML is a *publishing* format; Markdown is a *writing*\nformat. Thus, Markdown's formatting syntax only addresses issues that\ncan be conveyed in plain text.\n\nFor any markup that is not covered by Markdown's syntax, you simply\nuse HTML itself. There's no need to preface it or delimit it to\nindicate that you're switching from Markdown to HTML; you just use\nthe tags.\n\nThe only restrictions are that block-level HTML elements -- e.g. `
    `,\n``, `
    `, `

    `, etc. -- must be separated from surrounding\ncontent by blank lines, and the start and end tags of the block should\n" +- "not be indented with tabs or spaces. Markdown is smart enough not\nto add extra (unwanted) `

    ` tags around HTML block-level tags.\n\nFor example, to add an HTML table to a Markdown article:\n\n This is a regular paragraph.\n\n

    \n \n \n \n
    Foo
    \n\n This is another regular paragraph.\n\nNote that Markdown formatting syntax is not processed within block-level\nHTML tags. E.g., you can't use Markdown-style `*emphasis*` inside an\nHTML block.\n\nSpan-level HTML tags -- e.g. ``, ``, or `` -- can be\nused anywhere in a Markdown paragraph, list item, or header. If you\nwant, you can even use HTML tags instead of Markdown formatting; e.g. if\nyou'd prefer to use HTML `` or `` tags instead of Markdown's\nlink or image syntax, go right ahead.\n\nUnlike block-level HTML tags, Markdown syntax *is* processed within\nspan-level tags.\n\n\n

    Automatic Escaping for Special Characters

    \n\nIn HTML, there are two characters that demand special treatment: `<`\nand `&`. Left angle brackets are used to start tags; ampersands are\nused to denote HTML entities. If you want to use them as literal\ncharacters, you must escape them as entities, e.g. `<`, and\n`&`.\n\nAmpersands in particular are bedeviling for web writers. If you want to\nwrite about 'AT&T', you need to write '`AT&T`'. You even need to\nescape ampersands within URLs. Thus, if you want to link to:\n\n http://images.google.com/images?num=30&q=larry+bird\n\nyou need to encode the URL as:\n\n http://images.google.com/images?num=30&q=larry+bird\n\nin your anchor tag `href` attribute. Needless to say, this is easy to\nforget, and is probably the single most common source of HTML validation\nerrors in otherwise well-marked-up web sites.\n\nMarkdown allows you to use these characters naturally, taking care of\nall the necessary escaping for you. If you use an ampersand as part of\nan HTML entity, it remains unchanged; otherwise it will be translated\ninto `&`.\n\nSo, if you want to include a copyright symbol in your article, you can write:\n\n ©\n\nand Markdown will leave it alone. But if you write:\n\n AT&T\n\nMarkdown will translate it to:\n\n AT&T\n\nSimilarly, because Markdown supports [inline HTML](#html), if you use\nangle brackets as delimiters for HTML tags, Markdown will treat them as\nsuch. But if you write:\n\n 4 < 5\n\nMarkdown will translate it to:\n\n 4 < 5\n\nHowever, inside Markdown code spans and blocks, angle brackets and\nampersands are *always* encoded automatically. This makes it easy to use\nMarkdown to write about HTML code. (As opposed to raw HTML, which is a\nterrible format for writing about HTML syntax, because every single `<`\nand `&` in your example code needs to be escaped.)\n\n\n* * *\n" +- "\n\n

    Block Elements

    \n\n\n

    Paragraphs and Line Breaks

    \n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like a\nblank line -- a line containing nothing but spaces or tabs is considered\nblank.) Normal paragraphs should not be indented with spaces or tabs.\n\nThe implication of the \"one or more consecutive lines of text\" rule is\nthat Markdown supports \"hard-wrapped\" text paragraphs. This differs\nsignificantly from most other text-to-HTML formatters (including Movable\nType's \"Convert Line Breaks\" option) which translate every line break\ncharacter in a paragraph into a `
    ` tag.\n\nWhen you *do* want to insert a `
    ` break tag using Markdown, you\nend a line with two or more spaces, then type return.\n\nYes, this takes a tad more effort to create a `
    `, but a simplistic\n\"every line break is a `
    `\" rule wouldn't work for Markdown.\nMarkdown's email-style [blockquoting][bq] and multi-paragraph [list items][l]\nwork best -- and look better -- when you format them with hard breaks.\n\n [bq]: #blockquote\n [l]: #list\n\n\n\n

    Headers

    \n\nMarkdown supports two styles of headers, [Setext] [1] and [atx] [2].\n\nSetext-style headers are \"underlined\" using equal signs (for first-level\nheaders) and dashes (for second-level headers). For example:\n\n This is an H1\n =============\n\n This is an H2\n -------------\n\nAny number of underlining `=`'s or `-`'s will work.\n\nAtx-style headers use 1-6 hash characters at the start of the line,\ncorresponding to header levels 1-6. For example:\n\n # This is an H1\n\n ## This is an H2\n\n ###### This is an H6\n\nOptionally, you may \"close\" atx-style headers. This is purely\ncosmetic -- you can use this if you think it looks better. The\nclosing hashes don't even need to match the number of hashes\nused to open the header. (The number of opening hashes\ndetermines the header level.) :\n\n # This is an H1 #\n\n ## This is an H2 ##\n\n ### This is an H3 ######\n\n\n

    Blockquotes

    \n\nMarkdown uses email-style `>` characters for blockquoting. If you're\nfamiliar with quoting passages of text in an email message, then you\nknow how to create a blockquote in Markdown. It looks best if you hard\nwrap the text and put a `>` before every line:\n\n > This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n > consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n > Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n >\n > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse\n > id sem consectetuer libero luctus adipiscing.\n\nMarkdown allows you to be lazy and only put the `>` before the first\nline of a hard-wrapped paragraph:\n\n > This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n\n > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse\n id sem consectetuer libero luctus adipiscing.\n\nBlockquotes can be nested (i.e. a blockquote-in-a-blockquote) by\nadding additional levels of `>`:\n\n > This is the first level of quoting.\n >\n > > This is nested blockquote.\n >\n > Back to the first level.\n\nBlockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:\n\n\t> " +- "## This is a header.\n\t>\n\t> 1. This is the first list item.\n\t> 2. This is the second list item.\n\t>\n\t> Here's some example code:\n\t>\n\t> return shell_exec(\"echo $input | $markdown_script\");\n\nAny decent text editor should make email-style quoting easy. For\nexample, with BBEdit, you can make a selection and choose Increase\nQuote Level from the Text menu.\n\n\n

    Lists

    \n\nMarkdown supports ordered (numbered) and unordered (bulleted) lists.\n\nUnordered lists use asterisks, pluses, and hyphens -- interchangably\n-- as list markers:\n\n * Red\n * Green\n * Blue\n\nis equivalent to:\n\n + Red\n + Green\n + Blue\n\nand:\n\n - Red\n - Green\n - Blue\n\nOrdered lists use numbers followed by periods:\n\n 1. Bird\n 2. McHale\n 3. Parish\n\nIt's important to note that the actual numbers you use to mark the\nlist have no effect on the HTML output Markdown produces. The HTML\nMarkdown produces from the above list is:\n\n
      \n
    1. Bird
    2. \n
    3. McHale
    4. \n
    5. Parish
    6. \n
    \n\nIf you instead wrote the list in Markdown like this:\n\n 1. Bird\n 1. McHale\n 1. Parish\n\nor even:\n\n 3. Bird\n 1. McHale\n 8. Parish\n\nyou'd get the exact same HTML output. The point is, if you want to,\nyou can use ordinal numbers in your ordered Markdown lists, so that\nthe numbers in your source match the numbers in your published HTML.\nBut if you want to be lazy, you don't have to.\n\nIf you do use lazy list numbering, however, you should still start the\nlist with the number 1. At some point in the future, Markdown may support\nstarting ordered lists at an arbitrary number.\n\nList markers typically start at the left margin, but may be indented by\nup to three spaces. List markers must be followed by one or more spaces\nor a tab.\n\nTo make lists look nice, you can wrap items with hanging indents:\n\n * Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n\nBut if you want to be lazy, you don't have to:\n\n * Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIf list items are separated by blank lines, Markdown will wrap the\nitems in `

    ` tags in the HTML output. For example, this input:\n\n * Bird\n * Magic\n\nwill turn into:\n\n

      \n
    • Bird
    • \n
    • Magic
    • \n
    \n\nBut this:\n\n * Bird\n\n * Magic\n\nwill turn into:\n\n
      \n
    • Bird

    • \n
    • Magic

    • \n
    \n\nList items may consist of multiple paragraphs. Each subsequent\nparagraph in a list item must be indented by either 4 spaces\nor one tab:\n\n 1. This is a list item with two paragraphs. Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit. Aliquam hendrerit\n mi posuere lectus.\n\n Vestibulum enim wisi, viverra nec, fringilla in, laoreet\n vitae, risus. Donec sit amet nisl. Aliquam semper ipsum\n sit amet velit.\n\n 2. Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIt looks nice if you indent every line of the subsequent\nparagraphs, but here again, Markdown will allow you to be\nlazy:\n\n " +- "* This is a list item with two paragraphs.\n\n This is the second paragraph in the list item. You're\n only required to indent the first line. Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit.\n\n * Another item in the same list.\n\nTo put a blockquote within a list item, the blockquote's `>`\ndelimiters need to be indented:\n\n * A list item with a blockquote:\n\n > This is a blockquote\n > inside a list item.\n\nTo put a code block within a list item, the code block needs\nto be indented *twice* -- 8 spaces or two tabs:\n\n * A list item with a code block:\n\n \n\n\nIt's worth noting that it's possible to trigger an ordered list by\naccident, by writing something like this:\n\n 1986. What a great season.\n\nIn other words, a *number-period-space* sequence at the beginning of a\nline. To avoid this, you can backslash-escape the period:\n\n 1986\\. What a great season.\n\n\n\n

    Code Blocks

    \n\nPre-formatted code blocks are used for writing about programming or\nmarkup source code. Rather than forming normal paragraphs, the lines\nof a code block are interpreted literally. Markdown wraps a code block\nin both `
    ` and `` tags.\n\nTo produce a code block in Markdown, simply indent every line of the\nblock by at least 4 spaces or 1 tab. For example, given this input:\n\n    This is a normal paragraph:\n\n        This is a code block.\n\nMarkdown will generate:\n\n    

    This is a normal paragraph:

    \n\n
    This is a code block.\n    
    \n\nOne level of indentation -- 4 spaces or 1 tab -- is removed from each\nline of the code block. For example, this:\n\n Here is an example of AppleScript:\n\n tell application \"Foo\"\n beep\n end tell\n\nwill turn into:\n\n

    Here is an example of AppleScript:

    \n\n
    tell application \"Foo\"\n        beep\n    end tell\n    
    \n\nA code block continues until it reaches a line that is not indented\n(or the end of the article).\n\nWithin a code block, ampersands (`&`) and angle brackets (`<` and `>`)\nare automatically converted into HTML entities. This makes it very\neasy to include example HTML source code using Markdown -- just paste\nit and indent it, and Markdown will handle the hassle of encoding the\nampersands and angle brackets. For example, this:\n\n
    \n © 2004 Foo Corporation\n
    \n\nwill turn into:\n\n
    <div class=\"footer\">\n        &copy; 2004 Foo Corporation\n    </div>\n    
    \n\nRegular Markdown syntax is not processed within code blocks. E.g.,\nasterisks are just literal asterisks within a code block. This means\nit's also easy to use Markdown to write about Markdown's own syntax.\n\n\n\n

    Horizontal Rules

    \n\nYou can produce a horizontal rule tag (`
    `) by placing three or\nmore hyphens, asterisks, or underscores on a line by themselves. If you\nwish, you may use spaces between the hyphens or asterisks. Each of the\nfollowing lines will produce a horizontal rule:\n\n * * *\n\n ***\n\n *****\n\n - - -\n\n ---------------------------------------\n\n\n* * *\n" - "\n

    Span Elements

    \n\n

    Links

    \n\nMarkdown supports two style of links: *inline* and *reference*.\n\nIn both styles, the link text is delimited by [square brackets].\n\nTo create an inline link, use a set of regular parentheses immediately\nafter the link text's closing square bracket. Inside the parentheses,\nput the URL where you want the link to point, along with an *optional*\ntitle for the link, surrounded in quotes. For example:\n\n This is [an example](http://example.com/ \"Title\") inline link.\n\n [This link](http://example.net/) has no title attribute.\n\nWill produce:\n\n

    This is \n an example inline link.

    \n\n

    This link has no\n title attribute.

    \n\nIf you're referring to a local resource on the same server, you can\nuse relative paths:\n\n See my [About](/about/) page for details.\n\nReference-style links use a second set of square brackets, inside\nwhich you place a label of your choosing to identify the link:\n\n This is [an example][id] reference-style link.\n\nYou can optionally use a space to separate the sets of brackets:\n\n This is [an example] [id] reference-style link.\n\nThen, anywhere in the document, you define your link label like this,\non a line by itself:\n\n [id]: http://example.com/ \"Optional Title Here\"\n\nThat is:\n\n* Square brackets containing the link identifier (optionally\n indented from the left margin using up to three spaces);\n* followed by a colon;\n* followed by one or more spaces (or tabs);\n* followed by the URL for the link;\n* optionally followed by a title attribute for the link, enclosed\n in double or single quotes, or enclosed in parentheses.\n\nThe following three link definitions are equivalent:\n\n\t[foo]: http://example.com/ \"Optional Title Here\"\n\t[foo]: http://example.com/ 'Optional Title Here'\n\t[foo]: http://example.com/ (Optional Title Here)\n\n**Note:** There is a known bug in Markdown.pl 1.0.1 which prevents\nsingle quotes from being used to delimit link titles.\n\nThe link URL may, optionally, be surrounded by angle brackets:\n\n [id]: \"Optional Title Here\"\n\nYou can put the title attribute on the next line and use extra spaces\nor tabs for padding, which tends to look better with longer URLs:\n\n [id]: http://example.com/longish/path/to/resource/here\n \"Optional Title Here\"\n\nLink definitions are only used for creating links during Markdown\nprocessing, and are stripped from your document in the HTML output.\n\nLink definition names may consist of letters, numbers, spaces, and\npunctuation -- but they are *not* case sensitive. E.g. 
these two\nlinks:\n\n\t[link text][a]\n\t[link text][A]\n\nare equivalent.\n\nThe *implicit link name* shortcut allows you to omit the name of the\nlink, in which case the link text itself is used as the name.\nJust use an empty set of square brackets -- e.g., to link the word\n\"Google\" to the google.com web site, you could simply write:\n\n\t[Google][]\n\nAnd then define the link:\n\n\t[Google]: http://google.com/\n\nBecause link names may contain spaces, this shortcut even works for\nmultiple words in the link text:\n\n\tVisit [Daring Fireball][] for more information.\n\nAnd then define the link:\n\n\t[Daring Fireball]: http://daringfireball.net/\n\nLink definitions can be placed anywhere in your Markdown document. I\ntend to put them immediately after each paragraph in which they're\nused, but if you want, you can put them all at the end of your\ndocument, sort of like footnotes.\n\nHere's an example of reference links in action:\n\n I get 10 times more traffic from [Google] [1] than from\n [Yahoo] [2] or [MSN] [3].\n\n [1]: http://google.com/ \"Google\"\n [2]: http://search.yahoo.com/ \"Yahoo Search\"\n [3]: http://search.msn.com/ \"MSN Search\"\n\nUsing the implicit link name shortcut, you could instead write:\n\n " - "I get 10 times more traffic from [Google][] than from\n [Yahoo][] or [MSN][].\n\n [google]: http://google.com/ \"Google\"\n [yahoo]: http://search.yahoo.com/ \"Yahoo Search\"\n [msn]: http://search.msn.com/ \"MSN Search\"\n\nBoth of the above examples will produce the following HTML output:\n\n

    I get 10 times more traffic from Google than from\n Yahoo\n or MSN.

    \n\nFor comparison, here is the same paragraph written using\nMarkdown's inline link style:\n\n I get 10 times more traffic from [Google](http://google.com/ \"Google\")\n than from [Yahoo](http://search.yahoo.com/ \"Yahoo Search\") or\n [MSN](http://search.msn.com/ \"MSN Search\").\n\nThe point of reference-style links is not that they're easier to\nwrite. The point is that with reference-style links, your document\nsource is vastly more readable. Compare the above examples: using\nreference-style links, the paragraph itself is only 81 characters\nlong; with inline-style links, it's 176 characters; and as raw HTML,\nit's 234 characters. In the raw HTML, there's more markup than there\nis text.\n\nWith Markdown's reference-style links, a source document much more\nclosely resembles the final output, as rendered in a browser. By\nallowing you to move the markup-related metadata out of the paragraph,\nyou can add links without interrupting the narrative flow of your\nprose.\n\n\n

    Emphasis

    \n\nMarkdown treats asterisks (`*`) and underscores (`_`) as indicators of\nemphasis. Text wrapped with one `*` or `_` will be wrapped with an\nHTML `` tag; double `*`'s or `_`'s will be wrapped with an HTML\n`` tag. E.g., this input:\n\n *single asterisks*\n\n _single underscores_\n\n **double asterisks**\n\n __double underscores__\n\nwill produce:\n\n single asterisks\n\n single underscores\n\n double asterisks\n\n double underscores\n\nYou can use whichever style you prefer; the lone restriction is that\nthe same character must be used to open and close an emphasis span.\n\nEmphasis can be used in the middle of a word:\n\n un*frigging*believable\n\nBut if you surround an `*` or `_` with spaces, it'll be treated as a\nliteral asterisk or underscore.\n\nTo produce a literal asterisk or underscore at a position where it\nwould otherwise be used as an emphasis delimiter, you can backslash\nescape it:\n\n \\*this text is surrounded by literal asterisks\\*\n\n\n\n

    Code

    \n\nTo indicate a span of code, wrap it with backtick quotes (`` ` ``).\nUnlike a pre-formatted code block, a code span indicates code within a\nnormal paragraph. For example:\n\n Use the `printf()` function.\n\nwill produce:\n\n

    Use the printf() function.

    \n\nTo include a literal backtick character within a code span, you can use\nmultiple backticks as the opening and closing delimiters:\n\n ``There is a literal backtick (`) here.``\n\nwhich will produce this:\n\n

    There is a literal backtick (`) here.

    \n\nThe backtick delimiters surrounding a code span may include spaces --\none after the opening, one before the closing. This allows you to place\nliteral backtick characters at the beginning or end of a code span:\n\n\tA single backtick in a code span: `` ` ``\n\n\tA backtick-delimited string in a code span: `` `foo` ``\n\nwill produce:\n\n\t

    A single backtick in a code span: `

    \n\n\t

    A backtick-delimited string in a code span: `foo`

    \n\nWith a code span, ampersands and angle brackets are encoded as HTML\nentities automatically, which makes it easy to include example HTML\ntags. Markdown will turn this:\n\n Please don't use any `` tags.\n\ninto:\n\n

    Please don't use any <blink> tags.

    \n\nYou can write this:\n\n " - "`—` is the decimal-encoded equivalent of `—`.\n\nto produce:\n\n

    &#8212; is the decimal-encoded\n equivalent of &mdash;.

    \n\n\n\n

    Images

    \n\nAdmittedly, it's fairly difficult to devise a \"natural\" syntax for\nplacing images into a plain text document format.\n\nMarkdown uses an image syntax that is intended to resemble the syntax\nfor links, allowing for two styles: *inline* and *reference*.\n\nInline image syntax looks like this:\n\n ![Alt text](/path/to/img.jpg)\n\n ![Alt text](/path/to/img.jpg \"Optional title\")\n\nThat is:\n\n* An exclamation mark: `!`;\n* followed by a set of square brackets, containing the `alt`\n attribute text for the image;\n* followed by a set of parentheses, containing the URL or path to\n the image, and an optional `title` attribute enclosed in double\n or single quotes.\n\nReference-style image syntax looks like this:\n\n ![Alt text][id]\n\nWhere \"id\" is the name of a defined image reference. Image references\nare defined using syntax identical to link references:\n\n [id]: url/to/image \"Optional title attribute\"\n\nAs of this writing, Markdown has no syntax for specifying the\ndimensions of an image; if this is important to you, you can simply\nuse regular HTML `` tags.\n\n\n* * *\n\n\n

    Miscellaneous

    \n\n

    Automatic Links

    \n\nMarkdown supports a shortcut style for creating \"automatic\" links for URLs and email addresses: simply surround the URL or email address with angle brackets. What this means is that if you want to show the actual text of a URL or email address, and also have it be a clickable link, you can do this:\n\n \n\nMarkdown will turn this into:\n\n http://example.com/\n\nAutomatic links for email addresses work similarly, except that\nMarkdown will also perform a bit of randomized decimal and hex\nentity-encoding to help obscure your address from address-harvesting\nspambots. For example, Markdown will turn this:\n\n \n\ninto something like this:\n\n address@exa\n mple.com\n\nwhich will render in a browser as a clickable link to \"address@example.com\".\n\n(This sort of entity-encoding trick will indeed fool many, if not\nmost, address-harvesting bots, but it definitely won't fool all of\nthem. It's better than nothing, but an address published in this way\nwill probably eventually start receiving spam.)\n\n\n\n

    Backslash Escapes

    \n\nMarkdown allows you to use backslash escapes to generate literal\ncharacters which would otherwise have special meaning in Markdown's\nformatting syntax. For example, if you wanted to surround a word\nwith literal asterisks (instead of an HTML `` tag), you can use\nbackslashes before the asterisks, like this:\n\n \\*literal asterisks\\*\n\nMarkdown provides backslash escapes for the following characters:\n\n \\ backslash\n ` backtick\n * asterisk\n _ underscore\n {} curly braces\n [] square brackets\n () parentheses\n # hash mark\n\t+\tplus sign\n\t-\tminus sign (hyphen)\n . dot\n ! exclamation mark\n" diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@markdown_syntax.md.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@markdown_syntax.md.snap index 523be59b..3a15a762 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@markdown_syntax.md.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@markdown_syntax.md.snap @@ -346,9 +346,8 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "> > This is nested blockquote.\n >\n" - " > Back to the first level.\n\n" - "Blockquotes can contain other Markdown elements, including headers" -- ", lists,\nand code blocks:\n\n" -- "\t> ## This is a header.\n\t>\n" -- "\t>" +- ", lists,\nand code blocks:\n\n\t> " +- "## This is a header.\n\t>\n\t>" - " 1. " - "This is the first list item.\n\t>" - " 2. " @@ -509,8 +508,7 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "line. " - "To avoid this, you can backslash-escape" - " the period:\n\n " -- "1986\\. What a great season.\n" -- "\n\n\n" +- "1986\\. What a great season.\n\n\n\n" - "

    Code Blocks\n\n" - Pre-formatted code blocks are used for writing about diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md-2.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md-2.snap index 62935d90..b8f93f76 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md-2.snap @@ -5,11 +5,10 @@ input_file: tests/inputs/markdown/github_flavored.md --- - "# Headers\n\n```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n```" - "# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n\n------" -- "# Emphasis" -- "```\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n```" -- "Emphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n\n------" -- "# Lists" -- "```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. 
And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items." +- "# Emphasis\n\n```\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_" +- "~~Strikethrough~~\n```\n\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_" +- "~~Strikethrough~~\n\n------" +- "# Lists\n\n```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items." - "Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅" - "⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. 
Open a pull request\n * Describe my changes" - "* Mention all the members of my team\n * Ask for feedback\n\n+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n```" @@ -21,28 +20,25 @@ input_file: tests/inputs/markdown/github_flavored.md - "# Task lists\n\n```\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [x] this is a complete item\n- [ ] this is an incomplete item\n```" - "- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [ ] this is a complete item\n- [ ] this is an incomplete item\n\n------" - "# Ignoring Markdown formatting\n\nYou can tell GitHub to ignore (or escape) Markdown formatting by using \\ before the Markdown character.\n\n```\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n```\n\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n\n------" -- "# Links" -- "```\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself]." 
+- "# Links\n\n```\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself]." - "URLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n```" - "[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself]." 
- "URLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n\n------" -- "# Images" -- "```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![" +- "# Images\n\n```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![" - "Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n```" - "Here's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"" - "![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have 
a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n\n------" -- "# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n```" -- "Footnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n\n------" -- "# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\nInline `code` has `back-ticks around` it." -- "```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n [Obsolete(\"...\")]\n class Program : IInterface\n {\n public static List JustDoIt(int count)\n {\n Console.WriteLine($\"Hello {Name}!\");\n return new List(new int[] { 1, 2, 3 })\n }\n }\n}\n```" +- "# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n```\n\nFootnote 1 link[^first].\n\nFootnote 2 link[^second]." 
+- "Inline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n\n------" +- "# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\nInline `code` has `back-ticks around` it.\n\n```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n [Obsolete(\"...\")]\n class Program : IInterface\n {\n public static List JustDoIt(int count)\n {\n Console.WriteLine($\"Hello {Name}!\");" +- "return new List(new int[] { 1, 2, 3 })\n }\n }\n}\n```" - "```css\n@font-face {\n font-family: Chunkfive; src: url('Chunkfive.otf');\n}\n\nbody, .usertext {\n color: #F0F0F0; background: #600;\n font-family: Chunkfive, sans;\n}\n\n@import url(print.css);\n@media print {\n a[href^=http]::after {\n content: attr(href)\n }\n}\n```" - "```javascript\nfunction $initHighlight(block, cls) {\n try {\n if (cls.search(/\\bno\\-highlight\\b/) != -1)\n return process(block, true, 0x0F) +\n ` class=\"${cls}\"`;\n } catch (e) {\n /* handle exception */\n }\n for (var i = 0 / 2; i < classes.length; i++) {\n if (checkCondition(classes[i]) === undefined)" - "console.log('undefined');\n }\n}\n\nexport $initHighlight;\n```" - "```php\nrequire_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory\n{\n abstract function test();\n\n public static $st1 = 1;\n const ME = \"Yo\";\n var $list = NULL;\n private $var;\n\n /**\n * Returns a URI\n *\n * @return URI\n */" - "static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? 
$uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check\n if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }" - "$this->var = 0 - self::$st;\n $this->list = list(Array(\"1\"=> 2, 2=>self::ME, 3 => \\Location\\Web\\URI::class));\n\n return [\n 'uri' => $uri,\n 'value' => null,\n ];\n }\n}\n\necho URI::ME . URI::$st1;\n\n__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------" -- "# Tables" -- "```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell." +- "# Tables\n\n```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell." - "The outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. 
You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |" - "| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |" - "| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n```\n\nColons can be used to align columns." @@ -51,15 +47,13 @@ input_file: tests/inputs/markdown/github_flavored.md - "| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |" - "| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |" - "| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------" -- "# Blockquotes" -- "```\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. 
Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested..." +- "# Blockquotes\n\n```\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested..." - ">> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n```\n\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break." - "> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n\n------" -- "# Inline HTML\n\n```\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n```" -- "
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n\n------\n\n# Horizontal Rules\n\n```\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n```\n\nThree or more...\n\n---\n\nHyphens\n\n***" -- "Asterisks\n\n___\n\nUnderscores\n\n------" -- "# YouTube Videos\n\n```\n\n\"IMAGE\n\n```" -- "\n\"IMAGE\n" +- "# Inline HTML\n\n```\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n```\n\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    " +- "
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n\n------\n\n# Horizontal Rules\n\n```\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n```\n\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n\n------" +- "# YouTube Videos\n\n```\n\n\"IMAGE\n\n```\n\n\n\"IMAGE\n" - "```\n[![IMAGE ALT TEXT HERE](http://img.youtube.com/vi/YOUTUBE_VIDEO_ID_HERE/0.jpg)](http://www.youtube.com/watch?v=YOUTUBE_VIDEO_ID_HERE)\n```" - "[![IMAGE ALT TEXT HERE](https://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/YouTube_logo_2015.svg/1200px-YouTube_logo_2015.svg.png)](https://www.youtube.com/watch?v=ciawICBvQoE)" diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md.snap index 3a413b5f..b7f3eb7d 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md.snap @@ -3,23 +3,25 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/github_flavored.md --- -- "# Headers" -- "```\n# h1 Heading 8-)" +- "# Headers\n\n```" +- "# h1 Heading 8-)" - "## h2 Heading\n### h3 Heading" - "#### h4 Heading\n##### h5 Heading" - "###### h6 Heading" - "Alternatively, for H1 and H2, an" - "underline-ish style:\n\nAlt-H1\n======" -- "Alt-H2\n------\n```\n\n#" -- "h1 Heading 8-)\n##" -- "h2 Heading\n### h3 Heading\n####" -- "h4 Heading\n##### h5 Heading\n######" -- h6 Heading +- "Alt-H2\n------\n```" +- "# h1 Heading 8-)" +- "## h2 Heading" +- "### h3 Heading" +- "#### h4 Heading" +- "##### h5 Heading" +- "###### h6 Heading" - "Alternatively, for H1 and H2, an" -- "underline-ish style:\n\nAlt-H1\n======" -- "Alt-H2\n------\n\n------" -- "# Emphasis" -- "```" +- "underline-ish style:" +- "Alt-H1\n======\n\nAlt-H2" +- "------\n\n------" +- "# Emphasis\n\n```" - "Emphasis, aka italics, with *" - 
asterisks* or _underscores_. - "Strong emphasis, aka bold, with **asterisks" @@ -46,9 +48,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "*This is italic text*" - _This is italic text_ - "~~Strikethrough~~\n\n------" -- "# Lists" -- "```\n1. First ordered list item" -- 2. Another item +- "# Lists\n\n```\n1." +- "First ordered list item\n2. Another item" - ⋅⋅* Unordered sub - "-list.\n1." - "Actual numbers don't matter, just that it's" @@ -132,8 +133,8 @@ input_file: tests/inputs/markdown/github_flavored.md - aliquet - "- Nulla volutpat aliquam velit" - "+ Very easy!\n\n------" -- "# Task lists" -- "```\n- [x] Finish my changes" +- "# Task lists\n\n```" +- "- [x] Finish my changes" - "- [ ] Push my commits to GitHub" - "- [ ] Open a pull request" - "- [x] @mentions, #refs," @@ -161,8 +162,7 @@ input_file: tests/inputs/markdown/github_flavored.md - "to \\*our-old-project\\*.\n```" - "Let's rename \\*our-new-project\\*" - "to \\*our-old-project\\*.\n\n------" -- "# Links" -- "```" +- "# Links\n\n```" - "[I'm an inline-style link](https://" - www.google.com) - "[I'm an inline-style link with title](https" @@ -213,8 +213,7 @@ input_file: tests/inputs/markdown/github_flavored.md - "[1]: http://slashdot.org" - "[link text itself]: http://www.reddit.com" - "------" -- "# Images" -- "```" +- "# Images\n\n```" - "Here's our logo (hover to see the title" - "text):\n\nInline-style:\n![" - "alt text](https://github.com/adam-p" @@ -263,9 +262,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "[id]: https://octodex.github.com/images" - "/dojocat.jpg \"The" - "Dojocat\"\n\n------" -- "#" -- "[Footnotes](https://github.com/markdown" -- "-it/markdown-it-footnote)" +- "# [Footnotes](https://github.com/" +- markdown-it/markdown-it-footnote) - "```\nFootnote 1 link[^first]." - "Footnote 2 link[^second]." - "Inline footnote^[Text of inline footnote] definition." 
@@ -280,8 +278,7 @@ input_file: tests/inputs/markdown/github_flavored.md - "[^first]: Footnote **can have markup**" - and multiple paragraphs. - "[^second]: Footnote text.\n\n------" -- "# Code and Syntax Highlighting" -- "```" +- "# Code and Syntax Highlighting\n\n```" - "Inline `code` has `back-ticks around" - "` it.\n```" - "Inline `code` has `back-ticks around" @@ -357,8 +354,7 @@ input_file: tests/inputs/markdown/github_flavored.md - "echo URI::ME . URI::$st1;" - "__halt_compiler () ; datahere\ndatahere" - "datahere */\ndatahere\n```\n\n------" -- "# Tables" -- "```" +- "# Tables\n\n```" - Colons can be used to align columns. - "| Tables | Are | Cool |" - "| ------------- |:-------------:| -----:|" @@ -446,8 +442,7 @@ input_file: tests/inputs/markdown/github_flavored.md - "| --- | --- |" - "| Backtick | ` |" - "| Pipe | \\| |\n\n------" -- "# Blockquotes" -- "```" +- "# Blockquotes\n\n```" - "> Blockquotes are very handy in email to emulate" - reply text. - "> This line is part of the same quote." @@ -477,15 +472,15 @@ input_file: tests/inputs/markdown/github_flavored.md - ">> ...by using additional greater-than signs right next" - "to each other...\n> >" - "> ...or with spaces between arrows.\n\n------" -- "# Inline HTML" -- "```\n
    " +- "# Inline HTML\n\n```\n
    " - "
    Definition list
    " - "
    Is something people use sometimes.
    " - "
    Markdown in HTML
    " - "
    Does *not* work **very**" - well. - Use HTML tags.
    \n
    \n```\n\n
    " +- ">\n
    \n```" +- "
    " - "
    Definition list
    " - "
    Is something people use sometimes.
    " - "
    Markdown in HTML
    " @@ -493,14 +488,12 @@ input_file: tests/inputs/markdown/github_flavored.md - well. - Use HTML tags.\n
    \n\n------" -- "# Horizontal Rules" -- "```\nThree or more...\n\n---\n\nHyphens" -- "***\n\nAsterisks\n\n___\n\nUnderscores" -- "```\n\nThree or more...\n\n---\n\nHyphens" -- "***\n\nAsterisks\n\n___\n\nUnderscores" -- "------" -- "# YouTube Videos" -- "```" +- "# Horizontal Rules\n\n```\nThree or more..." +- "---\n\nHyphens\n\n***\n\nAsterisks" +- "___\n\nUnderscores\n```" +- "Three or more...\n\n---\n\nHyphens\n\n***" +- "Asterisks\n\n___\n\nUnderscores\n\n------" +- "# YouTube Videos\n\n```" - "\n
  • Main
  • \n
  • Basics
  • \n
  • Syntax
  • " -- "
  • License
  • \n
  • Dingus
  • \n\n\n\nGetting the Gist of Markdown's Formatting Syntax\n------------------------------------------------" -- "This page offers a brief overview of what it's like to use Markdown.\nThe [syntax page] [s] provides complete, detailed documentation for\nevery feature, but Markdown should be very easy to pick up simply by\nlooking at a few examples of it in action. The examples on this page\nare written in a before/after style, showing example syntax and the\nHTML output produced by Markdown." +- "
  • License
  • \n
  • Dingus
  • \n" +- "Getting the Gist of Markdown's Formatting Syntax\n------------------------------------------------\n\nThis page offers a brief overview of what it's like to use Markdown.\nThe [syntax page] [s] provides complete, detailed documentation for\nevery feature, but Markdown should be very easy to pick up simply by\nlooking at a few examples of it in action. The examples on this page\nare written in a before/after style, showing example syntax and the\nHTML output produced by Markdown." - "It's also helpful to simply try Markdown out; the [Dingus] [d] is a\nweb application that allows you type your own Markdown-formatted text\nand translate it to XHTML.\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL] [src]." -- "[s]: /projects/markdown/syntax \"Markdown Syntax\"\n [d]: /projects/markdown/dingus \"Markdown Dingus\"\n [src]: /projects/markdown/basics.text\n\n\n## Paragraphs, Headers, Blockquotes ##" -- "A paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like\na blank line -- a line containing nothing but spaces or tabs is\nconsidered blank.) Normal paragraphs should not be indented with\nspaces or tabs." +- "[s]: /projects/markdown/syntax \"Markdown Syntax\"\n [d]: /projects/markdown/dingus \"Markdown Dingus\"\n [src]: /projects/markdown/basics.text" +- "## Paragraphs, Headers, Blockquotes ##\n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like\na blank line -- a line containing nothing but spaces or tabs is\nconsidered blank.) Normal paragraphs should not be indented with\nspaces or tabs." - "Markdown offers two styles of headers: *Setext* and *atx*.\nSetext-style headers for `

    ` and `

    ` are created by\n\"underlining\" with equal signs (`=`) and hyphens (`-`), respectively.\nTo create an atx-style header, you put 1-6 hash marks (`#`) at the\nbeginning of the line -- the number of hashes equals the resulting\nHTML header level." - "Blockquotes are indicated using email-style '`>`' angle brackets.\n\nMarkdown:" - "A First Level Header\n ====================\n\n A Second Level Header\n ---------------------\n\n Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.\n\n The quick brown fox jumped over the lazy\n dog's back.\n\n ### Header 3\n\n > This is a blockquote.\n >\n > This is the second paragraph in the blockquote.\n >" - "> ## This is an H2 in a blockquote\n\n\nOutput:" - "

    A First Level Header

    \n\n

    A Second Level Header

    \n\n

    Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.

    \n\n

    The quick brown fox jumped over the lazy\n dog's back.

    \n\n

    Header 3

    \n\n
    " -- "

    This is a blockquote.

    \n\n

    This is the second paragraph in the blockquote.

    \n\n

    This is an H2 in a blockquote

    \n
    \n\n\n\n### Phrase Emphasis ###\n\nMarkdown uses asterisks and underscores to indicate spans of emphasis.\n\nMarkdown:" -- "Some of these words *are emphasized*.\n Some of these words _are emphasized also_.\n\n Use two asterisks for **strong emphasis**.\n Or, if you prefer, __use two underscores instead__.\n\nOutput:" +- "

    This is a blockquote.

    \n\n

    This is the second paragraph in the blockquote.

    \n\n

    This is an H2 in a blockquote

    \n " +- "### Phrase Emphasis ###\n\nMarkdown uses asterisks and underscores to indicate spans of emphasis.\n\nMarkdown:\n\n Some of these words *are emphasized*.\n Some of these words _are emphasized also_.\n\n Use two asterisks for **strong emphasis**.\n Or, if you prefer, __use two underscores instead__.\n\nOutput:" - "

    Some of these words are emphasized.\n Some of these words are emphasized also.

    \n\n

    Use two asterisks for strong emphasis.\n Or, if you prefer, use two underscores instead.

    " - "## Lists ##\n\nUnordered (bulleted) lists use asterisks, pluses, and hyphens (`*`,\n`+`, and `-`) as list markers. These three markers are\ninterchangable; this:\n\n * Candy.\n * Gum.\n * Booze.\n\nthis:\n\n + Candy.\n + Gum.\n + Booze.\n\nand this:\n\n - Candy.\n - Gum.\n - Booze." - "all produce the same output:\n\n
      \n
    • Candy.
    • \n
    • Gum.
    • \n
    • Booze.
    • \n
    \n\nOrdered (numbered) lists use regular numbers, followed by periods, as\nlist markers:\n\n 1. Red\n 2. Green\n 3. Blue\n\nOutput:" @@ -25,8 +25,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - "Optionally, you may include a title attribute in the parentheses:\n\n This is an [example link](http://example.com/ \"With a Title\").\n\nOutput:\n\n

    This is an \n example link.

    \n\nReference-style links allow you to refer to your links by names, which\nyou define elsewhere in your document:" - "I get 10 times more traffic from [Google][1] than from\n [Yahoo][2] or [MSN][3].\n\n [1]: http://google.com/ \"Google\"\n [2]: http://search.yahoo.com/ \"Yahoo Search\"\n [3]: http://search.msn.com/ \"MSN Search\"\n\nOutput:" - "

    I get 10 times more traffic from Google than from Yahoo or MSN.

    \n\nThe title attribute is optional. Link names may contain letters,\nnumbers and spaces, but are *not* case sensitive:" -- "I start my morning with a cup of coffee and\n [The New York Times][NY Times].\n\n [ny times]: http://www.nytimes.com/\n\nOutput:\n\n

    I start my morning with a cup of coffee and\n The New York Times.

    \n\n\n### Images ###\n\nImage syntax is very much like link syntax.\n\nInline (titles are optional):" -- "![alt text](/path/to/img.jpg \"Title\")\n\nReference-style:\n\n ![alt text][id]\n\n [id]: /path/to/img.jpg \"Title\"\n\nBoth of the above examples produce the same output:\n\n \"alt" +- "I start my morning with a cup of coffee and\n [The New York Times][NY Times].\n\n [ny times]: http://www.nytimes.com/\n\nOutput:\n\n

    I start my morning with a cup of coffee and\n The New York Times.

    " +- "### Images ###\n\nImage syntax is very much like link syntax.\n\nInline (titles are optional):\n\n ![alt text](/path/to/img.jpg \"Title\")\n\nReference-style:\n\n ![alt text][id]\n\n [id]: /path/to/img.jpg \"Title\"\n\nBoth of the above examples produce the same output:\n\n \"alt" - "### Code ###\n\nIn a regular paragraph, you can create code span by wrapping text in\nbacktick quotes. Any ampersands (`&`) and angle brackets (`<` or\n`>`) will automatically be translated into HTML entities. This makes\nit easy to use Markdown to write about HTML example code:" - "I strongly recommend against using any `` tags.\n\n I wish SmartyPants used named entities like `—`\n instead of decimal-encoded entities like `—`.\n\nOutput:" - "

    I strongly recommend against using any\n <blink> tags.

    \n\n

    I wish SmartyPants used named entities like\n &mdash; instead of decimal-encoded\n entities like &#8212;.

    " diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@markdown_basics.md-3.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@markdown_basics.md-3.snap index 7b4f4893..86754a63 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@markdown_basics.md-3.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@markdown_basics.md-3.snap @@ -4,6 +4,6 @@ expression: chunks input_file: tests/inputs/markdown/markdown_basics.md --- - "Markdown: Basics\n================\n\n\n\n\nGetting the Gist of Markdown's Formatting Syntax\n------------------------------------------------\n\nThis page offers a brief overview of what it's like to use Markdown.\nThe [syntax page] [s] provides complete, detailed documentation for\nevery feature, but Markdown should be very easy to pick up simply by\nlooking at a few examples of it in action. The examples on this page\nare written in a before/after style, showing example syntax and the\nHTML output produced by Markdown.\n\nIt's also helpful to simply try Markdown out; the [Dingus] [d] is a\nweb application that allows you type your own Markdown-formatted text\nand translate it to XHTML.\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL] [src].\n\n [s]: /projects/markdown/syntax \"Markdown Syntax\"\n [d]: /projects/markdown/dingus \"Markdown Dingus\"\n [src]: /projects/markdown/basics.text\n\n\n## Paragraphs, Headers, Blockquotes ##\n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like\na blank line -- a line containing nothing but spaces or tabs is\nconsidered blank.) Normal paragraphs should not be indented with\nspaces or tabs.\n\nMarkdown offers two styles of headers: *Setext* and *atx*.\nSetext-style headers for `

    ` and `

    ` are created by\n\"underlining\" with equal signs (`=`) and hyphens (`-`), respectively.\nTo create an atx-style header, you put 1-6 hash marks (`#`) at the\nbeginning of the line -- the number of hashes equals the resulting\nHTML header level.\n\nBlockquotes are indicated using email-style '`>`' angle brackets.\n\nMarkdown:\n\n A First Level Header\n ====================\n\n A Second Level Header\n ---------------------\n\n Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.\n\n The quick brown fox jumped over the lazy\n dog's back.\n\n ### Header 3\n\n > This is a blockquote.\n >\n > This is the second paragraph in the blockquote.\n >\n > ## This is an H2 in a blockquote\n\n\nOutput:\n\n

    A First Level Header

    \n\n

    A Second Level Header

    \n\n

    Now is the time for all good men to come to\n the aid of their country. This is just a\n regular paragraph.

    \n\n

    The quick brown fox jumped over the lazy\n dog's back.

    \n\n

    Header 3

    \n\n
    \n

    This is a blockquote.

    \n\n

    This is the second paragraph in the blockquote.

    \n\n

    This is an H2 in a blockquote

    \n
    \n\n\n\n### Phrase Emphasis ###\n\nMarkdown uses asterisks and underscores to indicate spans of emphasis.\n\nMarkdown:\n\n Some of these words *are emphasized*.\n Some of these words _are emphasized also_.\n\n Use two asterisks for **strong emphasis**.\n Or, if you prefer, __use two underscores instead__.\n\nOutput:\n\n

    Some of these words are emphasized.\n Some of these words are emphasized also.

    \n\n

    Use two asterisks for strong emphasis.\n Or, if you prefer, use two underscores instead.

    \n\n\n\n## Lists ##\n\nUnordered (bulleted) lists use asterisks, pluses, and hyphens (`*`,\n`+`, and `-`) as list markers. These three markers are\ninterchangable; this:\n\n * Candy.\n * Gum.\n * Booze.\n\nthis:\n\n + Candy.\n + Gum.\n + Booze." -- "and this:\n\n - Candy.\n - Gum.\n - Booze.\n\nall produce the same output:\n\n
      \n
    • Candy.
    • \n
    • Gum.
    • \n
    • Booze.
    • \n
    \n\nOrdered (numbered) lists use regular numbers, followed by periods, as\nlist markers:\n\n 1. Red\n 2. Green\n 3. Blue\n\nOutput:\n\n
      \n
    1. Red
    2. \n
    3. Green
    4. \n
    5. Blue
    6. \n
    \n\nIf you put blank lines between items, you'll get `

    ` tags for the\nlist item text. You can create multi-paragraph list items by indenting\nthe paragraphs by 4 spaces or 1 tab:\n\n * A list item.\n\n With multiple paragraphs.\n\n * Another item in the list.\n\nOutput:\n\n

      \n
    • A list item.

      \n

      With multiple paragraphs.

    • \n
    • Another item in the list.

    • \n
    \n\n\n\n### Links ###\n\nMarkdown supports two styles for creating links: *inline* and\n*reference*. With both styles, you use square brackets to delimit the\ntext you want to turn into a link.\n\nInline-style links use parentheses immediately after the link text.\nFor example:\n\n This is an [example link](http://example.com/).\n\nOutput:\n\n

    This is an \n example link.

    \n\nOptionally, you may include a title attribute in the parentheses:\n\n This is an [example link](http://example.com/ \"With a Title\").\n\nOutput:\n\n

    This is an \n example link.

    \n\nReference-style links allow you to refer to your links by names, which\nyou define elsewhere in your document:\n\n I get 10 times more traffic from [Google][1] than from\n [Yahoo][2] or [MSN][3].\n\n [1]: http://google.com/ \"Google\"\n [2]: http://search.yahoo.com/ \"Yahoo Search\"\n [3]: http://search.msn.com/ \"MSN Search\"\n\nOutput:\n\n

    I get 10 times more traffic from Google than from Yahoo or MSN.

    \n\nThe title attribute is optional. Link names may contain letters,\nnumbers and spaces, but are *not* case sensitive:\n\n I start my morning with a cup of coffee and\n [The New York Times][NY Times].\n\n [ny times]: http://www.nytimes.com/\n\nOutput:\n\n

    I start my morning with a cup of coffee and\n The New York Times.

    \n\n\n### Images ###\n\nImage syntax is very much like link syntax.\n\nInline (titles are optional):\n\n ![alt text](/path/to/img.jpg \"Title\")\n\nReference-style:\n\n ![alt text][id]\n\n [id]: /path/to/img.jpg \"Title\"\n\nBoth of the above examples produce the same output:\n\n \"alt\n\n\n\n### Code ###\n\nIn a regular paragraph, you can create code span by wrapping text in\nbacktick quotes. Any ampersands (`&`) and angle brackets (`<` or\n`>`) will automatically be translated into HTML entities. This makes\nit easy to use Markdown to write about HTML example code:\n\n I strongly recommend against using any `` tags.\n\n I wish SmartyPants used named entities like `—`\n instead of decimal-encoded entities like `—`.\n\nOutput:\n\n

    I strongly recommend against using any\n <blink> tags.

    \n\n

    I wish SmartyPants used named entities like\n &mdash; instead of decimal-encoded\n entities like &#8212;.

    \n\n\nTo specify an entire block of pre-formatted code, indent every line of\nthe block by 4 spaces or 1 tab. Just like with code spans, `&`, `<`,\nand `>` characters will be escaped automatically.\n\nMarkdown:" -- "If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:\n\n
    \n

    For example.

    \n
    \n\nOutput:\n\n

    If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:

    \n\n
    <blockquote>\n        <p>For example.</p>\n    </blockquote>\n    
    " +- "and this:\n\n - Candy.\n - Gum.\n - Booze.\n\nall produce the same output:\n\n
      \n
    • Candy.
    • \n
    • Gum.
    • \n
    • Booze.
    • \n
    \n\nOrdered (numbered) lists use regular numbers, followed by periods, as\nlist markers:\n\n 1. Red\n 2. Green\n 3. Blue\n\nOutput:\n\n
      \n
    1. Red
    2. \n
    3. Green
    4. \n
    5. Blue
    6. \n
    \n\nIf you put blank lines between items, you'll get `

    ` tags for the\nlist item text. You can create multi-paragraph list items by indenting\nthe paragraphs by 4 spaces or 1 tab:\n\n * A list item.\n\n With multiple paragraphs.\n\n * Another item in the list.\n\nOutput:\n\n

      \n
    • A list item.

      \n

      With multiple paragraphs.

    • \n
    • Another item in the list.

    • \n
    \n\n\n\n### Links ###\n\nMarkdown supports two styles for creating links: *inline* and\n*reference*. With both styles, you use square brackets to delimit the\ntext you want to turn into a link.\n\nInline-style links use parentheses immediately after the link text.\nFor example:\n\n This is an [example link](http://example.com/).\n\nOutput:\n\n

    This is an \n example link.

    \n\nOptionally, you may include a title attribute in the parentheses:\n\n This is an [example link](http://example.com/ \"With a Title\").\n\nOutput:\n\n

    This is an \n example link.

    \n\nReference-style links allow you to refer to your links by names, which\nyou define elsewhere in your document:\n\n I get 10 times more traffic from [Google][1] than from\n [Yahoo][2] or [MSN][3].\n\n [1]: http://google.com/ \"Google\"\n [2]: http://search.yahoo.com/ \"Yahoo Search\"\n [3]: http://search.msn.com/ \"MSN Search\"\n\nOutput:\n\n

    I get 10 times more traffic from Google than from Yahoo or MSN.

    \n\nThe title attribute is optional. Link names may contain letters,\nnumbers and spaces, but are *not* case sensitive:\n\n I start my morning with a cup of coffee and\n [The New York Times][NY Times].\n\n [ny times]: http://www.nytimes.com/\n\nOutput:\n\n

    I start my morning with a cup of coffee and\n The New York Times.

    \n\n\n### Images ###\n\nImage syntax is very much like link syntax.\n\nInline (titles are optional):\n\n ![alt text](/path/to/img.jpg \"Title\")\n\nReference-style:\n\n ![alt text][id]\n\n [id]: /path/to/img.jpg \"Title\"\n\nBoth of the above examples produce the same output:\n\n \"alt" +- "### Code ###\n\nIn a regular paragraph, you can create code span by wrapping text in\nbacktick quotes. Any ampersands (`&`) and angle brackets (`<` or\n`>`) will automatically be translated into HTML entities. This makes\nit easy to use Markdown to write about HTML example code:\n\n I strongly recommend against using any `` tags.\n\n I wish SmartyPants used named entities like `—`\n instead of decimal-encoded entities like `—`.\n\nOutput:\n\n

    I strongly recommend against using any\n <blink> tags.

    \n\n

    I wish SmartyPants used named entities like\n &mdash; instead of decimal-encoded\n entities like &#8212;.

    \n\n\nTo specify an entire block of pre-formatted code, indent every line of\nthe block by 4 spaces or 1 tab. Just like with code spans, `&`, `<`,\nand `>` characters will be escaped automatically.\n\nMarkdown:\n\n If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:\n\n
    \n

    For example.

    \n
    \n\nOutput:\n\n

    If you want your page to validate under XHTML 1.0 Strict,\n you've got to put paragraph tags in your blockquotes:

    \n\n
    <blockquote>\n        <p>For example.</p>\n    </blockquote>\n    
    " diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@markdown_basics.md.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@markdown_basics.md.snap index f3b6bd27..ac07270c 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@markdown_basics.md.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@markdown_basics.md.snap @@ -100,8 +100,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - "

    This is the second paragraph in the blockquote" - ".

    " - "

    This is an H2 in a" -- "blockquote

    \n \n\n\n\n###" -- "Phrase Emphasis ###" +- "blockquote

    \n " +- "### Phrase Emphasis ###" - Markdown uses asterisks and underscores to indicate spans of - "emphasis.\n\nMarkdown:" - Some of these words *are emphasized*. @@ -116,8 +116,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - "

    Use two asterisks for " - strong emphasis. - "Or, if you prefer, use" -- "two underscores instead.

    \n\n\n\n## Lists" -- "##" +- two underscores instead
    .

    +- "## Lists ##" - Unordered (bulleted) lists use asterisks - ", pluses, and hyphens (`*" - "`,\n`+`, and `-`" @@ -158,8 +158,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - "
  • A list item.

    " - "

    With multiple paragraphs.

  • " - "
  • Another item in the list.

  • \n \n\n\n\n### Links" -- "###" +- "p>\n " +- "### Links ###" - "Markdown supports two styles for creating links: *inline" - "* and\n*reference*" - "." @@ -212,8 +212,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - "

    I start my morning with a cup of coffee" - and - "" -- "The New York Times.

    \n\n\n###" -- "Images ###" +- The New York Times.

    +- "### Images ###" - Image syntax is very much like link syntax. - "Inline (titles are optional):" - "![" @@ -223,8 +223,8 @@ input_file: tests/inputs/markdown/markdown_basics.md - "[id]: /path/to/img.jpg \"Title\"" - "Both of the above examples produce the same output:" - "\""\n\n\n\n###" -- "Code ###" +- "alt text\" title=\"Title\" />" +- "### Code ###" - "In a regular paragraph, you can create code span" - by wrapping text in - backtick quotes. Any ampersands ( diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@markdown_syntax.md-2.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@markdown_syntax.md-2.snap index 55b27208..08140809 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@markdown_syntax.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@markdown_syntax.md-2.snap @@ -8,13 +8,15 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "* [Overview](#overview)\n * [Philosophy](#philosophy)\n * [Inline HTML](#html)\n * [Automatic Escaping for Special Characters](#autoescape)\n* [Block Elements](#block)\n * [Paragraphs and Line Breaks](#p)\n * [Headers](#header)\n * [Blockquotes](#blockquote)\n * [Lists](#list)" - "* [Code Blocks](#precode)\n * [Horizontal Rules](#hr)\n* [Span Elements](#span)\n * [Links](#link)\n * [Emphasis](#em)\n * [Code](#code)\n * [Images](#img)\n* [Miscellaneous](#misc)\n * [Backslash Escapes](#backslash)\n * [Automatic Links](#autolink)" - "**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL][src].\n\n [src]: /projects/markdown/syntax.text\n\n* * *" -- "

    Overview

    \n\n

    Philosophy

    \n\nMarkdown is intended to be as easy-to-read and easy-to-write as is feasible." +- "

    Overview

    \n\n

    Philosophy

    " +- Markdown is intended to be as easy-to-read and easy-to-write as is feasible. - "Readability, however, is emphasized above all else. A Markdown-formatted\ndocument should be publishable as-is, as plain text, without looking\nlike it's been marked up with tags or formatting instructions. While\nMarkdown's syntax has been influenced by several existing text-to-HTML\nfilters -- including [Setext][1], [atx][2], [Textile][3], [reStructuredText][4]," - "[Grutatext][5], and [EtText][6]" - "-- the single biggest source of\ninspiration for Markdown's syntax is the format of plain text email." - "[1]: http://docutils.sourceforge.net/mirror/setext.html\n [2]: http://www.aaronsw.com/2002/atx/\n [3]: https://web.archive.org/web/20021226035527/http://textism.com/tools/textile/\n [4]: http://docutils.sourceforge.net/rst.html\n [5]: http://www.triptico.com/software/grutatxt.html" - "[6]: http://ettext.taint.org/doc/\n\nTo this end, Markdown's syntax is comprised entirely of punctuation\ncharacters, which punctuation characters have been carefully chosen so\nas to look like what they mean. E.g., asterisks around a word actually\nlook like \\*emphasis\\*. Markdown lists look like, well, lists. Even\nblockquotes look like quoted passages of text, assuming you've ever\nused email." -- "

    Inline HTML

    \n\nMarkdown's syntax is intended for one purpose: to be used as a\nformat for *writing* for the web." +- "

    Inline HTML

    " +- "Markdown's syntax is intended for one purpose: to be used as a\nformat for *writing* for the web." - "Markdown is not a replacement for HTML, or even close to it. Its\nsyntax is very small, corresponding only to a very small subset of\nHTML tags. The idea is *not* to create a syntax that makes it easier\nto insert HTML tags. In my opinion, HTML tags are already easy to\ninsert. The idea for Markdown is to make it easy to read, write, and\nedit prose. HTML is a *publishing* format; Markdown is a *writing*" - "format. Thus, Markdown's formatting syntax only addresses issues that\ncan be conveyed in plain text.\n\nFor any markup that is not covered by Markdown's syntax, you simply\nuse HTML itself. There's no need to preface it or delimit it to\nindicate that you're switching from Markdown to HTML; you just use\nthe tags." - "The only restrictions are that block-level HTML elements -- e.g. `
    `,\n``, `
    `, `

    `, etc. -- must be separated from surrounding\ncontent by blank lines, and the start and end tags of the block should\nnot be indented with tabs or spaces. Markdown is smart enough not\nto add extra (unwanted) `

    ` tags around HTML block-level tags.\n\nFor example, to add an HTML table to a Markdown article:" @@ -27,23 +29,23 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "in your anchor tag `href` attribute. Needless to say, this is easy to\nforget, and is probably the single most common source of HTML validation\nerrors in otherwise well-marked-up web sites.\n\nMarkdown allows you to use these characters naturally, taking care of\nall the necessary escaping for you. If you use an ampersand as part of\nan HTML entity, it remains unchanged; otherwise it will be translated\ninto `&`." - "So, if you want to include a copyright symbol in your article, you can write:\n\n ©\n\nand Markdown will leave it alone. But if you write:\n\n AT&T\n\nMarkdown will translate it to:\n\n AT&T\n\nSimilarly, because Markdown supports [inline HTML](#html), if you use\nangle brackets as delimiters for HTML tags, Markdown will treat them as\nsuch. But if you write:\n\n 4 < 5" - "Markdown will translate it to:\n\n 4 < 5\n\nHowever, inside Markdown code spans and blocks, angle brackets and\nampersands are *always* encoded automatically. This makes it easy to use\nMarkdown to write about HTML code. (As opposed to raw HTML, which is a\nterrible format for writing about HTML syntax, because every single `<`\nand `&` in your example code needs to be escaped.)\n\n\n* * *" -- "

    Block Elements

    \n\n\n

    Paragraphs and Line Breaks

    \n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like a\nblank line -- a line containing nothing but spaces or tabs is considered\nblank.) Normal paragraphs should not be indented with spaces or tabs." +- "

    Block Elements

    \n\n\n

    Paragraphs and Line Breaks

    " +- "A paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like a\nblank line -- a line containing nothing but spaces or tabs is considered\nblank.) Normal paragraphs should not be indented with spaces or tabs." - "The implication of the \"one or more consecutive lines of text\" rule is\nthat Markdown supports \"hard-wrapped\" text paragraphs. This differs\nsignificantly from most other text-to-HTML formatters (including Movable\nType's \"Convert Line Breaks\" option) which translate every line break\ncharacter in a paragraph into a `
    ` tag." - "When you *do* want to insert a `
    ` break tag using Markdown, you\nend a line with two or more spaces, then type return." -- "Yes, this takes a tad more effort to create a `
    `, but a simplistic\n\"every line break is a `
    `\" rule wouldn't work for Markdown.\nMarkdown's email-style [blockquoting][bq] and multi-paragraph [list items][l]\nwork best -- and look better -- when you format them with hard breaks." -- "[bq]: #blockquote\n [l]: #list\n\n\n\n

    Headers

    \n\nMarkdown supports two styles of headers, [Setext] [1] and [atx] [2].\n\nSetext-style headers are \"underlined\" using equal signs (for first-level\nheaders) and dashes (for second-level headers). For example:\n\n This is an H1\n =============\n\n This is an H2\n -------------" +- "Yes, this takes a tad more effort to create a `
    `, but a simplistic\n\"every line break is a `
    `\" rule wouldn't work for Markdown.\nMarkdown's email-style [blockquoting][bq] and multi-paragraph [list items][l]\nwork best -- and look better -- when you format them with hard breaks.\n\n [bq]: #blockquote\n [l]: #list" +- "

    Headers

    \n\nMarkdown supports two styles of headers, [Setext] [1] and [atx] [2].\n\nSetext-style headers are \"underlined\" using equal signs (for first-level\nheaders) and dashes (for second-level headers). For example:\n\n This is an H1\n =============\n\n This is an H2\n -------------" - "Any number of underlining `=`'s or `-`'s will work.\n\nAtx-style headers use 1-6 hash characters at the start of the line,\ncorresponding to header levels 1-6. For example:\n\n # This is an H1\n\n ## This is an H2\n\n ###### This is an H6" - "Optionally, you may \"close\" atx-style headers. This is purely\ncosmetic -- you can use this if you think it looks better. The\nclosing hashes don't even need to match the number of hashes\nused to open the header. (The number of opening hashes\ndetermines the header level.) :\n\n # This is an H1 #\n\n ## This is an H2 ##\n\n ### This is an H3 ######" - "

    Blockquotes

    \n\nMarkdown uses email-style `>` characters for blockquoting. If you're\nfamiliar with quoting passages of text in an email message, then you\nknow how to create a blockquote in Markdown. It looks best if you hard\nwrap the text and put a `>` before every line:" - "> This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n > consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n > Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n >\n > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse\n > id sem consectetuer libero luctus adipiscing." - "Markdown allows you to be lazy and only put the `>` before the first\nline of a hard-wrapped paragraph:" - "> This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n\n > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse\n id sem consectetuer libero luctus adipiscing." -- "Blockquotes can be nested (i.e. a blockquote-in-a-blockquote) by\nadding additional levels of `>`:\n\n > This is the first level of quoting.\n >\n > > This is nested blockquote.\n >\n > Back to the first level.\n\nBlockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:" -- "> ## This is a header.\n\t>\n\t> 1. This is the first list item.\n\t> 2. This is the second list item.\n\t>\n\t> Here's some example code:\n\t>\n\t> return shell_exec(\"echo $input | $markdown_script\");" -- "Any decent text editor should make email-style quoting easy. For\nexample, with BBEdit, you can make a selection and choose Increase\nQuote Level from the Text menu.\n\n\n

    Lists

    \n\nMarkdown supports ordered (numbered) and unordered (bulleted) lists.\n\nUnordered lists use asterisks, pluses, and hyphens -- interchangably\n-- as list markers:\n\n * Red\n * Green\n * Blue" -- "is equivalent to:\n\n + Red\n + Green\n + Blue\n\nand:\n\n - Red\n - Green\n - Blue\n\nOrdered lists use numbers followed by periods:\n\n 1. Bird\n 2. McHale\n 3. Parish" -- "It's important to note that the actual numbers you use to mark the\nlist have no effect on the HTML output Markdown produces. The HTML\nMarkdown produces from the above list is:\n\n
      \n
    1. Bird
    2. \n
    3. McHale
    4. \n
    5. Parish
    6. \n
    \n\nIf you instead wrote the list in Markdown like this:" -- "1. Bird\n 1. McHale\n 1. Parish\n\nor even:\n\n 3. Bird\n 1. McHale\n 8. Parish" +- "Blockquotes can be nested (i.e. a blockquote-in-a-blockquote) by\nadding additional levels of `>`:\n\n > This is the first level of quoting.\n >\n > > This is nested blockquote.\n >\n > Back to the first level.\n\nBlockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:\n\n\t>" +- "## This is a header.\n\t>\n\t> 1. This is the first list item.\n\t> 2. This is the second list item.\n\t>\n\t> Here's some example code:\n\t>\n\t> return shell_exec(\"echo $input | $markdown_script\");\n\nAny decent text editor should make email-style quoting easy. For\nexample, with BBEdit, you can make a selection and choose Increase\nQuote Level from the Text menu." +- "

    Lists

    \n\nMarkdown supports ordered (numbered) and unordered (bulleted) lists.\n\nUnordered lists use asterisks, pluses, and hyphens -- interchangably\n-- as list markers:\n\n * Red\n * Green\n * Blue\n\nis equivalent to:\n\n + Red\n + Green\n + Blue\n\nand:\n\n - Red\n - Green\n - Blue" +- "Ordered lists use numbers followed by periods:\n\n 1. Bird\n 2. McHale\n 3. Parish\n\nIt's important to note that the actual numbers you use to mark the\nlist have no effect on the HTML output Markdown produces. The HTML\nMarkdown produces from the above list is:" +- "
      \n
    1. Bird
    2. \n
    3. McHale
    4. \n
    5. Parish
    6. \n
    \n\nIf you instead wrote the list in Markdown like this:\n\n 1. Bird\n 1. McHale\n 1. Parish\n\nor even:\n\n 3. Bird\n 1. McHale\n 8. Parish" - "you'd get the exact same HTML output. The point is, if you want to,\nyou can use ordinal numbers in your ordered Markdown lists, so that\nthe numbers in your source match the numbers in your published HTML.\nBut if you want to be lazy, you don't have to." - "If you do use lazy list numbering, however, you should still start the\nlist with the number 1. At some point in the future, Markdown may support\nstarting ordered lists at an arbitrary number.\n\nList markers typically start at the left margin, but may be indented by\nup to three spaces. List markers must be followed by one or more spaces\nor a tab.\n\nTo make lists look nice, you can wrap items with hanging indents:" - "* Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n\nBut if you want to be lazy, you don't have to:" @@ -54,20 +56,21 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIt looks nice if you indent every line of the subsequent\nparagraphs, but here again, Markdown will allow you to be\nlazy:" - "* This is a list item with two paragraphs.\n\n This is the second paragraph in the list item. You're\n only required to indent the first line. 
Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit.\n\n * Another item in the same list.\n\nTo put a blockquote within a list item, the blockquote's `>`\ndelimiters need to be indented:" - "* A list item with a blockquote:\n\n > This is a blockquote\n > inside a list item.\n\nTo put a code block within a list item, the code block needs\nto be indented *twice* -- 8 spaces or two tabs:\n\n * A list item with a code block:\n\n \n\n\nIt's worth noting that it's possible to trigger an ordered list by\naccident, by writing something like this:" -- "1986. What a great season.\n\nIn other words, a *number-period-space* sequence at the beginning of a\nline. To avoid this, you can backslash-escape the period:\n\n 1986\\. What a great season." -- "

    Code Blocks

    \n\nPre-formatted code blocks are used for writing about programming or\nmarkup source code. Rather than forming normal paragraphs, the lines\nof a code block are interpreted literally. Markdown wraps a code block\nin both `
    ` and `` tags.\n\nTo produce a code block in Markdown, simply indent every line of the\nblock by at least 4 spaces or 1 tab. For example, given this input:"
    -- "This is a normal paragraph:\n\n        This is a code block.\n\nMarkdown will generate:\n\n    

    This is a normal paragraph:

    \n\n
    This is a code block.\n    
    \n\nOne level of indentation -- 4 spaces or 1 tab -- is removed from each\nline of the code block. For example, this:\n\n Here is an example of AppleScript:\n\n tell application \"Foo\"\n beep\n end tell\n\nwill turn into:" +- "1986. What a great season.\n\nIn other words, a *number-period-space* sequence at the beginning of a\nline. To avoid this, you can backslash-escape the period:\n\n 1986\\. What a great season.\n\n\n\n

    Code Blocks

    " +- "Pre-formatted code blocks are used for writing about programming or\nmarkup source code. Rather than forming normal paragraphs, the lines\nof a code block are interpreted literally. Markdown wraps a code block\nin both `
    ` and `` tags.\n\nTo produce a code block in Markdown, simply indent every line of the\nblock by at least 4 spaces or 1 tab. For example, given this input:\n\n    This is a normal paragraph:\n\n        This is a code block."
    +- "Markdown will generate:\n\n    

    This is a normal paragraph:

    \n\n
    This is a code block.\n    
    \n\nOne level of indentation -- 4 spaces or 1 tab -- is removed from each\nline of the code block. For example, this:\n\n Here is an example of AppleScript:\n\n tell application \"Foo\"\n beep\n end tell\n\nwill turn into:" - "

    Here is an example of AppleScript:

    \n\n
    tell application \"Foo\"\n        beep\n    end tell\n    
    " - "A code block continues until it reaches a line that is not indented\n(or the end of the article).\n\nWithin a code block, ampersands (`&`) and angle brackets (`<` and `>`)\nare automatically converted into HTML entities. This makes it very\neasy to include example HTML source code using Markdown -- just paste\nit and indent it, and Markdown will handle the hassle of encoding the\nampersands and angle brackets. For example, this:" - "
    \n © 2004 Foo Corporation\n
    \n\nwill turn into:\n\n
    <div class=\"footer\">\n        &copy; 2004 Foo Corporation\n    </div>\n    
    " - "Regular Markdown syntax is not processed within code blocks. E.g.,\nasterisks are just literal asterisks within a code block. This means\nit's also easy to use Markdown to write about Markdown's own syntax.\n\n\n\n

    Horizontal Rules

    " - "You can produce a horizontal rule tag (`
    `) by placing three or\nmore hyphens, asterisks, or underscores on a line by themselves. If you\nwish, you may use spaces between the hyphens or asterisks. Each of the\nfollowing lines will produce a horizontal rule:\n\n * * *\n\n ***\n\n *****\n\n - - -\n\n ---------------------------------------\n\n\n* * *" -- "

    Span Elements

    \n\n

    Links

    \n\nMarkdown supports two style of links: *inline* and *reference*.\n\nIn both styles, the link text is delimited by [square brackets]." -- "To create an inline link, use a set of regular parentheses immediately\nafter the link text's closing square bracket. Inside the parentheses,\nput the URL where you want the link to point, along with an *optional*\ntitle for the link, surrounded in quotes. For example:\n\n This is [an example](http://example.com/ \"Title\") inline link.\n\n [This link](http://example.net/) has no title attribute.\n\nWill produce:" -- "

    This is \n an example inline link.

    \n\n

    This link has no\n title attribute.

    \n\nIf you're referring to a local resource on the same server, you can\nuse relative paths:\n\n See my [About](/about/) page for details." -- "Reference-style links use a second set of square brackets, inside\nwhich you place a label of your choosing to identify the link:\n\n This is [an example][id] reference-style link.\n\nYou can optionally use a space to separate the sets of brackets:\n\n This is [an example] [id] reference-style link.\n\nThen, anywhere in the document, you define your link label like this,\non a line by itself:" -- "[id]: http://example.com/ \"Optional Title Here\"\n\nThat is:\n\n* Square brackets containing the link identifier (optionally\n indented from the left margin using up to three spaces);\n* followed by a colon;\n* followed by one or more spaces (or tabs);\n* followed by the URL for the link;\n* optionally followed by a title attribute for the link, enclosed\n in double or single quotes, or enclosed in parentheses." -- "The following three link definitions are equivalent:\n\n\t[foo]: http://example.com/ \"Optional Title Here\"\n\t[foo]: http://example.com/ 'Optional Title Here'\n\t[foo]: http://example.com/ (Optional Title Here)\n\n**Note:** There is a known bug in Markdown.pl 1.0.1 which prevents\nsingle quotes from being used to delimit link titles.\n\nThe link URL may, optionally, be surrounded by angle brackets:" +- "

    Span Elements

    \n\n

    Links

    " +- "Markdown supports two style of links: *inline* and *reference*.\n\nIn both styles, the link text is delimited by [square brackets].\n\nTo create an inline link, use a set of regular parentheses immediately\nafter the link text's closing square bracket. Inside the parentheses,\nput the URL where you want the link to point, along with an *optional*\ntitle for the link, surrounded in quotes. For example:" +- "This is [an example](http://example.com/ \"Title\") inline link.\n\n [This link](http://example.net/) has no title attribute.\n\nWill produce:\n\n

    This is \n an example inline link.

    \n\n

    This link has no\n title attribute.

    " +- "If you're referring to a local resource on the same server, you can\nuse relative paths:\n\n See my [About](/about/) page for details.\n\nReference-style links use a second set of square brackets, inside\nwhich you place a label of your choosing to identify the link:\n\n This is [an example][id] reference-style link.\n\nYou can optionally use a space to separate the sets of brackets:\n\n This is [an example] [id] reference-style link." +- "Then, anywhere in the document, you define your link label like this,\non a line by itself:\n\n [id]: http://example.com/ \"Optional Title Here\"\n\nThat is:" +- "* Square brackets containing the link identifier (optionally\n indented from the left margin using up to three spaces);\n* followed by a colon;\n* followed by one or more spaces (or tabs);\n* followed by the URL for the link;\n* optionally followed by a title attribute for the link, enclosed\n in double or single quotes, or enclosed in parentheses.\n\nThe following three link definitions are equivalent:" +- "[foo]: http://example.com/ \"Optional Title Here\"\n\t[foo]: http://example.com/ 'Optional Title Here'\n\t[foo]: http://example.com/ (Optional Title Here)\n\n**Note:** There is a known bug in Markdown.pl 1.0.1 which prevents\nsingle quotes from being used to delimit link titles.\n\nThe link URL may, optionally, be surrounded by angle brackets:" - "[id]: \"Optional Title Here\"\n\nYou can put the title attribute on the next line and use extra spaces\nor tabs for padding, which tends to look better with longer URLs:\n\n [id]: http://example.com/longish/path/to/resource/here\n \"Optional Title Here\"" - "Link definitions are only used for creating links during Markdown\nprocessing, and are stripped from your document in the HTML output.\n\nLink definition names may consist of letters, numbers, spaces, and\npunctuation -- but they are *not* case sensitive. E.g. 
these two\nlinks:\n\n\t[link text][a]\n\t[link text][A]" - "are equivalent.\n\nThe *implicit link name* shortcut allows you to omit the name of the\nlink, in which case the link text itself is used as the name.\nJust use an empty set of square brackets -- e.g., to link the word\n\"Google\" to the google.com web site, you could simply write:\n\n\t[Google][]\n\nAnd then define the link:\n\n\t[Google]: http://google.com/" @@ -81,14 +84,14 @@ input_file: tests/inputs/markdown/markdown_syntax.md - "With Markdown's reference-style links, a source document much more\nclosely resembles the final output, as rendered in a browser. By\nallowing you to move the markup-related metadata out of the paragraph,\nyou can add links without interrupting the narrative flow of your\nprose.\n\n\n

    Emphasis

    " - "Markdown treats asterisks (`*`) and underscores (`_`) as indicators of\nemphasis. Text wrapped with one `*` or `_` will be wrapped with an\nHTML `` tag; double `*`'s or `_`'s will be wrapped with an HTML\n`` tag. E.g., this input:\n\n *single asterisks*\n\n _single underscores_\n\n **double asterisks**\n\n __double underscores__\n\nwill produce:" - "single asterisks\n\n single underscores\n\n double asterisks\n\n double underscores\n\nYou can use whichever style you prefer; the lone restriction is that\nthe same character must be used to open and close an emphasis span.\n\nEmphasis can be used in the middle of a word:\n\n un*frigging*believable" -- "But if you surround an `*` or `_` with spaces, it'll be treated as a\nliteral asterisk or underscore.\n\nTo produce a literal asterisk or underscore at a position where it\nwould otherwise be used as an emphasis delimiter, you can backslash\nescape it:\n\n \\*this text is surrounded by literal asterisks\\*" -- "

    Code

    \n\nTo indicate a span of code, wrap it with backtick quotes (`` ` ``).\nUnlike a pre-formatted code block, a code span indicates code within a\nnormal paragraph. For example:\n\n Use the `printf()` function.\n\nwill produce:\n\n

    Use the printf() function.

    " -- "To include a literal backtick character within a code span, you can use\nmultiple backticks as the opening and closing delimiters:\n\n ``There is a literal backtick (`) here.``\n\nwhich will produce this:\n\n

    There is a literal backtick (`) here.

    " -- "The backtick delimiters surrounding a code span may include spaces --\none after the opening, one before the closing. This allows you to place\nliteral backtick characters at the beginning or end of a code span:\n\n\tA single backtick in a code span: `` ` ``\n\n\tA backtick-delimited string in a code span: `` `foo` ``\n\nwill produce:" -- "

    A single backtick in a code span: `

    \n\n\t

    A backtick-delimited string in a code span: `foo`

    \n\nWith a code span, ampersands and angle brackets are encoded as HTML\nentities automatically, which makes it easy to include example HTML\ntags. Markdown will turn this:\n\n Please don't use any `` tags.\n\ninto:" -- "

    Please don't use any <blink> tags.

    \n\nYou can write this:\n\n `—` is the decimal-encoded equivalent of `—`.\n\nto produce:\n\n

    &#8212; is the decimal-encoded\n equivalent of &mdash;.

    " -- "

    Images

    \n\nAdmittedly, it's fairly difficult to devise a \"natural\" syntax for\nplacing images into a plain text document format.\n\nMarkdown uses an image syntax that is intended to resemble the syntax\nfor links, allowing for two styles: *inline* and *reference*.\n\nInline image syntax looks like this:\n\n ![Alt text](/path/to/img.jpg)\n\n ![Alt text](/path/to/img.jpg \"Optional title\")" -- "That is:\n\n* An exclamation mark: `!`;\n* followed by a set of square brackets, containing the `alt`\n attribute text for the image;\n* followed by a set of parentheses, containing the URL or path to\n the image, and an optional `title` attribute enclosed in double\n or single quotes.\n\nReference-style image syntax looks like this:\n\n ![Alt text][id]" +- "But if you surround an `*` or `_` with spaces, it'll be treated as a\nliteral asterisk or underscore.\n\nTo produce a literal asterisk or underscore at a position where it\nwould otherwise be used as an emphasis delimiter, you can backslash\nescape it:\n\n \\*this text is surrounded by literal asterisks\\*\n\n\n\n

    Code

    " +- "To indicate a span of code, wrap it with backtick quotes (`` ` ``).\nUnlike a pre-formatted code block, a code span indicates code within a\nnormal paragraph. For example:\n\n Use the `printf()` function.\n\nwill produce:\n\n

    Use the printf() function.

    \n\nTo include a literal backtick character within a code span, you can use\nmultiple backticks as the opening and closing delimiters:" +- "``There is a literal backtick (`) here.``\n\nwhich will produce this:\n\n

    There is a literal backtick (`) here.

    \n\nThe backtick delimiters surrounding a code span may include spaces --\none after the opening, one before the closing. This allows you to place\nliteral backtick characters at the beginning or end of a code span:" +- "A single backtick in a code span: `` ` ``\n\n\tA backtick-delimited string in a code span: `` `foo` ``\n\nwill produce:\n\n\t

    A single backtick in a code span: `

    \n\n\t

    A backtick-delimited string in a code span: `foo`

    " +- "With a code span, ampersands and angle brackets are encoded as HTML\nentities automatically, which makes it easy to include example HTML\ntags. Markdown will turn this:\n\n Please don't use any `` tags.\n\ninto:\n\n

    Please don't use any <blink> tags.

    \n\nYou can write this:\n\n `—` is the decimal-encoded equivalent of `—`.\n\nto produce:" +- "

    &#8212; is the decimal-encoded\n equivalent of &mdash;.

    \n\n\n\n

    Images

    " +- "Admittedly, it's fairly difficult to devise a \"natural\" syntax for\nplacing images into a plain text document format.\n\nMarkdown uses an image syntax that is intended to resemble the syntax\nfor links, allowing for two styles: *inline* and *reference*.\n\nInline image syntax looks like this:\n\n ![Alt text](/path/to/img.jpg)\n\n ![Alt text](/path/to/img.jpg \"Optional title\")\n\nThat is:" +- "* An exclamation mark: `!`;\n* followed by a set of square brackets, containing the `alt`\n attribute text for the image;\n* followed by a set of parentheses, containing the URL or path to\n the image, and an optional `title` attribute enclosed in double\n or single quotes.\n\nReference-style image syntax looks like this:\n\n ![Alt text][id]" - "Where \"id\" is the name of a defined image reference. Image references\nare defined using syntax identical to link references:\n\n [id]: url/to/image \"Optional title attribute\"\n\nAs of this writing, Markdown has no syntax for specifying the\ndimensions of an image; if this is important to you, you can simply\nuse regular HTML `` tags.\n\n\n* * *" - "

    Miscellaneous

    \n\n

    Automatic Links

    \n\nMarkdown supports a shortcut style for creating \"automatic\" links for URLs and email addresses: simply surround the URL or email address with angle brackets. What this means is that if you want to show the actual text of a URL or email address, and also have it be a clickable link, you can do this:\n\n " - "Markdown will turn this into:\n\n http://example.com/\n\nAutomatic links for email addresses work similarly, except that\nMarkdown will also perform a bit of randomized decimal and hex\nentity-encoding to help obscure your address from address-harvesting\nspambots. For example, Markdown will turn this:\n\n \n\ninto something like this:" diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@markdown_syntax.md-3.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@markdown_syntax.md-3.snap index c856a397..9d34166f 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@markdown_syntax.md-3.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@markdown_syntax.md-3.snap @@ -3,12 +3,11 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/markdown_syntax.md --- -- "Markdown: Syntax\n================\n\n\n\n\n* [Overview](#overview)\n * [Philosophy](#philosophy)\n * [Inline HTML](#html)\n * [Automatic Escaping for Special Characters](#autoescape)\n* [Block Elements](#block)\n * [Paragraphs and Line Breaks](#p)\n * [Headers](#header)\n * [Blockquotes](#blockquote)\n * [Lists](#list)\n * [Code Blocks](#precode)\n * [Horizontal Rules](#hr)\n* [Span Elements](#span)\n * [Links](#link)\n * [Emphasis](#em)\n * [Code](#code)\n * [Images](#img)\n* [Miscellaneous](#misc)\n * [Backslash Escapes](#backslash)\n * [Automatic Links](#autolink)\n\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL][src].\n\n [src]: /projects/markdown/syntax.text\n\n* * *" -- "

    Overview

    \n\n

    Philosophy

    \n\nMarkdown is intended to be as easy-to-read and easy-to-write as is feasible.\n\nReadability, however, is emphasized above all else. A Markdown-formatted\ndocument should be publishable as-is, as plain text, without looking\nlike it's been marked up with tags or formatting instructions. While\nMarkdown's syntax has been influenced by several existing text-to-HTML\nfilters -- including [Setext][1], [atx][2], [Textile][3], [reStructuredText][4],\n[Grutatext][5], and [EtText][6] -- the single biggest source of\ninspiration for Markdown's syntax is the format of plain text email.\n\n [1]: http://docutils.sourceforge.net/mirror/setext.html\n [2]: http://www.aaronsw.com/2002/atx/\n [3]: https://web.archive.org/web/20021226035527/http://textism.com/tools/textile/\n [4]: http://docutils.sourceforge.net/rst.html\n [5]: http://www.triptico.com/software/grutatxt.html\n [6]: http://ettext.taint.org/doc/\n\nTo this end, Markdown's syntax is comprised entirely of punctuation\ncharacters, which punctuation characters have been carefully chosen so\nas to look like what they mean. E.g., asterisks around a word actually\nlook like \\*emphasis\\*. Markdown lists look like, well, lists. Even\nblockquotes look like quoted passages of text, assuming you've ever\nused email.\n\n\n\n

    Inline HTML

    \n\nMarkdown's syntax is intended for one purpose: to be used as a\nformat for *writing* for the web.\n\nMarkdown is not a replacement for HTML, or even close to it. Its\nsyntax is very small, corresponding only to a very small subset of\nHTML tags. The idea is *not* to create a syntax that makes it easier\nto insert HTML tags. In my opinion, HTML tags are already easy to\ninsert. The idea for Markdown is to make it easy to read, write, and\nedit prose. HTML is a *publishing* format; Markdown is a *writing*\nformat. Thus, Markdown's formatting syntax only addresses issues that\ncan be conveyed in plain text.\n\nFor any markup that is not covered by Markdown's syntax, you simply\nuse HTML itself. There's no need to preface it or delimit it to\nindicate that you're switching from Markdown to HTML; you just use\nthe tags.\n\nThe only restrictions are that block-level HTML elements -- e.g. `
    `,\n`
    `, `
    `, `

    `, etc. -- must be separated from surrounding\ncontent by blank lines, and the start and end tags of the block should\nnot be indented with tabs or spaces. Markdown is smart enough not\nto add extra (unwanted) `

    ` tags around HTML block-level tags.\n\nFor example, to add an HTML table to a Markdown article:\n\n This is a regular paragraph.\n\n

    \n \n \n \n
    Foo
    \n\n This is another regular paragraph.\n\nNote that Markdown formatting syntax is not processed within block-level\nHTML tags. E.g., you can't use Markdown-style `*emphasis*` inside an\nHTML block.\n\nSpan-level HTML tags -- e.g. ``, ``, or `` -- can be\nused anywhere in a Markdown paragraph, list item, or header. If you\nwant, you can even use HTML tags instead of Markdown formatting; e.g. if\nyou'd prefer to use HTML `` or `` tags instead of Markdown's\nlink or image syntax, go right ahead.\n\nUnlike block-level HTML tags, Markdown syntax *is* processed within\nspan-level tags.\n\n\n

    Automatic Escaping for Special Characters

    \n\nIn HTML, there are two characters that demand special treatment: `<`\nand `&`. Left angle brackets are used to start tags; ampersands are\nused to denote HTML entities. If you want to use them as literal\ncharacters, you must escape them as entities, e.g. `<`, and\n`&`.\n\nAmpersands in particular are bedeviling for web writers. If you want to\nwrite about 'AT&T', you need to write '`AT&T`'. You even need to\nescape ampersands within URLs. Thus, if you want to link to:\n\n http://images.google.com/images?num=30&q=larry+bird\n\nyou need to encode the URL as:" -- "http://images.google.com/images?num=30&q=larry+bird\n\nin your anchor tag `href` attribute. Needless to say, this is easy to\nforget, and is probably the single most common source of HTML validation\nerrors in otherwise well-marked-up web sites.\n\nMarkdown allows you to use these characters naturally, taking care of\nall the necessary escaping for you. If you use an ampersand as part of\nan HTML entity, it remains unchanged; otherwise it will be translated\ninto `&`.\n\nSo, if you want to include a copyright symbol in your article, you can write:\n\n ©\n\nand Markdown will leave it alone. But if you write:\n\n AT&T\n\nMarkdown will translate it to:\n\n AT&T\n\nSimilarly, because Markdown supports [inline HTML](#html), if you use\nangle brackets as delimiters for HTML tags, Markdown will treat them as\nsuch. But if you write:\n\n 4 < 5\n\nMarkdown will translate it to:\n\n 4 < 5\n\nHowever, inside Markdown code spans and blocks, angle brackets and\nampersands are *always* encoded automatically. This makes it easy to use\nMarkdown to write about HTML code. (As opposed to raw HTML, which is a\nterrible format for writing about HTML syntax, because every single `<`\nand `&` in your example code needs to be escaped.)\n\n\n* * *" -- "

    Block Elements

    \n\n\n

    Paragraphs and Line Breaks

    \n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like a\nblank line -- a line containing nothing but spaces or tabs is considered\nblank.) Normal paragraphs should not be indented with spaces or tabs.\n\nThe implication of the \"one or more consecutive lines of text\" rule is\nthat Markdown supports \"hard-wrapped\" text paragraphs. This differs\nsignificantly from most other text-to-HTML formatters (including Movable\nType's \"Convert Line Breaks\" option) which translate every line break\ncharacter in a paragraph into a `
    ` tag.\n\nWhen you *do* want to insert a `
    ` break tag using Markdown, you\nend a line with two or more spaces, then type return.\n\nYes, this takes a tad more effort to create a `
    `, but a simplistic\n\"every line break is a `
    `\" rule wouldn't work for Markdown.\nMarkdown's email-style [blockquoting][bq] and multi-paragraph [list items][l]\nwork best -- and look better -- when you format them with hard breaks.\n\n [bq]: #blockquote\n [l]: #list\n\n\n\n

    Headers

    \n\nMarkdown supports two styles of headers, [Setext] [1] and [atx] [2].\n\nSetext-style headers are \"underlined\" using equal signs (for first-level\nheaders) and dashes (for second-level headers). For example:\n\n This is an H1\n =============\n\n This is an H2\n -------------\n\nAny number of underlining `=`'s or `-`'s will work.\n\nAtx-style headers use 1-6 hash characters at the start of the line,\ncorresponding to header levels 1-6. For example:\n\n # This is an H1\n\n ## This is an H2\n\n ###### This is an H6\n\nOptionally, you may \"close\" atx-style headers. This is purely\ncosmetic -- you can use this if you think it looks better. The\nclosing hashes don't even need to match the number of hashes\nused to open the header. (The number of opening hashes\ndetermines the header level.) :\n\n # This is an H1 #\n\n ## This is an H2 ##\n\n ### This is an H3 ######\n\n\n

    Blockquotes

    \n\nMarkdown uses email-style `>` characters for blockquoting. If you're\nfamiliar with quoting passages of text in an email message, then you\nknow how to create a blockquote in Markdown. It looks best if you hard\nwrap the text and put a `>` before every line:\n\n > This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n > consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n > Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n >\n > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse\n > id sem consectetuer libero luctus adipiscing.\n\nMarkdown allows you to be lazy and only put the `>` before the first\nline of a hard-wrapped paragraph:\n\n > This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n\n > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse\n id sem consectetuer libero luctus adipiscing.\n\nBlockquotes can be nested (i.e. a blockquote-in-a-blockquote) by\nadding additional levels of `>`:\n\n > This is the first level of quoting.\n >\n > > This is nested blockquote.\n >\n > Back to the first level.\n\nBlockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:\n\n\t> ## This is a header.\n\t>\n\t> 1. This is the first list item.\n\t> 2. This is the second list item.\n\t>\n\t> Here's some example code:\n\t>\n\t> return shell_exec(\"echo $input | $markdown_script\");" -- "Any decent text editor should make email-style quoting easy. For\nexample, with BBEdit, you can make a selection and choose Increase\nQuote Level from the Text menu.\n\n\n

    Lists

    \n\nMarkdown supports ordered (numbered) and unordered (bulleted) lists.\n\nUnordered lists use asterisks, pluses, and hyphens -- interchangably\n-- as list markers:\n\n * Red\n * Green\n * Blue\n\nis equivalent to:\n\n + Red\n + Green\n + Blue\n\nand:\n\n - Red\n - Green\n - Blue\n\nOrdered lists use numbers followed by periods:\n\n 1. Bird\n 2. McHale\n 3. Parish\n\nIt's important to note that the actual numbers you use to mark the\nlist have no effect on the HTML output Markdown produces. The HTML\nMarkdown produces from the above list is:\n\n
      \n
    1. Bird
    2. \n
    3. McHale
    4. \n
    5. Parish
    6. \n
    \n\nIf you instead wrote the list in Markdown like this:\n\n 1. Bird\n 1. McHale\n 1. Parish\n\nor even:\n\n 3. Bird\n 1. McHale\n 8. Parish\n\nyou'd get the exact same HTML output. The point is, if you want to,\nyou can use ordinal numbers in your ordered Markdown lists, so that\nthe numbers in your source match the numbers in your published HTML.\nBut if you want to be lazy, you don't have to.\n\nIf you do use lazy list numbering, however, you should still start the\nlist with the number 1. At some point in the future, Markdown may support\nstarting ordered lists at an arbitrary number.\n\nList markers typically start at the left margin, but may be indented by\nup to three spaces. List markers must be followed by one or more spaces\nor a tab.\n\nTo make lists look nice, you can wrap items with hanging indents:\n\n * Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n\nBut if you want to be lazy, you don't have to:\n\n * Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIf list items are separated by blank lines, Markdown will wrap the\nitems in `

    ` tags in the HTML output. For example, this input:\n\n * Bird\n * Magic\n\nwill turn into:\n\n

      \n
    • Bird
    • \n
    • Magic
    • \n
    \n\nBut this:\n\n * Bird\n\n * Magic\n\nwill turn into:\n\n
      \n
    • Bird

    • \n
    • Magic

    • \n
    \n\nList items may consist of multiple paragraphs. Each subsequent\nparagraph in a list item must be indented by either 4 spaces\nor one tab:\n\n 1. This is a list item with two paragraphs. Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit. Aliquam hendrerit\n mi posuere lectus.\n\n Vestibulum enim wisi, viverra nec, fringilla in, laoreet\n vitae, risus. Donec sit amet nisl. Aliquam semper ipsum\n sit amet velit.\n\n 2. Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIt looks nice if you indent every line of the subsequent\nparagraphs, but here again, Markdown will allow you to be\nlazy:\n\n * This is a list item with two paragraphs.\n\n This is the second paragraph in the list item. You're\n only required to indent the first line. Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit.\n\n * Another item in the same list." -- "To put a blockquote within a list item, the blockquote's `>`\ndelimiters need to be indented:\n\n * A list item with a blockquote:\n\n > This is a blockquote\n > inside a list item.\n\nTo put a code block within a list item, the code block needs\nto be indented *twice* -- 8 spaces or two tabs:\n\n * A list item with a code block:\n\n \n\n\nIt's worth noting that it's possible to trigger an ordered list by\naccident, by writing something like this:\n\n 1986. What a great season.\n\nIn other words, a *number-period-space* sequence at the beginning of a\nline. To avoid this, you can backslash-escape the period:\n\n 1986\\. What a great season.\n\n\n\n

    Code Blocks

    \n\nPre-formatted code blocks are used for writing about programming or\nmarkup source code. Rather than forming normal paragraphs, the lines\nof a code block are interpreted literally. Markdown wraps a code block\nin both `
    ` and `` tags.\n\nTo produce a code block in Markdown, simply indent every line of the\nblock by at least 4 spaces or 1 tab. For example, given this input:\n\n    This is a normal paragraph:\n\n        This is a code block.\n\nMarkdown will generate:\n\n    

    This is a normal paragraph:

    \n\n
    This is a code block.\n    
    \n\nOne level of indentation -- 4 spaces or 1 tab -- is removed from each\nline of the code block. For example, this:\n\n Here is an example of AppleScript:\n\n tell application \"Foo\"\n beep\n end tell\n\nwill turn into:\n\n

    Here is an example of AppleScript:

    \n\n
    tell application \"Foo\"\n        beep\n    end tell\n    
    \n\nA code block continues until it reaches a line that is not indented\n(or the end of the article).\n\nWithin a code block, ampersands (`&`) and angle brackets (`<` and `>`)\nare automatically converted into HTML entities. This makes it very\neasy to include example HTML source code using Markdown -- just paste\nit and indent it, and Markdown will handle the hassle of encoding the\nampersands and angle brackets. For example, this:\n\n
    \n © 2004 Foo Corporation\n
    \n\nwill turn into:\n\n
    <div class=\"footer\">\n        &copy; 2004 Foo Corporation\n    </div>\n    
    \n\nRegular Markdown syntax is not processed within code blocks. E.g.,\nasterisks are just literal asterisks within a code block. This means\nit's also easy to use Markdown to write about Markdown's own syntax.\n\n\n\n

    Horizontal Rules

    \n\nYou can produce a horizontal rule tag (`
    `) by placing three or\nmore hyphens, asterisks, or underscores on a line by themselves. If you\nwish, you may use spaces between the hyphens or asterisks. Each of the\nfollowing lines will produce a horizontal rule:\n\n * * *\n\n ***\n\n *****\n\n - - -\n\n ---------------------------------------\n\n\n* * *" +- "Markdown: Syntax\n================\n\n
    \n\n\n* [Overview](#overview)\n * [Philosophy](#philosophy)\n * [Inline HTML](#html)\n * [Automatic Escaping for Special Characters](#autoescape)\n* [Block Elements](#block)\n * [Paragraphs and Line Breaks](#p)\n * [Headers](#header)\n * [Blockquotes](#blockquote)\n * [Lists](#list)\n * [Code Blocks](#precode)\n * [Horizontal Rules](#hr)\n* [Span Elements](#span)\n * [Links](#link)\n * [Emphasis](#em)\n * [Code](#code)\n * [Images](#img)\n* [Miscellaneous](#misc)\n * [Backslash Escapes](#backslash)\n * [Automatic Links](#autolink)\n\n\n**Note:** This document is itself written using Markdown; you\ncan [see the source for it by adding '.text' to the URL][src].\n\n [src]: /projects/markdown/syntax.text\n\n* * *\n\n

    Overview

    \n\n

    Philosophy

    \n\nMarkdown is intended to be as easy-to-read and easy-to-write as is feasible.\n\nReadability, however, is emphasized above all else. A Markdown-formatted\ndocument should be publishable as-is, as plain text, without looking\nlike it's been marked up with tags or formatting instructions. While\nMarkdown's syntax has been influenced by several existing text-to-HTML\nfilters -- including [Setext][1], [atx][2], [Textile][3], [reStructuredText][4],\n[Grutatext][5], and [EtText][6] -- the single biggest source of\ninspiration for Markdown's syntax is the format of plain text email.\n\n [1]: http://docutils.sourceforge.net/mirror/setext.html\n [2]: http://www.aaronsw.com/2002/atx/\n [3]: https://web.archive.org/web/20021226035527/http://textism.com/tools/textile/\n [4]: http://docutils.sourceforge.net/rst.html\n [5]: http://www.triptico.com/software/grutatxt.html\n [6]: http://ettext.taint.org/doc/\n\nTo this end, Markdown's syntax is comprised entirely of punctuation\ncharacters, which punctuation characters have been carefully chosen so\nas to look like what they mean. E.g., asterisks around a word actually\nlook like \\*emphasis\\*. Markdown lists look like, well, lists. Even\nblockquotes look like quoted passages of text, assuming you've ever\nused email.\n\n\n\n

    Inline HTML

    \n\nMarkdown's syntax is intended for one purpose: to be used as a\nformat for *writing* for the web.\n\nMarkdown is not a replacement for HTML, or even close to it. Its\nsyntax is very small, corresponding only to a very small subset of\nHTML tags. The idea is *not* to create a syntax that makes it easier\nto insert HTML tags. In my opinion, HTML tags are already easy to\ninsert. The idea for Markdown is to make it easy to read, write, and\nedit prose. HTML is a *publishing* format; Markdown is a *writing*\nformat. Thus, Markdown's formatting syntax only addresses issues that\ncan be conveyed in plain text.\n\nFor any markup that is not covered by Markdown's syntax, you simply\nuse HTML itself. There's no need to preface it or delimit it to\nindicate that you're switching from Markdown to HTML; you just use\nthe tags.\n\nThe only restrictions are that block-level HTML elements -- e.g. `
    `,\n``, `
    `, `

    `, etc. -- must be separated from surrounding\ncontent by blank lines, and the start and end tags of the block should" +- "not be indented with tabs or spaces. Markdown is smart enough not\nto add extra (unwanted) `

    ` tags around HTML block-level tags.\n\nFor example, to add an HTML table to a Markdown article:\n\n This is a regular paragraph.\n\n

    \n \n \n \n
    Foo
    \n\n This is another regular paragraph.\n\nNote that Markdown formatting syntax is not processed within block-level\nHTML tags. E.g., you can't use Markdown-style `*emphasis*` inside an\nHTML block.\n\nSpan-level HTML tags -- e.g. ``, ``, or `` -- can be\nused anywhere in a Markdown paragraph, list item, or header. If you\nwant, you can even use HTML tags instead of Markdown formatting; e.g. if\nyou'd prefer to use HTML `` or `` tags instead of Markdown's\nlink or image syntax, go right ahead.\n\nUnlike block-level HTML tags, Markdown syntax *is* processed within\nspan-level tags.\n\n\n

    Automatic Escaping for Special Characters

    \n\nIn HTML, there are two characters that demand special treatment: `<`\nand `&`. Left angle brackets are used to start tags; ampersands are\nused to denote HTML entities. If you want to use them as literal\ncharacters, you must escape them as entities, e.g. `<`, and\n`&`.\n\nAmpersands in particular are bedeviling for web writers. If you want to\nwrite about 'AT&T', you need to write '`AT&T`'. You even need to\nescape ampersands within URLs. Thus, if you want to link to:\n\n http://images.google.com/images?num=30&q=larry+bird\n\nyou need to encode the URL as:\n\n http://images.google.com/images?num=30&q=larry+bird\n\nin your anchor tag `href` attribute. Needless to say, this is easy to\nforget, and is probably the single most common source of HTML validation\nerrors in otherwise well-marked-up web sites.\n\nMarkdown allows you to use these characters naturally, taking care of\nall the necessary escaping for you. If you use an ampersand as part of\nan HTML entity, it remains unchanged; otherwise it will be translated\ninto `&`.\n\nSo, if you want to include a copyright symbol in your article, you can write:\n\n ©\n\nand Markdown will leave it alone. But if you write:\n\n AT&T\n\nMarkdown will translate it to:\n\n AT&T\n\nSimilarly, because Markdown supports [inline HTML](#html), if you use\nangle brackets as delimiters for HTML tags, Markdown will treat them as\nsuch. But if you write:\n\n 4 < 5\n\nMarkdown will translate it to:\n\n 4 < 5\n\nHowever, inside Markdown code spans and blocks, angle brackets and\nampersands are *always* encoded automatically. This makes it easy to use\nMarkdown to write about HTML code. (As opposed to raw HTML, which is a\nterrible format for writing about HTML syntax, because every single `<`\nand `&` in your example code needs to be escaped.)\n\n\n* * *" +- "

    Block Elements

    \n\n\n

    Paragraphs and Line Breaks

    \n\nA paragraph is simply one or more consecutive lines of text, separated\nby one or more blank lines. (A blank line is any line that looks like a\nblank line -- a line containing nothing but spaces or tabs is considered\nblank.) Normal paragraphs should not be indented with spaces or tabs.\n\nThe implication of the \"one or more consecutive lines of text\" rule is\nthat Markdown supports \"hard-wrapped\" text paragraphs. This differs\nsignificantly from most other text-to-HTML formatters (including Movable\nType's \"Convert Line Breaks\" option) which translate every line break\ncharacter in a paragraph into a `
    ` tag.\n\nWhen you *do* want to insert a `
    ` break tag using Markdown, you\nend a line with two or more spaces, then type return.\n\nYes, this takes a tad more effort to create a `
    `, but a simplistic\n\"every line break is a `
    `\" rule wouldn't work for Markdown.\nMarkdown's email-style [blockquoting][bq] and multi-paragraph [list items][l]\nwork best -- and look better -- when you format them with hard breaks.\n\n [bq]: #blockquote\n [l]: #list\n\n\n\n

    Headers

    \n\nMarkdown supports two styles of headers, [Setext] [1] and [atx] [2].\n\nSetext-style headers are \"underlined\" using equal signs (for first-level\nheaders) and dashes (for second-level headers). For example:\n\n This is an H1\n =============\n\n This is an H2\n -------------\n\nAny number of underlining `=`'s or `-`'s will work.\n\nAtx-style headers use 1-6 hash characters at the start of the line,\ncorresponding to header levels 1-6. For example:\n\n # This is an H1\n\n ## This is an H2\n\n ###### This is an H6\n\nOptionally, you may \"close\" atx-style headers. This is purely\ncosmetic -- you can use this if you think it looks better. The\nclosing hashes don't even need to match the number of hashes\nused to open the header. (The number of opening hashes\ndetermines the header level.) :\n\n # This is an H1 #\n\n ## This is an H2 ##\n\n ### This is an H3 ######\n\n\n

    Blockquotes

    \n\nMarkdown uses email-style `>` characters for blockquoting. If you're\nfamiliar with quoting passages of text in an email message, then you\nknow how to create a blockquote in Markdown. It looks best if you hard\nwrap the text and put a `>` before every line:\n\n > This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n > consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n > Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n >\n > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse\n > id sem consectetuer libero luctus adipiscing.\n\nMarkdown allows you to be lazy and only put the `>` before the first\nline of a hard-wrapped paragraph:\n\n > This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet,\n consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus.\n Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus.\n\n > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse\n id sem consectetuer libero luctus adipiscing.\n\nBlockquotes can be nested (i.e. a blockquote-in-a-blockquote) by\nadding additional levels of `>`:\n\n > This is the first level of quoting.\n >\n > > This is nested blockquote.\n >\n > Back to the first level.\n\nBlockquotes can contain other Markdown elements, including headers, lists,\nand code blocks:\n\n\t>" +- "## This is a header.\n\t>\n\t> 1. This is the first list item.\n\t> 2. This is the second list item.\n\t>\n\t> Here's some example code:\n\t>\n\t> return shell_exec(\"echo $input | $markdown_script\");\n\nAny decent text editor should make email-style quoting easy. For\nexample, with BBEdit, you can make a selection and choose Increase\nQuote Level from the Text menu.\n\n\n

    Lists

    \n\nMarkdown supports ordered (numbered) and unordered (bulleted) lists.\n\nUnordered lists use asterisks, pluses, and hyphens -- interchangably\n-- as list markers:\n\n * Red\n * Green\n * Blue\n\nis equivalent to:\n\n + Red\n + Green\n + Blue\n\nand:\n\n - Red\n - Green\n - Blue\n\nOrdered lists use numbers followed by periods:\n\n 1. Bird\n 2. McHale\n 3. Parish\n\nIt's important to note that the actual numbers you use to mark the\nlist have no effect on the HTML output Markdown produces. The HTML\nMarkdown produces from the above list is:\n\n
      \n
    1. Bird
    2. \n
    3. McHale
    4. \n
    5. Parish
    6. \n
    \n\nIf you instead wrote the list in Markdown like this:\n\n 1. Bird\n 1. McHale\n 1. Parish\n\nor even:\n\n 3. Bird\n 1. McHale\n 8. Parish\n\nyou'd get the exact same HTML output. The point is, if you want to,\nyou can use ordinal numbers in your ordered Markdown lists, so that\nthe numbers in your source match the numbers in your published HTML.\nBut if you want to be lazy, you don't have to.\n\nIf you do use lazy list numbering, however, you should still start the\nlist with the number 1. At some point in the future, Markdown may support\nstarting ordered lists at an arbitrary number.\n\nList markers typically start at the left margin, but may be indented by\nup to three spaces. List markers must be followed by one or more spaces\nor a tab.\n\nTo make lists look nice, you can wrap items with hanging indents:\n\n * Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n\nBut if you want to be lazy, you don't have to:\n\n * Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi,\n viverra nec, fringilla in, laoreet vitae, risus.\n * Donec sit amet nisl. Aliquam semper ipsum sit amet velit.\n Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIf list items are separated by blank lines, Markdown will wrap the\nitems in `

    ` tags in the HTML output. For example, this input:\n\n * Bird\n * Magic\n\nwill turn into:\n\n

      \n
    • Bird
    • \n
    • Magic
    • \n
    \n\nBut this:\n\n * Bird\n\n * Magic\n\nwill turn into:\n\n
      \n
    • Bird

    • \n
    • Magic

    • \n
    \n\nList items may consist of multiple paragraphs. Each subsequent\nparagraph in a list item must be indented by either 4 spaces\nor one tab:\n\n 1. This is a list item with two paragraphs. Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit. Aliquam hendrerit\n mi posuere lectus.\n\n Vestibulum enim wisi, viverra nec, fringilla in, laoreet\n vitae, risus. Donec sit amet nisl. Aliquam semper ipsum\n sit amet velit.\n\n 2. Suspendisse id sem consectetuer libero luctus adipiscing.\n\nIt looks nice if you indent every line of the subsequent\nparagraphs, but here again, Markdown will allow you to be\nlazy:" +- "* This is a list item with two paragraphs.\n\n This is the second paragraph in the list item. You're\n only required to indent the first line. Lorem ipsum dolor\n sit amet, consectetuer adipiscing elit.\n\n * Another item in the same list.\n\nTo put a blockquote within a list item, the blockquote's `>`\ndelimiters need to be indented:\n\n * A list item with a blockquote:\n\n > This is a blockquote\n > inside a list item.\n\nTo put a code block within a list item, the code block needs\nto be indented *twice* -- 8 spaces or two tabs:\n\n * A list item with a code block:\n\n \n\n\nIt's worth noting that it's possible to trigger an ordered list by\naccident, by writing something like this:\n\n 1986. What a great season.\n\nIn other words, a *number-period-space* sequence at the beginning of a\nline. To avoid this, you can backslash-escape the period:\n\n 1986\\. What a great season.\n\n\n\n

    Code Blocks

    \n\nPre-formatted code blocks are used for writing about programming or\nmarkup source code. Rather than forming normal paragraphs, the lines\nof a code block are interpreted literally. Markdown wraps a code block\nin both `
    ` and `` tags.\n\nTo produce a code block in Markdown, simply indent every line of the\nblock by at least 4 spaces or 1 tab. For example, given this input:\n\n    This is a normal paragraph:\n\n        This is a code block.\n\nMarkdown will generate:\n\n    

    This is a normal paragraph:

    \n\n
    This is a code block.\n    
    \n\nOne level of indentation -- 4 spaces or 1 tab -- is removed from each\nline of the code block. For example, this:\n\n Here is an example of AppleScript:\n\n tell application \"Foo\"\n beep\n end tell\n\nwill turn into:\n\n

    Here is an example of AppleScript:

    \n\n
    tell application \"Foo\"\n        beep\n    end tell\n    
    \n\nA code block continues until it reaches a line that is not indented\n(or the end of the article).\n\nWithin a code block, ampersands (`&`) and angle brackets (`<` and `>`)\nare automatically converted into HTML entities. This makes it very\neasy to include example HTML source code using Markdown -- just paste\nit and indent it, and Markdown will handle the hassle of encoding the\nampersands and angle brackets. For example, this:\n\n
    \n © 2004 Foo Corporation\n
    \n\nwill turn into:\n\n
    <div class=\"footer\">\n        &copy; 2004 Foo Corporation\n    </div>\n    
    \n\nRegular Markdown syntax is not processed within code blocks. E.g.,\nasterisks are just literal asterisks within a code block. This means\nit's also easy to use Markdown to write about Markdown's own syntax.\n\n\n\n

    Horizontal Rules

    \n\nYou can produce a horizontal rule tag (`
    `) by placing three or\nmore hyphens, asterisks, or underscores on a line by themselves. If you\nwish, you may use spaces between the hyphens or asterisks. Each of the\nfollowing lines will produce a horizontal rule:\n\n * * *\n\n ***\n\n *****\n\n - - -\n\n ---------------------------------------\n\n\n* * *" - "

    Span Elements

    \n\n

    Links

    \n\nMarkdown supports two style of links: *inline* and *reference*.\n\nIn both styles, the link text is delimited by [square brackets].\n\nTo create an inline link, use a set of regular parentheses immediately\nafter the link text's closing square bracket. Inside the parentheses,\nput the URL where you want the link to point, along with an *optional*\ntitle for the link, surrounded in quotes. For example:\n\n This is [an example](http://example.com/ \"Title\") inline link.\n\n [This link](http://example.net/) has no title attribute.\n\nWill produce:\n\n

    This is \n an example inline link.

    \n\n

    This link has no\n title attribute.

    \n\nIf you're referring to a local resource on the same server, you can\nuse relative paths:\n\n See my [About](/about/) page for details.\n\nReference-style links use a second set of square brackets, inside\nwhich you place a label of your choosing to identify the link:\n\n This is [an example][id] reference-style link.\n\nYou can optionally use a space to separate the sets of brackets:\n\n This is [an example] [id] reference-style link.\n\nThen, anywhere in the document, you define your link label like this,\non a line by itself:\n\n [id]: http://example.com/ \"Optional Title Here\"\n\nThat is:\n\n* Square brackets containing the link identifier (optionally\n indented from the left margin using up to three spaces);\n* followed by a colon;\n* followed by one or more spaces (or tabs);\n* followed by the URL for the link;\n* optionally followed by a title attribute for the link, enclosed\n in double or single quotes, or enclosed in parentheses.\n\nThe following three link definitions are equivalent:\n\n\t[foo]: http://example.com/ \"Optional Title Here\"\n\t[foo]: http://example.com/ 'Optional Title Here'\n\t[foo]: http://example.com/ (Optional Title Here)\n\n**Note:** There is a known bug in Markdown.pl 1.0.1 which prevents\nsingle quotes from being used to delimit link titles.\n\nThe link URL may, optionally, be surrounded by angle brackets:\n\n [id]: \"Optional Title Here\"\n\nYou can put the title attribute on the next line and use extra spaces\nor tabs for padding, which tends to look better with longer URLs:\n\n [id]: http://example.com/longish/path/to/resource/here\n \"Optional Title Here\"\n\nLink definitions are only used for creating links during Markdown\nprocessing, and are stripped from your document in the HTML output.\n\nLink definition names may consist of letters, numbers, spaces, and\npunctuation -- but they are *not* case sensitive. E.g. 
these two\nlinks:\n\n\t[link text][a]\n\t[link text][A]\n\nare equivalent.\n\nThe *implicit link name* shortcut allows you to omit the name of the\nlink, in which case the link text itself is used as the name.\nJust use an empty set of square brackets -- e.g., to link the word\n\"Google\" to the google.com web site, you could simply write:\n\n\t[Google][]\n\nAnd then define the link:\n\n\t[Google]: http://google.com/\n\nBecause link names may contain spaces, this shortcut even works for\nmultiple words in the link text:\n\n\tVisit [Daring Fireball][] for more information.\n\nAnd then define the link:\n\n\t[Daring Fireball]: http://daringfireball.net/\n\nLink definitions can be placed anywhere in your Markdown document. I\ntend to put them immediately after each paragraph in which they're\nused, but if you want, you can put them all at the end of your\ndocument, sort of like footnotes.\n\nHere's an example of reference links in action:\n\n I get 10 times more traffic from [Google] [1] than from\n [Yahoo] [2] or [MSN] [3].\n\n [1]: http://google.com/ \"Google\"\n [2]: http://search.yahoo.com/ \"Yahoo Search\"\n [3]: http://search.msn.com/ \"MSN Search\"\n\nUsing the implicit link name shortcut, you could instead write:" - "I get 10 times more traffic from [Google][] than from\n [Yahoo][] or [MSN][].\n\n [google]: http://google.com/ \"Google\"\n [yahoo]: http://search.yahoo.com/ \"Yahoo Search\"\n [msn]: http://search.msn.com/ \"MSN Search\"\n\nBoth of the above examples will produce the following HTML output:\n\n

    I get 10 times more traffic from Google than from\n Yahoo\n or MSN.

    \n\nFor comparison, here is the same paragraph written using\nMarkdown's inline link style:\n\n I get 10 times more traffic from [Google](http://google.com/ \"Google\")\n than from [Yahoo](http://search.yahoo.com/ \"Yahoo Search\") or\n [MSN](http://search.msn.com/ \"MSN Search\").\n\nThe point of reference-style links is not that they're easier to\nwrite. The point is that with reference-style links, your document\nsource is vastly more readable. Compare the above examples: using\nreference-style links, the paragraph itself is only 81 characters\nlong; with inline-style links, it's 176 characters; and as raw HTML,\nit's 234 characters. In the raw HTML, there's more markup than there\nis text.\n\nWith Markdown's reference-style links, a source document much more\nclosely resembles the final output, as rendered in a browser. By\nallowing you to move the markup-related metadata out of the paragraph,\nyou can add links without interrupting the narrative flow of your\nprose.\n\n\n

    Emphasis

    \n\nMarkdown treats asterisks (`*`) and underscores (`_`) as indicators of\nemphasis. Text wrapped with one `*` or `_` will be wrapped with an\nHTML `` tag; double `*`'s or `_`'s will be wrapped with an HTML\n`` tag. E.g., this input:\n\n *single asterisks*\n\n _single underscores_\n\n **double asterisks**\n\n __double underscores__\n\nwill produce:\n\n single asterisks\n\n single underscores\n\n double asterisks\n\n double underscores\n\nYou can use whichever style you prefer; the lone restriction is that\nthe same character must be used to open and close an emphasis span.\n\nEmphasis can be used in the middle of a word:\n\n un*frigging*believable\n\nBut if you surround an `*` or `_` with spaces, it'll be treated as a\nliteral asterisk or underscore.\n\nTo produce a literal asterisk or underscore at a position where it\nwould otherwise be used as an emphasis delimiter, you can backslash\nescape it:\n\n \\*this text is surrounded by literal asterisks\\*\n\n\n\n

    Code

    \n\nTo indicate a span of code, wrap it with backtick quotes (`` ` ``).\nUnlike a pre-formatted code block, a code span indicates code within a\nnormal paragraph. For example:\n\n Use the `printf()` function.\n\nwill produce:\n\n

    Use the printf() function.

    \n\nTo include a literal backtick character within a code span, you can use\nmultiple backticks as the opening and closing delimiters:\n\n ``There is a literal backtick (`) here.``\n\nwhich will produce this:\n\n

    There is a literal backtick (`) here.

    \n\nThe backtick delimiters surrounding a code span may include spaces --\none after the opening, one before the closing. This allows you to place\nliteral backtick characters at the beginning or end of a code span:\n\n\tA single backtick in a code span: `` ` ``\n\n\tA backtick-delimited string in a code span: `` `foo` ``\n\nwill produce:\n\n\t

    A single backtick in a code span: `

    \n\n\t

    A backtick-delimited string in a code span: `foo`

    \n\nWith a code span, ampersands and angle brackets are encoded as HTML\nentities automatically, which makes it easy to include example HTML\ntags. Markdown will turn this:\n\n Please don't use any `` tags.\n\ninto:\n\n

    Please don't use any <blink> tags.

    \n\nYou can write this:" - "`—` is the decimal-encoded equivalent of `—`.\n\nto produce:\n\n

    &#8212; is the decimal-encoded\n equivalent of &mdash;.

    \n\n\n\n

    Images

    \n\nAdmittedly, it's fairly difficult to devise a \"natural\" syntax for\nplacing images into a plain text document format.\n\nMarkdown uses an image syntax that is intended to resemble the syntax\nfor links, allowing for two styles: *inline* and *reference*.\n\nInline image syntax looks like this:\n\n ![Alt text](/path/to/img.jpg)\n\n ![Alt text](/path/to/img.jpg \"Optional title\")\n\nThat is:\n\n* An exclamation mark: `!`;\n* followed by a set of square brackets, containing the `alt`\n attribute text for the image;\n* followed by a set of parentheses, containing the URL or path to\n the image, and an optional `title` attribute enclosed in double\n or single quotes.\n\nReference-style image syntax looks like this:\n\n ![Alt text][id]\n\nWhere \"id\" is the name of a defined image reference. Image references\nare defined using syntax identical to link references:\n\n [id]: url/to/image \"Optional title attribute\"\n\nAs of this writing, Markdown has no syntax for specifying the\ndimensions of an image; if this is important to you, you can simply\nuse regular HTML `` tags.\n\n\n* * *\n\n\n

    Miscellaneous

    \n\n

    Automatic Links

    \n\nMarkdown supports a shortcut style for creating \"automatic\" links for URLs and email addresses: simply surround the URL or email address with angle brackets. What this means is that if you want to show the actual text of a URL or email address, and also have it be a clickable link, you can do this:\n\n \n\nMarkdown will turn this into:\n\n http://example.com/\n\nAutomatic links for email addresses work similarly, except that\nMarkdown will also perform a bit of randomized decimal and hex\nentity-encoding to help obscure your address from address-harvesting\nspambots. For example, Markdown will turn this:\n\n \n\ninto something like this:\n\n address@exa\n mple.com\n\nwhich will render in a browser as a clickable link to \"address@example.com\".\n\n(This sort of entity-encoding trick will indeed fool many, if not\nmost, address-harvesting bots, but it definitely won't fool all of\nthem. It's better than nothing, but an address published in this way\nwill probably eventually start receiving spam.)\n\n\n\n

    Backslash Escapes

    \n\nMarkdown allows you to use backslash escapes to generate literal\ncharacters which would otherwise have special meaning in Markdown's\nformatting syntax. For example, if you wanted to surround a word\nwith literal asterisks (instead of an HTML `` tag), you can use\nbackslashes before the asterisks, like this:\n\n \\*literal asterisks\\*\n\nMarkdown provides backslash escapes for the following characters:\n\n \\ backslash\n ` backtick\n * asterisk\n _ underscore\n {} curly braces\n [] square brackets\n () parentheses\n # hash mark\n\t+\tplus sign\n\t-\tminus sign (hyphen)\n . dot\n ! exclamation mark" diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@markdown_syntax.md.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@markdown_syntax.md.snap index 2a7267dc..075ce600 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@markdown_syntax.md.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@markdown_syntax.md.snap @@ -328,9 +328,8 @@ input_file: tests/inputs/markdown/markdown_syntax.md - ">\n > > This is nested blockquote." - ">\n > Back to the first level." - "Blockquotes can contain other Markdown elements, including headers" -- ", lists,\nand code blocks:" -- "> ## This is a header.\n\t>" -- ">" +- ", lists,\nand code blocks:\n\n\t>" +- "## This is a header.\n\t>\n\t>" - 1. This is the first list item. - ">" - 2. This is the second list item.